Atomic add running faster than naive add

atomic.cu

#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE  64

#define BLOCK_WIDTH 100

void print_array(int *array, int size)
{
    printf("{ ");
    for (int i = 0; i < size; i++)  { printf("%d ", array[i]); }
    printf("}\n");
}

__global__ void increment_naive(int *g)
{
	// which thread is this?
	int i = blockIdx.x * blockDim.x + threadIdx.x; 

	// each thread to increment consecutive elements, wrapping at ARRAY_SIZE
	i = i % ARRAY_SIZE;  
	g[i]++;
}

__global__ void increment_atomic(int *g)
{
	// which thread is this?
	int i = blockIdx.x * blockDim.x + threadIdx.x; 

	// each thread to increment consecutive elements, wrapping at ARRAY_SIZE
	i = i % ARRAY_SIZE;  
	atomicAdd(& g[i], 1);
}

int main(int argc,char **argv)
{   
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // declare and allocate host memory
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
 
    // declare, allocate, and zero out GPU memory
    int * d_array;
    cudaMalloc((void **) &d_array, ARRAY_BYTES);
    cudaMemset((void *) d_array, 0, ARRAY_BYTES); 

    // launch the kernel - comment out one of these
    timer.Start();
    
    // Instructions: This program is needed for the next quiz
    // uncomment increment_naive to measure speed and accuracy 
    // of non-atomic increments or uncomment increment_atomic to
    // measure speed and accuracy of  atomic icrements
    // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    for (int i = 0 ; i < 1000 ; i++)
    	increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    timer.Stop();
    
    // copy back the array of sums from GPU and print
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    print_array(h_array, ARRAY_SIZE);
    printf("Naive Time elapsed = %g ms\n", timer.Elapsed());
    
    cudaMemset((void *) d_array, 0, ARRAY_BYTES); 
    timer.Start();
    for (int i = 0 ; i < 1000 ; i++)
    	increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    timer.Stop();
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    print_array(h_array, ARRAY_SIZE);
    printf("Atomic Time elapsed = %g ms\n", timer.Elapsed());
 
    // free GPU memory allocation and exit
    cudaFree(d_array);
    return 0;
}

gputimer.h

#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

struct GpuTimer
{
      cudaEvent_t start;
      cudaEvent_t stop;
 
      GpuTimer()
      {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
      }
 
      ~GpuTimer()
      {
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
      }
 
      void Start()
      {
            cudaEventRecord(start, 0);
      }
 
      void Stop()
      {
            cudaEventRecord(stop, 0);
      }
 
      float Elapsed()
      {
            float elapsed;
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsed, start, stop);
            return elapsed;
      }
};

#endif  /* __GPU_TIMER_H__ */

Sample OUTPUT of atomic.cu:

1000000 total threads in 10000 blocks writing into 64 array elements
{ 1422332 1422332 1422332 1422332 1426044 1426044 1426044 1426044 1443638 1443638 1443638 1443638 1444478 1444478 1444478 1444478 1457852 1457852 1457852 1457852 1459852 1459852 1459852 1459852 1461448 1461448 1461448 1461448 1459399 1459399 1459399 1459399 1422533 1422533 1422533 1422533 1425910 1425910 1425910 1425910 1444729 1444729 1444729 1444729 1445048 1445048 1445048 1445048 1458022 1458022 1458022 1458022 1460112 1460112 1460112 1460112 1461860 1461860 1461860 1461860 1457872 1457872 1457872 1457872 }
Naive Time elapsed = 931.293 ms
{ 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 15625000 }
Atomic Time elapsed = 327.587 ms