Why is this dot product code not thread safe?

Hello All,

Here’s my dot product code, which has been giving variable results from run to run:

#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>


// Number of elements in each input vector.
#define N (2048*2048)
// Threads per block used for the kernel launch; also the length of the
// kernel's per-block shared buffer.
#define THREADS_PER_BLOCK 512

// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation that was attempted.
static void checkCuda( cudaError_t status, const char *what ) {
    if( status != cudaSuccess ) {
        fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

// Accumulates the dot product of a and b (N elements each) into *c.
//
// Launch layout : 1-D grid of 1-D blocks, blockDim.x == THREADS_PER_BLOCK,
//                 with enough blocks to cover N elements.
// Shared memory : THREADS_PER_BLOCK unsigned ints (static).
// Precondition  : *c must be zeroed by the host BEFORE the launch (for example
//                 with cudaMemset). The kernel must not reset *c itself: blocks
//                 run in no particular order, so a block writing 0 would wipe
//                 out partial sums that other blocks have already added.
// Arithmetic    : products and sums are computed in unsigned int, so overflow
//                 wraps modulo 2^32 instead of being undefined behaviour. The
//                 value in *c is the exact dot product only when it fits in int.
__global__ void dot( int *a, int *b, int *c ) {

    __shared__ unsigned int temp[THREADS_PER_BLOCK];

    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads past the end of the data contribute 0, so the grid does not have
    // to divide N evenly.
    temp[threadIdx.x] = ( index < N )
        ? static_cast<unsigned int>( a[index] ) * static_cast<unsigned int>( b[index] )
        : 0u;

    // Every product of this block must be in shared memory before thread 0 sums them.
    __syncthreads();

    if( 0 == threadIdx.x ) {
        unsigned int sum = 0;
        for( int i = 0; i < THREADS_PER_BLOCK; i++ )
            sum += temp[i];
        // One atomic per block folds this block's partial sum into the total.
        atomicAdd( c, static_cast<int>( sum ) );
    }
}

// Fills two N-element vectors with a[i] = b[i] = i, computes their dot product
// on the GPU and prints it. With N = 2048*2048 the true value far exceeds the
// range of int, so the printed number is the wrapped (modulo 2^32) result.
int main(void) {

    const size_t size = static_cast<size_t>( N ) * sizeof( int );
    // Round up so a trailing partial block is still launched.
    const int blocks = ( N + THREADS_PER_BLOCK - 1 ) / THREADS_PER_BLOCK;

    int *a = (int *)malloc( size );
    int *b = (int *)malloc( size );
    int *c = (int *)malloc( sizeof( int ) );
    if( !a || !b || !c ) {
        fprintf( stderr, "Host allocation failed\n" );
        free( a ); free( b ); free( c );
        return 1;
    }
    *c = 0;

    for (int i=0; i < N; ++i) {
        a[i] = i;
        b[i] = i;
    }

    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    checkCuda( cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)" );
    checkCuda( cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)" );
    checkCuda( cudaMalloc( (void**)&dev_c, sizeof( int ) ), "cudaMalloc(dev_c)" );

    // copy inputs to device
    checkCuda( cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ), "copy of a to device" );
    checkCuda( cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ), "copy of b to device" );

    // The accumulator has to start at zero on the device; cudaMalloc does not
    // initialise memory and the kernel only ever adds to it.
    checkCuda( cudaMemset( dev_c, 0, sizeof( int ) ), "cudaMemset(dev_c)" );

    // launch dot() kernel
    dot<<< blocks, THREADS_PER_BLOCK >>>( dev_a, dev_b, dev_c );
    checkCuda( cudaGetLastError(), "dot kernel launch" );

    // copy device result back to host copy of c (this blocking copy also waits
    // for the kernel and reports any error raised while it ran)
    checkCuda( cudaMemcpy( c, dev_c, sizeof( int ), cudaMemcpyDeviceToHost ), "copy of result to host" );

    std::cout << "Result of dot product is : " << *c << std::endl;

    free( a ); free( b ); free( c );
    checkCuda( cudaFree( dev_a ), "cudaFree(dev_a)" );
    checkCuda( cudaFree( dev_b ), "cudaFree(dev_b)" );
    checkCuda( cudaFree( dev_c ), "cudaFree(dev_c)" );

    return 0;
}

I have no idea why it doesn’t work for me at this size, but it works fine for a small N such as 2048.

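Turns out, I was overflowing the signed integer values: with a[i] = b[i] = i and N = 2048*2048, both the individual products and their sum are far too large to fit in an int. Omg.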

My code works with floats though so thank you God almighty :)

I feel silly XD