Guys,
I am doing some simple experiments with the dot product code from the Introduction to CUDA C presentation at GTC 2010.
I added a modification to the kernel to handle problem sizes that are not a multiple of THREADS_PER_BLOCK (full listing below). As long as I launch the code with N a multiple of the threads per block, it works fine.
Now, if I launch it with, for example, N = 255*255 = 65025 and compute the dot product of two arrays whose elements are all equal to 1, it gives the wrong result 65056, which is exactly 2033 (# of blocks) * 32 (threads per block).
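Just to spell out that arithmetic, here is a quick host-only check with the same numbers (plain C, nothing CUDA-specific):

#include <stdio.h>
#define N (255*255)             // 65025 elements
#define THREADS_PER_BLOCK 32

int main( void ) {
    int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;   // ceil division -> 2033
    printf("blocks = %d, blocks * threads = %d\n", blocks, blocks * THREADS_PER_BLOCK);
    // prints: blocks = 2033, blocks * threads = 65056, i.e. exactly the wrong result I get,
    // as if every thread in every block (including the partial last one) contributed a 1
    return 0;
}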
Debugging in Visual Studio, I found that the problem is in the last block (block index 2032), where the threads with index >= N behave in a way I can't explain. Here is what I see:
- Add a conditional breakpoint on thread @blockIdx(2032,0,0) && @threadIdx(2,0,0).
- index = 65026 (as expected, so the “if” should not be taken).
- The next step while debugging goes to the atomicAdd (inside the “if”), and index shows a ??? value. (Note: if I comment out the atomicAdd, it goes to the “for” line instead.)
I don’t understand what the problem could be. Any help would be appreciated. Thanks
L
#include <stdio.h>
#include <cutil_inline.h>
#define N (255*255)
#define THREADS_PER_BLOCK 32
__global__ void dot( int *a, int *b, int *c ) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Filter threads that are not necessary (index past the end of the arrays)
    if( index < N ) {
        // element-wise multiplication into shared memory
        temp[threadIdx.x] = a[index] * b[index];
        __syncthreads();

        // thread 0 sums the block's products and accumulates into c
        if( 0 == threadIdx.x ) {
            int sum = 0;
            for( int i = 0; i < THREADS_PER_BLOCK; i++ )
                sum += temp[i];
            atomicAdd( c, sum );
        }
    }
}
int main( void ) {
    int *a, *b, *c;                 // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c;     // device copies of a, b, c
    int size = N * sizeof( int );   // we need space for N ints

    // allocate device copies of a, b, c
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );
    cudaMalloc( (void**)&dev_c, sizeof(int) );

    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( sizeof(int) );

    for (int i = 0; i < N; i++) {
        a[i] = 1;
        b[i] = 1;
    }
    *c = 0;

    // copy inputs to device
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );

    // launch dot() kernel
    dot<<< (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( dev_a, dev_b, dev_c );

    // copy device result back to host copy of c
    cudaMemcpy( c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
    printf("result %d\n", *c);

    free( a ); free( b ); free( c );
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    getchar();
    return 0;
}
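One thing I am wondering about (but have not verified): in the last block, the threads with index >= N never reach the __syncthreads(), since it sits inside the “if”. Would a variant like the sketch below, where only the loads are guarded and out-of-range threads simply write 0 into shared memory, be the correct way to handle the partial block? (dot_alt is just my name for this untested sketch.)

__global__ void dot_alt( int *a, int *b, int *c ) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // out-of-range threads contribute 0 instead of skipping the code below,
    // so every thread in the block still reaches __syncthreads()
    temp[threadIdx.x] = (index < N) ? a[index] * b[index] : 0;
    __syncthreads();

    // thread 0 sums the block's products and accumulates into c
    if( 0 == threadIdx.x ) {
        int sum = 0;
        for( int i = 0; i < THREADS_PER_BLOCK; i++ )
            sum += temp[i];
        atomicAdd( c, sum );
    }
}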