Hello:

I am attempting to learn CUDA, and I have experience with OpenMP, MPI, and pthreads. I want to try to implement a naive summation as a parallel reduction. I realize it is not efficient, but it will help me be sure I am learning CUDA correctly, and I can’t find any examples similar. So here is the kernel that does not provide me with the correct overall sum:

**global** void sumArray(float *input_cu, float *sum_cu, int blockSize, int numPoints, int numThreads)

{

// control variables

int pid = threadIdx.x;

int startIndex = (pid * blockSize);

int stopIndex = (startIndex + blockSize - 1);

if(pid == (numThreads - 1))

{

stopIndex = (numPoints - 1);

}

// overall sums, MUST put in shared, use extern to defer sizing upon declaration

extern **shared** float sums;

// find local sum

float localSum = 0.0;

for(int lcv = startIndex; lcv <= stopIndex; lcv++)

{

localSum = localSum + input_cu[lcv];

}

// update overall sum array

sums[pid] = localSum;

// update global sum (KDE_N pointer)

__syncthreads();

if(pid == 0)

{

*sum_cu = 0;

for(int lcv = 0; lcv < numThreads; lcv++)

{

*sum_cu = *sum_cu + sums[lcv];

}

}

}

And below is the context it is called from. I have a 8600GT NVIDIA card, and the data set is roughly 16 million float values. Calling context:

cudaMemcpy(data_cu, data, numPoints * sizeof(float), cudaMemcpyHostToDevice);

float KDE_N = 0.0;

float *KDE_N_cu;
cudaMalloc((void**)&KDE_N_cu, sizeof(float));

sumArray<<<1,512>>>(data_cu, KDE_N_cu, (numPoints / 512), numPoints, 512);

cudaMemcpy(&KDE_N, KDE_N_cu, sizeof(float), cudaMemcpyDeviceToHost);

cudaFree(KDE_N_cu);

printf(“N:\t%3.0f\n”, KDE_N);

float KDE_N_test = 0.0;

for(int lcv = 0; lcv < numPoints; lcv++)

{

KDE_N_test += data[lcv];

}

printf(“N_chk:\t%3.0f\n”, KDE_N_test);

Thanks for your help!