Hello:
I am attempting to learn CUDA, and I have experience with OpenMP, MPI, and pthreads. I want to try to implement a naive summation as a parallel reduction. I realize it is not efficient, but it will help me be sure I am learning CUDA correctly, and I can’t find any examples similar. So here is the kernel that does not provide me with the correct overall sum:
global void sumArray(float *input_cu, float *sum_cu, int blockSize, int numPoints, int numThreads)
{
// control variables
int pid = threadIdx.x;
int startIndex = (pid * blockSize);
int stopIndex = (startIndex + blockSize - 1);
if(pid == (numThreads - 1))
{
stopIndex = (numPoints - 1);
}
// overall sums, MUST put in shared, use extern to defer sizing upon declaration
extern shared float sums;
// find local sum
float localSum = 0.0;
for(int lcv = startIndex; lcv <= stopIndex; lcv++)
{
localSum = localSum + input_cu[lcv];
}
// update overall sum array
sums[pid] = localSum;
// update global sum (KDE_N pointer)
__syncthreads();
if(pid == 0)
{
*sum_cu = 0;
for(int lcv = 0; lcv < numThreads; lcv++)
{
*sum_cu = *sum_cu + sums[lcv];
}
}
}
And below is the context it is called from. I have a 8600GT NVIDIA card, and the data set is roughly 16 million float values. Calling context:
cudaMemcpy(data_cu, data, numPoints * sizeof(float), cudaMemcpyHostToDevice);
float KDE_N = 0.0;
float KDE_N_cu;
cudaMalloc((void*)&KDE_N_cu, sizeof(float));
sumArray<<<1,512>>>(data_cu, KDE_N_cu, (numPoints / 512), numPoints, 512);
cudaMemcpy(&KDE_N, KDE_N_cu, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(KDE_N_cu);
printf(“N:\t%3.0f\n”, KDE_N);
float KDE_N_test = 0.0;
for(int lcv = 0; lcv < numPoints; lcv++)
{
KDE_N_test += data[lcv];
}
printf(“N_chk:\t%3.0f\n”, KDE_N_test);
Thanks for your help!