All right. RunReduce is the application's entry point:
[codebox]// Host entry point: builds a test vector of ones, reduces it on the GPU,
// and validates the result. Because every element is 1, the reduction of
// `size` elements must equal `size` exactly.
// NOTE(review): the signature takes `char* argv` (a single string), not the
// usual `char* argv[]` — kept unchanged so existing callers still compile,
// but confirm this is intentional.
void RunReduce(int argc, char* argv)
{
unsigned int size = 1<<6; // number of elements to reduce (must be a power of two for reduceKernelV2)
// create random input data on CPU
size_t bytes = size * sizeof(int);
int* toReduce = (int*) malloc(bytes);
if (toReduce == NULL)
{
printf("RunReduce: host allocation of %u bytes failed\n", (unsigned int)bytes);
return;
}
// fill with ones so the expected sum is simply `size`
for (unsigned int i = 0; i < size; i++)
{
toReduce[i] = 1;
}
int gpuResult2 = ReduceGPU(toReduce, size, 2, false);
// validate: sum of `size` ones must be `size`
if (gpuResult2 != (int)size)
{
printf("RunReduce: reduction FAILED, got %d, expected %u\n", gpuResult2, size);
}
// cleanup
free(toReduce);
}[/codebox]
… ReduceGPU is the host function that copies the input to the device and launches the reduction kernel:
[codebox]// Sum `inputCount` ints on the GPU and return the result.
//
// Preconditions (inherited from reduceKernelV2, which has no bounds checks
// and assumes a power-of-two block): `inputCount` must be a power of two.
// funcVersion selects the kernel (1 = single-thread reference, otherwise
// the tree reduction); the multiGpu path is not implemented.
int ReduceGPU( int* inputData,
size_t inputCount,
int funcVersion,
bool multiGpu)
{
int gpuResult = 0;
size_t bytes = inputCount * sizeof(int);
// allocate the input array in device global memory and copy the data in
int* iDataD = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&iDataD, bytes));
CUDA_SAFE_CALL(cudaMemcpy(iDataD, inputData, bytes, cudaMemcpyHostToDevice));
dim3 dimBlock;
dim3 dimGrid;
if (funcVersion == 1)
{
// single-thread reference version
int* oDataD = NULL;
// BUG FIX: was cudaMalloc(..., 1) — one byte cannot hold the int
// result that is copied back below.
CUDA_SAFE_CALL(cudaMalloc((void**)&oDataD, sizeof(int)));
dimBlock.x = 1;
dimGrid.x = 1;
reduceKernelV1<<<dimGrid, dimBlock>>>(iDataD, oDataD, inputCount);
CUDA_SAFE_CALL(cudaMemcpy(&gpuResult, oDataD, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(oDataD));
} else if (multiGpu)
{
// TODO: multi-GPU path not implemented
} else
{
// Tree reduction: each block collapses blockDim.x elements into one
// partial sum; the kernel is relaunched on the partials until a
// single value remains.
int* oDataD = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&oDataD, bytes));
// query device 0's limits (assume at least one GPU is present)
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, 0));
// clamp the block size in case inputCount < maxThreadsPerBlock
int maxThreadsPerBlock = min(deviceProp.maxThreadsPerBlock, (int)inputCount);
dimBlock.x = maxThreadsPerBlock;
dimGrid.x = inputCount / maxThreadsPerBlock;
// BUG FIX: reduceKernelV2 declares `extern __shared__ int sdata[]`,
// so the dynamic shared-memory size MUST be supplied as the third
// launch-config argument; the original launches omitted it (size 0),
// making every shared-memory access in the kernel out of bounds.
reduceKernelV2<<<dimGrid, dimBlock, dimBlock.x * sizeof(int)>>>(iDataD, oDataD);
#ifdef _DEBUG
// in debug builds, pull the per-block partial sums back for inspection
int* oData = (int*)malloc(dimGrid.x * sizeof(int));
if (oData != NULL)
{
CUDA_SAFE_CALL(cudaMemcpy(oData, oDataD, dimGrid.x * sizeof(int), cudaMemcpyDeviceToHost));
free(oData);
}
#endif
// repeatedly reduce the partial sums in place until one remains
// NOTE(review): the in-place relaunch (oDataD as both input and output)
// matches the original, but block 0 may read a slot another block has
// already overwritten — confirm, or ping-pong between two buffers.
int j = dimGrid.x;
while (j > 1)
{
// BUG FIX: when 1 < j < maxThreadsPerBlock the original computed
// dimGrid.x = j / maxThreadsPerBlock == 0 (an invalid launch) while
// keeping blockDim at full size, reading past the j valid partials.
// Shrink the block to j in that case.
dimBlock.x = min(j, maxThreadsPerBlock);
dimGrid.x = j / dimBlock.x;
reduceKernelV2<<<dimGrid, dimBlock, dimBlock.x * sizeof(int)>>>(oDataD, oDataD);
j = dimGrid.x;
}
// copy the final value back to the host (cudaMemcpy synchronizes)
CUDA_SAFE_CALL(cudaMemcpy(&gpuResult, oDataD, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(oDataD));
}
CUDA_SAFE_CALL(cudaFree(iDataD));
return gpuResult;
}[/codebox]
And here is the kernel code, as in the SDK Samples…
[codebox]
// Version 2: interleaved-addressing tree reduction, one partial sum per block.
//
// Launch requirements:
//  - dynamic shared memory: blockDim.x * sizeof(int) (third <<<>>> argument)
//  - blockDim.x must be a power of two (no bounds checks: every pair
//    sdata[tid] / sdata[tid + s] must exist)
//  - gridDim.x * blockDim.x must equal the number of input elements
//
// BUG FIX: the forum software stripped the double underscores — the
// qualifier must be `__global__` (bare `global` does not compile), and the
// SDK's emulation macro is `__DEVICE_EMULATION__`.
__global__ void reduceKernelV2(int* iData, int *oData)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// stage this block's slice of the input into shared memory
sdata[tid] = iData[i];
__syncthreads();
// do reduction in shared mem; the `tid % (2*s)` test causes heavy warp
// divergence — fine for a teaching example, but the SDK's later variants
// use sequential addressing instead
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
if (tid % (2*s) == 0)
{
sdata[tid] += sdata[tid + s];
#ifdef __DEVICE_EMULATION__
printf("adding %d and %d, total = %d\n", tid, tid + s, sdata[tid]);
#endif
}
// barrier placed outside the divergent branch so all threads reach it
__syncthreads();
}
// thread 0 publishes this block's partial sum to global memory
if (tid == 0)
{
oData[blockIdx.x] = sdata[0];
#ifdef __DEVICE_EMULATION__
printf("Block %d finished!\n", blockIdx.x);
#endif
}
}
[/codebox]