Hi, I am a newbie to CUDA programming and I would like to describe a problem I ran into with the parallel reduction templates given in the NVIDIA SDK. I am trying to compute a parallel sum in C using the CUDA extensions and I haven't managed to get it working. Can somebody post a simple example of reduction in CUDA C, without templates and without the power-of-2 array size problem, so I can understand how it works?
Thanks in advance.
The code I used is:
[codebox]__global__ void cg2(float *g_odata, float *ro, float *r, int n)
{
// now that we are using warp-synchronous programming (below)
// we need to declare our shared memory volatile so that the compiler
// doesn't reorder stores to it and induce incorrect behavior.
extern volatile __shared__ float sdata[];
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
unsigned int blockSize = blockDim.x; // I launch with 256 threads per block
sdata[tid] = (i < n) ? (r[i]*ro[i]) : 0;
if (i + blockSize < n)
sdata[tid] += ro[i+blockSize]*r[i+blockSize];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// unrolled final warp: threads of a warp run in lockstep, so no
// __syncthreads() is needed here as long as sdata is volatile (see above)
if (tid < 32)
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}[/codebox]
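For reference, this is roughly how I launch the kernel and finish the sum on the host. It is only a sketch: the names d_odata, d_ro, d_r and h_odata are my own, and the device arrays are assumed to be allocated and filled already.
[codebox]// host side (rough sketch, my own variable names)
int threads = 256;
int blocks = (n + threads*2 - 1) / (threads*2); // each block reduces 2*threads elements
size_t smem = threads * sizeof(float); // one float of shared memory per thread

cg2<<<blocks, threads, smem>>>(d_odata, d_ro, d_r, n);

// copy the per-block partial sums back and add them up on the CPU
float *h_odata = (float*)malloc(blocks * sizeof(float));
cudaMemcpy(h_odata, d_odata, blocks * sizeof(float), cudaMemcpyDeviceToHost);
float sum = 0.0f;
for (int b = 0; b < blocks; ++b)
sum += h_odata[b];
free(h_odata);[/codebox]
Is finishing the last step on the CPU the right way to handle an array size that is not a power of two, or should the kernel be run again on the partial sums?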
I use Ubuntu Linux 9.04 with a GeForce 9500 GT GPU.