Hi,
I am currently studying simple reduction, following the first CUDA SDK examples,
and I'm puzzled by the behavior of __syncthreads(). Here is the sample code.
The input is N = 8 values (0…7), and the kernel call is reduce0<<<1, N>>>(g_idata, g_odata);
/*
 * Per-block sum reduction in shared memory using interleaved addressing.
 *
 * This version is very inefficient -- interleaved addressing results
 * in divergent branching within warps.
 *
 * Each block copies blockDim.x consecutive ints of g_idata into shared
 * memory, sums them in place, and thread 0 stores the block's total in
 * g_odata[blockIdx.x].
 *
 * Launch requirements:
 *   - sdata is a dynamically sized shared array, so the launch must supply
 *     at least blockDim.x * sizeof(int) bytes as the third execution
 *     configuration argument, e.g.
 *         reduce0<<<blocks, threads, threads * sizeof(int)>>>(in, out);
 *     With no size given the array has zero bytes and every sdata access
 *     is out of bounds.
 *   - g_idata must hold at least gridDim.x * blockDim.x elements (the load
 *     is not bounds-checked); g_odata must hold at least gridDim.x elements.
 */
__global__ void
reduce0(int *g_idata, int *g_odata)
{
    extern __shared__ int sdata[];

    // Each thread loads one element from global to shared memory.
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // Do the reduction in shared memory: at stride s, every thread whose
    // index is a multiple of 2*s folds in the partial sum s slots away.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // The second condition keeps the read inside the blockDim.x
        // elements that were loaded when blockDim.x is not a power of two.
        if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
            sdata[tid] += sdata[tid + s];
        }
        // Outside the branch so that every thread in the block reaches it.
        __syncthreads();
    }

    // Write the result for this block to global memory.
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
However, the output value remains 0.
To investigate, I print the intermediate values:
/*
 * Debug variant of the interleaved-addressing block reduction: sums
 * blockDim.x ints per block in shared memory and prints the loaded values,
 * every partial sum, and the final per-block result.
 *
 * Launch requirements:
 *   - sdata is a dynamically sized shared array, so the launch must supply
 *     at least blockDim.x * sizeof(int) bytes as the third execution
 *     configuration argument, e.g.
 *         reduce0<<<blocks, threads, threads * sizeof(int)>>>(in, out);
 *     With no size given the array has zero bytes and every sdata access
 *     is out of bounds.
 *   - g_idata must hold at least gridDim.x * blockDim.x elements (the load
 *     is not bounds-checked); g_odata must hold at least gridDim.x elements.
 */
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];

    // Each thread loads one element from global to shared memory.
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    printf("\nsdata[tid %u] = %d g_idata[i %u] = %d", tid, sdata[tid], i, g_idata[i]);
    __syncthreads();

    // Do the reduction in shared memory: at stride s, every thread whose
    // index is a multiple of 2*s folds in the partial sum s slots away.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // The second condition keeps the read inside the blockDim.x
        // elements that were loaded when blockDim.x is not a power of two.
        if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
            sdata[tid] += sdata[tid + s];
            // Printed after the update: the first value is the new partial
            // sum, the last is the operand that was just added. tid is
            // unsigned, hence %u.
            printf("\nsdata[tid %u]= %d sdata[tid %u+s %u] = %d",
                   tid, sdata[tid], tid, s, sdata[tid + s]);
        }
        // Outside the branch so that every thread in the block reaches it.
        __syncthreads();
    }

    // Write the result for this block to global memory.
    if (tid == 0) {
        g_odata[blockIdx.x] = sdata[0];
        printf("\ng_odata[%u] = %d", blockIdx.x, g_odata[blockIdx.x]);
    }
}
Here is the output: the global values are correctly assigned to shared memory, but it seems that after __syncthreads() the shared memory is zeroed.
sdata[tid 0] = 0 g_idata[i 0] = 0
sdata[tid 1] = 1 g_idata[i 1] = 1
sdata[tid 2] = 2 g_idata[i 2] = 2
sdata[tid 3] = 3 g_idata[i 3] = 3
sdata[tid 4] = 4 g_idata[i 4] = 4
sdata[tid 5] = 5 g_idata[i 5] = 5
sdata[tid 6] = 6 g_idata[i 6] = 6
sdata[tid 7] = 7 g_idata[i 7] = 7
sdata[tid 0]= 0 sdata[tid 0+s 1] = 0
sdata[tid 2]= 0 sdata[tid 2+s 1] = 0
sdata[tid 4]= 0 sdata[tid 4+s 1] = 0
sdata[tid 6]= 0 sdata[tid 6+s 1] = 0
sdata[tid 0]= 0 sdata[tid 0+s 2] = 0
sdata[tid 4]= 0 sdata[tid 4+s 2] = 0
sdata[tid 0]= 0 sdata[tid 0+s 4] = 0
g_odata[0] = 0
Please help me understand what is going wrong