I am trying to use the reduce4 kernel from reduction_kernel.cu in the cudpp library. I keep getting segmentation fault or synchronization errors. This is the reduce4 function:
template <typename T> __global__ void reduce4(T *g_idata, T *g_odata) {
extern __shared__ T sdata[];
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
// do reduction in shared mem
for(unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
#ifndef __DEVICE_EMULATION__
if (tid < 32)
#endif
{
sdata[tid] += sdata[tid + 32]; __syncthreads();
sdata[tid] += sdata[tid + 16]; __syncthreads();
sdata[tid] += sdata[tid + 8]; __syncthreads();
sdata[tid] += sdata[tid + 4]; __syncthreads();
sdata[tid] += sdata[tid + 2]; __syncthreads();
sdata[tid] += sdata[tid + 1]; __syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
The way I am calling it in my main program is like this:
reduce4<float><<<2, 16, (2*16*sizeof(float))>>>(d_mat_a, d_mat_r);
Using it on a array of floats size 32. Is the block and thread ratio causing these errors?, looking at the function I couldn’t see any reason for these values not to work.