I am trying to perform a column wise reduction of a column major matrix using CUDA. I tried to change the reduction example to calculate the sum for each column. The kernel times out. I do not know what is causing the problem or what I can do to redress it.
template <typename T, unsigned int blockSize>
__global__ void matrix_col_reduction(T * g_idata, T * g_odata, int ncols, int N)
{
// get the thread index
unsigned int tid = threadIdx.x;
// get the current element index for the thread
unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
//__shared__ T sdata[ncols * blockSize];
extern __shared__ T sdata[];
// input data into the shared memory
for(int i = 0; i < ncols; i++)
{
sdata[tid + i*blockSize] = g_idata[gid + i*N];
}
__syncthreads();
for(unsigned int s = blockDim.x/2; s > 0; s>> 1) {
if(tid < s) {
for(int i = 0; i < ncols; i++)
{
sdata[tid + i*blockSize] += sdata[tid + i*blockSize + s];
}
}
__syncthreads();
}
if(tid == 0) {
for(int i = 0; i < ncols; i++)
{
g_odata[blockIdx.x + i*blockDim.x] = sdata[i*blockSize];
}
}
}