Column-wise reduction of a column-major matrix

I am trying to perform a column-wise reduction of a column-major matrix using CUDA. I adapted the parallel reduction example so that it computes the sum of each column, but the kernel times out. I do not know what is causing the problem or how to fix it.

template <typename T, unsigned int blockSize>
        __global__ void matrix_col_reduction(T * g_idata, T * g_odata, int ncols, int N)
        {
            // get the thread index
            unsigned int tid = threadIdx.x;
            // get the current element index for the thread
            unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;

            //__shared__ T sdata[ncols * blockSize];
            extern __shared__ T sdata[];
            // input data into the shared memory
            for(int i = 0; i < ncols; i++) 
            {
                sdata[tid + i*blockSize] = g_idata[gid + i*N];
            }
            __syncthreads();

            for(unsigned int s = blockDim.x/2; s > 0; s>> 1) {
                if(tid < s) {
                    for(int i = 0; i < ncols; i++)
                    {
                        sdata[tid + i*blockSize] += sdata[tid + i*blockSize + s];
                    }
                }
                __syncthreads();
            }

            if(tid == 0) {
                for(int i = 0; i < ncols; i++)
                {
                    g_odata[blockIdx.x + i*blockDim.x] = sdata[i*blockSize];
                }
            }
        }