Hi.

I have a kernel which computes the sum of the elements of each column of an N×K matrix. The matrix is stored column by column, so column c occupies X[c*N] through X[c*N + N - 1]. The kernel is as follows:

// Threads along x in each block. The kernel strides over one column's
// elements in steps of BLOCKSIZE_X and keeps one partial sum per x-thread.
// NOTE(review): the reduction halves from 32 down to 1, so this is presumably
// expected to be a power of two no larger than 64 -- confirm before changing.
#define BLOCKSIZE_X 64

// Threads along y in each block. Each y index is mapped to a different
// column, so one block covers BLOCKSIZE_Y columns.
#define BLOCKSIZE_Y 4

/*
 * Sum: per-column sums of a matrix stored column by column.
 *
 * Parameters
 *   X  input, K columns of N contiguous floats (column c starts at X[c*N])
 *   Y  output, K floats; Y[c] receives the sum of column c
 *   N  number of elements per column
 *   K  number of columns
 *
 * Launch requirements
 *   blockDim must be (BLOCKSIZE_X, BLOCKSIZE_Y, 1): BLOCKSIZE_X threads
 *   cooperate on one column and each block handles BLOCKSIZE_Y columns.
 *   gridDim.x must be at least ceil(K / BLOCKSIZE_Y).
 *   Uses BLOCKSIZE_X * BLOCKSIZE_Y floats of static shared memory.
 */
#if (BLOCKSIZE_X & (BLOCKSIZE_X - 1)) != 0
#error "BLOCKSIZE_X must be a power of two for the tree reduction in Sum"
#endif
__global__ void Sum(const float *X, float *Y, const int N, const int K)
{
    // One row of per-thread partial sums for each column handled by the block.
    __shared__ float x0[BLOCKSIZE_Y][BLOCKSIZE_X];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;
    const int colId = blockIdx.x * BLOCKSIZE_Y + idy;

    // Accumulate in a register rather than doing a shared-memory
    // read-modify-write on every iteration. Threads whose column is out of
    // range keep 0 and fall through, so that every thread of the block
    // reaches the barriers below (a barrier inside a divergent branch is
    // undefined behaviour).
    float partial = 0.0f;
    if (colId < K) {
        // Widen before multiplying so colId * N cannot overflow int.
        const float *column = X + (long long)colId * N;
        // Consecutive x-threads read consecutive addresses (coalesced).
        for (int n = idx; n < N; n += BLOCKSIZE_X) {
            partial += column[n];
        }
    }
    x0[idy][idx] = partial;
    __syncthreads();

    // Tree reduction of the partial sums within each row. Only the lower
    // half of the active range adds, so idx + stride never leaves the row,
    // and the barrier after each step orders the writes of one step before
    // the reads of the next (a row spans more than one warp).
    #pragma unroll
    for (int stride = BLOCKSIZE_X / 2; stride > 0; stride >>= 1) {
        if (idx < stride) {
            x0[idy][idx] += x0[idy][idx + stride];
        }
        __syncthreads();
    }

    // Store the column total to global memory.
    if (idx == 0 && colId < K) {
        Y[colId] = x0[idy][0];
    }
}

// Problem size: K columns, each holding N elements.
K = 60000;
N = 1000;

// Block shape: BLOCKSIZE_X threads in x, BLOCKSIZE_Y in y
// (dim3 defaults the unspecified z dimension to 1).
dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);

// 1-D grid: ceil(K / BLOCKSIZE_Y) blocks, since each block covers
// BLOCKSIZE_Y columns (y and z default to 1).
dim3 grid((K + BLOCKSIZE_Y - 1) / BLOCKSIZE_Y);

Sum<<<grid, threads>>>(X, Y, N, K);

Any suggestions on how to speed up the above kernel would be greatly appreciated! Thanks.