CUDA Speedup

Hi.

I have a kernel which computes the sum of the elements of each column of a matrix. The kernel is as follows:

#define BLOCKSIZE_X 64  // threads cooperating on one column (block x-dimension)
#define BLOCKSIZE_Y 4   // columns processed per thread block (block y-dimension)

/**
 * Computes the sum of every column of a matrix.
 *
 * Column c occupies the N consecutive elements X[c*N .. c*N + N - 1]; its sum
 * is written to Y[c].
 *
 * Launch requirements:
 *   - blockDim must be (BLOCKSIZE_X, BLOCKSIZE_Y, 1);
 *   - gridDim.x must be at least ceil(K / BLOCKSIZE_Y);
 *   - uses BLOCKSIZE_X * BLOCKSIZE_Y floats of static shared memory.
 *
 * @param X  input matrix, K columns of N elements each (read only)
 * @param Y  output vector of K column sums
 * @param N  number of elements per column
 * @param K  number of columns
 */
__global__ void Sum(const float *X, float *Y, const int N, const int K)
{
    // The halving reduction below only visits every slot when the row width
    // is a power of two.
    static_assert(BLOCKSIZE_X > 0 && (BLOCKSIZE_X & (BLOCKSIZE_X - 1)) == 0,
                  "BLOCKSIZE_X must be a power of two");

    __shared__ float x0[BLOCKSIZE_Y][BLOCKSIZE_X];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;

    const int colId = blockIdx.x * BLOCKSIZE_Y + idy;

    // Accumulate this thread's strided share of the column in a register.
    // Threads of a row read consecutive addresses, so the loads coalesce.
    // Threads whose column is out of range keep 0 so that they can still take
    // part in every barrier below.
    float partial = 0.0f;
    if (colId < K) {
        // 64-bit offset: colId * N can exceed the range of int.
        const size_t base = static_cast<size_t>(colId) * static_cast<size_t>(N);
        for (int n = idx; n < N; n += BLOCKSIZE_X) {
            partial += X[base + n];
        }
    }
    x0[idy][idx] = partial;

    // Reached by every thread of the block (never inside the colId guard).
    __syncthreads();

    // Add partial sums: tree reduction within each row. Only the lower half
    // of the active range accumulates, so no thread reads past its row, and
    // the barrier after each step orders the writes of one step before the
    // reads of the next (no reliance on implicit warp synchrony).
    #pragma unroll
    for (int stride = BLOCKSIZE_X / 2; stride > 0; stride >>= 1) {
        if (idx < stride) {
            x0[idy][idx] += x0[idy][idx + stride];
        }
        __syncthreads();
    }

    // Store the result to global memory.
    if (idx == 0 && colId < K) {
        Y[colId] = x0[idy][0];
    }
}

 // Problem size: K columns, each with N elements (the kernel reads column c
 // starting at offset c*N).
 K = 60000;
 N = 1000;
 // One block is BLOCKSIZE_X x BLOCKSIZE_Y threads; the kernel uses threadIdx.y
 // to pick the column within the block and threadIdx.x to stride along it.
 dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y, 1);
 // Ceiling division so that all K columns are covered when K is not a
 // multiple of BLOCKSIZE_Y.
 dim3 grid((K+BLOCKSIZE_Y-1)/BLOCKSIZE_Y, 1, 1);
 // NOTE(review): X and Y are presumably device pointers holding K*N and K
 // floats respectively -- confirm against the allocation code. No launch
 // error check is visible here; consider checking cudaGetLastError() after
 // the launch.
 Sum<<<grid, threads>>> (X, Y, N, K);

Any suggestions on how to speed up the above kernel would be greatly appreciated! Thanks.