Global Memory Read Throughput

Hi.

Does anyone know how to increase the global memory read throughput of the following kernel? Any suggestions are appreciated.

// Inner dimension of the kernel's shared partial-sum array and the stride of each
// thread's loop over n. The kernel indexes that array with threadIdx.x, so
// blockDim.x is presumably expected to equal this value -- confirm at the launch site.
#define BLOCKSIZE_X 64
// Outer dimension of the kernel's shared partial-sum array; also the number of
// columns covered per block (colId = blockIdx.x*BLOCKSIZE_Y + threadIdx.y).
// blockDim.y is presumably expected to equal this value -- confirm at the launch site.
#define BLOCKSIZE_Y 4

/*
 * TT: per-column gathered sum.
 *
 * For every column c in [0, K) computes
 *     T[c] = sum_{n = 0 .. n0-1} X[c*N + N0[n]]
 *
 * Parameters:
 *   T   - output, written at T[c] for c in [0, K)
 *   X   - input values; column c starts at offset c*N
 *   N0  - n0 offsets added to the column start to select the elements to sum
 *   N   - distance (in elements) between the starts of consecutive columns
 *   K   - number of columns
 *   n0  - number of entries read from N0
 *
 * Launch preconditions (not checked here):
 *   - blockDim.x == BLOCKSIZE_X and blockDim.y == BLOCKSIZE_Y, because the
 *     shared array is indexed directly with threadIdx.x / threadIdx.y.
 *   - BLOCKSIZE_X is a power of two (required by the halving reduction).
 *   - gridDim.x * BLOCKSIZE_Y >= K so that every column is covered.
 *
 * Shared memory: BLOCKSIZE_Y * BLOCKSIZE_X floats, statically allocated.
 *
 * Memory access note: adjacent threads in x read adjacent entries of N0, but
 * the reads of X are gathered through N0, so how well they coalesce depends
 * on the values stored in N0.
 */
__global__ void TT(float *T, const float *X, const int *N0, const int N, const int K, const int n0)
{
    __shared__ float x[BLOCKSIZE_Y][BLOCKSIZE_X];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;

    // Each row of threads (fixed idy) handles one column.
    const int colId = blockIdx.x * BLOCKSIZE_Y + idy;

    // Accumulate in a register; rows past the last column contribute zero.
    // Out-of-range rows must NOT return early: every thread of the block has
    // to reach the __syncthreads() calls below.
    float partial = 0.0f;
    if (colId < K) {
        // 64-bit column offset so colId*N cannot overflow a 32-bit int.
        const long long base = (long long)colId * (long long)N;
        for (int n = idx; n < n0; n += BLOCKSIZE_X) {
            partial += X[base + N0[n]];
        }
    }

    x[idy][idx] = partial;
    __syncthreads();

    // Tree reduction along x. Only the lower half of the active range adds,
    // so no thread reads past the end of its row, and the barrier after each
    // step orders the writes of one step before the reads of the next.
    for (int stride = BLOCKSIZE_X / 2; stride > 0; stride >>= 1) {
        if (idx < stride) {
            x[idy][idx] += x[idy][idx + stride];
        }
        __syncthreads();
    }

    // Thread 0 of each in-range row stores the column total.
    if (idx == 0 && colId < K) {
        T[colId] = x[idy][0];
    }
}