Global Memory Read Throughput

Hi.

Does anyone know how to increase the global memory read throughput of the following kernel? Any suggestions are appreciated.

// Width of one shared-memory row in kernel TT and the step of its loop over n;
// TT indexes that row with threadIdx.x, so presumably the launch uses blockDim.x == BLOCKSIZE_X (confirm at the call site).
#define BLOCKSIZE_X 64
// Number of shared-memory rows in kernel TT; TT computes its column id as blockIdx.x*BLOCKSIZE_Y + threadIdx.y,
// so presumably the launch uses blockDim.y == BLOCKSIZE_Y (confirm at the call site).
#define BLOCKSIZE_Y 4

/**
 * For each column colId in [0, K), writes
 *     T[colId] = sum over n in [0, n0) of X[colId*N + N0[n]].
 *
 * Launch layout expected by the indexing below: blockDim.x == BLOCKSIZE_X,
 * blockDim.y == BLOCKSIZE_Y, and enough blocks in x that
 * gridDim.x * BLOCKSIZE_Y >= K. Each threadIdx.y row of a block handles one
 * column; the BLOCKSIZE_X threads of that row split the n0 terms between them
 * and then combine their partial sums in shared memory.
 *
 * Static shared memory: BLOCKSIZE_Y * BLOCKSIZE_X floats.
 *
 * @param T   output, one float per column (at least K elements)
 * @param X   input values, read at offset colId*N + N0[n]
 * @param N0  n0 offsets into a column's segment of X
 * @param N   distance (in elements) between the starts of consecutive segments
 * @param K   number of columns
 * @param n0  number of entries in N0
 *
 * NOTE(review): the loads from X are a gather through N0, so how well they
 * coalesce depends entirely on the values in N0 (not visible here). If the
 * offsets are scattered, that is the likely cause of low read throughput.
 */
__global__ void TT(float *T, const float *X, const int *N0, const int N, const int K, const int n0)
{
    __shared__ float x[BLOCKSIZE_Y][BLOCKSIZE_X];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;
    const int colId = blockIdx.x * BLOCKSIZE_Y + idy;

    // Accumulate in a register. Rows with colId >= K keep a zero sum instead of
    // leaving early, so every thread of the block reaches every barrier below.
    float partial = 0.0f;
    if (colId < K) {
        // Start of this column's segment of X; 64-bit product avoids int overflow in colId*N.
        const float *segment = X + (size_t)colId * N;
        for (int n = idx; n < n0; n += BLOCKSIZE_X)
            partial += segment[N0[n]];
    }
    x[idy][idx] = partial;
    __syncthreads();

    // Add partial sums: halve the active width each step. Only the lower part
    // of the row adds (so no read runs past the row), and a barrier separates
    // the steps because one row may span more than one warp.
    for (int active = BLOCKSIZE_X; active > 1; ) {
        const int half = (active + 1) / 2;   // also correct when active is odd
        if (idx + half < active)
            x[idy][idx] += x[idy][idx + half];
        __syncthreads();
        active = half;
    }

    // Store result to global memory
    if (idx == 0 && colId < K)
        T[colId] = x[idy][0];
}

I think this is the same question as the thread at http://forums.nvidia.com/index.php?showtop…rt=#entry582909

What’s your status now? What’s your current bandwidth, and what upper bound do you expect?

I think the current bandwidth is around 12 GB/sec. The maximum bandwidth for GTX 280 is 141.7 GB/sec.