Hi.
Does anyone know how to increase the global memory read throughput of the following kernel? Any suggestions are appreciated.
// Threads per column along threadIdx.x: inner dimension of the kernel's shared
// array and the stride of its loop over n. The kernel's unrolled reduction
// halves this value step by step, so it presumably must be a power of two.
#define BLOCKSIZE_X 64
// Columns handled per block along threadIdx.y: outer dimension of the shared
// array; column index is blockIdx.x*BLOCKSIZE_Y + threadIdx.y.
#define BLOCKSIZE_Y 4
/*
 * TT: for every column c in [0, K) computes
 *
 *     T[c] = sum over n in [0, n0) of X[c*N + N0[n]]
 *
 * i.e. a gather of n0 elements (selected by the index list N0) from the
 * c-th length-N segment of X, followed by a sum.
 *
 * Parameters
 *   T   [out] one float per column; T[c] is written for c < K.
 *   X   [in]  input values, read at offset c*N + N0[n].
 *   N0  [in]  n0 offsets into a column segment.
 *   N         distance (in elements) between consecutive column segments of X.
 *   K         number of columns.
 *   n0        number of entries in N0.
 *
 * Launch layout: the code indexes shared memory with threadIdx.x/threadIdx.y,
 * so it assumes blockDim == (BLOCKSIZE_X, BLOCKSIZE_Y, 1) and at least
 * ceil(K / BLOCKSIZE_Y) blocks in x -- the launch site is not visible here,
 * confirm against the caller. BLOCKSIZE_X must be a power of two for the
 * tree reduction below.
 *
 * Shared memory: BLOCKSIZE_Y * BLOCKSIZE_X floats (static).
 *
 * NOTE(review): global read throughput on X is dictated by the contents of
 * N0. Neighbouring threads read X[... + N0[n]] and X[... + N0[n+1]]; those
 * loads can only be combined into few memory transactions when consecutive
 * N0 entries are close together (ideally sorted/contiguous). If N0 is
 * scattered, consider sorting it on the host or binding X to a texture.
 */
__global__ void TT(float *T, const float *X, const int *N0, const int N, const int K, const int n0)
{
    __shared__ float x[BLOCKSIZE_Y][BLOCKSIZE_X];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;
    const int colId = blockIdx.x * BLOCKSIZE_Y + idy;

    // Rows past the last column still have to reach every __syncthreads()
    // below, so they are masked out instead of being branched around.
    const bool active = colId < K;

    // Accumulate in a register; the original did a shared-memory
    // read-modify-write on every iteration.
    float sum = 0.0f;
    if (active) {
        // 64-bit offset so colId * N cannot overflow for large matrices.
        const float *col = X + (long long)colId * N;
        for (int n = idx; n < n0; n += BLOCKSIZE_X)
            sum += col[N0[n]];
    }
    x[idy][idx] = sum;
    __syncthreads();

    // Tree reduction of each row. Only the lower half of the lanes takes
    // part in a step, so no lane reads past its row or overwrites a value
    // another lane is still reading; the barrier after every step makes the
    // partial sums visible across warps without relying on warp synchrony.
    for (int stride = BLOCKSIZE_X / 2; stride > 0; stride >>= 1) {
        if (idx < stride)
            x[idy][idx] += x[idy][idx + stride];
        __syncthreads();
    }

    // Store result to global memory: one write per column.
    if (active && idx == 0)
        T[colId] = x[idy][0];
}
I think this is the same question as the thread at http://forums.nvidia.com/index.php?showtop…rt=#entry582909
What’s your status now? What is your current bandwidth, and what upper bound do you expect?
I think the current bandwidth is around 12 GB/sec. The maximum bandwidth for GTX 280 is 141.7 GB/sec.