Hi.
I have a kernel which computes the sum of the elements of each column of a matrix. The kernel is as follows:
#define BLOCKSIZE_X 64
#define BLOCKSIZE_Y 4

/* The shared-memory tree reduction below halves the active range each step,
 * which only covers every element when the row length is a power of two. */
#if (BLOCKSIZE_X & (BLOCKSIZE_X - 1)) != 0
#error "BLOCKSIZE_X must be a power of two for the reduction in Sum"
#endif

/*
 * Sum: writes the sum of each column of X into Y.
 *
 * X : input matrix; column c occupies the N contiguous floats starting at
 *     X[c * N] (device pointer).
 * Y : output, one float per column, K entries (device pointer).
 * N : number of elements per column.
 * K : number of columns.
 *
 * Launch layout: blockDim must be (BLOCKSIZE_X, BLOCKSIZE_Y, 1) and
 * gridDim.x must be at least ceil(K / BLOCKSIZE_Y). Each block handles
 * BLOCKSIZE_Y columns; the BLOCKSIZE_X threads sharing a threadIdx.y
 * cooperate on one column.
 * Shared memory: BLOCKSIZE_Y * BLOCKSIZE_X floats, statically allocated.
 */
__global__ void Sum(const float *X, float *Y, const int N, const int K)
{
    __shared__ float x0[BLOCKSIZE_Y][BLOCKSIZE_X];

    const int idx = threadIdx.x;
    const int idy = threadIdx.y;
    const int colId = blockIdx.x * BLOCKSIZE_Y + idy;

    /* Threads whose column is out of range must not return early: they still
     * have to reach every __syncthreads() below, so they simply contribute 0. */
    const bool active = (colId < K);

    /* Accumulate the strided partial sum in a register instead of
     * read-modify-writing shared memory on every iteration. */
    float partial = 0.0f;
    if (active) {
        /* 64-bit offset so colId * N cannot overflow int on large matrices. */
        const float *col = X + (size_t)colId * N;
        /* Consecutive idx values read consecutive addresses. */
        for (int n = idx; n < N; n += BLOCKSIZE_X) {
            partial += col[n];
        }
    }
    x0[idy][idx] = partial;
    __syncthreads();

    /* Add the partial sums of each row. Only the lower half of the remaining
     * range adds, so idx + stride never leaves the row, and the barrier after
     * every step makes each step's writes visible before the next one reads
     * them (no reliance on implicit warp synchrony). */
    for (int stride = BLOCKSIZE_X / 2; stride > 0; stride >>= 1) {
        if (idx < stride) {
            x0[idy][idx] += x0[idy][idx + stride];
        }
        __syncthreads();
    }

    /* Store the column total to global memory. */
    if (active && idx == 0) {
        Y[colId] = x0[idy][0];
    }
}
// Problem size passed to the kernel: K columns, N elements per column.
K = 60000;
N = 1000;
// Each block handles BLOCKSIZE_Y columns; round the block count up so a
// trailing partial group of columns still gets a block.
dim3 grid((K + BLOCKSIZE_Y - 1) / BLOCKSIZE_Y);
// BLOCKSIZE_X threads per column, BLOCKSIZE_Y columns per block.
dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
Sum<<<grid, threads>>>(X, Y, N, K);
Any suggestions on how to speed up the above kernel would be greatly appreciated! Thanks.