Some help needed with shared memory and program correctness in a matrix * vector operation

What I have set up this code to do (it works as intended with SHARED_MEM == 0, i.e. using just global memory) is to take a matrix and divide it into blocks of rows. Each block of the matrix is assigned to a CUDA Thread Block, and each row of that block is assigned to a thread of the Thread Block.

Shared memory is set up to store one block of rows of the matrix, and each Thread Block should have its own shared memory pool (Thread Blocks do not share shared memory).

The code produces correct results with shared memory “turned on” up until element #17 (TBLOCK = 16, ROWS = 2 * 16, COLS = 2 * 16): that is 2 Thread Blocks with 16 threads each, a 32x32 source matrix, a 32x1 source vector, and a 32x1 result vector.

Here is the code:

__global__ void MatTest(
	float *d_C,
	float *d_A,
	float *d_B)
{
	// Block index
	int bx = blockIdx.x;
	// Thread index
	int tx = threadIdx.x;

	int i = tx + (bx * blockDim.x);

	//printf ("tx: %d\n", tx);
	//printf ("bx: %d\n", bx);
	//printf ("i: %d\n", i);

#if SHARED_MEM == 1
	__shared__ float matA[TBLOCK * COLS];
	//__shared__ float vecB[ROWS];
	//__shared__ float vecC[ROWS];

	int c;
	for (c = 0; c < COLS; c++) {
		matA[indexC(tx,c, COLS)] = d_A[indexC(i,c,COLS)];
		//vecB[c] = d_B[c];
		//vecC[c] = 0;
	}

	float t = 0;
	for (c = 0; c < COLS; c++) {
		t += matA[indexC(tx,c,COLS)] * d_B[c];
		//d_C[i] = 2;
	}

	d_C[i] = t;
	//this kernel uses  bytes of Shared Memory and  bit registers...
#endif

#if SHARED_MEM == 0
	for (int c = 0; c < COLS; c++) {
		d_C[i] += d_A[indexC(i,c,COLS)] * d_B[c];
	}
	//this kernel uses  bytes of Shared Memory and  bit registers...
#endif

	//printf("d_C[%d]: %f\n", i, d_C[i]);
}


I have been thinking it over quite a bit, but I cannot find any obvious error, so I’d gladly take some advice…

thanks in advance,



I am an idiot :P.

Executing in emudebug mode helped me see an out-of-bounds access issue, which in turn revealed that I was passing the wrong argument to the macro indexC (NCOLS instead of NROWS):

(corrected macros)

#define indexR(i, j, n_cols) ((j) + ((i) * (n_cols))) //row-major matrix
#define indexC(i ,j, n_rows) (((j) * (n_rows)) + (i)) //column major order addressing + 1st element has id#0

so instead of this:

matA[indexC(tx,c, TBLOCK)]…

I was doing…

matA[indexC(tx,c, COLS)] …

in the two loops.

Also… using COLS instead of ROWS was not causing a problem only because the matrix happens to be square…
nVIDIA, thanks for the emudebug mode :D.

Sorry for wasting your time with a dumb error :(…