hello all,
I have run into a strange problem with my CUDA code for matrix multiplication.
Its an implementation where a block 64 1D threads computes 64x64 elements by using register tiling for one matrix and shared memory for the other.
The blocks compute correctly except for last row of last N-1 blocks when N is the number of blocks on x dim. The difference between two consecutive blocks is always same: 896.
If for example i try 256x256 dim then
rreddy78@jetson-nano:~/projects/test$ ./matrixMulTiled8
C[256 x 256]. grid config = (4,4) for block config = (64x1)
Difference between two is: 896
Error greater than tolerance at 255,64: 3712 - 255,127: 3712
Error greater than tolerance at 255,128: 2816 - 255,191: 2816
Error greater than tolerance at 255,192: 1920 - 255,255: 1920
if I use 512x512 then:
rreddy78@jetson-nano:~/projects/test$ ./matrixMulTiled8
C[512 x 512]. grid config = (8,8) for block config = (64x1)
Difference between two is: 896
Error greater than tolerance at 511,64: 8320 - 511,127: 8320
Error greater than tolerance at 511,128: 7424 - 511,191: 7424
…
Error greater than tolerance at 511,448: 2944 - 511,511: 2944
How should we approach debugging for such a problem ? The same kernel code works for all other blocks and nothing is hard coded.
__global__ void matrixMultiplicationKernel(const float *__restrict__ A, const float *__restrict__ B, float *__restrict__ C, const int M_Size, const int W_Size, const int N_Size)
{
// blockDim.y = 1 and threadIdx.y = 0
const int ROW = (TILE_WIDTH * blockIdx.y) * blockDim.y + threadIdx.y; // Absolute C row index
const int COL = blockIdx.x * blockDim.x + threadIdx.x; // Absolute C column index
__shared__ float Ads[TILE_WIDTH][TILE_WIDTH];
register float BRT[TILE_WIDTH]; // B Register Tile
register float tmpSum[TILE_WIDTH] = {0}; // tmpSum in registers
for (int tileOffset = 0; tileOffset < W_Size; tileOffset += TILE_WIDTH)
{
// Load the B Tile into registers and A Tile into shared memory
//#pragma unroll
for (int idx = 0; idx < TILE_WIDTH; idx++)
{
BRT[idx] = B[((tileOffset + idx) * N_Size) + COL];
}
//#pragma unroll
for (int idx = 0; idx < TILE_WIDTH; idx++)
{
Ads[idx][threadIdx.x] = A[((idx + ROW) * W_Size) + (tileOffset + COL)];
}
__syncthreads();
for (int idx = 0; idx < TILE_WIDTH; idx++)
{
//#pragma unroll
for (int idx2 = 0; idx2 < TILE_WIDTH; idx2++)
tmpSum[idx] += Ads[idx][idx2] * BRT[idx2];
__threadfence();
}
__syncthreads();
}
for (int idx = 0; idx < TILE_WIDTH; idx++)
{
C[((ROW + idx) * N_Size) + COL] = tmpSum[idx];
}
}
void matrixMultiplication(const float *__restrict__ A, const float *__restrict__ B, float *__restrict__ C, const int M_Size, const int W_Size, const int N_Size)
{
// declare the number of blocks per grid and the number of threads per block
dim3 threadsPerBlock(TILE_WIDTH, 1);
dim3 blocksPerGrid(1, 1);
blocksPerGrid.x = (N_Size + (threadsPerBlock.x - 1)) / threadsPerBlock.x;
blocksPerGrid.y = (M_Size + (TILE_WIDTH - 1)) / TILE_WIDTH;
printf("C[%d x %d]. grid config = (%d,%d) for block config = (%dx%d)\n", M_Size, N_Size, blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y);
matrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M_Size, W_Size, N_Size);
checkCuda(cudaDeviceSynchronize());
}