Hi,
I’m a college student learning CUDA with the textbook by Kirk and Hwu, “Programming Massively Parallel Processors: A Hands-on Approach”.
I tried to implement tiled matrix multiplication on my own machine. If I’m understanding correctly, the tiled version is supposed to reduce the global-memory bandwidth bottleneck by reusing data that has already been loaded into shared memory, and its shared-memory access pattern is also supposed to avoid bank conflicts. But my tiled kernel is somehow much slower than the basic matrix multiplication kernel. I’ve checked my code over and over and compared it with the textbook’s code, but I can’t find the reason.
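If I’m reading the textbook right, the expected win is roughly this (my own arithmetic, so please correct me if it’s off): with TILE_SIZE = 8, each element of A and B should only be fetched from global memory once per tile phase, i.e.

    global loads per input element ≈ width / TILE_SIZE (tiled)  vs.  width (naive)

so for width = 128 that is 16 loads instead of 128, and I expected the tiled kernel to be at least as fast as the simple one.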
Could someone explain why this is happening? Below are my device info, my kernel code, and the results:
Device 0: "NVIDIA GeForce RTX 2070 SUPER"
CUDA Driver Version / Runtime Version 11.5 / 11.5
CUDA Capability Major/Minor version number: 7.5
#define TILE_SIZE 8
__global__
void tiledMatMult_kernel(float* Cd, float* Ad, float* Bd, int width) {
    // Shared-memory tiles holding the current phase of A and B
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    float res = 0;
    // Walk over the tiles along the shared dimension
    for (int i = 0; i < ceil((float) width / TILE_SIZE); ++i) {
        // Each thread loads one element of the A tile and one of the B tile,
        // padding with zeros outside the matrix
        if (tx + i * TILE_SIZE < width && row < width) {
            tileA[ty][tx] = Ad[row * width + tx + i * TILE_SIZE];
        }
        else {
            tileA[ty][tx] = 0;
        }
        if (ty + i * TILE_SIZE < width && col < width) {
            tileB[ty][tx] = Bd[(ty + i * TILE_SIZE) * width + col];
        }
        else {
            tileB[ty][tx] = 0;
        }
        __syncthreads();

        // Partial dot product over this tile
        for (int k = 0; k < TILE_SIZE; k++) {
            res += tileA[ty][k] * tileB[k][tx];
        }
        __syncthreads();
    }

    if (col < width && row < width) {
        Cd[row * width + col] = res;
    }
}
__global__
void matMultKer(float* Cd, float* Ad, float* Bd, int width) {
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    if (row < width && col < width) {
        float res = 0;
        // Straight dot product reading A and B from global memory
        for (int k = 0; k < width; ++k) {
            res += Ad[row * width + k] * Bd[k * width + col];
        }
        Cd[row * width + col] = res;
    }
}
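Here is roughly how I launch and time the two kernels. This is a simplified sketch rather than my exact host code: error checking is trimmed, the 16x16 block for the simple kernel is just illustrative, and the cudaEvent timing stands in for the timer my real program uses (which reports microseconds).

#include <cstdio>
#include <cuda_runtime.h>

void runGpuMatMult(float* Cd, float* Ad, float* Bd, int width) {
    // Tiled kernel: the block must be TILE_SIZE x TILE_SIZE so the
    // shared-memory tiles line up with the thread indices.
    dim3 tiledBlock(TILE_SIZE, TILE_SIZE);
    dim3 tiledGrid((width + TILE_SIZE - 1) / TILE_SIZE,
                   (width + TILE_SIZE - 1) / TILE_SIZE);

    // Simple kernel: block size here is illustrative.
    dim3 simpleBlock(16, 16);
    dim3 simpleGrid((width + simpleBlock.x - 1) / simpleBlock.x,
                    (width + simpleBlock.y - 1) / simpleBlock.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms = 0.0f;

    cudaEventRecord(start);
    matMultKer<<<simpleGrid, simpleBlock>>>(Cd, Ad, Bd, width);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU Simple MatMult <- %.3f ms\n", ms);

    cudaEventRecord(start);
    tiledMatMult_kernel<<<tiledGrid, tiledBlock>>>(Cd, Ad, Bd, width);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("GPU Tiled MatMult  <- %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}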
Result:
.\build\MatMult.exe 128
Matrix width set to 128
Finished generating random matrices.
CPU Simple MatMult <- 5342us (5.342ms)
GPU Simple MatMult <- 871us (0.871ms)
GPU Tiled MatMult <- 233448us (233.448ms)
Result correct!