Hi. I recently started learning CUDA programming.
I have a question about the relative performance of different matrix multiplication implementations. I compared three versions: a naive kernel, a tiled (shared-memory) kernel, and cuBLAS, and I got some results that look strange to me.
Here is the code related to the issue.
<matrixmultiplication.cuh>
#pragma once
#include "cublas_v2.h"
#include "cuda_runtime.h"
#include <iostream>
#define TILE_WIDTH 4
__global__ void MatrixMulKernel(const float *M, const float *N, float *P,
                                const int width) {
  // Naive version: each thread computes one element of P.
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int column = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < width && column < width) {
    float Pvalue = .0;
    for (int k = 0; k < width; ++k) {
      Pvalue += M[row * width + k] * N[k * width + column];
    }
    P[row * width + column] = Pvalue;
  }
}

__global__ void TiledMatrixMulKernel(const float *M, const float *N, float *P,
                                     const int width) {
  // Tiled version: each block stages TILE_WIDTH x TILE_WIDTH tiles of M and N
  // in shared memory and accumulates the partial products phase by phase.
  __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int row = by * TILE_WIDTH + ty;
  int column = bx * TILE_WIDTH + tx;
  float Pvalue = .0;
  for (int ph = 0; ph < width / TILE_WIDTH; ++ph) {
    Mds[ty][tx] = M[row * width + ph * TILE_WIDTH + tx];
    Nds[ty][tx] = N[(ph * TILE_WIDTH + ty) * width + column];
    __syncthreads();
    for (int k = 0; k < TILE_WIDTH; k++) {
      Pvalue += Mds[ty][k] * Nds[k][tx];
    }
    __syncthreads();
  }
  if (row < width && column < width) {
    P[row * width + column] = Pvalue;
  }
}

void testMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&B_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&C_d, Width * Width * sizeof(float));
  cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float milliseconds = 0.0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  std::cout << "Elapsed time for testMatrixMulKernel : " << milliseconds
            << '\n';
  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  free(A_h);
  free(B_h);
}

void testTiledMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&B_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&C_d, Width * Width * sizeof(float));
  cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  TiledMatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float milliseconds = 0.0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  std::cout << "Elapsed time for testTiledMatrixMulKernel : " << milliseconds
            << '\n';
  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  free(A_h);
  free(B_h);
}

void testCuBLASMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&B_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&C_d, Width * Width * sizeof(float));
  cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Width, Width, Width, &alpha,
              A_d, Width, B_d, Width, &beta, C_d, Width);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float milliseconds = 0.0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  std::cout << "Elapsed time for testCuBLASMatrixMulKernel : " << milliseconds
            << '\n';
  cublasDestroy(handle);
  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  free(A_h);
  free(B_h);
}
<main.cu>
#include <iostream>

#include "matrixmultiplication.cuh"

int main(int argc, const char *argv[]) {
  cudaDeviceSynchronize();
  const int Width = 16;
  std::cout << "Arithmetic Intensity : " << 1.0 / 3.0 * float(Width) << '\n';
  testMatrixMulKernel(Width);
  testTiledMatrixMulKernel(Width);
  testCuBLASMatrixMulKernel(Width);
  return 0;
}
The output is then:
Arithmetic Intensity : 5.33333
Elapsed time for testMatrixMulKernel : 0.004096
Elapsed time for testTiledMatrixMulKernel : 0.004128
Elapsed time for testCuBLASMatrixMulKernel : 0.005888
The naive testMatrixMulKernel is even the fastest of the three! That seems really strange to me.
The parameters Width = 4096 and TILE_WIDTH = 64 also give strange results:
Arithmetic Intensity : 1365.33
Elapsed time for testMatrixMulKernel : 0.002048
Elapsed time for testTiledMatrixMulKernel : 0.002048
Elapsed time for testCuBLASMatrixMulKernel : 16.6339
Finally, here is the GPU I ran the code on:
NVIDIA GeForce RTX 3060
The elapsed time for the naive matrix multiplication actually decreased even though the matrix width was increased enormously. I suspect that either the kernel code or the way I measure the elapsed time is not working as intended. I have already checked the results of the multiplication for some small sizes, so I don't think the problem is in the matrix multiplication logic itself.
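For reference, my correctness check for small sizes was roughly of the following form (a simplified sketch rather than my exact code; the file name, the referenceMatMul helper, and the 1e-3 tolerance are just illustrative):

// check_small.cu -- rough sketch of how I verified small sizes.
#include <cmath>
#include <cstdlib>
#include <iostream>

#include "matrixmultiplication.cuh"

// Plain CPU reference: P = M * N for square width x width row-major matrices.
static void referenceMatMul(const float *M, const float *N, float *P,
                            const int width) {
  for (int row = 0; row < width; ++row) {
    for (int col = 0; col < width; ++col) {
      float acc = 0.0f;
      for (int k = 0; k < width; ++k) {
        acc += M[row * width + k] * N[k * width + col];
      }
      P[row * width + col] = acc;
    }
  }
}

int main() {
  const int Width = 16;  // small size, same initialization as in the tests
  const size_t bytes = size_t(Width) * Width * sizeof(float);
  float *A_h = (float *)malloc(bytes);
  float *B_h = (float *)malloc(bytes);
  float *C_h = (float *)malloc(bytes);
  float *C_ref = (float *)malloc(bytes);
  for (int i = 0; i < Width * Width; ++i) {
    A_h[i] = 1.0f * float(i) / 4.0f;
    B_h[i] = 2.0f * float(i) / 4.0f;
  }

  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, bytes);
  cudaMalloc((void **)&B_d, bytes);
  cudaMalloc((void **)&C_d, bytes);
  cudaMemcpy(A_d, A_h, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, bytes, cudaMemcpyHostToDevice);

  dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  cudaMemcpy(C_h, C_d, bytes, cudaMemcpyDeviceToHost);  // also synchronizes

  referenceMatMul(A_h, B_h, C_ref, Width);

  // Compare with a relative tolerance, since the entries grow quickly.
  int mismatches = 0;
  for (int i = 0; i < Width * Width; ++i) {
    float denom = std::fabs(C_ref[i]) > 1.0f ? std::fabs(C_ref[i]) : 1.0f;
    if (std::fabs(C_h[i] - C_ref[i]) / denom > 1e-3f) {
      ++mismatches;
    }
  }
  std::cout << (mismatches == 0 ? "PASS" : "FAIL") << " (" << mismatches
            << " mismatches)\n";

  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  free(A_h);
  free(B_h);
  free(C_h);
  free(C_ref);
  return mismatches == 0 ? 0 : 1;
}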
Could you kindly give me some advice or point out what I did wrong?
Thank you.