cuBLAS matrix multiplication is slower than the naive one

Hi. I recently started learning CUDA programming.
I have a question about the relative performance of different matrix multiplication implementations. When I compared three versions, a naive kernel, a tiled (shared-memory) kernel, and cuBLAS, I got some puzzling results.

Here is my code related to the issue.
<matrixmultiplication.cuh>

#pragma once

#include "cublas_v2.h"
#include "cuda_runtime.h"
#include <iostream>

#define TILE_WIDTH 4

__global__ void MatrixMulKernel(const float *M, const float *N, float *P,
                                const int width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int column = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < width && column < width) {
    float Pvalue = 0.0f;
    for (int k = 0; k < width; ++k) {
      Pvalue += M[row * width + k] * N[k * width + column];
    }
    P[row * width + column] = Pvalue;
  }
}

__global__ void TiledMatrixMulKernel(const float *M, const float *N, float *P,
                                     const int width) {
  __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int row = by * TILE_WIDTH + ty;
  int column = bx * TILE_WIDTH + tx;

  float Pvalue = 0.0f;
  for (int ph = 0; ph < width / TILE_WIDTH; ++ph) {
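    // Each thread stages one element of the M tile and one element of the N
    // tile into shared memory; the barrier below ensures both tiles are fully
    // loaded before any thread reads them.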
    Mds[ty][tx] = M[row * width + ph * TILE_WIDTH + tx];
    Nds[ty][tx] = N[(ph * TILE_WIDTH + ty) * width + column];
    __syncthreads();

    for (int k = 0; k < TILE_WIDTH; k++) {
      Pvalue += Mds[ty][k] * Nds[k][tx];
    }
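    // Second barrier: prevents any thread from overwriting the tiles for the
    // next phase while others are still reading the current ones.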
    __syncthreads();
  }
  if (row < width && column < width) {
    P[row * width + column] = Pvalue;
  }
}

void testMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&B_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&C_d, Width * Width * sizeof(float));

  cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);

  dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  float milliseconds = 0.0;
  cudaEventElapsedTime(&milliseconds, start, stop);

  std::cout << "Elapsed time for testMatrixMulKernel : " << milliseconds
            << '\n';

  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  free(A_h);
  free(B_h);
}

void testTiledMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&B_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&C_d, Width * Width * sizeof(float));

  cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);

  dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  TiledMatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  float milliseconds = 0.0;
  cudaEventElapsedTime(&milliseconds, start, stop);

  std::cout << "Elapsed time for testTiledMatrixMulKernel : " << milliseconds
            << '\n';

  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  free(A_h);
  free(B_h);
}

void testCuBLASMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  cudaMalloc((void **)&A_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&B_d, Width * Width * sizeof(float));
  cudaMalloc((void **)&C_d, Width * Width * sizeof(float));

  cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
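  // Note: cuBLAS assumes column-major storage, so with these row-major
  // buffers this call effectively computes B*A when C is read back row-major;
  // for square matrices the FLOP count, and hence the timing, matches A*B.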
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Width, Width, Width, &alpha, A_d, Width, B_d, Width, &beta, C_d, Width);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  float milliseconds = 0.0;
  cudaEventElapsedTime(&milliseconds, start, stop);

  std::cout << "Elapsed time for testCuBLASMatrixMulKernel : " << milliseconds
            << '\n';

  cublasDestroy(handle);
  cudaFree(A_d);
  cudaFree(B_d);
  cudaFree(C_d);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  free(A_h);
  free(B_h);
}

<main.cu>

#include <iostream>
#include "matrixmultiplication.cuh"

int main(int argc, const char* argv[]) {
	cudaDeviceSynchronize();
	const int Width = 16;
	std::cout << "Arithmetic Intensity : " << 1.0 / 3.0 * float(Width) << '\n';
	testMatrixMulKernel(Width);
	testTiledMatrixMulKernel(Width);
	testCuBLASMatrixMulKernel(Width);
	return 0;
}

The result is:

Arithmetic Intensity : 5.33333
Elapsed time for testMatrixMulKernel : 0.004096
Elapsed time for testTiledMatrixMulKernel : 0.004128
Elapsed time for testCuBLASMatrixMulKernel : 0.005888

testMatrixMulKernel is actually the fastest of the three! That seems really strange.
The parameters Width = 4096, TILE_WIDTH = 64 also give a strange result:

Arithmetic Intensity : 1365.33
Elapsed time for testMatrixMulKernel : 0.002048
Elapsed time for testTiledMatrixMulKernel : 0.002048
Elapsed time for testCuBLASMatrixMulKernel : 16.6339

Finally, here is the GPU I ran the code on:

NVIDIA GeForce RTX 3060

The elapsed time for the naive matrix multiplication decreased even though the matrix width increased enormously, so I suspect either the kernel logic or my method of measuring elapsed time is broken. I have already checked the results of the multiplication for some small sizes, so I don't think the multiplication logic itself is the problem.

Could you give me some advice or point out what I did wrong?
Thank you.

The first test case (TILE_WIDTH 4, Width 16) is too small to be interesting. Your measurements are in the low single-digit microsecond range, which is essentially just the CUDA kernel launch overhead.

Rerun your initial test case with a TILE_WIDTH of 32 and a Width of 1024.

A TILE_WIDTH of 64 creates an illegal launch configuration in CUDA:

dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

That is asking for a thread block of 64 x 64 = 4096 threads, which exceeds the limit of 1024 threads per block. You are not doing proper CUDA error checking, so you don't know that CUDA is not running that kernel and is instead giving you an error; your events end up timing a kernel that never executed.
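
A minimal sketch of such a check right after the launch (error handling kept deliberately short):

MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
cudaError_t err = cudaGetLastError();  // reports invalid launch configurations
if (err != cudaSuccess) {
  std::cerr << "Launch failed: " << cudaGetErrorString(err) << '\n';
}
err = cudaDeviceSynchronize();         // reports errors raised during execution
if (err != cudaSuccess) {
  std::cerr << "Kernel failed: " << cudaGetErrorString(err) << '\n';
}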

I really appreciate your comment. Following it, I added some sanity-check code (including checks for cuBLAS) based on Lei Mao's great post: Proper CUDA Error Checking - Lei Mao's Log Book.

I was indeed not aware of the maximum number of threads per block.
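
For anyone else hitting this, the limit can be queried at runtime; a minimal sketch using cudaGetDeviceProperties (assuming device 0):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
std::cout << "maxThreadsPerBlock : " << prop.maxThreadsPerBlock << '\n';  // 1024 on an RTX 3060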

However, some problems remain… Here is the updated code, followed by the results. Note that I have only wrapped the API calls with the error-checking functions.
<errors.cuh>

#pragma once
#include "cublas_api.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include <iostream>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	if (err != cudaSuccess) {
		std::cerr << "CUDA Runtime Error at: " << file << ":" << line << '\n';
		std::cerr << cudaGetErrorString(err) << " " << func << '\n';
	}
}

#define CHECK_CUBLAS_STATUS(val) checkCuBLASStatus((val), #val, __FILE__, __LINE__)
void checkCuBLASStatus(cublasStatus_t status, const char* const func, const char* const file, const int line) {
	if (status != CUBLAS_STATUS_SUCCESS) {
		std::cerr << "CUBLAS Error at : " << file << ":" << line << '\n';
		std::cerr << cublasGetStatusString(status) << " " << func << '\n';
	}
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
void checkLast(const char* const file, const int line) {
	const auto err = cudaGetLastError();
	if (err != cudaSuccess) {
		std::cerr << "CUDA Runtime Error at: " << file << ":" << line << '\n';
		std::cerr << cudaGetErrorString(err) << '\n';
	}
}

<matrixmultiplication.cuh>

#pragma once

#include "cublas_v2.h"
#include "cuda_runtime.h"
#include "errors.cuh"
#include <iostream>

#define TILE_WIDTH 4

__global__ void MatrixMulKernel(const float *M, const float *N, float *P,
                                const int width) {
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  const int column = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < width && column < width) {
    float Pvalue = 0.0f;
    for (int k = 0; k < width; ++k) {
      Pvalue += M[row * width + k] * N[k * width + column];
    }
    P[row * width + column] = Pvalue;
  }
}

__global__ void TiledMatrixMulKernel(const float *M, const float *N, float *P,
                                     const int width) {
  __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

  const int bx = blockIdx.x;
  const int by = blockIdx.y;
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int row = by * TILE_WIDTH + ty;
  const int column = bx * TILE_WIDTH + tx;

  float Pvalue = 0.0f;
  for (int ph = 0; ph < width / TILE_WIDTH; ++ph) {
    Mds[ty][tx] = M[row * width + ph * TILE_WIDTH + tx];
    Nds[ty][tx] = N[(ph * TILE_WIDTH + ty) * width + column];
    __syncthreads();

    for (int k = 0; k < TILE_WIDTH; k++) {
      Pvalue += Mds[ty][k] * Nds[k][tx];
    }
    __syncthreads();
  }
  if (row < width && column < width) {
    P[row * width + column] = Pvalue;
  }
}

void testMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  CHECK_CUDA_ERROR(cudaMalloc((void **)&A_d, Width * Width * sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void **)&B_d, Width * Width * sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void **)&C_d, Width * Width * sizeof(float)));

  CHECK_CUDA_ERROR(cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice));

  const dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  const dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

  cudaEvent_t start, stop;
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));

  CHECK_CUDA_ERROR(cudaEventRecord(start));
  MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  CHECK_CUDA_ERROR(cudaEventRecord(stop));

  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  float milliseconds = 0.0;
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));

  std::cout << "Elapsed time for testMatrixMulKernel : " << milliseconds
            << '\n';

  CHECK_CUDA_ERROR(cudaFree(A_d));
  CHECK_CUDA_ERROR(cudaFree(B_d));
  CHECK_CUDA_ERROR(cudaFree(C_d));
  CHECK_CUDA_ERROR(cudaEventDestroy(start));
  CHECK_CUDA_ERROR(cudaEventDestroy(stop));

  free(A_h);
  free(B_h);

  CHECK_LAST_CUDA_ERROR();
}

void testTiledMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  CHECK_CUDA_ERROR(cudaMalloc((void **)&A_d, Width * Width * sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void **)&B_d, Width * Width * sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void **)&C_d, Width * Width * sizeof(float)));

  CHECK_CUDA_ERROR(cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice));

  const dim3 dimGrid(Width / TILE_WIDTH, Width / TILE_WIDTH);
  const dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

  cudaEvent_t start, stop;
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));

  CHECK_CUDA_ERROR(cudaEventRecord(start));
  TiledMatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
  CHECK_CUDA_ERROR(cudaEventRecord(stop));

  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  float milliseconds = 0.0;
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));

  std::cout << "Elapsed time for testTiledMatrixMulKernel : " << milliseconds
            << '\n';

  CHECK_CUDA_ERROR(cudaFree(A_d));
  CHECK_CUDA_ERROR(cudaFree(B_d));
  CHECK_CUDA_ERROR(cudaFree(C_d));
  CHECK_CUDA_ERROR(cudaEventDestroy(start));
  CHECK_CUDA_ERROR(cudaEventDestroy(stop));

  free(A_h);
  free(B_h);
}

void testCuBLASMatrixMulKernel(const int Width) {
  float *A_h = (float *)malloc(Width * Width * sizeof(float));
  float *B_h = (float *)malloc(Width * Width * sizeof(float));
  for (auto i = 0; i < Width * Width; i++) {
    A_h[i] = 1.0 * float(i) / 4.0;
    B_h[i] = 2.0 * float(i) / 4.0;
  }
  float *A_d, *B_d, *C_d;
  CHECK_CUDA_ERROR(cudaMalloc((void **)&A_d, Width * Width * sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void **)&B_d, Width * Width * sizeof(float)));
  CHECK_CUDA_ERROR(cudaMalloc((void **)&C_d, Width * Width * sizeof(float)));

  CHECK_CUDA_ERROR(cudaMemcpy(A_d, A_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice));
  CHECK_CUDA_ERROR(cudaMemcpy(B_d, B_h, Width * Width * sizeof(float), cudaMemcpyHostToDevice));

  cublasHandle_t handle;
  CHECK_CUBLAS_STATUS(cublasCreate(&handle));
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cudaEvent_t start, stop;
  CHECK_CUDA_ERROR(cudaEventCreate(&start));
  CHECK_CUDA_ERROR(cudaEventCreate(&stop));

  CHECK_CUDA_ERROR(cudaEventRecord(start));
  CHECK_CUBLAS_STATUS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, Width, Width, Width, &alpha, A_d, Width, B_d, Width, &beta, C_d, Width));
  CHECK_CUDA_ERROR(cudaEventRecord(stop));

  CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
  float milliseconds = 0.0;
  CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));

  std::cout << "Elapsed time for testCuBLASMatrixMulKernel : " << milliseconds
            << '\n';

  CHECK_CUBLAS_STATUS(cublasDestroy(handle));
  CHECK_CUDA_ERROR(cudaFree(A_d));
  CHECK_CUDA_ERROR(cudaFree(B_d));
  CHECK_CUDA_ERROR(cudaFree(C_d));
  CHECK_CUDA_ERROR(cudaEventDestroy(start));
  CHECK_CUDA_ERROR(cudaEventDestroy(stop));

  free(A_h);
  free(B_h);
}


The results with (Width, TILE_WIDTH) = (1024, 4) are approximately:

Elapsed time for testMatrixMulKernel : 9.71571
Elapsed time for testTiledMatrixMulKernel : 14.6685
Elapsed time for testCuBLASMatrixMulKernel : 0.293888

The gap between the naive and tiled matrix multiplications seems weird: tiling is slower. For (Width, TILE_WIDTH) = (1024, 16) the performance gets somewhat better, but:

Elapsed time for testMatrixMulKernel : 2.73088
Elapsed time for testTiledMatrixMulKernel : 2.07053
Elapsed time for testCuBLASMatrixMulKernel : 0.293472

This contrasts with the theoretical speedup of the tiled version over the naive one (around a factor of TILE_WIDTH, as far as I know).
Could you point out what I am missing in this code or my understanding?
Thank you for your help again.

Edit: For small Width values such as 64, 128, and 256 (with TILE_WIDTH = 4), the results are still not as good as I expected.

There is no reason for much difference between these two numbers:

Elapsed time for testMatrixMulKernel : 9.71571

and

Elapsed time for testMatrixMulKernel : 2.73088

The only thing you are changing is TILE_WIDTH, and that should have no impact on the first kernel. You are running into a benchmarking issue. To do good benchmarking, you should run a test many times and do some statistical work on the results. As a shortcut, I often run a kernel once without timing it, then do a cudaDeviceSynchronize(), then run the kernel again, timing it. This usually gives me pretty stable results.
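
A minimal sketch of that warm-up pattern, reusing the buffers and events from the test function above (only the scaffolding around the launches differs from your code):

// Warm-up launch: absorbs one-time initialization cost, not timed.
MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
cudaDeviceSynchronize();

// Timed launch of the same kernel.
cudaEventRecord(start);
MatrixMulKernel<<<dimGrid, dimBlock>>>(A_d, B_d, C_d, Width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);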

The GPU caches will cause naive estimates of performance to be incorrect. The tiled matrix multiplication code is evidently not 16 times faster than the naive code for a tile width of 16, and one reason for this is the GPU caches. (I'm not even sure the fundamental premise is valid, but I digress…)
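
To spell out that premise: the naive kernel issues about 2·Width^3 global loads (each of the Width^2 threads reads 2·Width values), while the tiled kernel issues about 2·Width^3/TILE_WIDTH (each input element is loaded once per tile phase instead of once per inner-loop iteration). The factor-of-TILE_WIDTH speedup would only follow if every naive load actually went to DRAM, and the L1/L2 caches ensure it does not.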

Actually, in testMatrixMulKernel I set the grid and block sizes to (Width/TILE_WIDTH, Width/TILE_WIDTH) and (TILE_WIDTH, TILE_WIDTH), so I thought changing TILE_WIDTH would change both and therefore affect the first kernel as well. Does changing the grid and block sizes really make no difference for the naive kernel?

Sorry for the bad variable naming; I simply set the block size equal to TILE_WIDTH for simplicity.

Also, I do call cudaDeviceSynchronize(); before each kernel run. Indeed, my main function is:
<main.cu>

#include "matrixmultiplication.cuh"

int main(int argc, const char* argv[]) {
	const int Width = 1024;
	cudaDeviceSynchronize();
	// printDeviceInfo();
	testMatrixMulKernel(Width);
	cudaDeviceSynchronize();
	testTiledMatrixMulKernel(Width);
	cudaDeviceSynchronize();
	testCuBLASMatrixMulKernel(Width);
	return 0;
}

Finally, even allowing for the GPU caches, I am still suspicious of the gap between MatrixMulKernel and TiledMatrixMulKernel; it is much further from the expected ratio than I would have thought.

Again, I really appreciate your helpful comments. Thank you!

You are correct about that. My mistake. The block size matters to some degree: with TILE_WIDTH = 4 a block has only 16 threads, half a warp, so warp lanes go unused and global loads coalesce poorly, which accounts for much of the (1024, 4) numbers.

When I run your original code with a Width of 1024 and a TILE_WIDTH of 32 on an L4 GPU, I get:

Elapsed time for testMatrixMulKernel : 1.25616
Elapsed time for testTiledMatrixMulKernel : 0.911328

When I recompile the code with -Xptxas -dlcm=cg, which disables L1 caching of global loads, I get:

Elapsed time for testMatrixMulKernel : 2.55011
Elapsed time for testTiledMatrixMulKernel : 0.91232

So I am certain that caches play a role.
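
For reference, a compile line with that option might look like this (file and output names are placeholders):

nvcc -O3 -Xptxas -dlcm=cg -o matmul main.cu -lcublas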

Thank you again. Lacking knowledge about compilation options, I had built the code without any special flags. I will study your comments carefully.

Thank you!
