Hi all,
I wrote a simple kernel that sums the K elements of each row of matrix A (of shape MxK) and stores that row sum in every column of the corresponding row of matrix C (of shape MxN).
__global__ void simple_kernel(float* A, float* C, const int M, const int N, const int K) {
    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    const int col = threadIdx.y + blockDim.y * blockIdx.y;
    if (row < M && col < N) {
        float acc = 0.0f;
        for (int p = 0; p < K; p++) {
            acc += A[row * K + p];
        }
        C[row * N + col] = acc;
    }
}
However, I’ve observed that the kernel’s performance drops significantly when the inner dimension K is divisible by 32 or 16: roughly a 2x slowdown in the former case and a 1.5x slowdown in the latter. I understand that the kernel’s memory access pattern is not optimal and lacks coalescing, but I’m curious why the divisibility of K in particular has such a notable impact. Despite reviewing the CUDA Programming Guide and the Kernel Profiling Guide, I haven’t found a clear explanation for this phenomenon. The issue isn’t shared-memory bank conflicts, since the kernel doesn’t use shared memory at all, and it doesn’t seem to be explained by tile or wave quantization effects either (please correct me if I’m mistaken). After profiling the kernel, I noticed that when K is divisible by 32 there is 2x-3x more data transferred between L1<->L2 and between L2 and device memory.

Still, I don’t understand why this happens. I’m eager to deepen my understanding of CUDA and NVIDIA hardware, so any insights would be greatly appreciated. Thanks in advance!
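To make the access pattern concrete, the following standalone sketch (a hypothetical helper, not part of the benchmark below) prints the byte offsets into A that the 32 lanes of the first warp read in a single loop iteration of simple_kernel. With a 16x16 block and threadIdx.x indexing rows, a warp covers 16 distinct rows, so the offsets are strided by K * sizeof(float) bytes:

#include <stdio.h>

// Hypothetical helper: emulate the first warp of simple_kernel at inner-loop
// iteration p. In a 16x16 block, lane l has threadIdx.x = l % 16 and
// threadIdx.y = l / 16, and row = threadIdx.x for block (0, 0).
int main() {
    const int K = 16 * 200;  // compare 16 * 200 vs. 16 * 200 + 1
    const int p = 0;         // first iteration of the inner loop
    for (int lane = 0; lane < 32; lane++) {
        const int row = lane % 16;
        printf("lane %2d reads byte offset %zu\n", lane, sizeof(float) * (size_t)(row * K + p));
    }
    return 0;
}

When K is a multiple of 32, each pair of those 16 distinct offsets is a multiple of 128 bytes apart, i.e. they all share the same offset within a cache line; with K = 16 * 200 + 1 the offsets are staggered relative to cache-line boundaries.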
The full code is listed below and was tested on an RTX 3090:
#include <cstdlib>
#include <cfloat>
#include <stdio.h>

#define NITER 1
#define BLOCKSIZE 16
#define MDIM (BLOCKSIZE * 200)
#define NDIM (BLOCKSIZE * 200)
// the kernel is 2-3x faster if KDIM is not divisible by 16/32
// change KDIM to (BLOCKSIZE * 200 + 1/2/3/4/5/6/7/8/9/10/11/12/13/14/15) to see the effect
#define KDIM (BLOCKSIZE * 200)

inline int cdiv(const int a, const int b) { return (a + b - 1) / b; }
void init_random(float* data, int size) {
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void init_const(float* data, const int size, const float value) {
    for (int i = 0; i < size; ++i)
        data[i] = value;
}
__global__ void simple_kernel(float* A, float* C, const int M, const int N, const int K) {
    const int row = threadIdx.x + blockDim.x * blockIdx.x;  // threadIdx.x indexes rows
    const int col = threadIdx.y + blockDim.y * blockIdx.y;
    if (row < M && col < N) {
        float acc = 0.0f;
        for (int p = 0; p < K; p++) {  // every thread walks one full row of A
            acc += A[row * K + p];
        }
        C[row * N + col] = acc;  // all threads in a row of C store the same sum
    }
}
int main() {
    const int M = MDIM, N = NDIM, K = KDIM;
    const int A_nelem = M * K;
    const int C_nelem = M * N;
    const int A_memsize = sizeof(float) * A_nelem;
    const int C_memsize = sizeof(float) * C_nelem;

    float *A_host, *C_host;
    cudaMallocHost(&A_host, A_memsize);
    cudaMallocHost(&C_host, C_memsize);
    init_random(A_host, A_nelem);
    init_const(C_host, C_nelem, 0.0f);

    float *A_device, *C_device;
    cudaMalloc(&A_device, A_memsize);
    cudaMalloc(&C_device, C_memsize);
    cudaMemcpy(A_device, A_host, A_memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(C_device, C_host, C_memsize, cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    dim3 threads(BLOCKSIZE, BLOCKSIZE);
    dim3 grid(cdiv(M, BLOCKSIZE), cdiv(N, BLOCKSIZE));

    float avg_elapsed_time_ms = 0.0f;
    float min_elapsed_time_ms = FLT_MAX;
    float max_elapsed_time_ms = 0.0f;
    float elapsed_time_ms;
    for (int i = 0; i < NITER; i++) {
        cudaEventRecord(start);
        simple_kernel<<<grid, threads>>>(A_device, C_device, M, N, K);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed_time_ms, start, stop);
        min_elapsed_time_ms = elapsed_time_ms < min_elapsed_time_ms ? elapsed_time_ms : min_elapsed_time_ms;
        max_elapsed_time_ms = elapsed_time_ms > max_elapsed_time_ms ? elapsed_time_ms : max_elapsed_time_ms;
        avg_elapsed_time_ms += elapsed_time_ms;
    }
    avg_elapsed_time_ms = avg_elapsed_time_ms / (float)NITER;

    // GEMM-style FLOP count (one multiply + one add per element); the kernel
    // itself only performs adds, so absolute GFLOPS are inflated by 2x, but
    // relative comparisons between runs are unaffected
    const double FLOP = 2 * (double)M * (double)N * (double)K;
    const double GFLOP = FLOP * 1e-9;
    const double GFLOPS_AVG = GFLOP / (avg_elapsed_time_ms * 1e-3);
    const double GFLOPS_MIN = GFLOP / (max_elapsed_time_ms * 1e-3);  // slowest run
    const double GFLOPS_MAX = GFLOP / (min_elapsed_time_ms * 1e-3);  // fastest run

    printf("AVG GFLOPS = %.2f\n", GFLOPS_AVG);
    printf("MAX GFLOPS = %.2f\n", GFLOPS_MAX);
    printf("MIN GFLOPS = %.2f\n", GFLOPS_MIN);
    printf("AVG exec. time = %.2fms\n", avg_elapsed_time_ms);
    printf("MAX exec. time = %.2fms\n", max_elapsed_time_ms);
    printf("MIN exec. time = %.2fms\n", min_elapsed_time_ms);
    cudaFreeHost(A_host);
    cudaFreeHost(C_host);
    cudaFree(A_device);
    cudaFree(C_device);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
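For anyone who wants to isolate the effect, here is an untested sketch of a coalesced variant (the kernel name and launch configuration are mine, not part of the benchmark above): one warp handles one row, so consecutive lanes read consecutive floats of A, and the partial sums are combined with __shfl_down_sync:

// Untested sketch, not my original kernel: one warp per row of A, lanes read
// consecutive floats (coalesced), then a warp-shuffle tree reduction combines
// the 32 partial sums in lane 0.
__global__ void rowsum_warp(const float* A, float* C, const int M, const int N, const int K) {
    const int warps_per_block = blockDim.x / 32;
    const int warp_id = blockIdx.x * warps_per_block + threadIdx.x / 32;
    const int lane = threadIdx.x % 32;
    if (warp_id >= M) return;  // whole warp exits together, no divergence
    float acc = 0.0f;
    for (int p = lane; p < K; p += 32)  // coalesced: lanes touch consecutive addresses
        acc += A[warp_id * K + p];
    for (int offset = 16; offset > 0; offset /= 2)  // warp-level tree reduction
        acc += __shfl_down_sync(0xffffffff, acc, offset);
    if (lane == 0)  // lane 0 now holds the full row sum
        for (int col = 0; col < N; col++)
            C[warp_id * N + col] = acc;
}

It would be launched with something like rowsum_warp<<<cdiv(M * 32, 256), 256>>>(A_device, C_device, M, N, K). I'd be curious whether the K-divisibility effect disappears once the loads are coalesced.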