While profiling the shared-memory access instructions of my custom CUDA SGEMM kernel, I found roughly 167,772,160 load instructions versus 16,777,216 store instructions. Theoretically I expected a load-to-store ratio of 32:1, but the measured ratio is only 10:1. What could be causing this discrepancy?
The store-instruction count matches my expectation; it is the load count that is lower than predicted.
// Tiled SGEMM: C = A * B for row-major matrices.
// Each thread block computes a STEP x STEP tile of C, where
// STEP = BLOCK_SIZE * STRIDE, and each thread accumulates a
// STRIDE x STRIDE register sub-tile of that block tile.
// Expected launch: block = (BLOCK_SIZE, BLOCK_SIZE),
//                  grid  = (N / STEP, M / STEP).
// NOTE(review): there are no bounds checks, so M, N and K are assumed to be
// exact multiples of STEP — confirm at the call site.
template <unsigned int BLOCK_SIZE, unsigned int STRIDE>
__global__ void cuda_sgemm(float *A_ptr, float *B_ptr, float *C_ptr, const int M, const int N, const int K)
{
    constexpr int STEP = BLOCK_SIZE * STRIDE;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Top-left corners of this block's row panel of A and column panel of B.
    float *A_tile_base = A_ptr + STEP * blockIdx.y * K;
    float *B_tile_base = B_ptr + STEP * blockIdx.x;

    __shared__ float A_shared[STEP][STEP];
    __shared__ float B_shared[STEP][STEP];

    // Per-thread accumulators for the STRIDE x STRIDE output sub-tile.
    float acc[STRIDE][STRIDE] = {0.0f};

    // March along the K dimension one STEP-wide tile at a time.
    for (int kTile = 0; kTile < K; kTile += STEP) {
        // Stage one STEP x STEP tile of A and of B into shared memory;
        // each thread copies STRIDE x STRIDE elements of each tile.
        for (int mi = 0; mi < STRIDE; ++mi) {
            const int row = ty + mi * BLOCK_SIZE;
            for (int ni = 0; ni < STRIDE; ++ni) {
                const int col = tx + ni * BLOCK_SIZE;
                A_shared[row][col] = A_tile_base[row * K + col + kTile];
                B_shared[row][col] = B_tile_base[(row + kTile) * N + col];
            }
        }
        __syncthreads(); // tiles fully staged before anyone reads them

        // Rank-STEP update of the accumulators from the shared tiles.
        //#pragma unroll
        for (int mi = 0; mi < STRIDE; ++mi) {
            for (int ni = 0; ni < STRIDE; ++ni) {
                for (int kk = 0; kk < STEP; ++kk) {
                    acc[mi][ni] += A_shared[ty + mi * BLOCK_SIZE][kk] *
                                   B_shared[kk][tx + ni * BLOCK_SIZE];
                }
            }
        }
        __syncthreads(); // keep tiles intact until every thread is done
    }

    // Write the finished STRIDE x STRIDE sub-tile back to global memory.
    float *C_tile_base = C_ptr + N * blockIdx.y * STEP + blockIdx.x * STEP;
    for (int mi = 0; mi < STRIDE; ++mi) {
        for (int ni = 0; ni < STRIDE; ++ni) {
            C_tile_base[(ty + mi * BLOCK_SIZE) * N + (tx + ni * BLOCK_SIZE)] = acc[mi][ni];
        }
    }
}
// Launches cuda_sgemm to compute C = A * B for row-major matrices
// (A is m x k, B is k x n, C is m x n). All pointers are device pointers.
// NOTE(review): the kernel has no bounds checks, so m, n and k must be
// multiples of BLOCK * STRIDE (= 32 here) — confirm with callers.
void MMult_v1(int m, int n, int k, float *d_A,
              float *d_B, float *d_C) {
  constexpr int BLOCK = 16;
  constexpr int STRIDE = 2;
  constexpr int STEP = BLOCK * STRIDE; // edge of the C tile one block computes
  dim3 block(BLOCK, BLOCK);
  // Inside the kernel, blockIdx.x walks the N (column) dimension and
  // blockIdx.y walks the M (row) dimension, so grid.x must be sized from n
  // and grid.y from m. (The original swapped them, which is only correct
  // when m == n.) Also ceil-divide by the full tile edge STEP: the original
  // "(m + BLOCK - 1) / BLOCK / STRIDE" under-counts blocks for sizes that
  // are not multiples of STEP.
  dim3 grid((n + STEP - 1) / STEP, (m + STEP - 1) / STEP);
  cuda_sgemm<BLOCK, STRIDE><<<grid, block>>>(d_A, d_B, d_C, m, n, k);
}
