Shared memory matrix multiplication not working

I am having trouble implementing this matrix multiplication with shared memory. Any help is appreciated.

#include<iostream>
#include<assert.h>
#include<cuda_runtime.h>
#include<cuda.h>

#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define REP(i,n) for(int i=0;i<(n);++i)

#define M 512
#define K 512
#define N 512

#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 32

#define TM 4
#define TN 4

// alpha=1, beta=0
__global__
void gemm(const float * A, const float * B, float * C) {
	// 2D thread indices within a block
	const int threadRow = threadIdx.y;
	const int threadCol = threadIdx.x;

	// Global output coordinates (x = row of C, y = column of C)
	const int x = blockIdx.y * BLOCK_SIZE_Y + threadRow;
	const int y = blockIdx.x * BLOCK_SIZE_X + threadCol;

	__shared__ float As[BLOCK_SIZE_X][BLOCK_SIZE_Y];
	__shared__ float Bs[BLOCK_SIZE_Y][BLOCK_SIZE_X];

	if (x < M && y < N) {
		float threadResult = 0.0;

		for (int bkIdx = 0; bkIdx < K / BLOCK_SIZE_Y; ++bkIdx) {
			As[threadRow][threadCol] = A[x * K + (bkIdx * BLOCK_SIZE_Y + threadCol)];
			Bs[threadCol][threadRow] = B[(bkIdx * BLOCK_SIZE_Y + threadCol) * N + y];

			__syncthreads();

			for (int dotIdx = 0; dotIdx < BLOCK_SIZE_Y; ++dotIdx) {
				threadResult += As[threadRow][dotIdx] * Bs[dotIdx][threadCol];
			}

			__syncthreads();
		}

		C[x * N + y] = threadResult;
	}
}



int main() {
	cudaError_t err;
	float h_C[M * N], h_A[M * K], h_B[K * N];

	REP(i, M * K) h_A[i] = rand() % 2;
	REP(i, K * N) h_B[i] = rand() % 3;

	float * d_C, * d_A, * d_B;

	err = cudaMalloc((void**)&d_C, M * N * sizeof(float));
	assert(err == cudaSuccess);
	err = cudaMalloc((void**)&d_A, M * K * sizeof(float));
	assert(err == cudaSuccess);
	err = cudaMalloc((void**)&d_B, K * N * sizeof(float));
	assert(err == cudaSuccess);

	err = cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
	assert(err == cudaSuccess);
	err = cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
	assert(err == cudaSuccess);

	assert(M % BLOCK_SIZE_X == 0 && N % BLOCK_SIZE_Y == 0);
	dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE_X), CEIL_DIV(N, BLOCK_SIZE_Y));
	dim3 blockDim(BLOCK_SIZE_X, BLOCK_SIZE_Y);

	gemm<<<gridDim, blockDim>>>(d_A, d_B, d_C);

	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
	assert(err == cudaSuccess);

	err = cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
	assert(err == cudaSuccess);

	// int error = 0.0;
	REP(i, M) {
		REP(j, N) {
			float val = 0.0;
			REP(k, K) val += h_A[i * K + k] * h_B[k * N + j];

			// error += val - h_C[i * N + j];

			assert(h_C[i * N + j] == val);
			//std::cout << h_C[i * N + j] << ' ' << val << "  ";
		}
		// std::cout << std::endl;
	}
	// std::cout << error << std::endl;

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
}

Thank you; I made a silly error in my tiling scheme.
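For the record, I believe the error was in my B-tile load: I was using threadCol for both the row of B and the output column y, and storing Bs transposed relative to how I read it. In the original kernel's indexing (x = output row, y = output column), the corrected loads would look roughly like this:

	As[threadRow][threadCol] = A[x * K + (bkIdx * BLOCK_SIZE_Y + threadCol)];
	Bs[threadRow][threadCol] = B[(bkIdx * BLOCK_SIZE_Y + threadRow) * N + y];
	// the inner product then reads As[threadRow][dotIdx] * Bs[dotIdx][threadCol] unchanged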

My SGEMM kernel now takes about 180 microseconds for M = N = K = 512.
cuBLAS takes roughly 60 microseconds.
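For scale, an SGEMM does 2 * M * N * K floating-point operations, i.e. 2 * 512^3 ≈ 0.27 GFLOP here, so 180 microseconds works out to roughly 1.5 TFLOP/s for my kernel versus roughly 4.5 TFLOP/s for cuBLAS.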

I think the next step is to increase arithmetic intensity per thread.
Would that require a “grid-stride” approach?
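To make concrete what I mean by more work per thread, here is a rough, untested sketch along the lines of the 1D block-tiling step in the worklog: each thread accumulates a TM x 1 column of the block's output tile instead of a single element. BM, BN, BK, and TM below are illustrative values I picked, and I assume M, N, K are multiples of the tile sizes, as in my code above.

#define BM 64 // rows of C per block
#define BN 64 // columns of C per block
#define BK 8  // K-slice width per iteration
#define TM 8  // outputs per thread (a TM x 1 column)

// launch with gridDim(CEIL_DIV(M, BM), CEIL_DIV(N, BN)) and blockDim(BM * BN / TM)
__global__ void sgemm_1d_tiling(const float * A, const float * B, float * C) {
	const int cRow = blockIdx.x;
	const int cCol = blockIdx.y;

	// each thread owns one column of the output tile and TM consecutive rows of it
	const int threadCol = threadIdx.x % BN;
	const int threadRow = threadIdx.x / BN;

	// indices for cooperatively filling the shared tiles
	// (BM * BN / TM == BM * BK == BK * BN == 512 threads, so one element each)
	const int innerRowA = threadIdx.x / BK, innerColA = threadIdx.x % BK;
	const int innerRowB = threadIdx.x / BN, innerColB = threadIdx.x % BN;

	// advance the pointers to this block's tile
	A += cRow * BM * K;
	B += cCol * BN;
	C += cRow * BM * N + cCol * BN;

	__shared__ float As[BM][BK];
	__shared__ float Bs[BK][BN];

	float threadResults[TM] = {0.0f};

	for (int bkIdx = 0; bkIdx < K; bkIdx += BK) {
		As[innerRowA][innerColA] = A[innerRowA * K + innerColA];
		Bs[innerRowB][innerColB] = B[innerRowB * N + innerColB];
		__syncthreads();

		A += BK;
		B += BK * N;

		for (int dotIdx = 0; dotIdx < BK; ++dotIdx) {
			const float bTmp = Bs[dotIdx][threadCol]; // fetched once, reused TM times
			for (int resIdx = 0; resIdx < TM; ++resIdx) {
				threadResults[resIdx] += As[threadRow * TM + resIdx][dotIdx] * bTmp;
			}
		}

		__syncthreads();
	}

	for (int resIdx = 0; resIdx < TM; ++resIdx) {
		C[(threadRow * TM + resIdx) * N + threadCol] = threadResults[resIdx];
	}
}

My guess is this does not need a grid-stride loop, since each block still maps to exactly one output tile and each thread simply produces more of it, but I would like confirmation.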

Also, does my kernel have bank conflicts? The shared-memory tiles are 32x32 for both As and Bs. Bank conflicts happen when threads in a warp access shared memory with an even stride, but I think I account for that with how I place threads into warps (see the worklog snippet below).

How do I close the rest of the gap to cuBLAS?

I saw this in a GEMM optimization worklog:

dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
dim3 blockDim(BLOCK_SIZE * BLOCK_SIZE); // linear block dimension
const int threadRow = threadIdx.x / BLOCK_SIZE; // warps share rows
const int threadCol = threadIdx.x % BLOCK_SIZE;

If this is confusing, see the full code below.


sgemm.cu

#ifndef SGEMM_CUH
#define SGEMM_CUH

#ifndef KERNEL
#define KERNEL 1
#endif

#ifndef M
#define M 512
#endif

#ifndef K
#define K 512
#endif

#ifndef N
#define N 512
#endif

#ifndef alpha
#define alpha 1
#endif

#ifndef beta
#define beta 0
#endif

#if KERNEL == 1

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

// shared memory block tiling, 1 output per thread
__global__ __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
void sgemm1(float * A, float * B, float * C) {
	const int cRow = blockIdx.x;
	const int cCol = blockIdx.y;

	const int threadRow = threadIdx.x / BLOCK_SIZE;
	const int threadCol = threadIdx.x % BLOCK_SIZE;

	A += cRow * BLOCK_SIZE * K;
	B += cCol * BLOCK_SIZE;
	C += cRow * BLOCK_SIZE * N + cCol * BLOCK_SIZE;

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
	
	float threadResult = 0.0;

	for (int bkIdx = 0; bkIdx < K / BLOCK_SIZE; ++bkIdx) {
		As[threadRow][threadCol] = A[threadRow * K + threadCol];
		Bs[threadRow][threadCol] = B[threadRow * N + threadCol];

		__syncthreads();
		
		A += BLOCK_SIZE;
		B += BLOCK_SIZE * N;

		for (int dotIdx = 0; dotIdx < BLOCK_SIZE; ++dotIdx) {
			threadResult += As[threadRow][dotIdx] * Bs[dotIdx][threadCol];
		}

		__syncthreads();
	}

	// write-back: C = alpha * acc + beta * C, with alpha and beta folded at compile time
	C[threadRow * N + threadCol] =
#if alpha == 1
	threadResult
#else
	alpha * threadResult
#endif
#if beta != 0
	+ beta * C[threadRow * N + threadCol]
#endif
	;
}

#elif KERNEL == 2
// ...
#endif // KERNEL


#endif // SGEMM_CUH

sgemm_run.cu

#include<iostream>
#include<assert.h>
#include<cuda_runtime.h>
#include<cuda.h>

#define CONCAT_FACTORY(x, y) x##y
#define CONCAT(x, y) CONCAT_FACTORY(x, y)
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define REP(i,n) for(int i=0;i<(n);++i)

#define HANDLE_ERROR( err ) {\
	if (err != cudaSuccess) {\
		std::cout << __FILE__ << ' ' << __LINE__ << ' ' << cudaGetErrorString( err ) << std::endl;\
		exit( 1 );\
	}\
}

#ifndef KERNEL
#define KERNEL 2
#endif

#ifndef M
#define M 512
#endif

#ifndef K
#define K 512
#endif

#ifndef N
#define N 512
#endif

#ifndef alpha
#define alpha 1
#endif

#ifndef beta
#define beta 0
#endif

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

#include"sgemm.cu"

int main() {
	cudaError_t err;
	float h_C[M * N], h_A[M * K], h_B[K * N];

	REP(i, M * K) h_A[i] = rand() % 4;
	REP(i, K * N) h_B[i] = rand() % 5;

	float * d_C, * d_A, * d_B;

	err = cudaMalloc((void**)&d_C, M * N * sizeof(float));
	HANDLE_ERROR(err);
	err = cudaMalloc((void**)&d_A, M * K * sizeof(float));
	HANDLE_ERROR(err);
	err = cudaMalloc((void**)&d_B, K * N * sizeof(float));
	HANDLE_ERROR(err);

	err = cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
	HANDLE_ERROR(err);
	err = cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
	HANDLE_ERROR(err);

	assert(M % BLOCK_SIZE == 0 && N % BLOCK_SIZE == 0);
	dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
	dim3 blockDim(BLOCK_SIZE * BLOCK_SIZE);

	CONCAT(sgemm,KERNEL)<<<gridDim, blockDim>>>(d_A, d_B, d_C);

	cudaDeviceSynchronize();
	err = cudaGetLastError();
	HANDLE_ERROR(err);

	err = cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
	HANDLE_ERROR(err);

	REP(i, M) {
		REP(j, N) {
			float val = 0.0;
			REP(k, K) val += h_A[i * K + k] * h_B[k * N + j];
			assert(h_C[i * N + j] == val);
			//std::cout << h_C[i * N + j] << ' ' << val << "  ";
		}
		// std::cout << std::endl;
	}

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
}

I didn’t spot any bank-conflicted access patterns in your modified code.
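For what it's worth, here is how I read the shared-memory traffic in sgemm1. A warp is 32 consecutive threadIdx.x values, so within a warp threadRow is fixed and threadCol runs 0..31:

	As[threadRow][threadCol] = ...  // store: 32 consecutive words, 32 distinct banks, conflict-free
	Bs[threadRow][threadCol] = ...  // store: same pattern, conflict-free
	As[threadRow][dotIdx]           // load: one address for the whole warp, serviced as a broadcast
	Bs[dotIdx][threadCol]           // load: 32 consecutive words again, conflict-free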

It’s an involved journey; I usually point people to this for that question.

After the Scott Gray material linked above, this may also be a worthwhile read, although it is dated.

I’ve been meaning to read that. The thing is, he makes some changes at the SASS level. I’ll see what I can do in CUDA, then PTX, but I don’t know if there is any assembler for Pascal.

I saw this talk by Chris Cecka, where they explore the space of packed matrix layouts with a tensor algebra (see below):

logical MxN matrix vs. underlying memory layout:
row major: (i, j) → i * N + j
column major: (i, j) → j * M + i
(M/m’)x(N/n’) matrix of m’xn’ sub-matrices:
(i, j) → [ (i / m’) * (N / n’) + (j / n’) ] * (m’ * n’) + [ (i mod m’) * n’ + (j mod n’) ]

I think you can just continue nesting and let the compiler optimize the index arithmetic.
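As a concrete (untested) sketch of that last mapping, with names of my own choosing and assuming the tile sizes divide the matrix dimensions:

// offset of logical element (i, j) of a matrix with nCols columns, stored as a
// row-major grid of mTile x nTile tiles, each tile itself stored row-major
__host__ __device__ inline int tiledOffset(int i, int j, int nCols, int mTile, int nTile) {
	const int tileId = (i / mTile) * (nCols / nTile) + (j / nTile); // which tile, row-major over tiles
	const int within = (i % mTile) * nTile + (j % nTile);           // offset inside the tile
	return tileId * (mTile * nTile) + within;
}

Nesting again would just mean replacing the "offset inside the tile" term with another mapping of the same form.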

This is a pretty cool idea. I think you could heuristically search for an optimal memory layout at that point. It is probably a powerful technique for architectures with tensor cores.

I have no qualms about investing the time to reach that level of understanding, but I’m going to switch soon from the Pascal architecture (Titan Xp) to Turing (RTX 2080 Ti). Also, the only open-source assembler I know of is TuringAs; I don’t know whether there is a “MaxwellAs” or “PascalAs”.

Is there any benefit to warp-level intrinsics? I think OpenCL 2.0 can do reductions across a warp, but how fast is that? Do cuBLAS, CUTLASS, or TensorRT use them?
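For reference, this (untested) is the kind of thing I mean on the CUDA side, using the warp shuffle intrinsics instead of shared memory:

// sum one value per lane across a full warp; after the loop, lane 0 holds the total
__device__ inline float warpReduceSum(float val) {
	for (int offset = 16; offset > 0; offset >>= 1)
		val += __shfl_down_sync(0xffffffffu, val, offset);
	return val;
}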

Yes, Robert’s second link is to Scott’s maxas GitHub project. I understand it can also be used with Pascal, with a little coaxing.

My hazy recollection from research I did a while ago into maxas is that many of the shortcomings he outlined in what was then the current NVIDIA assembler were addressed in later versions, based on his findings. I remember coming across him being credited in the fine print somewhere.