Shared memory matrix multiplication not working

I am having trouble implementing this matrix multiplication with shared memory. Any help is appreciated.

#include<iostream>
#include<assert.h>
#include<cuda_runtime.h>
#include<cuda.h>

#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define REP(i,n) for(int i=0;i<(n);++i)

#define M 512
#define K 512
#define N 512

#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 32

#define TM 4
#define TN 4

// alpha=1, beta=0
__global__
void gemm(const float * A, const float * B, float * C) {
	// 2D thread indices within a block
	const int threadRow = threadIdx.y;
	const int threadCol = threadIdx.x;

	// Global output coordinates (x = row of C, y = column of C)
	const int x = blockIdx.y * BLOCK_SIZE_Y + threadRow;
	const int y = blockIdx.x * BLOCK_SIZE_X + threadCol;

	__shared__ float As[BLOCK_SIZE_X][BLOCK_SIZE_Y];
	__shared__ float Bs[BLOCK_SIZE_Y][BLOCK_SIZE_X];

	if (x < M && y < N) {
		float threadResult = 0.0;

		for (int bkIdx = 0; bkIdx < K / BLOCK_SIZE_Y; ++bkIdx) {
			As[threadRow][threadCol] = A[x * K + (bkIdx * BLOCK_SIZE_Y + threadCol)];
			Bs[threadCol][threadRow] = B[(bkIdx * BLOCK_SIZE_Y + threadCol) * N + y];

			__syncthreads();

			for (int dotIdx = 0; dotIdx < BLOCK_SIZE_Y; ++dotIdx) {
				threadResult += As[threadRow][dotIdx] * Bs[dotIdx][threadCol];
			}

			__syncthreads();
		}

		C[x * N + y] = threadResult;
	}
}



int main() {
	cudaError_t err;
	float h_C[M * N], h_A[M * K], h_B[K * N];

	REP(i, M * K) h_A[i] = rand() % 2;
	REP(i, K * N) h_B[i] = rand() % 3;

	float * d_C, * d_A, * d_B;

	err = cudaMalloc((void**)&d_C, M * N * sizeof(float));
	assert(err == cudaSuccess);
	err = cudaMalloc((void**)&d_A, M * K * sizeof(float));
	assert(err == cudaSuccess);
	err = cudaMalloc((void**)&d_B, K * N * sizeof(float));
	assert(err == cudaSuccess);

	err = cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
	assert(err == cudaSuccess);
	err = cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
	assert(err == cudaSuccess);

	assert(M % BLOCK_SIZE_X == 0 && N % BLOCK_SIZE_Y == 0);
	dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE_X), CEIL_DIV(N, BLOCK_SIZE_Y));
	dim3 blockDim(BLOCK_SIZE_X, BLOCK_SIZE_Y);

	gemm<<<gridDim, blockDim>>>(d_A, d_B, d_C);

	cudaDeviceSynchronize();
	err = cudaGetLastError();
	if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
	assert(err == cudaSuccess);

	err = cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
	assert(err == cudaSuccess);

	// int error = 0.0;
	REP(i, M) {
		REP(j, N) {
			float val = 0.0;
			REP(k, K) val += h_A[i * K + k] * h_B[k * N + j];

			// error += val - h_C[i * N + j];

			assert(h_C[i * N + j] == val);
			//std::cout << h_C[i * N + j] << ' ' << val << "  ";
		}
		// std::cout << std::endl;
	}
	// std::cout << error << std::endl;

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
}

Thank you; I made a silly error in my tiling scheme.
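For the record, I believe the error was in my B-tile load: I was using threadCol for both the row of B and the output column y, and storing Bs transposed relative to how I read it. In the original kernel's indexing (x = output row, y = output column), the corrected loads would look roughly like this:

	As[threadRow][threadCol] = A[x * K + (bkIdx * BLOCK_SIZE_Y + threadCol)];
	Bs[threadRow][threadCol] = B[(bkIdx * BLOCK_SIZE_Y + threadRow) * N + y];
	// the inner product then reads As[threadRow][dotIdx] * Bs[dotIdx][threadCol] unchanged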

My SGEMM kernel now takes about 180 microseconds for M = N = K = 512.
cuBLAS takes roughly 60 microseconds.
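For scale, an SGEMM does 2 * M * N * K floating-point operations, i.e. 2 * 512^3 ≈ 0.27 GFLOP here, so 180 microseconds works out to roughly 1.5 TFLOP/s for my kernel versus roughly 4.5 TFLOP/s for cuBLAS.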

I think the next step is to increase arithmetic intensity per thread.
Would that require a “grid-stride” approach?
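To make concrete what I mean by more work per thread, here is a rough, untested sketch along the lines of the 1D block-tiling step in the worklog: each thread accumulates a TM x 1 column of the block's output tile instead of a single element. BM, BN, BK, and TM below are illustrative values I picked, and I assume M, N, K are multiples of the tile sizes, as in my code above.

#define BM 64 // rows of C per block
#define BN 64 // columns of C per block
#define BK 8  // K-slice width per iteration
#define TM 8  // outputs per thread (a TM x 1 column)

// launch with gridDim(CEIL_DIV(M, BM), CEIL_DIV(N, BN)) and blockDim(BM * BN / TM)
__global__ void sgemm_1d_tiling(const float * A, const float * B, float * C) {
	const int cRow = blockIdx.x;
	const int cCol = blockIdx.y;

	// each thread owns one column of the output tile and TM consecutive rows of it
	const int threadCol = threadIdx.x % BN;
	const int threadRow = threadIdx.x / BN;

	// indices for cooperatively filling the shared tiles
	// (BM * BN / TM == BM * BK == BK * BN == 512 threads, so one element each)
	const int innerRowA = threadIdx.x / BK, innerColA = threadIdx.x % BK;
	const int innerRowB = threadIdx.x / BN, innerColB = threadIdx.x % BN;

	// advance the pointers to this block's tile
	A += cRow * BM * K;
	B += cCol * BN;
	C += cRow * BM * N + cCol * BN;

	__shared__ float As[BM][BK];
	__shared__ float Bs[BK][BN];

	float threadResults[TM] = {0.0f};

	for (int bkIdx = 0; bkIdx < K; bkIdx += BK) {
		As[innerRowA][innerColA] = A[innerRowA * K + innerColA];
		Bs[innerRowB][innerColB] = B[innerRowB * N + innerColB];
		__syncthreads();

		A += BK;
		B += BK * N;

		for (int dotIdx = 0; dotIdx < BK; ++dotIdx) {
			const float bTmp = Bs[dotIdx][threadCol]; // fetched once, reused TM times
			for (int resIdx = 0; resIdx < TM; ++resIdx) {
				threadResults[resIdx] += As[threadRow * TM + resIdx][dotIdx] * bTmp;
			}
		}

		__syncthreads();
	}

	for (int resIdx = 0; resIdx < TM; ++resIdx) {
		C[(threadRow * TM + resIdx) * N + threadCol] = threadResults[resIdx];
	}
}

My guess is this does not need a grid-stride loop, since each block still maps to exactly one output tile and each thread simply produces more of it, but I would like confirmation.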

Also, does my kernel have bank conflicts? The shared-memory tiles are 32x32 for both As and Bs. Bank conflicts happen when threads in a warp access shared memory with an even stride, but I think I account for that with how I place threads into warps (see the worklog snippet below).

How do I close the rest of the gap to cuBLAS?

I saw this in a GEMM optimization worklog:

dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
dim3 blockDim(BLOCK_SIZE * BLOCK_SIZE); // linear block dimension
const int threadRow = threadIdx.x / BLOCK_SIZE; // warps share rows
const int threadCol = threadIdx.x % BLOCK_SIZE;

If this is confusing, see the full code below.


sgemm.cu

#ifndef SGEMM_CUH
#define SGEMM_CUH

#ifndef KERNEL
#define KERNEL 1
#endif

#ifndef M
#define M 512
#endif

#ifndef K
#define K 512
#endif

#ifndef N
#define N 512
#endif

#ifndef alpha
#define alpha 1
#endif

#ifndef beta
#define beta 0
#endif

#if KERNEL == 1

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

// shared memory block tiling, 1 output per thread
__global__ __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
void sgemm1(float * A, float * B, float * C) {
	const int cRow = blockIdx.x;
	const int cCol = blockIdx.y;

	const int threadRow = threadIdx.x / BLOCK_SIZE;
	const int threadCol = threadIdx.x % BLOCK_SIZE;

	A += cRow * BLOCK_SIZE * K;
	B += cCol * BLOCK_SIZE;
	C += cRow * BLOCK_SIZE * N + cCol * BLOCK_SIZE;

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
	
	float threadResult = 0.0;

	for (int bkIdx = 0; bkIdx < K / BLOCK_SIZE; ++bkIdx) {
		As[threadRow][threadCol] = A[threadRow * K + threadCol];
		Bs[threadRow][threadCol] = B[threadRow * N + threadCol];

		__syncthreads();
		
		A += BLOCK_SIZE;
		B += BLOCK_SIZE * N;

		for (int dotIdx = 0; dotIdx < BLOCK_SIZE; ++dotIdx) {
			threadResult += As[threadRow][dotIdx] * Bs[dotIdx][threadCol];
		}

		__syncthreads();
	}

	// write-back: C = alpha * acc + beta * C, with alpha and beta folded at compile time
	C[threadRow * N + threadCol] =
#if alpha == 1
	threadResult
#else
	alpha * threadResult
#endif
#if beta != 0
	+ beta * C[threadRow * N + threadCol]
#endif
	;
}

#elif KERNEL == 2
// ...
#endif // KERNEL


#endif // SGEMM_CUH

sgemm_run.cu

#include<iostream>
#include<assert.h>
#include<cuda_runtime.h>
#include<cuda.h>

#define CONCAT_FACTORY(x, y) x##y
#define CONCAT(x, y) CONCAT_FACTORY(x, y)
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define REP(i,n) for(int i=0;i<(n);++i)

#define HANDLE_ERROR( err ) {\
	if (err != cudaSuccess) {\
		std::cout << __FILE__ << ' ' << __LINE__ << ' ' << cudaGetErrorString( err ) << std::endl;\
		exit( 1 );\
	}\
}

#ifndef KERNEL
#define KERNEL 2
#endif

#ifndef M
#define M 512
#endif

#ifndef K
#define K 512
#endif

#ifndef N
#define N 512
#endif

#ifndef alpha
#define alpha 1
#endif

#ifndef beta
#define beta 0
#endif

#ifndef BLOCK_SIZE
#define BLOCK_SIZE 32
#endif

#include"sgemm.cu"

int main() {
	cudaError_t err;
	float h_C[M * N], h_A[M * K], h_B[K * N];

	REP(i, M * K) h_A[i] = rand() % 4;
	REP(i, K * N) h_B[i] = rand() % 5;

	float * d_C, * d_A, * d_B;

	err = cudaMalloc((void**)&d_C, M * N * sizeof(float));
	HANDLE_ERROR(err);
	err = cudaMalloc((void**)&d_A, M * K * sizeof(float));
	HANDLE_ERROR(err);
	err = cudaMalloc((void**)&d_B, K * N * sizeof(float));
	HANDLE_ERROR(err);

	err = cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
	HANDLE_ERROR(err);
	err = cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
	HANDLE_ERROR(err);

	assert(M % BLOCK_SIZE == 0 && N % BLOCK_SIZE == 0);
	dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE), CEIL_DIV(N, BLOCK_SIZE));
	dim3 blockDim(BLOCK_SIZE * BLOCK_SIZE);

	CONCAT(sgemm,KERNEL)<<<gridDim, blockDim>>>(d_A, d_B, d_C);

	cudaDeviceSynchronize();
	err = cudaGetLastError();
	HANDLE_ERROR(err);

	err = cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
	HANDLE_ERROR(err);

	REP(i, M) {
		REP(j, N) {
			float val = 0.0;
			REP(k, K) val += h_A[i * K + k] * h_B[k * N + j];
			assert(h_C[i * N + j] == val);
			//std::cout << h_C[i * N + j] << ' ' << val << "  ";
		}
		// std::cout << std::endl;
	}

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
}

I didn’t spot any bank-conflicted access patterns in your modified code.
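For what it's worth, here is how I read the shared-memory traffic in sgemm1. A warp is 32 consecutive threadIdx.x values, so within a warp threadRow is fixed and threadCol runs 0..31:

	As[threadRow][threadCol] = ...  // store: 32 consecutive words, 32 distinct banks, conflict-free
	Bs[threadRow][threadCol] = ...  // store: same pattern, conflict-free
	As[threadRow][dotIdx]           // load: one address for the whole warp, serviced as a broadcast
	Bs[dotIdx][threadCol]           // load: 32 consecutive words again, conflict-free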

It’s an involved journey; I usually point people to this for that question.

After the Scott Gray material linked above, this may also be a worthwhile read, although it is dated.

I’ve been meaning to read that. The thing is, he makes some changes at the SASS level. I’ll see what I can do in CUDA, then PTX, but I don’t know if there is any assembler for Pascal.

I saw this talk by Chris Cecka, where they explore the space of packed matrix layouts with a tensor algebra (see below):

logical MxN matrix vs. underlying memory layout:
row major: (i, j) → i * N + j
column major: (i, j) → j * M + i
(M/m’)x(N/n’) matrix of m’xn’ sub-matrices:
(i, j) → [ (i / m’) * (N / n’) + (j / n’) ] * (m’ * n’) + [ (i mod m’) * n’ + (j mod n’) ]

I think you can just continue nesting and let the compiler optimize the index arithmetic.
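As a concrete (untested) sketch of that last mapping, with names of my own choosing and assuming the tile sizes divide the matrix dimensions:

// offset of logical element (i, j) of a matrix with nCols columns, stored as a
// row-major grid of mTile x nTile tiles, each tile itself stored row-major
__host__ __device__ inline int tiledOffset(int i, int j, int nCols, int mTile, int nTile) {
	const int tileId = (i / mTile) * (nCols / nTile) + (j / nTile); // which tile, row-major over tiles
	const int within = (i % mTile) * nTile + (j % nTile);           // offset inside the tile
	return tileId * (mTile * nTile) + within;
}

Nesting again would just mean replacing the "offset inside the tile" term with another mapping of the same form.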

This is a pretty cool idea. I think you could heuristically search for an optimal memory layout at that point. It is probably a powerful technique for architectures with tensor cores.

I have no qualms about investing the time to reach that level of understanding, but I’m going to switch soon from the Pascal architecture (Titan Xp) to Turing (RTX 2080 Ti). Also, the only open-source assembler I know of is TuringAs; I don’t know whether there is a “MaxwellAs” or “PascalAs”.

Is there any benefit to warp-level intrinsics? I think OpenCL 2.0 can do reductions across a warp, but how fast is that? Do cuBLAS, CUTLASS, or TensorRT use them?
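For reference, this (untested) is the kind of thing I mean on the CUDA side, using the warp shuffle intrinsics instead of shared memory:

// sum one value per lane across a full warp; after the loop, lane 0 holds the total
__device__ inline float warpReduceSum(float val) {
	for (int offset = 16; offset > 0; offset >>= 1)
		val += __shfl_down_sync(0xffffffffu, val, offset);
	return val;
}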

Yes, Robert’s second link is to Scott’s maxas GitHub project. I understand it can also be used with Pascal, with a little coaxing.

My hazy recollection from research I did a while ago into maxas is that many of the shortcomings he outlined in what was then the current NVIDIA assembler were addressed in later versions, based on his findings. I remember coming across him being credited in the fine print somewhere.