I am having trouble implementing this matrix multiplication with shared memory. Any help is appreciated.
#include<iostream>
#include<assert.h>
#include<cuda_runtime.h>
#include<cuda.h>
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define REP(i,n) for(int i=0;i<(n);++i)
#define M 512
#define K 512
#define N 512
#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 32
#define TM 4
#define tN 4
// alpha=1, beta=0
__global__
void gemm(const float * A, const float * B, float * C) {
// 2D thread indices within a block
const int threadRow = threadIdx.y;
const int threadCol = threadIdx.x;
// Global indices (matrix element coordinates)
const int x = blockIdx.y * BLOCK_SIZE_Y + threadRow;
const int y = blockIdx.x * BLOCK_SIZE_X + threadCol;
__shared__ float As[BLOCK_SIZE_X][BLOCK_SIZE_Y];
__shared__ float Bs[BLOCK_SIZE_Y][BLOCK_SIZE_X];
if (x < M && y < N) {
float threadResult = 0.0;
for (int bkIdx = 0; bkIdx < K / BLOCK_SIZE_Y; ++bkIdx) {
As[threadRow][threadCol] = A[x * K + (bkIdx * BLOCK_SIZE_Y + threadCol)];
Bs[threadCol][threadRow] = B[(bkIdx * BLOCK_SIZE_Y + threadCol) * N + y];
__syncthreads();
for (int dotIdx = 0; dotIdx < BLOCK_SIZE_Y; ++dotIdx) {
threadResult += As[threadRow][dotIdx] * Bs[dotIdx][threadCol];
}
__syncthreads();
}
C[x * N + y] = threadResult;
}
}
int main() {
cudaError_t err;
float h_C[M * N], h_A[M * K], h_B[K * N];
REP(i, M * K) h_A[i] = rand() % 2;
REP(i, K * N) h_B[i] = rand() % 3;
float * d_C, * d_A, * d_B;
err = cudaMalloc((void**)&d_C, M * N * sizeof(float));
assert(err == cudaSuccess);
err = cudaMalloc((void**)&d_A, M * K * sizeof(float));
assert(err == cudaSuccess);
err = cudaMalloc((void**)&d_B, K * N * sizeof(float));
assert(err == cudaSuccess);
err = cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
assert(err == cudaSuccess);
err = cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
assert(err == cudaSuccess);
assert(M % BLOCK_SIZE_X == 0 && N % BLOCK_SIZE_Y == 0);
dim3 gridDim(CEIL_DIV(M, BLOCK_SIZE_X), CEIL_DIV(N, BLOCK_SIZE_Y));
dim3 blockDim(BLOCK_SIZE_X, BLOCK_SIZE_Y);
gemm<<<gridDim, blockDim>>>(d_A, d_B, d_C);
cudaDeviceSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
assert(err == cudaSuccess);
err = cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
assert(err == cudaSuccess);
// int error = 0.0;
REP(i, M) {
REP(j, N) {
float val = 0.0;
REP(k, K) val += h_A[i * K + k] * h_B[k * N + j];
// error += val - h_C[i * N + j];
assert(h_C[i * N + j] == val);
//std::cout << h_C[i * N + j] << ' ' << val << " ";
}
// std::cout << std::endl;
}
// std::cout << error << std::endl;
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}