Hi all,
I am writing a general matrix-matrix multiplication and I run into a subtle problem.
I have checked the code many times and it seems correct.
[A_Rows,A_Cols] x [B_Rows,B_Cols] = [C_Rows,C_Cols]
So I know that for matrix multiplication A_Cols == B_Rows. Now C_Rows = A_Rows and C_Cols = B_Cols.
Here is my code
#include <iostream> #include <cmath> const int NUM_REPS = 100; using namespace std; // Convenience function for checking CUDA runtime API results // can be wrapped around any runtime API call. No-op in release builds. // Convenience function for checking CUDA runtime API results // can be wrapped around any runtime API call. No-op in release builds. inline cudaError_t checkCuda(cudaError_t result) { if (result != cudaSuccess) { fprintf(stderr, "File %s, Line %d: CUDA Runtime Error: %s\n", __FILE__, __LINE__, cudaGetErrorString(result)); exit(EXIT_FAILURE); } return result; } __global__ void matrixMultiplicationKernel(const float *__restrict__ A, const int A_Rows, const int A_Cols, const float *__restrict__ B, const int B_Rows, const int B_Cols, float *__restrict__ C, const int C_Rows, const int C_Cols) { const int ROW = blockIdx.y * blockDim.y + threadIdx.y; const int COL = blockIdx.x * blockDim.x + threadIdx.x; float tmpSum = 0; //printf("(%d * %d) + %d = X:%d, (%d * %d) + %d = Y:%d\n", blockIdx.y, blockDim.y, threadIdx.y, ROW,blockIdx.x, blockDim.x, threadIdx.x, COL ); //printf("(%d,%d)\n", ROW, COL); if (ROW < C_Rows && COL < C_Cols) { // each thread computes one element of the block sub-matrix for (int i = 0; i < A_Cols; i++) { tmpSum += A[ROW * A_Cols + i] * B[i * B_Cols + COL]; } C[ROW * C_Cols + COL] = tmpSum; } } void matrixMultiplication(const float *__restrict__ A, const int A_Rows, const int A_Cols, const float *__restrict__ B, const int B_Rows, const int B_Cols, float *__restrict__ C) { // declare the number of blocks per grid and the number of threads per block // use 1 to 512 threads per block dim3 threadsPerBlock(1, 1); dim3 blocksPerGrid(1, 1); if (A_Cols == B_Rows) { const int R_C_Rows = A_Rows; const int R_C_Cols = B_Cols; // Maximum number of threads per block is 1024 for every SM threadsPerBlock.x = 32; threadsPerBlock.y = 32; blocksPerGrid.x = (R_C_Rows + 31) / 32; blocksPerGrid.y = (R_C_Cols + 31) / 32; printf("M_C[%d x %d]. grid config = (%d,%d) for block config = (32x32)\n",R_C_Rows, R_C_Cols, blocksPerGrid.x, blocksPerGrid.y); matrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(A, A_Rows, A_Cols, B, B_Rows, B_Cols, C, R_C_Rows, R_C_Cols); checkCuda(cudaDeviceSynchronize()); } } void initializeMatrix(float *M, const int M_Rows, const int M_Cols, float value) { for (int i = 0; i < M_Rows; ++i) for (int j = 0; j < M_Cols; ++j) M[i * M_Cols + j] = value; } int verifyResult(float *M, const int M_Rows, const int M_Cols, float checkValue) { for (int i = 0; i < M_Rows; ++i) for (int j = 0; j < M_Cols; ++j) if (fabs(M[i * M_Cols + j] - checkValue) > 0.0001f) { std::cerr << "Error greater than tolerance at " << i << "," << j << ": " << M[i * M_Cols + j] << std::endl; return 0; } return 1; } int main(void) { const int A_Rows = 500; const int A_Cols = 250; const int B_Rows = 250; const int B_Cols = 550; float *X; float *Y; float *Z; const int C_Rows = A_Rows; const int C_Cols = B_Cols; checkCuda(cudaMallocManaged(&X, sizeof(float) * A_Rows * A_Cols)); checkCuda(cudaMallocManaged(&Y, sizeof(float) * B_Rows * B_Cols)); checkCuda(cudaMallocManaged(&Z, sizeof(float) * C_Rows * C_Cols)); initializeMatrix(X, A_Rows, A_Cols, 1.0f); initializeMatrix(Y, B_Rows, B_Cols, 2.0f); initializeMatrix(Z, C_Rows, C_Cols, 3.0f); matrixMultiplication(X, A_Rows, A_Cols, Y, B_Rows, B_Cols, Z); if(verifyResult(Z, C_Rows, C_Cols, A_Cols * 2.0f) == 1) { // for (int i = 0; i < NUM_REPS; i++) // matrixMultiplication(X, A_Rows, A_Cols, Y, B_Rows, B_Cols, Z); } checkCuda(cudaFree(X)); checkCuda(cudaFree(Y)); checkCuda(cudaFree(Z)); checkCuda(cudaDeviceReset()); return 0; }
Now the code works when A_Rows == B_Cols (e.g 500) but does not work when they differ!