Hello, I’m trying to run the shared-memory matrix-multiplication example from the NVIDIA CUDA Programming Guide, but the C matrix always comes back as all zeros. Am I doing something wrong (maybe my eyes are just tired)?
Here is the complete code of what I’m trying to achieve:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <cuda.h>
//Matrices are stored in row-major order:
//M(row,col) = *(M.elements + row * M.stride + col)
//NOTE(review): stride is the row pitch in elements. The device helpers
//index with stride (not width), so every Matrix handed to the GPU must
//have stride set explicitly (stride = width for densely packed rows).
typedef struct
{
int width;       //number of columns
int height;      //number of rows
int stride;      //row pitch in elements (row-major)
float *elements; //element storage (host or device pointer, per context)
}Matrix;
//thread block size (tile edge; all dimensions must be multiples of this)
#define BLOCK_SIZE 8
//Read one element of a row-major matrix on the device.
//The row pitch is A.stride, not A.width, so this also works on tile views.
__device__ float GetElement(const Matrix A, const int row, const int col)
{
    const int idx = row * A.stride + col;
    return A.elements[idx];
}
//Write one element of a row-major matrix on the device.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    float *slot = A.elements + row * A.stride + col;
    *slot = value;
}
//Return the BLOCK_SIZE x BLOCK_SIZE sub-matrix of A whose upper-left
//corner lies `row` tiles down and `col` tiles right of A's origin.
//The result aliases A's storage: only the element pointer moves, and
//A's stride is kept so tile rows stay A.stride elements apart.
__device__ Matrix GetSubMatrix(Matrix A, const int row, const int col)
{
    Matrix tile;
    tile.width = BLOCK_SIZE;
    tile.height = BLOCK_SIZE;
    tile.stride = A.stride;
    tile.elements = A.elements + (A.stride * row + col) * BLOCK_SIZE;
    return tile;
}
// Forward declaration of the matrix multiplication kernel
// (defined after MatMul, which launches it)
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
//Matrix multiplication - Host code: computes C = A * B on the device.
//Matrix dimensions are assumed to be multiples of BLOCK_SIZE.
//BUG FIX: the device-side Matrix descriptors never had their `stride`
//field set, but the kernel indexes every element with M.stride -- reading
//that uninitialized field is why C came back wrong. Rows are densely
//packed here, so stride = width.
//Also repaired: the kernel-error fprintf was split mid-identifier, and
//cudaMalloc/cudaMemcpy return codes were silently ignored.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    cudaError_t err;
    // Load A to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    d_A.stride = A.width; // rows are densely packed
    size_t size = A.width * A.height * sizeof(float);
    err = cudaMalloc((void**)&d_A.elements, size);
    if (err != cudaSuccess)
        fprintf(stderr, "[ERROR] cudaMalloc(d_A): %s\n", cudaGetErrorString(err));
    err = cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        fprintf(stderr, "[ERROR] cudaMemcpy(d_A): %s\n", cudaGetErrorString(err));
    // Load B to device memory
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    d_B.stride = B.width;
    size = B.width * B.height * sizeof(float);
    err = cudaMalloc((void**)&d_B.elements, size);
    if (err != cudaSuccess)
        fprintf(stderr, "[ERROR] cudaMalloc(d_B): %s\n", cudaGetErrorString(err));
    err = cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        fprintf(stderr, "[ERROR] cudaMemcpy(d_B): %s\n", cudaGetErrorString(err));
    // Allocate C on device memory (no host->device copy needed)
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    d_C.stride = C.width;
    size = C.width * C.height * sizeof(float);
    err = cudaMalloc((void**)&d_C.elements, size);
    if (err != cudaSuccess)
        fprintf(stderr, "[ERROR] cudaMalloc(d_C): %s\n", cudaGetErrorString(err));
    // Invoke kernel: one block per output tile, one thread per element
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Launch-configuration errors surface via cudaGetLastError()
    cudaError_t kernError = cudaGetLastError();
    if (kernError != cudaSuccess)
        fprintf(stderr, "[ERROR] MatMulKernel<<<%d,%d %d,%d>>>: %s\n",
                dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y,
                cudaGetErrorString(kernError));
    // Copy result from device memory (blocking cudaMemcpy also
    // synchronizes with the kernel on the default stream)
    err = cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        fprintf(stderr, "[ERROR] cudaMemcpy(C): %s\n", cudaGetErrorString(err));
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel: each thread block computes one
// BLOCK_SIZE x BLOCK_SIZE tile of C; each thread computes one element.
// Preconditions: all dimensions are multiples of BLOCK_SIZE,
// blockDim == (BLOCK_SIZE, BLOCK_SIZE), and every Matrix carries a
// valid stride.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Tile coordinates of this block within C
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;
    // The output tile this block is responsible for
    Matrix Csub = GetSubMatrix(C, tileRow, tileCol);
    // Element coordinates of this thread within the tile
    const int row = threadIdx.y;
    const int col = threadIdx.x;
    // Shared-memory staging areas for one tile of A and one tile of B
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    // Running dot product for this thread's output element
    float acc = 0.0f;
    // Walk the tiles of A (rightward) and B (downward) whose products
    // contribute to Csub, accumulating one partial product per step
    const int numTiles = A.width / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t)
    {
        Matrix Asub = GetSubMatrix(A, tileRow, t);
        Matrix Bsub = GetSubMatrix(B, t, tileCol);
        // Each thread stages exactly one element of each tile
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Both tiles must be fully loaded before anyone computes
        __syncthreads();
        for (int e = 0; e < BLOCK_SIZE; ++e)
            acc += As[row][e] * Bs[e][col];
        // Everyone must be done reading before the tiles are
        // overwritten in the next iteration
        __syncthreads();
    }
    // One store per thread writes the finished element of Csub
    SetElement(Csub, row, col, acc);
}
//helper: read A(row,col) from a host-side matrix
//(row-major with densely packed rows, so width is the row pitch)
float element(int row, int col, Matrix A)
{
    const int offset = row * A.width + col;
    return A.elements[offset];
}
//Driver: multiply two small ramp-filled square matrices and print C.
int main(int argc, char *argv[])
{
    Matrix A, B, C;
    // Square matrices, two tiles per side, so every dimension is a
    // multiple of BLOCK_SIZE as MatMul requires
    A.width = A.height = B.width = B.height = C.width = C.height = BLOCK_SIZE*2;
    // BUG FIX: initialize stride -- the device code indexes with it,
    // and it was never set anywhere. Rows are packed, so stride = width.
    A.stride = A.width;
    B.stride = B.width;
    C.stride = C.width;
    size_t size = A.width*A.height*sizeof(float);
    A.elements = (float*)malloc(size);
    B.elements = (float*)malloc(size);
    C.elements = (float*)malloc(size);
    // errno/string.h are included for exactly this check
    if (A.elements == NULL || B.elements == NULL || C.elements == NULL)
    {
        fprintf(stderr, "[ERROR] malloc: %s\n", strerror(errno));
        return 1;
    }
    memset(C.elements, 0, size);
    // Fill A and B with the same ramp 0,1,2,... so errors are easy to spot
    for (int i = 0; i < A.width*A.height; i++)
    {
        A.elements[i] = i;
        B.elements[i] = i;
    }
    MatMul(A, B, C);
    printf("\nResult(C):\n");
    // BUG FIX: rows run over height, columns over width (the original
    // swapped them; harmless only because the matrices are square)
    for (int i = 0; i < C.height; i++)
    {
        printf("\n");
        for (int j = 0; j < C.width; j++)
            printf("%10.f ", element(i, j, C));
    }
    printf("\n");
    free(A.elements);
    free(B.elements);
    free(C.elements);
    return 0;
}