nVidia CUDA Programming Guide and shared memory

Hello, I’m trying to execute the sharing memory code example found on the nVidia CUDA Programming Guide (matrix multiplication), but I always get the C matrix with complete 0, am I doing something wrong (maybe my eyes are just tired)?

This is the complete code of what I’m trying to achieve

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <errno.h>

#include <cuda.h>

//Matrices are stored in row-major order:

//M(row,col) = *(M.elements + row * M.stride + col)

typedef struct


  int width;

  int height;

  int stride;

  float *elements;


//thread block size

#define BLOCK_SIZE 8

//Get a matrix element (device)

__device__ float GetElement(const Matrix A, const int row, const int col)


  return A.elements[row*A.stride+col];


//Set a matrix element

__device__ void SetElement(Matrix A, int row, int col, float value)


  A.elements[row*A.stride+col] = value;


//Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is

//located col sub-matrices to the right and row sub-matrices down

//from the upper-left corner of A

__device__ Matrix GetSubMatrix(Matrix A, const int row, const int col)


  Matrix Asub;

  Asub.width	= BLOCK_SIZE;

  Asub.height   = BLOCK_SIZE;

  Asub.stride   = A.stride;

  Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];

  return Asub;


// Forward declaration of the Matrix Multiplication kernel

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

//Matrix multiplication - Host code

//Matrix dimensions are assumed to be multiplies of BLOCK_SIZE

void MatMul(const Matrix A, const Matrix B, Matrix C)


  // Load A and B to device memory

  Matrix d_A;

  d_A.width = A.width; d_A.height = A.height;

  size_t size = A.width * A.height * sizeof(float);


  cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

Matrix d_B;

  d_B.width = B.width; d_B.height = B.height;

  size = B.width * B.height * sizeof(float);


  cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

//Allocat C on device memory

  Matrix d_C;

  d_C.width = C.width; d_C.height = C.height;

  size = C.width * C.height * sizeof(float);


//Invoke kernel

  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

  dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

  MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

  cudaError_t kernError = cudaGetLastError();

  if(kernError != cudaSuccess)

	fprintf(stderr,"[ERROR] MatMulKernel<<<%d,%d %d,%d>>>: %s\n",dimGrid.x,dimGrid.y,dimBlock.x,dimBlock.y,cudaGetErrorS


// Copy result from device memory

  cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

// Free device memory





// Matrix multiplication Kernel

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)


  //Block row and column

  int blockRow = blockIdx.y;

  int blockCol = blockIdx.x;

//Each thread block computes one sub-matrix Csub of C

  Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

//Each thread computes one element of Csub by accumulating results into Cvalues

  float Cvalue = 0;

//thread row and column within Csub

  int row = threadIdx.y;

  int col = threadIdx.x;

//Loop over all the sub-matrices of A and B that are required to compute Csub

  //Multiply each pair of sub-matrices together and accumulate the results

  for(int m=0; m < (A.width / BLOCK_SIZE); ++m)


	//Get submatrix Asub of A

	Matrix Asub = GetSubMatrix(A, blockRow, m);

	//Get submatrix Bsub of B

	Matrix Bsub = GetSubMatrix(B, m, blockCol);

	//Shared memory used to store Asub and Bsub respectively

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	//Load Asub and Bsub from device memory to shared memory

	//Each thread loads one element of each sub-matrix

	As[row][col] = GetElement(Asub,row,col);

	Bs[row][col] = GetElement(Bsub,row,col);

	//Synchronize to make sure the sub-matrices are loaded

	//before starting the computation


	//Multiply Asub and Bsub together

	for(int e = 0; e < BLOCK_SIZE; ++e)

	  Cvalue += As[row][e] * Bs[e][col];

	//synchronize to make sure that the preceding computation is

	//done before loading two new sub-matrices of A and B in the next iteration



//Write Csub to device memory

  //each thread writes one element

  SetElement(Csub, row, col,Cvalue);


//helper (get element from matrix on host)

float element(int row, int col, Matrix A)


  return *(A.elements + row * A.width + col);


int main(int argc, char *argv[])


  Matrix A,B,C;

  A.width = A.height = B.width = B.height = C.width = C.height = BLOCK_SIZE*2;

  size_t size = A.width*A.height*sizeof(float);

A.elements = (float*)malloc(size);

  B.elements = (float*)malloc(size);

  C.elements = (float*)malloc(size);


  for(int i = 0; i < A.width*A.height; i++)







  for(int i = 0; i < C.width; i++)



	for(int j = 0; j < C.height; j++)

	  printf("%10.f ",element(i,j,C));






  return 0;