# multiplication of matrix using shared memory problem of multiplication

Hello,

I made some changes in a code of multiplying two matrices using shared memory,
the problem when I tried to modify a code to make multiplication between vector x matrix
the execution always gives the errors !!!

for example i declare

#define haa 16*10 //height of matrix A
#define waa 160 //wieght of matrix A with (weight A=height B hbb=waa ===> AxB the result of multiplication is good
#define wbb 512 //weigth of matrix B

but

#define haa 1 //height of matrix A (is a vector )
#define waa 160 //wieght of matrix A with (weight A=height B hbb=waa ===> AxB the result of multiplication is false
#define wbb 512 //weigth of matrix B

so if somebody have a solution

code :

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include “sys/time.h”

//vectoeur de test
float *AA;
float *BB;
float *CC;
float *CC_host;

//taille des matrices
#define haa 16*10 //height of matrix A
#define waa 160 //wieght of matrix A with (weight A=height B) hbb=waa
#define wbb 512 //weigth of matrix B

#define BLOCK_SIZE 16
// Forward declaration of the device multiplication function
global void Muld(float*, float*, int, int, float*);
// Host multiplication function
// Compute C = A * B
// hA is the height of A
// wA is the width of A
// wB is the width of B
void Mul(const float* A, const float* B, int hA, int wA, int wB,float* C)
{
int size;
// Load A and B to the device
size = hA * wA * sizeof(float);
float* Bd;
size = wA * wB * sizeof(float);
cudaMalloc((void**)&Bd, size);
cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
// Allocate C on the device
float* Cd;
size = hA * wB * sizeof(float);
cudaMalloc((void**)&Cd, size);
// Compute the execution configuration assuming
// the matrix dimensions are multiples of BLOCK_SIZE
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);
// Launch the device computation
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);
// Read C from the device
cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(Bd);
cudaFree(Cd);
}

// Device multiplication function called by Mul()
// Compute C = A * B
// wA is the width of A
// wB is the width of B
global void Muld(float* A, float* B, int wA, int wB, float* C)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// The element of the block sub-matrix that is computed
float Csub = 0;
// Loop over all the sub-matrices of A and B required to
// compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep) {
// Shared memory for the sub-matrix of A
shared float As[BLOCK_SIZE][BLOCK_SIZE];
// Shared memory for the sub-matrix of B
shared float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from global memory to shared memory;
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
// Synchronize to make sure that the preceding
// sub-matrices of A and B in the next iteration
}
// Write the block sub-matrix to global memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}

//multiplication in the Host
void Mul_Host(const float* A, const float* B, int hA, int wA, int wB,float* C1)
{
for(int i=0;i<haa;i++){
for(int l=0;l<wbb ;l++){
for(int m=0;m<waa;m++){
C1[l + iwbb]+=A[m +iwaa]B[l+ mwbb];
}
}
}
}

////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
int main(int argc, char **argv) {

//allocation memoire des matrices
int size_AA=haawaa; //size of matrix A
int size_BB=waa
wbb; //size of matrix B
int size_CC=haa*wbb; //size of matrix C

AA= (float )malloc(size_AAsizeof(float));
BB= (float )malloc(size_BBsizeof(float));
CC= (float )malloc(size_CCsizeof(float));
CC_host =(float )malloc(size_CCsizeof(float));

struct timeval t1_start,t1_end;
double time_d;

for (int i=0;i<haa;i++)
for(int j=0;j<waa;j++) AA[j+ i*waa]=(float)(i+j);

for (int i=0;i<waa;i++)
for(int j=0;j<wbb;j++) BB[j+ i*wbb]=(float)(i+j/2);

//execution in device
Mul(AA, BB, haa, waa, wbb,CC);

//execution in host
Mul_Host(AA,BB, haa, waa, wbb,CC_host);

//comparaison between result of CPU and GPU
for (int i=0;i<haa;i++)
for(int j=0;j<wbb;j++) if (CC_host[j+ iwaa]-CC[j+ iwaa]!=0) printf(“error\n”);

return 0;
}

I haven’t read all of the code, but it’s got a comment that seems to explain pretty well what the problem is:

If you want the program to work with arbitrary dimensions, you’ll have to round the grid dimension upwards

[font=“Courier New”] dim3 dimGrid((wB + dimBlock.x -1) / dimBlock.x, (hA + dimBlock.y - 1) / dimBlock.y);[/font]

and probably wrap the calculations inside the kernel into an appropriate conditional that prevents out-of-bounds access from threads outside the matrix dimensions.

I haven’t read all of the code, but it’s got a comment that seems to explain pretty well what the problem is:

If you want the program to work with arbitrary dimensions, you’ll have to round the grid dimension upwards

[font=“Courier New”] dim3 dimGrid((wB + dimBlock.x -1) / dimBlock.x, (hA + dimBlock.y - 1) / dimBlock.y);[/font]

and probably wrap the calculations inside the kernel into an appropriate conditional that prevents out-of-bounds access from threads outside the matrix dimensions.