Matrix multiplication

Hi,

I m new on Cuda and I m trying to implement my first matrix multiplication but it doesn’t work. Can someone help me to find the error?

[codebox]

#define TILE_WIDTH 2

#define N 5

#define M 3

#define P 2

int main{

dim3 threadsPerBlock(TILE_WIDTH,TILE_WIDTH);

dim3 numBlocks(N /threadsPerBlock.x, P /threadsPerBlock.y);

simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]

}

global void simpleMultiply(float* A, float* B, float* C, int NN, int MM , int PP) {

int bx = blockIdx.x; int by = blockIdx.y;

int tx = threadIdx.x; int ty = threadIdx.y;



//we are processing element C[row][col];

// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]

int row = by * TILE_WIDTH + ty; 

int col = bx * TILE_WIDTH + tx; 



if ((row<N) && (col<M)){

// Matricies in Cuda are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]

float sum = 0.0f; 



    for (int i = 0; i < MM; i++) { 

	sum += A[row + NN*i] * B[i + MM*col];  // sum += A[row][i] * B[i][col]

} 

C[row + col*NN] = sum;

}

}[/codebox]

Try this:

[codebox]

#define TILE_WIDTH 2

#define N 5

#define M 3

#define P 2

int main{

dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);

dim3 numBlocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (P + TILE_WIDTH - 1) / TILE_WIDTH);

simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]

}

global void simpleMultiply(float* A, float* B, float* C, int NN, int MM , int PP) {

int bx = blockIdx.x; int by = blockIdx.y;

int tx = threadIdx.x; int ty = threadIdx.y;



//we are processing element C[row][col];

// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]

int row = by * TILE_WIDTH + ty; 

int col = bx * TILE_WIDTH + tx; 



if ((row<NN) && (col<PP)){

// Matricies in Cuda are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]

float sum = 0.0f; 



    for (int i = 0; i < MM; i++) { 

	sum += A[row + NN*i] * B[i + MM*col];  // sum += A[row][i] * B[i][col]

} 

C[row + col*NN] = sum;

}

}[/codebox]

Try this:

[codebox]

#define TILE_WIDTH 2

#define N 5

#define M 3

#define P 2

int main{

dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);

dim3 numBlocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (P + TILE_WIDTH - 1) / TILE_WIDTH);

simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]

}

global void simpleMultiply(float* A, float* B, float* C, int NN, int MM , int PP) {

int bx = blockIdx.x; int by = blockIdx.y;

int tx = threadIdx.x; int ty = threadIdx.y;



//we are processing element C[row][col];

// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]

int row = by * TILE_WIDTH + ty; 

int col = bx * TILE_WIDTH + tx; 



if ((row<NN) && (col<PP)){

// Matricies in Cuda are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]

float sum = 0.0f; 



    for (int i = 0; i < MM; i++) { 

	sum += A[row + NN*i] * B[i + MM*col];  // sum += A[row][i] * B[i][col]

} 

C[row + col*NN] = sum;

}

}[/codebox]

Thanks, Tera but it still doesn’t work…any others suggestions?

Thanks, Tera but it still doesn’t work…any others suggestions?

Ok, I may have missed more mistakes. They might be easier to find if you tell us what goes wrong.

EDIT: And show the code to copy to and from the device too.

The full code is

[codebox]include <stdio.h>

include <stdlib.h>

include <cutil_inline.h>

include <matGiulio.h>

include <cublas.h>

extern global void simpleMultiply(float* A, float* B, float* C,int N, int M ,int P) ;

define TILE_WIDTH 2

define N 5

define M 3

define P 2

constant float* d_A;

constant float* d_B;

float* d_C;

int main(){

float A[N][M] = {1,2,3,4,5,6,6,8,9,10,11,12,13,14,15};

float B[M][P] = {1,2,3,4,5,6};



float* h_A;

float* h_B;

float* h_C;



h_A = (float *) calloc( N*M,sizeof(float) );

h_B = (float *) calloc( M*P,sizeof(float) ); 

h_C = (float *) calloc( N*P,sizeof(float) ); 



h_A = &A[0][0];

h_B = &B[0][0];

	

cudaMalloc((void**)&d_A, N*M*sizeof(float));

cudaMalloc((void**)&d_B, M*P*sizeof(float));

cudaMalloc((void**)&d_C, N*P*sizeof(float));

cudaMemcpy(d_A, h_A , N*M*sizeof(float),cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B , M*P*sizeof(float),cudaMemcpyHostToDevice);

cudaMemcpy(d_C, h_C , N*P*sizeof(float),cudaMemcpyHostToDevice);



dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);

dim3 numBlocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (P + TILE_WIDTH - 1) / TILE_WIDTH);



simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]	



printCublasDevMat(d_C,N,P);

prodMat(h_A, h_B, h_C, N, M, P);

printMat(h_C, N, P);

printf( " \n Press ENTER to continue..." );

while (getchar() != '\n');

if (system( "clear" )) system( "cls" );

return 0;

}

global void simpleMultiply(float* A, float* B, float* C, int NN, int MM , int PP) {

int bx = blockIdx.x; int by = blockIdx.y;

int tx = threadIdx.x; int ty = threadIdx.y;



//we are processing element C[row][col];

// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]

int row = by * TILE_WIDTH + ty; 

int col = bx * TILE_WIDTH + tx; 



if ((row<NN) && (col<PP)){

// Matricies in Cuda are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]

float sum = 0.0f; 



    for (int i = 0; i < MM; i++) { 

	sum += A[row + MM*i] * B[i + PP*col];  // sum += A[row][i] * B[i][col]

} 

C[row + col*NN] = sum;

}

}[/codebox]

and the result is

27 36

0 0

0 49

66 0

0 0

but the right result is

22 28

49 64

75 98

103 136

130 172

I am sure that all other function are fine…so the problem is the GPU kernel.

I found one error on matrix indices :(, but it still doesn’t work…the code is the same but with

[codebox]global void simpleMultiply(float* A, float* B, float* C, int NN, int MM , int PP) {

int bx = blockIdx.x; int by = blockIdx.y;

int tx = threadIdx.x; int ty = threadIdx.y;



//we are processing element C[row][col];

// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]

int row = by * TILE_WIDTH + ty; 

int col = bx * TILE_WIDTH + tx; 



if ((row<NN) && (col<PP)){

float sum = 0.0f; 

int i;

    for (i = 0; i < MM; i++) { 

	sum += A[row*MM + i] * B[i*PP + col];  // sum += A[row][i] * B[i][col]

} 

	C[row*PP + col] = sum;

}

}

[/codebox]

the result now is

and the result is

22 28

0 0

0 0

0 0

0 0

…the first row is correct!