dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);
dim3 numBlocks(N / threadsPerBlock.x, P / threadsPerBlock.y);
simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]
…
}
__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<N) && (col<M)){
// Matrices in CUDA are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]
float sum = 0.0f;
for (int i = 0; i < MM; i++) {
sum += A[row + NN*i] * B[i + MM*col]; // sum += A[row][i] * B[i][col]
}
C[row + col*NN] = sum;
}
}
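// Storage-order note: nothing in CUDA itself fixes a layout; it depends on how the host
// fills d_A and d_B. A plain C array declared A[N][M] is row-major, A[i][j] == A[i*M + j];
// the A[i + N*j] formula used above is the column-major (Fortran/cuBLAS) convention.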
dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH);
dim3 numBlocks((N + TILE_WIDTH - 1) / TILE_WIDTH, (P + TILE_WIDTH - 1) / TILE_WIDTH);
simpleMultiply<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N, M, P); // it should perform C=A*B where A[N][M], B[M][P], C[N][P]
…
}
__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<NN) && (col<PP)){
// Matrices in CUDA are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]
float sum = 0.0f;
for (int i = 0; i < MM; i++) {
sum += A[row + NN*i] * B[i + MM*col]; // sum += A[row][i] * B[i][col]
}
C[row + col*NN] = sum;
}
}
__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<NN) && (col<PP)){
// Matrices in CUDA are arranged by columns --> A[i][j] = A[i + N*j] where dim A[N x M]
float sum = 0.0f;
for (int i = 0; i < MM; i++) {
sum += A[row + MM*i] * B[i + PP*col]; // sum += A[row][i] * B[i][col]
}
C[row + col*NN] = sum;
}
}[/codebox]
and the result is
27 36
0 0
0 49
66 0
0 0
but the right result is
22 28
49 64
75 98
103 136
130 172
I am sure that all the other functions are fine… so the problem is in the GPU kernel.
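For reference, a plain CPU triple loop for C = A*B with row-major storage (an assumption on my part; X[i][j] stored as X[i*cols + j], and the cpuMultiply name is just illustrative) would be:
[codebox]// CPU reference, row-major: X[i][j] == X[i*cols + j]
void cpuMultiply(const float* A, const float* B, float* C, int N, int M, int P) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < P; col++) {
            float sum = 0.0f;
            for (int i = 0; i < M; i++) {
                sum += A[row*M + i] * B[i*P + col];  // A[row][i] * B[i][col]
            }
            C[row*P + col] = sum;                    // C[row][col]
        }
    }
}[/codebox]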
I found one error in the matrix indices :(, but it still doesn't work… the code is the same but with
[codebox]__global__ void simpleMultiply(float* A, float* B, float* C, int NN, int MM, int PP) {
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
//we are processing element C[row][col];
// TILE_WIDTH is the block dimension BLOCK = [TILE_WIDTH][TILE_WIDTH]
int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;
if ((row<NN) && (col<PP)){
float sum = 0.0f;
int i;
for (i = 0; i < MM; i++) {
sum += A[row*MM + i] * B[i*PP + col]; // sum += A[row][i] * B[i][col]
}
C[row*PP + col] = sum;
}