matrix multiplication

I wrote code for matrix multiplication using the example given in the programming guide, combined it with a C++ version of the same computation, and compared the results. I get the result in both cases, but the GPU is taking more time than the CPU. Does anybody know what the possible reason could be? On my 8600 GT the CPU took 0.1 ms whereas the GPU took 0.4 ms.

//MULTIPLICATION OF A 2D MATRIX CUDA PROGRAM

//GLOBAL VARIABLES

int BLOCK_SIZE= 16;
int WIDTH = BLOCK_SIZE;
int HEIGHT = BLOCK_SIZE;

//HEADER FILES

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <conio.h>

struct Matrix {
int width;
int height;
int* dat;
};

// ALLOCATION OF DATA TO MATRIX

void Init(int* data, int size)
{
	printf("\n");
	for (int i = 0; i < size; ++i)
	{
		data[i] = i + 1;
		if (i % WIDTH == 0)
			printf("\n\n\n");
		printf(" %d", data[i]);
	}
}

//KERNEL TO RUN ON GPU CALLED by MatMul()

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	int sum = 0;
	int r = threadIdx.y;
	int c = threadIdx.x;
	//unsigned int z;
	//for (z = 0; z < 10000000; z++)
	for (int e = 0; e < A.width; ++e)
		sum += A.dat[r * A.width + e] * B.dat[e * B.width + c];
	C.dat[r * C.width + c] = sum;
}

// MATRIX MULTIPLICATION FUNCTION CALLIN GPU KERNEL

void MatMul(const Matrix A, const Matrix B, Matrix C)
{

Matrix d_A,d_B,d_C;
size_t size = A.width * A.height * sizeof(int);

d_A.width = A.width; d_A.height = A.height;
cudaMalloc((void**)&d_A.dat, size);
cudaMemcpy(d_A.dat,A.dat, size,cudaMemcpyHostToDevice);

d_B.width = B.width; d_B.height = B.height;
cudaMalloc((void**)&d_B.dat, size);
cudaMemcpy(d_B.dat, B.dat, size,cudaMemcpyHostToDevice);

d_C.width = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(int);
cudaMalloc((void**)&d_C.dat, size);

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

unsigned int timer = 0;
cutCreateTimer( &timer);
cutStartTimer( timer);

MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
cudaThreadSynchronize();

// Display Timer

cutStopTimer( timer);
printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
cutDeleteTimer( timer);

cudaMemcpy(C.dat, d_C.dat, size,cudaMemcpyDeviceToHost);
printf("\n");
int z = WIDTH * HEIGHT;
for (int k = 0; k < z; k++)
{
	if (k % WIDTH == 0)
		printf("\n");
	printf(" %d", C.dat[k]);
}

cudaFree(d_A.dat);
cudaFree(d_B.dat);
cudaFree(d_C.dat);
}

//MAIN

int main()
{
int a[20][20],b[20][20],c[20][20];
int r1,r2,r3,c1,c2,c3,i,j,k;
r1=r2=r3=c1=c2=c3=16;
/*printf("\nEnter the size of matrix 1 :");
printf("r: ");
scanf_s("%d",&r1);
printf("c: ");
scanf_s("%d",&c1);

printf("\nEnter the size of matrix 2 :");
printf("r: ");
scanf_s("%d",&r2);
printf("c: ");
scanf_s("%d",&c2);

printf("\nEnter matrix a: ");*/
int z=0;
for (i=0;i<r1;i++)
{	
	for(j=0;j<c1;j++)
	{
		z++;
		a[i][j]=z;
	}
}
 z=0;
//printf("\nEnter matrix b: ");
for (i=0;i<r2;i++)
{
	for(j=0;j<c2;j++)
	{
		z++;
		b[i][j]=z;
	}
}

for (i=0;i<r1;i++)
	for(j=0;j<c2;j++)
		c[i][j]=0;



unsigned int timer2 = 0;

cutCreateTimer( &timer2);
cutStartTimer( timer2);
if(c1!=r2)
printf("\nMultiplication not possible");
else
{
//for(z=0;z<10000000;z++)
for(i=0;i<r1;i++)
for(j=0;j<c2;j++)
for(k=0;k<c1;k++)
c[i][j]+=a[i][k]*b[k][j];
}
cutStopTimer( timer2);
printf("Processing time: %f (ms)\n", cutGetTimerValue( timer2));
cutDeleteTimer( timer2);

for (i=0;i<r1;i++)
{
	printf("\n");
	for(j=0;j<c2;j++)
		printf("\t%d",c[i][j]);
}

// getch();

Matrix h_A,h_B,h_C;
h_A.width=WIDTH;
h_A.height=HEIGHT;
h_B.width=WIDTH;
h_B.height=HEIGHT;
h_C.width=WIDTH;
h_C.height=HEIGHT;



unsigned int size = WIDTH*HEIGHT;
unsigned int mem_size = sizeof(int) * size;



h_A.dat= (int*) malloc(mem_size);
h_B.dat= (int*) malloc(mem_size);
h_C.dat= (int*) malloc(mem_size);

Init(h_A.dat, size);
Init(h_B.dat, size);

//invoke MatMul
MatMul(h_A,h_B,h_C);

getch();
return 0;
}
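For reference, the kernel can also be timed with CUDA events instead of the cutil timers; a minimal sketch around the same launch as above:

[codebox]cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // block until the kernel and the stop event have completed
cudaEventElapsedTime(&ms, start, stop);
printf("Kernel time: %f (ms)\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);[/codebox]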

How big were the matrices you used? My results from my example were:

Input matrix size 2048 by 2048

CPU Processing time: 403951.625000 (ms) (~6m 43s )
Matrixdata checksum: 3.90314516052705e+016

– GPU without Final Optimizations
Processing time: 351.787354 (ms)
Matrixdata checksum: 3.90314516052705e+016

– GPU with 1 x 4 Thread Granularity
Processing time: 288.677216 (ms)
Matrixdata checksum: 3.90314516052705e+016

– GPU with 1 x 2 Thread Granularity and Complete Unrolling
Processing time: 262.640533 (ms)
Matrixdata checksum: 3.90314516052705e+016

Your kernel

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	int sum = 0;
	int r = threadIdx.y;
	int c = threadIdx.x;
	for (int e = 0; e < A.width; ++e)
		sum += A.dat[r * A.width + e] * B.dat[e * B.width + c];
	C.dat[r * C.width + c] = sum;
}

and execution configuration

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

show that all thread blocks compute the same submatrix of C = A * B. So:

(1) your GPU result is wrong, and

(2) you have a race condition when writing results to matrix C, because all thread blocks write to the same submatrix of C.

You must redesign the indexing in your kernel code.
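For example, a minimal sketch of such a redesign (assuming the Matrix struct from your code), where each thread derives the global row and column of the C element it owns from its block and thread indices:

[codebox]__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	// global row/column of the element of C this thread computes
	int r = blockIdx.y * blockDim.y + threadIdx.y;
	int c = blockIdx.x * blockDim.x + threadIdx.x;
	int sum = 0;
	for (int e = 0; e < A.width; ++e)
		sum += A.dat[r * A.width + e] * B.dat[e * B.width + c];
	C.dat[r * C.width + c] = sum;
}[/codebox]

With this layout, the original dimGrid(B.width / dimBlock.x, A.height / dimBlock.y) gives every block a distinct tile of C.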

How did you input a matrix size of 2048x2048? I can't increase my block size beyond 16; I get an error. So I did my calculation for a 16x16 matrix.

I'm doing the calculation for a single thread block only. How do I increase the number of blocks? My results are for a single block of 256 threads.

That's the problem: you only use one thread block; in other words, only one multiprocessor is used. (It is also why you can't grow the block beyond 16x16: a block is limited to 512 threads on your hardware, and 32x32 would be 1024. You scale up with more blocks, not bigger blocks.)

I have no idea how many multiprocessors the 8600 GT has (the 8800 GT has 14 multiprocessors).

You under-utilize your GPU, so of course the performance of the GPU version is worse than the performance of the CPU version.

Please read section 2.2 of programming guide 2.3 for further information about how to use more than one thread block.
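As a rough illustration (not your exact code, and assuming square matrices whose sides are multiples of BLOCK_SIZE), the host side then sizes the grid so the blocks tile the whole output matrix:

[codebox]dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // 16 x 16 = 256 threads per block
// one block per 16x16 tile of C; e.g. 128 x 128 = 16384 blocks for 2048x2048
dim3 dimGrid(C.width / dimBlock.x, C.height / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);[/codebox]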

Thanks. I increased the grid size to include 4 blocks of 256 threads each, but I'm not able to compute a 32x32 matrix. I modified the kernel as you suggested:

[codebox]//KERNEL TO RUN ON GPU CALLED by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	int sum = 0;
	int r = blockIdx.x * blockDim.x + threadIdx.x;
	int c = blockIdx.y * blockDim.y + threadIdx.y;
	for (int e = 0; e < A.width; ++e)
		sum += A.dat[r * A.width + e] * B.dat[e * B.width + c];
	C.dat[r * C.width + c] = sum;
}[/codebox]

I also changed the dimensions of the grid as below:

[codebox]dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(4, 1);[/codebox]

It computes fine for a 16x16 matrix, but for a 32x32 matrix the result is correct for the first 16 elements of each row while the next 16 elements are 0 in all 32 rows. I have attached the output that I get.

According to your kernel (which takes the row from the x index and the column from the y index), I suggest

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid( A.height / dimBlock.x, B.width / dimBlock.y );

You also need to add a boundary condition: check whether r (row) and c (column) are valid or not.
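For instance, a minimal sketch of that guard, keeping your current index layout:

[codebox]__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
	int r = blockIdx.x * blockDim.x + threadIdx.x;
	int c = blockIdx.y * blockDim.y + threadIdx.y;
	if (r >= C.height || c >= C.width)
		return; // thread lies outside the matrix, so it does nothing
	int sum = 0;
	for (int e = 0; e < A.width; ++e)
		sum += A.dat[r * A.width + e] * B.dat[e * B.width + c];
	C.dat[r * C.width + c] = sum;
}[/codebox]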

Thanks, I got the result. I am doing a comparison between the performance of the CPU and the GPU, though: my code runs for a 2048x2048 matrix on the GPU, but the CPU code does not run; I get an "unhandled break exception" error. The maximum matrix size I can use for the CPU is 256x256.

Did you modify

int a[20][20],b[20][20],c[20][20];

? A 2048x2048 matrix doesn't fit in those fixed 20x20 arrays, and stack arrays that large will overflow the stack.

I used malloc and got over the problem.
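For reference, a minimal sketch of that change (heap allocation instead of fixed stack arrays, indexing a flat block as a[i * N + j]):

[codebox]int N = 2048;
// each matrix is N*N ints (16 MB at N = 2048), far too big for the stack
int* a = (int*)malloc((size_t)N * N * sizeof(int));
int* b = (int*)malloc((size_t)N * N * sizeof(int));
int* c = (int*)malloc((size_t)N * N * sizeof(int));
// ... access element (i, j) as a[i * N + j] instead of a[i][j] ...
free(a);
free(b);
free(c);[/codebox]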

Do you have any idea how I can design a GUI for this code?