I wrote a matrix multiplication program using the example given in the programming guide. I combined it with a version written in C++ and tried to compare the results. I get the result in both cases, but the GPU takes more time than the CPU. Does anybody know what the possible reason could be? On my 8600 GT system, the CPU took 0.1 ms whereas the GPU took 0.4 ms.
// MULTIPLICATION OF A 2D MATRIX - CUDA PROGRAM
// GLOBAL VARIABLES
// Edge length of the square thread block used for the kernel launch
// (MatMul builds dimBlock(BLOCK_SIZE, BLOCK_SIZE) from it).
int BLOCK_SIZE= 16;
// Dimensions of the matrices that main() hands to MatMul(); Init() and the
// result printing also use WIDTH to decide where a row ends.
int WIDTH = BLOCK_SIZE;
int HEIGHT = BLOCK_SIZE;
//HEADER FILES
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <conio.h>
// Integer matrix descriptor: dimensions plus a pointer to the element storage.
// The code in this file indexes elements row-major, as dat[row * width + col].
struct Matrix {
// number of columns (also the row stride used when indexing dat)
int width;
// number of rows
int height;
// element buffer; main() fills it with malloc'd host memory, MatMul() with cudaMalloc'd device memory
int* dat ;
};
// ALLOCATION OF DATA TO MATRIX
// Fills data[0 .. size-1] with the values 1, 2, ..., size and echoes them to
// stdout. Three newlines are printed before every element whose index is a
// multiple of the global WIDTH, so each matrix row starts a new visual group.
//   data - destination buffer with room for at least `size` ints
//   size - number of elements to write
void Init(int* data, int size)
{
    printf("\n");
    for (int i = 0; i < size; ++i)
    {
        data[i] = i + 1;
        if (i % WIDTH == 0)
            printf("\n\n\n");
        printf(" %d", data[i]);
    }
}
// KERNEL TO RUN ON GPU, CALLED by MatMul()
// Computes C = A * B for row-major int matrices, one thread per element of C.
// Launch layout: 2D grid of 2D blocks; x indexes columns of C, y indexes rows.
// Threads that map outside C return without touching memory, so the grid may
// overhang the matrix. Uses no shared memory.
// A.dat, B.dat and C.dat must point to device memory, and A.width must equal
// B.height.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Global coordinates: include the block offset so multi-block grids each
    // compute their own tile instead of all recomputing tile (0, 0).
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;

    if (r >= C.height || c >= C.width)
        return;

    int sum = 0;
    for (int e = 0; e < A.width; ++e)
        sum += A.dat[r * A.width + e] * B.dat[e * B.width + c];
    C.dat[r * C.width + c] = sum;
}
// MATRIX MULTIPLICATION FUNCTION CALLING THE GPU KERNEL

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Computes C = A * B on the GPU and prints the kernel time and the result.
//   A, B - host matrices (row-major); A.width must equal B.height
//   C    - host matrix receiving the product; must be A.height x B.width and
//          C.dat must already point to a buffer of that size
// A.height and B.width must be multiples of BLOCK_SIZE because the grid is
// built from whole blocks. On a dimension or CUDA error a message is written
// to stderr and C.dat is left unmodified or only partially meaningful.
// NOTE(review): the timer brackets only the kernel launch plus the
// synchronize. For a single 16x16 block that interval is most likely
// dominated by fixed launch/sync overhead rather than arithmetic, which would
// explain a GPU time above the CPU time at this size - confirm with larger
// matrices.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    if (A.width != B.height || C.height != A.height || C.width != B.width)
    {
        fprintf(stderr, "MatMul: incompatible matrix dimensions\n");
        return;
    }
    if (BLOCK_SIZE <= 0 || A.width <= 0 || A.height <= 0 || B.width <= 0 ||
        A.height % BLOCK_SIZE != 0 || B.width % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "MatMul: dimensions must be positive multiples of BLOCK_SIZE\n");
        return;
    }

    // Each matrix gets its own byte count; A and B need not have the same shape.
    const size_t sizeA = (size_t)A.width * A.height * sizeof(int);
    const size_t sizeB = (size_t)B.width * B.height * sizeof(int);
    const size_t sizeC = (size_t)C.width * C.height * sizeof(int);

    Matrix d_A, d_B, d_C;
    d_A.width = A.width; d_A.height = A.height; d_A.dat = NULL;
    d_B.width = B.width; d_B.height = B.height; d_B.dat = NULL;
    d_C.width = C.width; d_C.height = C.height; d_C.dat = NULL;

    // Short-circuit chain: the first failure stops further work; the pointers
    // that were never allocated stay NULL, which cudaFree accepts.
    bool ok =
        cudaCallOk(cudaMalloc((void**)&d_A.dat, sizeA), "cudaMalloc(A)") &&
        cudaCallOk(cudaMemcpy(d_A.dat, A.dat, sizeA, cudaMemcpyHostToDevice), "cudaMemcpy(A)") &&
        cudaCallOk(cudaMalloc((void**)&d_B.dat, sizeB), "cudaMalloc(B)") &&
        cudaCallOk(cudaMemcpy(d_B.dat, B.dat, sizeB, cudaMemcpyHostToDevice), "cudaMemcpy(B)") &&
        cudaCallOk(cudaMalloc((void**)&d_C.dat, sizeC), "cudaMalloc(C)");

    if (ok)
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

        unsigned int timer = 0;
        cutCreateTimer(&timer);
        cutStartTimer(timer);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // Launch-configuration errors only show up through cudaGetLastError();
        // faults inside the kernel surface at the synchronize.
        ok = cudaCallOk(cudaGetLastError(), "kernel launch");
        ok = cudaCallOk(cudaThreadSynchronize(), "kernel execution") && ok;
        // Display Timer
        cutStopTimer(timer);
        if (ok)
            printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
        cutDeleteTimer(timer);
    }

    if (ok)
        ok = cudaCallOk(cudaMemcpy(C.dat, d_C.dat, sizeC, cudaMemcpyDeviceToHost), "cudaMemcpy(C)");

    if (ok)
    {
        // Print the result one matrix row per output line.
        printf("\n");
        const int count = C.width * C.height;
        for (int k = 0; k < count; k++)
        {
            if (k % C.width == 0)
                printf("\n");
            printf(" %d", C.dat[k]);
        }
    }

    cudaFree(d_A.dat);
    cudaFree(d_B.dat);
    cudaFree(d_C.dat);
}
//MAIN
// Multiplies two 16x16 integer matrices on the CPU (timed with a cutil timer),
// prints the product, then builds the same-sized matrices with Init() and
// multiplies them on the GPU through MatMul(). Waits for a key press before
// exiting. Returns 0 on success, EXIT_FAILURE if host allocation fails.
int main()
{
    int a[20][20], b[20][20], c[20][20];
    int r1, r2, c1, c2, i, j, k;
    r1 = r2 = c1 = c2 = 16;
    /*printf("\nEnter the size of matrix 1 :");
    printf("r: ");
    scanf_s("%d",&r1);
    printf("c: ");
    scanf_s("%d",&c1);
    printf("\nEnter the size of matrix 2 :");
    printf("r: ");
    scanf_s("%d",&r2);
    printf("c: ");
    scanf_s("%d",&c2);
    printf("\nEnter matrix a: ");*/

    // Fill a and b with 1, 2, 3, ... in row order.
    int z = 0;
    for (i = 0; i < r1; i++)
    {
        for (j = 0; j < c1; j++)
        {
            z++;
            a[i][j] = z;
        }
    }
    z = 0;
    //printf("\nEnter matrix b: ");
    for (i = 0; i < r2; i++)
    {
        for (j = 0; j < c2; j++)
        {
            z++;
            b[i][j] = z;
        }
    }
    for (i = 0; i < r1; i++)
        for (j = 0; j < c2; j++)
            c[i][j] = 0;

    // CPU reference multiplication, timed.
    unsigned int timer2 = 0;
    cutCreateTimer(&timer2);
    cutStartTimer(timer2);
    if (c1 != r2)
        printf("\nMultipliation not possible");
    else
    {
        //for(z=0;z<10000000;z++)
        for (i = 0; i < r1; i++)
            for (j = 0; j < c2; j++)
                for (k = 0; k < c1; k++)
                    c[i][j] += a[i][k] * b[k][j];
    }
    cutStopTimer(timer2);
    printf("Processing time: %f (ms)\n", cutGetTimerValue(timer2));
    cutDeleteTimer(timer2);

    for (i = 0; i < r1; i++)
    {
        printf("\n");
        for (j = 0; j < c2; j++)
            printf("\t%d", c[i][j]);
    }
    // getch();

    // Host matrices for the GPU run.
    Matrix h_A, h_B, h_C;
    h_A.width = WIDTH;
    h_A.height = HEIGHT;
    h_B.width = WIDTH;
    h_B.height = HEIGHT;
    h_C.width = WIDTH;
    h_C.height = HEIGHT;
    unsigned int size = WIDTH * HEIGHT;
    unsigned int mem_size = sizeof(int) * size;
    h_A.dat = (int*) malloc(mem_size);
    h_B.dat = (int*) malloc(mem_size);
    h_C.dat = (int*) malloc(mem_size);
    if (h_A.dat == NULL || h_B.dat == NULL || h_C.dat == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A.dat);   // free(NULL) is a no-op, so partial failure is safe
        free(h_B.dat);
        free(h_C.dat);
        return EXIT_FAILURE;
    }
    Init(h_A.dat, size);
    Init(h_B.dat, size);

    //invoke MatMul
    MatMul(h_A, h_B, h_C);
    getch();

    free(h_A.dat);
    free(h_B.dat);
    free(h_C.dat);
    return 0;
}