32 x 32 Matrix Multiplication

Hey guys,

I’m a beginner in CUDA. I’ve been trying to write a code to implement a 32x32 Matrix-Matrix Multiplication (For eg: A 32x1 Matrix multiplied by a 1x32 Matrix).

My system runs on Windows 7 Ultimate x86 and I use Visual Studio 2005 for now. My GPUs are dual 8800GTS in SLI running 196.21 drivers.

Now the problem is quite simple, wrong output! :)

Just to brief you, i’ve used 4 blocks, each generating 256 threads (16x16 to be precise), thus generating 256x4=1024 threads to perform the multiplication.

Just to keep things simple, I haven’t used shared memory. The objective is to write as simple a program as possible, and once that’s done, can move ahead to optimize it using shared memory.

I hope my code is self-explanatory, i’ve commented where necessary. Please have a look and do post in what the fault could be.

Thanks a lot for your help in advance!

[codebox]// 32 x 32 Matrix Multiplication

#include<stdio.h>

#include<conio.h>

#include<cuda.h>

global void mul_multipleblocks(int *a, int *b, int *c,int width)

{

int row,col,sum,k,m,n;

row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the ‘c’ element and ‘a’

col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the ‘c’ element and ‘b’

sum=0;

for (k=0; k<width; k++)

{

sum=a[rowwidth+k] * b[kwidth+col]; //Each thread computes one element of the block sub-matrix

}

c[row*width+col]=sum; // Product element stored in Matrix c

}

void main()

{

int i,j,m,n,p,q;

int *a_h,*b_h,*a_d,*b_d,*c_h,*c_d;			//a_h, b_h, c_h are host pointers to matrices A, B and C, while a_d, b_d and c_d are device pointers to matrices A, B and C

printf("Enter Matrix A order:\n");

scanf("%d%d",&m,&n);						// Matrix A = m x n

printf("Enter Matrix B order:\n");

scanf("%d%d",&p,&q);						// Matrix B = p x q

if(n==p)								// Check multiplication compatibility

{

	a_h=(int*)malloc(m*n*sizeof(int));                    // Allocate memory on host for Matrix A = mxn               

	cudaMalloc((void**)&a_d,m*n*sizeof(int));        // Allocate memory on device for Matrix A = mxn

	b_h=(int*)malloc(p*q*sizeof(int));                     // Allocate memory on host for Matrix B = pxq

	cudaMalloc((void**)&b_d,p*q*sizeof(int));          // Allocate memory on device for Matrix B = pxq

	c_h=(int*)malloc(m*q*sizeof(int));		       // Allocate memory on host for Matrix C = mxq

	cudaMalloc((void**)&c_d,m*q*sizeof(int));	       // Allocate memory on device for Matrix C = mxq

	

	for(i=0;i<m;i++)				                	//Initialize elements of Matrix A

	{

	    for(j=0;j<n;j++)

	    {

	    a_h[i*n+j]=1;

	    }

	}

	

	for(i=0;i<p;i++)							// Initialize elements of Matrix B

	{

	    for(j=0;j<q;j++)

	    {

	    b_h[i*q+j]=1;

	    }

	}

	

	printf("Matrix A:\n");					       // Print Matrix A

	for(i=0;i<m;i++)

	{

	    for(j=0;j<n;j++)

	    {

	    printf("%d ",a_h[i*n+j]);

	    }

	printf("\n");

	}

	

	printf("Matrix B:\n");						//Print Matrix B

	for(i=0;i<p;i++)

	{

	    for(j=0;j<q;j++)

	    {

	    printf("%d ",b_h[i*q+j]);

	    }

	printf("\n");

	}

	

	cudaMemcpy(a_d,a_h,m*n*sizeof(int),cudaMemcpyHostToDevice);        //a_d points to matrix A (mxn) on device

	cudaMemcpy(b_d,b_h,p*q*sizeof(int),cudaMemcpyHostToDevice);		//b_d points to matrix B (pxq) on device

	

	dim3 dimBlock(16,16);		

	

	//If Matrix A is of order (32 x 1) and Matrix B of (1 x 32) order, then Matrix C is of order (32 x 32). For 32x32=1024 threads to be generated,

	//the block size chosen is 16x16, such that there are 256 threads per block and hence, 4 such blocks in the grid to generate 256x4 = 1024 threads.

	

	dim3 dimGrid(q/dimBlock.x,m/dimBlock.y);   //For the above eg, gridDim.x=32/16=2 and gridDim.y=32/16=2. Thus there are 2x2 = 4 blocks in the grid.

	

	mul_multipleblocks<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,n);       //Kernel call. Either 'n' or 'p' is passed since n=p

	cudaMemcpy(c_h,c_d,m*q*sizeof(int),cudaMemcpyDeviceToHost);    //Retrieve results and pointed by c_h

	

	printf("AxB:\n");      //Print result: Matrix C (order: mxq) = AxB

	for(i=0;i<m;i++)

	{

	    for(j=0;j<q;j++)

	    {

	    printf("%d ",c_h[i*q+j]);

	    }

	printf("\n");

	}

	getch();

	free(a_h);			//Free respective memories.

	free(b_h);

	free(c_h);

	cudaFree(a_d);

	cudaFree(b_d);

	cudaFree(c_d);

}

else				

{

printf("Multiplication not possible\n");

getch();

}

}

[/codebox]

In your host code,

a_h[i*n+j]=1;

b_h[i*q+j]=1;

c_h[i*q+j];

A, B and C are row-major with leading dimension n, q, q respectively.

Example: A is m x n (32 x 1) and B is p x q (1 x 32), then C is m x q (32 x 32)

however in your kernel, row-major mapping of B and C is wrong

and you should use "sum += " not "sum = "

__global__ void mul_multipleblocks(int *a, int *b, int  *c,int width)

{

	int row,col,sum,k,m,n;

	row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the 'c' element and 'a' 	

	col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the 'c' element and 'b'		

	sum=0;

	for (k=0; k<width; k++)  {  

		sum=a[row*width+k] * b[k*width+col];  //Each thread computes one element of the block sub-matrix  

	}  

	

	c[row*width+col]=sum;	// Product element stored in Matrix c

}

....

mul_multipleblocks<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,n);

you should modify your kernel as

__global__ void mul_multipleblocks(int *a, int *b, int  *c, int n, int q )

{

	int row,col,sum,k;

	row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the 'c' element and 'a' 	

	col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the 'c' element and 'b'		

	sum = 0;

	for (k=0; k < n; k++)  {  

		sum += a[row*n + k] * b[k*q + col];  //Each thread computes one element of the block sub-matrix  

	}  

	

	c[row*q + col]=sum;	// Product element stored in Matrix c

}

Ok that was embarassing! Should’ve figured that out myself. :"> But thanks a LOT for your help!! :lol: