 # 32 x 32 Matrix Multiplication

Hey guys,

I’m a beginner in CUDA. I’ve been trying to write a code to implement a 32x32 Matrix-Matrix Multiplication (For eg: A 32x1 Matrix multiplied by a 1x32 Matrix).

My system runs on Windows 7 Ultimate x86 and I use Visual Studio 2005 for now. My GPUs are dual 8800GTS in SLI running 196.21 drivers.

Now the problem is quite simple, wrong output! :)

Just to brief you, i’ve used 4 blocks, each generating 256 threads (16x16 to be precise), thus generating 256x4=1024 threads to perform the multiplication.

Just to keep things simple, I haven’t used shared memory. The objective is to write as simple a program as possible, and once that’s done, can move ahead to optimize it using shared memory.

I hope my code is self-explanatory, i’ve commented where necessary. Please have a look and do post in what the fault could be.

[codebox]// 32 x 32 Matrix Multiplication

#include<stdio.h>

#include<conio.h>

#include<cuda.h>

global void mul_multipleblocks(int *a, int *b, int *c,int width)

{

int row,col,sum,k,m,n;

row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the ‘c’ element and ‘a’

col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the ‘c’ element and ‘b’

sum=0;

for (k=0; k<width; k++)

{

sum=a[rowwidth+k] * b[kwidth+col]; //Each thread computes one element of the block sub-matrix

}

c[row*width+col]=sum; // Product element stored in Matrix c

}

void main()

{

``````int i,j,m,n,p,q;

int *a_h,*b_h,*a_d,*b_d,*c_h,*c_d;			//a_h, b_h, c_h are host pointers to matrices A, B and C, while a_d, b_d and c_d are device pointers to matrices A, B and C

printf("Enter Matrix A order:\n");

scanf("%d%d",&m,&n);						// Matrix A = m x n

printf("Enter Matrix B order:\n");

scanf("%d%d",&p,&q);						// Matrix B = p x q

if(n==p)								// Check multiplication compatibility

{

a_h=(int*)malloc(m*n*sizeof(int));                    // Allocate memory on host for Matrix A = mxn

cudaMalloc((void**)&a_d,m*n*sizeof(int));        // Allocate memory on device for Matrix A = mxn

b_h=(int*)malloc(p*q*sizeof(int));                     // Allocate memory on host for Matrix B = pxq

cudaMalloc((void**)&b_d,p*q*sizeof(int));          // Allocate memory on device for Matrix B = pxq

c_h=(int*)malloc(m*q*sizeof(int));		       // Allocate memory on host for Matrix C = mxq

cudaMalloc((void**)&c_d,m*q*sizeof(int));	       // Allocate memory on device for Matrix C = mxq

for(i=0;i<m;i++)				                	//Initialize elements of Matrix A

{

for(j=0;j<n;j++)

{

a_h[i*n+j]=1;

}

}

for(i=0;i<p;i++)							// Initialize elements of Matrix B

{

for(j=0;j<q;j++)

{

b_h[i*q+j]=1;

}

}

printf("Matrix A:\n");					       // Print Matrix A

for(i=0;i<m;i++)

{

for(j=0;j<n;j++)

{

printf("%d ",a_h[i*n+j]);

}

printf("\n");

}

printf("Matrix B:\n");						//Print Matrix B

for(i=0;i<p;i++)

{

for(j=0;j<q;j++)

{

printf("%d ",b_h[i*q+j]);

}

printf("\n");

}

cudaMemcpy(a_d,a_h,m*n*sizeof(int),cudaMemcpyHostToDevice);        //a_d points to matrix A (mxn) on device

cudaMemcpy(b_d,b_h,p*q*sizeof(int),cudaMemcpyHostToDevice);		//b_d points to matrix B (pxq) on device

dim3 dimBlock(16,16);

//If Matrix A is of order (32 x 1) and Matrix B of (1 x 32) order, then Matrix C is of order (32 x 32). For 32x32=1024 threads to be generated,

//the block size chosen is 16x16, such that there are 256 threads per block and hence, 4 such blocks in the grid to generate 256x4 = 1024 threads.

dim3 dimGrid(q/dimBlock.x,m/dimBlock.y);   //For the above eg, gridDim.x=32/16=2 and gridDim.y=32/16=2. Thus there are 2x2 = 4 blocks in the grid.

mul_multipleblocks<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,n);       //Kernel call. Either 'n' or 'p' is passed since n=p

cudaMemcpy(c_h,c_d,m*q*sizeof(int),cudaMemcpyDeviceToHost);    //Retrieve results and pointed by c_h

printf("AxB:\n");      //Print result: Matrix C (order: mxq) = AxB

for(i=0;i<m;i++)

{

for(j=0;j<q;j++)

{

printf("%d ",c_h[i*q+j]);

}

printf("\n");

}

getch();

free(a_h);			//Free respective memories.

free(b_h);

free(c_h);

cudaFree(a_d);

cudaFree(b_d);

cudaFree(c_d);

}

else

{

printf("Multiplication not possible\n");

getch();

}
``````

}

[/codebox]

``````a_h[i*n+j]=1;

b_h[i*q+j]=1;

c_h[i*q+j];
``````

A, B and C are row-major with leading dimension n, q, q respectively.

Example: A is m x n (32 x 1) and B is p x q (1 x 32), then C is m x q (32 x 32)

however in your kernel, row-major mapping of B and C is wrong

and you should use "sum += " not "sum = "

``````__global__ void mul_multipleblocks(int *a, int *b, int  *c,int width)

{

int row,col,sum,k,m,n;

row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the 'c' element and 'a'

col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the 'c' element and 'b'

sum=0;

for (k=0; k<width; k++)  {

sum=a[row*width+k] * b[k*width+col];  //Each thread computes one element of the block sub-matrix

}

c[row*width+col]=sum;	// Product element stored in Matrix c

}

....

mul_multipleblocks<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,n);
``````

you should modify your kernel as

``````__global__ void mul_multipleblocks(int *a, int *b, int  *c, int n, int q )

{

int row,col,sum,k;

row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the 'c' element and 'a'

col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the 'c' element and 'b'

sum = 0;

for (k=0; k < n; k++)  {

sum += a[row*n + k] * b[k*q + col];  //Each thread computes one element of the block sub-matrix

}

c[row*q + col]=sum;	// Product element stored in Matrix c

}
``````

Ok that was embarassing! Should’ve figured that out myself. :"> But thanks a LOT for your help!! :lol: