Hey guys,
I’m a beginner in CUDA. I’ve been trying to write a code to implement a 32x32 Matrix-Matrix Multiplication (For eg: A 32x1 Matrix multiplied by a 1x32 Matrix).
My system runs on Windows 7 Ultimate x86 and I use Visual Studio 2005 for now. My GPUs are dual 8800GTS in SLI running 196.21 drivers.
Now the problem is quite simple, wrong output! :)
Just to brief you, i’ve used 4 blocks, each generating 256 threads (16x16 to be precise), thus generating 256x4=1024 threads to perform the multiplication.
Just to keep things simple, I haven’t used shared memory. The objective is to write as simple a program as possible, and once that’s done, can move ahead to optimize it using shared memory.
I hope my code is self-explanatory, i’ve commented where necessary. Please have a look and do post in what the fault could be.
Thanks a lot for your help in advance!
[codebox]// 32 x 32 Matrix Multiplication
#include<stdio.h>
#include<conio.h>
#include<cuda.h>
global void mul_multipleblocks(int *a, int *b, int *c,int width)
{
int row,col,sum,k,m,n;
row=blockIdx.y*blockDim.y + threadIdx.y; //To calculate the row index of the ‘c’ element and ‘a’
col=blockIdx.x*blockDim.x + threadIdx.x; // To calculate the column index of the ‘c’ element and ‘b’
sum=0;
for (k=0; k<width; k++)
{
sum=a[rowwidth+k] * b[kwidth+col]; //Each thread computes one element of the block sub-matrix
}
c[row*width+col]=sum; // Product element stored in Matrix c
}
void main()
{
int i,j,m,n,p,q;
int *a_h,*b_h,*a_d,*b_d,*c_h,*c_d; //a_h, b_h, c_h are host pointers to matrices A, B and C, while a_d, b_d and c_d are device pointers to matrices A, B and C
printf("Enter Matrix A order:\n");
scanf("%d%d",&m,&n); // Matrix A = m x n
printf("Enter Matrix B order:\n");
scanf("%d%d",&p,&q); // Matrix B = p x q
if(n==p) // Check multiplication compatibility
{
a_h=(int*)malloc(m*n*sizeof(int)); // Allocate memory on host for Matrix A = mxn
cudaMalloc((void**)&a_d,m*n*sizeof(int)); // Allocate memory on device for Matrix A = mxn
b_h=(int*)malloc(p*q*sizeof(int)); // Allocate memory on host for Matrix B = pxq
cudaMalloc((void**)&b_d,p*q*sizeof(int)); // Allocate memory on device for Matrix B = pxq
c_h=(int*)malloc(m*q*sizeof(int)); // Allocate memory on host for Matrix C = mxq
cudaMalloc((void**)&c_d,m*q*sizeof(int)); // Allocate memory on device for Matrix C = mxq
for(i=0;i<m;i++) //Initialize elements of Matrix A
{
for(j=0;j<n;j++)
{
a_h[i*n+j]=1;
}
}
for(i=0;i<p;i++) // Initialize elements of Matrix B
{
for(j=0;j<q;j++)
{
b_h[i*q+j]=1;
}
}
printf("Matrix A:\n"); // Print Matrix A
for(i=0;i<m;i++)
{
for(j=0;j<n;j++)
{
printf("%d ",a_h[i*n+j]);
}
printf("\n");
}
printf("Matrix B:\n"); //Print Matrix B
for(i=0;i<p;i++)
{
for(j=0;j<q;j++)
{
printf("%d ",b_h[i*q+j]);
}
printf("\n");
}
cudaMemcpy(a_d,a_h,m*n*sizeof(int),cudaMemcpyHostToDevice); //a_d points to matrix A (mxn) on device
cudaMemcpy(b_d,b_h,p*q*sizeof(int),cudaMemcpyHostToDevice); //b_d points to matrix B (pxq) on device
dim3 dimBlock(16,16);
//If Matrix A is of order (32 x 1) and Matrix B of (1 x 32) order, then Matrix C is of order (32 x 32). For 32x32=1024 threads to be generated,
//the block size chosen is 16x16, such that there are 256 threads per block and hence, 4 such blocks in the grid to generate 256x4 = 1024 threads.
dim3 dimGrid(q/dimBlock.x,m/dimBlock.y); //For the above eg, gridDim.x=32/16=2 and gridDim.y=32/16=2. Thus there are 2x2 = 4 blocks in the grid.
mul_multipleblocks<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,n); //Kernel call. Either 'n' or 'p' is passed since n=p
cudaMemcpy(c_h,c_d,m*q*sizeof(int),cudaMemcpyDeviceToHost); //Retrieve results and pointed by c_h
printf("AxB:\n"); //Print result: Matrix C (order: mxq) = AxB
for(i=0;i<m;i++)
{
for(j=0;j<q;j++)
{
printf("%d ",c_h[i*q+j]);
}
printf("\n");
}
getch();
free(a_h); //Free respective memories.
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
else
{
printf("Multiplication not possible\n");
getch();
}
}
[/codebox]