Hello,
I wrote the following program that performs matrix multiplication of two square matrrices of equal dimensions. The CUDA compiler shows no error but when I execute the program on my 9600GT, it prints correctly only first row of the product matrix (matrix c or c_d on device and c_h on host ), rest are garbage values. The kernal is actually is given here on page number 9 MAtrix MultiPlication by DAvid Kirk .
Although this might be a very trivial question, but being new to CUDA I am unable to resolve it. Can someone have a look at the kernal and help me out.
Thanks in advance
/* This is a CUDA program that performs matrix multiplication on square matrices of equal dimensions */
*********/
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <cuda.h>
// keranl that runs on Device
global void matrixmul(int *a, int *b, int *c, int width )
{
int i=threadIdx.x;
int j=threadIdx.y;
int k,m,n,sum;
//* sum stores the values computd by the thread
sum=0;
for(k=0;k<width;++k)
{
m= a[jwidth+k];
n= b[kwidth+i];
sum =sum+(mn);
}
c[jwidth+i]=sum; /* c is the product matrix
}
//main();
int main()
{
int i,j,m,n,k,width,sum;
int *a_h,*b_h,*a_d,*b_d,*c_h,c_d;
const int N=1000;
size_t size= Nsizeof(int);
//Memory allocation on host and device, a_h, a_d
a_h=(int*)malloc(size);
cudaMalloc((void**)&a_d,size);
//Memory allocation on host and device, b_h, b_d
b_h=(int*)malloc(size);
cudaMalloc((void**)&b_d,size);
//Memory allocation on host and device, c_h, c_d
c_h=(int*)malloc(size);
cudaMalloc((void**)&c_d,size);
//User inputs (note that row=columns)
printf(“enter the row & coloum of the 1st matrix m “);
scanf(”%d%d”,&width,&width); //row = columns=width
printf(“enter the element of 1st matrix m”);
for(i=0;i<(widthwidth);i++)
{
scanf(“%d”,&a_h[i]);
}
for(i=0;i<(widthwidth);i++)
printf(“\t%d”,a_h[i]);
{
printf(“\n”);
}
//copying data (a_h) from Host to Device in a_d
cudaMemcpy(a_d,a_h,size,cudaMemcpyHostToDevice);
printf(“enter the row & coloum of 2nd matrix n”);
scanf(“%d%d”,&width,&width); //row = columns
printf(“enter the element of 2nd matrix n”);
for(j=0;j<(widthwidth);j++)
{
scanf(“%d”,&b_h[j]);
}
for(j=0;j<(widthwidth);j++)
printf(“\t%d”,b_h[j]);
{
printf(“\n”);
}
//copying data (b_h) from Host to Device in b_d
cudaMemcpy(b_d,b_h,size,cudaMemcpyHostToDevice);
//Kernal call
matrixmul<<<1,4>>>(a_d,b_d,c_d,width );
//copying data (c_d) from Device to Host in c_h
cudaMemcpy(c_h,c_d,size,cudaMemcpyDeviceToHost);
//printing the results
for(i=0;i<(width*width);i++)
{
printf(“%d”,c_h[i]);
}
getch();
free(a_h);
cudaFree(a_d);
free(b_h);
cudaFree(b_d);
free(c_h);
cudaFree(c_d);
}