I am trying to write a program for matrix multiplication. The two matrices are of 4000X4000 size. The code compiles correctly and runs without any problem, but answer seems to be wrong. I am using Tesla 1060 and host is fedora 10 (64 bit) on 64 bit AMD. Here’s my code, I would appreciate if somebody could help me with it.
#include<stdio.h>
#include<stdlib.h>
#define BLOCK_SIZE 16
#define N 4000
global void matmul(double *A, double B, double C, int wA, int wB, int hA)
{
int k;
int i=blockDim.xblockIdx.x + threadIdx.x;
int j=blockDim.yblockIdx.y + threadIdx.y;
if(i < wA && j < hA)
{
C[jwB+i]=0;
for(k=0;k<wA;k++)
{
C[jwA+i]+=A[j*wA+k]B[kwB+i];
}
}
}
void randomInit(double* data, int size)
{
for (int i = 0; i < size; i++)
data[i] = 1.0;
}
int main(int argc, charargv[]){
int Awidth,Aheight,Bwidth,Bheight,Cwidth,Cheight;
Awidth=N;
Aheight=N;
Bwidth=N;
Bheight=N;
Cwidth=N;
Cheight=N;unsigned int size_A=AwidthAheight*sizeof(double);
double h_A=(double)malloc(size_A);
unsigned int size_B=BwidthBheightsizeof(double);
double h_B=(double)malloc(size_B);
unsigned int size_C=CwidthCheightsizeof(double);
double h_C=(double)malloc(size_C);
int Awh=AwidthAheight;
int Bwh=BwidthBheight;
randomInit(h_A,Awh);
randomInit(h_B,Bwh);
double d_A;
size_t Asize=AwidthAheight*sizeof(double);
cudaMalloc((void**)&d_A,Asize);
double d_B;
size_t Bsize=BwidthBheight*sizeof(double);
cudaMalloc((void**)&d_B,Bsize);
double d_C;
size_t Csize=CwidthCheight*sizeof(double);
cudaMalloc((void**)&d_C,Csize);
cudaMemcpy(d_A, h_A, Asize,cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, Bsize,cudaMemcpyHostToDevice);
clock_t c0,c1;
c0=clock();
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(ceil(Aheight/double(16)), ceil(Bwidth/double(16)));
matmul<<<dimGrid,dimBlock, Csize>>>(d_A, d_B, d_C, Awidth, Bwidth, Aheight);
cudaThreadSynchronize();
cudaMemcpy(h_C, d_C, CwidthCheightsizeof(double), cudaMemcpyDeviceToHost);
c1=clock();
double gput=((double)(c1-c0))/CLOCKS_PER_SEC;
printf(“GPU Matrix multiplication done in %f sec\n”,gput);
int i;
//Check the answer for first 10 elements of C
for(i=0;i<10;i++){
printf(“\n%f\n”,h_C[i]);
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}