I am trying to write a program for matrix multiplication. The two matrices are of 4000X4000 size. The code compiles correctly and runs without any problem, but answer seems to be wrong. I am using Tesla 1060 and host is fedora 10 (64 bit) on 64 bit AMD. Here’s my code, I would appreciate if somebody could help me with it.

#include<stdio.h>

#include<stdlib.h>

#define BLOCK_SIZE 16

#define N 4000

**global** void matmul(double *A, double *B, double C, int wA, int wB, int hA)
{
int k;
int i=blockDim.xblockIdx.x + threadIdx.x;
int j=blockDim.y*blockIdx.y + threadIdx.y;

if(i < wA && j < hA)

{

C[j*wB+i]=0;
for(k=0;k<wA;k++)
{
C[j*wA+i]+=A[j*wA+k]

*B[k*wB+i];

}

}

}

void randomInit(double* data, int size)

{

for (int i = 0; i < size; i++)

data[i] = 1.0;

}

int main(int argc, char*argv[]){
int Awidth,Aheight,Bwidth,Bheight,Cwidth,Cheight;
Awidth=N;
Aheight=N;
Bwidth=N;
Bheight=N;
Cwidth=N;
Cheight=N;unsigned int size_A=Awidth*Aheight*sizeof(double);

double

*h_A=(double*)malloc(size_A);

unsigned int size_B=Bwidth*Bheight*sizeof(double);

double *h_B=(double*)malloc(size_B);

unsigned int size_C=Cwidth*Cheight*sizeof(double);

double *h_C=(double*)malloc(size_C);

int Awh=Awidth*Aheight;
int Bwh=Bwidth*Bheight;

randomInit(h_A,Awh);

randomInit(h_B,Bwh);

double *d_A;
size_t Asize=Awidth*Aheight*sizeof(double);

cudaMalloc((void**)&d_A,Asize);

double *d_B;
size_t Bsize=Bwidth*Bheight*sizeof(double);

cudaMalloc((void**)&d_B,Bsize);

double *d_C;
size_t Csize=Cwidth*Cheight*sizeof(double);

cudaMalloc((void**)&d_C,Csize);

cudaMemcpy(d_A, h_A, Asize,cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, Bsize,cudaMemcpyHostToDevice);

clock_t c0,c1;

c0=clock();

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

dim3 dimGrid(ceil(Aheight/double(16)), ceil(Bwidth/double(16)));

matmul<<<dimGrid,dimBlock, Csize>>>(d_A, d_B, d_C, Awidth, Bwidth, Aheight);

cudaThreadSynchronize();

cudaMemcpy(h_C, d_C, Cwidth*Cheight*sizeof(double), cudaMemcpyDeviceToHost);

c1=clock();

double gput=((double)(c1-c0))/CLOCKS_PER_SEC;

printf(“GPU Matrix multiplication done in %f sec\n”,gput);

int i;

//Check the answer for first 10 elements of C

for(i=0;i<10;i++){

printf("\n%f\n",h_C[i]);

}

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

}