Matrix multiplication---not getting correct answer? answer for matrix multiplicatin seems to be wron

I am trying to write a program for matrix multiplication. The two matrices are of 4000X4000 size. The code compiles correctly and runs without any problem, but answer seems to be wrong. Here’s my code, I would appreciate if somebody could help me with it.

#include<stdio.h>
#include<stdlib.h>

#define BLOCK_SIZE 16
#define N 4000
global void matmul(double *A, double B, double C, int wA, int wB, int hA)
{
int k;
int i=blockDim.x
blockIdx.x + threadIdx.x;
int j=blockDim.y
blockIdx.y + threadIdx.y;

if(i < wA && j < hA)
{
C[jwB+i]=0;
for(k=0;k<wA;k++)
{
C[j
wA+i]+=A[j*wA+k]B[kwB+i];
}
}
}

void randomInit(double* data, int size)
{
for (int i = 0; i < size; i++)
data[i] = 1.0;
}

int main(int argc, charargv[]){
int Awidth,Aheight,Bwidth,Bheight,Cwidth,Cheight;
Awidth=N;
Aheight=N;
Bwidth=N;
Bheight=N;
Cwidth=N;
Cheight=N;unsigned int size_A=Awidth
Aheight*sizeof(double);
double h_A=(double)malloc(size_A);

unsigned int size_B=BwidthBheightsizeof(double);
double h_B=(double)malloc(size_B);

unsigned int size_C=CwidthCheightsizeof(double);
double h_C=(double)malloc(size_C);

int Awh=AwidthAheight;
int Bwh=Bwidth
Bheight;

randomInit(h_A,Awh);
randomInit(h_B,Bwh);

double d_A;
size_t Asize=Awidth
Aheight*sizeof(double);
cudaMalloc((void**)&d_A,Asize);

double d_B;
size_t Bsize=Bwidth
Bheight*sizeof(double);
cudaMalloc((void**)&d_B,Bsize);

double d_C;
size_t Csize=Cwidth
Cheight*sizeof(double);
cudaMalloc((void**)&d_C,Csize);

cudaMemcpy(d_A, h_A, Asize,cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, Bsize,cudaMemcpyHostToDevice);

clock_t c0,c1;
c0=clock();
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(ceil(Aheight/double(16)), ceil(Bwidth/double(16)));

matmul<<<dimGrid,dimBlock, Csize>>>(d_A, d_B, d_C, Awidth, Bwidth, Aheight);
cudaThreadSynchronize();
cudaMemcpy(h_C, d_C, CwidthCheightsizeof(double), cudaMemcpyDeviceToHost);

c1=clock();
double gput=((double)(c1-c0))/CLOCKS_PER_SEC;

printf(“GPU Matrix multiplication done in %f sec\n”,gput);
int i;

//Check the answer for first 10 elements of C
for(i=0;i<10;i++){
printf("\n%f\n",h_C[i]);
}

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}