Getting wrong values in matrix multiplication

This is my program:


#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 200
#define TILE_WIDTH 20

/*
 * Tiled integer matrix multiplication: C = A * B, where A, B and C are
 * row-major N x N matrices stored in device memory.
 *
 * Expected launch layout:
 *   grid  = (N / TILE_WIDTH, N / TILE_WIDTH, 1)
 *   block = (TILE_WIDTH, TILE_WIDTH, 1)
 * Each thread produces exactly one element of C.
 *
 * Preconditions: N must be a multiple of TILE_WIDTH and the block shape must
 * be TILE_WIDTH x TILE_WIDTH; the kernel performs no bounds checks.
 *
 * Shared memory: two statically sized TILE_WIDTH x TILE_WIDTH int tiles.
 */
__global__ void MatMul(int *A, int *B, int *C) {
    // Per-block staging tiles for the current slice of A and B.
    __shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
    __shared__ int temp2[TILE_WIDTH][TILE_WIDTH];

    int idx = threadIdx.x;
    int idy = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Global column (uidx) and row (uidy) of the C element owned by this thread.
    int uidx = bx * TILE_WIDTH + idx;
    int uidy = by * TILE_WIDTH + idy;

    int sum = 0;

    // Walk the tiles along the shared (inner) dimension.
    for (int i = 0; i < N / TILE_WIDTH; i++) {
        // Tile i of block-row `by` from A, and tile i of block-column `bx` from B.
        temp1[idy][idx] = A[uidy * N + (i * TILE_WIDTH + idx)];
        temp2[idy][idx] = B[(i * TILE_WIDTH + idy) * N + uidx];

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        // Partial dot product over this tile.
        for (int k = 0; k < TILE_WIDTH; k++) {
            sum += temp1[idy][k] * temp2[k][idx];
        }

        // Every thread must finish reading the tiles before the next iteration
        // overwrites them; without this barrier the loads above race with the
        // reads of slower warps.
        __syncthreads();
    }

    C[uidy * N + uidx] = sum;
}

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns non-zero when `err` is not cudaSuccess, zero otherwise.
 */
static int cudaFailed(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

/*
 * Host driver: fills two N x N matrices, multiplies them on the GPU with
 * MatMul, prints every element of the product followed by the elapsed
 * kernel time in milliseconds, then waits for a key press.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main( void ) {

    // Host copies of a, b, c. Static storage: three N x N int matrices are
    // about 480 KB for N = 200, which is too much to place safely on the stack.
    static int a[N][N], b[N][N], c[N][N];

    int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
    const size_t bytes = (size_t)N * N * sizeof(int);

    // allocate the memory on the GPU
    if (cudaFailed(cudaMalloc( (void**)&dev_a, bytes ), "cudaMalloc(dev_a)")) return 1;
    if (cudaFailed(cudaMalloc( (void**)&dev_b, bytes ), "cudaMalloc(dev_b)")) return 1;
    if (cudaFailed(cudaMalloc( (void**)&dev_c, bytes ), "cudaMalloc(dev_c)")) return 1;

    // fill the matrices 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = j + 3;
            b[i][j] = i + 6;
        }
    }

    // copy the a, b values to the device
    if (cudaFailed(cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ), "cudaMemcpy(a)")) return 1;
    if (cudaFailed(cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice ), "cudaMemcpy(b)")) return 1;

    // prepare timer
    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;

    if (cudaFailed(cudaEventCreate(&start), "cudaEventCreate(start)")) return 1;
    if (cudaFailed(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return 1;

    // start record
    if (cudaFailed(cudaEventRecord(start, 0), "cudaEventRecord(start)")) return 1;

    // Kernel invocation: one thread per output element, arranged as
    // (N / TILE_WIDTH)^2 blocks of TILE_WIDTH x TILE_WIDTH threads.
    // N must be a multiple of TILE_WIDTH.
    dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    MatMul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);

    // A launch does not return a status; bad configurations surface here.
    if (cudaFailed(cudaGetLastError(), "MatMul launch")) return 1;

    // stop record; the synchronize also surfaces faults raised inside the kernel
    if (cudaFailed(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")) return 1;
    if (cudaFailed(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) return 1;

    // this is the operation time
    if (cudaFailed(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime")) return 1;

    // clean up
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // copy result to host
    if (cudaFailed(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)")) return 1;

    // output: every element of c, space separated
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
    }

    // free the allocated memory on the device
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    printf("\n multiplication done!!!\n");
    printf("\n");
    printf(" time elapsed in ms=%f\n", elapsedMs);
    getch(); // keep the console window open until a key is pressed
    return 0;
}

I am getting a matrix in which every value is 2829400.
I checked in MATLAB, and every value should be 2871200.

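Every element of the product matrix is the same, because a[i][k] = k+3 does not depend on i and b[k][j] = k+6 does not depend on j. Each element is therefore equal to

$$\sum_{k=0}^{199} (k+3)(k+6)$$

which evaluates to 2829400.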

Check your Matlab calculation again.

Thanks… I was doing the calculation wrong in MATLAB.