This is my program:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Matrix dimension: every matrix in this file is N x N ints.
#define N 200
// Edge length of the square shared-memory tiles and of each thread block.
// The kernel's tile loop runs N/TILE_WIDTH times (integer division), so N
// should be a multiple of TILE_WIDTH.
#define TILE_WIDTH 20
/*
 * Tiled integer matrix multiply: C = A * B for row-major N x N matrices.
 *
 * Launch layout: a 2D grid of (N/TILE_WIDTH) x (N/TILE_WIDTH) blocks, each
 * with TILE_WIDTH x TILE_WIDTH threads; every thread produces one element
 * of C. There is no bounds guard, so N must be a multiple of TILE_WIDTH and
 * the launch must match the layout above exactly.
 *
 * Shared memory: two static TILE_WIDTH x TILE_WIDTH int tiles per block.
 *
 * A, B: device pointers to the input matrices (read only).
 * C:    device pointer to the output matrix (fully overwritten).
 */
__global__ void MatMul(const int* A, const int* B, int* C) {
    // Per-block staging tiles for one TILE_WIDTH-wide slice of A and of B.
    __shared__ int tileA[TILE_WIDTH][TILE_WIDTH];
    __shared__ int tileB[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Global row/column of the C element this thread owns.
    const int row = blockIdx.y * TILE_WIDTH + ty;
    const int col = blockIdx.x * TILE_WIDTH + tx;

    int sum = 0;

    // Walk the shared dimension one tile at a time.
    for (int t = 0; t < N / TILE_WIDTH; ++t) {
        // Each thread loads one element of each tile; consecutive tx values
        // touch consecutive addresses in both A and B.
        tileA[ty][tx] = A[row * N + (t * TILE_WIDTH + tx)];
        tileB[ty][tx] = B[(t * TILE_WIDTH + ty) * N + col];

        // Both tiles must be completely written before anyone reads them.
        __syncthreads();

        // Partial dot product over this tile.
        for (int k = 0; k < TILE_WIDTH; ++k) {
            sum += tileA[ty][k] * tileB[k][tx];
        }

        // Every thread must finish reading the tiles before the next
        // iteration overwrites them; without this barrier faster threads
        // race ahead and corrupt data slower threads are still using.
        __syncthreads();
    }

    C[row * N + col] = sum;
}
// Aborts with a message naming the failed step when a CUDA runtime call
// does not return cudaSuccess.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills two N x N matrices, multiplies them on the GPU with
 * MatMul, times the kernel with CUDA events, prints every element of the
 * product followed by the elapsed time, then waits for Enter.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
 */
int main( void ) {
    // Host copies of a, b, c. Static storage keeps the three N*N-int
    // matrices off the stack.
    static int a[N][N], b[N][N], c[N][N];
    // Device copies of a, b, c.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = (size_t)N * N * sizeof(int);

    // Allocate the matrices on the GPU.
    checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");

    // Fill 'a' and 'b' on the CPU. With these 0-based indices every element
    // of the product equals sum_{k=0}^{N-1} (k+3)*(k+6), which is 2829400
    // for N = 200. (The same fill with 1-based indices, k = 1..N, would
    // give 2871200 instead.)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = j + 3;
            b[i][j] = i + 6;
        }
    }

    // Copy the inputs to the device.
    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(a -> dev_a)");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(b -> dev_b)");

    // Prepare the timer.
    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Start recording.
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    // One TILE_WIDTH x TILE_WIDTH block per output tile; the grid is derived
    // from N so it stays consistent with the kernel's tiling.
    dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    MatMul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    // Launch-configuration errors are only reported through the last-error
    // state, not by the launch itself.
    checkCuda(cudaGetLastError(), "MatMul launch");

    // Stop recording and wait for the kernel to finish.
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    // Kernel execution time in milliseconds.
    checkCuda(cudaEventElapsedTime(&elapsedMs, start, stop),
              "cudaEventElapsedTime");

    // Clean up the timer.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Copy the result back to the host.
    checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(dev_c -> c)");

    // Print every element of the product.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i][j]);
        }
    }

    // Free the device allocations.
    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

    printf("\n multiplication done!!!\n");
    printf("\n");
    printf(" time elapsed in ms=%f\n", elapsedMs);

    // Keep the console open until the user presses Enter (portable
    // replacement for the non-standard getch()).
    getchar();
    return 0;
}
I am getting a matrix in which every value is 2829400.
I checked in MATLAB, and the result should be a matrix in which every value is 2871200.