# 2D Matrix Multiply with cuda....I made it all but time duration is something strange.

Hello, Cuda Developers!
I made a simple 2D matrix Multiplication with cuda following cudes.
However, i checked time duration between CPU and GPU time duration and founded there is something wrong with my result.

I got nothing in CPU timr record and GPU timer is slower than CPU.
I checked several times and remake these code again and again but no answers.

This is my GPU matrix multiply

``````__global__ void GPUprocess(int *mA, int *mB, int*res) {
int row = threadIdx.y + blockIdx.y*blockDim.y;
int col = threadIdx.x + blockIdx.x*blockDim.x;
int values = 0;
if (row < N && col < N) {
for (int inner = 0; inner < N; inner++) {
values += mA[row*N+inner] * mB[inner*N+col];
}
}
res[row*N + col] = values;
}
``````

This is my CPU matrix multiply

``````void CPUprocess(int mA[][N], int mB[][N], int res[][N]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
int values = 0;
for (int inner = 0; inner < N; inner++) {
values += mA[i][inner] * mB[inner][j];
}
res[i][j] = values;
}
}
}
``````

This is main function

``````int main() {
srand((unsigned)time(NULL));
int mA[N][N], mB[N][N], cpures[N][N];

clock_t cpustart, cpuend;

// Random number
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
mA[i][j] = rand() % N;
mB[i][j] = rand() % N;
cpures[i][j] = NULL;
}
}

//CPU 2D Matrix Multiply
cpustart = clock();
CPUprocess(mA, mB, cpures);
cpuend = clock();
double cpums = (double)((double)cpuend - cpustart / CLOCKS_PER_SEC);

// Check Matrix Result from CUP
printf("CPU Matrix Multiply===================\n");
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
printf("%d ", cpures[i][j]);
}
printf("\n");
}
printf("\n");

int host_mA[N*N], host_mB[N*N], gpures[N*N];

int start = 0;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
host_mA[start] = mA[i][j];
host_mB[start] = mB[i][j];
gpures[start] = 0;
start++;
}
}

int *dev_mA, *dev_mB, *dev_res;
int size = sizeof(int)*N*N;

printf("CUDA memory Alloc=================\n");
if (cudaMalloc((void**)&dev_mA, size) != cudaSuccess) {
printf("Problem with memory alloc with dev_mA\n");
}
if (cudaMalloc((void**)&dev_mB, size) != cudaSuccess) {
printf("Problem with memory alloc with dev_mB\n");
}
if (cudaMalloc((void**)&dev_res, size) != cudaSuccess) {
printf("Problem with memory alloc with dev_res\n");
}

printf("CUDA memory copy=================\n");
if (cudaMemcpy(dev_mA, host_mA, size, cudaMemcpyHostToDevice) != cudaSuccess) {
printf("Failed to copy a host_mA to dev_mA. (Host->Device). \n");
}
if (cudaMemcpy(dev_mB, host_mB, size, cudaMemcpyHostToDevice) != cudaSuccess) {
printf("Failed to copy a host_mA to dev_mB. (Host->Device). \n");
}

//////////////////////////////////////////////////////////////////////

printf("CUDA 2D GPU matrix Multiply======\n");
clock_t gpustart, gpuend;
gpustart = clock();

GPUprocess<<<grid, block>>>(dev_mA, dev_mB, dev_res);

gpuend = clock();
double ms = (double)((double)gpuend - gpustart / CLOCKS_PER_SEC);

printf("CUDA memory copy=================\n");
if (cudaMemcpy(gpures, dev_res, size, cudaMemcpyDeviceToHost) != cudaSuccess) {
printf("Failed to copy a host_mA to dev_mB. (Host->Device). \n");
}

// Check
for (int i = 0, int phase = 1; i < N*N; i++, phase++) {
printf("%d ", gpures[i]);
if (phase%N == 0) {
printf("\n");
}
}
printf("\n");

printf("CPU Matirx Multiply time duration : %.50lf \n", cpums);
printf("GPU Matirx Multiply time duration : %.50lf \n", ms);

return 0;
}
``````