2D Matrix Multiply with cuda....I made it all but time duration is something strange.

Hello, Cuda Developers!
I made a simple 2D matrix Multiplication with cuda following cudes.
However, i checked time duration between CPU and GPU time duration and founded there is something wrong with my result.

I got nothing in CPU timr record and GPU timer is slower than CPU.
I checked several times and remake these code again and again but no answers.
Please give me advices to my code.

This is my GPU matrix multiply

__global__ void GPUprocess(int *mA, int *mB, int*res) {
	int row = threadIdx.y + blockIdx.y*blockDim.y;
	int col = threadIdx.x + blockIdx.x*blockDim.x;
	int values = 0;
	if (row < N && col < N) {
		for (int inner = 0; inner < N; inner++) {
			values += mA[row*N+inner] * mB[inner*N+col];
		}
	}
	res[row*N + col] = values;
}

This is my CPU matrix multiply

void CPUprocess(int mA[][N], int mB[][N], int res[][N]) {
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			int values = 0;
			for (int inner = 0; inner < N; inner++) {
				values += mA[i][inner] * mB[inner][j];
			}
			res[i][j] = values;
		}
	}
}

This is main function

int main() {
	srand((unsigned)time(NULL));
	int mA[N][N], mB[N][N], cpures[N][N];
	
	clock_t cpustart, cpuend;

	// Random number
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			mA[i][j] = rand() % N;
			mB[i][j] = rand() % N;
			cpures[i][j] = NULL;
		}
	}

	//CPU 2D Matrix Multiply
	cpustart = clock();
	CPUprocess(mA, mB, cpures);
	cpuend = clock();
	double cpums = (double)((double)cpuend - cpustart / CLOCKS_PER_SEC);

	// Check Matrix Result from CUP
	printf("CPU Matrix Multiply===================\n");
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			printf("%d ", cpures[i][j]);
		}
		printf("\n");
	}
	printf("\n");

	int host_mA[N*N], host_mB[N*N], gpures[N*N];

	int start = 0;
	for (int i = 0; i < N; i++) {
		for (int j = 0; j < N; j++) {
			host_mA[start] = mA[i][j];
			host_mB[start] = mB[i][j];
			gpures[start] = 0;
			start++;
		}
	}

	int *dev_mA, *dev_mB, *dev_res;
	int size = sizeof(int)*N*N;
	
	printf("CUDA memory Alloc=================\n");
	if (cudaMalloc((void**)&dev_mA, size) != cudaSuccess) {
		printf("Problem with memory alloc with dev_mA\n");
	}
	if (cudaMalloc((void**)&dev_mB, size) != cudaSuccess) {
		printf("Problem with memory alloc with dev_mB\n");
	}
	if (cudaMalloc((void**)&dev_res, size) != cudaSuccess) {
		printf("Problem with memory alloc with dev_res\n");
	}

	printf("CUDA memory copy=================\n");
	if (cudaMemcpy(dev_mA, host_mA, size, cudaMemcpyHostToDevice) != cudaSuccess) {
		printf("Failed to copy a host_mA to dev_mA. (Host->Device). \n");
	}
	if (cudaMemcpy(dev_mB, host_mB, size, cudaMemcpyHostToDevice) != cudaSuccess) {
		printf("Failed to copy a host_mA to dev_mB. (Host->Device). \n");
	}

	//////////////////////////////////////////////////////////////////////
	dim3 block(THREADS, THREADS,1);
	dim3 grid( (N + THREADS) / THREADS,
			   (N + THREADS) / THREADS, 1);


	printf("CUDA 2D GPU matrix Multiply======\n");
	clock_t gpustart, gpuend;
	gpustart = clock();
		
	GPUprocess<<<grid, block>>>(dev_mA, dev_mB, dev_res);

	gpuend = clock();
	double ms = (double)((double)gpuend - gpustart / CLOCKS_PER_SEC);

	printf("CUDA memory copy=================\n");
	if (cudaMemcpy(gpures, dev_res, size, cudaMemcpyDeviceToHost) != cudaSuccess) {
		printf("Failed to copy a host_mA to dev_mB. (Host->Device). \n");
	}


	// Check 
	for (int i = 0, int phase = 1; i < N*N; i++, phase++) {
		printf("%d ", gpures[i]);
		if (phase%N == 0) {
			printf("\n");
		}
	}
	printf("\n");

	printf("CPU Matirx Multiply time duration : %.50lf \n", cpums);
	printf("GPU Matirx Multiply time duration : %.50lf \n", ms);

	return 0;
}