Why is there a difference in memory copy speed between CPU→GPU and GPU→CPU?

I measured the memory copy speed for CPU→GPU and GPU→CPU with the simple program below.
I used the Intel Math Kernel Library (MKL) to measure the processing time.

I changed the size N of the float array A and measured the times for MemCpyHosttoDevice and MemCpyDevicetoHost.
Results
1. N=64
Average MemCpyHosttoDevice: 0.016046679 [msec]
Average MemCpyDevicetoHost: 1.30367995 [msec]

2. N=128
Average MemCpyHosttoDevice: 0.01684861 [msec]
Average MemCpyDevicetoHost: 5.08382114 [msec]

3. N=256
Average MemCpyHosttoDevice: 0.013203374 [msec]
Average MemCpyDevicetoHost: 21.7094144 [msec]

4. N=512
Average MemCpyHosttoDevice: 0.016856464 [msec]
Average MemCpyDevicetoHost: 49.8819403 [msec]

I have two questions about these results.

1. The time for MemCpyHosttoDevice did not depend on the size N of array A, but the time for MemCpyDevicetoHost did depend on it. Is that natural?

2. MemCpyDevicetoHost took much longer than MemCpyHosttoDevice. I cannot see the reason for this difference.

I'd really appreciate it if anyone could answer my questions.

Regards,

#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <time.h>
#include "mkl_service.h"

#define N 64

// Single-thread kernel: computes A_d = B_d * C_d (N x N matrix times N-vector)
__global__ void matrix_vector_multi_gpu_1_1(float *A_d,float *B_d,float *C_d)
{
int i,j;

for(j=0;j<N;j++){
	A_d[j] = 0.0F;
	for(i=0;i<N;i++){
		A_d[j] = A_d[j] + B_d[j*N + i]*C_d[i];
	}
}

}

int main()
{

int i,j,k,L;
float A[N], B[N*N] ,C[N];
float *A_d, *B_d ,*C_d;

unsigned long long t1,t2,t3,t4;



double pGhz0 = 1.0;

dim3 blocks(1,1,1);
dim3 threads(1,1,1);

for(j=0;j<N;j++){
	for(i=0;i< N;i++){
		B[j*N + i] = ((float)j)/256.0;
	}
}

for(j=0;j<N;j++){
	C[j] = 1.0F;
}

cudaMalloc((void**)&A_d, N*sizeof(float));
cudaMalloc((void**)&B_d, N*N*sizeof(float));
cudaMalloc((void**)&C_d, N*sizeof(float));

for(k=0;k<10;k++){

pGhz0 = mkl_get_cpu_frequency();
mkl_get_cpu_clocks(&t1);


cudaMemcpy(A_d, A ,N*sizeof(float), cudaMemcpyHostToDevice);



mkl_get_cpu_clocks(&t2);


cudaMemcpy(B_d, B ,N*N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(C_d, C ,N*sizeof(float), cudaMemcpyHostToDevice);

matrix_vector_multi_gpu_1_1<<< blocks, threads >>>(A_d,B_d,C_d);

mkl_get_cpu_clocks(&t3);

cudaMemcpy(A, A_d ,N*sizeof(float), cudaMemcpyDeviceToHost);


mkl_get_cpu_clocks(&t4);


printf("CPU Frequency:%12.8g[GHz] MemCpyHosttoDevice Time;%12.8g[msec]\n",pGhz0,(double)(t2-t1)/pGhz0*1.e-6);
printf("CPU Frequency:%12.8g[GHz] MemCpyDevicetoHost Time;%12.8g[msec]\n",pGhz0,(double)(t4-t3)/pGhz0*1.e-6);

}


//for(j=0;j<N;j++){
//	printf("A[ %d ] = %f \n",j,A[j]);
//}

cudaFree(A_d);
cudaFree(B_d);
cudaFree(C_d);


return 0;

}

A little tip: I would recommend using CUDA events (alongside MKL) to measure the real time of any CUDA calls.

Your measurement framework would appear to report the combined time for kernel execution plus the device->host copy: kernel launches are asynchronous, so the cudaMemcpy back to the host has to wait for the kernel to finish before it can start. You would want to insert a call to cudaDeviceSynchronize() directly after the kernel call.
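
For example, a minimal sketch of timing just the device->host copy with CUDA events might look like the following (the names h_buf/d_buf and the fixed size N are placeholders, not taken from your program):

#include <stdio.h>
#include <cuda_runtime.h>

#define N 64

int main(void)
{
	float h_buf[N] = {0};
	float *d_buf;
	cudaEvent_t start, stop;
	float ms = 0.0F;

	cudaMalloc((void**)&d_buf, N*sizeof(float));
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);                /* mark start on the default stream */
	cudaMemcpy(h_buf, d_buf, N*sizeof(float), cudaMemcpyDeviceToHost);
	cudaEventRecord(stop, 0);                 /* mark end on the default stream */
	cudaEventSynchronize(stop);               /* wait until the copy has finished */

	cudaEventElapsedTime(&ms, start, stop);   /* elapsed time in milliseconds */
	printf("MemCpyDevicetoHost (CUDA events): %f [msec]\n", ms);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d_buf);
	return 0;
}

cudaEventElapsedTime() reports milliseconds directly, so no CPU-frequency conversion is needed.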

Thank you for your advice, njuffa.

I modified my code according to your advice as shown below.
Is it right?

I'd really appreciate it if you could give me your advice about the new code.

Regards,

=============================================

#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <time.h>
#include "mkl_service.h"

#define N 64

// Single-thread kernel: computes A_d = B_d * C_d (N x N matrix times N-vector)
__global__ void matrix_vector_multi_gpu_1_1(float *A_d,float *B_d,float *C_d)
{
int i,j;

for(j=0;j<N;j++){
	A_d[j] = 0.0F;
	for(i=0;i<N;i++){
		A_d[j] = A_d[j] + B_d[j*N + i]*C_d[i];
	}
}

}

int main()
{

int i,j,k,L;
float A[N], B[N*N] ,C[N];
float *A_d, *B_d ,*C_d;

unsigned long long t1,t2,t3,t4;



double pGhz0 = 1.0;

dim3 blocks(1,1,1);
dim3 threads(1,1,1);

for(j=0;j<N;j++){
	for(i=0;i< N;i++){
		B[j*N + i] = ((float)j)/256.0;
	}
}

for(j=0;j<N;j++){
	C[j] = 1.0F;
}

cudaMalloc((void**)&A_d, N*sizeof(float));
cudaMalloc((void**)&B_d, N*N*sizeof(float));
cudaMalloc((void**)&C_d, N*sizeof(float));

for(k=0;k<10;k++){

pGhz0 = mkl_get_cpu_frequency();

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t1);


cudaMemcpy(A_d, A ,N*sizeof(float), cudaMemcpyHostToDevice);


cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t2);


cudaMemcpy(B_d, B ,N*N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(C_d, C ,N*sizeof(float), cudaMemcpyHostToDevice);

matrix_vector_multi_gpu_1_1<<< blocks, threads >>>(A_d,B_d,C_d);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t3);

cudaMemcpy(A, A_d ,N*sizeof(float), cudaMemcpyDeviceToHost);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t4);


printf("CPU Frequency:%12.8g[GHz] MemCpyHosttoDevice Time;%12.8g[msec]\n",pGhz0,(double)(t2-t1)/pGhz0*1.e-6);
printf("CPU Frequency:%12.8g[GHz] MemCpyDevicetoHost Time;%12.8g[msec]\n",pGhz0,(double)(t4-t3)/pGhz0*1.e-6);

}


//for(j=0;j<N;j++){
//	printf("A[ %d ] = %f \n",j,A[j]);
//}

cudaFree(A_d);
cudaFree(B_d);
cudaFree(C_d);


return 0;

}