I measuded memory copy speed of cpu>gpu and gpu > cpu by simple program as below.
I used Intel Math Kernel Liblary to mesure processing time.
I changed size of Dimention float A and mesure MemCpyHosttoDevice and MemCpyDevicetoHost;
Results
1.N=64
Average MemCpyHosttoDevice 0.016046679[msec]
Average MemCpyDevicetoHost 1.30367995[msec]
2.N=128
Average MemCpyHosttoDevice 0.01684861
Average MemCpyDevicetoHost 5.08382114
3.N=256
Average MemCpyHosttoDevice 0.013203374
Average MemCpyDevicetoHost 21.7094144
4.N=512
Average MemCpyHosttoDevice 0.016856464
Average MemCpyDevicetoHost 49.8819403
I have two questions about this results.
1.the time of MemCpyHosttoDevice did not dpend on the size of dimention A, but the time of MemCpyDevicetoHost dpended on the size of dimention A,Is it natural?
2.the time of MemCpyDevicetoHost needed a lot of time than MemCpyHosttoDevice. I can not the reason why of this difference.
I’d really appriciate it if anyone will answer my questions.
Regards,
#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <time.h>
#include “mkl_service.h”
#define N 64
global void matrix_vector_multi_gpu_1_1(float *A_d,float *B_d,float *C_d)
{
int i,j;
for(j=0;j<N;j++){
A_d[j] = 0.0F;
for(i=0;i<N;i++){
A_d[j] = A_d[j] + B_d[j*N + i]*C_d[i];
}
}
}
int main()
{
int i,j,k,L;
float A[N], B[N*N] ,C[N];
float *A_d, *B_d ,*C_d;
unsigned long long t1,t2,t3,t4;
double pGhz0 = 1.0;
dim3 blocks(1,1,1);
dim3 threads(1,1,1);
for(j=0;j<N;j++){
for(i=0;i< N;i++){
B[j*N + i] = ((float)j)/256.0;
}
}
for(j=0;j<N;j++){
C[j] = 1.0F;
}
cudaMalloc((void**)&A_d, N*sizeof(float));
cudaMalloc((void**)&B_d, N*N*sizeof(float));
cudaMalloc((void**)&C_d, N*sizeof(float));
for(k=0;k<10;k++){
pGhz0 = mkl_get_cpu_frequency();
mkl_get_cpu_clocks(&t1);
cudaMemcpy(A_d, A ,N*sizeof(float), cudaMemcpyHostToDevice);
mkl_get_cpu_clocks(&t2);
cudaMemcpy(B_d, B ,N*N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(C_d, C ,N*sizeof(float), cudaMemcpyHostToDevice);
matrix_vector_multi_gpu_1_1<<< blocks, threads >>>(A_d,B_d,C_d);
mkl_get_cpu_clocks(&t3);
cudaMemcpy(A, A_d ,N*sizeof(float), cudaMemcpyDeviceToHost);
mkl_get_cpu_clocks(&t4);
printf("CPU Frequency:%12.8g[GHz] MemCpyHosttoDevice Time;%12.8g[msec]\n",pGhz0,(double)(t2-t1)/pGhz0*1.e-6);
printf("CPU Frequency:%12.8g[GHz] MemCpyDevicetoHost Time;%12.8g[msec]\n",pGhz0,(double)(t4-t3)/pGhz0*1.e-6);
}
//for(j=0;j<N;j++){
// printf("A[ %d ] = %f \n",j,A[j]);
//}
cudaFree(A_d);
cudaFree(B_d);
cudaFree(C_d);
return 0;
}