Is my way of measuring CUDA processing time reasonable?

I measured the CUDA processing times using C code and the Intel MKL library.
My code is below.

I changed the data type (float, double) and the dimension sizes, and tried various GPU boards.
I got the results shown at the URL below.

The CUDA kernel processing time (__global__ void matrix_vector_multi_gpu_1_1) increased in proportion to the dimension sizes (A_d, B_d, C_d).

But the memory copy times (host to device, device to host) did not necessarily increase in proportion to the dimension sizes.
Is this result reasonable?
If it is, please tell me why the memory copy times (host to device, device to host) do not necessarily increase in proportion to the dimension sizes.
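
My guess (which I have not confirmed) is that each cudaMemcpy carries a roughly fixed setup overhead, so small transfers are dominated by that latency rather than by transfer bandwidth, and only large transfers grow with size. To check how the copy time scales with size independently of the MKL clock counters, I could time single host-to-device copies at several sizes with CUDA events, as in the sketch below (the sizes, the warm-up copy, and the event-based timing are my own choices, not part of the code at the bottom):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

int main(void)
{
	size_t bytes;

	/* Transfer sizes from 1 KiB up to 64 MiB (arbitrary choices). */
	for(bytes = (size_t)1 << 10; bytes <= ((size_t)64 << 20); bytes <<= 2){
		char *h = (char*)malloc(bytes);
		char *d;
		memset(h, 0, bytes);
		cudaMalloc((void**)&d, bytes);

		cudaEvent_t start, stop;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);

		/* Warm-up copy so one-time driver costs are not timed. */
		cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);

		cudaEventRecord(start);
		cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
		cudaEventRecord(stop);
		cudaEventSynchronize(stop);

		float msec = 0.0f;
		cudaEventElapsedTime(&msec, start, stop);
		printf("%10lu bytes: %12.6f msec  %8.3f GB/s\n",
		       (unsigned long)bytes, msec, (double)bytes/(msec*1.0e6));

		cudaEventDestroy(start);
		cudaEventDestroy(stop);
		cudaFree(d);
		free(h);
	}
	return 0;
}

If the measured time flattens out at small sizes instead of shrinking in proportion, that would support the fixed-overhead explanation.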

Regards,

=============== used code =======================

#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <time.h>
#include "mkl_service.h"

#define N 256

/* Single-thread kernel: computes A_d = B_d * C_d (an N x N matrix times an N-vector). */
__global__ void matrix_vector_multi_gpu_1_1(double *A_d, double *B_d, double *C_d)
{
int i,j;

for(j=0;j<N;j++){
	A_d[j] = 0.0F;
	for(i=0;i<N;i++){
		A_d[j] = A_d[j] + B_d[j*N + i]*C_d[i];
	}
}

}

int main()
{

int i,j,k;
double A[N], B[N*N], C[2*N];
double *A_d, *B_d, *C_d;

unsigned long long t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;

double pGhz0 = 1.0, pGhz1 = 1.0, pGhz2 = 1.0, pGhz3 = 1.0, pGhz4 = 1.0, pGhz5 = 1.0, pGhz6 = 1.0;

/* Launch configuration: one block with one thread, so the kernel runs serially on the GPU. */
dim3 blocks(1,1,1);
dim3 threads(1,1,1);

for(j=0;j<N;j++){
	A[j] = 0.0;	/* initialize A; it is copied host-to-device below */
	for(i=0;i<N;i++){
		B[j*N + i] = ((double)j)/256.0;
	}
}

for(j=0;j<2*N;j++){
	C[j] = 1.0F;
}

for(k=0;k<10;k++){

cudaMalloc((void**)&A_d, N*sizeof(double));
cudaMalloc((void**)&B_d, N*N*sizeof(double));
cudaMalloc((void**)&C_d, 2*N*sizeof(double));

/* Timing scheme: synchronize, sample the CPU clock counter, run the operation,
   synchronize again, and sample again; elapsed clocks divided by the CPU
   frequency (GHz) give the elapsed time in nanoseconds. */
cudaDeviceSynchronize(); pGhz0 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t1);

cudaMemcpy(A_d, A, N*sizeof(double), cudaMemcpyHostToDevice);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t2);


cudaDeviceSynchronize(); pGhz1 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t3);

cudaMemcpy(B_d, B, N*N*sizeof(double), cudaMemcpyHostToDevice);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t4);


cudaDeviceSynchronize(); pGhz2 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t5);

cudaMemcpy(C_d, C, 2*N*sizeof(double), cudaMemcpyHostToDevice);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t6);


cudaDeviceSynchronize(); pGhz6 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t13);
matrix_vector_multi_gpu_1_1<<< blocks, threads >>>(A_d,B_d,C_d);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t14);


cudaDeviceSynchronize(); pGhz3 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t7);
cudaMemcpy(A, A_d, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t8);


cudaDeviceSynchronize(); pGhz4 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t9);
cudaMemcpy(B, B_d, N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t10);


cudaDeviceSynchronize(); pGhz5 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t11);
cudaMemcpy(C, C_d, 2*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t12);



printf("CPU:%lf[GHz] MemCpyHosttoDevice1 N:  %6d Time;%12.8g[msec]\n",pGhz0,N,  (double)(t2-t1)/pGhz0*1.e-6);
printf("CPU:%lf[GHz] MemCpyHosttoDevice2 N*N %6d Time;%12.8g[msec]\n",pGhz1,N*N,(double)(t4-t3)/pGhz1*1.e-6);
printf("CPU:%lf[GHz] MemCpyHosttoDevice3 2*N %6d Time;%12.8g[msec]\n",pGhz2,2*N,(double)(t6-t5)/pGhz2*1.e-6);

printf("CPU:%lf[GHz] CUDA Kenel Calc                Time;%12.8g[msec]\n",pGhz6,(double)(t14-t13)/pGhz2*1.e-6);

printf("CPU:%lf[GHz] MemCpyDevicetoHost1 N   %6d Time;%12.8g[msec]\n",pGhz3,N,  (double)(t8-t7)/pGhz3*1.e-6);
printf("CPU:%lf[Ghz] MemCpyDevicetoHost2 N*N %6d Time;%12.8g[msec]\n",pGhz4,N*N,(double)(t10-t9)/pGhz4*1.e-6);
printf("CPU:%lf[Ghz] MemCpyDevicetoHost3 2*N %6d Time;%12.8g[msec]\n\n",pGhz5,2*N,(double)(t12-t11)/pGhz5*1.e-6);



//for(j=0;j<N;j++){
//	printf("A[ %d ] = %f \n",j,A[j]);
//}

cudaFree(A_d);
cudaFree(B_d);
cudaFree(C_d);

}
return 0;
}