Is my way of measuring CUDA processing time reasonable?

I measured the CUDA processing times using C code and the Intel MKL library.
My code is below.

I changed the data type (float, double) and the dimension sizes, and tried various GPU boards.
I got the results shown at the URL below.

The CUDA kernel processing time (__global__ void matrix_vector_multi_gpu_1_1) increased in proportion to the dimension sizes (A_d, B_d, C_d).

But the memory copy times (host to device, device to host) did not necessarily increase in proportion to the dimension sizes.
Is this result reasonable?
If it is, please tell me why the memory copy times (host to device, device to host) do not necessarily increase in proportion to the dimension sizes.
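
My guess (which I have not confirmed) is that each cudaMemcpy carries a roughly fixed setup overhead, so small transfers are dominated by that latency rather than by transfer bandwidth, and only large transfers grow with size. To check how the copy time scales with size independently of the MKL clock counters, I could time single host-to-device copies at several sizes with CUDA events, as in the sketch below (the sizes, the warm-up copy, and the event-based timing are my own choices, not part of the code at the bottom):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

int main(void)
{
	size_t bytes;

	/* Transfer sizes from 1 KiB up to 64 MiB (arbitrary choices). */
	for(bytes = (size_t)1 << 10; bytes <= ((size_t)64 << 20); bytes <<= 2){
		char *h = (char*)malloc(bytes);
		char *d;
		memset(h, 0, bytes);
		cudaMalloc((void**)&d, bytes);

		cudaEvent_t start, stop;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);

		/* Warm-up copy so one-time driver costs are not timed. */
		cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);

		cudaEventRecord(start);
		cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
		cudaEventRecord(stop);
		cudaEventSynchronize(stop);

		float msec = 0.0f;
		cudaEventElapsedTime(&msec, start, stop);
		printf("%10lu bytes: %12.6f msec  %8.3f GB/s\n",
		       (unsigned long)bytes, msec, (double)bytes/(msec*1.0e6));

		cudaEventDestroy(start);
		cudaEventDestroy(stop);
		cudaFree(d);
		free(h);
	}
	return 0;
}

If the measured time flattens out at small sizes instead of shrinking in proportion, that would support the fixed-overhead explanation.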

Regards,

=============== used code =======================

#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <time.h>
#include "mkl_service.h"

#define N 256

/* Single-thread kernel: computes A_d = B_d * C_d (an N x N matrix times an N-vector). */
__global__ void matrix_vector_multi_gpu_1_1(double *A_d, double *B_d, double *C_d)
{
int i,j;

for(j=0;j<N;j++){
	A_d[j] = 0.0F;
	for(i=0;i<N;i++){
		A_d[j] = A_d[j] + B_d[j*N + i]*C_d[i];
	}
}

}

int main()
{

int i,j,k;
double A[N], B[N*N], C[2*N];
double *A_d, *B_d, *C_d;

unsigned long long t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14;

double pGhz0 = 1.0, pGhz1 = 1.0, pGhz2 = 1.0, pGhz3 = 1.0, pGhz4 = 1.0, pGhz5 = 1.0, pGhz6 = 1.0;

/* Launch configuration: one block with one thread, so the kernel runs serially on the GPU. */
dim3 blocks(1,1,1);
dim3 threads(1,1,1);

for(j=0;j<N;j++){
	A[j] = 0.0;	/* initialize A; it is copied host-to-device below */
	for(i=0;i<N;i++){
		B[j*N + i] = ((double)j)/256.0;
	}
}

for(j=0;j<2*N;j++){
	C[j] = 1.0F;
}

for(k=0;k<10;k++){

cudaMalloc((void**)&A_d, N*sizeof(double));
cudaMalloc((void**)&B_d, N*N*sizeof(double));
cudaMalloc((void**)&C_d, 2*N*sizeof(double));

/* Timing scheme: synchronize, sample the CPU clock counter, run the operation,
   synchronize again, and sample again; elapsed clocks divided by the CPU
   frequency (GHz) give the elapsed time in nanoseconds. */
cudaDeviceSynchronize(); pGhz0 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t1);

cudaMemcpy(A_d, A, N*sizeof(double), cudaMemcpyHostToDevice);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t2);


cudaDeviceSynchronize(); pGhz1 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t3);

cudaMemcpy(B_d, B, N*N*sizeof(double), cudaMemcpyHostToDevice);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t4);


cudaDeviceSynchronize(); pGhz2 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t5);

cudaMemcpy(C_d, C, 2*N*sizeof(double), cudaMemcpyHostToDevice);

cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t6);


cudaDeviceSynchronize(); pGhz6 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t13);
matrix_vector_multi_gpu_1_1<<< blocks, threads >>>(A_d,B_d,C_d);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t14);


cudaDeviceSynchronize(); pGhz3 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t7);
cudaMemcpy(A, A_d, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t8);


cudaDeviceSynchronize(); pGhz4 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t9);
cudaMemcpy(B, B_d, N*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t10);


cudaDeviceSynchronize(); pGhz5 = mkl_get_cpu_frequency(); mkl_get_cpu_clocks(&t11);
cudaMemcpy(C, C_d, 2*N*sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); mkl_get_cpu_clocks(&t12);



printf("CPU:%lf[GHz] MemCpyHosttoDevice1 N:  %6d Time;%12.8g[msec]\n",pGhz0,N,  (double)(t2-t1)/pGhz0*1.e-6);
printf("CPU:%lf[GHz] MemCpyHosttoDevice2 N*N %6d Time;%12.8g[msec]\n",pGhz1,N*N,(double)(t4-t3)/pGhz1*1.e-6);
printf("CPU:%lf[GHz] MemCpyHosttoDevice3 2*N %6d Time;%12.8g[msec]\n",pGhz2,2*N,(double)(t6-t5)/pGhz2*1.e-6);

printf("CPU:%lf[GHz] CUDA Kenel Calc                Time;%12.8g[msec]\n",pGhz6,(double)(t14-t13)/pGhz2*1.e-6);

printf("CPU:%lf[GHz] MemCpyDevicetoHost1 N   %6d Time;%12.8g[msec]\n",pGhz3,N,  (double)(t8-t7)/pGhz3*1.e-6);
printf("CPU:%lf[Ghz] MemCpyDevicetoHost2 N*N %6d Time;%12.8g[msec]\n",pGhz4,N*N,(double)(t10-t9)/pGhz4*1.e-6);
printf("CPU:%lf[Ghz] MemCpyDevicetoHost3 2*N %6d Time;%12.8g[msec]\n\n",pGhz5,2*N,(double)(t12-t11)/pGhz5*1.e-6);



//for(j=0;j<N;j++){
//	printf("A[ %d ] = %f \n",j,A[j]);
//}

cudaFree(A_d);
cudaFree(B_d);
cudaFree(C_d);

}
return 0;
}