Nsight memory copies discrepancy

I am timing a 2D NxN matrix addition using CUDA. When profiling the code for different values of N, Nsight shows different memory transfer rates. I understand that the speedup offered by CUDA will decrease with smaller values of N, but why are the memory transfer rates themselves affected?

Here is what I get:

for N=200 (transfer rates in MB/s)

H2D 4571.6

D2H 4680

For N=100

H2D 1817

D2H 3022

For N=20

H2D 299

D2H 307

Thanks in anticipation.

Here is the code, in case it helps:

#include <conio.h>   // getch() — Windows-only console helper
#include <cstdio>
#include <cstdlib>
#include <cuda.h>
#include <iostream>

// Element-wise addition of two N x N matrices stored row-major in global
// memory: C = A + B.
//
// Expected launch: a 2D grid of 2D blocks covering at least N x N threads;
// threads outside the matrix are masked out by the bounds check.
__global__ void MatrixAdd_d(const float *A, const float *B, float *C, int N)
{
   // threadIdx.x varies fastest across a warp, so it must map to the
   // row-major COLUMN index: adjacent lanes then touch adjacent addresses
   // and the global-memory accesses coalesce. The original i*N + j mapping
   // gave each warp a stride-N access pattern (uncoalesced).
   int col = blockIdx.x*blockDim.x + threadIdx.x;
   int row = blockIdx.y*blockDim.y + threadIdx.y;

   if(row<N && col<N)
   {
      int index = row*N + col;
      C[index] = A[index] + B[index];
   }
}

// Abort with a readable message if a CUDA runtime call failed. Kernel
// launches themselves return no status; check cudaGetLastError() after them.
static void checkCuda(cudaError_t err, const char *what)
{
   if (err != cudaSuccess) {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
   }
}

int main()
{
   float *a_h, *b_h, *c_h; // pointers to host memory; CPU
   float *a_d, *b_d, *c_d; // pointers to device memory; GPU
   int blocksize=32, n=20, i, j, index;  // n = matrix dimensionality (n x n)

   cudaEvent_t start, stop;
   checkCuda(cudaEventCreate(&start), "create start event"); // events used to time the GPU section
   checkCuda(cudaEventCreate(&stop),  "create stop event");
   float elapsedTime;

   // Allocate arrays on host. NOTE: this is pageable memory; small pageable
   // copies are dominated by per-transfer overhead (driver staging), which is
   // why the profiler reports much lower H2D/D2H bandwidth as n shrinks.
   // Use cudaMallocHost (pinned memory) for higher, steadier transfer rates.
   a_h = (float *)malloc(sizeof(float)*n*n);
   b_h = (float *)malloc(sizeof(float)*n*n);
   c_h = (float *)malloc(sizeof(float)*n*n);
   if (a_h == NULL || b_h == NULL || c_h == NULL) {
      fprintf(stderr, "host allocation failed\n");
      return EXIT_FAILURE;
   }

   checkCuda(cudaEventRecord(start, 0), "record start"); // start timer

   // Allocate arrays on device. NOTE(review): cudaMalloc sits inside the
   // timed region, so allocation cost is included in elapsedTime — move the
   // start event below these calls to time only transfers + kernel.
   checkCuda(cudaMalloc((void **)&a_d,n*n*sizeof(float)), "cudaMalloc a_d");
   checkCuda(cudaMalloc((void **)&b_d,n*n*sizeof(float)), "cudaMalloc b_d");
   checkCuda(cudaMalloc((void **)&c_d,n*n*sizeof(float)), "cudaMalloc c_d");

   dim3 dimBlock( blocksize, blocksize );
   // Ceiling division so the grid covers the whole matrix even when n is not
   // a multiple of blocksize; the kernel bounds-checks the overhang.
   dim3 dimGrid( ceil(float(n)/float(dimBlock.x)), ceil(float(n)/float(dimBlock.y)) );

   // initialize the arrays on host; not being timed
   for(j=0;j<n;j++) {
    for(i=0;i<n;i++) {
     index = i*n+j;
     a_h[index]=rand()%35;
     b_h[index]=rand()%35;
    }
   }

   // copy the inputs to the device
   checkCuda(cudaMemcpy(a_d,a_h,n*n*sizeof(float),cudaMemcpyHostToDevice), "H2D a");
   checkCuda(cudaMemcpy(b_d,b_h,n*n*sizeof(float),cudaMemcpyHostToDevice), "H2D b");

   // execute kernel; launch-configuration errors only surface via
   // cudaGetLastError(), so check it explicitly
   MatrixAdd_d<<<dimGrid, dimBlock>>>(a_d,b_d,c_d,n);
   checkCuda(cudaGetLastError(), "kernel launch");

   // copy the result back to host memory (a blocking cudaMemcpy, so the
   // kernel has finished by the time it returns)
   checkCuda(cudaMemcpy(c_h,c_d,n*n*sizeof(float),cudaMemcpyDeviceToHost), "D2H c");

   // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
   // current API. Ensures all device work is done before stopping the timer.
   checkCuda(cudaDeviceSynchronize(), "device synchronize");
   checkCuda(cudaEventRecord(stop, 0), "record stop");
   checkCuda(cudaEventSynchronize(stop), "sync stop event");
   checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "elapsed time");

   printf("passed total time taken %f\n", elapsedTime);

   // print out the answer
   /* for(j=0;j<n;j++) {
       for(i=0;i<n;i++) {
        index = i*n+j;
        printf("A + B = C: %d %d %f + %f = %f\n",i,j,a_h[index],b_h[index],c_h[index]);
       }
      } */

   getch(); // Windows-only: wait for a keypress before exiting

   // cleanup...
   free(a_h);
   free(b_h);
   free(c_h);
   checkCuda(cudaFree(a_d), "cudaFree a_d");
   checkCuda(cudaFree(b_d), "cudaFree b_d");
   checkCuda(cudaFree(c_d), "cudaFree c_d");
   checkCuda(cudaEventDestroy(start), "destroy start event");
   checkCuda(cudaEventDestroy(stop),  "destroy stop event");

   return 0;
}

Can anyone help, please?