I am timing a 2D NxN matrix addition using CUDA. When I profile the code for different values of N, Nsight shows different memory transfer rates. I understand that the speedup offered by CUDA decreases for smaller values of N, but why are the memory transfer rates affected?
Here is what I get:
for N=200 (transfer rates in MB/s)
H2D 4571.6
D2H 4680
For N=100
H2D 1817
D2H 3022
For N=20
H2D 299
D2H 307
Thanks in anticipation.
Here is the code, in case it helps:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda.h>
#include <conio.h>
// Element-wise sum of two N x N matrices stored as flat arrays of N*N floats:
// C[k] = A[k] + B[k] for every element k.
//
// Launch layout: a 2D grid of 2D blocks that supplies at least N threads along
// both x and y. Threads that fall outside the N x N range do nothing.
// No shared memory is used.
//
// A, B : device pointers to the input matrices (N*N floats each)
// C    : device pointer to the output matrix (N*N floats)
// N    : matrix dimension
__global__ void MatrixAdd_d(float *A, float *B, float *C, int N)
{
    // threadIdx.x is the fastest-varying index inside a warp, so it is mapped
    // to the column: neighbouring threads then touch neighbouring floats and
    // the loads/stores coalesce. (Mapping x to the row would make neighbouring
    // threads access addresses N floats apart.)
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid rarely divides N evenly; mask the surplus threads.
    if (row < N && col < N) {
        int index = row * N + col;
        C[index] = A[index] + B[index];
    }
}
// Aborts the program with a diagnostic when a CUDA runtime call fails.
// status : return code of the call being checked
// what   : short description of the call, used in the message
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two n x n matrices on the GPU and prints the elapsed time in
// milliseconds as reported by CUDA events.
//
// The timed region covers device allocation, both host-to-device copies, the
// kernel, and the device-to-host copy. Host-side initialisation is done before
// the start event is recorded, so it is not part of the measurement.
int main()
{
    float *a_h, *b_h, *c_h; // pointers to host memory; CPU
    float *a_d, *b_d, *c_d; // pointers to device memory; GPU

    // 32 x 32 = 1024 threads per block. The launch is checked below, so a
    // device that rejects this configuration produces an error, not silence.
    const int blocksize = 32;
    const int n = 20; // matrix dimension
    // size_t arithmetic: n*n is never evaluated in int.
    const size_t bytes = sizeof(float) * n * n;

    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)"); // events used for timing
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // allocate arrays on host
    a_h = (float *)malloc(bytes);
    b_h = (float *)malloc(bytes);
    c_h = (float *)malloc(bytes);
    if (a_h == NULL || b_h == NULL || c_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a_h);
        free(b_h);
        free(c_h);
        return 1;
    }

    // initialize the arrays on host; done before the start event so that it
    // is not being timed
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < n; i++) {
            int index = i * n + j;
            a_h[index] = (float)(rand() % 35);
            b_h[index] = (float)(rand() % 35);
        }
    }

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)"); // start timer

    // allocate arrays on device
    checkCuda(cudaMalloc((void **)&a_d, bytes), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void **)&b_d, bytes), "cudaMalloc(b_d)");
    checkCuda(cudaMalloc((void **)&c_d, bytes), "cudaMalloc(c_d)");

    // Integer ceil-division: enough blocks to cover n in each dimension.
    const unsigned int blocksPerDim = (n + blocksize - 1) / blocksize;
    dim3 dimBlock(blocksize, blocksize);
    dim3 dimGrid(blocksPerDim, blocksPerDim);

    // copy the inputs to the device
    checkCuda(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a_d)");
    checkCuda(cudaMemcpy(b_d, b_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b_d)");

    // execute kernel; launch-configuration errors only show up via
    // cudaGetLastError()
    MatrixAdd_d<<<dimGrid, dimBlock>>>(a_d, b_d, c_d, n);
    checkCuda(cudaGetLastError(), "MatrixAdd_d launch");

    // copy the contents back to host memory
    checkCuda(cudaMemcpy(c_h, c_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c_h)");
    // wait for all device work to finish (replaces the deprecated
    // cudaThreadSynchronize)
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)"); // stop timer
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("passed total time taken %f",elapsedTime);

    // print out the answer (disabled; enable only for small n)
    /* for (int j = 0; j < n; j++) {
        for (int i = 0; i < n; i++) {
            int index = i * n + j;
            printf("A + B = C: %d %d %f + %f = %f\n",i,j,a_h[index],b_h[index],c_h[index]);
        }
    } */

    getch(); // keep the console window open until a key is pressed

    // cleanup...
    free(a_h);
    free(b_h);
    free(c_h);
    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    checkCuda(cudaFree(c_d), "cudaFree(c_d)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    return(0);
}