We allocate two kinds of buffer to test the bandwidth:
1) malloc: visible only to the CPU, not to the GPU
2) cudaMallocManaged: visible to both the CPU and the GPU
We use two different APIs to test the bandwidth:
a) memcpy
b) cudaMemcpy
Data size: 4 MiB, loop count: 100, copy direction: malloc buffer → cudaMallocManaged buffer.
We found an interesting result:
1. memcpy takes only 39.18 ms
2. cudaMemcpy takes 70.47 ms
#include <cuda_runtime.h>

#include <stdio.h>
#include <string.h>
#include <sys/time.h>
// Abort (by returning from main) with file:line on any failed CUDA call.
// The original checked no return codes, so a failed cudaMallocManaged
// would have made the later memcpy(u, ...) segfault.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Elapsed wall-clock time between two timevals, in milliseconds.
static double elapsed_ms(const struct timeval &start,
                         const struct timeval &stop) {
    return (stop.tv_sec - start.tv_sec) * 1e3 +
           (stop.tv_usec - start.tv_usec) / 1e3;
}

// Compares cudaMemcpy vs. plain memcpy when copying a 4 MiB pageable
// host buffer into a cudaMallocManaged buffer, 100 iterations each,
// then prints both elapsed times (t1 = cudaMemcpy, t2 = memcpy) in ms.
//
// NOTE(review): both passes write the same managed buffer, so the first
// (cudaMemcpy) pass can leave the pages device-resident and the second
// (memcpy) pass then pays page-fault/migration cost on first touch.
// The two timings are order-dependent, not a clean apples-to-apples
// comparison — confirm with separate buffers or a prefetch if this
// benchmark is meant to be rigorous.
int main() {
    const int n = 1024 * 1024;          // element count: 4 MiB of int
    const size_t bytes = n * sizeof(int);
    const int niter = 100;

    int *m = new int[n];                // pageable host buffer (CPU-only)
    int *u = NULL;                      // managed buffer (CPU + GPU)
    CUDA_CHECK(cudaMallocManaged(&u, bytes));

    for (int i = 0; i < n; ++i)
        m[i] = i;

    struct timeval start, stop;

    // Pass 1: pageable host -> managed via cudaMemcpy.
    gettimeofday(&start, NULL);
    for (int i = 0; i < niter; ++i)
        CUDA_CHECK(cudaMemcpy(u, m, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaDeviceSynchronize()); // drain all device work before stopping the clock
    gettimeofday(&stop, NULL);
    double t1 = elapsed_ms(start, stop);

    // Pass 2: pageable host -> managed via plain CPU memcpy (the CPU
    // faults the managed pages to host memory and copies them there).
    gettimeofday(&start, NULL);
    for (int i = 0; i < niter; ++i)
        memcpy(u, m, bytes);
    CUDA_CHECK(cudaDeviceSynchronize()); // keep timing symmetric with pass 1
    gettimeofday(&stop, NULL);
    double t2 = elapsed_ms(start, stop);

    printf("t1: %f, t2: %f\n", t1, t2);

    delete[] m;
    CUDA_CHECK(cudaFree(u));
    return 0;
}