I tried comparing conventional cudaMalloc with cudaHostAlloc for DMA, result is weird

I pasted my code below, it is a modded version of code in the “cuda by example” book’s p189 cudaHostAlloc (page locked memory example).
The author compared the result with following result (on p191):
GTX285:

  • 2.77GB/s for cudaMalloc and transfer
  • 5.11GB/s for cudaHostAlloc and DMA transfer apparently.

I tried on my system with following code and cudaMalloc test looks reasonable but page-locked transfer seems out of ordinary.
My system info Intel I5-4800, 32GB, GTX1080, could that it possible my system is much better than GTI285 in DMA transfer such that it results in near zero 0.0ms resulting in almost infinite bandwidth or some coding error???

  • cudaMalloc test:
    Time using cudaMalloc(up): 8944.5 ms.
    MB/s during copy up: 4472.0.
    Time using cudaMalloc(down): 9172.5 ms.
    MB/s during copy up: 4360.8.

  • cudaHostalloc test:
    Time using cudaMalloc(up): 0.0 ms.
    MB/s during copy up: 19531249664.0.
    Time using cudaMalloc(down): 0.0 ms.
    MB/s during copy up: 39062499328.0.
    Press any key to continue . . .

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>
#define SIZE (10010241024)

float cuda_malloc_test(int size, bool up, bool hostAlloc = false) {
cudaEvent_t start, stop;
int *a, *dev_a;
float elapsedTime;

cudaEventCreate(&start);
cudaEventCreate(&stop);

if (hostAlloc) {
	a = (int*)cudaHostAlloc((void**)&a, size * sizeof(*a), cudaHostAllocDefault);
} else {
	a = (int*)malloc(size * sizeof(*a));
}
cudaMalloc((void**)&dev_a, size * sizeof(*dev_a));

cudaEventRecord(start, 0);
for (int i = 0; i < 100; i++) {
	if (up)
		cudaMemcpy(dev_a, a, size * sizeof(*dev_a), cudaMemcpyHostToDevice);
	else
		cudaMemcpy(a, dev_a, size * sizeof(*dev_a), cudaMemcpyDeviceToHost);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);

if (hostAlloc) {
	cudaFreeHost(dev_a);
} else {
	cudaFree(dev_a);
}
cudaFree(a);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return elapsedTime;		

}

int main()
{
float elapsedTime;

printf("cudaMalloc test:\n");
float MB = (float)100 * SIZE * sizeof(int) / 1024 / 1024;
elapsedTime = cuda_malloc_test(SIZE, true);
printf("Time using cudaMalloc(up): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));
elapsedTime = cuda_malloc_test(SIZE, false);
printf("Time using cudaMalloc(down): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));

printf("cudaHostalloc test:\n");
elapsedTime = cuda_malloc_test(SIZE, true, 1);
printf("Time using cudaMalloc(up): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));
elapsedTime = cuda_malloc_test(SIZE, false, 1);
printf("Time using cudaMalloc(down): %3.1f ms.\n", elapsedTime);
printf("MB/s during copy up: %3.1f.\n", MB / (elapsedTime / 1000));

}