I’ve been working with the cudaHostRegister and cudaHostUnregister functions and I’ve noticed that the latter takes very long. Even when comparing cudaHostUnregister to a cudaMemcpy on the same data it takes a very long time, even without using page-locked memory for the memcpy.
I’ve made the following short program:
#include <stdio.h>
#include <time.h>
#include <assert.h>
#include <stdlib.h>
static struct timespec tp;
static clockid_t clk = CLOCK_REALTIME;
static void tu_timer_start(void)
{
int res = clock_gettime(clk, &tp);
assert(!res);
}
static long long tu_timer_stop(void)
{
struct timespec tp_new;
long long elapsed;
int res = clock_gettime(clk, &tp_new);
assert(!res);
elapsed = 1000000000LL * (tp_new.tv_sec - tp.tv_sec) + tp_new.tv_nsec - tp.tv_nsec;
tp = tp_new;
return elapsed;
}
int main() {
const int length = 999424;
const int pagesize = 4096;
// Allocating page-aligned host data and filling it with zeroes.
int *paged, *locked;
posix_memalign((void**) &paged, pagesize, length * sizeof(int));
posix_memalign((void**) &locked, pagesize, length * sizeof(int));
memset(paged, 0, length * sizeof(int));
memset(locked, 0, length * sizeof(int));
// Allocating device data.
int *devPaged, *devLocked;
tu_timer_start();
printf("%20d\n", cudaMalloc(&devPaged, length * sizeof(int)));
printf("%20d\n", cudaMalloc(&devLocked, length * sizeof(int)));
printf("Initialization: %12lld ns\n", tu_timer_stop());
// Measure copy time with pageable data.
tu_timer_start();
printf("%20d\n", cudaMemcpy(devPaged, paged, length * sizeof(int), cudaMemcpyHostToDevice));
printf("Copy pageable: %12lld ns\n", tu_timer_stop());
// Measure time to page-lock host data.
tu_timer_start();
printf("%20d\n", cudaHostRegister(locked, length * sizeof(int), 0));
printf("Host register: %12lld ns\n", tu_timer_stop());
// Measure copy time with page-locked data.
tu_timer_start();
printf("%20d\n", cudaMemcpy(devLocked, locked, length * sizeof(int), cudaMemcpyHostToDevice));
printf("Copy page-locked: %12lld ns\n", tu_timer_stop());
// Measure time to release page-lock on host data.
tu_timer_start();
cudaHostUnregister(locked);
printf("Host unregister: %12lld ns\n", tu_timer_stop());
return 0;
}
This gives the following output on a quad-core Intel i5 760 (2.80 GHz per core) with a Tesla C2050 (with cuda return code not printed here):
Initialization: 81027005 ns
Copy pageable: 1263236 ns
Host register: 436132 ns
Copy page-locked: 706051 ns
Host unregister: 2139736 ns
This shows my problem. In my actual program it is even worse, I often measure cudaHostUnregister taking around 3460000 ns. This would suggest that it doesn’t play well with concurrent asynchronous memcopies or kernel runs,on top of being slow.
Why does this function take so long, and is there a way to speed it up? And does it really not work in parallel with memcopies and kernels, and if it does, why not?
Or is there simply a much better way of parallelizing memcopies and kernel runs?