Async DMA on host-registered memory that was later remapped: is this the expected behavior?

Hello folks,

I found some mysterious behavior with the CUDA Driver API (CUDA version 7.5).

The test program I attached measures the DMA transfer time in the four scenarios below.

1. DMA from host memory where a shared memory segment is mapped. The CUDA driver will use
   its internal buffer, so lower performance is expected.

2. DMA from the same host memory after cuMemHostRegister() has been applied. This DMA request
   should be processed as zero-copy DMA, so better performance is expected.

3. DMA after I unmap the above host memory and then map a different shared memory segment at
   the same address, without calling cuMemHostUnregister().

4. DMA after cuMemHostUnregister() has been called.

The results of tests 1, 2 and 4 are as expected.
However, I do not know how to interpret the result of test 3.
Its timing implies that test 3 ran as zero-copy DMA.
However, the physical shared memory segment mapped at that point is different from the
one that was mapped when I called cuMemHostRegister().

As I understand it, cuMemHostRegister() ensures that the supplied pages stay resident in
memory, and tracks the address range for which zero-copy DMA is allowed.
However, it is unclear why the registration is still valid for a virtual address range
that has been remapped to different pages.

If the CUDA driver is smart enough to handle this case, that is fine.
However, I am worried about a potential bug in which the driver kicks off zero-copy DMA on
a region whose pages the system does not guarantee to be resident in memory.

[kaigai@saba misc]$ gcc hostmem.c -I /usr/local/cuda/include/ -lcuda -lrt
[kaigai@saba misc]$ ./a.out
begin DMA test-1 (synchronous HtoD)
buf: aaaa aaaa
time: 11.80
begin DMA test-2 (zero-copy HtoD)
buf: aaaa aaaa
time: 3.81
begin DMA test-3 (remap different region, but unregister)
buf: bbbb bbbb
time: 3.82
begin DMA test-4 (unregister again)
buf: bbbb bbbb
time: 11.16

Test code is below.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda.h>

/*
 * CUQUIT - terminate the program if a CUDA Driver API call failed.
 *
 * rc  : CUresult returned by the call being checked (evaluated once).
 * fmt : printf-style description of the failed operation; any additional
 *       arguments are forwarded to fprintf() for the conversions in fmt.
 *
 * On failure, prints the source line number, the formatted message and the
 * CUDA error name / description to stderr, then exits with status 2.
 *
 * `##__VA_ARGS__` (GNU extension) forwards the optional arguments and drops
 * the preceding comma when there are none.  The stringify operator must not
 * be used here: it would pass a string literal where fmt expects the real
 * argument and shift every following conversion by one.
 */
#define CUQUIT(rc,fmt,...)                                      \
    do {                                                        \
        CUresult __rc = (rc);                                   \
        if (__rc != CUDA_SUCCESS)                               \
        {                                                       \
            const char *__err_name = NULL;                      \
            const char *__err_str = NULL;                       \
            cuGetErrorName(__rc, &__err_name);                  \
            cuGetErrorString(__rc, &__err_str);                 \
            fprintf(stderr, "%d" fmt ": %s (%s)\n",             \
                    __LINE__, ##__VA_ARGS__,                    \
                    __err_name ? __err_name : "unknown",        \
                    __err_str ? __err_str : "unknown");         \
            exit(2);                                            \
        }                                                       \
    } while(0)

/*
 * EQUIT - report a failed system/library call and terminate the program.
 *
 * fmt : printf-style description of the failed operation; any additional
 *       arguments are forwarded to fprintf() for the conversions in fmt.
 *
 * Prints the source line number, the formatted message and the text for
 * the current errno (via the glibc "%m" conversion) to stderr, then exits
 * with status 1.  It must therefore be invoked before anything else can
 * overwrite errno.
 *
 * `##__VA_ARGS__` (GNU extension) forwards the optional arguments and drops
 * the preceding comma when there are none; stringifying them instead would
 * hand fprintf() a string literal in place of the real arguments.
 */
#define EQUIT(fmt,...)                              \
    do {                                            \
        fprintf(stderr, "%d" fmt ": %m\n",          \
                __LINE__, ##__VA_ARGS__);           \
        exit(1);                                    \
    } while(0)

/*
 * run_dma_test - time a batch of asynchronous host <-> device copies.
 *
 * haddr  : host buffer; source of the host-to-device copies and destination
 *          of the final device-to-host copy
 * daddr  : device buffer of at least `length` bytes
 * length : number of bytes moved by each copy
 * stream : CUDA stream on which all copies are enqueued
 *
 * Enqueues `nloops` host-to-device copies followed by one device-to-host
 * copy, waits for the stream to drain, then prints the first eight bytes of
 * the host buffer and the elapsed wall-clock time in seconds.  The measured
 * interval covers everything from the first enqueue to the completion of
 * the stream.  Any CUDA failure terminates the process through CUQUIT.
 */
static void
run_dma_test(char *haddr, CUdeviceptr daddr, size_t length, CUstream stream)
{
    double      elapsed;
    int         i, nloops = 10;
    CUresult    rc;
    struct timeval tv1, tv2;

    gettimeofday(&tv1, NULL);
    for (i=0; i < nloops; i++)
    {
        /* check every enqueue, not only the last one */
        rc = cuMemcpyHtoDAsync(daddr, haddr, length, stream);
        CUQUIT(rc, "failed on cuMemcpyHtoDAsync i=%d", i);
    }
    rc = cuMemcpyDtoHAsync(haddr, daddr, length, stream);
    CUQUIT(rc, "failed on cuMemcpyDtoHAsync");
    /* the copies are asynchronous; wait here so the timing is meaningful */
    rc = cuStreamSynchronize(stream);
    CUQUIT(rc, "failed on cuStreamSynchronize");
    gettimeofday(&tv2, NULL);

    /* seconds plus the microsecond remainder, as a floating-point value */
    elapsed = (double)(tv2.tv_sec - tv1.tv_sec) +
              (double)(tv2.tv_usec - tv1.tv_usec) / 1000000.0;
    printf("buf: %c%c%c%c %c%c%c%c\n",
           haddr[0], haddr[1], haddr[2], haddr[3],
           haddr[4], haddr[5], haddr[6], haddr[7]);
    printf("time: %.2f\n", elapsed);
}

/*
 * Test driver: measures DMA transfer time for one host virtual address
 * range in four situations.
 *
 *   1. plain mapping of shared memory segment "/testdata_1"
 *   2. the same mapping after cuMemHostRegister()
 *   3. the range unmapped and re-mapped (MAP_FIXED) onto a different
 *      segment "/testdata_2", while the old registration is still in place
 *   4. after cuMemHostUnregister()
 *
 * Returns 0 on success; any failure exits through CUQUIT / EQUIT.
 */
int main(int argc, const char *argv[])
{
    size_t          length = 1UL << 31;     /* 2 GiB per buffer */
    int             fdesc;
    CUdevice        dev;
    CUcontext       cxt;
    CUresult        rc;
    CUstream        stream;
    CUdeviceptr     daddr;
    char           *haddr;

    /* the Driver API must be initialized before any other call */
    rc = cuInit(0);
    CUQUIT(rc, "failed on cuInit");
    rc = cuDeviceGet(&dev, 0);
    CUQUIT(rc, "failed on cuDeviceGet");
    rc = cuCtxCreate(&cxt, 0, dev);
    CUQUIT(rc, "failed on cuCtxCreate");
    rc = cuStreamCreate(&stream, 0);
    CUQUIT(rc, "failed on cuStreamCreate");
    rc = cuMemAlloc(&daddr, length);
    CUQUIT(rc, "failed on cuMemAlloc");

    fdesc = shm_open("/testdata_1", O_RDWR | O_CREAT | O_TRUNC, 0600);
    if (fdesc < 0)
        EQUIT("failed on shm_open");
    if (ftruncate(fdesc, length) != 0)
        EQUIT("failed on ftruncate");
    haddr = mmap(NULL, length,
                 PROT_READ | PROT_WRITE,
                 MAP_SHARED,
                 fdesc, 0);
    if (haddr == MAP_FAILED)
        EQUIT("failed on mmap");
    /* the mapping stays valid after the descriptor is closed */
    close(fdesc);
    memset(haddr, 'a', length);

    /* -------- 1st trial -------- */
    puts("begin DMA test-1 (synchronous HtoD)");
    run_dma_test(haddr, daddr, length, stream);

    /* -------- 2nd trial -------- */
    rc = cuMemHostRegister(haddr, length, 0);
    CUQUIT(rc, "failed on cuMemHostRegister");
    puts("begin DMA test-2 (zero-copy HtoD)");
    run_dma_test(haddr, daddr, length, stream);

    /* -------- 3rd trial -------- */
    /*
     * Unmap, then map a different segment at the same address.  The
     * registration made above is intentionally NOT released here; that is
     * the situation under test.
     */
    if (munmap(haddr, length) != 0)
        EQUIT("failed on munmap");
    fdesc = shm_open("/testdata_2", O_RDWR | O_CREAT | O_TRUNC, 0600);
    if (fdesc < 0)
        EQUIT("failed on shm_open");
    if (ftruncate(fdesc, length) != 0)
        EQUIT("failed on ftruncate");
    if (mmap(haddr, length,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_FIXED,
             fdesc, 0) != haddr)
        EQUIT("failed on mmap");
    close(fdesc);
    memset(haddr, 'b', length);

    /*
     * NOTE(review): the buffer already holds 'b' from the memset above, so
     * the "buf:" line printed by this trial reads 'b' whether or not the
     * DMA actually touched the newly mapped pages -- it cannot tell the two
     * cases apart.
     */
    puts("begin DMA test-3 (remap different region, but unregister)");
    run_dma_test(haddr, daddr, length, stream);

    /* -------- 4th trial -------- */
    rc = cuMemHostUnregister(haddr);
    CUQUIT(rc, "failed on cuMemHostUnregister");
    puts("begin DMA test-4 (unregister again)");
    run_dma_test(haddr, daddr, length, stream);

    /*
     * Best-effort cleanup: POSIX shared memory objects outlive the process
     * unless they are unlinked, and each one holds `length` bytes.
     */
    (void) shm_unlink("/testdata_1");
    (void) shm_unlink("/testdata_2");

    return 0;
}