Questions about virtual RAM page locks and the CUDA device...

I know that VirtualAlloc () does not actually load the memory pages until they are accessed, but does a registered page(s) with cudaHostRegister () not lock the page until it is accessed by the device? Furthermore; If so, is this page lock released when the kernel exits? Here is an example program of using a gigabyte of host memory by the device…

enum
{
    _1gbLog_        = 30,
    _SysPageLog_    = 12,
    _BlockLog_      = 5,
};

__global__ void page_kernel (__int8 * _pDeviceBytes, __int64 _PageCount)
{
    __int64 _PageIdx = threadIdx.x + blockDim.x * blockIdx.x;
    if (_PageIdx < _PageCount)
    {
        __int8 * _pPage = _pDeviceBytes + (_PageIdx << _SysPageLog_);
        (*_pPage) = 0;
    }
}

int main ()
{
    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice (0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf (stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    __int8 * _pHostBytes = (__int8 *)::VirtualAlloc (0, 1 << _1gbLog_, MEM_RESERVE|MEM_COMMIT,    PAGE_READWRITE);

    assert (_pHostBytes);

    assert (cudaSuccess == ::cudaHostRegister (_pHostBytes, 1 << _1gbLog_, cudaHostAllocPortable|cudaHostAllocMapped));

    __int8 * _pDeviceBytes = 0;

    assert (cudaSuccess == ::cudaHostGetDevicePointer ((void **)&_pDeviceBytes, (void *)_pHostBytes, 0));

    dim3 _GridDim (1 << (_1gbLog_ - _SysPageLog_ - _BlockLog_));
    dim3 _BlockDim (1 << _BlockLog_);

    page_kernel<<<_GridDim, _BlockDim>>>(_pDeviceBytes, 1 << (_1gbLog_ - _SysPageLog_));

    assert (cudaSuccess == ::cudaThreadSynchronize ());

    assert (cudaSuccess == ::cudaHostUnregister (_pHostBytes));

    assert (::VirtualFree (_pHostBytes, 0, MEM_RELEASE));

    cudaStatus = cudaDeviceReset ();
    if (cudaStatus != cudaSuccess)
    {
        fprintf (stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

It is obvious that the memory is not loaded until the kernel runs because the pages are assigned 0 at the first byte during the kernels runtime, so the pages are not locked until they are accessed by the device which also seems to be obvious. But is this lock released when the device kernel exits as a management step of the cuda devices page locker, i.e., cudaHostRegister ()? I need to know because I not only want them to be page locked as the device executes so it can access them but I also need to know so I know if I need to implement my own page lock management system.

Thanks…