cuMemHostUnregister does not release shared GPU memory on 441.xx Studio Driver with 2 GPUs

When two NVIDIA GPUs are installed, calling cuMemHostUnregister after cuMemHostRegister(CU_MEMHOSTREGISTER_PORTABLE) does not release GPU Shared Memory on one of the GPUs (as displayed by Windows Process Explorer). Repeating the allocation-deallocation cycle multiple times eventually leads to out-of-memory errors, which can only be cleared by restarting the application.

The error is present in 441.66 DCH, 441.28 DCH, 441.12 DCH and in 441.66 Standard (non-DCH).
The problem is not reproducible in 431.86 DCH.

Tested on Windows 10 Pro version 1903 (OS Build 18362.418) with a 2080 Ti and a 1080 Ti installed.

A minimal sample demonstrating the problem is provided below:

#include <stdio.h>
#include <cuda.h>

// Reports a failed CUDA driver-API call and leaves the enclosing function.
// Expects a CUresult variable named `err` to be in scope holding the result of
// the call just made. On failure it prints "<TXT> returned <code>" and executes
// `return 1;` in the function where the macro is expanded.
// Wrapped in do { } while(0) so that `CHECK(...);` behaves as a single
// statement (safe inside unbraced if/else).
#define CHECK(TXT) do { if(err != CUDA_SUCCESS) { printf("%s returned %d\n", TXT, (int)err); return 1; } } while(0)

// Repro driver: creates one CUDA context on each of the first two devices,
// then registers and unregisters the same 500 MiB host buffer 1000 times with
// CU_MEMHOSTREGISTER_PORTABLE, so that per-process GPU memory usage can be
// observed externally while the program waits for ENTER.
//
// Returns 0 on success, 1 if any driver-API call fails (via CHECK), and 2 if
// fewer than two CUDA devices are present.
int main()
{
    CUresult err;
    err = cuInit(0);                                                         CHECK("cuInit");

    int deviceCount;
    err = cuDeviceGetCount(&deviceCount);                                    CHECK("cuDeviceGetCount");
    if(deviceCount < 2)
    {
        printf("Need at least 2 CUDA devices, found: %d\n", deviceCount);
        return 2;
    }
    CUdevice devices[2];
    err = cuDeviceGet(devices + 0, 0);                                       CHECK("cuDeviceGet");
    err = cuDeviceGet(devices + 1, 1);                                       CHECK("cuDeviceGet");

    CUcontext gpu0Ctx, gpu1Ctx;
    err = cuCtxCreate(&gpu0Ctx, 0, devices[0]);                              CHECK("cuCtxCreate-0");
    err = cuCtxCreate(&gpu1Ctx, 0, devices[1]);                              CHECK("cuCtxCreate-1");

    // The host buffer must really be Size bytes long: the whole range
    // [cpuMem, cpuMem + Size) is handed to cuMemHostRegister below.
    const size_t Size = 500u * 1024u * 1024u;
    char* cpuMem = new char[Size];

    for(int i = 0; i < 1000; ++i)
    {
        printf("%d ", i);
        err = cuMemHostRegister(cpuMem, Size, CU_MEMHOSTREGISTER_PORTABLE);  CHECK("cuMemHostRegister");
        err = cuMemHostUnregister(cpuMem);                                   CHECK("cuMemHostUnregister");
    }
    // Pause so memory usage can be inspected before the contexts are torn down.
    puts("\nFinished - press ENTER");
    getchar();

    // Array form matches the new[] above; the pointer is typed (not void*),
    // so the delete expression is well-defined.
    delete[] cpuMem;

    err = cuCtxDestroy(gpu1Ctx);                                             CHECK("cuCtxDestroy-1");
    err = cuCtxDestroy(gpu0Ctx);                                             CHECK("cuCtxDestroy-0");

    return 0;
}

I recommend you file a bug using the instructions linked in a sticky post at the top of this forum.

Thanks Robert. Done already – please refer to Bug 2811027.