`nvmlDeviceGetComputeRunningProcesses_v3()` inside a container reports invalid memory usage numbers

On some newer drivers (535-open, 555-open, 555-legacy, and so on) we are struggling to retrieve the GPU memory usage of individual processes with NVML. When we run the code below inside a Docker container we observe two different behaviors, depending on the NVIDIA driver version:

work@21c3c20e3977:/workspace$ cat test.cu
#include <nvml.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>

int main(void) {
    nvmlReturn_t ret;
    unsigned int proc_count = 0;
    float *matrix = nullptr;
    nvmlDevice_t device;
    nvmlMemory_t device_memory;
    nvmlProcessInfo_t *proc = nullptr;
    cudaError_t err;

    cudaSetDevice(0);

    printf("Loading memory\n");
    // Allocate 4 GiB on the device so this process has a non-trivial footprint
    size_t matrix_size = 1024ULL * 1024 * 1024 * sizeof(float);
    err = cudaMalloc((void **) &matrix, matrix_size);
    if (err != cudaSuccess) {
        printf("Error while executing cudaMalloc(): %d\n", err);
        goto end;
    }
    printf("Loaded memory\n");

    ret = nvmlInitWithFlags(0);
    if (ret != NVML_SUCCESS) {
        printf("Error while executing nvmlInitWithFlags(): %d\n", ret);
        goto end;
    }
    ret = nvmlDeviceGetHandleByIndex_v2(0, &device);
    if (ret != NVML_SUCCESS) {
        printf("Error while executing nvmlDeviceGetHandleByIndex_v2(): %d\n", ret);
        goto end;
    }

    // First call with a null array only queries the process count
    // (returns NVML_ERROR_INSUFFICIENT_SIZE when processes are present)
    ret = nvmlDeviceGetComputeRunningProcesses(device, &proc_count, nullptr);
    proc = (nvmlProcessInfo_t *) malloc(sizeof(nvmlProcessInfo_t) * (proc_count + 1));
    bzero(proc, sizeof(nvmlProcessInfo_t) * (proc_count + 1));

    printf("Number of processes: %u\n", proc_count);

    ret = nvmlDeviceGetComputeRunningProcesses(device, &proc_count, proc);
    if (ret != NVML_SUCCESS) {
        printf("Error while executing nvmlDeviceGetComputeRunningProcesses(): %d\n", ret);
        goto end;
    }

    for (unsigned int i = 0; i < proc_count; i++) {
        printf("PID: %u, Used Memory: %llu\n", proc[i].pid, proc[i].usedGpuMemory);
    }

    ret = nvmlDeviceGetMemoryInfo(device, &device_memory);
    if (ret != NVML_SUCCESS) {
        printf("Error while executing nvmlDeviceGetMemoryInfo(): %d\n", ret);
        goto end;
    }

    printf("Device total memory: %llu, Device used memory: %llu\n", device_memory.total, device_memory.used);

end:
    nvmlShutdown();
    cudaFree(matrix);
    free(proc);
    return 0;
}
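
For completeness, the driver and NVML library versions inside the container can be logged with the standard NVML version queries. The snippet below is just a small sketch (it assumes NVML has already been initialized, as in the test program above, and uses only stock NVML calls) that could be added to capture the environment for each reproduction:

// Log driver and NVML library versions for the reproduction report
char driver_ver[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
char nvml_ver[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE];

if (nvmlSystemGetDriverVersion(driver_ver, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) == NVML_SUCCESS &&
    nvmlSystemGetNVMLVersion(nvml_ver, NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) == NVML_SUCCESS) {
    printf("Driver: %s, NVML: %s\n", driver_ver, nvml_ver);
}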

On the majority of NVIDIA drivers (legacy builds below 555) the test program prints the PID from the host namespace, but the per-process memory usage numbers are accurate.
On 555-legacy or the latest OpenRM (open-kernel) drivers, however, NVML correctly resolves the PID inside the container's namespace, but it always reports the used GPU memory as 0, regardless of the process's actual VRAM consumption:

work@21c3c20e3977:/workspace$ ./test
Loading memory
Loaded memory
Number of processes: 1
PID: 370, Used Memory: 0
Device total memory: 85520809984, Device used memory: 5309333504
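
Note that NVML's "value not available" sentinel (NVML_VALUE_NOT_AVAILABLE, i.e. all bits set) would be printed by %llu as 18446744073709551615, so the 0 above appears to be a genuine zero rather than the sentinel. A small check over the same proc array as in the test program makes that explicit (just a sketch, not part of the original reproduction):

// Distinguish a genuine zero from NVML's "value not available" sentinel
for (unsigned int i = 0; i < proc_count; i++) {
    if (proc[i].usedGpuMemory == (unsigned long long) NVML_VALUE_NOT_AVAILABLE)
        printf("PID: %u, Used Memory: not available on this driver\n", proc[i].pid);
    else
        printf("PID: %u, Used Memory: %llu\n", proc[i].pid, proc[i].usedGpuMemory);
}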

Could anyone tell us whether this behavior is intentional? If so, will an alternative API be provided to retrieve the actual memory consumption of processes bound to the GPU?