cuGetMemInfo fails when nvcuda.dll is loaded dynamically

I’m trying to load the driver API at runtime using LoadLibrary/getProcAdress. I’m using Windows 10 Pro, Visual Studio 2019, Cuda 9.2, and a RTX 2080TI.

#include "C:/CUDA/9.2/include/cuda.h"
#include <iostream>
#include <Windows.h>

#define LOAD_CUDA_FUNCTION(NAME) _##NAME = (NAME##_p)GetProcAddress(hinst, #NAME);

typedef CUresult(*cuCtxCreate_p)(CUcontext*, unsigned int, CUdevice);
typedef CUresult(*cuCtxDestroy_p)(CUcontext);
typedef CUresult(*cuDeviceGet_p)(CUdevice*, int);
typedef CUresult(*cuInit_p)(unsigned int);
typedef CUresult(*cuMemGetInfo_p)(size_t*, size_t*);

cuCtxCreate_p _cuCtxCreate;
cuCtxDestroy_p _cuCtxDestroy;
cuDeviceGet_p _cuDeviceGet;
cuInit_p _cuInit;
cuMemGetInfo_p _cuMemGetInfo;

int main(int argc, char** argv)
{
    HINSTANCE hinst = LoadLibrary(TEXT("nvcuda.dll"));

    LOAD_CUDA_FUNCTION(cuCtxCreate);
    LOAD_CUDA_FUNCTION(cuCtxDestroy);
    LOAD_CUDA_FUNCTION(cuDeviceGet);
    LOAD_CUDA_FUNCTION(cuInit);
    LOAD_CUDA_FUNCTION(cuMemGetInfo);

    size_t free, total;

    CUdevice dev;
    CUcontext ctx;

    _cuInit(0);
    _cuDeviceGet(&dev, 0);
    _cuCtxCreate(&ctx, 0, dev);
    _cuMemGetInfo(&free, &total);

    std::cout << total / (1024 * 1024) << "\n";

    _cuCtxDestroy(ctx);
    FreeLibrary(hinst);

    return 0;
}

When I run the program, I get no errors, but only the lower 32 bit of total and free are being changed, the upper 32 bit remain 0xcccccccc in debug mode, i.e. total = 0xccccccccdaffffff.
Is there anything wrong with the calling convention, 32/64 bit version of the DLL, …?

bye,
loki

Update: same issue with Linux, using dlopen/dlsym.

cu,
loki

Ok, it seems that I have to use cuMemGetInfo_v2…

cu,
loki