CUDA gives old values with pinned memory managed by Matlab

I am observing some weird behaviour when pinning (cudaHostRegister) memory managed by Matlab. My goal is to develop a MEX application that will first be initialized, and can be execute faster when called again with new data.

The initialization stage:

  • Pin/page-lock the memory from a Matlab array (using cudaHostRegister)
  • allocate device memory (cudaMalloc)

Then I can reuse the allocated device buffer and the pinned memory on subsequent function calls, making these subsequent function calls significantly faster.

This works great when the input coming from Matlab does not change (no size change or different array pointer). However, I would like to detect when the input data has changed size or (and this is where we get to the problem) the size remains the same but the array has been cleared and recreated by Matlab in the meantime.

When the matlab array was cleared and recreated, the output of my mex function is older data from previous function calls.

My Matlab script:

% compiling
mexcuda memRegisterTest.cu -R2018a -output memRegisterTestExec
g = gpuDevice; reset(g);
clear all 

% generate data and run mex function twice
RF = ones(2^28, 1, 'int16') * 13;
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc

% change the data (underlying array is the same)
RF = RF * 2;
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc

% clear and then recreate data
clear RF
RF = ones(2^28, 1, 'int16') * 46546;
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc

This gives me the following output:

Building with 'nvcc'.
MEX completed successfully.
[DEBUG] Running initialization
result = 13,    Elapsed time is 0.076995 seconds. % first run, initialization takes time
[DEBUG] Already Initialized
result = 13,    Elapsed time is 0.043920 seconds. % subsequent runs, much faster
[DEBUG] Already Initialized
result = 13,    Elapsed time is 0.043889 seconds.
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.044160 seconds. % data changed, still fast since same array
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.044127 seconds.
% array cleared in between and then recreated with same size (ptr is also the same)
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.044190 seconds. % still fast, but wrong answer!
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.043873 seconds.

So I can see, the cudaMemcpy to device is still fast, since it is still using pinned memory (not shown here, but checked before). However, for some reason it is using the wrong memory, since the original data has been cleared in the meantime. The data is the same size, so therefore I suspect Matlab can use the same location, hence the pointer stays the same.

What is going on here, and how can I correctly detect that the memory has changed?

My code memRegisterTest.cu


#include "mex.h"
#include "gpu/mxGPUArray.h"
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"

#define _DEBUG_MSG(str)                              \
    do                                               \
    {                                                \
        std::cout << "[DEBUG] " << str << std::endl; \
    } while (false)

// =============================================================================

static bool isInitialized = false;
// host
static size_t numel_RF_prev;          // to check for changed data size
static int16_t *h_RFData = NULL;      // pinned, keep ptr to unpin at exit/clear
static int16_t *h_RFData_prev = NULL; // previous pointer, to see if memory loc changed
// device
static int16_t *d_RFData = NULL;

void exitFcn()
{
    cudaHostUnregister(h_RFData); // unpin memory
    cudaFree(d_RFData);           // clear persistent variables
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        mexPrintf("CUDA error: %s\n", cudaGetErrorString(err));
}

// =============================================================================

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // INIT GPU
    if (mxInitGPU() != MX_GPU_SUCCESS)
        mexErrMsgTxt("Could not Initialize the MathWorks GPU API.");
    mexAtExit(exitFcn);

    // check input type, must be int16
    mxClassID RF_dtype = mxGetClassID(prhs[0]);
    if (RF_dtype != mxINT16_CLASS)
        mexErrMsgIdAndTxt("Main:Input", "First input argument (RF data) must be int16.");

    h_RFData = mxGetInt16s(prhs[0]);    // get input
    size_t numel_RF = mxGetNumberOfElements(prhs[0]);

    mwSize dims[1] = {1};      // create output dummy
    plhs[0] = mxCreateNumericArray(1, dims, mxINT16_CLASS, mxREAL); 
    int16_t *h_Scalar = mxGetInt16s(plhs[0]);

    // =========================================================================
    bool sizeChanged = (numel_RF_prev != numel_RF);
    bool ptrUpdated = (h_RFData_prev != h_RFData);

    if (isInitialized)
    {
        _DEBUG_MSG("Already Initialized");
        // check if memory still has same size
        if (sizeChanged)
        {
            _DEBUG_MSG("Data size changed");
            cudaHostUnregister(h_RFData_prev); // unpin old memory
            cudaFree(d_RFData);                // free old allocation
            isInitialized = false;             // reuse init method below
        }
        else if (ptrUpdated)
        {
            _DEBUG_MSG("Data ptr changed");
            cudaHostUnregister(h_RFData_prev); // unpin old memory
            isInitialized = false;             // reuse init method below
        }
    }

    if (!isInitialized)
    {
        _DEBUG_MSG("Running initialization");
        h_RFData_prev = h_RFData;
        numel_RF_prev = numel_RF;
        // pin memory
        if (ptrUpdated)
            cudaHostRegister(h_RFData, numel_RF * sizeof(int16_t), cudaHostRegisterMapped); // cudaHostRegisterPortable);
        // malloc memory
        if (sizeChanged)
            cudaMalloc((void **)&d_RFData, numel_RF * sizeof(int16_t));
        isInitialized = true;
    }

    // copy some data back and forth
    cudaMemcpy(d_RFData, h_RFData, numel_RF * sizeof(int16_t), cudaMemcpyHostToDevice);
    cudaMemcpy(h_Scalar, d_RFData, 1 * sizeof(int16_t), cudaMemcpyDeviceToHost);

    // CUDA error checking
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        mexPrintf("CUDA error: %s\n", cudaGetErrorString(err));
}