CUDA gives old values with pinned memory managed by Matlab

rinkertt · January 6, 2022, 1:49pm

I am observing some weird behaviour when pinning (cudaHostRegister) memory managed by Matlab. My goal is to develop a MEX application that will first be initialized, and can be execute faster when called again with new data.

The initialization stage:

Pin/page-lock the memory from a Matlab array (using cudaHostRegister)
allocate device memory (cudaMalloc)

Then I can reuse the allocated device buffer and the pinned memory on subsequent function calls, making these subsequent function calls significantly faster.

This works great when the input coming from Matlab does not change (no size change or different array pointer). However, I would like to detect when the input data has changed size or (and this is where we get to the problem) the size remains the same but the array has been cleared and recreated by Matlab in the meantime.

When the matlab array was cleared and recreated, the output of my mex function is older data from previous function calls.

My Matlab script:

% compiling
mexcuda memRegisterTest.cu -R2018a -output memRegisterTestExec
g = gpuDevice; reset(g);
clear all 

% generate data and run mex function twice
RF = ones(2^28, 1, 'int16') * 13;
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc

% change the data (underlying array is the same)
RF = RF * 2;
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc

% clear and then recreate data
clear RF
RF = ones(2^28, 1, 'int16') * 46546;
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc
tic; result = memRegisterTestExec(RF); fprintf("result = %i,\t", result), toc

This gives me the following output:

Building with 'nvcc'.
MEX completed successfully.
[DEBUG] Running initialization
result = 13,    Elapsed time is 0.076995 seconds. % first run, initialization takes time
[DEBUG] Already Initialized
result = 13,    Elapsed time is 0.043920 seconds. % subsequent runs, much faster
[DEBUG] Already Initialized
result = 13,    Elapsed time is 0.043889 seconds.
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.044160 seconds. % data changed, still fast since same array
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.044127 seconds.
% array cleared in between and then recreated with same size (ptr is also the same)
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.044190 seconds. % still fast, but wrong answer!
[DEBUG] Already Initialized
result = 1123,  Elapsed time is 0.043873 seconds.

So I can see, the cudaMemcpy to device is still fast, since it is still using pinned memory (not shown here, but checked before). However, for some reason it is using the wrong memory, since the original data has been cleared in the meantime. The data is the same size, so therefore I suspect Matlab can use the same location, hence the pointer stays the same.

What is going on here, and how can I correctly detect that the memory has changed?

My code memRegisterTest.cu


#include "mex.h"
#include "gpu/mxGPUArray.h"
#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"

#define _DEBUG_MSG(str)                              \
    do                                               \
    {                                                \
        std::cout << "[DEBUG] " << str << std::endl; \
    } while (false)

// =============================================================================

static bool isInitialized = false;
// host
static size_t numel_RF_prev;          // to check for changed data size
static int16_t *h_RFData = NULL;      // pinned, keep ptr to unpin at exit/clear
static int16_t *h_RFData_prev = NULL; // previous pointer, to see if memory loc changed
// device
static int16_t *d_RFData = NULL;

void exitFcn()
{
    cudaHostUnregister(h_RFData); // unpin memory
    cudaFree(d_RFData);           // clear persistent variables
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        mexPrintf("CUDA error: %s\n", cudaGetErrorString(err));
}

// =============================================================================

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // INIT GPU
    if (mxInitGPU() != MX_GPU_SUCCESS)
        mexErrMsgTxt("Could not Initialize the MathWorks GPU API.");
    mexAtExit(exitFcn);

    // check input type, must be int16
    mxClassID RF_dtype = mxGetClassID(prhs[0]);
    if (RF_dtype != mxINT16_CLASS)
        mexErrMsgIdAndTxt("Main:Input", "First input argument (RF data) must be int16.");

    h_RFData = mxGetInt16s(prhs[0]);    // get input
    size_t numel_RF = mxGetNumberOfElements(prhs[0]);

    mwSize dims[1] = {1};      // create output dummy
    plhs[0] = mxCreateNumericArray(1, dims, mxINT16_CLASS, mxREAL); 
    int16_t *h_Scalar = mxGetInt16s(plhs[0]);

    // =========================================================================
    bool sizeChanged = (numel_RF_prev != numel_RF);
    bool ptrUpdated = (h_RFData_prev != h_RFData);

    if (isInitialized)
    {
        _DEBUG_MSG("Already Initialized");
        // check if memory still has same size
        if (sizeChanged)
        {
            _DEBUG_MSG("Data size changed");
            cudaHostUnregister(h_RFData_prev); // unpin old memory
            cudaFree(d_RFData);                // free old allocation
            isInitialized = false;             // reuse init method below
        }
        else if (ptrUpdated)
        {
            _DEBUG_MSG("Data ptr changed");
            cudaHostUnregister(h_RFData_prev); // unpin old memory
            isInitialized = false;             // reuse init method below
        }
    }

    if (!isInitialized)
    {
        _DEBUG_MSG("Running initialization");
        h_RFData_prev = h_RFData;
        numel_RF_prev = numel_RF;
        // pin memory
        if (ptrUpdated)
            cudaHostRegister(h_RFData, numel_RF * sizeof(int16_t), cudaHostRegisterMapped); // cudaHostRegisterPortable);
        // malloc memory
        if (sizeChanged)
            cudaMalloc((void **)&d_RFData, numel_RF * sizeof(int16_t));
        isInitialized = true;
    }

    // copy some data back and forth
    cudaMemcpy(d_RFData, h_RFData, numel_RF * sizeof(int16_t), cudaMemcpyHostToDevice);
    cudaMemcpy(h_Scalar, d_RFData, 1 * sizeof(int16_t), cudaMemcpyDeviceToHost);

    // CUDA error checking
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        mexPrintf("CUDA error: %s\n", cudaGetErrorString(err));
}

Topic		Replies	Views
matlab, mex files and loops CUDA Programming and Performance	12	11628	March 9, 2012
How to make a proper cleanup after a matlab mex file that uses cuda? CUDA Programming and Performance	2	5361	June 25, 2012
MATLAB examples on Linux Speed does not seem to scale with graphics card capabilities CUDA Programming and Performance	2	4553	May 26, 2009
cudaMalloc error in big loop CUDA Programming and Performance	12	15577	May 21, 2008
Problems with cudaHostAlloc and cudaMemcpyAsync CUDA Programming and Performance	5	4460	February 8, 2010
cudaFree is returning an unrecognised error code CUDA Programming and Performance	10	7887	March 13, 2009
Cuda Out of Memory with tons of memory left? CUDA Programming and Performance	5	38923	December 23, 2009
Cuda application crashes works fine for small data and crashes for big data CUDA Developer Tools	0	359	December 8, 2020
The output of the Cuda kernel is not correct. Any problem with my gateway MEX code? CUDA Programming and Performance	1	536	March 23, 2021
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	2950	November 13, 2017

CUDA gives old values with pinned memory managed by Matlab

Related Topics