How to share NvBufSurface with CUDA efficiently, without the overhead of cuGraphicsEGLRegisterImage/cuGraphicsUnregisterResource?

Hi,
I want to ask the same question that hkumar2 asked in another forum:

but he did not get an answer there.

According to jetson_multimedia_api/samples/common/algorithm/cuda/NvCudaProc.cpp, we have to call cuGraphicsEGLRegisterImage, cuGraphicsResourceGetMappedEglFrame, and cuGraphicsUnregisterResource on every frame in order to pass an NvBufSurface to CUDA.
But for a 4K image, cuGraphicsEGLRegisterImage takes more than 1 ms and cuGraphicsUnregisterResource another 700 us; together they take 1.7 ms, which is about 5% of the frame period at 30 fps.
This is too much; it is more than some of our CUDA algorithms take. For comparison, a cudaMemcpy of the same image takes 4.3 ms.
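
For reference, this is the per-frame pattern (modeled on NvCudaProc.cpp) that I am timing; the std::chrono wrapper and the function name are mine, added only to show where the 1.7 ms goes:

#include <chrono>
#include <cstdio>
#include "cudaEGL.h"

//Simplified per-frame pattern: register + get mapped frame, run CUDA, unregister.
void processFrame(EGLImageKHR eglImage)
{
    using clk = std::chrono::steady_clock;
    auto t0 = clk::now();
    CUgraphicsResource resource {};
    cuGraphicsEGLRegisterImage(&resource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
    CUeglFrame eglFrame;
    cuGraphicsResourceGetMappedEglFrame(&eglFrame, resource, 0, 0);
    auto t1 = clk::now();

    //... run the CUDA kernels on eglFrame.frame.pPitch[0] ...
    cuCtxSynchronize();

    auto t2 = clk::now();
    cuGraphicsUnregisterResource(resource);
    auto t3 = clk::now();

    printf("register %lld us, unregister %lld us\n",
           (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(),
           (long long)std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count());
}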

I thought that Orin has the advantage of memory shared between the CPU and GPU, so buffer sharing should be very quick, but currently cuGraphicsEGLRegisterImage and cuGraphicsUnregisterResource are show-stoppers.

I tried calling cuGraphicsEGLRegisterImage only once, after NvBufSurface creation, and it mostly works, except for some small image corruption: some groups of pixels, maybe cache lines, are retained in the new frame from the previous one.

So, my question is: how do we share buffers with the GPU efficiently, without the 1.7 ms penalty?
Maybe there is a way to flush the cache on the CPU or GPU side without registering/unregistering the image every time?
NvBufSurfaceSyncForCpu/NvBufSurfaceSyncForDevice do not appear to help.

Thank you

Hi,
The function calls are obligatory, but it should be fine to call them only in initialization and termination. Please call NvBufSurfaceSyncForDevice() before the CUDA code and cuCtxSynchronize() after the CUDA code to ensure the data is well synchronized.
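
In rough form, the intended per-frame flow is like the sketch below (nvbufSurf and the mapped GPU pointer are assumed to come from one-time initialization, with cuGraphicsUnregisterResource() called only at termination):

#include <cuda.h>
#include "NvBufSurface.h"

//Sketch of the per-frame flow; register/map is done once in initialization.
void handleFrame(NvBufSurface *nvbufSurf, void *sharedGpuPtr)
{
    NvBufSurfaceSyncForDevice(nvbufSurf, 0, 0);//make CPU writes visible to the GPU

    //__CUDA_process_code__ operating on sharedGpuPtr

    cuCtxSynchronize();//wait for the CUDA work to finish
    NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0);//before reading the result back on the CPU
}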

Right now I do not have problems with CUDA seeing the latest changes from the CPU,
but the CPU does not always see the latest changes from CUDA: most of the data is updated, but a few cache lines at various offsets are stale.
I tried calling NvBufSurfaceSyncForDevice, NvBufSurfaceSyncForCpu, and cuCtxSynchronize in different combinations and orders, and the data is still stale.
But if I call cuGraphicsUnregisterResource, then the data is synchronized properly.
Apparently, cuGraphicsUnregisterResource flushes the GPU cache in some other way, unlike NvBufSurfaceSyncForDevice… Are there some other APIs for flushing the cache?

I implemented the shortest sample code that reproduces the problem. It copies into the shared buffer on the CPU and compares on the GPU, and vice versa. If you run it as-is, it will fail on the first loop iteration. But if you uncomment the line "//eglRegisterForEveryFrame = true", then it will succeed, at the expense of calling cuGraphicsEGLRegisterImage, cuGraphicsResourceGetMappedEglFrame, and cuGraphicsUnregisterResource on every frame, which is what we want to avoid.
Please help me fix this code.

/*
/usr/local/cuda/bin/nvcc -x cu \
    -I/usr/local/cuda -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    -I/usr/local/cuda-11.4/targets/aarch64-linux/include/ \
    -L/usr/local/cuda/lib64  -L/usr/lib/aarch64-linux-gnu/tegra/ \
   -lcuda -lnvbufsurface -lnvrm_surface -lnvrm_mem -lcudart -lnppif -lnppc -lnppisu -lnppidei \
    testSharedNvBufSurface.cpp -o testSharedNvBufSurface
*/
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "nvosd.h"
#include "cudaEGL.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <cudaProfiler.h>
//Use thrust only to compare buffers on GPU using thrust::equal
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main(int argc, char ** argv)
{
    bool eglRegisterForEveryFrame {false};
    //eglRegisterForEveryFrame = true;//Uncomment to call very slow cuGraphicsEGLRegisterImage on every frame
    int bufferSize { 3840 * 2160 * 4 };//For simplicity let's create a 1-D 1 byte array of the same size as 4K RGB image
    assert(CUDA_SUCCESS == cuInit(0));
    CUdevice cudaDevice;
    assert(CUDA_SUCCESS == cuDeviceGet(&cudaDevice, 0));
    CUcontext context;
    assert(CUDA_SUCCESS == cuCtxCreate(&context, 0, cudaDevice));
    
    //Create NvBufSurface shared between CPU and GPU
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = bufferSize;
    inputParams.params.height = 1;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = NVBUF_COLOR_FORMAT_GRAY8;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    NvBufSurface *nvbufSurf = 0;
    assert(0 == NvBufSurfaceAllocate(&nvbufSurf, 1, &inputParams));
    nvbufSurf->numFilled = 1;
    int dmaBuffer = nvbufSurf->surfaceList[0].bufferDesc;
    printf("NvBufSurfaceAllocate ret nvbufSurf %p dmaBuffer %d\n", nvbufSurf, dmaBuffer);
    //Map to CPU:
    assert(1 == nvbufSurf->surfaceList->planeParams.num_planes);
    assert(0 == NvBufSurfaceMap(nvbufSurf, 0, 0, NVBUF_MAP_READ_WRITE));
    char * sharedCpuPtr = (char *)nvbufSurf->surfaceList->mappedAddr.addr[0];
    //Map to GPU:
    assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, 0));
    EGLImageKHR eglImage = nvbufSurf->surfaceList->mappedAddr.eglImage;
    printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
    CUgraphicsResource cuGraphicsResource {};
    assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
    CUeglFrame eglFrame;
    assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
    char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
    printf("Shared NvBufSurface mapped to CPU: %p GPU: %p\n", sharedCpuPtr, sharedGpuPtr);
    
    //Create 2 CPU and 2 GPU buffers:
    thrust::host_vector<uint8_t> cpuVectors[2] { 
        thrust::host_vector<uint8_t>(bufferSize), 
        thrust::host_vector<uint8_t>(bufferSize) };
    thrust::device_vector<uint8_t> gpuVectors[2] { 
        thrust::device_vector<uint8_t>(bufferSize), 
        thrust::device_vector<uint8_t>(bufferSize) };
    printf("Allocated CPU: %p %p GPU: %p %p\n", cpuVectors[0].data(), cpuVectors[1].data(), 
           gpuVectors[0].data().get(), (void*)gpuVectors[1].data().get()); 
    thrust::fill(cpuVectors[0].begin(), cpuVectors[0].end(), 0);
    thrust::fill(cpuVectors[1].begin(), cpuVectors[1].end(), 0xff);
    thrust::fill(gpuVectors[0].begin(), gpuVectors[0].end(), 0);
    thrust::fill(gpuVectors[1].begin(), gpuVectors[1].end(), 0xff);
    
    for(int frameIdx = 0; frameIdx < 10; frameIdx++) {//Simulate frame handling loop
        int buffIdx = {frameIdx & 1};
        printf("Copy on CPU\n");
        memcpy(sharedCpuPtr, cpuVectors[buffIdx].data(), bufferSize);//Copy to shared on CPU
        assert(0 == NvBufSurfaceSyncForDevice(nvbufSurf, 0, 0));
        if(eglRegisterForEveryFrame && !cuGraphicsResource) { 
            assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE)); 
            assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
        }
        printf("Compare on GPU\n");
        assert(true == thrust::equal(gpuVectors[buffIdx].begin(), gpuVectors[buffIdx].end(), sharedGpuPtr));
        printf("Copy on GPU\n");
        buffIdx = !buffIdx;
        assert(0 == cudaMemcpy(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice));
        assert(0 == cuCtxSynchronize());
        assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
        if(eglRegisterForEveryFrame) { cuGraphicsUnregisterResource(cuGraphicsResource); cuGraphicsResource = nullptr; }
        printf("Compare on CPU\n");
        assert(true == thrust::equal(cpuVectors[buffIdx].begin(), cpuVectors[buffIdx].end(), sharedCpuPtr));
    }
    return 0;
}

Hi,
Thanks for sharing the sample. We will try it on a JetPack 5.1.2 AGX Orin developer kit.

Hi,

Is the issue of cache lines not being flushed guaranteed to happen every time in your use case?
Or does it need to be triggered under specific circumstances?

I tried your code on JetPack 5.1.2 with CUDA 11.4, and it worked fine without calling either cuGraphicsEGLRegisterImage() or cuGraphicsResourceGetMappedEglFrame().

I found 2 interesting things:

  1. The problem happens on any Orin running JetPack 5.1.2, but only after I call “sudo jetson_clocks” to boost clocks

  2. If there is another app running in the background that calls cuGraphicsUnregisterResource on any buffer, even a very small one, then my large 4K buffer is also flushed and the problem disappears.
    To reproduce it, I modified my code above and replaced the lines:

    bool eglRegisterForEveryFrame {false};
    //eglRegisterForEveryFrame = true;//Uncomment to call very slow cuGraphicsEGLRegisterImage on every frame
    int bufferSize { 3840 * 2160 * 4 };

by

    bool eglRegisterForEveryFrame = getenv("eglRegisterForEveryFrame") != nullptr;
    int width = strtol(getenv("width"), nullptr, 10);
    int height = strtol(getenv("height"), nullptr, 10);
    int numFrames = strtol(getenv("numFrames"), nullptr, 10);
    printf("eglRegisterForEveryFrame %d width %d height %d numFrames %d\n", eglRegisterForEveryFrame, width, height, numFrames);
    int bufferSize { width * height * 4 };//For simplicity let's create a 1-D 1 byte array of the same size as 4K RGB image

and

    for(int frameIdx = 0; frameIdx < 10; frameIdx++)

by

    for(int frameIdx = 0; frameIdx < numFrames; frameIdx++)

Then I can run in one shell:
width=1 height=1 numFrames=10000000 eglRegisterForEveryFrame=1 ./testSharedNvBufSurface
and in another shell:
width=3840 height=2160 numFrames=10 ./testSharedNvBufSurface
and the problem does not happen, but as soon as I kill the first app, the second one immediately fails the assertion.

So this is a kind of workaround kludge (sketched below): create a 1-byte NvBufSurface and call cuGraphicsEGLRegisterImage/cuGraphicsUnregisterResource on it in order to flush the cache of the 4K image. But this is not documented and not guaranteed to work.
This also looks like another bug: if calling cuGraphicsUnregisterResource causes a GPU cache flush on another buffer in another application, then it may affect the performance of that second application.
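
In code, the kludge is roughly the following (the function name is mine; it relies on undocumented behavior and is not guaranteed to keep working):

#include "cudaEGL.h"

//Undocumented workaround: register and immediately unregister a tiny dummy
//NvBufSurface; the unregister call is what appears to flush the GPU cache,
//and as a side effect it also covers the big 4K buffer.
void kludgeFlushGpuCache(EGLImageKHR dummyEglImage)//EGLImage of a 1-byte NvBufSurface
{
    CUgraphicsResource dummy {};
    if(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&dummy, dummyEglImage,
                                                  CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE)) {
        cuGraphicsUnregisterResource(dummy);
    }
}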

Anyway, we need a clean API to quickly flush the GPU cache for only one specific NvBufSurface or, maybe, for one GPU stream.
Thank you

Hi,

Have you tried cuGraphicsUnmapResources()?
The operation should be limited to a single CUDA stream.
https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPHICS.html
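
Roughly like the sketch below (not verified on your exact case): keep the resource registered, and per frame only map before the CUDA work and unmap after it, scoped to a single stream.

#include <cuda.h>

//Sketch: the resource stays registered from initialization; map/unmap per frame.
void processOneFrame(CUgraphicsResource resource, CUstream stream)
{
    cuGraphicsMapResources(1, &resource, stream);
    //... submit the CUDA work that touches the mapped frame to this stream ...
    cuGraphicsUnmapResources(1, &resource, stream);
    cuStreamSynchronize(stream);
}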

No, the problem is still here. Here is my updated code. I added options to run with and without cuGraphicsUnmapResources, and with and without a stream. Please double-check that I am using all the APIs correctly.

/*
/usr/local/cuda/bin/nvcc -x cu \
    -I/usr/local/cuda -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    -I/usr/local/cuda-11.4/targets/aarch64-linux/include/ \
    -L/usr/local/cuda/lib64  -L/usr/lib/aarch64-linux-gnu/tegra/ \
   -lcuda -lnvbufsurface -lnvrm_surface -lnvrm_mem -lcudart -lnppif -lnppc -lnppisu -lnppidei \
    testSharedNvBufSurface.cpp -o testSharedNvBufSurface
width=3840 height=2160 numFrames=10 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 eglRegisterForEveryFrame=1 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 cuGraphicsMapResourcesEveryFrame=1 useStream=1 ./testSharedNvBufSurface
*/
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "nvosd.h"
#include "cudaEGL.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <cudaProfiler.h>
//Use thrust only to compare buffers on GPU using thrust::equal
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main(int argc, char ** argv)
{
    bool eglRegisterForEveryFrame = getenv("eglRegisterForEveryFrame") != nullptr;
    bool cuGraphicsMapResourcesEveryFrame = getenv("cuGraphicsMapResourcesEveryFrame") != nullptr;
    bool useStream = getenv("useStream") != nullptr;
    int width = strtol(getenv("width"), nullptr, 10);
    int height = strtol(getenv("height"), nullptr, 10);
    int numFrames = strtol(getenv("numFrames"), nullptr, 10);
    printf("eglRegisterForEveryFrame %d cuGraphicsMapResourcesEveryFrame %d width %d height %d numFrames %d\n",
           eglRegisterForEveryFrame, cuGraphicsMapResourcesEveryFrame, width, height, numFrames);
    int bufferSize { width * height * 4 };//For simplicity let's create a 1-D 1 byte array of the same size as 4K RGB image
    assert(CUDA_SUCCESS == cuInit(0));
    CUdevice cudaDevice;
    assert(CUDA_SUCCESS == cuDeviceGet(&cudaDevice, 0));
    CUcontext context;
    assert(CUDA_SUCCESS == cuCtxCreate(&context, 0, cudaDevice));
    
    CUstream hStream {0};
    if(useStream) {
        printf("Create stream\n");
        assert( cudaSuccess == cudaStreamCreate(&hStream));
    }
    
    //Create NvBufSurface shared between CPU and GPU
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = bufferSize;
    inputParams.params.height = 1;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = NVBUF_COLOR_FORMAT_GRAY8;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    NvBufSurface *nvbufSurf = 0;
    assert(0 == NvBufSurfaceAllocate(&nvbufSurf, 1, &inputParams));
    nvbufSurf->numFilled = 1;
    int dmaBuffer = nvbufSurf->surfaceList[0].bufferDesc;
    printf("NvBufSurfaceAllocate ret nvbufSurf %p dmaBuffer %d\n", nvbufSurf, dmaBuffer);
    //Map to CPU:
    assert(1 == nvbufSurf->surfaceList->planeParams.num_planes);
    assert(0 == NvBufSurfaceMap(nvbufSurf, 0, 0, NVBUF_MAP_READ_WRITE));
    char * sharedCpuPtr = (char *)nvbufSurf->surfaceList->mappedAddr.addr[0];
    //Map to GPU:
    assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, 0));
    EGLImageKHR eglImage = nvbufSurf->surfaceList->mappedAddr.eglImage;
    printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
    CUgraphicsResource cuGraphicsResource {};
    assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
    CUeglFrame eglFrame;
    assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
    char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
    printf("Shared NvBufSurface mapped to CPU: %p GPU: %p\n", sharedCpuPtr, sharedGpuPtr);
    
    //Create 2 CPU and 2 GPU buffers:
    thrust::host_vector<uint8_t> cpuVectors[2] { 
        thrust::host_vector<uint8_t>(bufferSize), 
        thrust::host_vector<uint8_t>(bufferSize) };
    thrust::device_vector<uint8_t> gpuVectors[2] { 
        thrust::device_vector<uint8_t>(bufferSize), 
        thrust::device_vector<uint8_t>(bufferSize) };
    printf("Allocated CPU: %p %p GPU: %p %p\n", cpuVectors[0].data(), cpuVectors[1].data(), 
           gpuVectors[0].data().get(), (void*)gpuVectors[1].data().get()); 
    thrust::fill(cpuVectors[0].begin(), cpuVectors[0].end(), 0);
    thrust::fill(cpuVectors[1].begin(), cpuVectors[1].end(), 0xff);
    thrust::fill(gpuVectors[0].begin(), gpuVectors[0].end(), 0);
    thrust::fill(gpuVectors[1].begin(), gpuVectors[1].end(), 0xff);
    
    for(int frameIdx = 0; frameIdx < numFrames; frameIdx++) {//Simulate frame handling loop
        int buffIdx = {frameIdx & 1};
        printf("Copy on CPU\n");
        memcpy(sharedCpuPtr, cpuVectors[buffIdx].data(), bufferSize);//Copy to shared on CPU
        assert(0 == NvBufSurfaceSyncForDevice(nvbufSurf, 0, 0));
        if(eglRegisterForEveryFrame && !cuGraphicsResource) { 
            assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE)); 
            assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
        }
        if(cuGraphicsMapResourcesEveryFrame) { 
            printf("map\n"); assert( cuGraphicsMapResources(1, &cuGraphicsResource, hStream) ); 
        }
        printf("Compare on GPU\n");
        assert(true == thrust::equal(thrust::cuda::par.on(hStream), 
                                     gpuVectors[buffIdx].begin(), gpuVectors[buffIdx].end(), sharedGpuPtr));
        printf("Copy on GPU\n");
        buffIdx = !buffIdx;
        if(useStream) {
            assert(0 == cudaMemcpyAsync(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice, hStream));
        }
        else {
            assert(0 == cudaMemcpy(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice));
        }
        assert(0 == cuCtxSynchronize());
        assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
        if(cuGraphicsMapResourcesEveryFrame) { 
            printf("unmap\n"); assert( cuGraphicsUnmapResources(1, &cuGraphicsResource, hStream) ); 
        }
        if(eglRegisterForEveryFrame) { cuGraphicsUnregisterResource(cuGraphicsResource); cuGraphicsResource = nullptr; }
        printf("Compare on CPU\n");
        assert( !memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize));
    }
    return 0;
}

Hi,

Thanks for the update.
I want us to align on the issues we have seen so far:

  1. If you don’t call the combination of cuGraphicsEGLRegisterImage(), cuGraphicsResourceGetMappedEglFrame(), and cuGraphicsUnregisterResource() on every frame, the data in the buffer is not flushed completely, and you want to know if you can avoid the expense of calling these functions.
  2. If there are two processes running concurrently, one calling all of these functions and one not calling them at all, cuGraphicsUnregisterResource() also flushes the buffer used by the other process, which is not expected.

Is that right?
Also, if you have a dedicated NVIDIA GPU, can you test whether this issue is specific to Jetson or not?

About 1 and 2: yes, this is my interpretation of the observed effects.
Except on item 2: it does not matter whether cuGraphicsUnregisterResource is called by another process or by the same process - unregistering one buffer appears to cause a cache flush on the other one.

Currently I do not have a non-Jetson GPU. I thought you work at NVIDIA and have access to a variety of GPUs? Or maybe a person with access to the cuGraphicsUnregisterResource sources can quickly see what exactly it calls to flush the cache?
Thank you

Hi,
We have checked this and it should be good to call the functions once in initialization and termination:

// initialization
status = cuGraphicsEGLRegisterImage(&pResource, image,
              CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
// termination
status = cuGraphicsUnregisterResource(pResource);

And call cuCtxSynchronize() at the head and tail of each frame, like:

status = cuCtxSynchronize();

__CUDA_process_code__;

status = cuCtxSynchronize();

Please give it a try.

Nope. I added extra cuCtxSynchronize calls around every step, but the assertion still happens:

width=3840 height=2160 numFrames=10 extraSynchronize=1 ./testSharedNvBufSurface
Copy on CPU
cuCtxSynchronize
cuCtxSynchronize
cuCtxSynchronize
cuCtxSynchronize
Compare on GPU
cuCtxSynchronize
Copy on GPU
cuCtxSynchronize
cuCtxSynchronize
Compare on CPU
testSharedNvBufSurface: testSharedNvBufSurface.cpp:150: int main(int, char**): Assertion `!memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize)' failed.

Here is my latest code:

/*
/usr/local/cuda/bin/nvcc -x cu \
    -I/usr/local/cuda -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    -I/usr/local/cuda-11.4/targets/aarch64-linux/include/ \
    -L/usr/local/cuda/lib64  -L/usr/lib/aarch64-linux-gnu/tegra/ \
   -lcuda -lnvbufsurface -lnvrm_surface -lnvrm_mem -lcudart -lnppif -lnppc -lnppisu -lnppidei \
    testSharedNvBufSurface.cpp -o testSharedNvBufSurface
width=3840 height=2160 numFrames=10 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 eglRegisterForEveryFrame=1 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 cuGraphicsMapResourcesEveryFrame=1 useStream=1 ./testSharedNvBufSurface
width=3840 height=2160 numFrames=10 extraSynchronize=1 ./testSharedNvBufSurface
*/
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "nvosd.h"
#include "cudaEGL.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <cudaProfiler.h>
//Use thrust only to compare buffers on GPU using thrust::equal
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main(int argc, char ** argv)
{
    bool eglRegisterForEveryFrame = getenv("eglRegisterForEveryFrame") != nullptr;
    bool cuGraphicsMapResourcesEveryFrame = getenv("cuGraphicsMapResourcesEveryFrame") != nullptr;
    bool useStream = getenv("useStream") != nullptr;
    bool extraSynchronize = getenv("extraSynchronize") != nullptr;
    int width = strtol(getenv("width"), nullptr, 10);
    int height = strtol(getenv("height"), nullptr, 10);
    int numFrames = strtol(getenv("numFrames"), nullptr, 10);
    printf("eglRegisterForEveryFrame %d cuGraphicsMapResourcesEveryFrame %d extraSynchronize %d width %d height %d numFrames %d\n",
           eglRegisterForEveryFrame, cuGraphicsMapResourcesEveryFrame, extraSynchronize, width, height, numFrames);
    int bufferSize { width * height * 4 };//For simplicity let's create a 1-D 1 byte array of the same size as 4K RGB image
    assert(CUDA_SUCCESS == cuInit(0));
    CUdevice cudaDevice;
    assert(CUDA_SUCCESS == cuDeviceGet(&cudaDevice, 0));
    CUcontext context;
    assert(CUDA_SUCCESS == cuCtxCreate(&context, 0, cudaDevice));
    
    CUstream hStream {0};
    if(useStream) {
        printf("Create stream\n");
        assert( cudaSuccess == cudaStreamCreate(&hStream));
    }
    
    //Create NvBufSurface shared between CPU and GPU
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = bufferSize;
    inputParams.params.height = 1;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = NVBUF_COLOR_FORMAT_GRAY8;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    NvBufSurface *nvbufSurf = 0;
    assert(0 == NvBufSurfaceAllocate(&nvbufSurf, 1, &inputParams));
    nvbufSurf->numFilled = 1;
    int dmaBuffer = nvbufSurf->surfaceList[0].bufferDesc;
    printf("NvBufSurfaceAllocate ret nvbufSurf %p dmaBuffer %d\n", nvbufSurf, dmaBuffer);
    //Map to CPU:
    assert(1 == nvbufSurf->surfaceList->planeParams.num_planes);
    assert(0 == NvBufSurfaceMap(nvbufSurf, 0, 0, NVBUF_MAP_READ_WRITE));
    char * sharedCpuPtr = (char *)nvbufSurf->surfaceList->mappedAddr.addr[0];
    //Map to GPU:
    assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, 0));
    EGLImageKHR eglImage = nvbufSurf->surfaceList->mappedAddr.eglImage;
    printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
    CUgraphicsResource cuGraphicsResource {};
    assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
    CUeglFrame eglFrame;
    assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
    char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
    printf("Shared NvBufSurface mapped to CPU: %p GPU: %p\n", sharedCpuPtr, sharedGpuPtr);
    
    //Create 2 CPU and 2 GPU buffers:
    thrust::host_vector<uint8_t> cpuVectors[2] { 
        thrust::host_vector<uint8_t>(bufferSize), 
        thrust::host_vector<uint8_t>(bufferSize) };
    thrust::device_vector<uint8_t> gpuVectors[2] { 
        thrust::device_vector<uint8_t>(bufferSize), 
        thrust::device_vector<uint8_t>(bufferSize) };
    printf("Allocated CPU: %p %p GPU: %p %p\n", cpuVectors[0].data(), cpuVectors[1].data(), 
           gpuVectors[0].data().get(), (void*)gpuVectors[1].data().get()); 
    thrust::fill(cpuVectors[0].begin(), cpuVectors[0].end(), 0);
    thrust::fill(cpuVectors[1].begin(), cpuVectors[1].end(), 0xff);
    thrust::fill(gpuVectors[0].begin(), gpuVectors[0].end(), 0);
    thrust::fill(gpuVectors[1].begin(), gpuVectors[1].end(), 0xff);
    
    for(int frameIdx = 0; frameIdx < numFrames; frameIdx++) {//Simulate frame handling loop
        int buffIdx = {frameIdx & 1};
        printf("Copy on CPU\n");
        memcpy(sharedCpuPtr, cpuVectors[buffIdx].data(), bufferSize);//Copy to shared on CPU
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        assert(0 == NvBufSurfaceSyncForDevice(nvbufSurf, 0, 0));
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        if(eglRegisterForEveryFrame && !cuGraphicsResource) { 
            assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE)); 
            assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
        }
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        if(cuGraphicsMapResourcesEveryFrame) { 
            printf("map\n"); assert( cuGraphicsMapResources(1, &cuGraphicsResource, hStream) ); 
        }
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        printf("Compare on GPU\n");
        assert(true == thrust::equal(thrust::cuda::par.on(hStream), 
                                     gpuVectors[buffIdx].begin(), gpuVectors[buffIdx].end(), sharedGpuPtr));
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        printf("Copy on GPU\n");
        buffIdx = !buffIdx;
        if(useStream) {
            assert(0 == cudaMemcpyAsync(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice, hStream));
        }
        else {
            assert(0 == cudaMemcpy(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice));
        }
        printf("cuCtxSynchronize\n");
        assert(0 == cuCtxSynchronize());
        if(cuGraphicsMapResourcesEveryFrame) { 
            printf("unmap\n"); assert( cuGraphicsUnmapResources(1, &cuGraphicsResource, hStream) ); 
        }
        if(eglRegisterForEveryFrame) { cuGraphicsUnregisterResource(cuGraphicsResource); cuGraphicsResource = nullptr; }
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        printf("Compare on CPU\n");
        assert( !memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize));
    }
    return 0;
}

Hi,
Please call NvBufSurfaceSyncForCpu() before accessing the data on the CPU:

        printf("Copy on CPU\n");
        NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0);
        memcpy(sharedCpuPtr, cpuVectors[buffIdx].data(), bufferSize);//Copy to shared on CPU
        printf("Compare on CPU\n");
        NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0);
        assert( !memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize));

Just tested; this does not solve the issue. In my case, I do the CUDA processing on the frame in RGBA space, then use NvBufSurfTransform to convert it to NV12 for encoding. The CUDA processing does not show up properly in the results. Only when I enable the paired cuGraphicsEGLRegisterImage and cuGraphicsUnregisterResource calls does the result look as expected.
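
Roughly, my per-frame flow looks like the sketch below (the kernel, surface names, and transform parameters are placeholders for my real code; the transform header name is from my tree):

#include <cuda_runtime.h>
#include "NvBufSurface.h"
#include "nvbufsurftransform.h"//header declaring NvBufSurfTransform in my tree

//Placeholder kernel standing in for my real RGBA processing.
__global__ void myCudaKernel(uchar4 *rgba, int width, int height) { /* ... */ }

//surfRGBA/surfNV12 are NvBufSurfaces allocated elsewhere; rgbaDevPtr is the device
//pointer obtained from the mapped CUeglFrame of surfRGBA.
void processAndConvert(NvBufSurface *surfRGBA, NvBufSurface *surfNV12,
                       uchar4 *rgbaDevPtr, int width, int height)
{
    myCudaKernel<<<dim3((width + 31) / 32, (height + 31) / 32), dim3(32, 32)>>>(rgbaDevPtr, width, height);
    cudaDeviceSynchronize();
    NvBufSurfaceSyncForCpu(surfRGBA, 0, 0);//tried; does not fix the missing edits

    NvBufSurfTransformParams params = {};
    params.transform_flag = NVBUFSURF_TRANSFORM_FILTER;
    params.transform_filter = NvBufSurfTransformInter_Nearest;
    //Without per-frame cuGraphicsEGLRegisterImage/cuGraphicsUnregisterResource,
    //the CUDA edits do not show up in the NV12 output that goes to the encoder.
    NvBufSurfTransform(surfRGBA, surfNV12, &params);
}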

No, NvBufSurfaceSyncForCpu appears to do nothing useful here, just like cuGraphicsUnmapResources - the code works (or fails) the same with or without them.
Could you please ask somebody with access to the sources to check what cuGraphicsUnregisterResource is really doing to the GPU cache?
Thank you

I think I might be experiencing this same issue now. I have two GStreamer plugins in series performing transforms on an image. If, for the first plugin, I only call cuGraphicsEGLRegisterImage and cuGraphicsUnregisterResource once instead of on every iteration, the output image seems okay. However, if I chain the two plugins together, the output of both the first and the second plugin has lines near the bottom half of each image. If both plugins call cuGraphicsEGLRegisterImage and cuGraphicsUnregisterResource each time an image is received, these problems go away.

Hi @jhnlmn
So after adding NvBufSurfaceSyncForCpu(), you still hit the assertion?

testSharedNvBufSurface: testSharedNvBufSurface.cpp:150: int main(int, char**): Assertion `!memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize)' failed.

Of course, I was using NvBufSurfaceSyncForCpu from the start, as I mentioned in my original question.
Now, to remove any doubt, I added extra NvBufSurfaceSyncForCpu calls and extra printfs, and I am getting:

width=3840 height=2160 numFrames=10 extraSynchronize=1 ./testSharedNvBufSurface
eglRegisterForEveryFrame 0 cuGraphicsMapResourcesEveryFrame 0 extraSynchronize 1 width 3840 height 2160 numFrames 10
NvBufSurfaceAllocate ret nvbufSurf 0xaaaad98a9f20 dmaBuffer 38
NvBufSurfaceMapEglImage ret eglImage 0xaaaad98aa841
Shared NvBufSurface mapped to CPU: 0xffff68160000 GPU: 0x205227000
Allocated CPU: 0xffff8205b010 0xffff800b6010 GPU: 0x20d0d7000 0x20f07b000
Copy on CPU
cuCtxSynchronize
cuCtxSynchronize
cuCtxSynchronize
cuCtxSynchronize
Compare on GPU
cuCtxSynchronize
Copy on GPU
NvBufSurfaceSyncForCpu
cuCtxSynchronize
NvBufSurfaceSyncForCpu
cuCtxSynchronize
NvBufSurfaceSyncForCpu
Compare on CPU
testSharedNvBufSurface: testSharedNvBufSurface.cpp:156: int main(int, char**): Assertion `!memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize)' failed.

This is my latest test code:

/*
/usr/local/cuda/bin/nvcc -x cu \
    -I/usr/local/cuda -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    -I/usr/local/cuda-11.4/targets/aarch64-linux/include/ \
    -L/usr/local/cuda/lib64  -L/usr/lib/aarch64-linux-gnu/tegra/ \
   -lcuda -lnvbufsurface -lnvrm_surface -lnvrm_mem -lcudart -lnppif -lnppc -lnppisu -lnppidei \
    testSharedNvBufSurface.cpp -o testSharedNvBufSurface
width=3840 height=2160 numFrames=10 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 eglRegisterForEveryFrame=1 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 cuGraphicsMapResourcesEveryFrame=1 useStream=1 ./testSharedNvBufSurface
width=3840 height=2160 numFrames=10 extraSynchronize=1 ./testSharedNvBufSurface
*/
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "nvosd.h"
#include "cudaEGL.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <cudaProfiler.h>
//Use thrust only to compare buffers on GPU using thrust::equal
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

int main(int argc, char ** argv)
{
    bool eglRegisterForEveryFrame = getenv("eglRegisterForEveryFrame") != nullptr;
    bool cuGraphicsMapResourcesEveryFrame = getenv("cuGraphicsMapResourcesEveryFrame") != nullptr;
    bool useStream = getenv("useStream") != nullptr;
    bool extraSynchronize = getenv("extraSynchronize") != nullptr;
    int width = strtol(getenv("width"), nullptr, 10);
    int height = strtol(getenv("height"), nullptr, 10);
    int numFrames = strtol(getenv("numFrames"), nullptr, 10);
    printf("eglRegisterForEveryFrame %d cuGraphicsMapResourcesEveryFrame %d extraSynchronize %d width %d height %d numFrames %d\n",
           eglRegisterForEveryFrame, cuGraphicsMapResourcesEveryFrame, extraSynchronize, width, height, numFrames);
    int bufferSize { width * height * 4 };//For simplicity let's create a 1-D 1 byte array of the same size as 4K RGB image
    assert(CUDA_SUCCESS == cuInit(0));
    CUdevice cudaDevice;
    assert(CUDA_SUCCESS == cuDeviceGet(&cudaDevice, 0));
    CUcontext context;
    assert(CUDA_SUCCESS == cuCtxCreate(&context, 0, cudaDevice));
    
    CUstream hStream {0};
    if(useStream) {
        printf("Create stream\n");
        assert( cudaSuccess == cudaStreamCreate(&hStream));
    }
    
    //Create NvBufSurface shared between CPU and GPU
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = bufferSize;
    inputParams.params.height = 1;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = NVBUF_COLOR_FORMAT_GRAY8;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    NvBufSurface *nvbufSurf = 0;
    assert(0 == NvBufSurfaceAllocate(&nvbufSurf, 1, &inputParams));
    nvbufSurf->numFilled = 1;
    int dmaBuffer = nvbufSurf->surfaceList[0].bufferDesc;
    printf("NvBufSurfaceAllocate ret nvbufSurf %p dmaBuffer %d\n", nvbufSurf, dmaBuffer);
    //Map to CPU:
    assert(1 == nvbufSurf->surfaceList->planeParams.num_planes);
    assert(0 == NvBufSurfaceMap(nvbufSurf, 0, 0, NVBUF_MAP_READ_WRITE));
    char * sharedCpuPtr = (char *)nvbufSurf->surfaceList->mappedAddr.addr[0];
    //Map to GPU:
    assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, 0));
    EGLImageKHR eglImage = nvbufSurf->surfaceList->mappedAddr.eglImage;
    printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
    CUgraphicsResource cuGraphicsResource {};
    assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
    CUeglFrame eglFrame;
    assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
    char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
    printf("Shared NvBufSurface mapped to CPU: %p GPU: %p\n", sharedCpuPtr, sharedGpuPtr);
    
    //Create 2 CPU and 2 GPU buffers:
    thrust::host_vector<uint8_t> cpuVectors[2] { 
        thrust::host_vector<uint8_t>(bufferSize), 
        thrust::host_vector<uint8_t>(bufferSize) };
    thrust::device_vector<uint8_t> gpuVectors[2] { 
        thrust::device_vector<uint8_t>(bufferSize), 
        thrust::device_vector<uint8_t>(bufferSize) };
    printf("Allocated CPU: %p %p GPU: %p %p\n", cpuVectors[0].data(), cpuVectors[1].data(), 
           gpuVectors[0].data().get(), (void*)gpuVectors[1].data().get()); 
    thrust::fill(cpuVectors[0].begin(), cpuVectors[0].end(), 0);
    thrust::fill(cpuVectors[1].begin(), cpuVectors[1].end(), 0xff);
    thrust::fill(gpuVectors[0].begin(), gpuVectors[0].end(), 0);
    thrust::fill(gpuVectors[1].begin(), gpuVectors[1].end(), 0xff);
    
    for(int frameIdx = 0; frameIdx < numFrames; frameIdx++) {//Simulate frame handling loop
        int buffIdx = {frameIdx & 1};
        printf("Copy on CPU\n");
        memcpy(sharedCpuPtr, cpuVectors[buffIdx].data(), bufferSize);//Copy to shared on CPU
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        assert(0 == NvBufSurfaceSyncForDevice(nvbufSurf, 0, 0));
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        if(eglRegisterForEveryFrame && !cuGraphicsResource) { 
            assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE)); 
            assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
        }
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        if(cuGraphicsMapResourcesEveryFrame) { 
            printf("map\n"); assert( cuGraphicsMapResources(1, &cuGraphicsResource, hStream) ); 
        }
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        printf("Compare on GPU\n");
        assert(true == thrust::equal(thrust::cuda::par.on(hStream), 
                                     gpuVectors[buffIdx].begin(), gpuVectors[buffIdx].end(), sharedGpuPtr));
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        printf("Copy on GPU\n");
        buffIdx = !buffIdx;
        if(useStream) {
            assert(0 == cudaMemcpyAsync(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice, hStream));
        }
        else {
            assert(0 == cudaMemcpy(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice));
        }
        printf("NvBufSurfaceSyncForCpu\n");
        assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
        printf("cuCtxSynchronize\n");
        assert(0 == cuCtxSynchronize());
        if(cuGraphicsMapResourcesEveryFrame) { 
            printf("unmap\n"); assert( cuGraphicsUnmapResources(1, &cuGraphicsResource, hStream) ); 
        }
        printf("NvBufSurfaceSyncForCpu\n");
        assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
        if(eglRegisterForEveryFrame) { cuGraphicsUnregisterResource(cuGraphicsResource); cuGraphicsResource = nullptr; }
        if(extraSynchronize) {
            printf("cuCtxSynchronize\n");
            assert(0 == cuCtxSynchronize());
        }
        printf("NvBufSurfaceSyncForCpu\n");
        assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
        printf("Compare on CPU\n");
        assert( !memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize));
    }
    return 0;
}

Hi,
Thanks for the confirmation. So without cuGraphicsUnregisterResource() the data is not synchronized. We will check this further. On current releases, please always call cuGraphicsEGLRegisterImage()/cuGraphicsUnregisterResource() for each frame.
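
A small scope wrapper can keep the per-frame calls tidy; for example (the wrapper is illustrative only, not an official API):

#include "cudaEGL.h"

//Illustrative per-frame wrapper for the currently recommended pattern:
//register + get the mapped frame on construction, unregister on destruction.
struct ScopedEglFrame
{
    CUgraphicsResource resource {};
    CUeglFrame frame {};
    explicit ScopedEglFrame(EGLImageKHR eglImage)
    {
        if(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&resource, eglImage,
                                                      CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE))
            cuGraphicsResourceGetMappedEglFrame(&frame, resource, 0, 0);
    }
    ~ScopedEglFrame()
    {
        if(resource) cuGraphicsUnregisterResource(resource);
    }
    void *devPtr() const { return frame.frame.pPitch[0]; }
};

//Per frame: construct ScopedEglFrame, run the CUDA work on devPtr(),
//call cuCtxSynchronize(), and let the destructor unregister the resource.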