Of course, I was using NvBufSurfaceSyncForCpu from the start, as I mentioned in my original question.
Now, to avoid any doubts, I added extra calls to NvBufSurfaceSyncForCpu and extra printfs and I am getting:
width=3840 height=2160 numFrames=10 extraSynchronize=1 ./testSharedNvBufSurface
eglRegisterForEveryFrame 0 cuGraphicsMapResourcesEveryFrame 0 extraSynchronize 1 width 3840 height 2160 numFrames 10
NvBufSurfaceAllocate ret nvbufSurf 0xaaaad98a9f20 dmaBuffer 38
NvBufSurfaceMapEglImage ret eglImage 0xaaaad98aa841
Shared NvBufSurface mapped to CPU: 0xffff68160000 GPU: 0x205227000
Allocated CPU: 0xffff8205b010 0xffff800b6010 GPU: 0x20d0d7000 0x20f07b000
Copy on CPU
cuCtxSynchronize
cuCtxSynchronize
cuCtxSynchronize
cuCtxSynchronize
Compare on GPU
cuCtxSynchronize
Copy on GPU
NvBufSurfaceSyncForCpu
cuCtxSynchronize
NvBufSurfaceSyncForCpu
cuCtxSynchronize
NvBufSurfaceSyncForCpu
Compare on CPU
testSharedNvBufSurface: testSharedNvBufSurface.cpp:156: int main(int, char**): Assertion `!memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize)' failed.
This is my latest test code:
/*
/usr/local/cuda/bin/nvcc -x cu \
-I/usr/local/cuda -I/usr/src/jetson_multimedia_api/include \
-I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
-I/usr/local/cuda-11.4/targets/aarch64-linux/include/ \
-L/usr/local/cuda/lib64 -L/usr/lib/aarch64-linux-gnu/tegra/ \
-lcuda -lnvbufsurface -lnvrm_surface -lnvrm_mem -lcudart -lnppif -lnppc -lnppisu -lnppidei \
testSharedNvBufSurface.cpp -o testSharedNvBufSurface
width=3840 height=2160 numFrames=10 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 eglRegisterForEveryFrame=1 ./testSharedNvBufSurface
or
width=3840 height=2160 numFrames=10 cuGraphicsMapResourcesEveryFrame=1 useStream=1 ./testSharedNvBufSurface
width=3840 height=2160 numFrames=10 extraSynchronize=1 ./testSharedNvBufSurface
*/
#include <cassert>
#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "nvosd.h"
#include "cudaEGL.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <cudaProfiler.h>
//Use thrust only to compare buffers on GPU using thrust::equal
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
int main(int argc, char ** argv)
{
bool eglRegisterForEveryFrame = getenv("eglRegisterForEveryFrame") != nullptr;
bool cuGraphicsMapResourcesEveryFrame = getenv("cuGraphicsMapResourcesEveryFrame") != nullptr;
bool useStream = getenv("useStream") != nullptr;
bool extraSynchronize = getenv("extraSynchronize") != nullptr;
int width = strtol(getenv("width"), nullptr, 10);
int height = strtol(getenv("height"), nullptr, 10);
int numFrames = strtol(getenv("numFrames"), nullptr, 10);
printf("eglRegisterForEveryFrame %d cuGraphicsMapResourcesEveryFrame %d extraSynchronize %d width %d height %d numFrames %d\n",
eglRegisterForEveryFrame, cuGraphicsMapResourcesEveryFrame, extraSynchronize, width, height, numFrames);
int bufferSize { width * height * 4 };//For simplicity let's create a 1-D 1 byte array of the same size as 4K RGB image
assert(CUDA_SUCCESS == cuInit(0));
CUdevice cudaDevice;
assert(CUDA_SUCCESS == cuDeviceGet(&cudaDevice, 0));
CUcontext context;
assert(CUDA_SUCCESS == cuCtxCreate(&context, 0, cudaDevice));
CUstream hStream {0};
if(useStream) {
printf("Create stream\n");
assert( cudaSuccess == cudaStreamCreate(&hStream));
}
//Create NvBufSurface shared betweren CPU and GPU
NvBufSurfaceAllocateParams inputParams = {{0}};
inputParams.params.width = bufferSize;
inputParams.params.height = 1;
inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
inputParams.params.layout = NVBUF_LAYOUT_PITCH;
inputParams.params.colorFormat = NVBUF_COLOR_FORMAT_GRAY8;
inputParams.memtag = NvBufSurfaceTag_CAMERA;
NvBufSurface *nvbufSurf = 0;
assert(0 == NvBufSurfaceAllocate(&nvbufSurf, 1, &inputParams));
nvbufSurf->numFilled = 1;
int dmaBuffer = nvbufSurf->surfaceList[0].bufferDesc;
printf("NvBufSurfaceAllocate ret nvbufSurf %p dmaBuffer %d\n", nvbufSurf, dmaBuffer);
//Map to CPU:
assert(1 == nvbufSurf->surfaceList->planeParams.num_planes);
assert(0 == NvBufSurfaceMap(nvbufSurf, 0, 0, NVBUF_MAP_READ_WRITE));
char * sharedCpuPtr = (char *)nvbufSurf->surfaceList->mappedAddr.addr[0];
//Map to GPU:
assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, 0));
EGLImageKHR eglImage = nvbufSurf->surfaceList->mappedAddr.eglImage;
printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
CUgraphicsResource cuGraphicsResource {};
assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
CUeglFrame eglFrame;
assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
printf("Shared NvBufSurface mapped to CPU: %p GPU: %p\n", sharedCpuPtr, sharedGpuPtr);
//Create 2 CPU and 2 GPU buffers:
thrust::host_vector<uint8_t> cpuVectors[2] {
thrust::host_vector<uint8_t>(bufferSize),
thrust::host_vector<uint8_t>(bufferSize) };
thrust::device_vector<uint8_t> gpuVectors[2] {
thrust::device_vector<uint8_t>(bufferSize),
thrust::device_vector<uint8_t>(bufferSize) };
printf("Allocated CPU: %p %p GPU: %p %p\n", cpuVectors[0].data(), cpuVectors[1].data(),
gpuVectors[0].data().get(), (void*)gpuVectors[1].data().get());
thrust::fill(cpuVectors[0].begin(), cpuVectors[0].end(), 0);
thrust::fill(cpuVectors[1].begin(), cpuVectors[1].end(), 0xff);
thrust::fill(gpuVectors[0].begin(), gpuVectors[0].end(), 0);
thrust::fill(gpuVectors[1].begin(), gpuVectors[1].end(), 0xff);
for(int frameIdx = 0; frameIdx < numFrames; frameIdx++) {//Simulate frame handling loop
int buffIdx = {frameIdx & 1};
printf("Copy on CPU\n");
memcpy(sharedCpuPtr, cpuVectors[buffIdx].data(), bufferSize);//Copy to shared on CPU
if(extraSynchronize) {
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
}
assert(0 == NvBufSurfaceSyncForDevice(nvbufSurf, 0, 0));
if(extraSynchronize) {
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
}
if(eglRegisterForEveryFrame && !cuGraphicsResource) {
assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
}
if(extraSynchronize) {
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
}
if(cuGraphicsMapResourcesEveryFrame) {
printf("map\n"); assert( cuGraphicsMapResources(1, &cuGraphicsResource, hStream) );
}
if(extraSynchronize) {
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
}
printf("Compare on GPU\n");
assert(true == thrust::equal(thrust::cuda::par.on(hStream),
gpuVectors[buffIdx].begin(), gpuVectors[buffIdx].end(), sharedGpuPtr));
if(extraSynchronize) {
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
}
printf("Copy on GPU\n");
buffIdx = !buffIdx;
if(useStream) {
assert(0 == cudaMemcpyAsync(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice, hStream));
}
else {
assert(0 == cudaMemcpy(sharedGpuPtr, gpuVectors[buffIdx].data().get(), bufferSize, cudaMemcpyDeviceToDevice));
}
printf("NvBufSurfaceSyncForCpu\n");
assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
if(cuGraphicsMapResourcesEveryFrame) {
printf("unmap\n"); assert( cuGraphicsUnmapResources(1, &cuGraphicsResource, hStream) );
}
printf("NvBufSurfaceSyncForCpu\n");
assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
if(eglRegisterForEveryFrame) { cuGraphicsUnregisterResource(cuGraphicsResource); cuGraphicsResource = nullptr; }
if(extraSynchronize) {
printf("cuCtxSynchronize\n");
assert(0 == cuCtxSynchronize());
}
printf("NvBufSurfaceSyncForCpu\n");
assert(0 == NvBufSurfaceSyncForCpu(nvbufSurf, 0, 0));
printf("Compare on CPU\n");
assert( !memcmp( cpuVectors[buffIdx].data(), sharedCpuPtr, bufferSize));
}
return true;
}