How to prevent vpiSubmitConvertImageFormat from calling cudaGraphicsEGLRegisterImage, which kills performance?

Hi,
I am trying to use NvBufSurfaceAllocate/vpiImageCreateWrapper to create a VPIImage instead of using vpiImageCreate, because I need to integrate VPI into my larger pipeline, which uses NvBuffer.
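For reference, the wrapping itself is only a few lines (condensed from CreateNvBufferWrapper() in the attached test code below):

    VPIImageWrapperParams wrapperParams;
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;

    VPIImageData vpiImageData;
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_NVBUFFER;
    vpiImageData.buffer.fd  = nvbufSurf->surfaceList[0].bufferDesc; // dma-buf fd from NvBufSurfaceAllocate

    VPIImage image = NULL;
    CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, memFlags, &image));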

I am comparing the performance of several VPI routines, such as vpiSubmitConvertImageFormat and vpiSubmitTemporalNoiseReduction, with a VPIImage from vpiImageCreate vs. one from vpiImageCreateWrapper.

It appears that vpiSubmitConvertImageFormat(VPI_BACKEND_CUDA) is much slower on NvBuffer (by more than 1 ms).
Using nsys I found that vpiSubmitConvertImageFormat calls cudaGraphicsEGLRegisterImage/cudaGraphicsUnregisterResource twice - once for each image.
But cudaGraphicsEGLRegisterImage is a very slow function that should be called only once, never per frame, because it kills performance.
(This was discussed in "How to share NvBufSurface with Cuda efficiently, without overhead of cuGraphicsEGLRegisterImage/cuGraphicsUnregisterResource?")

I tried calling cuGraphicsEGLRegisterImage before the frame loop starts, but that makes no difference.
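To be concrete, by "before the frame loop starts" I mean registering once at setup, as in CreateNvBuffers() in the attached test when cuGraphicsEGLRegisterUpFront=1:

    // Done once per buffer at setup, not per frame:
    assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, 0));
    EGLImageKHR eglImage = nvbufSurf->surfaceList[0].mappedAddr.eglImage;

    CUgraphicsResource res {};
    assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&res, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));

    CUeglFrame eglFrame {};
    assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, res, 0, 0));
    // eglFrame.frame.pPitch[...] are device pointers that could be reused every frame,
    // yet VPI still registers/unregisters the buffers inside vpiSubmitConvertImageFormat.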

I attached complete test code below.

So, the question is: how do I prevent vpiSubmitConvertImageFormat and other VPI functions from calling cudaGraphicsEGLRegisterImage?
Otherwise VPI will be very slow and not very useful on Orin.

Thank you

/*
Usage:
g++  -o tnr_file -I/usr/local/cuda-12.2/targets/aarch64-linux/include -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    ./tnr_file.cpp -L/usr/local/cuda-12.2/targets/aarch64-linux/lib/  -lnvvpi -lcudart -L/usr/lib/aarch64-linux-gnu/tegra/ -lnvbufsurface \
    -lcuda -lnvrm_mem

sudo mkdir /mnt/tmpfs
sudo chown $USER:$USER /mnt/tmpfs
sudo mount -t tmpfs -o size=16g tmpfs /mnt/tmpfs

Test: read RGBA, convert to NV12 using CUDA, save as NV12:

gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
    nvvidconv ! 'video/x-raw, format=NV12, width=2816, height=1944' ! \
    filesink location=/mnt/tmpfs/out_2816.nv12 -e

gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
    nvvidconv ! 'video/x-raw, format=RGBA, width=2624, height=1944' ! \
    filesink location=/mnt/tmpfs/out_2624.rgba -e

useNvBuffer=1 preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.112977 min 2.081234 max 2.210318

preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameTnr (ms) count 998 av 2.186153 min 2.061330 max 2.353319

convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.865078 min 2.706413 max 3.287629

useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 4.463151 min 4.291428 max 4.792270

So, without vpiSubmitConvertImageFormat the time is the same whether we use NvBuffer or not,
but with conversion the NvBuffer path is slower by about 1.7 ms

Now with profiler:
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 nsys profile ./tnr_file
nsys stats report1.nsys-rep | grep -i register
     45.6    1,115,627,424      2,002     557,256.5     702,624.0     340,256     895,264    175,705.3  cudaGraphicsEGLRegisterImage  
     24.2      593,418,432      2,002     296,412.8     340,832.0     204,576     462,944     82,377.1  cudaGraphicsUnregisterResource

So, cudaGraphicsEGLRegisterImage and cudaGraphicsUnregisterResource are the performance killers

DISPLAY=:0 ffplay -v info -f rawvideo -pixel_format nv12 -video_size 2816x1944 /mnt/tmpfs/out2_2624.nv12

*/

#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/algo/TemporalNoiseReduction.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "cudaEGL.h"

#include <algorithm>
#include <cassert> // assert
#include <cstdint> // uint64_t
#include <cstdio>  // printf
#include <cstdlib> // getenv, strtol, strtod
#include <cstring> // for memset
#include <ctime>   // clock_gettime
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>

#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(arr[0]))

int frameOrdinal {};
int printFormat {};

uint64_t getTimeNS() 
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec;
}

struct Stat
{
    void Add(int value)
    {
        if(!this->count || this->min > value)
        {
            this->min = value;
        }
        if(!this->count || this->max < value)
        {
            this->max = value;
        }
        this->count++;
        this->total += value;
    }
    
    void Print(const char * name, double ratio)
    {
        printf("%s count %d av %lf min %lf max %lf\n", 
               name, this->count, this->count? (this->total * ratio / this->count) : 0.0, ratio * this->min, ratio * this->max);
    }
    
    int min {-1};
    int max {-1};
    int count {0};
    long long total {0};
};

Stat statPerFrameVpi;//Includes convert, tnr, sync

#define CHECK_STATUS(STMT)                                    \
    do                                                        \
    {                                                         \
        VPIStatus status = (STMT);                            \
        if (status != VPI_SUCCESS)                            \
        {                                                     \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
            std::ostringstream ss;                            \
            ss << "" #STMT "\n";                              \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str());               \
        }                                                     \
    } while (0);

void VpiLogError(const char * name, VPIStatus status)
{
    char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];
    vpiGetLastStatusMessage(buffer, sizeof(buffer));
    const char * statusName = vpiStatusGetName(status);
    printf("%s ret %d: %s : %s\n", name, (int)status, buffer, statusName);
}

bool VpiImagePrintFormat(VPIImage image, const char * comment)
{
    VPIImageFormat format {};
    VPIStatus status = vpiImageGetFormat(image, &format);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetFormat", status);
        return false;
    }
    int32_t imageWidth {}, imageHeight {};
    status = vpiImageGetSize(image, &imageWidth, &imageHeight);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetSize", status);
        return false;
    }

    uint32_t fourCC = vpiImageFormatGetFourCC(format);
    VPIImageBufferPitchLinear info {};
    info.numPlanes = vpiImageFormatGetPlaneCount(format);
    if(info.numPlanes < 0 || info.numPlanes > ARRAY_SIZE(info.planes))
    {
        printf("Bad numPlanes %d\n", info.numPlanes);
        return false;
    }
    //VPIPixelType types[3] {VPI_PIXEL_TYPE_INVALID, VPI_PIXEL_TYPE_INVALID, VPI_PIXEL_TYPE_INVALID};
    std::string typeNames[ARRAY_SIZE(info.planes)];
    for(int planeIdx = 0; planeIdx < info.numPlanes; planeIdx++)
    {
        info.planes[planeIdx].pixelType = vpiImageFormatGetPlanePixelType(format, planeIdx);
        typeNames[planeIdx] = vpiPixelTypeGetName(info.planes[planeIdx].pixelType);
        
        info.planes[planeIdx].width = vpiImageFormatGetPlaneWidth(format, imageWidth, planeIdx);
        info.planes[planeIdx].height = vpiImageFormatGetPlaneHeight(format, imageHeight, planeIdx);
    }
    
    printf("%s vpiImageGetFormat ret 0x%x %.4s Image Size %dx%d numPlanes %d types (per plane): 0x%x 0x%x 0x%x : %s %s %s sizes %dx%d %dx%d %dx%d\n", 
        comment, (int)format, (char*)&fourCC, 
        (int)imageWidth, (int)imageHeight,
        info.numPlanes, 
        (int)info.planes[0].pixelType, (int)info.planes[1].pixelType, (int)info.planes[2].pixelType,
        typeNames[0].c_str(), typeNames[1].c_str(), typeNames[2].c_str(), 
        (int)info.planes[0].width, (int)info.planes[0].height, 
        (int)info.planes[1].width, (int)info.planes[1].height, 
        (int)info.planes[2].width, (int)info.planes[2].height 
        );
    //Sample output for NV12
    // vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 2624x1944 numPlanes 2 types (per plane): 0xffff1001 0xffff1011 0x0 : VPI_PIXEL_TYPE_U8 VPI_PIXEL_TYPE_2U8  
    //  sizes 2624x1944 1312x972 0x0
    return true;
}//bool VpiImagePrintFormat

NvBufSurface * CreateNvBuffers(int count, NvBufSurfaceColorFormat format, int width, int height, bool cuGraphicsEGLRegisterUpFront)
{
    NvBufSurface * nvbufSurf {};
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = width;
    inputParams.params.height = height;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = format;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    assert(0 == NvBufSurfaceAllocate(&nvbufSurf, count, &inputParams));
    nvbufSurf->numFilled = count;
    
    if(cuGraphicsEGLRegisterUpFront)
    {
        for(int idx = 0; idx < nvbufSurf->numFilled; idx++)
        {
            //Map to GPU:
            assert(0 == NvBufSurfaceMapEglImage(nvbufSurf, idx));
            EGLImageKHR eglImage = nvbufSurf->surfaceList[idx].mappedAddr.eglImage;
            printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
            CUgraphicsResource cuGraphicsResource {};
            assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
            CUeglFrame eglFrame {};
            assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0));
            char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
            printf("Shared NvBufSurface mapped to GPU: %p\n", sharedGpuPtr);
        }
    }
    return nvbufSurf;
}

void CreateNvBufferWrapper(int fd, uint64_t memFlags, VPIImage * returnImage)
{
    VPIImageWrapperParams wrapperParams;
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;

    VPIImageData vpiImageData;
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_NVBUFFER;
    vpiImageData.buffer.fd = fd;
    printf("call vpiImageCreateWrapper VPI_IMAGE_BUFFER_NVBUFFER fd %d\n", vpiImageData.buffer.fd);
    CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, memFlags, returnImage));

    if(printFormat)
    {
        VPIImageData imgdata;
        CHECK_STATUS(vpiImageLockData(*returnImage, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
        for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
        {
            int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
            if(printFormat/* && frameOrdinal == 0*/)
            {
                printf("Image %p planeIdx %d width %d height %d pitchBytes %d data %p\n", 
                    *returnImage, planeIdx,
                    imgdata.buffer.pitch.planes[planeIdx].width,
                    imgdata.buffer.pitch.planes[planeIdx].height,
                    imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                    imgdata.buffer.pitch.planes[planeIdx].data
                );
            }
        }
        CHECK_STATUS(vpiImageUnlock(*returnImage));
    }
}

int main(int argc, char *argv[])
{
    VPIStream stream     = NULL;
    VPIImage imgPrevious = NULL, imgInput = NULL, imgInputConverted = NULL, imgOutput = NULL, imgOutputWrapper = NULL;
    VPIImage imageCvWrapper = NULL;
    VPIPayload tnr    = NULL;

    // main return value
    int retval = 0;

    VPIBackend backend {VPI_BACKEND_VIC};
    
    const char * inFileName = getenv("inFile");
    std::ifstream inFile(inFileName);
    if(!inFile)
    {
        printf("Cannot open %s for reading\n", inFileName);
        return -1;
    }
    inFile.seekg (0, inFile.end);
    long long inFileSize = inFile.tellg();
    inFile.seekg (0, inFile.beg);
    
    const char * outFileName = getenv("outFile");
    std::ofstream outFile(outFileName);
    if(!outFile)
    {
        printf("Cannot open %s for writing\n", outFileName);
        return -1;
    }
    
    const char * temp = getenv("width");
    int width = temp? strtol(temp, nullptr, 10) : 1920;
    
    temp = getenv("height");
    int height = temp? strtol(temp, nullptr, 10) : 1080;
    
    temp = getenv("numFrames");
    int numFrames = temp? strtol(temp, nullptr, 10) : 1000;

    temp = getenv("skipFrames");
    int skipFrames = temp? strtol(temp, nullptr, 10) : 2;

    temp = getenv("printFormat");
    printFormat = temp && *temp == '1';
    
    temp = getenv("convertCuda");
    bool convertCuda = temp && *temp == '1';
    VPIBackend backendStream = backend;
    if(convertCuda)
    {
        backendStream = (VPIBackend)( uint64_t(backendStream) | VPI_BACKEND_CUDA);
    }

    CHECK_STATUS(vpiStreamCreate(backendStream, &stream));
    
    uint64_t memFlags {backend};
    memFlags |= VPI_BACKEND_CPU;//Need this to lock images
    
    temp = getenv("VPI_EXCLUSIVE_STREAM_ACCESS");
    if(temp && *temp == '1')
    {
        memFlags |= VPI_EXCLUSIVE_STREAM_ACCESS;
    }
    temp = getenv("VPI_BACKEND_CUDA");
    if(temp && *temp == '1')
    {
        memFlags |= VPI_BACKEND_CUDA;
    }
    VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
    temp = getenv("VPI_IMAGE_FORMAT_YUYV_ER");
    if(temp && *temp == '1')
    {
        imgFormat = VPI_IMAGE_FORMAT_YUYV_ER;
    }
    temp = getenv("VPI_IMAGE_FORMAT_UYVY_ER");
    if(temp && *temp == '1')
    {
        imgFormat = VPI_IMAGE_FORMAT_UYVY_ER;
    }

    temp = getenv("tnrCuda");
    bool tnrCuda = temp && *temp == '1';
    
    temp = getenv("convertFromRgbaWidth");
    int convertFromRgbaWidth = temp? strtol(temp, nullptr, 10) : 0;
    
    temp = getenv("useNvBuffer");
    bool useNvBuffer = temp && *temp == '1';

    temp = getenv("cuGraphicsEGLRegisterUpFront");
    bool cuGraphicsEGLRegisterUpFront = temp && *temp == '1';
    
    uint64_t memFlagsInputs {memFlags};
    if(convertCuda && convertFromRgbaWidth)
    {
        memFlagsInputs = (VPIBackend)( uint64_t(memFlagsInputs) | VPI_BACKEND_CUDA);
    }

    if(useNvBuffer)
    {
        NvBufSurface * nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlags, &imgPrevious);
        
        nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlags, &imgOutput);
        
        nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlagsInputs, &imgInput);
    }
    else
    {
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlagsInputs, &imgInput));
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgPrevious));
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgOutput));
    }
    
    if(convertFromRgbaWidth)
    {
        imgInputConverted = imgInput;//NV12
        imgInput = nullptr;
        
        if(useNvBuffer)
        {
            NvBufSurface * nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_RGBA, width, height, cuGraphicsEGLRegisterUpFront);
            CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlagsInputs, &imgInput);
        }
        else
        {
            CHECK_STATUS(vpiImageCreate(convertFromRgbaWidth, height, VPI_IMAGE_FORMAT_RGBA8, memFlagsInputs, &imgInput));
        }
    }
    
    VpiImagePrintFormat(imgInput, "imgInput");
    
    CHECK_STATUS(vpiCreateTemporalNoiseReduction(tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));

    VPITNRParams params;
    CHECK_STATUS(vpiInitTemporalNoiseReductionParams(&params));
    
    temp = getenv("preset");
    if(temp)
    {
        params.preset = (VPITNRPreset)strtol(temp, nullptr, 10);
    }
    temp = getenv("strength");
    if(temp)
    {
        params.strength = strtod(temp, nullptr);
    }
    printf("tnr params preset: %d strength: %lf\n", (int)params.preset, (double)params.strength);
    
    temp = getenv("repeatOneFrame");
    bool repeatOneFrame = temp && *temp == '1';
    
    temp = getenv("dropResults");
    bool dropResults = temp && *temp == '1';

    VPIEvent evStart = NULL;
    VPIEvent evEnd = NULL;
    CHECK_STATUS(vpiEventCreate(backend, &evStart));
    CHECK_STATUS(vpiEventCreate(backend, &evEnd));
    
    printf("Run loop\n");
    
    long long filePos = 0;
    for(frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
    {
        uint64_t timeStart = getTimeNS();
        
        if(!repeatOneFrame || frameOrdinal == 0)
        {
            //This is one way to read file: lock VPI image and read directly to it.
            VPIImageData imgdata;
            CHECK_STATUS(vpiImageLockData(imgInput, VPI_LOCK_WRITE, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
            int frameSize {};
            for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
            {
                int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
                inFile.read((char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
                frameSize += size;
                if(printFormat && frameOrdinal == 0)
                {
                    printf("planeIdx %d width %d height %d pitchBytes %d size %d total %d\n", planeIdx,
                        imgdata.buffer.pitch.planes[planeIdx].width,
                        imgdata.buffer.pitch.planes[planeIdx].height,
                        imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                        size, frameSize);
                }
            }

            CHECK_STATUS(vpiImageUnlock(imgInput));
            if(!inFile)
            {
                printf("Failed to read frame of size %d at pos %lld\n", frameSize, filePos);
                return -1;
            }
            filePos += frameSize;
            //printf("frameOrdinal %d frameSize %d filePos %lld\n", frameOrdinal, frameSize, filePos);
            if(filePos == inFileSize)
            {
                filePos = 0;
                //printf("seekg 0\n");
                inFile.seekg (0, inFile.beg);
            }
        }
        uint64_t timeStartProc = getTimeNS();
        
        VPIImage from = imgInput;
        if(convertFromRgbaWidth)
        {
            CHECK_STATUS(vpiSubmitConvertImageFormat(stream, 
                convertCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, 
                imgInput, imgInputConverted, nullptr));
            from = imgInputConverted;
        }
        
        CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, tnr, 
            frameOrdinal == 0 ? nullptr: imgPrevious, from, imgOutput, &params));
        CHECK_STATUS(vpiStreamSync(stream));
        
        if(frameOrdinal >= skipFrames)//Do not count first few frames
        {
            statPerFrameVpi.Add( (int)(getTimeNS() - timeStartProc) );
        }
        
        if(!dropResults)
        {
            VPIImageData imgdata;
            CHECK_STATUS(vpiImageLockData(imgOutput, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
            for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
            {
                int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
                outFile.write((const char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
                if(printFormat && frameOrdinal == 0)
                {
                    printf("imgOutput %p planeIdx %d width %d height %d pitchBytes %d data %p\n", 
                        imgOutput, planeIdx,
                        imgdata.buffer.pitch.planes[planeIdx].width,
                        imgdata.buffer.pitch.planes[planeIdx].height,
                        imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                        imgdata.buffer.pitch.planes[planeIdx].data
                          );
                }
            }
            CHECK_STATUS(vpiImageUnlock(imgOutput));
        }
        
        std::swap(imgPrevious, imgOutput);
    }//for(int frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
    
    printf("repeatOneFrame=%d dropResults=%d\n", repeatOneFrame, dropResults);
    statPerFrameVpi.Print("statPerFrameVpi (ms)", 1E-6);//Includes convert, tnr, sync
    
    vpiStreamDestroy(stream);
    vpiPayloadDestroy(tnr);
    vpiImageDestroy(imgPrevious);
    vpiImageDestroy(imgInput);
    vpiImageDestroy(imgOutput);
    vpiImageDestroy(imageCvWrapper);

    return 0;
}

Hi,
Here are some suggestions for the common issues:

1. Performance

Please run the commands below before benchmarking a deep learning use case:

$ sudo nvpmodel -m 0
$ sudo jetson_clocks

2. Installation

Installation guide of deep learning frameworks on Jetson:

3. Tutorial

Startup deep learning tutorial:

4. Report issue

If these suggestions don’t help and you want to report an issue to us, please share the model, the commands/steps, and the customized app (if any) so we can reproduce it locally.

Thanks!

Hi,

We need to reproduce this issue internally to gather more info.
But could you share some info so we can get a rough idea about the issue?

How long does the conversion take with a VPIImage created from an NvBuffer?
For comparison, how long does it take with a VPI-native image created by vpiImageCreate?

And for the NvBuffer case, how long does it take for EGL to register and unregister?

Thanks.

I already wrote the steps to reproduce in the comment block at the top of the attached source file.
I also included the timing results and an excerpt from the profiler output, which show that a run with vpiSubmitConvertImageFormat+TNR on an image from vpiImageCreate takes about 2.5 ms per frame, but the same run on an NvBuffer takes 4.2 ms.
The profiler results show that there are 2 calls to cudaGraphicsEGLRegisterImage/cudaGraphicsUnregisterResource
for each call to vpiSubmitConvertImageFormat (both the source and destination buffers are registered and unregistered). Together, register/unregister takes about 0.9 ms per image, 1.8 ms total, which explains the difference between the 2.5 ms and 4.2 ms totals.

If calling vpiSubmitConvertImageFormat on NvBuffer is so slow,
maybe I can call cudaGraphicsEGLRegisterImage myself, then call vpiImageCreateWrapper with VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR,
and pass the resulting image to vpiSubmitConvertImageFormat and TNR?
I tried doing that, but I keep getting errors:
VPI_ERROR_INVALID_ARGUMENT: Current frame must have the same format configured during payload creation
VPI_ERROR_INVALID_IMAGE_FORMAT: CUDA: Conversion not implemented between VPI_IMAGE_FORMAT_RGBA8 and VPI_IMAGE_FORMAT_NV12
Apparently I am not supplying the correct parameters to vpiImageCreateWrapper?
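For reference, here is the NV12 wrapping attempt, condensed from NvBufferWrapperCudaPitchLinear() in the attached source (plane geometry comes from NvBufSurfaceParams, the device pointers from the pre-registered CUeglFrame):

    VPIImageData d;
    d.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
    d.buffer.pitch.format    = VPI_IMAGE_FORMAT_NV12;
    d.buffer.pitch.numPlanes = 2;
    d.buffer.pitch.planes[0].pixelType  = VPI_PIXEL_TYPE_U8;        // Y plane
    d.buffer.pitch.planes[0].width      = params.planeParams.width[0];
    d.buffer.pitch.planes[0].height     = params.planeParams.height[0];
    d.buffer.pitch.planes[0].pitchBytes = params.planeParams.pitch[0];
    d.buffer.pitch.planes[0].data       = eglFrame.frame.pPitch[0]; // from cuGraphicsResourceGetMappedEglFrame
    d.buffer.pitch.planes[1].pixelType  = VPI_PIXEL_TYPE_2U8;       // interleaved UV plane
    d.buffer.pitch.planes[1].width      = params.planeParams.width[1];
    d.buffer.pitch.planes[1].height     = params.planeParams.height[1];
    d.buffer.pitch.planes[1].pitchBytes = params.planeParams.pitch[1];
    d.buffer.pitch.planes[1].data       = eglFrame.frame.pPitch[1];

    VPIImageWrapperParams wp;
    wp.colorSpec = VPI_COLOR_SPEC_DEFAULT;
    CHECK_STATUS(vpiImageCreateWrapper(&d, &wp, flags /* backend + CPU flags */, &vpiImage));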

Can you take a look, please? I attached an updated source.

/*
Usage:
Save this file to tnr_file.cpp

# Compile
g++  -o tnr_file -I/usr/local/cuda-12.6/targets/aarch64-linux/include -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    ./tnr_file.cpp -L/usr/local/cuda-12.6/targets/aarch64-linux/lib/  -lnvvpi -lcudart -L/usr/lib/aarch64-linux-gnu/tegra/ -lnvbufsurface \
    -lcuda -lnvrm_mem

# Put large video files to tmpfs:
sudo mkdir /mnt/tmpfs
sudo chown $USER:$USER /mnt/tmpfs
sudo mount -t tmpfs -o size=16g tmpfs /mnt/tmpfs
   
sudo ~/clocks.sh --max # clocks.sh from https://docs.nvidia.com/vpi/algo_performance.html#benchmark_tables

# Make test video in nv12 format:
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
    nvvidconv ! 'video/x-raw, format=NV12, width=2816, height=1944' ! \
    filesink location=/mnt/tmpfs/out_2816.nv12 -e

# Process nv12 video with buffer from vpiImageCreate:
# Note: nv12 video does not need vpiSubmitConvertImageFormat, only vpiSubmitTemporalNoiseReduction is used
preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.087361 min 2.070204 max 2.132818

# Process nv12 video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper:
useNvBuffer=1 preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.248555 min 2.139974 max 2.405052

# Process nv12 video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper/VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR
printFormat=1 useCudaPitchLinear=1 useNvBuffer=1 preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
Fails with:
VPI_ERROR_INVALID_ARGUMENT: Current frame must have the same format configured during payload creation
    
# Make test video in rgba format:
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
    nvvidconv ! 'video/x-raw, format=RGBA, width=2624, height=1944' ! \
    filesink location=/mnt/tmpfs/out_2624.rgba -e

# Process rgba video with buffer from vpiImageCreate:
# Note: rgba video needs vpiSubmitConvertImageFormat before vpiSubmitTemporalNoiseReduction
convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.759379 min 2.498117 max 3.059703

# Process rgba video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper:
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 4.249904 min 4.174961 max 4.397086

# Process rgba video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper/VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR
printFormat=1 useCudaPitchLinear=1 useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
Fails with:
VPI_ERROR_INVALID_IMAGE_FORMAT: CUDA: Conversion not implemented between VPI_IMAGE_FORMAT_RGBA8 and VPI_IMAGE_FORMAT_NV12


So, without vpiSubmitConvertImageFormat the time is the same whether we use NvBuffer or not,
but with conversion the NvBuffer path is slower by about 1.7 ms

Now with profiler:
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 nsys profile ./tnr_file
nsys stats report1.nsys-rep | grep -i register
     41.5      961,313,856      2,002     480,176.8     585,040.0     280,928     797,600    149,372.0  cudaGraphicsEGLRegisterImage
     26.8      620,007,040      2,002     309,693.8     338,416.0     208,064     436,128     78,890.0  cudaGraphicsUnregisterResource

So, cudaGraphicsEGLRegisterImage and cudaGraphicsUnregisterResource are the performance killers

# To verify the resulting image:
DISPLAY=:0 ffplay -v info -f rawvideo -pixel_format nv12 -video_size 2816x1944 /mnt/tmpfs/out2_2624.nv12

*/

#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/algo/TemporalNoiseReduction.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "cudaEGL.h"

#include <algorithm>
#include <cassert> // assert
#include <cstdint> // uint64_t
#include <cstdio>  // printf
#include <cstdlib> // getenv, strtol, strtod
#include <cstring> // for memset
#include <ctime>   // clock_gettime
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>

#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(arr[0]))

int frameOrdinal {};
int printFormat {};

uint64_t getTimeNS() 
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec;
}

struct Stat
{
    void Add(int value)
    {
        if(!this->count || this->min > value)
        {
            this->min = value;
        }
        if(!this->count || this->max < value)
        {
            this->max = value;
        }
        this->count++;
        this->total += value;
    }
    
    void Print(const char * name, double ratio)
    {
        printf("%s count %d av %lf min %lf max %lf\n", 
               name, this->count, this->count? (this->total * ratio / this->count) : 0.0, ratio * this->min, ratio * this->max);
    }
    
    int min {-1};
    int max {-1};
    int count {0};
    long long total {0};
};

Stat statPerFrameVpi;//Includes convert, tnr, sync

#define CHECK_STATUS(STMT)                                    \
    do                                                        \
    {                                                         \
        VPIStatus status = (STMT);                            \
        if (status != VPI_SUCCESS)                            \
        {                                                     \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
            std::ostringstream ss;                            \
            ss << "" #STMT "\n";                              \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str());               \
        }                                                     \
    } while (0);

void VpiLogError(const char * name, VPIStatus status)
{
    char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];
    vpiGetLastStatusMessage(buffer, sizeof(buffer));
    const char * statusName = vpiStatusGetName(status);
    printf("%s ret %d: %s : %s\n", name, (int)status, buffer, statusName);
}

bool VpiImagePrintFormat(VPIImage image, const char * comment)
{
    VPIImageFormat format {};
    VPIStatus status = vpiImageGetFormat(image, &format);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetFormat", status);
        return false;
    }
    int32_t imageWidth {}, imageHeight {};
    status = vpiImageGetSize(image, &imageWidth, &imageHeight);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetSize", status);
        return false;
    }

    uint32_t fourCC = vpiImageFormatGetFourCC(format);
    VPIImageBufferPitchLinear info {};
    info.numPlanes = vpiImageFormatGetPlaneCount(format);
    if(info.numPlanes < 0 || info.numPlanes > ARRAY_SIZE(info.planes))
    {
        printf("Bad numPlanes %d\n", info.numPlanes);
        return false;
    }
    //VPIPixelType types[3] {VPI_PIXEL_TYPE_INVALID, VPI_PIXEL_TYPE_INVALID, VPI_PIXEL_TYPE_INVALID};
    std::string typeNames[ARRAY_SIZE(info.planes)];
    for(int planeIdx = 0; planeIdx < info.numPlanes; planeIdx++)
    {
        info.planes[planeIdx].pixelType = vpiImageFormatGetPlanePixelType(format, planeIdx);
        typeNames[planeIdx] = vpiPixelTypeGetName(info.planes[planeIdx].pixelType);
        
        info.planes[planeIdx].width = vpiImageFormatGetPlaneWidth(format, imageWidth, planeIdx);
        info.planes[planeIdx].height = vpiImageFormatGetPlaneHeight(format, imageHeight, planeIdx);
    }
    
    printf("%s vpiImageGetFormat ret 0x%x %.4s Image Size %dx%d numPlanes %d types (per plane): 0x%x 0x%x 0x%x : %s %s %s sizes %dx%d %dx%d %dx%d\n", 
        comment, (int)format, (char*)&fourCC, 
        (int)imageWidth, (int)imageHeight,
        info.numPlanes, 
        (int)info.planes[0].pixelType, (int)info.planes[1].pixelType, (int)info.planes[2].pixelType,
        typeNames[0].c_str(), typeNames[1].c_str(), typeNames[2].c_str(), 
        (int)info.planes[0].width, (int)info.planes[0].height, 
        (int)info.planes[1].width, (int)info.planes[1].height, 
        (int)info.planes[2].width, (int)info.planes[2].height 
        );
    //Sample output for NV12
    // vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 2624x1944 numPlanes 2 types (per plane): 0xffff1001 0xffff1011 0x0 : VPI_PIXEL_TYPE_U8 VPI_PIXEL_TYPE_2U8  
    //  sizes 2624x1944 1312x972 0x0
    return true;
}//bool VpiImagePrintFormat

struct Buffer
{
    int width;
    int height;
    NvBufSurface * nvbufSurf {};
    EGLImageKHR eglImage {};
    CUgraphicsResource cuGraphicsResource {};
    CUeglFrame eglFrame {};
};

void CreateNvBuffers(Buffer & buffer, NvBufSurfaceColorFormat format, int width, int height, bool cuGraphicsEGLRegisterUpFront)
{
    buffer.width = width;
    buffer.height = height;
    buffer.nvbufSurf = nullptr;
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = width;
    inputParams.params.height = height;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = format;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    assert(0 == NvBufSurfaceAllocate(&buffer.nvbufSurf, 1, &inputParams));
    buffer.nvbufSurf->numFilled = 1;
    
    printf("NvBufSurfaceAllocate ret fd %d num_planes %d width %d %d height %d %d pitch %d %d\n", 
           (int)buffer.nvbufSurf->surfaceList[0].bufferDesc,
           buffer.nvbufSurf->surfaceList[0].planeParams.num_planes,
           buffer.nvbufSurf->surfaceList[0].planeParams.width[0],
           buffer.nvbufSurf->surfaceList[0].planeParams.width[1],
           buffer.nvbufSurf->surfaceList[0].planeParams.height[0],
           buffer.nvbufSurf->surfaceList[0].planeParams.height[1],
           buffer.nvbufSurf->surfaceList[0].planeParams.pitch[0],
           buffer.nvbufSurf->surfaceList[0].planeParams.pitch[1]
          );
    
    if(cuGraphicsEGLRegisterUpFront)
    {
        //Map to GPU:
        assert(0 == NvBufSurfaceMapEglImage(buffer.nvbufSurf, 0));
        buffer.eglImage = buffer.nvbufSurf->surfaceList[0].mappedAddr.eglImage;
        printf("NvBufSurfaceMapEglImage ret eglImage %p\n", buffer.eglImage);
        assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&buffer.cuGraphicsResource, buffer.eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
        assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&buffer.eglFrame, buffer.cuGraphicsResource, 0, 0));
        printf("Shared NvBufSurface mapped to GPU: %p %p\n", 
                buffer.eglFrame.frame.pPitch[0], buffer.eglFrame.frame.pPitch[1]
        );
    }
}

bool NvBufferWrapperCudaPitchLinear(struct Buffer & buffer, VPIImage & vpiImage, uint64_t flags)
{
    // See vpiImageCreateWrapper in https://docs.nvidia.com/vpi/group__VPI__Image.html#ga3e7cf2520dd568a7e7a9a6876ea7995c
    VPIImageData vpiImageData;
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
    //vpiImageData.buffer is a union
    NvBufSurfaceParams & params {buffer.nvbufSurf->surfaceList[0]}; 
    if(params.colorFormat == NVBUF_COLOR_FORMAT_RGBA)
    {
        vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_RGBA8;
        vpiImageData.buffer.pitch.numPlanes = 1;
        vpiImageData.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_4U8;
        vpiImageData.buffer.pitch.planes[0].width = buffer.eglFrame.width;
        vpiImageData.buffer.pitch.planes[0].height = buffer.eglFrame.height;
        vpiImageData.buffer.pitch.planes[0].pitchBytes = buffer.eglFrame.pitch;
        vpiImageData.buffer.pitch.planes[0].data = buffer.eglFrame.frame.pPitch[0];
        if(printFormat && frameOrdinal == 0)
        {
            printf("vpiImageCreateWrapper w %d h %d pb %d d %p\n", 
                (int)vpiImageData.buffer.pitch.planes[0].width,
                (int)vpiImageData.buffer.pitch.planes[0].height,
                (int)vpiImageData.buffer.pitch.planes[0].pitchBytes,
                (void*)vpiImageData.buffer.pitch.planes[0].data);
        }
    }
    else
    {
        vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_NV12;
        vpiImageData.buffer.pitch.numPlanes = 2;
        vpiImageData.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_U8;
        vpiImageData.buffer.pitch.planes[0].width = params.planeParams.width[0];
        vpiImageData.buffer.pitch.planes[0].height = params.planeParams.height[0];
        vpiImageData.buffer.pitch.planes[0].pitchBytes = params.planeParams.pitch[0];
        vpiImageData.buffer.pitch.planes[0].data = buffer.eglFrame.frame.pPitch[0];
        //int firstPlaneSize = width * height;//assuming pitch == width and 1 Y byte per pixel
        vpiImageData.buffer.pitch.planes[1].pixelType = VPI_PIXEL_TYPE_2U8;
        vpiImageData.buffer.pitch.planes[1].width = params.planeParams.width[1];
        vpiImageData.buffer.pitch.planes[1].height = params.planeParams.height[1];
        vpiImageData.buffer.pitch.planes[1].pitchBytes = params.planeParams.pitch[1];
        vpiImageData.buffer.pitch.planes[1].data = buffer.eglFrame.frame.pPitch[1];
        if(printFormat && frameOrdinal == 0)
        {
            printf("vpiImageCreateWrapper w %d h %d pb %d d %p  w %d h %d pb %d d %p\n", 
                (int)vpiImageData.buffer.pitch.planes[0].width,
                (int)vpiImageData.buffer.pitch.planes[0].height,
                (int)vpiImageData.buffer.pitch.planes[0].pitchBytes,
                (void*)vpiImageData.buffer.pitch.planes[0].data,
                (int)vpiImageData.buffer.pitch.planes[1].width,
                (int)vpiImageData.buffer.pitch.planes[1].height,
                (int)vpiImageData.buffer.pitch.planes[1].pitchBytes,
                (void*)vpiImageData.buffer.pitch.planes[1].data
            );
        }
    }

    VPIImageWrapperParams wrapperParams;
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;//TODO
    if(vpiImage)
    {//Reuse existing image
        VPIStatus status = vpiImageSetWrapper(vpiImage, &vpiImageData);
        if(status != VPI_SUCCESS)
        {
            VpiLogError("vpiImageSetWrapper", status);
            return false;
        }
    }
    else
    {
        //flags |= VPI_EXCLUSIVE_STREAM_ACCESS;
        //TODO: consider other flags VPI_RESTRICT_MEM_USAGE, VPI_REQUIRE_BACKENDS
        //See https://docs.nvidia.com/vpi/Types_8h.html#common_flags
        VPIStatus status = vpiImageCreateWrapper(&vpiImageData, &wrapperParams, flags, &vpiImage);
        if(status != VPI_SUCCESS)
        {//VPI_ERROR_INVALID_ARGUMENT == 2
            VpiLogError("vpiImageCreateWrapper", status);
            return false;
        }
    }
    return true;
}//NvBufferWrapperCudaPitchLinear

void CreateNvBufferWrapper(struct Buffer & buffer, uint64_t memFlags, VPIImage & returnImage, uint64_t flags, bool useCudaPitchLinear)
{
    if(useCudaPitchLinear)
    {
        assert(buffer.eglFrame.frame.pPitch[0] != nullptr);
        NvBufferWrapperCudaPitchLinear(buffer, returnImage, flags);
        return;
    }
    VPIImageWrapperParams wrapperParams;
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;

    VPIImageData vpiImageData;
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_NVBUFFER;
    vpiImageData.buffer.fd = buffer.nvbufSurf->surfaceList[0].bufferDesc;
    printf("call vpiImageCreateWrapper VPI_IMAGE_BUFFER_NVBUFFER fd %d\n", vpiImageData.buffer.fd);
    CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, memFlags, &returnImage));

    if(printFormat)
    {
        VPIImageData imgdata;
        CHECK_STATUS(vpiImageLockData(returnImage, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
        for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
        {
            int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
            if(printFormat/* && frameOrdinal == 0*/)
            {
                printf("Image %p planeIdx %d width %d height %d pitchBytes %d data %p\n", 
                    returnImage, planeIdx,
                    imgdata.buffer.pitch.planes[planeIdx].width,
                    imgdata.buffer.pitch.planes[planeIdx].height,
                    imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                    imgdata.buffer.pitch.planes[planeIdx].data
                );
            }
        }
        CHECK_STATUS(vpiImageUnlock(returnImage));
    }
}

int main(int argc, char *argv[])
{
    VPIStream stream     = NULL;
    VPIImage imgPrevious = NULL, imgInput = NULL, imgInputConverted = NULL, imgOutput = NULL, imgOutputWrapper = NULL;
    VPIImage imageCvWrapper = NULL;
    VPIPayload tnr    = NULL;

    // main return value
    int retval = 0;

    VPIBackend backend {VPI_BACKEND_VIC};
    
    const char * inFileName = getenv("inFile");
    std::ifstream inFile(inFileName);
    if(!inFile)
    {
        printf("Cannot open %s for reading\n", inFileName);
        return -1;
    }
    inFile.seekg (0, inFile.end);
    long long inFileSize = inFile.tellg();
    inFile.seekg (0, inFile.beg);
    
    const char * outFileName = getenv("outFile");
    std::ofstream outFile(outFileName);
    if(!outFile)
    {
        printf("Cannot open %s for writing\n", outFileName);
        return -1;
    }
    
    const char * temp = getenv("width");
    int width = temp? strtol(temp, nullptr, 10) : 1920;
    
    temp = getenv("height");
    int height = temp? strtol(temp, nullptr, 10) : 1080;
    
    temp = getenv("numFrames");
    int numFrames = temp? strtol(temp, nullptr, 10) : 1000;

    temp = getenv("skipFrames");
    int skipFrames = temp? strtol(temp, nullptr, 10) : 2;

    temp = getenv("printFormat");
    printFormat = temp && *temp == '1';
    
    temp = getenv("convertCuda");
    bool convertCuda = temp && *temp == '1';
    VPIBackend backendStream = backend;
    if(convertCuda)
    {
        backendStream = (VPIBackend)( uint64_t(backendStream) | VPI_BACKEND_CUDA);
    }

    CHECK_STATUS(vpiStreamCreate(backendStream, &stream));
    
    uint64_t memFlags {backend};
    memFlags |= VPI_BACKEND_CPU;//Need this to lock images
    
    temp = getenv("VPI_EXCLUSIVE_STREAM_ACCESS");
    if(temp && *temp == '1')
    {
        memFlags |= VPI_EXCLUSIVE_STREAM_ACCESS;
    }
    temp = getenv("VPI_BACKEND_CUDA");
    if(temp && *temp == '1')
    {
        memFlags |= VPI_BACKEND_CUDA;
    }
    VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
    temp = getenv("VPI_IMAGE_FORMAT_YUYV_ER");
    if(temp && *temp == '1')
    {
        imgFormat = VPI_IMAGE_FORMAT_YUYV_ER;
    }
    temp = getenv("VPI_IMAGE_FORMAT_UYVY_ER");
    if(temp && *temp == '1')
    {
        imgFormat = VPI_IMAGE_FORMAT_UYVY_ER;
    }

    temp = getenv("tnrCuda");
    bool tnrCuda = temp && *temp == '1';
    
    temp = getenv("convertFromRgbaWidth");
    int convertFromRgbaWidth = temp? strtol(temp, nullptr, 10) : 0;
    
    temp = getenv("useNvBuffer");
    bool useNvBuffer = temp && *temp == '1';
    printf("useNvBuffer %d\n", useNvBuffer);

    temp = getenv("useCudaPitchLinear");
    bool useCudaPitchLinear = temp && *temp == '1';
    printf("useCudaPitchLinear %d\n", useCudaPitchLinear);

    temp = getenv("cuGraphicsEGLRegisterUpFront");
    bool cuGraphicsEGLRegisterUpFront = (temp && *temp == '1') || useCudaPitchLinear;
    
    uint64_t memFlagsInputs {memFlags};
    if(convertCuda && convertFromRgbaWidth)
    {
        memFlagsInputs = (VPIBackend)( uint64_t(memFlagsInputs) | VPI_BACKEND_CUDA);
    }

    Buffer bufferPrevious, bufferOutput, bufferInput, bufferInputRgba;
    if(useNvBuffer)
    {
        CreateNvBuffers(bufferPrevious, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(bufferPrevious, memFlags, imgPrevious, memFlags, useCudaPitchLinear);
        
        CreateNvBuffers(bufferOutput, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(bufferOutput, memFlags, imgOutput, memFlags, useCudaPitchLinear);
        
        CreateNvBuffers(bufferInput, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(bufferInput, memFlagsInputs, imgInput, memFlagsInputs, useCudaPitchLinear);
    }
    else
    {
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlagsInputs, &imgInput));
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgPrevious));
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgOutput));
    }
    
    if(convertFromRgbaWidth)
    {
        imgInputConverted = imgInput;//NV12
        imgInput = nullptr;
        
        if(useNvBuffer)
        {
            CreateNvBuffers(bufferInputRgba, NVBUF_COLOR_FORMAT_RGBA, width, height, cuGraphicsEGLRegisterUpFront);
            CreateNvBufferWrapper(bufferInputRgba, memFlagsInputs, imgInput, memFlagsInputs, useCudaPitchLinear);
        }
        else
        {
            CHECK_STATUS(vpiImageCreate(convertFromRgbaWidth, height, VPI_IMAGE_FORMAT_RGBA8, memFlagsInputs, &imgInput));
        }
    }
    
    VpiImagePrintFormat(imgInput, "imgInput");
    
    CHECK_STATUS(vpiCreateTemporalNoiseReduction(tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));

    VPITNRParams params;
    CHECK_STATUS(vpiInitTemporalNoiseReductionParams(&params));
    
    temp = getenv("preset");
    if(temp)
    {
        params.preset = (VPITNRPreset)strtol(temp, nullptr, 10);
    }
    temp = getenv("strength");
    if(temp)
    {
        params.strength = strtod(temp, nullptr);
    }
    printf("tnr params preset: %d strength: %lf\n", (int)params.preset, (double)params.strength);
    
    temp = getenv("repeatOneFrame");
    bool repeatOneFrame = temp && *temp == '1';
    
    temp = getenv("dropResults");
    bool dropResults = temp && *temp == '1';

    VPIEvent evStart = NULL;
    VPIEvent evEnd = NULL;
    CHECK_STATUS(vpiEventCreate(backend, &evStart));
    CHECK_STATUS(vpiEventCreate(backend, &evEnd));
    
    printf("Run loop\n");
    
    long long filePos = 0;
    for(frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
    {
        uint64_t timeStart = getTimeNS();
        
        if(!repeatOneFrame || frameOrdinal == 0)
        {
            //This is one way to read file: lock VPI image and read directly to it.
            VPIImageData imgdata;
            CHECK_STATUS(vpiImageLockData(imgInput, VPI_LOCK_WRITE, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
            int frameSize {};
            for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
            {
                int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
                inFile.read((char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
                frameSize += size;
                if(printFormat && frameOrdinal == 0)
                {
                    printf("planeIdx %d width %d height %d pitchBytes %d size %d total %d\n", planeIdx,
                        imgdata.buffer.pitch.planes[planeIdx].width,
                        imgdata.buffer.pitch.planes[planeIdx].height,
                        imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                        size, frameSize);
                }
            }

            CHECK_STATUS(vpiImageUnlock(imgInput));
            if(!inFile)
            {
                printf("Failed to read frame of size %d at pos %lld\n", frameSize, filePos);
                return -1;
            }
            filePos += frameSize;
            //printf("frameOrdinal %d frameSize %d filePos %lld\n", frameOrdinal, frameSize, filePos);
            if(filePos == inFileSize)
            {
                filePos = 0;
                //printf("seekg 0\n");
                inFile.seekg (0, inFile.beg);
            }
        }
        uint64_t timeStartProc = getTimeNS();
        
        VPIImage from = imgInput;
        if(convertFromRgbaWidth)
        {
            CHECK_STATUS(vpiSubmitConvertImageFormat(stream, 
                convertCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, 
                imgInput, imgInputConverted, nullptr));
            from = imgInputConverted;
        }
        
        CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, tnr, 
            frameOrdinal == 0 ? nullptr: imgPrevious, from, imgOutput, &params));
        CHECK_STATUS(vpiStreamSync(stream));
        
        if(frameOrdinal >= skipFrames)//Do not count first few frames
        {
            statPerFrameVpi.Add( (int)(getTimeNS() - timeStartProc) );
        }
        
        if(!dropResults)
        {
            VPIImageData imgdata;
            CHECK_STATUS(vpiImageLockData(imgOutput, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
            for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
            {
                int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
                outFile.write((const char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
                if(printFormat && frameOrdinal == 0)
                {
                    printf("imgOutput %p planeIdx %d width %d height %d pitchBytes %d data %p\n", 
                        imgOutput, planeIdx,
                        imgdata.buffer.pitch.planes[planeIdx].width,
                        imgdata.buffer.pitch.planes[planeIdx].height,
                        imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                        imgdata.buffer.pitch.planes[planeIdx].data
                          );
                }
            }
            CHECK_STATUS(vpiImageUnlock(imgOutput));
        }
        
        std::swap(imgPrevious, imgOutput);
    }//for(int frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
    
    printf("repeatOneFrame=%d dropResults=%d\n", repeatOneFrame, dropResults);
    statPerFrameVpi.Print("statPerFrameVpi (ms)", 1E-6);//Includes convert, tnr, sync
    
    vpiStreamDestroy(stream);
    vpiPayloadDestroy(tnr);
    vpiImageDestroy(imgPrevious);
    vpiImageDestroy(imgInput);
    vpiImageDestroy(imgOutput);
    vpiImageDestroy(imageCvWrapper);

    return 0;
}

Hi,

Thanks for your update.

Have you maximized the VPI-related hardware clock?
For example, increasing the VIC clock might help in your use case.

https://docs.nvidia.com/vpi/algo_performance.html#maxout_clocks

Thanks.

Sure, you can see “sudo ~/clocks.sh” in my code above.
However, the time of cuGraphicsEGLRegisterImage does not appear to depend on the VPI clocks.

You could try vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_NV12_ER; with the CUDA backend, since VPI_IMAGE_FORMAT_NV12_ER to VPI_IMAGE_FORMAT_RGBA8 is supported according to the support matrix at VPI - Vision Programming Interface: Convert Image Format (and VPI_IMAGE_FORMAT_NV12 is not). Maybe this fixes your VPI_ERROR_INVALID_IMAGE_FORMAT issue.
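That is, in the NV12 branch of NvBufferWrapperCudaPitchLinear(), something like this (untested, just the one-line change):

    vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_NV12_ER; // instead of VPI_IMAGE_FORMAT_NV12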

Interesting topic, hope this gets solved somehow.

Nope. VPI does not appear to differentiate between VPI_IMAGE_FORMAT_NV12 and VPI_IMAGE_FORMAT_NV12_ER when wrapping with VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR.
Attached is a slightly modified sample. If I run it with

printFormat=1 useCudaPitchLinear=1 useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12     width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file

it prints:

vpiImageCreateWrapper VPI_IMAGE_FORMAT_NV12_ER ...
 vpiImageGetFormat ret 0xa2a10d1 NV12 ....

and then

VPI_ERROR_INVALID_IMAGE_FORMAT: CUDA: Conversion not implemented between VPI_IMAGE_FORMAT_RGBA8 and VPI_IMAGE_FORMAT_NV12
/*
Usage:
Save this file to tnr_file.cpp

# Compile
g++  -o tnr_file -I/usr/local/cuda-12.6/targets/aarch64-linux/include -I/usr/src/jetson_multimedia_api/include \
    -I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
    ./tnr_file.cpp -L/usr/local/cuda-12.6/targets/aarch64-linux/lib/  -lnvvpi -lcudart -L/usr/lib/aarch64-linux-gnu/tegra/ -lnvbufsurface \
    -lcuda -lnvrm_mem

# Put large video files to tmpfs:
sudo mkdir /mnt/tmpfs
sudo chown $USER:$USER /mnt/tmpfs
sudo mount -t tmpfs -o size=16g tmpfs /mnt/tmpfs
   
sudo ~/clocks.sh --max # clocks.sh from https://docs.nvidia.com/vpi/algo_performance.html#benchmark_tables

# Make test video in nv12 format:
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
    nvvidconv ! 'video/x-raw, format=NV12, width=2816, height=1944' ! \
    filesink location=/mnt/tmpfs/out_2816.nv12 -e

# Process nv12 video with buffer from vpiImageCreate:
# Note: nv12 video does not need vpiSubmitConvertImageFormat, only vpiSubmitTemporalNoiseReduction is used
preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.087361 min 2.070204 max 2.132818

# Process nv12 video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper:
useNvBuffer=1 preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.248555 min 2.139974 max 2.405052

# Process nv12 video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper/VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR
printFormat=1 useCudaPitchLinear=1 useNvBuffer=1 preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
Fails with:
VPI_ERROR_INVALID_ARGUMENT: Current frame must have the same format configured during payload creation
    
# Make test video in rgba format:
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
    nvvidconv ! 'video/x-raw, format=RGBA, width=2624, height=1944' ! \
    filesink location=/mnt/tmpfs/out_2624.rgba -e

# Process rgba video with buffer from vpiImageCreate:
# Note: rgba video needs vpiSubmitConvertImageFormat before vpiSubmitTemporalNoiseReduction
convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.759379 min 2.498117 max 3.059703

# Process rgba video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper:
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 4.249904 min 4.174961 max 4.397086

# Process rgba video with buffer from NvBufSurfaceAllocate/vpiImageCreateWrapper/VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR
printFormat=1 useCudaPitchLinear=1 useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
Fails with:
VPI_ERROR_INVALID_IMAGE_FORMAT: CUDA: Conversion not implemented between VPI_IMAGE_FORMAT_RGBA8 and VPI_IMAGE_FORMAT_NV12


So, without vpiSubmitConvertImageFormat the time is the same whether we use NvBuffer or not,
but with conversion the NvBuffer path is slower by about 1.7 ms

Now with profiler:
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
    width=2624 height=1944 numFrames=1000 printFormat=1 nsys profile ./tnr_file
nsys stats report1.nsys-rep | grep -i register
# Columns (CUDA API Summary): Time (%), Total Time (ns), Num Calls, Avg (ns), Med (ns), Min (ns), Max (ns), StdDev (ns), Name
     41.5      961,313,856      2,002     480,176.8     585,040.0     280,928     797,600    149,372.0  cudaGraphicsEGLRegisterImage
     26.8      620,007,040      2,002     309,693.8     338,416.0     208,064     436,128     78,890.0  cudaGraphicsUnregisterResource

So, cudaGraphicsEGLRegisterImage and cudaGraphicsUnregisterResource are the performance killers:
2,002 calls each over 1,000 frames means two register and two unregister calls per frame,
roughly 0.96 ms + 0.62 ms = ~1.6 ms of overhead per frame, which matches the slowdown measured above

# To verify the resulting image:
DISPLAY=:0 ffplay -v info -f rawvideo -pixel_format nv12 -video_size 2816x1944 /mnt/tmpfs/out2_2624.nv12

*/

#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/algo/TemporalNoiseReduction.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "cudaEGL.h"

#include <algorithm>
#include <cassert>   // for assert
#include <cstdint>   // for uint64_t
#include <cstdio>    // for printf
#include <cstdlib>   // for getenv, strtol, strtod
#include <cstring>   // for memset
#include <ctime>     // for clock_gettime
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept> // for std::runtime_error
#include <vector>

#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(arr[0]))

int frameOrdinal {};
int printFormat {};

uint64_t getTimeNS() 
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; // widen before multiplying to avoid overflow
}

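// Accumulates count/min/max/total of integer samples (per-frame times in ns here);
// Print() scales by a ratio so the report is in milliseconds.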
struct Stat
{
    void Add(int value)
    {
        if(!this->count || this->min > value)
        {
            this->min = value;
        }
        if(!this->count || this->max < value)
        {
            this->max = value;
        }
        this->count++;
        this->total += value;
    }
    
    void Print(const char * name, double ratio)
    {
        printf("%s count %d av %lf min %lf max %lf\n", 
               name, this->count, this->count? (this->total * ratio / this->count) : 0.0, ratio * this->min, ratio * this->max);
    }
    
    int min {-1};
    int max {-1};
    int count {0};
    long long total {0};
};

Stat statPerFrameVpi;//Includes convert, tnr, sync

#define CHECK_STATUS(STMT)                                    \
    do                                                        \
    {                                                         \
        VPIStatus status = (STMT);                            \
        if (status != VPI_SUCCESS)                            \
        {                                                     \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
            std::ostringstream ss;                            \
            ss << "" #STMT "\n";                              \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str());               \
        }                                                     \
    } while (0)

void VpiLogError(const char * name, VPIStatus status)
{
    char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];
    vpiGetLastStatusMessage(buffer, sizeof(buffer));
    const char * statusName = vpiStatusGetName(status);
    printf("%s ret %d: %s : %s\n", name, (int)status, buffer, statusName);
}

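// Diagnostic helper: prints a VPIImage's format fourCC, overall size, plane count and per-plane pixel types/sizes.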
bool VpiImagePrintFormat(VPIImage image, const char * comment)
{
    VPIImageFormat format {};
    VPIStatus status = vpiImageGetFormat(image, &format);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetFormat", status);
        return false;
    }
    int32_t imageWidth {}, imageHeight {};
    status = vpiImageGetSize(image, &imageWidth, &imageHeight);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetSize", status);
        return false;
    }

    uint32_t fourCC = vpiImageFormatGetFourCC(format);
    VPIImageBufferPitchLinear info {};
    info.numPlanes = vpiImageFormatGetPlaneCount(format);
    if(info.numPlanes < 0 || info.numPlanes > (int)ARRAY_SIZE(info.planes))
    {
        printf("Bad numPlanes %d\n", info.numPlanes);
        return false;
    }
    //VPIPixelType types[3] {VPI_PIXEL_TYPE_INVALID, VPI_PIXEL_TYPE_INVALID, VPI_PIXEL_TYPE_INVALID};
    std::string typeNames[ARRAY_SIZE(info.planes)];
    for(int planeIdx = 0; planeIdx < info.numPlanes; planeIdx++)
    {
        info.planes[planeIdx].pixelType = vpiImageFormatGetPlanePixelType(format, planeIdx);
        typeNames[planeIdx] = vpiPixelTypeGetName(info.planes[planeIdx].pixelType);
        
        info.planes[planeIdx].width = vpiImageFormatGetPlaneWidth(format, imageWidth, planeIdx);
        info.planes[planeIdx].height = vpiImageFormatGetPlaneHeight(format, imageHeight, planeIdx);
    }
    
    printf("%s vpiImageGetFormat ret 0x%x %.4s Image Size %dx%d numPlanes %d types (per plane): 0x%x 0x%x 0x%x : %s %s %s sizes %dx%d %dx%d %dx%d\n", 
        comment, (int)format, (char*)&fourCC, 
        (int)imageWidth, (int)imageHeight,
        info.numPlanes, 
        (int)info.planes[0].pixelType, (int)info.planes[1].pixelType, (int)info.planes[2].pixelType,
        typeNames[0].c_str(), typeNames[1].c_str(), typeNames[2].c_str(), 
        (int)info.planes[0].width, (int)info.planes[0].height, 
        (int)info.planes[1].width, (int)info.planes[1].height, 
        (int)info.planes[2].width, (int)info.planes[2].height 
        );
    //Sample output for NV12
    // vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 2624x1944 numPlanes 2 types (per plane): 0xffff1001 0xffff1011 0x0 : VPI_PIXEL_TYPE_U8 VPI_PIXEL_TYPE_2U8  
    //  sizes 2624x1944 1312x972 0x0
    return true;
}//bool VpiImagePrintFormat

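// One NvBufSurface allocation plus its (optional) EGL image and CUDA graphics mapping,
// filled only when the surface is registered with CUDA up front.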
struct Buffer
{
    int width;
    int height;
    NvBufSurface * nvbufSurf {};
    EGLImageKHR eglImage {};
    CUgraphicsResource cuGraphicsResource {};
    CUeglFrame eglFrame {};
};

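// Allocates a pitch-layout NVBUF_MEM_SURFACE_ARRAY surface of the requested format/size.
// If cuGraphicsEGLRegisterUpFront is set, also maps it to an EGLImage and registers it with CUDA
// once, keeping the mapped CUeglFrame pointers in the Buffer.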
void CreateNvBuffers(Buffer & buffer, NvBufSurfaceColorFormat format, int width, int height, bool cuGraphicsEGLRegisterUpFront)
{
    buffer.width = width;
    buffer.height = height;
    buffer.nvbufSurf = nullptr;
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = width;
    inputParams.params.height = height;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = format;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    assert(0 == NvBufSurfaceAllocate(&buffer.nvbufSurf, 1, &inputParams));
    buffer.nvbufSurf->numFilled = 1;
    
    printf("NvBufSurfaceAllocate ret fd %d num_planes %d width %d %d height %d %d pitch %d %d\n", 
           (int)buffer.nvbufSurf->surfaceList[0].bufferDesc,
           buffer.nvbufSurf->surfaceList[0].planeParams.num_planes,
           buffer.nvbufSurf->surfaceList[0].planeParams.width[0],
           buffer.nvbufSurf->surfaceList[0].planeParams.width[1],
           buffer.nvbufSurf->surfaceList[0].planeParams.height[0],
           buffer.nvbufSurf->surfaceList[0].planeParams.height[1],
           buffer.nvbufSurf->surfaceList[0].planeParams.pitch[0],
           buffer.nvbufSurf->surfaceList[0].planeParams.pitch[1]
          );
    
    if(cuGraphicsEGLRegisterUpFront)
    {
        //Map to GPU:
        assert(0 == NvBufSurfaceMapEglImage(buffer.nvbufSurf, 0));
        buffer.eglImage = buffer.nvbufSurf->surfaceList[0].mappedAddr.eglImage;
        printf("NvBufSurfaceMapEglImage ret eglImage %p\n", buffer.eglImage);
        assert(CUDA_SUCCESS == cuGraphicsEGLRegisterImage(&buffer.cuGraphicsResource, buffer.eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE));
        assert(CUDA_SUCCESS == cuGraphicsResourceGetMappedEglFrame(&buffer.eglFrame, buffer.cuGraphicsResource, 0, 0));
        printf("Shared NvBufSurface mapped to GPU: %p %p\n", 
                buffer.eglFrame.frame.pPitch[0], buffer.eglFrame.frame.pPitch[1]
        );
    }
}

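// Wraps the CUDA-mapped pitch-linear planes of the NvBufSurface (RGBA8 or NV12_ER) into a VPIImage
// via VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, reusing an existing image with vpiImageSetWrapper when one is passed in.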
bool NvBufferWrapperCudaPitchLinear(struct Buffer & buffer, VPIImage & vpiImage, uint64_t flags)
{
    // See vpiImageCreateWrapper in https://docs.nvidia.com/vpi/group__VPI__Image.html#ga3e7cf2520dd568a7e7a9a6876ea7995c
    VPIImageData vpiImageData {};
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
    //vpiImageData.buffer is a union
    NvBufSurfaceParams & params {buffer.nvbufSurf->surfaceList[0]}; 
    if(params.colorFormat == NVBUF_COLOR_FORMAT_RGBA)
    {
        vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_RGBA8;
        vpiImageData.buffer.pitch.numPlanes = 1;
        vpiImageData.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_4U8;
        vpiImageData.buffer.pitch.planes[0].width = buffer.eglFrame.width;
        vpiImageData.buffer.pitch.planes[0].height = buffer.eglFrame.height;
        vpiImageData.buffer.pitch.planes[0].pitchBytes = buffer.eglFrame.pitch;
        vpiImageData.buffer.pitch.planes[0].data = buffer.eglFrame.frame.pPitch[0];
        if(printFormat && frameOrdinal == 0)
        {
            printf("vpiImageCreateWrapper w %d h %d pb %d d %p\n", 
                (int)vpiImageData.buffer.pitch.planes[0].width,
                (int)vpiImageData.buffer.pitch.planes[0].height,
                (int)vpiImageData.buffer.pitch.planes[0].pitchBytes,
                (void*)vpiImageData.buffer.pitch.planes[0].data);
        }
    }
    else
    {
        vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_NV12_ER;
        vpiImageData.buffer.pitch.numPlanes = 2;
        vpiImageData.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_U8;
        vpiImageData.buffer.pitch.planes[0].width = params.planeParams.width[0];
        vpiImageData.buffer.pitch.planes[0].height = params.planeParams.height[0];
        vpiImageData.buffer.pitch.planes[0].pitchBytes = params.planeParams.pitch[0];
        vpiImageData.buffer.pitch.planes[0].data = buffer.eglFrame.frame.pPitch[0];
        //int firstPlaneSize = width * height;//assuming pitch == width and 1 Y byte per pixel
        vpiImageData.buffer.pitch.planes[1].pixelType = VPI_PIXEL_TYPE_2U8;
        vpiImageData.buffer.pitch.planes[1].width = params.planeParams.width[1];
        vpiImageData.buffer.pitch.planes[1].height = params.planeParams.height[1];
        vpiImageData.buffer.pitch.planes[1].pitchBytes = params.planeParams.pitch[1];
        vpiImageData.buffer.pitch.planes[1].data = buffer.eglFrame.frame.pPitch[1];
        if(printFormat && frameOrdinal == 0)
        {
            printf("vpiImageCreateWrapper VPI_IMAGE_FORMAT_NV12_ER w %d h %d pb %d d %p  w %d h %d pb %d d %p\n", 
                (int)vpiImageData.buffer.pitch.planes[0].width,
                (int)vpiImageData.buffer.pitch.planes[0].height,
                (int)vpiImageData.buffer.pitch.planes[0].pitchBytes,
                (void*)vpiImageData.buffer.pitch.planes[0].data,
                (int)vpiImageData.buffer.pitch.planes[1].width,
                (int)vpiImageData.buffer.pitch.planes[1].height,
                (int)vpiImageData.buffer.pitch.planes[1].pitchBytes,
                (void*)vpiImageData.buffer.pitch.planes[1].data
            );
        }
    }

    VPIImageWrapperParams wrapperParams;
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;//TODO
    if(vpiImage)
    {//Reuse existing image
        VPIStatus status = vpiImageSetWrapper(vpiImage, &vpiImageData);
        if(status != VPI_SUCCESS)
        {
            VpiLogError("vpiImageSetWrapper", status);
            return false;
        }
    }
    else
    {
        //flags |= VPI_EXCLUSIVE_STREAM_ACCESS;
        //TODO: consider other flags VPI_RESTRICT_MEM_USAGE, VPI_REQUIRE_BACKENDS
        //See https://docs.nvidia.com/vpi/Types_8h.html#common_flags
        VPIStatus status = vpiImageCreateWrapper(&vpiImageData, &wrapperParams, flags, &vpiImage);
        if(status != VPI_SUCCESS)
        {//VPI_ERROR_INVALID_ARGUMENT == 2
            VpiLogError("vpiImageCreateWrapper", status);
            return false;
        }
        if(printFormat && frameOrdinal == 0)
        {
            printf("vpiImageCreateWrapper ret %p\n", vpiImage);
            VpiImagePrintFormat(vpiImage, "");
        }
    }
    return true;
}//NvBufferWrapperCudaPitchLinear

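// Wraps an NvBufSurface into a VPIImage: either through the CUDA pitch-linear path above,
// or by passing its dmabuf fd via VPI_IMAGE_BUFFER_NVBUFFER; optionally prints the plane layout.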
void CreateNvBufferWrapper(struct Buffer & buffer, uint64_t memFlags, VPIImage & returnImage, uint64_t flags, bool useCudaPitchLinear)
{
    if(useCudaPitchLinear)
    {
        assert(buffer.eglFrame.frame.pPitch[0] != nullptr);
        NvBufferWrapperCudaPitchLinear(buffer, returnImage, flags);
        return;
    }
    VPIImageWrapperParams wrapperParams;
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;

    VPIImageData vpiImageData {};
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_NVBUFFER;
    vpiImageData.buffer.fd = buffer.nvbufSurf->surfaceList[0].bufferDesc;
    printf("call vpiImageCreateWrapper VPI_IMAGE_BUFFER_NVBUFFER fd %d\n", vpiImageData.buffer.fd);
    CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, memFlags, &returnImage));

    if(printFormat)
    {
        VPIImageData imgdata;
        CHECK_STATUS(vpiImageLockData(returnImage, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
        for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
        {
            int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
            if(printFormat/* && frameOrdinal == 0*/)
            {
                printf("Image %p planeIdx %d width %d height %d pitchBytes %d data %p\n", 
                    returnImage, planeIdx,
                    imgdata.buffer.pitch.planes[planeIdx].width,
                    imgdata.buffer.pitch.planes[planeIdx].height,
                    imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                    imgdata.buffer.pitch.planes[planeIdx].data
                );
            }
        }
        CHECK_STATUS(vpiImageUnlock(returnImage));
    }
}

int main(int argc, char *argv[])
{
    VPIStream stream     = NULL;
    VPIImage imgPrevious = NULL, imgInput = NULL, imgInputConverted = NULL, imgOutput = NULL, imgOutputWrapper = NULL;
    VPIImage imageCvWrapper = NULL;
    VPIPayload tnr    = NULL;

    // main return value
    int retval = 0;

    VPIBackend backend {VPI_BACKEND_VIC};
    
    const char * inFileName = getenv("inFile");
    if(!inFileName)
    {
        printf("inFile environment variable is not set\n");
        return -1;
    }
    std::ifstream inFile(inFileName, std::ios::binary);
    if(!inFile)
    {
        printf("Cannot open %s for reading\n", inFileName);
        return -1;
    }
    inFile.seekg (0, inFile.end);
    long long inFileSize = inFile.tellg();
    inFile.seekg (0, inFile.beg);
    
    const char * outFileName = getenv("outFile");
    if(!outFileName)
    {
        printf("outFile environment variable is not set\n");
        return -1;
    }
    std::ofstream outFile(outFileName, std::ios::binary);
    if(!outFile)
    {
        printf("Cannot open %s for writing\n", outFileName);
        return -1;
    }
    
    const char * temp = getenv("width");
    int width = temp? strtol(temp, nullptr, 10) : 1920;
    
    temp = getenv("height");
    int height = temp? strtol(temp, nullptr, 10) : 1080;
    
    temp = getenv("numFrames");
    int numFrames = temp? strtol(temp, nullptr, 10) : 1000;

    temp = getenv("skipFrames");
    int skipFrames = temp? strtol(temp, nullptr, 10) : 2;

    temp = getenv("printFormat");
    printFormat = temp && *temp == '1';
    
    temp = getenv("convertCuda");
    bool convertCuda = temp && *temp == '1';
    VPIBackend backendStream = backend;
    if(convertCuda)
    {
        backendStream = (VPIBackend)( uint64_t(backendStream) | VPI_BACKEND_CUDA);
    }

    CHECK_STATUS(vpiStreamCreate(backendStream, &stream));
    
    uint64_t memFlags {backend};
    memFlags |= VPI_BACKEND_CPU;//Need this to lock images
    
    temp = getenv("VPI_EXCLUSIVE_STREAM_ACCESS");
    if(temp && *temp == '1')
    {
        memFlags |= VPI_EXCLUSIVE_STREAM_ACCESS;
    }
    temp = getenv("VPI_BACKEND_CUDA");
    if(temp && *temp == '1')
    {
        memFlags |= VPI_BACKEND_CUDA;
    }
    VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
    temp = getenv("VPI_IMAGE_FORMAT_YUYV_ER");
    if(temp && *temp == '1')
    {
        imgFormat = VPI_IMAGE_FORMAT_YUYV_ER;
    }
    temp = getenv("VPI_IMAGE_FORMAT_UYVY_ER");
    if(temp && *temp == '1')
    {
        imgFormat = VPI_IMAGE_FORMAT_UYVY_ER;
    }

    temp = getenv("tnrCuda");
    bool tnrCuda = temp && *temp == '1';
    
    temp = getenv("convertFromRgbaWidth");
    int convertFromRgbaWidth = temp? strtol(temp, nullptr, 10) : 0;
    
    temp = getenv("useNvBuffer");
    bool useNvBuffer = temp && *temp == '1';
    printf("useNvBuffer %d\n", useNvBuffer);

    temp = getenv("useCudaPitchLinear");
    bool useCudaPitchLinear = temp && *temp == '1';
    printf("useCudaPitchLinear %d\n", useCudaPitchLinear);

    temp = getenv("cuGraphicsEGLRegisterUpFront");
    bool cuGraphicsEGLRegisterUpFront = (temp && *temp == '1') || useCudaPitchLinear;
    
    uint64_t memFlagsInputs {memFlags};
    if(convertCuda && convertFromRgbaWidth)
    {
        memFlagsInputs = (VPIBackend)( uint64_t(memFlagsInputs) | VPI_BACKEND_CUDA);
    }

    Buffer bufferPrevious, bufferOutput, bufferInput, bufferInputRgba;
    if(useNvBuffer)
    {
        CreateNvBuffers(bufferPrevious, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(bufferPrevious, memFlags, imgPrevious, memFlags, useCudaPitchLinear);
        
        CreateNvBuffers(bufferOutput, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(bufferOutput, memFlags, imgOutput, memFlags, useCudaPitchLinear);
        
        CreateNvBuffers(bufferInput, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
        CreateNvBufferWrapper(bufferInput, memFlagsInputs, imgInput, memFlagsInputs, useCudaPitchLinear);
    }
    else
    {
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlagsInputs, &imgInput));
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgPrevious));
        CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgOutput));
    }
    
    if(convertFromRgbaWidth)
    {
        imgInputConverted = imgInput;//NV12
        imgInput = nullptr;
        
        if(useNvBuffer)
        {
            CreateNvBuffers(bufferInputRgba, NVBUF_COLOR_FORMAT_RGBA, width, height, cuGraphicsEGLRegisterUpFront);
            CreateNvBufferWrapper(bufferInputRgba, memFlagsInputs, imgInput, memFlagsInputs, useCudaPitchLinear);
        }
        else
        {
            CHECK_STATUS(vpiImageCreate(convertFromRgbaWidth, height, VPI_IMAGE_FORMAT_RGBA8, memFlagsInputs, &imgInput));
        }
    }
    
    VpiImagePrintFormat(imgInput, "imgInput");
    if(imgInputConverted)
    {
        VpiImagePrintFormat(imgInputConverted, "imgInputConverted");
    }
    
    CHECK_STATUS(vpiCreateTemporalNoiseReduction(tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));

    VPITNRParams params;
    CHECK_STATUS(vpiInitTemporalNoiseReductionParams(&params));
    
    temp = getenv("preset");
    if(temp)
    {
        params.preset = (VPITNRPreset)strtol(temp, nullptr, 10);
    }
    temp = getenv("strength");
    if(temp)
    {
        params.strength = strtod(temp, nullptr);
    }
    printf("tnr params preset: %d strength: %lf\n", (int)params.preset, (double)params.strength);
    
    temp = getenv("repeatOneFrame");
    bool repeatOneFrame = temp && *temp == '1';
    
    temp = getenv("dropResults");
    bool dropResults = temp && *temp == '1';

    VPIEvent evStart = NULL;
    VPIEvent evEnd = NULL;
    CHECK_STATUS(vpiEventCreate(backend, &evStart));
    CHECK_STATUS(vpiEventCreate(backend, &evEnd));
    
    printf("Run loop\n");
    
    long long filePos = 0;
    for(frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
    {
        uint64_t timeStart = getTimeNS();
        
        if(!repeatOneFrame || frameOrdinal == 0)
        {
            //This is one way to read file: lock VPI image and read directly to it.
            VPIImageData imgdata;
            CHECK_STATUS(vpiImageLockData(imgInput, VPI_LOCK_WRITE, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
            int frameSize {};
            for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
            {
                int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
                inFile.read((char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
                frameSize += size;
                if(printFormat && frameOrdinal == 0)
                {
                    printf("planeIdx %d width %d height %d pitchBytes %d size %d total %d\n", planeIdx,
                        imgdata.buffer.pitch.planes[planeIdx].width,
                        imgdata.buffer.pitch.planes[planeIdx].height,
                        imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                        size, frameSize);
                }
            }

            CHECK_STATUS(vpiImageUnlock(imgInput));
            if(!inFile)
            {
                printf("Failed to read frame of size %d at pos %lld\n", frameSize, filePos);
                return -1;
            }
            filePos += frameSize;
            //printf("frameOrdinal %d frameSize %d filePos %lld\n", frameOrdinal, frameSize, filePos);
            if(filePos == inFileSize)
            {
                filePos = 0;
                //printf("seekg 0\n");
                inFile.seekg (0, inFile.beg);
            }
        }
        uint64_t timeStartProc = getTimeNS();
        
        VPIImage from = imgInput;
        if(convertFromRgbaWidth)
        {
            CHECK_STATUS(vpiSubmitConvertImageFormat(stream, 
                convertCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, 
                imgInput, imgInputConverted, nullptr));
            from = imgInputConverted;
        }
        
        CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, tnr, 
            frameOrdinal == 0 ? nullptr: imgPrevious, from, imgOutput, &params));
        CHECK_STATUS(vpiStreamSync(stream));
        
        if(frameOrdinal >= skipFrames)//Do not count first few frames
        {
            statPerFrameVpi.Add( (int)(getTimeNS() - timeStartProc) );
        }
        
        if(!dropResults)
        {
            VPIImageData imgdata;
            CHECK_STATUS(vpiImageLockData(imgOutput, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
            for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
            {
                int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
                outFile.write((const char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
                if(printFormat && frameOrdinal == 0)
                {
                    printf("imgOutput %p planeIdx %d width %d height %d pitchBytes %d data %p\n", 
                        imgOutput, planeIdx,
                        imgdata.buffer.pitch.planes[planeIdx].width,
                        imgdata.buffer.pitch.planes[planeIdx].height,
                        imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                        imgdata.buffer.pitch.planes[planeIdx].data
                          );
                }
            }
            CHECK_STATUS(vpiImageUnlock(imgOutput));
        }
        
        std::swap(imgPrevious, imgOutput);
    }//for(int frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
    
    printf("repeatOneFrame=%d dropResults=%d\n", repeatOneFrame, dropResults);
    statPerFrameVpi.Print("statPerFrameVpi (ms)", 1E-6);//Includes convert, tnr, sync
    
    vpiStreamDestroy(stream);
    vpiPayloadDestroy(tnr);
    vpiEventDestroy(evStart);
    vpiEventDestroy(evEnd);
    vpiImageDestroy(imgPrevious);
    vpiImageDestroy(imgInput);
    vpiImageDestroy(imgInputConverted);
    vpiImageDestroy(imgOutput);
    vpiImageDestroy(imageCvWrapper);

    return 0;
}

Hi,

Sorry for the late update.
This is a known issue, which is documented in our release notes:

https://docs.nvidia.com/vpi/release_notes.html#autotoc_md3

  • Host images wrapped into VPIImages using vpiImageCreateWrapper might impact performance when using them with algorithms running on the CUDA backend. You should avoid wrappers in this case, and use VPIImages allocated with vpiImageCreate instead.
  • Performance might be affected when using CUDA images wrapped into VPIImages using vpiImageCreateWrapper in algorithms running in PVA, VIC and/or OFA. User should avoid using wrappers in this case, preferring to use VPIImages allocated with vpiImageCreate.
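
As an illustration of that recommendation, here is a minimal, untested sketch: keep the NvBufSurface registered with CUDA only once (as your CreateNvBuffers already does when cuGraphicsEGLRegisterUpFront is set) and copy its planes into an image allocated with vpiImageCreate before submitting work to the CUDA backend. The helper name CopyNvBufferToVpiImage is only for illustration; it reuses the Buffer struct and VpiLogError helper from your attached test code and assumes the destination image was created with VPI_BACKEND_CUDA in its flags so it can be locked as VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, with a plane layout matching the surface (e.g. both NV12 or both RGBA).

// Illustration only: copy a CUDA-mapped NvBufSurface (registered once via
// cuGraphicsEGLRegisterImage, as in CreateNvBuffers) into an image allocated
// with vpiImageCreate, so the CUDA backend never touches a wrapper.
bool CopyNvBufferToVpiImage(const Buffer & src, VPIImage dst)
{
    VPIImageData dstData;
    VPIStatus status = vpiImageLockData(dst, VPI_LOCK_WRITE, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &dstData);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageLockData", status);
        return false;
    }
    bool ok = true;
    const NvBufSurfaceParams & sp = src.nvbufSurf->surfaceList[0];
    for(int planeIdx = 0; ok && planeIdx < dstData.buffer.pitch.numPlanes; planeIdx++)
    {
        // Bytes of valid data per row in this plane (for NV12, plane 1 is half-width
        // but 2 bytes per pixel, so rowBytes equals the luma width in both planes).
        size_t rowBytes = (size_t)sp.planeParams.width[planeIdx] * sp.planeParams.bytesPerPix[planeIdx];
        ok = (cudaSuccess == cudaMemcpy2D(
                  dstData.buffer.pitch.planes[planeIdx].data,
                  dstData.buffer.pitch.planes[planeIdx].pitchBytes,
                  src.eglFrame.frame.pPitch[planeIdx],
                  sp.planeParams.pitch[planeIdx],
                  rowBytes,
                  sp.planeParams.height[planeIdx],
                  cudaMemcpyDeviceToDevice));
    }
    vpiImageUnlock(dst);
    return ok;
}

With such a copy in place, vpiSubmitConvertImageFormat and vpiSubmitTemporalNoiseReduction only ever see images from vpiImageCreate, so the per-frame cudaGraphicsEGLRegisterImage/cudaGraphicsUnregisterResource calls should go away, at the cost of one extra device-to-device copy per frame.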

Thanks.