Hi,
I am trying to use NvBufSurfaceAllocate/vpiImageCreateWrapper to create VPIImage instead of vpiImageCreate because I need to integrate VPI into my larger pipeline, which is using NvBuffer.
I am comparing performance of several VPI routines, such as vpiSubmitConvertImageFormat and vpiSubmitTemporalNoiseReduction with VPIImage from vpiImageCreate vs vpiImageCreateWrapper.
It appears that vpiSubmitConvertImageFormat(VPI_BACKEND_CUDA) is much slower on NvBuffer (by more than 1 ms).
By using nsys I found that vpiSubmitConvertImageFormat is calling cudaGraphicsEGLRegisterImage/cudaGraphicsUnregisterResource twice - once for each image.
But cudaGraphicsEGLRegisterImage is a very slow function; it should be called only once, never per frame, because calling it every frame kills performance.
(this was discussed in How to share NvBufSurface with Cuda efficiently, without overhead of cuGraphicsEGLRegisterImage/cuGraphicsUnregisterResource? )
I tried to call cuGraphicsEGLRegisterImage before frame loop starts, but that makes no difference.
I attached a complete test code below.
So, the question is how to prevent vpiSubmitConvertImageFormat and other VPI functions from calling cudaGraphicsEGLRegisterImage on every frame;
otherwise VPI will be very slow and not very useful on Orin.
Thank you
/*
Usage:
g++ -o tnr_file -I/usr/local/cuda-12.2/targets/aarch64-linux/include -I/usr/src/jetson_multimedia_api/include \
-I/usr/src/jetson_multimedia_api/samples/common/algorithm/cuda/ \
./tnr_file.cpp -L/usr/local/cuda-12.2/targets/aarch64-linux/lib/ -lnvvpi -lcudart -L/usr/lib/aarch64-linux-gnu/tegra/ -lnvbufsurface \
-lcuda -lnvrm_mem
sudo mkdir /mnt/tmpfs
sudo chown $USER:$USER /mnt/tmpfs
sudo mount -t tmpfs -o size=16g tmpfs /mnt/tmpfs
Test: read RGBA, convert to NV12 using CUDA, save as NV12:
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
nvvidconv ! 'video/x-raw, format=NV12, width=2816, height=1944' ! \
filesink location=/mnt/tmpfs/out_2816.nv12 -e
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
nvvidconv ! 'video/x-raw, format=RGBA, width=2624, height=1944' ! \
filesink location=/mnt/tmpfs/out_2624.rgba -e
useNvBuffer=1 preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.112977 min 2.081234 max 2.210318
preset=4 strength=1 inFile=/mnt/tmpfs/out_2816.nv12 outFile=/mnt/tmpfs/out2_2624.nv12 \
width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameTnr (ms) count 998 av 2.186153 min 2.061330 max 2.353319
convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 2.865078 min 2.706413 max 3.287629
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
width=2624 height=1944 numFrames=1000 printFormat=1 ./tnr_file
statPerFrameVpi (ms) count 998 av 4.463151 min 4.291428 max 4.792270
So, without vpiSubmitConvertImageFormat the per-frame time is the same whether we use NvBuffer or not,
but with conversion it is slower by about 1.7 ms
Now with profiler:
useNvBuffer=1 convertCuda=1 convertFromRgbaWidth=2624 preset=4 strength=1 inFile=/mnt/tmpfs/out_2624.rgba outFile=/mnt/tmpfs/out2_2624.nv12 \
width=2624 height=1944 numFrames=1000 printFormat=1 nsys profile ./tnr_file
nsys stats report1.nsys-rep | grep -i register
45.6 1,115,627,424 2,002 557,256.5 702,624.0 340,256 895,264 175,705.3 cudaGraphicsEGLRegisterImage
24.2 593,418,432 2,002 296,412.8 340,832.0 204,576 462,944 82,377.1 cudaGraphicsUnregisterResource
So, cudaGraphicsEGLRegisterImage and cudaGraphicsUnregisterResource are the performance killers
DISPLAY=:0 ffplay -v info -f rawvideo -pixel_format nv12 -video_size 2816x1944 /mnt/tmpfs/out2_2624.nv12
*/
#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/algo/TemporalNoiseReduction.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "NvBufSurface.h"
#include "NvCudaProc.h"
#include "cudaEGL.h"
#include <algorithm>
#include <cassert>   // assert() used in CreateNvBuffers
#include <cstdint>   // uint64_t
#include <cstdlib>   // getenv, strtol, strtod
#include <cstring>   // for memset
#include <ctime>     // clock_gettime
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept> // std::runtime_error thrown by CHECK_STATUS
#include <string>    // std::string used in VpiImagePrintFormat
#include <vector>
// Number of elements in a fixed-size C array (do not use on pointers).
#define ARRAY_SIZE(arr) (sizeof(arr)/sizeof(arr[0]))
// Index of the frame currently being processed; set by main()'s loop and
// read elsewhere (e.g. for first-frame-only debug printing).
int frameOrdinal {};
// Set from the "printFormat" environment variable; enables verbose
// image-layout printing throughout the program.
int printFormat {};
// Monotonic timestamp in nanoseconds (CLOCK_MONOTONIC).
// The seconds value is widened to uint64_t BEFORE scaling: the original
// multiplied in time_t, which overflows on platforms where time_t is 32-bit.
uint64_t getTimeNS()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}
// Running min/max/count/total accumulator for integer samples
// (used here for per-frame nanosecond latencies).
struct Stat
{
    // Fold one sample into the statistics.
    void Add(int value)
    {
        const bool firstSample = (count == 0);
        if (firstSample || value < min)
        {
            min = value;
        }
        if (firstSample || value > max)
        {
            max = value;
        }
        ++count;
        total += value;
    }
    // Print "name count N av A min M max X", scaling every value by `ratio`
    // (e.g. 1e-6 converts nanoseconds to milliseconds).
    void Print(const char * name, double ratio)
    {
        const double average = count ? (total * ratio / count) : 0.0;
        printf("%s count %d av %lf min %lf max %lf\n",
               name, count, average, ratio * min, ratio * max);
    }
    int min {-1};       // smallest sample seen; -1 until first Add()
    int max {-1};       // largest sample seen; -1 until first Add()
    int count {0};      // number of samples
    long long total {0};// sum of all samples
};
// Per-frame processing latency in nanoseconds (printed in ms).
Stat statPerFrameVpi;//Includes convert, tnr, sync
// Evaluate a VPI call; on failure, throw std::runtime_error carrying the
// statement text, the symbolic status name, and VPI's last status message.
// Wrapped in do/while(0) so the macro behaves as a single statement.
// NOTE: the trailing semicolon after while(0) was removed — with it, the
// do/while(0) idiom is defeated and `if (...) CHECK_STATUS(x); else ...`
// fails to compile.
#define CHECK_STATUS(STMT) \
    do \
    { \
        VPIStatus status = (STMT); \
        if (status != VPI_SUCCESS) \
        { \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH]; \
            vpiGetLastStatusMessage(buffer, sizeof(buffer)); \
            std::ostringstream ss; \
            ss << "" #STMT "\n"; \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str()); \
        } \
    } while (0)
void VpiLogError(const char * name, VPIStatus status)
{
char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];
vpiGetLastStatusMessage(buffer, sizeof(buffer));
const char * statusName = vpiStatusGetName(status);
printf("%s ret %d: %s : %s\n", name, (int)status, buffer, statusName);
}
// Print a VPIImage's format (as hex and FourCC), overall size, and each
// plane's pixel type and dimensions. Returns false (after logging the VPI
// error or the bad plane count) if any query fails.
bool VpiImagePrintFormat(VPIImage image, const char * comment)
{
    VPIImageFormat format {};
    VPIStatus status = vpiImageGetFormat(image, &format);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetFormat", status);
        return false;
    }
    int32_t imageWidth {}, imageHeight {};
    status = vpiImageGetSize(image, &imageWidth, &imageHeight);
    if(status != VPI_SUCCESS)
    {
        VpiLogError("vpiImageGetSize", status);
        return false;
    }
    uint32_t fourCC = vpiImageFormatGetFourCC(format);
    VPIImageBufferPitchLinear info {};
    info.numPlanes = vpiImageFormatGetPlaneCount(format);
    // Cast ARRAY_SIZE (size_t) to int so the comparison is signed-vs-signed;
    // the original compared int32_t against size_t.
    if(info.numPlanes < 0 || info.numPlanes > (int)ARRAY_SIZE(info.planes))
    {
        printf("Bad numPlanes %d\n", info.numPlanes);
        return false;
    }
    // Names of each plane's pixel type; entries past numPlanes stay empty.
    std::string typeNames[ARRAY_SIZE(info.planes)];
    for(int planeIdx = 0; planeIdx < info.numPlanes; planeIdx++)
    {
        info.planes[planeIdx].pixelType = vpiImageFormatGetPlanePixelType(format, planeIdx);
        typeNames[planeIdx] = vpiPixelTypeGetName(info.planes[planeIdx].pixelType);
        info.planes[planeIdx].width  = vpiImageFormatGetPlaneWidth(format, imageWidth, planeIdx);
        info.planes[planeIdx].height = vpiImageFormatGetPlaneHeight(format, imageHeight, planeIdx);
    }
    // "%.4s" on the bytes of fourCC prints the FourCC characters
    // (byte order matches character order on little-endian Jetson).
    printf("%s vpiImageGetFormat ret 0x%x %.4s Image Size %dx%d numPlanes %d types (per plane): 0x%x 0x%x 0x%x : %s %s %s sizes %dx%d %dx%d %dx%d\n",
           comment, (int)format, (char*)&fourCC,
           (int)imageWidth, (int)imageHeight,
           info.numPlanes,
           (int)info.planes[0].pixelType, (int)info.planes[1].pixelType, (int)info.planes[2].pixelType,
           typeNames[0].c_str(), typeNames[1].c_str(), typeNames[2].c_str(),
           (int)info.planes[0].width, (int)info.planes[0].height,
           (int)info.planes[1].width, (int)info.planes[1].height,
           (int)info.planes[2].width, (int)info.planes[2].height
           );
    //Sample output for NV12
    // vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 2624x1944 numPlanes 2 types (per plane): 0xffff1001 0xffff1011 0x0 : VPI_PIXEL_TYPE_U8 VPI_PIXEL_TYPE_2U8
    // sizes 2624x1944 1312x972 0x0
    return true;
}//bool VpiImagePrintFormat
// Allocate `count` pitch-linear NVBUF_MEM_SURFACE_ARRAY surfaces of the
// given color format and size. If cuGraphicsEGLRegisterUpFront is set,
// additionally map each surface to an EGLImage and register it with the
// CUDA driver up front (an attempt to avoid VPI's per-frame registration;
// per the measurements in the header comment this does not help).
// NOTE: the surface-allocating calls were moved OUT of assert() — in the
// original, compiling with -DNDEBUG would have skipped them entirely.
NvBufSurface * CreateNvBuffers(int count, NvBufSurfaceColorFormat format, int width, int height, bool cuGraphicsEGLRegisterUpFront)
{
    NvBufSurface * nvbufSurf {};
    NvBufSurfaceAllocateParams inputParams = {{0}};
    inputParams.params.width = width;
    inputParams.params.height = height;
    inputParams.params.memType = NVBUF_MEM_SURFACE_ARRAY;
    inputParams.params.layout = NVBUF_LAYOUT_PITCH;
    inputParams.params.colorFormat = format;
    inputParams.memtag = NvBufSurfaceTag_CAMERA;
    int allocRet = NvBufSurfaceAllocate(&nvbufSurf, count, &inputParams);
    assert(allocRet == 0);
    (void)allocRet;
    nvbufSurf->numFilled = count;
    if(cuGraphicsEGLRegisterUpFront)
    {
        for(int idx = 0; idx < nvbufSurf->numFilled; idx++)
        {
            //Map to GPU:
            int mapRet = NvBufSurfaceMapEglImage(nvbufSurf, idx);
            assert(mapRet == 0);
            (void)mapRet;
            EGLImageKHR eglImage = nvbufSurf->surfaceList[idx].mappedAddr.eglImage;
            printf("NvBufSurfaceMapEglImage ret eglImage %p\n", eglImage);
            CUgraphicsResource cuGraphicsResource {};
            CUresult cuRet = cuGraphicsEGLRegisterImage(&cuGraphicsResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
            assert(cuRet == CUDA_SUCCESS);
            CUeglFrame eglFrame {};
            cuRet = cuGraphicsResourceGetMappedEglFrame(&eglFrame, cuGraphicsResource, 0, 0);
            assert(cuRet == CUDA_SUCCESS);
            (void)cuRet;
            char * sharedGpuPtr = (char *)eglFrame.frame.pPitch[0];
            printf("Shared NvBufSurface mapped to GPU: %p\n", sharedGpuPtr);
        }
    }
    return nvbufSurf;
}
// Wrap an NvBufSurface dmabuf fd in a VPIImage via vpiImageCreateWrapper
// (VPI_IMAGE_BUFFER_NVBUFFER). If printFormat is set, lock the new image
// and print its per-plane layout.
// NOTE: both VPI structs are now zero-initialized — the original left all
// fields other than colorSpec / bufferType / fd uninitialized, so VPI read
// indeterminate values (undefined behavior).
void CreateNvBufferWrapper(int fd, uint64_t memFlags, VPIImage * returnImage)
{
    VPIImageWrapperParams wrapperParams {};
    wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;
    VPIImageData vpiImageData {};
    vpiImageData.bufferType = VPI_IMAGE_BUFFER_NVBUFFER;
    vpiImageData.buffer.fd = fd;
    printf("call vpiImageCreateWrapper VPI_IMAGE_BUFFER_NVBUFFER fd %d\n", vpiImageData.buffer.fd);
    CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, memFlags, returnImage));
    if(printFormat)
    {
        VPIImageData imgdata;
        CHECK_STATUS(vpiImageLockData(*returnImage, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
        for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
        {
            // (The original computed an unused per-plane byte size here;
            // removed.)
            printf("Image %p planeIdx %d width %d height %d pitchBytes %d data %p\n",
                   *returnImage, planeIdx,
                   imgdata.buffer.pitch.planes[planeIdx].width,
                   imgdata.buffer.pitch.planes[planeIdx].height,
                   imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
                   imgdata.buffer.pitch.planes[planeIdx].data
                   );
        }
        CHECK_STATUS(vpiImageUnlock(*returnImage));
    }
}
int main(int argc, char *argv[])
{
VPIStream stream = NULL;
VPIImage imgPrevious = NULL, imgInput = NULL, imgInputConverted = NULL, imgOutput = NULL, imgOutputWrapper = NULL;
VPIImage imageCvWrapper = NULL;
VPIPayload tnr = NULL;
// main return value
int retval = 0;
VPIBackend backend {VPI_BACKEND_VIC};
const char * inFileName = getenv("inFile");
std::ifstream inFile(inFileName);
if(!inFile)
{
printf("Cannot open %s for reading\n", inFileName);
return -1;
}
inFile.seekg (0, inFile.end);
long long inFileSize = inFile.tellg();
inFile.seekg (0, inFile.beg);
const char * outFileName = getenv("outFile");
std::ofstream outFile(outFileName);
if(!outFile)
{
printf("Cannot open %s for writing\n", outFileName);
return -1;
}
const char * temp = getenv("width");
int width = temp? strtol(temp, nullptr, 10) : 1920;
temp = getenv("height");
int height = temp? strtol(temp, nullptr, 10) : 1080;
temp = getenv("numFrames");
int numFrames = temp? strtol(temp, nullptr, 10) : 1000;
temp = getenv("skipFrames");
int skipFrames = temp? strtol(temp, nullptr, 10) : 2;
temp = getenv("printFormat");
printFormat = temp && *temp == '1';
temp = getenv("convertCuda");
bool convertCuda = temp && *temp == '1';
VPIBackend backendStream = backend;
if(convertCuda)
{
backendStream = (VPIBackend)( uint64_t(backendStream) | VPI_BACKEND_CUDA);
}
CHECK_STATUS(vpiStreamCreate(backendStream, &stream));
uint64_t memFlags {backend};
memFlags |= VPI_BACKEND_CPU;//Need this to lock images
temp = getenv("VPI_EXCLUSIVE_STREAM_ACCESS");
if(temp && *temp == '1')
{
memFlags |= VPI_EXCLUSIVE_STREAM_ACCESS;
}
temp = getenv("VPI_BACKEND_CUDA");
if(temp && *temp == '1')
{
memFlags |= VPI_BACKEND_CUDA;
}
VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
temp = getenv("VPI_IMAGE_FORMAT_YUYV_ER");
if(temp && *temp == '1')
{
imgFormat = VPI_IMAGE_FORMAT_YUYV_ER;
}
temp = getenv("VPI_IMAGE_FORMAT_UYVY_ER");
if(temp && *temp == '1')
{
imgFormat = VPI_IMAGE_FORMAT_UYVY_ER;
}
temp = getenv("tnrCuda");
bool tnrCuda = temp && *temp == '1';
temp = getenv("convertFromRgbaWidth");
int convertFromRgbaWidth = temp? strtol(temp, nullptr, 10) : 0;
temp = getenv("useNvBuffer");
bool useNvBuffer = temp && *temp == '1';
temp = getenv("cuGraphicsEGLRegisterUpFront");
bool cuGraphicsEGLRegisterUpFront = temp && *temp == '1';
uint64_t memFlagsInputs {memFlags};
if(convertCuda && convertFromRgbaWidth)
{
memFlagsInputs = (VPIBackend)( uint64_t(memFlagsInputs) | VPI_BACKEND_CUDA);
}
if(useNvBuffer)
{
NvBufSurface * nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlags, &imgPrevious);
nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlags, &imgOutput);
nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_NV12_ER, width, height, cuGraphicsEGLRegisterUpFront);
CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlagsInputs, &imgInput);
}
else
{
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlagsInputs, &imgInput));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgPrevious));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgOutput));
}
if(convertFromRgbaWidth)
{
imgInputConverted = imgInput;//NV12
imgInput = nullptr;
if(useNvBuffer)
{
NvBufSurface * nvbufSurf = CreateNvBuffers(1, NVBUF_COLOR_FORMAT_RGBA, width, height, cuGraphicsEGLRegisterUpFront);
CreateNvBufferWrapper(nvbufSurf->surfaceList[0].bufferDesc, memFlagsInputs, &imgInput);
}
else
{
CHECK_STATUS(vpiImageCreate(convertFromRgbaWidth, height, VPI_IMAGE_FORMAT_RGBA8, memFlagsInputs, &imgInput));
}
}
VpiImagePrintFormat(imgInput, "imgInput");
CHECK_STATUS(vpiCreateTemporalNoiseReduction(tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));
VPITNRParams params;
CHECK_STATUS(vpiInitTemporalNoiseReductionParams(¶ms));
temp = getenv("preset");
if(temp)
{
params.preset = (VPITNRPreset)strtol(temp, nullptr, 10);
}
temp = getenv("strength");
if(temp)
{
params.strength = strtod(temp, nullptr);
}
printf("tnr params preset: %d strength: %lf\n", (int)params.preset, (double)params.strength);
temp = getenv("repeatOneFrame");
bool repeatOneFrame = temp && *temp == '1';
temp = getenv("dropResults");
bool dropResults = temp && *temp == '1';
VPIEvent evStart = NULL;
VPIEvent evEnd = NULL;
CHECK_STATUS(vpiEventCreate(backend, &evStart));
CHECK_STATUS(vpiEventCreate(backend, &evEnd));
printf("Run loop\n");
long long filePos = 0;
for(frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
{
uint64_t timeStart = getTimeNS();
if(!repeatOneFrame || frameOrdinal == 0)
{
//This is one way to read file: lock VPI image and read directly to it.
VPIImageData imgdata;
CHECK_STATUS(vpiImageLockData(imgInput, VPI_LOCK_WRITE, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
int frameSize {};
for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
{
int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
inFile.read((char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
frameSize += size;
if(printFormat && frameOrdinal == 0)
{
printf("planeIdx %d width %d height %d pitchBytes %d size %d total %d\n", planeIdx,
imgdata.buffer.pitch.planes[planeIdx].width,
imgdata.buffer.pitch.planes[planeIdx].height,
imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
size, frameSize);
}
}
CHECK_STATUS(vpiImageUnlock(imgInput));
if(!inFile)
{
printf("Failed to read frame of size %d at pos %lld\n", frameSize, filePos);
return -1;
}
filePos += frameSize;
//printf("frameOrdinal %d frameSize %d filePos %lld\n", frameOrdinal, frameSize, filePos);
if(filePos == inFileSize)
{
filePos = 0;
//printf("seekg 0\n");
inFile.seekg (0, inFile.beg);
}
}
uint64_t timeStartProc = getTimeNS();
VPIImage from = imgInput;
if(convertFromRgbaWidth)
{
CHECK_STATUS(vpiSubmitConvertImageFormat(stream,
convertCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC,
imgInput, imgInputConverted, nullptr));
from = imgInputConverted;
}
CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, tnrCuda? VPI_BACKEND_CUDA : VPI_BACKEND_VIC, tnr,
frameOrdinal == 0 ? nullptr: imgPrevious, from, imgOutput, ¶ms));
CHECK_STATUS(vpiStreamSync(stream));
if(frameOrdinal >= skipFrames)//Do not count first few frames
{
statPerFrameVpi.Add( (int)(getTimeNS() - timeStartProc) );
}
if(!dropResults)
{
VPIImageData imgdata;
CHECK_STATUS(vpiImageLockData(imgOutput, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
{
int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
outFile.write((const char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
if(printFormat && frameOrdinal == 0)
{
printf("imgOutput %p planeIdx %d width %d height %d pitchBytes %d data %p\n",
imgOutput, planeIdx,
imgdata.buffer.pitch.planes[planeIdx].width,
imgdata.buffer.pitch.planes[planeIdx].height,
imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
imgdata.buffer.pitch.planes[planeIdx].data
);
}
}
CHECK_STATUS(vpiImageUnlock(imgOutput));
}
std::swap(imgPrevious, imgOutput);
}//for(int frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
printf("repeatOneFrame=%d dropResults=%d\n", repeatOneFrame, dropResults);
statPerFrameVpi.Print("statPerFrameVpi (ms)", 1E-6);//Includes convert, tnr, sync
vpiStreamDestroy(stream);
vpiPayloadDestroy(tnr);
vpiImageDestroy(imgPrevious);
vpiImageDestroy(imgInput);
vpiImageDestroy(imgOutput);
vpiImageDestroy(imageCvWrapper);
return 0;
}