I already mentioned this failure in my previous post, "How to prevent vpiSubmitConvertImageFormat from calling cudaGraphicsEGLRegisterImage, which kills performance?"
I understand that VPI algos on images created with vpiImageCreateWrapper may be somewhat slower.
But I was not able to make vpiSubmitTemporalNoiseReduction work at all on a VPIImage that wraps preallocated CUDA memory.
See the attached sample code. You can run it to use regular vpiImageCreate and call vpiSubmitTemporalNoiseReduction, which succeeds:
width=256 height=256 ./tnr_wrap # Use vpiImageCreate
And then run:
useWrapper=1 width=256 height=256 ./tnr_wrap # Use vpiImageCreateWrapper
# Prints:
terminate called after throwing an instance of 'std::runtime_error'
what(): vpiSubmitTemporalNoiseReduction(stream, backend, tnr, imgPrevious, imgInput, imgOutput, &params)
VPI_ERROR_INVALID_ARGUMENT: Previous frame must have the same format configured during payload creation
Note that I printed detailed information about created images and the information is identical in both cases (except for CUDA addresses):
imgInput vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205807000
plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205817000
imgPrevious vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205867000
plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205877000
imgOutput vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205837000
plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205847000
So why does vpiSubmitTemporalNoiseReduction fail?
And how to correct my code to make vpiSubmitTemporalNoiseReduction work on preallocated CUDA memory?
Thank you
Save this file to tnr_wrap.cpp
# Compile
g++ -o tnr_wrap -I/usr/local/cuda-12.6/targets/aarch64-linux/include \
./tnr_wrap.cpp -L/usr/local/cuda-12.6/targets/aarch64-linux/lib/ -lnvvpi -lcudart
width=256 height=256 ./tnr_wrap # Use vpiImageCreate
# Prints ok
useWrapper=1 width=256 height=256 ./tnr_wrap # Use vpiImageCreateWrapper
# Prints:
terminate called after throwing an instance of 'std::runtime_error'
what(): vpiSubmitTemporalNoiseReduction(stream, backend, tnr, imgPrevious, imgInput, imgOutput, &params)
VPI_ERROR_INVALID_ARGUMENT: Previous frame must have the same format configured during payload creation
#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/TemporalNoiseReduction.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
// CHECK_STATUS(STMT)
//
// Evaluates STMT (an expression yielding a VPIStatus) exactly once. When the
// result is not VPI_SUCCESS, throws std::runtime_error whose message is the
// stringified statement on the first line, followed by
// "<status name>: <last VPI status message>".
//
// The macro expands to a single `do { ... } while (0)` statement WITHOUT a
// trailing semicolon, so `CHECK_STATUS(x);` behaves like one ordinary
// statement, including inside an unbraced if/else.
//
// The local is named vpiCheckStatus_ so it cannot collide with a variable
// called `status` that STMT itself might reference.
#define CHECK_STATUS(STMT)                                              \
    do                                                                  \
    {                                                                   \
        VPIStatus vpiCheckStatus_ = (STMT);                             \
        if (vpiCheckStatus_ != VPI_SUCCESS)                             \
        {                                                               \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];                 \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));            \
            std::ostringstream ss;                                      \
            ss << "" #STMT "\n";                                        \
            ss << vpiStatusGetName(vpiCheckStatus_) << ": " << buffer;  \
            throw std::runtime_error(ss.str());                         \
        }                                                               \
    } while (0)
void CreateImageWrapper(int width, int height, VPIImageFormat imgFormat, uint64_t backend, VPIImage * vpiImage)
assert(imgFormat == VPI_IMAGE_FORMAT_NV12_ER);
int pitch = ((width + 255)/256)*256;
void * cudaPtrY {}, * cudaPtrUV;
assert(cudaSuccess == cudaMalloc(&cudaPtrY, pitch * height));
assert(cudaSuccess == cudaMalloc(&cudaPtrUV, pitch * height/2));
VPIImageData vpiImageData;
vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_NV12_ER;
vpiImageData.buffer.pitch.numPlanes = 2;
vpiImageData.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_U8;
vpiImageData.buffer.pitch.planes[0].width = width;
vpiImageData.buffer.pitch.planes[0].height = height;
vpiImageData.buffer.pitch.planes[0].pitchBytes = pitch;
vpiImageData.buffer.pitch.planes[0].data = cudaPtrY;
vpiImageData.buffer.pitch.planes[1].pixelType = VPI_PIXEL_TYPE_2U8;
vpiImageData.buffer.pitch.planes[1].width = width / 2;
vpiImageData.buffer.pitch.planes[1].height = height / 2;
vpiImageData.buffer.pitch.planes[1].pitchBytes = pitch;
vpiImageData.buffer.pitch.planes[1].data = cudaPtrUV;
VPIImageWrapperParams wrapperParams;
wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;
CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, backend, vpiImage));
// Prints the format, size and per-plane layout of `image` to stdout, each
// line prefixed as shown in the sample output below.
//
// Parameters:
//   image   - VPI image to describe.
//   comment - label printed at the start of the first line.
//
// Returns true on success; any failing VPI call throws std::runtime_error via
// CHECK_STATUS.
//
// VPIImageFormat and VPIPixelType values are printed as full 64-bit hex
// numbers. The earlier `(int)` casts dropped the upper 32 bits, so two images
// whose formats differ only in those bits printed identically.
//
// NOTE(review): the lock/unlock calls and the trailing printf arguments were
// not present in the pasted source and are reconstructed from the sample
// output; confirm the buffer type passed to vpiImageLockData matches the
// original program.
bool VpiImagePrintFormat(VPIImage image, const char * comment)
{
    VPIImageFormat format {};
    CHECK_STATUS(vpiImageGetFormat(image, &format));

    int32_t imageWidth {}, imageHeight {};
    CHECK_STATUS(vpiImageGetSize(image, &imageWidth, &imageHeight));

    uint32_t fourCC = vpiImageFormatGetFourCC(format);
    int numPlanes = vpiImageFormatGetPlaneCount(format);

    // %.4s reads at most the four FourCC bytes, so no terminator is needed.
    printf("%s vpiImageGetFormat ret 0x%llx %.4s Image Size %dx%d numPlanes %d\n",
           comment, (unsigned long long)format, (char*)&fourCC,
           (int)imageWidth, (int)imageHeight,
           numPlanes);

    VPIImageData imgdata;
    CHECK_STATUS(vpiImageLockData(image, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &imgdata));

    for(int planeIdx = 0; planeIdx < numPlanes; planeIdx++)
    {
        // First triple: what the format says the plane should look like.
        // Second triple: what the locked buffer actually reports.
        printf("plane #%d: type 0x%llx %s size %dx%d 0x%llx %d %d pitch %d %p\n",
               planeIdx,
               (unsigned long long)vpiImageFormatGetPlanePixelType(format, planeIdx),
               vpiPixelTypeGetName(vpiImageFormatGetPlanePixelType(format, planeIdx)),
               (int)vpiImageFormatGetPlaneWidth(format, imageWidth, planeIdx),
               (int)vpiImageFormatGetPlaneHeight(format, imageHeight, planeIdx),
               (unsigned long long)imgdata.buffer.pitch.planes[planeIdx].pixelType,
               (int)imgdata.buffer.pitch.planes[planeIdx].width,
               (int)imgdata.buffer.pitch.planes[planeIdx].height,
               (int)imgdata.buffer.pitch.planes[planeIdx].pitchBytes,
               (const void*)imgdata.buffer.pitch.planes[planeIdx].data);
    }

    CHECK_STATUS(vpiImageUnlock(image));

    //Sample output for NV12 (captured with the earlier 32-bit-truncated hex fields)
    //imgInput vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
    //plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205807000
    //plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205817000
    return true;
}//bool VpiImagePrintFormat
int main(int argc, char *argv[])
bool useVic = getenv("VPI_BACKEND_VIC") != nullptr;
uint64_t backend = useVic? VPI_BACKEND_VIC : VPI_BACKEND_CUDA;
bool useWrapper = getenv("useWrapper") != nullptr;
const char * temp = getenv("width");
int width = temp? strtol(temp, nullptr, 10) : 256;
temp = getenv("height");
int height = temp? strtol(temp, nullptr, 10) : 256;
VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
VPIStream stream = NULL;
CHECK_STATUS(vpiStreamCreate(backend, &stream));
VPIPayload tnr = NULL;
CHECK_STATUS(vpiCreateTemporalNoiseReduction(backend, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));
VPITNRParams params;
VPIImage imgInput, imgOutput, imgPrevious;
CreateImageWrapper(width, height, imgFormat, backend, &imgInput);
CreateImageWrapper(width, height, imgFormat, backend, &imgOutput);
CreateImageWrapper(width, height, imgFormat, backend, &imgPrevious);
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, backend, &imgInput));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, backend, &imgOutput));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, backend, &imgPrevious));
VpiImagePrintFormat(imgInput, "imgInput");
VpiImagePrintFormat(imgOutput, "imgOutput");
VpiImagePrintFormat(imgPrevious, "imgPrevious");
CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, backend, tnr, imgPrevious, imgInput, imgOutput, ¶ms));