Hi,
I already mentioned this failure in my previous post, "How to prevent vpiSubmitConvertImageFormat from calling cudaGraphicsEGLRegisterImage, which kills performance?".
I understand that VPI algos on images created with vpiImageCreateWrapper may be somewhat slower.
However, I was not able to make vpiSubmitTemporalNoiseReduction work at all on a VPIImage that wraps preallocated CUDA memory.
See the attached sample code. You can run it to use regular vpiImageCreate and call vpiSubmitTemporalNoiseReduction, which succeeds:
width=256 height=256 ./tnr_wrap # Use vpiImageCreate
And then run:
useWrapper=1 width=256 height=256 ./tnr_wrap # Use vpiImageCreateWrapper
# Prints:
terminate called after throwing an instance of 'std::runtime_error'
what(): vpiSubmitTemporalNoiseReduction(stream, backend, tnr, imgPrevious, imgInput, imgOutput, &params)
VPI_ERROR_INVALID_ARGUMENT: Previous frame must have the same format configured during payload creation
Aborted
Note that I printed detailed information about the created images, and it is identical in both cases (except for the CUDA addresses):
imgInput vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205807000
plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205817000
imgPrevious vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205867000
plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205877000
imgOutput vpiImageGetFormat ret 0xa2a10d1 NV12 Image Size 256x256 numPlanes 2
plane #0: type 0xffff1001 VPI_PIXEL_TYPE_U8 size 256x256 0xffff1001 256 256 pitch 256 0x205837000
plane #1: type 0xffff1011 VPI_PIXEL_TYPE_2U8 size 128x128 0xffff1011 128 128 pitch 256 0x205847000
So why does vpiSubmitTemporalNoiseReduction fail?
And how can I correct my code to make vpiSubmitTemporalNoiseReduction work on preallocated CUDA memory?
Thank you
/*
Usage:
Save this file to tnr_wrap.cpp
# Compile
g++ -o tnr_wrap -I/usr/local/cuda-12.6/targets/aarch64-linux/include \
./tnr_wrap.cpp -L/usr/local/cuda-12.6/targets/aarch64-linux/lib/ -lnvvpi -lcudart
width=256 height=256 ./tnr_wrap # Use vpiImageCreate
# Prints ok
useWrapper=1 width=256 height=256 ./tnr_wrap # Use vpiImageCreateWrapper
# Prints:
terminate called after throwing an instance of 'std::runtime_error'
what(): vpiSubmitTemporalNoiseReduction(stream, backend, tnr, imgPrevious, imgInput, imgOutput, &params)
VPI_ERROR_INVALID_ARGUMENT: Previous frame must have the same format configured during payload creation
Aborted
*/
#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/TemporalNoiseReduction.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cerrno>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
// Evaluates the VPI call STMT exactly once and throws std::runtime_error when it does not
// return VPI_SUCCESS. The exception text is the stringified statement on the first line,
// followed by "<status name>: <last VPI status message>".
//
// There is deliberately no ';' after `while (0)`: the caller's own semicolon completes the
// statement, so `if (c) CHECK_STATUS(x); else ...` compiles as expected.
// The local is given an unusual name so that STMT may itself refer to a variable named `status`.
#define CHECK_STATUS(STMT)                                              \
    do                                                                  \
    {                                                                   \
        const VPIStatus checkStatusResult_ = (STMT);                    \
        if (checkStatusResult_ != VPI_SUCCESS)                          \
        {                                                               \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];                 \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));            \
            std::ostringstream ss;                                      \
            ss << "" #STMT "\n";                                        \
            ss << vpiStatusGetName(checkStatusResult_) << ": " << buffer; \
            throw std::runtime_error(ss.str());                         \
        }                                                               \
    } while (0)
void CreateImageWrapper(int width, int height, VPIImageFormat imgFormat, uint64_t backend, VPIImage * vpiImage)
{
assert(imgFormat == VPI_IMAGE_FORMAT_NV12_ER);
int pitch = ((width + 255)/256)*256;
void * cudaPtrY {}, * cudaPtrUV;
assert(cudaSuccess == cudaMalloc(&cudaPtrY, pitch * height));
assert(cudaSuccess == cudaMalloc(&cudaPtrUV, pitch * height/2));
VPIImageData vpiImageData;
vpiImageData.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
vpiImageData.buffer.pitch.format = VPI_IMAGE_FORMAT_NV12_ER;
vpiImageData.buffer.pitch.numPlanes = 2;
vpiImageData.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_U8;
vpiImageData.buffer.pitch.planes[0].width = width;
vpiImageData.buffer.pitch.planes[0].height = height;
vpiImageData.buffer.pitch.planes[0].pitchBytes = pitch;
vpiImageData.buffer.pitch.planes[0].data = cudaPtrY;
vpiImageData.buffer.pitch.planes[1].pixelType = VPI_PIXEL_TYPE_2U8;
vpiImageData.buffer.pitch.planes[1].width = width / 2;
vpiImageData.buffer.pitch.planes[1].height = height / 2;
vpiImageData.buffer.pitch.planes[1].pitchBytes = pitch;
vpiImageData.buffer.pitch.planes[1].data = cudaPtrUV;
VPIImageWrapperParams wrapperParams;
wrapperParams.colorSpec = VPI_COLOR_SPEC_DEFAULT;
CHECK_STATUS(vpiImageCreateWrapper(&vpiImageData, &wrapperParams, backend, vpiImage));
}
/**
 * Prints the format, size and per-plane layout of a VPI image to stdout.
 *
 * One header line is followed by one line per plane:
 *   <comment> vpiImageGetFormat ret 0x<format> <fourcc> Image Size <W>x<H> numPlanes <N>
 *   plane #<i>: type 0x<pixel type from format> <name> size <W>x<H>
 *               0x<pixel type from buffer> <w> <h> pitch <bytes> <data pointer>
 * The "size" values are derived from the image format; the values after them are read from
 * the locked CUDA pitch-linear buffer, so a mismatch between the two is visible on one line.
 *
 * The format and pixel-type codes are printed through unsigned long long. Casting them to
 * int, as this function used to, drops any bits above the low 32, and two images that differ
 * only in those bits would then print identically (earlier logs such as "0xa2a10d1" and
 * "0xffff1001" show the low 32 bits only).
 * NOTE(review): this assumes VPIImageFormat / VPIPixelType are 64-bit in the VPI headers in
 * use — the widened print is harmless if they are not.
 *
 * @param image   Image to describe. It is locked for reading as CUDA pitch-linear and
 *                unlocked again before returning.
 * @param comment Label printed at the start of the header line.
 * @return Always true; failures of VPI calls are reported by CHECK_STATUS throwing
 *         std::runtime_error.
 */
bool VpiImagePrintFormat(VPIImage image, const char * comment)
{
    VPIImageFormat format {};
    CHECK_STATUS(vpiImageGetFormat(image, &format));

    int32_t imageWidth {}, imageHeight {};
    CHECK_STATUS(vpiImageGetSize(image, &imageWidth, &imageHeight));

    const uint32_t fourCC    = vpiImageFormatGetFourCC(format);
    const int      numPlanes = vpiImageFormatGetPlaneCount(format);

    // %.4s prints the four FourCC bytes in memory order without needing a terminator.
    std::printf("%s vpiImageGetFormat ret 0x%llx %.4s Image Size %dx%d numPlanes %d\n",
                comment, static_cast<unsigned long long>(format),
                reinterpret_cast<const char *>(&fourCC),
                static_cast<int>(imageWidth), static_cast<int>(imageHeight),
                numPlanes);

    VPIImageData imgdata;
    CHECK_STATUS(vpiImageLockData(image, VPI_LOCK_READ, VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, &imgdata));

    // Nothing between lock and unlock can throw, so the image is always unlocked again.
    for (int planeIdx = 0; planeIdx < numPlanes; planeIdx++)
    {
        const VPIPixelType formatPixelType = vpiImageFormatGetPlanePixelType(format, planeIdx);

        std::printf("plane #%d: type 0x%llx %s size %dx%d 0x%llx %d %d pitch %d %p\n",
                    planeIdx,
                    static_cast<unsigned long long>(formatPixelType),
                    vpiPixelTypeGetName(formatPixelType),
                    static_cast<int>(vpiImageFormatGetPlaneWidth(format, imageWidth, planeIdx)),
                    static_cast<int>(vpiImageFormatGetPlaneHeight(format, imageHeight, planeIdx)),
                    static_cast<unsigned long long>(imgdata.buffer.pitch.planes[planeIdx].pixelType),
                    static_cast<int>(imgdata.buffer.pitch.planes[planeIdx].width),
                    static_cast<int>(imgdata.buffer.pitch.planes[planeIdx].height),
                    static_cast<int>(imgdata.buffer.pitch.planes[planeIdx].pitchBytes),
                    imgdata.buffer.pitch.planes[planeIdx].data);
    }

    CHECK_STATUS(vpiImageUnlock(image));
    return true;
}//bool VpiImagePrintFormat
int main(int argc, char *argv[])
{
bool useVic = getenv("VPI_BACKEND_VIC") != nullptr;
uint64_t backend = useVic? VPI_BACKEND_VIC : VPI_BACKEND_CUDA;
bool useWrapper = getenv("useWrapper") != nullptr;
const char * temp = getenv("width");
int width = temp? strtol(temp, nullptr, 10) : 256;
temp = getenv("height");
int height = temp? strtol(temp, nullptr, 10) : 256;
VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
VPIStream stream = NULL;
CHECK_STATUS(vpiStreamCreate(backend, &stream));
VPIPayload tnr = NULL;
CHECK_STATUS(vpiCreateTemporalNoiseReduction(backend, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));
VPITNRParams params;
CHECK_STATUS(vpiInitTemporalNoiseReductionParams(¶ms));
VPIImage imgInput, imgOutput, imgPrevious;
if(useWrapper)
{
CreateImageWrapper(width, height, imgFormat, backend, &imgInput);
CreateImageWrapper(width, height, imgFormat, backend, &imgOutput);
CreateImageWrapper(width, height, imgFormat, backend, &imgPrevious);
}
else
{
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, backend, &imgInput));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, backend, &imgOutput));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, backend, &imgPrevious));
}
VpiImagePrintFormat(imgInput, "imgInput");
VpiImagePrintFormat(imgOutput, "imgOutput");
VpiImagePrintFormat(imgPrevious, "imgPrevious");
CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, backend, tnr, imgPrevious, imgInput, imgOutput, ¶ms));
printf("ok\n");
}