A common mistake when writing benchmarks is to call the function in question in a loop on constant input data, or to never use its results.
In the first case, an optimizing compiler or runtime engine may call the function only once and reuse the result.
In the second case, it may skip calling the function completely because the results are not used.
I wrote the shortest possible example, which reads real raw YUV frames from a file (in tmpfs), calls vpiSubmitTemporalNoiseReduction and vpiStreamSync, and saves the result to another YUV file.
If I run it on 1000 frames I get:
statPerFrameTnr (ms) count 998 av 4.350390 min 1.130550 max 4.923569
As you can see, it is still almost 5 times slower than in your benchmark.
But if I run with parameter repeatOneFrame=1, I get:
statPerFrameTnr (ms) count 998 av 2.753826 min 1.106198 max 2.947012
now it is about 30% faster
If I run with parameter dropResults=1, then:
statPerFrameTnr (ms) count 998 av 1.939635 min 1.049718 max 2.067214
If I run with parameters repeatOneFrame=1 dropResults=1, then:
statPerFrameTnr (ms) count 998 av 0.992015 min 0.976600 max 1.092695
Now results are close to your published results, but these are totally fake results,
because neither input nor output images are accessed.
I tried using Nsight Systems, as you suggested, but it only confirmed my measurements:
TemporalNoiseReduction takes about 5 ms instead of 1 ms:
18.9 5,547,686,816 1,000 5,547,686.8 5,603,200.0 1,326,816 6,224,288 466,479.6 PushPop VPI:vpiStreamSync
18.1 5,311,714,592 1,000 5,311,714.6 5,363,840.0 1,193,536 5,996,288 453,219.4 PushPop VPI:TemporalNoiseReduction
16.5 4,819,926,560 1,000 4,819,926.6 4,875,072.0 788,416 4,968,224 443,469.7 PushPop VPI:sync tegra
Please help me optimize this minimal code, or provide another example that uses real images, applies TNR, actually uses the results, and runs in a time reasonably close to your benchmarks.
Thank you
/*
Usage:
g++ -o tnr_file ./tnr_file.cpp -lnvvpi
sudo mkdir /mnt/tmpfs
sudo chown $USER:$USER /mnt/tmpfs
sudo mount -t tmpfs -o size=16g tmpfs /mnt/tmpfs
gst-launch-1.0 filesrc location=/opt/nvidia/vpi3/samples/assets/noisy.mp4 ! qtdemux ! queue ! h264parse ! avdec_h264 ! \
nvvidconv ! 'video/x-raw, format=YUY2, width=1920, height=1080' ! \
filesink location=/mnt/tmpfs/out.yuv -e
sudo ./clocks.sh --max
VPI_IMAGE_FORMAT_YUYV_ER=1 strength=1 inFile=/mnt/tmpfs/out.yuv outFile=/mnt/tmpfs/out2.yuv \
width=1920 height=1080 numFrames=1000 ./tnr_file
Result:
statPerFrameTnr (ms) count 98 av 3.967568 min 1.138996 max 4.501203
statPerFrameTnr (ms) count 998 av 4.350390 min 1.130550 max 4.923569
DISPLAY=:0 ffplay -v info -f rawvideo -pixel_format yuyv422 -video_size 1920x1080 /mnt/tmpfs/out2.yuv
repeatOneFrame=1 VPI_IMAGE_FORMAT_YUYV_ER=1 strength=1 inFile=/mnt/tmpfs/out.yuv outFile=/mnt/tmpfs/out2.yuv \
width=1920 height=1080 numFrames=1000 ./tnr_file
Result:
statPerFrameTnr (ms) count 98 av 2.559179 min 1.067350 max 2.887748
statPerFrameTnr (ms) count 998 av 2.753826 min 1.106198 max 2.947012
dropResults=1 VPI_IMAGE_FORMAT_YUYV_ER=1 strength=1 inFile=/mnt/tmpfs/out.yuv outFile=/mnt/tmpfs/out2.yuv \
width=1920 height=1080 numFrames=1000 ./tnr_file
Result:
statPerFrameTnr (ms) count 98 av 1.824454 min 1.064086 max 1.985774
statPerFrameTnr (ms) count 998 av 1.939635 min 1.049718 max 2.067214
repeatOneFrame=1 dropResults=1 VPI_IMAGE_FORMAT_YUYV_ER=1 strength=1 inFile=/mnt/tmpfs/out.yuv outFile=/mnt/tmpfs/out2.yuv \
width=1920 height=1080 numFrames=1000 ./tnr_file
statPerFrameTnr (ms) count 998 av 0.992015 min 0.976600 max 1.092695
*/
#include <time.h> // clock_gettime (POSIX)
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring> // for memset
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <vpi/Event.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/algo/TemporalNoiseReduction.h>
/**
 * Returns the current CLOCK_MONOTONIC reading in nanoseconds.
 *
 * Only differences between two readings are meaningful.
 * If clock_gettime() fails, the zero-initialised timespec yields 0.
 */
uint64_t getTimeNS()
{
    struct timespec ts {};
    clock_gettime(CLOCK_MONOTONIC, &ts);
    // Widen to 64 bits BEFORE multiplying: with a 32-bit time_t the product
    // tv_sec * 1e9 would overflow a signed integer.
    return static_cast<uint64_t>(ts.tv_sec) * 1000000000ULL + static_cast<uint64_t>(ts.tv_nsec);
}
/**
 * Running count / min / max / total accumulator for integer samples.
 *
 * Samples are stored as long long so that values which do not fit in a
 * 32-bit int (e.g. nanosecond durations longer than ~2.1 s) are kept intact.
 */
struct Stat
{
    /// Records one sample, updating min, max, count and total.
    void Add(long long value)
    {
        // The first sample always initialises both extremes.
        if(this->count == 0 || value < this->min)
        {
            this->min = value;
        }
        if(this->count == 0 || value > this->max)
        {
            this->max = value;
        }
        this->count++;
        this->total += value;
    }
    /**
     * Prints "<name> count N av A min M max X" to stdout.
     *
     * @param name  Label printed at the start of the line.
     * @param ratio Multiplier applied to the average, min and max before
     *              printing (unit conversion). The average is 0.0 when no
     *              sample has been recorded.
     */
    void Print(const char * name, double ratio) const
    {
        const double average = this->count ? (this->total * ratio / this->count) : 0.0;
        printf("%s count %d av %lf min %lf max %lf\n",
               name, this->count, average, ratio * this->min, ratio * this->max);
    }
    long long min {-1};   // smallest sample seen; -1 until the first Add()
    long long max {-1};   // largest sample seen; -1 until the first Add()
    int count {0};        // number of samples recorded
    long long total {0};  // sum of all samples
};
// Global accumulator for the per-frame timing samples.
Stat statPerFrameTnr;
// Evaluates STMT (an expression yielding a VPIStatus) exactly once and throws
// std::runtime_error if the result is not VPI_SUCCESS. The exception message
// contains the stringified statement, the status name and the text returned by
// vpiGetLastStatusMessage.
// The macro deliberately does NOT end with a semicolon: the caller supplies it,
// so `CHECK_STATUS(x);` is a single statement and is safe inside an unbraced
// if/else.
#define CHECK_STATUS(STMT)                                    \
    do                                                        \
    {                                                         \
        VPIStatus status = (STMT);                            \
        if (status != VPI_SUCCESS)                            \
        {                                                     \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
            std::ostringstream ss;                            \
            ss << "" #STMT "\n";                              \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str());               \
        }                                                     \
    } while (0)
int main(int argc, char *argv[])
{
VPIStream stream = NULL;
VPIImage imgPrevious = NULL, imgInput = NULL, imgOutput = NULL;
VPIImage imageCvWrapper = NULL;
VPIPayload tnr = NULL;
// main return value
int retval = 0;
VPIBackend backend {VPI_BACKEND_VIC};
const char * inFileName = getenv("inFile");
std::ifstream inFile(inFileName);
if(!inFile)
{
printf("Cannot open %s for reading\n", inFileName);
return -1;
}
inFile.seekg (0, inFile.end);
int inFileSize = inFile.tellg();
inFile.seekg (0, inFile.beg);
const char * outFileName = getenv("outFile");
std::ofstream outFile(outFileName);
if(!outFile)
{
printf("Cannot open %s for writing\n", outFileName);
return -1;
}
const char * temp = getenv("wifth");
int width = temp? strtol(temp, nullptr, 10) : 1920;
temp = getenv("height");
int height = temp? strtol(temp, nullptr, 10) : 1080;
temp = getenv("numFrames");
int numFrames = temp? strtol(temp, nullptr, 10) : 1000;
temp = getenv("printFormat");
int printFormat = temp && *temp == '1';
CHECK_STATUS(vpiStreamCreate(backend, &stream));
uint64_t memFlags {backend};
memFlags |= VPI_BACKEND_CPU;//Need this to lock images
temp = getenv("VPI_EXCLUSIVE_STREAM_ACCESS");
if(temp && *temp == '1')
{
memFlags |= VPI_EXCLUSIVE_STREAM_ACCESS;
}
temp = getenv("VPI_BACKEND_CUDA");
if(temp && *temp == '1')
{
memFlags |= VPI_BACKEND_CUDA;
}
VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
temp = getenv("VPI_IMAGE_FORMAT_YUYV_ER");
if(temp && *temp == '1')
{
imgFormat = VPI_IMAGE_FORMAT_YUYV_ER;
}
temp = getenv("VPI_IMAGE_FORMAT_UYVY_ER");
if(temp && *temp == '1')
{
imgFormat = VPI_IMAGE_FORMAT_UYVY_ER;
}
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgInput));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgPrevious));
CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &imgOutput));
CHECK_STATUS(vpiCreateTemporalNoiseReduction(backend, width, height, imgFormat, VPI_TNR_DEFAULT, &tnr));
VPITNRParams params;
CHECK_STATUS(vpiInitTemporalNoiseReductionParams(¶ms));
temp = getenv("preset");
if(temp)
{
params.preset = (VPITNRPreset)strtol(temp, nullptr, 10);
}
temp = getenv("strength");
if(temp)
{
params.strength = strtod(temp, nullptr);
}
printf("tnr params preset: %d strength: %lf\n", (int)params.preset, (double)params.strength);
temp = getenv("repeatOneFrame");
bool repeatOneFrame = temp && *temp == '1';
temp = getenv("dropResults");
bool dropResults = temp && *temp == '1';
VPIEvent evStart = NULL;
VPIEvent evEnd = NULL;
CHECK_STATUS(vpiEventCreate(backend, &evStart));
CHECK_STATUS(vpiEventCreate(backend, &evEnd));
printf("Run loop\n");
int filePos = 0;
for(int frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
{
if(!repeatOneFrame || frameOrdinal == 0)
{
//This is one way to read file: lock VPI image and read directly to it.
VPIImageData imgdata;
CHECK_STATUS(vpiImageLockData(imgInput, VPI_LOCK_WRITE, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
int frameSize {};
for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
{
int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
inFile.read((char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
frameSize += size;
if(printFormat && frameOrdinal == 0)
{
printf("planeIdx %d width %d height %d pitchBytes %d\n", planeIdx,
imgdata.buffer.pitch.planes[planeIdx].width,
imgdata.buffer.pitch.planes[planeIdx].height,
imgdata.buffer.pitch.planes[planeIdx].pitchBytes);
}
}
CHECK_STATUS(vpiImageUnlock(imgInput));
if(!inFile)
{
printf("Failed to read frame of size %d at pos %d\n", frameSize, filePos);
return -1;
}
filePos += frameSize;
//printf("frameOrdinal %d frameSize %d filePos %d\n", frameOrdinal, frameSize, filePos);
if(filePos == inFileSize)
{
filePos = 0;
//printf("seekg 0\n");
inFile.seekg (0, inFile.beg);
}
}
uint64_t timeStart = getTimeNS();
CHECK_STATUS(vpiSubmitTemporalNoiseReduction(stream, 0, tnr,
frameOrdinal == 0 ? nullptr: imgPrevious, imgInput, imgOutput, ¶ms));
CHECK_STATUS(vpiStreamSync(stream));
if(frameOrdinal >= 2)//Do not count first few frames
{
statPerFrameTnr.Add( (int)(getTimeNS() - timeStart) );
}
if(!dropResults)
{
VPIImageData imgdata;
CHECK_STATUS(vpiImageLockData(imgOutput, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
for(int planeIdx = 0; planeIdx < imgdata.buffer.pitch.numPlanes; planeIdx++)
{
int size = imgdata.buffer.pitch.planes[planeIdx].pitchBytes * imgdata.buffer.pitch.planes[planeIdx].height;
outFile.write((const char*)imgdata.buffer.pitch.planes[planeIdx].data, size);
}
CHECK_STATUS(vpiImageUnlock(imgOutput));
}
std::swap(imgPrevious, imgOutput);
}//for(int frameOrdinal = 0; frameOrdinal < numFrames; frameOrdinal++)
printf("repeatOneFrame=%d dropResults=%d\n", repeatOneFrame, dropResults);
statPerFrameTnr.Print("statPerFrameTnr (ms)", 1E-6);
vpiStreamDestroy(stream);
vpiPayloadDestroy(tnr);
vpiImageDestroy(imgPrevious);
vpiImageDestroy(imgInput);
vpiImageDestroy(imgOutput);
vpiImageDestroy(imageCvWrapper);
return 0;
}