Below is my repro of the VPI Image Format Converter benchmark (RGBA8 to NV12, 1920x1080), running on a Jetson AGX Xavier with JetPack 4.4. The conversion takes around 0.45 ms. How can I speed it up to match the documented 0.1447 ms?
//usr/bin/g++ $0 -lnvvpi && ./a.out; exit
#include <vpi/VPI.h>
#include <vpi/algo/ImageFormatConverter.h>
#include <vector>
#include <cstring> // memset
#include <stdio.h>
int main() {
uint32_t width = 1920, height = 1080;
std::vector<uint32_t> rgbaSrcBuffer(width*height, ~0);
VPIImageData hostData;
memset(&hostData, 0, sizeof(hostData));
hostData.type = VPI_IMAGE_TYPE_RGBA8;
hostData.numPlanes = 1;
hostData.planes[0].width = width;
hostData.planes[0].height = height;
hostData.planes[0].rowStride = width * sizeof(uint32_t);
hostData.planes[0].pixelType = VPI_PIXEL_TYPE_4U8;
hostData.planes[0].data = rgbaSrcBuffer.data();
VPIImage srcRGBA, dstNV12, dstRGBA;
vpiImageWrapHostMem(&hostData, VPI_IMAGE_ONLY_CUDA, &srcRGBA);
vpiImageCreate(width, height, VPI_IMAGE_TYPE_NV12, VPI_IMAGE_ONLY_CUDA, &dstNV12);
vpiImageCreate(width, height, VPI_IMAGE_TYPE_RGBA8, VPI_IMAGE_ONLY_CUDA, &dstRGBA);
VPIStream stream;
vpiStreamCreate(VPI_DEVICE_TYPE_CUDA, &stream);
VPIEvent start, end;
vpiEventCreate(0, &start);
vpiEventCreate(0, &end);
VPIConversionPolicy convPolicy = VPI_CONVERSION_CAST;
float scale = 1.f, offset = 0.f;
vpiSubmitImageFormatConverter(stream, srcRGBA, dstNV12, convPolicy, scale, offset);
for (int i=0; i<500; i++) {
vpiSubmitImageFormatConverter(stream, dstNV12, dstRGBA, convPolicy, scale, offset);
vpiSubmitImageFormatConverter(stream, dstRGBA, dstNV12, convPolicy, scale, offset);
}
vpiSubmitImageFormatConverter(stream, dstNV12, dstRGBA, convPolicy, scale, offset);
vpiEventRecord(start, stream);
vpiSubmitImageFormatConverter(stream, dstRGBA, dstNV12, convPolicy, scale, offset);
vpiEventRecord(end, stream);
vpiEventSync(end);
float msec = -1.f;
vpiEventElapsedTime(start, end, &msec);
printf("Convert From NV12 time: %f ms\n", msec);
return 0;
}
/* https://docs.nvidia.com/vpi/algo_imageconv.html#autotoc_md47
Jetson AGX Xavier
size input output conv. scale offset CPU CUDA PVA
1920x1080 rgba8 nv12 cast 1 0 7.4 ms 0.1447 ms n/a
$ sudo nvpmodel -m0 && sudo jetson_clocks && sudo jetson_clocks --show && ./a.out && ./a.out && ./a.out
SOC family:tegra194 Machine:Jetson-AGX
Online CPUs: 0-7
CPU Cluster Switching: Disabled
cpu0: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu1: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu2: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu3: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu4: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu5: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu6: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
cpu7: Online=1 Governor=schedutil MinFreq=2265600 MaxFreq=2265600 CurrentFreq=2265600 IdleStates: C1=0 c6=0
GPU MinFreq=1377000000 MaxFreq=1377000000 CurrentFreq=1377000000
EMC MinFreq=204000000 MaxFreq=2133000000 CurrentFreq=2133000000 FreqOverride=1
Fan: speed=77
NV Power Mode: MAXN
Convert From NV12 time: 0.412320 ms
Convert From NV12 time: 0.464224 ms
Convert From NV12 time: 0.425632 ms
*/