@SivaRamaKrishnaNV
I have merged my own CPU loop function into the img_2d sample source code.
Please apply the patch below on your side.
As you can see, the patch mainly includes the following two changes:
- Force compiler optimization with -O3
- Read the original file into src_img → call yuv422pTo420pLoop and store the result in dst_img
diff --git a/nvmedia/img_2d/Makefile b/nvmedia/img_2d/Makefile
index 5452889..c1e7b9f 100644
--- a/nvmedia/img_2d/Makefile
+++ b/nvmedia/img_2d/Makefile
@@ -10,7 +10,7 @@ include ../../../make/nvdefs.mk
TARGETS = nvmimg_2d
-CFLAGS := $(NV_PLATFORM_OPT) $(NV_PLATFORM_CFLAGS) -I. -I../utils
+CFLAGS := $(NV_PLATFORM_OPT) $(NV_PLATFORM_CFLAGS) -O3 -I. -I../utils
CFLAGS += -DNVMEDIA_NVSCI_ENABLE
CPPFLAGS := $(NV_PLATFORM_SDK_INC) $(NV_PLATFORM_CPPFLAGS)
LDFLAGS := $(NV_PLATFORM_SDK_LIB) $(NV_PLATFORM_LDFLAGS)
diff --git a/nvmedia/img_2d/image2d.c b/nvmedia/img_2d/image2d.c
index f060a7f..1a9970a 100644
--- a/nvmedia/img_2d/image2d.c
+++ b/nvmedia/img_2d/image2d.c
@@ -64,6 +64,23 @@ destroySurface(NvMediaImage *image)
NvMediaImageDestroy(image);
}
+void yuv422pTo420pLoop(uint8_t* src, uint8_t* dst, int width, int height) {
+ // Copy the Y plane unchanged: width * height bytes from src to dst.
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width; col++) {
+ *dst++=*src++;
+ }
+ }
+
+ // Copy the chroma data: each pass keeps width/2 bytes and skips the next width/2 (presumably every other row of the planar 4:2:2 U and V planes - confirm layout).
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width / 2; col++) {
+ *dst++=*src++;
+ }
+ src += width / 2;
+ }
+}
+
static NvMediaStatus
blit2DImage(Blit2DTest *ctx, TestArgs* args)
{
@@ -71,7 +88,29 @@ blit2DImage(Blit2DTest *ctx, TestArgs* args)
NvMediaImageSurfaceMap surfaceMap;
uint64_t startTime,endTime;
uint64_t end1Time;
- double processingTime;
+ double processingTime = 0;
+ uint8_t * src_img;
+ uint8_t * dst_img;
+ uint32_t size = args->srcSurfAllocAttrs[0].value * args->srcSurfAllocAttrs[1].value * 2;
+
+ FILE * file_ptr = NULL;
+ file_ptr = fopen(args->inputFileName, "rb");
+ src_img = malloc(size);
+ dst_img = malloc(size);
+ if(fread(src_img, size, 1, file_ptr) != 1) {
+ LOG_ERR("%s: Error reading file: %s\n", __func__, args->inputFileName);
+ }
+ GetTimeMicroSec(&startTime);
+ yuv422pTo420pLoop(src_img, dst_img, args->srcSurfAllocAttrs[0].value, args->srcSurfAllocAttrs[1].value);
+ GetTimeMicroSec(&endTime);
+ processingTime = 0;
+ processingTime += (double)(endTime - startTime)/1000.0;
+
+ LOG_INFO("Current Allocate size is %d(%d*%d)\n", size, args->srcSurfAllocAttrs[0].value, args->srcSurfAllocAttrs[1].value);
+ LOG_INFO("CPU Loop Processing time per frame %.4f ms \n", processingTime);
+ fclose(file_ptr);
+ free(src_img);
+ free(dst_img);
processingTime = 0;
status = ReadImage(args->inputFileName, /* fileName */
The default optimization level for nvmimg_2d is -O2.
- Performance result with -O2 option
nvmedia: createSurface: NvMediaImageCreate:: Image size: 1280x720 Image type: 42
nvmedia: Current Allocate size is 1843200(1280*720)
nvmedia: CPU Loop Processing time per frame 1.3570 ms
nvmedia: WriteImage : Saving output image into file...
nvmedia: Processing time per frame 1.1920 ms
- Performance result with -O3 option
nvmedia: createSurface: NvMediaImageCreate:: Image size: 1280x720 Image type: 42
nvmedia: createSurface: NvMediaImageCreate:: Image size: 1280x720 Image type: 25
nvmedia: Current Allocate size is 1843200(1280*720)
nvmedia: CPU Loop Processing time per frame 0.7680 ms
nvmedia: WriteImage : Saving output image into file...
nvmedia: Processing time per frame 1.3410 ms
Therefore, we found that the NVIDIA hardware engine may not work as fast as we expected (as the FPGA did). Could this be a timing issue in synchronizing the hardware engine with the CPU?
For now, the hardware engine's performance result is even slower than the CPU method.