I’m trying to apply a custom conversion matrix to convert NV12 frames to RGB using nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx with NPP 12.3.
The documentation mentions that:
This is how the matrix works for the YUV420/YUV/422/NV12->RGB INVERSE
transform (note- do the offsets first):
src[0]' = src[0] + aTwist[0][3]
src[1]' = src[1] + aTwist[1][3]
src[2]' = src[2] + aTwist[2][3]
And then the remaining 3x3 twist matrix is applied using those modified values:
dst[0] = aTwist[0][0] * src[0]' + aTwist[0][1] * src[1]' + aTwist[0][2] * src[2]'
dst[1] = aTwist[1][0] * src[0]' + aTwist[1][1] * src[1]' + aTwist[1][2] * src[2]'
dst[2] = aTwist[2][0] * src[0]' + aTwist[2][1] * src[1]' + aTwist[2][2] * src[2]'
However it seems that offsets are actually applied after the twist matrix, in the same way they are applied in the reverse transformation:
dst[0] = aTwist[0][0] * src[0] + aTwist[0][1] * src[1] + aTwist[0][2] * src[2] + aTwist[0][3]
dst[1] = aTwist[1][0] * src[0] + aTwist[1][1] * src[1] + aTwist[1][2] * src[2] + aTwist[1][3]
dst[2] = aTwist[2][0] * src[0] + aTwist[2][1] * src[1] + aTwist[2][2] * src[2] + aTwist[2][3]
This is a minimal example to reproduce the issue:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <npp.h>
#define SIZE 2
int main(int argc, char** argv) {
Npp8u* yuv[2];
int yuv_size[2] = {SIZE * SIZE, (SIZE / 2) * SIZE};
int yuv_stride[2] = {SIZE, SIZE};
cudaMalloc((void**) &yuv[0], yuv_size[0] * sizeof(Npp8u));
cudaMalloc((void**) &yuv[1], yuv_size[1] * sizeof(Npp8u));
cudaMemset(yuv[0], 128, yuv_size[0] * sizeof(Npp8u));
cudaMemset(yuv[1], 128, yuv_size[1] * sizeof(Npp8u));
Npp8u* rgb;
int rgb_size = SIZE * SIZE * 3;
int rgb_stride = SIZE;
cudaMalloc((void**) &rgb, rgb_size * sizeof(Npp8u));
Npp32f twist[3][4] = {
{0,0,0,100},
{0,0,0,0},
{0,0,0,0},
};
NppStreamContext nppStreamCtx;
nppGetStreamContext(&nppStreamCtx);
NppiSize roi = {SIZE, SIZE};
nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx(yuv, yuv_stride, rgb, rgb_stride, roi, twist, nppStreamCtx);
Npp8u* rgb_host = (Npp8u*) malloc (rgb_size);
cudaMemcpy(rgb_host, rgb, rgb_size * sizeof(Npp8u), cudaMemcpyDeviceToHost);
for (int i = 0; i < SIZE * SIZE; i++) {
printf("%d) %d %d %d\n", i, rgb_host[i * 3], rgb_host[i * 3 + 1], rgb_host[i * 3 + 2]);
}
cudaFree(yuv[0]);
cudaFree(yuv[1]);
cudaFree(rgb);
return 0;
}
If the offsets were applied before the twist matrix, the resulting RGB image should contain only zeros, but instead I get:
0) 100 0 100
1) 0 0 100
2) 0 0 0
3) 0 0 0
So it looks like the offsets are applied after the multiplications. This makes it impossible to use this function to convert from NV12.