Hello,

I have been using jetson-inference as a base for our application development. The repo also provides many cuda kernels for different video format conversion (which is very helpful, thank you!).

However, I notice that there is one kind of conversion missing, which is **I420 -> RGBA**. I am wondering if it was now implemented due to some specific reason (like the conversion would be not very efficient in terms of image quality) or no.

I tried to write a kernel myself but it turns out that there are some artifacts in the output.

Here is the kernel:

```
__device__ void yuv2rgb(uint32_t* yuvi, float *red, float* green, float* blue){
const float luma = float(yuvi[0]);
const float u = float(yuvi[1]) - 128.0f;
const float v = float(yuvi[2]) - 128.0f;
*red = luma + 1.140f * v;
*green = luma - 0.395f * u - 0.581f * v;
*blue = luma + 2.3032f *u;
}
__global__ void I420ToRGBAf(uint32_t* srcImage, size_t nSourcePitch,
float4* dstImage, size_t nDestPitch,
uint32_t width, uint32_t height) {
int x, y;
uint8_t *srcImageU8 = (uint8_t *)srcImage;
uint32_t processingPitch = nSourcePitch;
x = blockIdx.x * blockDim.x + threadIdx.x;
y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width)
return; //x = width - 1;
if (y >= height)
return; // y = height - 1;
// This link helps understading of the structure
// https://en.wikipedia.org/wiki/YUV#Y%E2%80%B2UV420p_(and_Y%E2%80%B2V12_or_YV12)_to_RGB888_conversion
// Can parallalize/ optimize so that there are 4 rgba conversions
// as 4 Y's share a single U and V ?
uint32_t yuvi[3];
uint32_t offset = processingPitch * height;
yuvi[0] = srcImageU8[y * processingPitch + x];
yuvi[1] = srcImageU8[(y * processingPitch)/4 + x/2 + offset];
yuvi[2] = srcImageU8[(y * processingPitch)/4 + x/2 + offset + offset/4];
float r, g, b;
yuv2rgb(&yuvi[0], &r, &g, &b);
dstImage[y * width + x] = make_float4(r, g, b, 1.0f);
}
cudaError_t cudaI420ToRGBAf(uint8_t* srcDev, size_t srcPitch, float4* destDev,
size_t destPitch, size_t width, size_t height) {
if( !srcDev || !destDev )
return cudaErrorInvalidDevicePointer;
if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
return cudaErrorInvalidValue;
const dim3 blockDim(8,8,1);
//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1);
I420ToRGBAf<<<gridDim, blockDim>>>( (uint32_t*)srcDev, srcPitch, destDev, destPitch, width, height );
return CUDA(cudaGetLastError());
}
```

I have attached two images, one as a result of this kernel and one as it should be expected.

Could anyone help me spot my bug or let me know if there is anyting wrong in my fundamental understanding of I420 format?