Help Using NV12M with TensorNet (jetson-inference)

Here’s my working kernel for converting NV12 to RGBAf based on the kernels in jetson-inference:

#define COLOR_COMPONENT_MASK            0x3FF
#define COLOR_COMPONENT_BIT_SIZE        10
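// the kernel packs each pixel's Y/Cb/Cr into one 32-bit word, 10 bits per component:
// each 8-bit sample is shifted left by 2 (promoted to the 10-bit range), with
// Y in bits 0-9, Cb in bits 10-19 and Cr in bits 20-29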

__global__ void myKernel(
    cudaSurfaceObject_t surfLuma,
    cudaSurfaceObject_t surfChroma,
    float4* dstImage,
    uint32_t width,
    uint32_t height)
{
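    // each thread converts two horizontally adjacent luma pixels (x, x+1) that
    // share one interleaved CbCr pair in the NV12 chroma plane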
    uint32_t yuv101010Pel[2];
    uint32_t x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
    uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x >= width) return;

    if(y >= height) return;

    yuv101010Pel[0] = surf2Dread<uint8_t>(surfLuma, x+0, y) << 2;
    yuv101010Pel[1] = surf2Dread<uint8_t>(surfLuma, x+1, y) << 2;

    int y_chroma = (y >> 1);
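    // NV12: the chroma plane is half height with interleaved CbCr, so the chroma row
    // is y >> 1; on odd luma rows the kernel averages the current and next chroma
    // rows as a simple vertical interpolation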

    if(y & 1)
    {
        uint32_t chromaCb;
        uint32_t chromaCr;

        chromaCb = surf2Dread<uint8_t>(surfChroma, x, y_chroma);
        chromaCr = surf2Dread<uint8_t>(surfChroma, x+1, y_chroma);

        if(y_chroma < ((height >> 1) - 1))
        {
            chromaCb = (chromaCb + surf2Dread<uint8_t>(surfChroma, x, y_chroma+1) + 1) >> 1;
            chromaCr = (chromaCr + surf2Dread<uint8_t>(surfChroma, x+1, y_chroma+1) + 1) >> 1;
        }

        yuv101010Pel[0] |= (chromaCb << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
        yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

        yuv101010Pel[1] |= (chromaCb << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
        yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
    }
    else
    {
        yuv101010Pel[0] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
        yuv101010Pel[0] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x+1, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

        yuv101010Pel[1] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
        yuv101010Pel[1] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x+1, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
    }

    // this step performs the color conversion
    float luma[2], u[2], v[2];
    float red[2], green[2], blue[2];

    luma[0] = float(yuv101010Pel[0] &   COLOR_COMPONENT_MASK);
    u[0] = float((yuv101010Pel[0] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK) - 512.0f;
    v[0] = float((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK) - 512.0f;

    red[0] = luma[0] + 1.140f * v[0];
    green[0] = luma[0] - 0.395f * u[0] - 0.581f * v[0];
    blue[0] = luma[0] + 2.032f * u[0];

    red[0]   = min(max(red[0],   0.0f), 1023.f);
    green[0] = min(max(green[0], 0.0f), 1023.f);
    blue[0]  = min(max(blue[0],  0.0f), 1023.f);

    luma[1] = float(yuv101010Pel[1] &   COLOR_COMPONENT_MASK);
    u[1] = float((yuv101010Pel[1] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK) - 512.0f;
    v[1] = float((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK) - 512.0f;

    red[1] = luma[1] + 1.140f * v[1];
    green[1] = luma[1] - 0.395f * u[1] - 0.581f * v[1];
    blue[1] = luma[1] + 2.032f * u[1];

    red[1]   = min(max(red[1],   0.0f), 1023.f);
    green[1] = min(max(green[1], 0.0f), 1023.f);
    blue[1]  = min(max(blue[1],  0.0f), 1023.f);

    const float s = 1.0f / 1024.0f * 255.0f;
    dstImage[y * width + x]     = make_float4(red[0] * s, green[0] * s, blue[0] * s, 1.0f);
    dstImage[y * width + x + 1] = make_float4(red[1] * s, green[1] * s, blue[1] * s, 1.0f);
}

// host-side launcher, adapted from cudaNV12ToRGBAf() in jetson-inference
cudaError_t myNV12ToRGBAf(
    cudaSurfaceObject_t surfLuma,
    cudaSurfaceObject_t surfChroma,
    float4* destDev,
    uint32_t width,
    uint32_t height )
{
	if( !destDev )
		return cudaErrorInvalidDevicePointer;

	if( width == 0 || height == 0 )
		return cudaErrorInvalidValue;

	const dim3 blockDim(8,8,1);
	//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1);
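	// note: the kernel writes two pixels per thread in x, so the commented-out grid
	// above is the tight launch; with iDivUp(width, blockDim.x) the extra threads
	// simply fail the x >= width test and return early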

	myKernel<<<gridDim, blockDim>>>(surfLuma, surfChroma, destDev, width, height );
	
	return CUDA(cudaGetLastError());
}
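(iDivUp() and the CUDA() error-check macro come from jetson-inference's cudaUtility.h; if this is built outside that tree, a minimal stand-in for iDivUp is just a rounding-up integer division:)

inline __device__ __host__ int iDivUp( int a, int b )    { return (a % b != 0) ? (a / b + 1) : (a / b); }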

Then I create the luma/chroma surface objects from the frame acquired via cuEGLStreamConsumerAcquireFrame() and pass them in:

CUgraphicsResource cudaResource = 0;
CUstream cudaStream = 0;
cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
if(cuResult != CUDA_SUCCESS) break;

// Get the CUDA EGL frame.
CUeglFrame cudaEGLFrame;
cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
if(cuResult != CUDA_SUCCESS)
{
    ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
        getCudaErrorString(cuResult));
}
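
(It's also worth sanity-checking the frame layout before creating the surfaces; a minimal check, assuming Argus hands over a block-linear, semi-planar frame, could look like this:)

if(cudaEGLFrame.frameType != CU_EGL_FRAME_TYPE_ARRAY || cudaEGLFrame.planeCount < 2)
{
    ORIGINATE_ERROR("Expected a 2-plane block-linear (array) EGL frame");
}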

m_isConnected = true;

CUDA_RESOURCE_DESC cudaResourceDesc;
CUsurfObject cudaSurfLuma = 0;
CUsurfObject cudaSurfChroma = 0;

memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;

// Create a surface from the luminance plane
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
cuResult = cuSurfObjectCreate(&cudaSurfLuma, &cudaResourceDesc);
if(cuResult != CUDA_SUCCESS)
{
    ORIGINATE_ERROR("Unable to create the luma surface object (CUresult %s)",
        getCudaErrorString(cuResult));
}

// Create a surface from the chrominance plane
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[1];
cuResult = cuSurfObjectCreate(&cudaSurfChroma, &cudaResourceDesc);
if(cuResult != CUDA_SUCCESS)
{
    ORIGINATE_ERROR("Unable to create the chroma surface object (CUresult %s)",
        getCudaErrorString(cuResult));
}

myNV12ToRGBAf(
    cudaSurfLuma,
    cudaSurfChroma,
    frame[1],
    shared->width,
    shared->height);
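
(Once the kernel is done reading the planes, the surfaces and the frame have to be given back; roughly this, with a sync so the release doesn't race the kernel:)

// make sure the conversion kernel has finished reading the planes
cudaDeviceSynchronize();

cuSurfObjectDestroy(cudaSurfLuma);
cuSurfObjectDestroy(cudaSurfChroma);

// hand the frame back to the EGLStream producer
cuEGLStreamConsumerReleaseFrame(&cudaConnection, cudaResource, &cudaStream);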

Then I pass the CUDA memory buffer (frame[1]) into one of the jetson-inference net objects.
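
(frame[1] is the device float4 output buffer, allocated elsewhere, e.g. cudaMalloc'd as width * height * sizeof(float4). With the older RGBA-float jetson-inference API the call would look roughly like this, where net is an imageNet* created elsewhere:)

float confidence = 0.0f;
const int imgClass = net->Classify((float*)frame[1], shared->width, shared->height, &confidence);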