Help Using NV12M with TensorNet (jetson-inference)

I have an Argus app that takes frames from the two cameras and sends them out again via Gstreamer. Now I’m trying to pass the image through a segNet (from jetson-inference) to get a streamed output.

Argus -> NvVideoConverter -> segNet -> Gstreamer

I’ve been able to get it to work in ARGB by converting the output of NvVideoConverter to a cv::Mat (like sample 11_camera_object_identification). Doing cv::cvtColor to go from ABGR to RGB. Then cudaMalloc for a cuda buffer, cudaMemcpy2D to get the data into it, cudaRGBToRGBAf to get it into float format, finally passing the image through segNet and then all the way back up the rabbit hole again before passing the image out through an appsrc.

This method does approximately 1 FPS, and I know I could be a lot more efficient.

I noticed that there is a cudaNV12ToRGBAf function that seemed promising. So I switched the output of the NvVideoConverter to NV12M and tried to use cudaNV12ToRGBAf on it directly. So far no luck, if I write out the buffer to a png it’s 100% black/0x00.

Has anyone tried to do anything similar? Am I going about this the wrong way? I know that NV12M isn’t contiguous in memory, is there a way to get around that?

This is my NvVideoConverter capture dequeue callback:

// NvVideoConverter capture-plane dequeue callback.
// Converts the dequeued NV12 buffer to a float4 RGBA CUDA buffer shared with
// the inference thread, publishes a "new frame" flag, and requeues the buffer.
// Returns false to stop the dequeue loop (null buffer or zero-byte payload).
bool ConsumerThread::converterCapturePlaneDqCallback(
    struct v4l2_buffer *v4l2_buf,
    NvBuffer *buffer,
    NvBuffer *shared_buffer,
    void *arg)
{
    ConsumerThread *thiz = static_cast<ConsumerThread*>(arg);
    shared_memory_t *shared = thiz->shared;
    const uint8_t dev = thiz->device;

    if(!v4l2_buf)
    {
        REPORT_ERROR("Failed to dequeue buffer from converter capture plane");
        thiz->abort();
        return false;
    }

    // A zero-byte payload ends the dequeue loop (presumably end-of-stream —
    // TODO confirm against the MMAPI converter semantics).
    if(v4l2_buf->m.planes[0].bytesused == 0) return false;

    NvBuffer::NvBufferPlane *plane = &buffer->planes[0];

    NvBufferParams params;
    if(NvBufferGetParams(plane->fd, &params) != 0)
    {
        REPORT_ERROR("NvBufferGetParams failed");
        return false;
    }

    pthread_mutex_lock(&shared->input_mutex[dev]);

    // NOTE(review): plane->data is the CPU-mapped pointer of this NvBuffer,
    // while cudaNV12ToRGBAf expects device-accessible memory. Passing a
    // host-mapped pointer is the likely cause of an all-black output — the
    // EGLStream + surface-object path below avoids this entirely.
    const cudaError_t cudaErr = cudaNV12ToRGBAf(
        plane->data,
        params.pitch[0],
        shared->input_cuda[dev],
        shared->width*sizeof(float4),
        shared->width,
        shared->height);

    // Only publish a new frame when the conversion succeeded, so a failed
    // conversion is never mistaken for fresh input by the consumer.
    if(cudaErr == cudaSuccess)
        *shared->new_input[dev] = true;
    else
        REPORT_ERROR("cudaNV12ToRGBAf failed");

    pthread_mutex_unlock(&shared->input_mutex[dev]);

    if(thiz->m_ImageConverter->capture_plane.qBuffer(*v4l2_buf, NULL) < 0)
        ORIGINATE_ERROR("qBuffer failed");

    return true;
}

Hi,

It’s recommended to convert image to CUDA buffer from Argus directly.
There is some duplicate translation if converting Argus -> GStreamer -> CUDA -> TensorRT.

Please check sample cudaHistogram in MMAPI for information.
Try to modify the histogram kernel to the pre-process kernel in jetson_inference:
https://github.com/dusty-nv/jetson-inference/blob/master/imageNet.cu#L74

Then you can reuse the following implementation of jetson_inference once you got the imgCUDA buffer:
https://github.com/dusty-nv/jetson-inference/blob/master/segnet-console/segnet-console.cpp#L85

Thanks.

That’s exactly what I was looking for, thanks!

Here’s my working kernel for converting NV12 to RGBAf based on the kernels in jetson-inference:

#define COLOR_COMPONENT_MASK            0x3FF
#define COLOR_COMPONENT_BIT_SIZE        10

// NV12 -> float4 RGBA (values 0..255, alpha 1.0) conversion kernel.
// Each thread converts TWO horizontally adjacent pixels: (x, y) and (x+1, y),
// so the launch must cover ceil(width/2) threads in x (see myNV12ToRGBAf).
// surfLuma   - 8-bit luma plane surface object (width x height bytes)
// surfChroma - interleaved CbCr plane surface object (read as bytes)
// dstImage   - device buffer of width*height float4
// Intermediate layout packs 10-bit components as V[29:20] U[19:10] Y[9:0].
__global__ void myKernel(
    cudaSurfaceObject_t surfLuma,
    cudaSurfaceObject_t surfChroma,
    float4* dstImage,
    uint32_t width,
    uint32_t height)
{
    const uint32_t x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
    const uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x >= width || y >= height) return;

    // Guard the second pixel of the pair so odd widths neither read luma past
    // the row end (surf2Dread traps on out-of-range coordinates by default)
    // nor write past the end of dstImage's row.
    const bool hasSecond = (x + 1 < width);

    // Promote 8-bit luma to the 10-bit slot (<< 2).
    uint32_t yuv101010Pel[2];
    yuv101010Pel[0] = (uint32_t)surf2Dread<uint8_t>(surfLuma, x+0, y) << 2;
    yuv101010Pel[1] = hasSecond
        ? (uint32_t)surf2Dread<uint8_t>(surfLuma, x+1, y) << 2
        : 0u;

    const int y_chroma = (y >> 1);

    // Both pixels of the pair share one CbCr sample (byte offsets x, x+1
    // within the interleaved chroma row).
    uint32_t chromaCb = surf2Dread<uint8_t>(surfChroma, x,   y_chroma);
    uint32_t chromaCr = surf2Dread<uint8_t>(surfChroma, x+1, y_chroma);

    // Odd luma rows sit between two chroma rows: average with the next chroma
    // row when one exists (rounded), matching the original even/odd handling.
    if(y & 1)
    {
        if(y_chroma < (int)((height >> 1) - 1))
        {
            chromaCb = (chromaCb + surf2Dread<uint8_t>(surfChroma, x,   y_chroma+1) + 1) >> 1;
            chromaCr = (chromaCr + surf2Dread<uint8_t>(surfChroma, x+1, y_chroma+1) + 1) >> 1;
        }
    }

    // Pack chroma into the U/V slots of both pixels (<< 2 promotes to 10-bit).
    const uint32_t packedChroma =
        (chromaCb << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2)) |
        (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

    yuv101010Pel[0] |= packedChroma;
    yuv101010Pel[1] |= packedChroma;

    // YUV -> RGB (BT.601-style coefficients, chroma centered at 512 in the
    // 10-bit range), clamped to [0, 1023] then rescaled to 0..255 floats.
    const float s = 1.0f / 1024.0f * 255.0f;

    #pragma unroll
    for(int i = 0; i < 2; i++)
    {
        if(i == 1 && !hasSecond)
            break;

        const float luma = float(yuv101010Pel[i] & COLOR_COMPONENT_MASK);
        const float u    = float((yuv101010Pel[i] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK) - 512.0f;
        const float v    = float((yuv101010Pel[i] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK) - 512.0f;

        const float red   = min(max(luma + 1.140f * v,                 0.0f), 1023.0f);
        const float green = min(max(luma - 0.395f * u - 0.581f * v,    0.0f), 1023.0f);
        const float blue  = min(max(luma + 2.032f * u,                 0.0f), 1023.0f);

        dstImage[y * width + x + i] = make_float4(red * s, green * s, blue * s, 1.0f);
    }
}

// Host launcher for myKernel: converts an NV12 frame (given as luma/chroma
// surface objects) to a width*height float4 RGBA device buffer.
// Returns cudaErrorInvalidDevicePointer / cudaErrorInvalidValue on bad args,
// otherwise the launch status from cudaGetLastError().
cudaError_t myNV12ToRGBAf(
    cudaSurfaceObject_t surfLuma,
    cudaSurfaceObject_t surfChroma,
    float4* destDev,
    uint32_t width,
    uint32_t height )
{
	if( !destDev )
		return cudaErrorInvalidDevicePointer;

	if( width == 0 || height == 0 )
		return cudaErrorInvalidValue;

	const dim3 blockDim(8,8,1);

	// Each thread of myKernel writes TWO horizontally adjacent pixels, so the
	// grid only needs to cover width/2 threads in x. The previous launch used
	// iDivUp(width, blockDim.x), spawning twice the needed blocks in x —
	// harmless thanks to the kernel's bounds check, but pure waste.
	const dim3 gridDim(iDivUp(width, blockDim.x * 2), iDivUp(height, blockDim.y), 1);

	myKernel<<<gridDim, blockDim>>>(surfLuma, surfChroma, destDev, width, height );

	return CUDA(cudaGetLastError());
}

Then I can pass in the luma/chroma surface objects from cuEGLStreamConsumer

// Acquire one frame from the EGLStream and convert it NV12 -> RGBAf entirely
// on the GPU (no CPU copy). 'csResource' in the original was a typo — the
// Driver API type is CUgraphicsResource.
CUgraphicsResource cudaResource = 0;
CUstream cudaStream = 0;
cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
if(cuResult != CUDA_SUCCESS) break;

// Map the acquired resource to a CUeglFrame so the per-plane arrays are reachable.
CUeglFrame cudaEGLFrame;
cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
if(cuResult != CUDA_SUCCESS)
{
    ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
        getCudaErrorString(cuResult));
}

m_isConnected = true;

// NV12 arrives as two CUDA arrays: pArray[0] = luma plane, pArray[1] = CbCr plane.
CUDA_RESOURCE_DESC cudaResourceDesc;
CUsurfObject cudaSurfLuma = 0;
CUsurfObject cudaSurfChroma = 0;

memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;

// Create a surface from the luminance plane
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
cuResult = cuSurfObjectCreate(&cudaSurfLuma, &cudaResourceDesc);
if(cuResult != CUDA_SUCCESS)
{
    ORIGINATE_ERROR("Unable to create the luma surface object (CUresult %s)",
        getCudaErrorString(cuResult));
}

// Create a surface from the chrominance plane
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[1];
cuResult = cuSurfObjectCreate(&cudaSurfChroma, &cudaResourceDesc);
if(cuResult != CUDA_SUCCESS)
{
    ORIGINATE_ERROR("Unable to create the chroma surface object (CUresult %s)",
        getCudaErrorString(cuResult));
}

// NOTE(review): cudaSurfLuma/cudaSurfChroma should be destroyed with
// cuSurfObjectDestroy() once the kernel has completed, and the frame returned
// via cuEGLStreamConsumerReleaseFrame() — otherwise one surface pair and one
// frame leak per iteration. Confirm this happens later in the loop.
if(myNV12ToRGBAf(
    cudaSurfLuma,
    cudaSurfChroma,
    frame[1],
    shared->width,
    shared->height) != cudaSuccess)
{
    ORIGINATE_ERROR("NV12 to RGBAf conversion kernel failed");
}

Then I pass the cuda memory buffer (frame[1]) into one of the jetson-inference net objects.

Thanks for the feedback : )