What is the fastest way to copy a decoded video frame into an OpenGL texture?

In my project, I use the V4L2 NVDEC decoder to decode video frames from an MP4 file, then copy the decoded data into an OpenGL texture.
I do it by following step:
1 Decode a video frame (non-blocking mode) into an NvBuffer (with an fd of a DMA buffer). My decoder output_plane memory type is V4L2_MEMORY_USERPTR and the capture_plane queue buffer's memory type is V4L2_MEMORY_DMABUF
layout = NvBufferLayout_BlockLinear
payloadType = NvBufferPayload_SurfArray
nvbuf_tag = NvBufferTag_VIDEO_DEC

2 Use NvEGLImageFromFd to get an EGLImage from the decoded buffer (fd). Because the decoded frame layout is a surface array (NV12), I create 2 surface objects to wrap it (the two source surface objects are used in CUDA).

3 Register the destination OpenGL texture as a CUDA resource, map it, and wrap it as a surface object (one destination surface object in CUDA).
4 Perform the color-space conversion with CUDA (read data from the two source surface objects, multiply by the conversion matrix, write RGBA to the destination surface object).
5 Unregister, unmap, and destroy the surface objects.

I found that performing this conversion with CUDA is very slow.
If I don't call the CUDA kernel function, the speed is fine.
If I just write data to the destination surface object (OpenGL texture), without reading from the two source surface objects (NV12 fd), the speed still slows down heavily.

What is the fastest way to handle this situation?
code :
// NOTE(review): this is the body of the per-frame "decoded NV12 dmabuf ->
// OpenGL RGBA texture" routine (the signature is not shown in this snippet;
// it returns an HRESULT).
//
// PERF NOTE(review): everything below runs once per frame. NvEGLImageFromFd /
// NvDestroyEGLImage and cuGraphicsEGLRegisterImage / cuGraphicsGLRegisterImage /
// cuGraphicsUnregisterResource are expensive setup calls — cache the GL texture
// registration (it never changes) and the per-fd EGLImage registrations instead
// of recreating them every frame. On Jetson, NvBufferTransform() can also do
// the NV12->RGBA conversion on the VIC hardware instead of the GPU.
if (!decodedFrame || !decodedFrame->dmaBufferFileDescriptor)
    return NV_E_INVALID_PARAMETER;

HRESULT hr = NV_NOERROR;
CUcontext oldContext = nullptr;
cuCtxPopCurrent(&oldContext);
if (cuCtxPushCurrent(m_cudaContext) != CUDA_SUCCESS) {
    NvError("cuCtxPushCurrent() failed.");
    // BUG FIX: this function returns an HRESULT; the original `return false`
    // converted to 0 and therefore reported *success* on this failure path.
    // Also restore the caller's context before leaving.
    if (oldContext)
        cuCtxPushCurrent(oldContext);
    return NV_E_FAIL;
}

#if defined(MRA_OPENGL)
// BUG FIX: all interop handles are now declared (zero-initialized) outside the
// do/while so the cleanup section below runs on *every* path. The original
// placed the cleanup inside the loop after the kernel call, so each early
// `break` leaked the EGLImage, both graphics resources, and any surface
// objects created so far.
CUresult status;
CUeglFrame eglFrame;
CUgraphicsResource eglImageResource = nullptr;
CUgraphicsResource oglTextureResource = nullptr;
bool oglTextureMapped = false;
CUarray cudaArray = nullptr;
CUsurfObject srcYSurfObj = 0;
CUsurfObject srcUVSurfObj = 0;
CUsurfObject dstSurfObj = 0;
EGLImageKHR eglImage = nullptr;
#endif

do {

#if defined(MRA_OPENGL)
    // Wrap the decoded frame fd in an EGLImage.
    NV_ASSERT(eglGetCurrentDisplay());
    eglImage = NvEGLImageFromFd(eglGetCurrentDisplay(), decodedFrame->dmaBufferFileDescriptor);
    if (eglImage == nullptr) {
        NvError() << "Error while mapping dmabuf fd " << decodedFrame->dmaBufferFileDescriptor << " to EGLImage";
        hr = NV_E_FAIL;
        break;
    }

    // Register the EGLImage as a CUDA graphics resource (read-only source).
    status = cuGraphicsEGLRegisterImage(&eglImageResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsEGLRegisterImage() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Register the destination OpenGL texture as a CUDA graphics resource.
    CMraOglTexture *oglTexture = static_cast<CMraOglTexture *>(dstTexture);
    NV_ASSERT(oglTexture);
    GLuint texId = oglTexture->GetNativeTexture().nativeTex;
    // NOTE: the SURFACE_LDST flag is required so the texture can be bound to a
    // CUDA surface object and written by the kernel.
    // BUG FIX: use the driver-API constant CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST;
    // the original passed the *runtime*-API enum cudaGraphicsRegisterFlagsSurfaceLoadStore
    // to a driver-API call (same numeric value today, but mixing the APIs is fragile).
    status = cuGraphicsGLRegisterImage(&oglTextureResource,
                                       texId,
                                       GL_TEXTURE_2D,
                                       CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsGLRegisterImage() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Get the mapped CUeglFrame (per-plane CUarrays) from the EGLImage resource.
    status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, eglImageResource, 0, 0);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsResourceGetMappedEglFrame() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Create the Y and UV source surface objects from the eglFrame plane arrays.
    CUDA_RESOURCE_DESC srcYSurfResDesc;
    memset(&srcYSurfResDesc, 0, sizeof(srcYSurfResDesc));
    srcYSurfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    srcYSurfResDesc.res.array.hArray = eglFrame.frame.pArray[0];
    status = cuSurfObjectCreate(&srcYSurfObj, &srcYSurfResDesc);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuSurfObjectCreate() for src y plane failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }
    CUDA_RESOURCE_DESC srcUVSurfResDesc;
    memset(&srcUVSurfResDesc, 0, sizeof(srcUVSurfResDesc));
    srcUVSurfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    srcUVSurfResDesc.res.array.hArray = eglFrame.frame.pArray[1];
    status = cuSurfObjectCreate(&srcUVSurfObj, &srcUVSurfResDesc);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuSurfObjectCreate() for src uv plane failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Map the OpenGL texture resource so its backing array is accessible.
    status = cuGraphicsMapResources(1, &oglTextureResource, 0);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsMapResources() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }
    oglTextureMapped = true;

    // Get the mapped CUarray (mip level 0) backing the texture.
    status = cuGraphicsSubResourceGetMappedArray(&cudaArray, oglTextureResource, 0, 0);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsSubResourceGetMappedArray() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Bind the CUarray to the destination surface object.
    CUDA_RESOURCE_DESC surfResDesc;
    memset(&surfResDesc, 0, sizeof(surfResDesc));
    surfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    surfResDesc.res.array.hArray = cudaArray;
    status = cuSurfObjectCreate(&dstSurfObj, &surfResDesc);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuSurfObjectCreate() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Sync the CUDA context so all interop setup is complete before the kernel.
    // NOTE(review): all the calls above are synchronous; this sync is likely
    // removable once verified — it costs time on every frame.
    status = cuCtxSynchronize();
    if (status != CUDA_SUCCESS) {
        NvError() << "cuCtxSynchronize() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Perform the color-space conversion with the CUDA kernel.
    ColorSpaceConvertParam convertParam;
    // FIX ME: other color modes need support later.
    FillColorSpaceConvertParam(YuvColorMode_601_VideoRange, convertParam);
    // FIX ME: odd width/height need support later (the kernel processes 2x2 quads).
    NV_ASSERT((eglFrame.width & 1) == 0);
    NV_ASSERT((eglFrame.height & 1) == 0);
    // FIX ME: HDR (non-8-bit) sources need support later.
    NV_ASSERT(eglFrame.cuFormat == CU_AD_FORMAT_UNSIGNED_INT8);

    CUDAConvertColorSpaceNV12ToBGRA(srcYSurfObj,
                                    srcUVSurfObj,
                                    dstSurfObj,
                                    convertParam,
                                    eglFrame.width,
                                    eglFrame.height);

    // Wait for the kernel to finish before the texture is unmapped and reused by GL.
    status = cuCtxSynchronize();
    if (status != CUDA_SUCCESS) {
        NvError() << "cuCtxSynchronize() failed, error code = " << status;
        hr = NV_E_FAIL;
    }

#else
    NV_ASSERT(false);
    //TODO:
#endif
} while(0);

#if defined(MRA_OPENGL)
// Teardown — runs on success and on every error path above.
// BUG FIX: destroy the dst surface object BEFORE unmapping/unregistering the
// texture it wraps (the original destroyed it afterwards), and guard each step
// so only resources that were actually created get released.
if (dstSurfObj) {
    status = cuSurfObjectDestroy(dstSurfObj);
    if (status != CUDA_SUCCESS)
        NvError() << "cuSurfObjectDestroy() failed, error code = " << status;
}
if (srcYSurfObj) {
    status = cuSurfObjectDestroy(srcYSurfObj);
    if (status != CUDA_SUCCESS)
        NvError() << "cuSurfObjectDestroy() for y plane failed, error code = " << status;
}
if (srcUVSurfObj) {
    status = cuSurfObjectDestroy(srcUVSurfObj);
    if (status != CUDA_SUCCESS)
        NvError() << "cuSurfObjectDestroy() for uv plane failed, error code = " << status;
}
if (oglTextureMapped) {
    status = cuGraphicsUnmapResources(1, &oglTextureResource, 0);
    if (status != CUDA_SUCCESS)
        NvError() << "cuGraphicsUnmapResources() failed, error code = " << status;
}
if (oglTextureResource) {
    status = cuGraphicsUnregisterResource(oglTextureResource);
    if (status != CUDA_SUCCESS)
        NvError() << "cuGraphicsUnregisterResource() for ogl texture failed, error code = " << status;
}
if (eglImageResource) {
    status = cuGraphicsUnregisterResource(eglImageResource);
    if (status != CUDA_SUCCESS)
        NvError() << "cuGraphicsUnregisterResource() for egl image failed, error code = " << status;
}
if (eglImage) {
    if (0 != NvDestroyEGLImage(eglGetCurrentDisplay(), eglImage))
        NvError() << "NvDestroyEGLImage() failed";
}
#endif

cuCtxPopCurrent(&oldContext);
return hr;

cu code:
// Clamps `val` into [mn, mx]. A NaN `val` collapses to `mn`, matching the
// original comparison chain exactly.
// FIX: restored the `__device__` qualifier that was lost when the code was
// pasted (the forum markdown ate the double underscores); added `__host__` so
// the helper is also callable/testable from host code.
__host__ __device__ __forceinline__ float clamp(float val, float mn, float mx)
{
    return (val >= mn) ? ((val <= mx) ? val : mx) : mn;
}

// Converts one quad of YUV samples to a packed 8-bit RGBA pixel using the
// 3x3 matrix and offsets in `param`, clamping each channel to [0, 255].
// NaN intermediates clamp to 0, matching the original comparison chain.
static __device__ __forceinline__ uchar4 YuvQuadToPixel(float yf, float uf, float vf,
                                                        const ColorSpaceConvertParam &param)
{
    float r = yf * param.matYuvToRgb.data[0][0] +
              uf * param.matYuvToRgb.data[0][1] +
              vf * param.matYuvToRgb.data[0][2];
    float g = yf * param.matYuvToRgb.data[1][0] +
              uf * param.matYuvToRgb.data[1][1] +
              vf * param.matYuvToRgb.data[1][2];
    float b = yf * param.matYuvToRgb.data[2][0] +
              uf * param.matYuvToRgb.data[2][1] +
              vf * param.matYuvToRgb.data[2][2];
    r = fminf(fmaxf(r, 0.f), 255.f);
    g = fminf(fmaxf(g, 0.f), 255.f);
    b = fminf(fmaxf(b, 0.f), 255.f);
    return make_uchar4(r, g, b, 255);
}

// NV12 -> packed 8-bit RGBA color-space conversion.
//
// Each thread converts one 2x2 block of luma pixels sharing a single chroma
// (U,V) sample, so the launch grid must cover (imageWidth/2, imageHeight/2)
// threads; odd dimensions are truncated (asserted even by the caller).
// surf2Dread/surf2Dwrite x-coordinates are in BYTES: the Y plane is 1 byte
// per pixel, the UV plane interleaves U,V (2 bytes per chroma sample), and
// the destination is 4 bytes per pixel. The destination row index
// 2*(processHeight - y - 1) flips the image vertically, pair-of-rows at a
// time, exactly as the original did.
// FIX: restored the `__global__` qualifier lost in the forum paste; the
// repeated 3x3 matrix multiply is factored into YuvQuadToPixel above.
__global__
void ConvertColorSpaceNV12ToBGRAKernel(CUsurfObject srcYSurfObj,
                                       CUsurfObject srcUVSurfObj,
                                       CUsurfObject dstSurfObj,
                                       ColorSpaceConvertParam param,
                                       int imageWidth,
                                       int imageheight)
{
    // One thread per 2x2 luma quad.
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    unsigned int processWidth  = imageWidth / 2;
    unsigned int processHeight = imageheight / 2;
    if (x >= processWidth || y >= processHeight)
        return;

    // Read the 2x2 luma quad and its shared chroma sample.
    uchar4 srcY;
    uchar2 srcUV;
    surf2Dread(&srcY.x, srcYSurfObj, 2 * x, 2 * y);
    surf2Dread(&srcY.y, srcYSurfObj, 2 * x + 1, 2 * y);
    surf2Dread(&srcY.z, srcYSurfObj, 2 * x, 2 * y + 1);
    surf2Dread(&srcY.w, srcYSurfObj, 2 * x + 1, 2 * y + 1);
    surf2Dread(&srcUV.x, srcUVSurfObj, 2 * x, y);
    surf2Dread(&srcUV.y, srcUVSurfObj, 2 * x + 1, y);

    // Remove the black-level / chroma offsets before the matrix multiply.
    float y00 = srcY.x - param.yuvOffset[0];
    float y10 = srcY.y - param.yuvOffset[0];
    float y01 = srcY.z - param.yuvOffset[0];
    float y11 = srcY.w - param.yuvOffset[0];
    float u   = srcUV.x - param.yuvOffset[1];
    float v   = srcUV.y - param.yuvOffset[2];

    // Vertically-flipped destination row pair (x-coordinates are byte offsets).
    unsigned int dstRow = 2 * (processHeight - y - 1);
    surf2Dwrite(YuvQuadToPixel(y00, u, v, param), dstSurfObj, 2 * x * 4,       dstRow);
    surf2Dwrite(YuvQuadToPixel(y10, u, v, param), dstSurfObj, (2 * x + 1) * 4, dstRow);
    surf2Dwrite(YuvQuadToPixel(y01, u, v, param), dstSurfObj, 2 * x * 4,       dstRow + 1);
    surf2Dwrite(YuvQuadToPixel(y11, u, v, param), dstSurfObj, (2 * x + 1) * 4, dstRow + 1);
}

// Launches the NV12 -> RGBA8 color-space conversion kernel.
//
// Each kernel thread converts a 2x2 luma quad, so the grid covers
// (imageWidth/2) x (imageHeight/2) threads (ceil-divided by the block size).
// The launch is asynchronous; the caller synchronizes afterwards.
//
// BUG FIX: the original discarded launch failures entirely — a bad grid or an
// invalid surface object failed silently. The launch status is now returned
// via cudaGetLastError(). Callers that treated this as `void` may still ignore
// the result, so the change is source-compatible.
cudaError_t CUDAConvertColorSpaceNV12ToBGRA(CUsurfObject srcYSurfObj,
                                            CUsurfObject srcUVSurfObj,
                                            CUsurfObject dstSurfObj,
                                            ColorSpaceConvertParam &param,
                                            int imageWidth,
                                            int imageHeight)
{
    const dim3 threadsPerBlock(16, 16);
    const dim3 numBlocks((imageWidth / 2 + threadsPerBlock.x - 1) / threadsPerBlock.x,
                         (imageHeight / 2 + threadsPerBlock.y - 1) / threadsPerBlock.y);
    ConvertColorSpaceNV12ToBGRAKernel<<<numBlocks, threadsPerBlock>>>(srcYSurfObj,
                                                                      srcUVSurfObj,
                                                                      dstSurfObj,
                                                                      param,
                                                                      imageWidth,
                                                                      imageHeight);
    return cudaGetLastError();
}

Hi,
Using NvBuffer APIs is the optimal solution. For further improvement, you can try to shift the task of format conversion from GPU to VIC(hardware converter) by calling NvBufferTransform().

We have added 20W modes from Jetpack 4.6, please execute sudo nvpmodel -m 7 and sudo jetson_clocks to get maximum throughput of Xavier NX. All power modes are listed in developer guide

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.