What is the fastest way to copy a decoded video frame into an OpenGL texture?

In my project, I use the V4L2 NVDEC decoder to decode video frames from an MP4 file, then copy the decoded data into an OpenGL texture.
I do it by following step:
1 Decode a video frame (non-blocking mode) into an NvBuffer (with an fd of a DMA buffer). My decoder output_plane memory type is V4L2_MEMORY_USERPTR and the capture_plane queue buffer's memory type is V4L2_MEMORY_DMABUF
layout = NvBufferLayout_BlockLinear
payloadType = NvBufferPayload_SurfArray
nvbuf_tag = NvBufferTag_VIDEO_DEC

2 Use NvEGLImageFromFd to get an EGLImage from the decoded buffer (fd). Because the decoded frame layout is a surface array (NV12), I create 2 surface objects to wrap it (the two source surface objects are used in CUDA).

3 Register the destination OpenGL texture as a CUDA resource, map it, and wrap it as a surface object (one destination surface object in CUDA).
4 Perform the color-space conversion with CUDA (read data from the two source surface objects, multiply by the conversion matrix, write RGBA to the destination surface object).
5 Unregister, unmap, and destroy the surface objects.

I found that performing this conversion with CUDA is very slow.
If I don't call the CUDA kernel function, the speed is fine.
If I just write data to the destination surface object (OpenGL texture), without reading from the two source surface objects (NV12 fd), the speed still slows down heavily.

What is the fastest way to handle this situation?
code :
// NOTE(review): this is the body of the per-frame "decoded NV12 dmabuf ->
// OpenGL RGBA texture" routine (the signature is not shown in this snippet;
// it returns an HRESULT).
//
// PERF NOTE(review): everything below runs once per frame. NvEGLImageFromFd /
// NvDestroyEGLImage and cuGraphicsEGLRegisterImage / cuGraphicsGLRegisterImage /
// cuGraphicsUnregisterResource are expensive setup calls — cache the GL texture
// registration (it never changes) and the per-fd EGLImage registrations instead
// of recreating them every frame. On Jetson, NvBufferTransform() can also do
// the NV12->RGBA conversion on the VIC hardware instead of the GPU.
if (!decodedFrame || !decodedFrame->dmaBufferFileDescriptor)
    return NV_E_INVALID_PARAMETER;

HRESULT hr = NV_NOERROR;
CUcontext oldContext = nullptr;
cuCtxPopCurrent(&oldContext);
if (cuCtxPushCurrent(m_cudaContext) != CUDA_SUCCESS) {
    NvError("cuCtxPushCurrent() failed.");
    // BUG FIX: this function returns an HRESULT; the original `return false`
    // converted to 0 and therefore reported *success* on this failure path.
    // Also restore the caller's context before leaving.
    if (oldContext)
        cuCtxPushCurrent(oldContext);
    return NV_E_FAIL;
}

#if defined(MRA_OPENGL)
// BUG FIX: all interop handles are now declared (zero-initialized) outside the
// do/while so the cleanup section below runs on *every* path. The original
// placed the cleanup inside the loop after the kernel call, so each early
// `break` leaked the EGLImage, both graphics resources, and any surface
// objects created so far.
CUresult status;
CUeglFrame eglFrame;
CUgraphicsResource eglImageResource = nullptr;
CUgraphicsResource oglTextureResource = nullptr;
bool oglTextureMapped = false;
CUarray cudaArray = nullptr;
CUsurfObject srcYSurfObj = 0;
CUsurfObject srcUVSurfObj = 0;
CUsurfObject dstSurfObj = 0;
EGLImageKHR eglImage = nullptr;
#endif

do {

#if defined(MRA_OPENGL)
    // Wrap the decoded frame fd in an EGLImage.
    NV_ASSERT(eglGetCurrentDisplay());
    eglImage = NvEGLImageFromFd(eglGetCurrentDisplay(), decodedFrame->dmaBufferFileDescriptor);
    if (eglImage == nullptr) {
        NvError() << "Error while mapping dmabuf fd " << decodedFrame->dmaBufferFileDescriptor << " to EGLImage";
        hr = NV_E_FAIL;
        break;
    }

    // Register the EGLImage as a CUDA graphics resource (read-only source).
    status = cuGraphicsEGLRegisterImage(&eglImageResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsEGLRegisterImage() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Register the destination OpenGL texture as a CUDA graphics resource.
    CMraOglTexture *oglTexture = static_cast<CMraOglTexture *>(dstTexture);
    NV_ASSERT(oglTexture);
    GLuint texId = oglTexture->GetNativeTexture().nativeTex;
    // NOTE: the SURFACE_LDST flag is required so the texture can be bound to a
    // CUDA surface object and written by the kernel.
    // BUG FIX: use the driver-API constant CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST;
    // the original passed the *runtime*-API enum cudaGraphicsRegisterFlagsSurfaceLoadStore
    // to a driver-API call (same numeric value today, but mixing the APIs is fragile).
    status = cuGraphicsGLRegisterImage(&oglTextureResource,
                                       texId,
                                       GL_TEXTURE_2D,
                                       CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsGLRegisterImage() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Get the mapped CUeglFrame (per-plane CUarrays) from the EGLImage resource.
    status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, eglImageResource, 0, 0);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsResourceGetMappedEglFrame() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Create the Y and UV source surface objects from the eglFrame plane arrays.
    CUDA_RESOURCE_DESC srcYSurfResDesc;
    memset(&srcYSurfResDesc, 0, sizeof(srcYSurfResDesc));
    srcYSurfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    srcYSurfResDesc.res.array.hArray = eglFrame.frame.pArray[0];
    status = cuSurfObjectCreate(&srcYSurfObj, &srcYSurfResDesc);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuSurfObjectCreate() for src y plane failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }
    CUDA_RESOURCE_DESC srcUVSurfResDesc;
    memset(&srcUVSurfResDesc, 0, sizeof(srcUVSurfResDesc));
    srcUVSurfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    srcUVSurfResDesc.res.array.hArray = eglFrame.frame.pArray[1];
    status = cuSurfObjectCreate(&srcUVSurfObj, &srcUVSurfResDesc);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuSurfObjectCreate() for src uv plane failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Map the OpenGL texture resource so its backing array is accessible.
    status = cuGraphicsMapResources(1, &oglTextureResource, 0);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsMapResources() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }
    oglTextureMapped = true;

    // Get the mapped CUarray (mip level 0) backing the texture.
    status = cuGraphicsSubResourceGetMappedArray(&cudaArray, oglTextureResource, 0, 0);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuGraphicsSubResourceGetMappedArray() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Bind the CUarray to the destination surface object.
    CUDA_RESOURCE_DESC surfResDesc;
    memset(&surfResDesc, 0, sizeof(surfResDesc));
    surfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    surfResDesc.res.array.hArray = cudaArray;
    status = cuSurfObjectCreate(&dstSurfObj, &surfResDesc);
    if (status != CUDA_SUCCESS) {
        NvError() << "cuSurfObjectCreate() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Sync the CUDA context so all interop setup is complete before the kernel.
    // NOTE(review): all the calls above are synchronous; this sync is likely
    // removable once verified — it costs time on every frame.
    status = cuCtxSynchronize();
    if (status != CUDA_SUCCESS) {
        NvError() << "cuCtxSynchronize() failed, error code = " << status;
        hr = NV_E_FAIL;
        break;
    }

    // Perform the color-space conversion with the CUDA kernel.
    ColorSpaceConvertParam convertParam;
    // FIX ME: other color modes need support later.
    FillColorSpaceConvertParam(YuvColorMode_601_VideoRange, convertParam);
    // FIX ME: odd width/height need support later (the kernel processes 2x2 quads).
    NV_ASSERT((eglFrame.width & 1) == 0);
    NV_ASSERT((eglFrame.height & 1) == 0);
    // FIX ME: HDR (non-8-bit) sources need support later.
    NV_ASSERT(eglFrame.cuFormat == CU_AD_FORMAT_UNSIGNED_INT8);

    CUDAConvertColorSpaceNV12ToBGRA(srcYSurfObj,
                                    srcUVSurfObj,
                                    dstSurfObj,
                                    convertParam,
                                    eglFrame.width,
                                    eglFrame.height);

    // Wait for the kernel to finish before the texture is unmapped and reused by GL.
    status = cuCtxSynchronize();
    if (status != CUDA_SUCCESS) {
        NvError() << "cuCtxSynchronize() failed, error code = " << status;
        hr = NV_E_FAIL;
    }

#else
    NV_ASSERT(false);
    //TODO:
#endif
} while(0);

#if defined(MRA_OPENGL)
// Teardown — runs on success and on every error path above.
// BUG FIX: destroy the dst surface object BEFORE unmapping/unregistering the
// texture it wraps (the original destroyed it afterwards), and guard each step
// so only resources that were actually created get released.
if (dstSurfObj) {
    status = cuSurfObjectDestroy(dstSurfObj);
    if (status != CUDA_SUCCESS)
        NvError() << "cuSurfObjectDestroy() failed, error code = " << status;
}
if (srcYSurfObj) {
    status = cuSurfObjectDestroy(srcYSurfObj);
    if (status != CUDA_SUCCESS)
        NvError() << "cuSurfObjectDestroy() for y plane failed, error code = " << status;
}
if (srcUVSurfObj) {
    status = cuSurfObjectDestroy(srcUVSurfObj);
    if (status != CUDA_SUCCESS)
        NvError() << "cuSurfObjectDestroy() for uv plane failed, error code = " << status;
}
if (oglTextureMapped) {
    status = cuGraphicsUnmapResources(1, &oglTextureResource, 0);
    if (status != CUDA_SUCCESS)
        NvError() << "cuGraphicsUnmapResources() failed, error code = " << status;
}
if (oglTextureResource) {
    status = cuGraphicsUnregisterResource(oglTextureResource);
    if (status != CUDA_SUCCESS)
        NvError() << "cuGraphicsUnregisterResource() for ogl texture failed, error code = " << status;
}
if (eglImageResource) {
    status = cuGraphicsUnregisterResource(eglImageResource);
    if (status != CUDA_SUCCESS)
        NvError() << "cuGraphicsUnregisterResource() for egl image failed, error code = " << status;
}
if (eglImage) {
    if (0 != NvDestroyEGLImage(eglGetCurrentDisplay(), eglImage))
        NvError() << "NvDestroyEGLImage() failed";
}
#endif

cuCtxPopCurrent(&oldContext);
return hr;

cu code:
// Clamps `val` into [mn, mx]. A NaN `val` collapses to `mn`, matching the
// original comparison chain exactly.
// FIX: restored the `__device__` qualifier that was lost when the code was
// pasted (the forum markdown ate the double underscores); added `__host__` so
// the helper is also callable/testable from host code.
__host__ __device__ __forceinline__ float clamp(float val, float mn, float mx)
{
    return (val >= mn) ? ((val <= mx) ? val : mx) : mn;
}

// Converts one quad of YUV samples to a packed 8-bit RGBA pixel using the
// 3x3 matrix and offsets in `param`, clamping each channel to [0, 255].
// NaN intermediates clamp to 0, matching the original comparison chain.
static __device__ __forceinline__ uchar4 YuvQuadToPixel(float yf, float uf, float vf,
                                                        const ColorSpaceConvertParam &param)
{
    float r = yf * param.matYuvToRgb.data[0][0] +
              uf * param.matYuvToRgb.data[0][1] +
              vf * param.matYuvToRgb.data[0][2];
    float g = yf * param.matYuvToRgb.data[1][0] +
              uf * param.matYuvToRgb.data[1][1] +
              vf * param.matYuvToRgb.data[1][2];
    float b = yf * param.matYuvToRgb.data[2][0] +
              uf * param.matYuvToRgb.data[2][1] +
              vf * param.matYuvToRgb.data[2][2];
    r = fminf(fmaxf(r, 0.f), 255.f);
    g = fminf(fmaxf(g, 0.f), 255.f);
    b = fminf(fmaxf(b, 0.f), 255.f);
    return make_uchar4(r, g, b, 255);
}

// NV12 -> packed 8-bit RGBA color-space conversion.
//
// Each thread converts one 2x2 block of luma pixels sharing a single chroma
// (U,V) sample, so the launch grid must cover (imageWidth/2, imageHeight/2)
// threads; odd dimensions are truncated (asserted even by the caller).
// surf2Dread/surf2Dwrite x-coordinates are in BYTES: the Y plane is 1 byte
// per pixel, the UV plane interleaves U,V (2 bytes per chroma sample), and
// the destination is 4 bytes per pixel. The destination row index
// 2*(processHeight - y - 1) flips the image vertically, pair-of-rows at a
// time, exactly as the original did.
// FIX: restored the `__global__` qualifier lost in the forum paste; the
// repeated 3x3 matrix multiply is factored into YuvQuadToPixel above.
__global__
void ConvertColorSpaceNV12ToBGRAKernel(CUsurfObject srcYSurfObj,
                                       CUsurfObject srcUVSurfObj,
                                       CUsurfObject dstSurfObj,
                                       ColorSpaceConvertParam param,
                                       int imageWidth,
                                       int imageheight)
{
    // One thread per 2x2 luma quad.
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    unsigned int processWidth  = imageWidth / 2;
    unsigned int processHeight = imageheight / 2;
    if (x >= processWidth || y >= processHeight)
        return;

    // Read the 2x2 luma quad and its shared chroma sample.
    uchar4 srcY;
    uchar2 srcUV;
    surf2Dread(&srcY.x, srcYSurfObj, 2 * x, 2 * y);
    surf2Dread(&srcY.y, srcYSurfObj, 2 * x + 1, 2 * y);
    surf2Dread(&srcY.z, srcYSurfObj, 2 * x, 2 * y + 1);
    surf2Dread(&srcY.w, srcYSurfObj, 2 * x + 1, 2 * y + 1);
    surf2Dread(&srcUV.x, srcUVSurfObj, 2 * x, y);
    surf2Dread(&srcUV.y, srcUVSurfObj, 2 * x + 1, y);

    // Remove the black-level / chroma offsets before the matrix multiply.
    float y00 = srcY.x - param.yuvOffset[0];
    float y10 = srcY.y - param.yuvOffset[0];
    float y01 = srcY.z - param.yuvOffset[0];
    float y11 = srcY.w - param.yuvOffset[0];
    float u   = srcUV.x - param.yuvOffset[1];
    float v   = srcUV.y - param.yuvOffset[2];

    // Vertically-flipped destination row pair (x-coordinates are byte offsets).
    unsigned int dstRow = 2 * (processHeight - y - 1);
    surf2Dwrite(YuvQuadToPixel(y00, u, v, param), dstSurfObj, 2 * x * 4,       dstRow);
    surf2Dwrite(YuvQuadToPixel(y10, u, v, param), dstSurfObj, (2 * x + 1) * 4, dstRow);
    surf2Dwrite(YuvQuadToPixel(y01, u, v, param), dstSurfObj, 2 * x * 4,       dstRow + 1);
    surf2Dwrite(YuvQuadToPixel(y11, u, v, param), dstSurfObj, (2 * x + 1) * 4, dstRow + 1);
}

// Launches the NV12 -> RGBA8 color-space conversion kernel.
//
// Each kernel thread converts a 2x2 luma quad, so the grid covers
// (imageWidth/2) x (imageHeight/2) threads (ceil-divided by the block size).
// The launch is asynchronous; the caller synchronizes afterwards.
//
// BUG FIX: the original discarded launch failures entirely — a bad grid or an
// invalid surface object failed silently. The launch status is now returned
// via cudaGetLastError(). Callers that treated this as `void` may still ignore
// the result, so the change is source-compatible.
cudaError_t CUDAConvertColorSpaceNV12ToBGRA(CUsurfObject srcYSurfObj,
                                            CUsurfObject srcUVSurfObj,
                                            CUsurfObject dstSurfObj,
                                            ColorSpaceConvertParam &param,
                                            int imageWidth,
                                            int imageHeight)
{
    const dim3 threadsPerBlock(16, 16);
    const dim3 numBlocks((imageWidth / 2 + threadsPerBlock.x - 1) / threadsPerBlock.x,
                         (imageHeight / 2 + threadsPerBlock.y - 1) / threadsPerBlock.y);
    ConvertColorSpaceNV12ToBGRAKernel<<<numBlocks, threadsPerBlock>>>(srcYSurfObj,
                                                                      srcUVSurfObj,
                                                                      dstSurfObj,
                                                                      param,
                                                                      imageWidth,
                                                                      imageHeight);
    return cudaGetLastError();
}

Hi,
Using NvBuffer APIs is the optimal solution. For further improvement, you can try to shift the task of format conversion from GPU to VIC(hardware converter) by calling NvBufferTransform().

We have added 20W modes from Jetpack 4.6, please execute sudo nvpmodel -m 7 and sudo jetson_clocks to get maximum throughput of Xavier NX. All power modes are listed in developer guide

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.