In my project, use v4l2 nvdec to decoded video frame from mp4, then copy decoded data to opengl texture.
I do it by following step:
1 Deocded video frame(No blocking mode) to NvBuffer(With an fd of dma buffer), My decoder output_plane memory type is V4L2_MEMORY_USERPTR and capture_plane queue buffer 's memory type is V4L2_MEMORY_DMABUF
layout = NvBufferLayout_BlockLinear
payloadType = NvBufferPayload_SurfArray
nvbuf_tag = NvBufferTag_VIDEO_DEC
2 Use NvEGLImageFromFd to get eglImage from decoded buffer(fd)。cause decoded frame layout is array(NV12), i create 2 surface object to warp it .(two src surface object use in cuda)
3 Regist dst ogl texture as a cuda resource, map it ,warp it as an surface object(one dst surface object in cuda).
4 perform color space convert by cuda.(read data from two src surface object , mul matrix , write rgba to dst surface obejct)
5 unregister unmap and destory surface obejct.
I found it is very slow to perform this convert by cuda.
If i don’t call cuda keneral func,the speed is fine.
If i just write data to dst surface obejct(ogl texture),without read from two src surface object(Nv12 fd),the speed slow down heavy.
what is the fast way to do this situation.
code :
if (!decodedFrame || !decodedFrame->dmaBufferFileDescriptor)
return NV_E_INVALID_PARAMETER;
HRESULT hr = NV_NOERROR;
CUcontext oldContext = nullptr;
cuCtxPopCurrent(&oldContext);
if (cuCtxPushCurrent(m_cudaContext) != CUDA_SUCCESS) {
NvError("cuCtxPushCurrent() failed.");
return false;
}
do {
#if defined(MRA_OPENGL)
CUresult status;
CUeglFrame eglFrame;
CUgraphicsResource eglImageResource = nullptr;
CUgraphicsResource oglTextureResource = nullptr;
CUarray cudaArray = nullptr;
CUsurfObject srcYSurfObj = 0;
CUsurfObject srcUVSurfObj = 0;
CUsurfObject dstSurfObj = 0;
// Warp decoded frame fd as a EGLImage.
NV_ASSERT(eglGetCurrentDisplay());
EGLImageKHR eglImage = NvEGLImageFromFd(eglGetCurrentDisplay(), decodedFrame->dmaBufferFileDescriptor);
if (eglImage == NULL) {
NvError() << "Error while mapping dmabuf fd "<< decodedFrame->dmaBufferFileDescriptor << " to EGLImage";
hr = NV_E_FAIL;
break;
}
// Resigter EGLImage with a cuda resource.
status = cuGraphicsEGLRegisterImage(&eglImageResource, eglImage, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
if (status != CUDA_SUCCESS) {
NvError() << "cuGraphicsEGLRegisterImage() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Register OGLTexture with a cuda resource.
CMraOglTexture *oglTexture = static_cast<CMraOglTexture *>(dstTexture);
NV_ASSERT(oglTexture);
GLuint texId = oglTexture->GetNativeTexture().nativeTex;
// NOTE: flag must be cudaGraphicsRegisterFlagsSurfaceLoadStore that texture can binding with cuda surfac 2D to write on it.
status = cuGraphicsGLRegisterImage(&oglTextureResource,
texId,
GL_TEXTURE_2D,
cudaGraphicsRegisterFlagsSurfaceLoadStore);
if (status != CUDA_SUCCESS) {
NvError() << "cuGraphicsGLRegisterImage() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Get mapped EGLFrame from EGLimage resource.
status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, eglImageResource, 0, 0);
if (status != CUDA_SUCCESS) {
NvError() << "cuGraphicsResourceGetMappedEglFrame() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Create y/uv surface object from eglFrame
CUDA_RESOURCE_DESC srcYSurfResDesc;
memset(&srcYSurfResDesc, 0, sizeof(srcYSurfResDesc));
srcYSurfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
srcYSurfResDesc.res.array.hArray = eglFrame.frame.pArray[0];
status = cuSurfObjectCreate(&srcYSurfObj, &srcYSurfResDesc);
if (status != CUDA_SUCCESS) {
NvError() << "cuSurfObjectCreate() for src y plane failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
CUDA_RESOURCE_DESC srcUVSurfResDesc;
memset(&srcUVSurfResDesc, 0, sizeof(srcUVSurfResDesc));
srcUVSurfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
srcUVSurfResDesc.res.array.hArray = eglFrame.frame.pArray[1];
status = cuSurfObjectCreate(&srcUVSurfObj, &srcUVSurfResDesc);
if (status != CUDA_SUCCESS) {
NvError() << "cuSurfObjectCreate() for src uv plane failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Map ogl texture cuda resource.
status = cuGraphicsMapResources(1, &oglTextureResource, 0);
if (status != CUDA_SUCCESS) {
NvError() << "cuGraphicsMapResources() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Get mapped cuArray from OGLTexture resource.
status = cuGraphicsSubResourceGetMappedArray(&cudaArray, oglTextureResource, 0, 0);
if (status != CUDA_SUCCESS) {
NvError() << "cuGraphicsSubResourceGetMappedArray() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Binding cuArray to cuda surface2D object.
CUDA_RESOURCE_DESC surfResDesc;
memset(&surfResDesc, 0, sizeof(surfResDesc));
surfResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
surfResDesc.res.array.hArray = cudaArray;
status = cuSurfObjectCreate(&dstSurfObj, &surfResDesc);
if (status != CUDA_SUCCESS) {
NvError() << "cuSurfObjectCreate() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Sync cuda context, wait for all thing ready.
status = cuCtxSynchronize();
if (status != CUDA_SUCCESS) {
NvError() << "cuCtxSynchronize() failed, error code = " << status;
hr = NV_E_FAIL;
break;
}
// Perform color space convert by cuda kernel.
ColorSpaceConvertParam convertParam;
// FIX ME: other color mode need support later.
FillColorSpaceConvertParam(YuvColorMode_601_VideoRange, convertParam);
// FIX ME: odd width height need support later.
NV_ASSERT((eglFrame.width & 1) == 0);
NV_ASSERT((eglFrame.height & 1) == 0);
// FXI ME: hdr surce need support later.
NV_ASSERT(eglFrame.cuFormat == CU_AD_FORMAT_UNSIGNED_INT8);
CUDAConvertColorSpaceNV12ToBGRA(srcYSurfObj,
srcUVSurfObj,
dstSurfObj,
convertParam,
eglFrame.width,
eglFrame.height);
// Sync cuda context, wait for all thing complete.
status = cuCtxSynchronize();
if (status != CUDA_SUCCESS)
NvError() << "cuCtxSynchronize() failed, error code = " << status;
// Unmap ogl texture cuda resource.
status = cuGraphicsUnmapResources(1, &oglTextureResource, 0);
if (status != CUDA_SUCCESS)
NvError() << "cuGraphicsUnmapResources() failed, error code = " << status;
// Unregister OGLTexture.
status = cuGraphicsUnregisterResource(oglTextureResource);
if (status != CUDA_SUCCESS)
NvError() << "cuGraphicsUnregisterResource() for ogl texture failed, error code = " << status;
// Unregister EGLImage resource.
status = cuGraphicsUnregisterResource(eglImageResource);
if (status != CUDA_SUCCESS)
NvError() << "cuGraphicsUnregisterResource() for elg image failed, error code = " << status;
// Destory surface object.
status = cuSurfObjectDestroy(srcYSurfObj);
if (status != CUDA_SUCCESS)
NvError() << "cuSurfObjectDestroy() for y plane failed, error code = " << status;
status = cuSurfObjectDestroy(srcUVSurfObj);
if (status != CUDA_SUCCESS)
NvError() << "cuSurfObjectDestroy() for uv plane failed, error code = " << status;
status = cuSurfObjectDestroy(dstSurfObj);
if (status != CUDA_SUCCESS)
NvError() << "cuSurfObjectDestroy() failed, error code = " << status;
if(0 != NvDestroyEGLImage(eglGetCurrentDisplay(), eglImage))
NvError() << "NvDestroyEGLImage() failed";
#else
NV_ASSERT(false);
//TODO:
#endif
} while(0);
cuCtxPopCurrent(&oldContext);
return hr;
cu code:
device inline float clamp(float val, float mn, float mx)
{
return (val >= mn) ? ((val <= mx) ? val : mx) : mn;
}
global
void ConvertColorSpaceNV12ToBGRAKernel(CUsurfObject srcYSurfObj,
CUsurfObject srcUVSurfObj,
CUsurfObject dstSurfObj,
ColorSpaceConvertParam param,
int imageWidth,
int imageheight)
{
// Calculate image coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// Cause we process NV12 src image ,process 4 pixel in each cuda thread.
unsigned int processWidth = imageWidth / 2;
unsigned int processheight = imageheight / 2;
if (x < processWidth && y < processheight) {
float dstR = 0, dstG = 0, dstB = 0, dstA = 255.f;
uchar4 srcY;
uchar2 srcUV;
float4 srcFloatY;
float2 srcFloatUV;
// Read from src image memory
surf2Dread(&srcY.x, srcYSurfObj, 2 * x, 2 * y);
surf2Dread(&srcY.y, srcYSurfObj, 2 * x + 1, 2 * y);
surf2Dread(&srcY.z, srcYSurfObj, 2 * x, 2 * y + 1);
surf2Dread(&srcY.w, srcYSurfObj, 2 * x + 1, 2 * y + 1);
surf2Dread(&srcUV.x, srcUVSurfObj, 2 * x, y);
surf2Dread(&srcUV.y, srcUVSurfObj, 2 * x + 1, y);
// Perform color space convert
srcFloatY.x = srcY.x - param.yuvOffset[0];
srcFloatY.y = srcY.y - param.yuvOffset[0];
srcFloatY.z = srcY.z - param.yuvOffset[0];
srcFloatY.w = srcY.w - param.yuvOffset[0];
srcFloatUV.x = srcUV.x - param.yuvOffset[1];
srcFloatUV.y = srcUV.y - param.yuvOffset[2];
// 1st pixel
dstR = srcFloatY.x * param.matYuvToRgb.data[0][0] +
srcFloatUV.x * param.matYuvToRgb.data[0][1] +
srcFloatUV.y * param.matYuvToRgb.data[0][2];
dstG = srcFloatY.x * param.matYuvToRgb.data[1][0] +
srcFloatUV.x * param.matYuvToRgb.data[1][1] +
srcFloatUV.y * param.matYuvToRgb.data[1][2];
dstB = srcFloatY.x * param.matYuvToRgb.data[2][0] +
srcFloatUV.x * param.matYuvToRgb.data[2][1] +
srcFloatUV.y * param.matYuvToRgb.data[2][2];
dstR = clamp(dstR, 0.f, 255.f);
dstG = clamp(dstG, 0.f, 255.f);
dstB = clamp(dstB, 0.f, 255.f);
surf2Dwrite(make_uchar4(dstR, dstG, dstB, dstA), dstSurfObj, 2 * x * 4, 2 * (processheight - y - 1));
// 2nd pixel
dstR = srcFloatY.y * param.matYuvToRgb.data[0][0] +
srcFloatUV.x * param.matYuvToRgb.data[0][1] +
srcFloatUV.y * param.matYuvToRgb.data[0][2];
dstG = srcFloatY.y * param.matYuvToRgb.data[1][0] +
srcFloatUV.x * param.matYuvToRgb.data[1][1] +
srcFloatUV.y * param.matYuvToRgb.data[1][2];
dstB = srcFloatY.y * param.matYuvToRgb.data[2][0] +
srcFloatUV.x * param.matYuvToRgb.data[2][1] +
srcFloatUV.y * param.matYuvToRgb.data[2][2];
dstR = clamp(dstR, 0.f, 255.f);
dstG = clamp(dstG, 0.f, 255.f);
dstB = clamp(dstB, 0.f, 255.f);
surf2Dwrite(make_uchar4(dstR, dstG, dstB, dstA), dstSurfObj, (2 * x + 1) * 4, 2 * (processheight - y - 1));
// 3rd pixel
dstR = srcFloatY.z * param.matYuvToRgb.data[0][0] +
srcFloatUV.x * param.matYuvToRgb.data[0][1] +
srcFloatUV.y * param.matYuvToRgb.data[0][2];
dstG = srcFloatY.z * param.matYuvToRgb.data[1][0] +
srcFloatUV.x * param.matYuvToRgb.data[1][1] +
srcFloatUV.y * param.matYuvToRgb.data[1][2];
dstB = srcFloatY.z * param.matYuvToRgb.data[2][0] +
srcFloatUV.x * param.matYuvToRgb.data[2][1] +
srcFloatUV.y * param.matYuvToRgb.data[2][2];
dstR = clamp(dstR, 0.f, 255.f);
dstG = clamp(dstG, 0.f, 255.f);
dstB = clamp(dstB, 0.f, 255.f);
surf2Dwrite(make_uchar4(dstR, dstG, dstB, dstA), dstSurfObj, 2 * x * 4, 2 * (processheight - y - 1) + 1);
// 4th pixel
dstR = srcFloatY.w * param.matYuvToRgb.data[0][0] +
srcFloatUV.x * param.matYuvToRgb.data[0][1] +
srcFloatUV.y * param.matYuvToRgb.data[0][2];
dstG = srcFloatY.w * param.matYuvToRgb.data[1][0] +
srcFloatUV.x * param.matYuvToRgb.data[1][1] +
srcFloatUV.y * param.matYuvToRgb.data[1][2];
dstB = srcFloatY.w * param.matYuvToRgb.data[2][0] +
srcFloatUV.x * param.matYuvToRgb.data[2][1] +
srcFloatUV.y * param.matYuvToRgb.data[2][2];
dstR = clamp(dstR, 0.f, 255.f);
dstG = clamp(dstG, 0.f, 255.f);
dstB = clamp(dstB, 0.f, 255.f);
surf2Dwrite(make_uchar4(dstR, dstG, dstB, dstA), dstSurfObj, (2 * x + 1) * 4, 2 * (processheight - y - 1) + 1);
}
}
void CUDAConvertColorSpaceNV12ToBGRA(CUsurfObject srcYSurfObj,
CUsurfObject srcUVSurfObj,
CUsurfObject dstSurfObj,
ColorSpaceConvertParam ¶m,
int imageWidth,
int imageHeight)
{
dim3 threadsperBlock(16, 16);
dim3 numBlocks((imageWidth / 2 + threadsperBlock.x - 1) / threadsperBlock.x,
(imageHeight / 2 + threadsperBlock.y - 1) / threadsperBlock.y);
ConvertColorSpaceNV12ToBGRAKernel<<<numBlocks, threadsperBlock>>>(srcYSurfObj,
srcUVSurfObj,
dstSurfObj,
param,
imageWidth,
imageHeight);
}