I’m attempting to convert a NV12 bitmap produced by the NVIDIA H.264 decode,r NVDEC, to a 24-bit packed RGB bitmap. I’m confident the decoder produces good NV12 frames most of the time because modifying the YUV output and re-encoding them with NVENC works or nvjpegEncodeYUV() works on most frames (some have vertical lines on the bottom part as strides are repeated sometimes, but at least no errors).
Calling nppiNV12ToRGB_8u_P2C3R() causes a subsequent 700, CUDA_ERROR_ILLEGAL_ADDRESS, return code from cudaDeviceSynchronize(), cuMemcpyDtoH() or cuMemcpy2D(). The error still occurs attempting to copy even if cudaDeviceSynchronize() is not called.
The return code from nppiNV12ToRGB_8u_P2C3R() is 0. If the call to nppiNV12ToRGB_8u_P2C3R() is commented out, cuMemcpyDtoH() or cuMemcpy2D() each return 0 and copy the requested number of bytes (all zeros).
Using cuMemCpy2DAsync() (return code=0) and cuStreamSynchronize() (return code=700) was similar.
What am I doing wrong in the setup or call to nppiNV12ToRGB_8u_P2C3R()? Is nppiNV12ToRGB_8u_P2C3R() a good method to convert in device memory?
Quadro P5000, Driver 26.21.14.3602 (NVIDIA 436.02 / Win 7 64, CUDA 10.1)
Thank you
int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo)
{
CUVIDPROCPARAMS videoProcessingParameters = {};
videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame;
videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1;
videoProcessingParameters.top_field_first = pDispInfo->top_field_first;
videoProcessingParameters.unpaired_field = pDispInfo->repeat_first_field < 0;
videoProcessingParameters.output_stream = m_cuvidStream;
CUdeviceptr dpSrcFrame = 0;
unsigned int nSrcPitch = 0;
//Get the decoded YUV frame from NVDEC will be referenced by dpSrcFrame.
NVDEC_API_CALL(cuvidMapVideoFrame(m_hDecoder, pDispInfo->picture_index, &dpSrcFrame,
&nSrcPitch, &videoProcessingParameters));
if (m_allocated == false)
{
m_allocated = true;
//also tried cuMemAllocPitch()
m_rgb24 = nppiMalloc_8u_C3(1920,1080,&m_rgb24_pitch);
DebugOutput(L"1 HandlePictureDisplay m_rgb24=%p, m_rgb24_pitch=%d",m_rgb24,m_rgb24_pitch);
//pinned memory
cudaError_t w_alloc_err = cudaHostAlloc(&m_host,m_rgb24_pitch * 1088,
cudaHostAllocPortable | cudaHostAllocMapped);
DebugOutput(L"2 HandlePictureDisplay m_host=%p, w_alloc_err=%d",m_host,w_alloc_err);
}
CUdeviceptr pLuma = dpSrcFrame;
//m_nSurfaceHeight is 1088, set in HandleVideoSequence()
CUdeviceptr pChromaUV = dpSrcFrame + nSrcPitch * m_nSurfaceHeight;
//Convert NV12 to RGB.
Npp8u *w_yuv_src[2] = {(Npp8u*)&pLuma,(Npp8u*)&pChromaUV};
NppiSize w_roi = {1920,1080};
NppStatus w_NppStatus = nppiNV12ToRGB_8u_P2C3R(w_yuv_src, nSrcPitch,
(Npp8u*)m_rgb24, m_rgb24_pitch, w_roi);
DebugOutput(L"3 HandlePictureDisplay w_NppStatus=%d",w_NppStatus);
<b>cudaError_t w_cuErr = cudaDeviceSynchronize(); //Return code is 700, CUDA_ERROR_ILLEGAL_ADDRESS.</b>
DebugOutput(L"4 HandlePictureDisplay cudaDeviceSynchronize w_cuErr=%d",w_cuErr);
//Try just 100 bytes. Return code is 700, CUDA_ERROR_ILLEGAL_ADDRESS.
<b>CUresult w_DhRc = cuMemcpyDtoH(m_host, (CUdeviceptr)m_rgb24, 100);</b>
DebugOutput(L"5 HandlePictureDisplay w_DhRc=%d",w_DhRc);
//Try copying the whole RGB bitmap.
CUDA_MEMCPY2D m = { 0 };
m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
m.srcDevice = (CUdeviceptr) m_rgb24;
m.srcPitch = m_rgb24_pitch;
m.dstMemoryType = CU_MEMORYTYPE_HOST;
m.dstHost = m_host;
m.dstPitch = 1920*3;
m.WidthInBytes =1920*3;
m.Height = 1080;
<b>CUresult w_cuRc = cuMemcpy2D(&m); //return code is 700, CUDA_ERROR_ILLEGAL_ADDRESS</b>
DebugOutput(L"6 HandlePictureDisplay w_cuRc=%d",w_cuRc);
NVDEC_API_CALL(cuvidUnmapVideoFrame(m_hDecoder, dpSrcFrame));
return 0;
}
output:
1 HandlePictureDisplay m_rgb24=000000060C000000, m_rgb24_pitch=6144
2 HandlePictureDisplay m_host=0000000203C00000, w_alloc_err=0
3 HandlePictureDisplay w_NppStatus=0
4 HandlePictureDisplay cudaDeviceSynchronize w_cuErr=700
5 HandlePictureDisplay w_DhRc=700
6 HandlePictureDisplay w_cuRc=700