I found that the problem is caused by a premature end of the data segment. It seems that nppiEncodeHuffmanScan_JPEG_8u16s_P3R sometimes returns an incorrect length.
Here is my code, with error handling omitted, in case someone can spot the problem:
void CudaJpeg::Compress(
void* pY, int pitchY, //input Y plane and its pitch
void* pU, int pitchU, //input U plane and its pitch
void* pV, int pitchV, //input V plane and its pitch
void* pJpeg, //pointer to the result buffer, large enough
int* pJpegLength //returns length of jpeg file written to buffer
)
{
Npp8u *pdScan = (Npp8u *)pJpeg+jpegPreSize;
Npp32s nScanLength=0;
Npp8u *apDstImage[3] = {(Npp8u*)pY, (Npp8u*)pU, (Npp8u*)pV};
Npp32s aDstImageStep[3] = {pitchY, pitchU, pitchV};
//write precalculated jpeg headers
cudaMemcpyAsync(pJpeg, pdJpegPre, jpegPreSize,
cudaMemcpyDeviceToDevice, cudaStream);
//perform DCT & Co
for (int i = 0; i<3; i++)
{
nppSetStream(cudaStream);
nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(
apDstImage[i], aDstImageStep[i],
apdDCT[i], aDCTStep[i],
pdQuantizationTables + oFrameHeader.aQuantizationTableSelector[i] * 64,
aSrcSize[i],
pDCTState);
}
//encode and write jpeg data
nppSetStream(cudaStream);
nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
0, oScanHeader.nSs, oScanHeader.nSe,
oScanHeader.nA >> 4, oScanHeader.nA & 0x0f,
pdScan, &nScanLength,
apHuffmanDCTable,
apHuffmanACTable,
aSrcSize,
pJpegEncoderTemp);
//copy "end of image" marker
cudaMemcpyAsync(pdScan+nScanLength, pdJpegPost,
jpegPostSize, cudaMemcpyDeviceToDevice, cudaStream);
//calculates total size: header + scanLength + end marker
*pJpegLength = nScanLength+jpegPreSize+jpegPostSize;
}