Issue with nppiYUV420ToRGB_8u_P3AC4R

Hi,
I am trying to convert YUV420p to RGBA and then back to YUV420p, in order to check whether the YUV420p-to-RGBA conversion is working properly. I have observed that the image gets distorted: the Y, Cb, and Cr values have changed after converting back. Can someone please help me figure out the solution to this issue?
NppiSize oSizeROI = { 1920, 1080 };

unsigned char *ptrY = NULL, *ptrCb = NULL, *ptrCr = NULL;

unsigned char *ptrYz = NULL, *ptrCbz = NULL, *ptrCrz = NULL;
ptrYz = (unsigned char*) malloc(
		sizeof(unsigned char) * BCK_WIDTH * BCK_HEIGHT);
ptrCbz = (unsigned char*) malloc((BCK_WIDTH * BCK_HEIGHT) / 4);
ptrCrz = (unsigned char*) malloc((BCK_WIDTH * BCK_HEIGHT) / 4);
char* puYuv420 = buildYUV420("Yuv420-Input.yuv", 0xff, 0x80, 0x80);
getYCbCrFromYuv420pImage("result.yuv", &ptrY, &ptrCb, &ptrCr);

Npp8u *pDevice_MasterY = NULL, *pDevice_MasterU = NULL, *pDevice_MasterV =
		NULL, *pDevice_Master_RGBA = NULL;

cudaError_t uError1 = cudaMalloc(&pDevice_MasterY, BCK_HEIGHT * BCK_WIDTH);
if (uError1 != cudaSuccess)
	printf("Could not allocate memory: %d", uError1);
cudaMemcpy(pDevice_MasterY, ptrY, BCK_HEIGHT * BCK_WIDTH,
		cudaMemcpyHostToDevice);

cudaMalloc(&pDevice_MasterU, (BCK_HEIGHT * BCK_WIDTH) / 4);
cudaMemcpy(pDevice_MasterU, ptrCb, (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyHostToDevice);

cudaMalloc(&pDevice_MasterV, (BCK_HEIGHT * BCK_WIDTH) / 4);
cudaMemcpy(pDevice_MasterV, ptrCr, (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyHostToDevice);
uError1 = cudaMalloc(&pDevice_Master_RGBA, BCK_HEIGHT * BCK_WIDTH * 4);
Npp8u* pDevice_Master_YUVArray[3] = { 0 };
pDevice_Master_YUVArray[0] = pDevice_MasterY;
pDevice_Master_YUVArray[1] = pDevice_MasterU;
pDevice_Master_YUVArray[2] = pDevice_MasterV;

int SrcArrayDevStep[3] = { BCK_WIDTH, BCK_WIDTH / 2, BCK_WIDTH / 2 };

NppStatus uStatus = nppiYUV420ToRGB_8u_P3AC4R(pDevice_Master_YUVArray,
		SrcArrayDevStep, pDevice_Master_RGBA, BCK_WIDTH * 4, oSizeROI);
if (uStatus != 0) {
	printf("oops %d\n", (int) uStatus);
	return 1;
}


cudaFree(pDevice_MasterY);
cudaFree(pDevice_MasterU);
cudaFree(pDevice_MasterV);


Npp8u *pDevice_OverlayedY = NULL, *pDevice_OverlayedU = NULL,
		*pDevice_OverlayedV = NULL;

uError1 = cudaMalloc(&pDevice_OverlayedY, BCK_HEIGHT * BCK_WIDTH);
if (uError1 != cudaSuccess)
	printf("Could not allocate memory: %d", uError1);
cudaMalloc(&pDevice_OverlayedU, (BCK_HEIGHT * BCK_WIDTH) / 4);

cudaMalloc(&pDevice_OverlayedV, (BCK_HEIGHT * BCK_WIDTH) / 4);
Npp8u* pDevice_Overlayed_YUVArray[3] = { 0 };
pDevice_Overlayed_YUVArray[0] = pDevice_OverlayedY;
pDevice_Overlayed_YUVArray[1] = pDevice_OverlayedU;
pDevice_Overlayed_YUVArray[2] = pDevice_OverlayedV;

uStatus = nppiRGBToYCrCb420_8u_AC4P3R(pDevice_Master_RGBA, BCK_WIDTH * 4,
		pDevice_Overlayed_YUVArray, SrcArrayDevStep, oSizeROI);

cudaMemcpy(ptrYz, pDevice_Overlayed_YUVArray[0], BCK_HEIGHT * BCK_WIDTH,
		cudaMemcpyDeviceToHost);

cudaMemcpy(ptrCbz, pDevice_Overlayed_YUVArray[2], (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyDeviceToHost);

cudaMemcpy(ptrCrz, pDevice_Overlayed_YUVArray[1], (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyDeviceToHost);
// Stage 3 of the round-trip check: dump the three host planes back-to-back
// into "input.yuv" (luma first, then ptrCrz, then ptrCbz).
FILE * file;

// Binary mode: the planes are raw bytes, and text mode would translate
// newline bytes on platforms that distinguish text from binary streams.
file = fopen("input.yuv", "wb");
if (file == NULL) {
	printf("Could not open input.yuv for writing\n");
} else {
	// NOTE(review): the dimensions are hard-coded here and are assumed to
	// match the BCK_WIDTH x BCK_HEIGHT sizes of the host buffers — confirm.
	int width = 1920, height = 1080, half_width = 960, half_height = 540;
	size_t luma_count = (size_t) width * height;
	size_t chroma_count = (size_t) half_width * half_height;

	// Sequential writes already leave the file position at the end of the
	// previous plane, so no explicit seeking is needed between them.
	size_t written = fwrite(ptrYz, sizeof(unsigned char), luma_count, file);
	written += fwrite(ptrCrz, sizeof(unsigned char), chroma_count, file);
	written += fwrite(ptrCbz, sizeof(unsigned char), chroma_count, file);
	if (written != luma_count + 2 * chroma_count) {
		printf("Short write to input.yuv: %lu of %lu bytes\n",
				(unsigned long) written,
				(unsigned long) (luma_count + 2 * chroma_count));
	}
	// fclose flushes buffered data, so a failure here also means data loss.
	if (fclose(file) != 0) {
		printf("Could not close input.yuv\n");
	}
}

You convert a YUV420 image to RGB:

NppStatus uStatus = nppiYUV420ToRGB_8u_P3AC4R(

then you convert the RGB image to YCrCb420:

uStatus = nppiRGBToYCrCb420_8u_AC4P3R

Why would you think a YUV image should be the same numerically as a YCrCb image?

If you study the documentation carefully:

http://docs.nvidia.com/cuda/pdf/NPP_Library.pdf

starting at around page 498, section 7.43 Color and Sampling Conversion, you will find that the formulas to convert an RGB to YUV image don’t match the formulas to convert an RGB to YCrCb image. Therefore I would not expect:

YUV->RGB->YCrCb

to produce an image whose YCrCb values are numerically identical to the original YUV values.

For example, the formula for the Y component of RGB to YUV is given as:

Npp32f nY = 0.299F * R + 0.587F * G + 0.114F * B;

on page 500.

The corresponding formula for the Y component of RGB to YCrCb is given as:

Npp32f nY = 0.257F * R + 0.504F * G + 0.098F * B + 16.0F;

on page 504. It’s evident they do not match.

Moreover, even if the formulas did match, the conversion would still be lossy unless you had infinite precision.