Issue with nppiYUV420ToRGB_8u_P3AC4R()

Hi,
I am trying to convert YUV420p to RGBA and then back to YUV420p, in order to check that the YUV420p-to-RGBA conversion works properly. I have observed that the image gets distorted: the Y, Cb and Cr values have changed after converting back. Can someone please help me figure out a solution to this issue?
NppiSize oSizeROI = { 1920, 1080 };

unsigned char *ptrY = NULL, *ptrCb = NULL, *ptrCr = NULL;

unsigned char *ptrYz = NULL, *ptrCbz = NULL, *ptrCrz = NULL;
ptrYz = (unsigned char*) malloc(
		sizeof(unsigned char) * BCK_WIDTH * BCK_HEIGHT);
ptrCbz = (unsigned char*) malloc((BCK_WIDTH * BCK_HEIGHT) / 4);
ptrCrz = (unsigned char*) malloc((BCK_WIDTH * BCK_HEIGHT) / 4);
char* puYuv420 = buildYUV420("Yuv420-Input.yuv", 0xff, 0x80, 0x80);
getYCbCrFromYuv420pImage("result.yuv", &ptrY, &ptrCb, &ptrCr);

Npp8u *pDevice_MasterY = NULL, *pDevice_MasterU = NULL, *pDevice_MasterV =
		NULL, *pDevice_Master_RGBA = NULL;

cudaError_t uError1 = cudaMalloc(&pDevice_MasterY, BCK_HEIGHT * BCK_WIDTH);
if (uError1 != cudaSuccess)
	printf("Could not allocate memory: %d", uError1);
cudaMemcpy(pDevice_MasterY, ptrY, BCK_HEIGHT * BCK_WIDTH,
		cudaMemcpyHostToDevice);

cudaMalloc(&pDevice_MasterU, (BCK_HEIGHT * BCK_WIDTH) / 4);
cudaMemcpy(pDevice_MasterU, ptrCb, (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyHostToDevice);

cudaMalloc(&pDevice_MasterV, (BCK_HEIGHT * BCK_WIDTH) / 4);
cudaMemcpy(pDevice_MasterV, ptrCr, (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyHostToDevice);
uError1 = cudaMalloc(&pDevice_Master_RGBA, BCK_HEIGHT * BCK_WIDTH * 4);
Npp8u* pDevice_Master_YUVArray[3] = { 0 };
pDevice_Master_YUVArray[0] = pDevice_MasterY;
pDevice_Master_YUVArray[1] = pDevice_MasterU;
pDevice_Master_YUVArray[2] = pDevice_MasterV;

int SrcArrayDevStep[3] = { BCK_WIDTH, BCK_WIDTH / 2, BCK_WIDTH / 2 };

NppStatus uStatus = nppiYUV420ToRGB_8u_P3AC4R(pDevice_Master_YUVArray,
		SrcArrayDevStep, pDevice_Master_RGBA, BCK_WIDTH * 4, oSizeROI);
if (uStatus != 0) {
	printf("oops %d\n", (int) uStatus);
	return 1;
}


cudaFree(pDevice_MasterY);
cudaFree(pDevice_MasterU);
cudaFree(pDevice_MasterV);


Npp8u *pDevice_OverlayedY = NULL, *pDevice_OverlayedU = NULL,
		*pDevice_OverlayedV = NULL;

uError1 = cudaMalloc(&pDevice_OverlayedY, BCK_HEIGHT * BCK_WIDTH);
if (uError1 != cudaSuccess)
	printf("Could not allocate memory: %d", uError1);
cudaMalloc(&pDevice_OverlayedU, (BCK_HEIGHT * BCK_WIDTH) / 4);

cudaMalloc(&pDevice_OverlayedV, (BCK_HEIGHT * BCK_WIDTH) / 4);
Npp8u* pDevice_Overlayed_YUVArray[3] = { 0 };
pDevice_Overlayed_YUVArray[0] = pDevice_OverlayedY;
pDevice_Overlayed_YUVArray[1] = pDevice_OverlayedU;
pDevice_Overlayed_YUVArray[2] = pDevice_OverlayedV;

uStatus = nppiRGBToYCrCb420_8u_AC4P3R(pDevice_Master_RGBA, BCK_WIDTH * 4,
		pDevice_Overlayed_YUVArray, SrcArrayDevStep, oSizeROI);

cudaMemcpy(ptrYz, pDevice_Overlayed_YUVArray[0], BCK_HEIGHT * BCK_WIDTH,
		cudaMemcpyDeviceToHost);

cudaMemcpy(ptrCbz, pDevice_Overlayed_YUVArray[2], (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyDeviceToHost);

cudaMemcpy(ptrCrz, pDevice_Overlayed_YUVArray[1], (BCK_HEIGHT * BCK_WIDTH) / 4,
		cudaMemcpyDeviceToHost);
FILE * file;

file = fopen("input.yuv", "w+");
if (file == NULL) {
} else {
	fseek(file, 0, SEEK_SET);
	int width = 1920, height = 1080, half_width = 960, half_height = 540;
	int bytes_read = fwrite(ptrYz, sizeof(unsigned char), width * height,
			file);
	fseek(file, width * height, SEEK_SET);
	bytes_read = fwrite(ptrCrz, sizeof(unsigned char),
			half_width * half_height, file);
	fseek(file, (width * height) + (half_width * half_height), SEEK_SET);
	bytes_read = fwrite(ptrCbz, sizeof(unsigned char),
			half_width * half_height, file);
	fclose(file);
}