NPP: Conversion from BGRA to YUV420 gives quarter of the result

Hello,

I am trying to convert a BGRA image to YUV420 using the nppiBGRToYUV420_8u_AC4P3R function.
Unfortunately, when images are saved to disk(encoded via with LibVpx), only the quarter of the result has been converted. The CPU version in contrary works well. Here the code snippet:

Conversion classes:

#define YUV_NB_CHANNELS 3
#define YUV_Y_IDX 0
#define YUV_U_IDX 1
#define YUV_V_IDX 2

struct ConverterConstructArgs
{
	uint32 Width;
	uint32 Height;
	uint32 SrcPitch;
	uint32 DstPitch[YUV_NB_CHANNELS];
};

struct Bgra8ToYuv420Converter
{
	Bgra8ToYuv420Converter(const ConverterConstructArgs& Data)
	: Width(Data.Width)
	, Height(Data.Height)
	, SrcPitch(Data.SrcPitch)
	{
		std::memcpy(DstPitch, Data.DstPitch, sizeof(uint32) * YUV_NB_CHANNELS);
	}

	virtual ~Bgra8ToYuv420Converter() {};
	virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[YUV_NB_CHANNELS]) = 0;

protected:
	uint32 Width;
	uint32 Height;
	uint32 SrcPitch;
	uint32 DstPitch[YUV_NB_CHANNELS];
};

struct GpuBgra8ToYuv420 : Bgra8ToYuv420Converter
{
	GpuBgra8ToYuv420(const ConverterConstructArgs& Data)
	: Bgra8ToYuv420Converter(Data)
	{
		// Compute half resolution
		this->HalfWidth = Width >> 1;
		this->HalfHeight = Height >> 1;

		// Allocate bgra memory
		this->CudaBGRAImage = nppiMalloc_8u_C4(Width, Height, &this->CudaBGRAPitch);
		check(this->CudaBGRAImage);

		// Allocate YUVs Y channel memory
		this->CudaYUVImage[YUV_Y_IDX] = nppiMalloc_8u_C1(Width, Height, &this->CudaYUVPitch[YUV_Y_IDX]);
		check(this->CudaYUVImage[YUV_Y_IDX]);

		// Allocate subsampled YUVs U and V channel memory
		for (uint8 i = YUV_U_IDX; i <= YUV_V_IDX; ++i)
		{
			this->CudaYUVImage[i] = nppiMalloc_8u_C1(this->HalfWidth, this->HalfHeight, &this->CudaYUVPitch[i]);
			check(this->CudaYUVImage[i]);
		}
	};

	~GpuBgra8ToYuv420()
	{
		nppiFree(this->CudaBGRAImage);

		for (uint8 i = 0u; i < YUV_NB_CHANNELS; ++i)
		{
			nppiFree(this->CudaYUVImage[i]);
		}
	}

#define CUDA_CHECK(err) { check(err == cudaSuccess); }
	virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[3]) override
	{

		// Copy host BGRA image to device
		CUDA_CHECK(cudaMemcpy2D(this->CudaBGRAImage,
			this->CudaBGRAPitch,
			SrcBgra,
			this->SrcPitch,
			this->Width,
			this->Height,
			cudaMemcpyHostToDevice));

		// Perform conversion
		{
			const NppiSize oSizeROI{ this->Width, this->Height };

			const NppStatus status = nppiBGRToYUV420_8u_AC4P3R(this->CudaBGRAImage,
				this->CudaBGRAPitch,
				this->CudaYUVImage,
				this->CudaYUVPitch,
				oSizeROI
			);

			if (status != NPP_SUCCESS)
			{
				return;
			}
		}

		// Copy back result to host
		// Y channel
		CUDA_CHECK(cudaMemcpy2D(DstYuv[YUV_Y_IDX],
			this->DstPitch[YUV_Y_IDX],
			this->CudaYUVImage[YUV_Y_IDX],
			this->CudaYUVPitch[YUV_Y_IDX],
			this->Width,
			this->Height,
			cudaMemcpyDeviceToHost));

		// U and V channels
		for (uint8 i = YUV_U_IDX; i <= YUV_V_IDX; ++i)
		{
			CUDA_CHECK(cudaMemcpy2D(DstYuv[i],
				this->DstPitch[i],
				this->CudaYUVImage[i],
				this->CudaYUVPitch[i],
				this->HalfWidth,
				this->HalfHeight,
				cudaMemcpyDeviceToHost));
		}
	}

private:
	Npp8u* CudaBGRAImage;
	int CudaBGRAPitch;

	Npp8u* CudaYUVImage[YUV_NB_CHANNELS];
	int CudaYUVPitch[YUV_NB_CHANNELS];

	uint32 HalfWidth;
	uint32 HalfHeight;
};

// CPU version
// @See: https://github.com/sigint9/shadercap/blob/master/VideoEncoder.cpp
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))

struct CpuBgra8ToYuv420 : Bgra8ToYuv420Converter
{
	CpuBgra8ToYuv420(const ConverterConstructArgs& Data)
	: Bgra8ToYuv420Converter(Data)
	{
	}

	virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[3]) override
	{
		unsigned int i;

		// Y pass.
		for (i = 0; i < this->Width * this->Height; ++i)
		{
			unsigned int r = SrcBgra[4 * i + 2];
			unsigned int g = SrcBgra[4 * i + 1];
			unsigned int b = SrcBgra[4 * i + 0];

			unsigned int y = ((RY*r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
			unsigned int u = ((RU*r + GU * g + BU * b) >> RGB2YUV_SHIFT) + 128;
			unsigned int v = ((RV*r + GV * g + BV * b) >> RGB2YUV_SHIFT) + 128;

			SrcBgra[4 * i + 2] = y;
			SrcBgra[4 * i + 1] = u;
			SrcBgra[4 * i + 0] = v;

			DstYuv[YUV_Y_IDX][i] = y;
		}

		// UV pass, 4 x 4 downsampling.
		i = 0;
		for (unsigned int y = 0; y < this->Height; y += 2)
		{
			for (unsigned int x = 0; x < this->Width; x += 2)
			{
				unsigned int sumU = 0, sumV = 0;

				// Left Root.
				//
				sumU += SrcBgra[4 * (y * this->Width + x) + 1];
				sumV += SrcBgra[4 * (y * this->Width + x) + 0];

				// Right Root.
				sumU += SrcBgra[4 * (y * this->Width + x + 1) + 1];
				sumV += SrcBgra[4 * (y * this->Width + x + 1) + 0];

				// Left Top.
				sumU += SrcBgra[4 * ((y + 1) * this->Width + x) + 1];
				sumV += SrcBgra[4 * ((y + 1) * this->Width + x) + 0];

				// Right Top.
				sumU += SrcBgra[4 * ((y + 1) * this->Width + x + 1) + 1];
				sumV += SrcBgra[4 * ((y + 1) * this->Width + x + 1) + 0];

				// Get average.
				DstYuv[YUV_U_IDX][i] = sumU / 4;
				DstYuv[YUV_V_IDX][i] = sumV / 4;

				i += 1;
			}
		}
	}
};

Here how the converter is called(not much important):

const unsigned int PlaneSize = ImageWidth * ImageHeight;
unsigned char* Bgra = BGRABuffer->img_data;

unsigned char* Yplane = YUVBuffer->img_data;
unsigned char* Uplane = Yplane + PlaneSize;
unsigned char* Vplane = Uplane + (PlaneSize >> 2);

unsigned char* YUV[YUV_NB_CHANNELS] = {Yplane, Uplane, Vplane};

ImageConverter->Convert(Bgra, YUV);// Launch conversion.

Images.
Source
https://imgshare.io/image/kTH4e

GPU converted(resolution does not match since screenshot of a video).
https://imgshare.io/image/gpu.kTe3y
You can clearly see that only a quarter of the image has been converted.

ConverterConstructArgs used.

Width = 2048
Height = 2048
SrcPitch = 8192
DstPitch[YUV_NB_CHANNELS] = {2048, 1024, 1024}

Tests made

  • Upload BGRA to GPU then read back to CPU and call CPU converter --> Success. Meaning it is not a source transfer issue.

  • Call CPU converter first. Then upload YUV result to GPU and readback to CPU --> Success. Meaning it is not a result transfer issue.

This would mean that the following block causes the problem:

// Perform conversion
{
	const NppiSize oSizeROI{ this->Width, this->Height };

	const NppStatus status = nppiBGRToYUV420_8u_AC4P3R(this->CudaBGRAImage,
		this->CudaBGRAPitch,
		this->CudaYUVImage,
		this->CudaYUVPitch,
		oSizeROI
	);

	if (status != NPP_SUCCESS)
	{
		return;
	}
}

Is it a NPP issue or am-I missing something?
By the way the GPU version is 6% faster but needs to be confirmed when bug is fixed ;).

Thank you!

If you want to provide a short, complete code, that I can copy, paste, compile, and run, and see the issue, without having to add anything or change anything, I’ll take a look as time permits. I don’t have time to assemble a bunch of pieces and add things that are missing. I’m not asking for your whole code, I’m asking for a minimal example. Just the necessary pieces, but it has to be a complete program. This is well documented in many places on the web, here is one example:

http://sscce.org/

This is just a request/suggestion, if you don’t want to do it, that’s fine. Perhaps someone else will be able to help.

Hello Robert,

Unfortunately it won’t be possible to provide full code. But see that all the necessary elements are provided, as well as runtime values. I am also giving images showing that only a quarter of the image is converted. Since BGR to YUV420 implies subsampling there is good chances that the problem is from that. You could try yourself to convert a BGRA image with nppiBGRToYUV420_8u_AC4P3R and should obtain the same bug. As mentioned in the explanation the following block seems to be the problem:

// Perform conversion
{
	const NppiSize oSizeROI{ this->Width, this->Height };

	const NppStatus status = nppiBGRToYUV420_8u_AC4P3R(this->CudaBGRAImage,
		this->CudaBGRAPitch,
		this->CudaYUVImage,
		this->CudaYUVPitch,
		oSizeROI
	);

	if (status != NPP_SUCCESS)
	{
		return;
	}
}

How would I do that? Do you mean that I should write an application that calls that function?

Not a full application. For example, the free image sample could be used and it will become a working example. It would not make sense for me to write another example that is broken. See: https://docs.nvidia.com/cuda/cuda-samples/index.html#freeimage-and-npp-interopability

My current guess is:

  • It is a library issue.
  • Or the library is expecting something (for eg images resolution) that I am giving wrong.

All the code I gave is written to be very human-readable.

Thanks if you decide to help :)

I see, what should I write, exactly? (to test your code)

Thank you very much. You could follow these steps:

  1. Make the freeImageInteropNPP project links against nppicc.lib and nppisu.lib.

  2. Read a BGRA8 image with FreeImage(See original sample). This one can be used: https://imgshare.io/image/source.kTH4e

  3. Launch conversion using my code.

  4. Check the values.

Here a freeImageInteropNPP.cpp i made that include my classes. To make the sample compatible it also includes additional code not tested but documented. Whats is missing: Free image loading and conversion verification.

/**
 * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// A simple CUDA Sample demonstrates how to use FreeImage library with NPP. Detailed description of this
// example can be found as comments in the source code.

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#pragma warning(disable:4819)
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#endif

#include "FreeImage.h"
#include "Exceptions.h"

#include <string.h>
#include <fstream>
#include <iostream>

#include <cuda_runtime.h>
#include <npp.h>               // CUDA NPP Definitions

#include <helper_cuda.h>       // helper for CUDA Error handling and initialization
#include <helper_string.h>     // helper for string parsing

#include <assert.h>
#include <vector>

inline int cudaDeviceInit(int argc, const char **argv)
{
    int deviceCount;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0)
    {
        std::cerr << "CUDA error: no devices supporting CUDA." << std::endl;
        exit(EXIT_FAILURE);
    }

    int dev = findCudaDevice(argc, argv);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    std::cerr << "cudaSetDevice GPU" << dev << " = " << deviceProp.name << std::endl;

    checkCudaErrors(cudaSetDevice(dev));

    return dev;
}

bool printfNPPinfo(int argc, char *argv[], int cudaVerMajor, int cudaVerMinor)
{
    const NppLibraryVersion *libVer   = nppGetLibVersion();

    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);

	int driverVersion, runtimeVersion;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

	printf("  CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
	printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);

	bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
	return bVal;
}

// Error handler for FreeImage library.
//  In case this handler is invoked, it throws an NPP exception.
extern "C" void
FreeImageErrorHandler(FREE_IMAGE_FORMAT oFif, const char *zMessage)
{
    throw npp::Exception(zMessage);
}

std::ostream &
operator <<(std::ostream &rOutputStream, const FIBITMAP &rBitmap)
{
    unsigned int nImageWidth    = FreeImage_GetWidth(const_cast<FIBITMAP *>(&rBitmap));
    unsigned int nImageHeight   = FreeImage_GetHeight(const_cast<FIBITMAP *>(&rBitmap));
    unsigned int nPitch         = FreeImage_GetPitch(const_cast<FIBITMAP *>(&rBitmap));
    unsigned int nBPP           = FreeImage_GetBPP(const_cast<FIBITMAP *>(&rBitmap));

    FREE_IMAGE_COLOR_TYPE eType = FreeImage_GetColorType(const_cast<FIBITMAP *>(&rBitmap));

    rOutputStream << "Size  (" << nImageWidth << ", " << nImageHeight << ")\n";
    rOutputStream << "Pitch "  << nPitch << "\n";
    rOutputStream << "Type  ";

    switch (eType)
    {
        case FIC_MINISWHITE:
            rOutputStream << "FIC_MINISWHITE\n";
            break;

        case FIC_MINISBLACK:
            rOutputStream << "FIC_MINISBLACK\n";
            break;

        case FIC_RGB:
            rOutputStream << "FIC_RGB\n";
            break;

        case FIC_PALETTE:
            rOutputStream << "FIC_PALETTE\n";
            break;

        case FIC_RGBALPHA:
            rOutputStream << "FIC_RGBALPHA\n";
            break;

        case FIC_CMYK:
            rOutputStream << "FIC_CMYK\n";
            break;

        default:
            rOutputStream << "Unknown pixel format.\n";
    }

    rOutputStream << "BPP   " << nBPP << std::endl;

    return rOutputStream;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
//	Trylz Image Converter
///////////////////////////////////////////////////////////////////////////////////////////////////

#define YUV_NB_CHANNELS 3
#define YUV_Y_IDX 0
#define YUV_U_IDX 1
#define YUV_V_IDX 2

struct ConverterConstructArgs
{
	uint32_t Width;
	uint32_t Height;
	uint32_t SrcPitch;
	uint32_t DstPitch[YUV_NB_CHANNELS];
};

struct Bgra8ToYuv420Converter
{
	Bgra8ToYuv420Converter(const ConverterConstructArgs& Data)
		: Width(Data.Width)
		, Height(Data.Height)
		, SrcPitch(Data.SrcPitch)
	{
		std::memcpy(DstPitch, Data.DstPitch, sizeof(uint32_t) * YUV_NB_CHANNELS);
	}

	virtual ~Bgra8ToYuv420Converter() {};
	virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[YUV_NB_CHANNELS]) = 0;

protected:
	uint32_t Width;
	uint32_t Height;
	uint32_t SrcPitch;
	uint32_t DstPitch[YUV_NB_CHANNELS];
};

struct GpuBgra8ToYuv420 : Bgra8ToYuv420Converter
{
	GpuBgra8ToYuv420(const ConverterConstructArgs& Data)
		: Bgra8ToYuv420Converter(Data)
	{
		// Compute half resolution
		this->HalfWidth = Width >> 1;
		this->HalfHeight = Height >> 1;

		// Allocate bgra memory
		this->CudaBGRAImage = nppiMalloc_8u_C4(Width, Height, &this->CudaBGRAPitch);
		assert(this->CudaBGRAImage);

		// Allocate YUVs Y channel memory
		this->CudaYUVImage[YUV_Y_IDX] = nppiMalloc_8u_C1(Width, Height, &this->CudaYUVPitch[YUV_Y_IDX]);
		assert(this->CudaYUVImage[YUV_Y_IDX]);

		// Allocate subsampled YUVs U and V channel memory
		for (uint8_t i = YUV_U_IDX; i <= YUV_V_IDX; ++i)
		{
			this->CudaYUVImage[i] = nppiMalloc_8u_C1(this->HalfWidth, this->HalfHeight, &this->CudaYUVPitch[i]);
			assert(this->CudaYUVImage[i]);
		}
	};

	~GpuBgra8ToYuv420()
	{
		nppiFree(this->CudaBGRAImage);

		for (uint8_t i = 0u; i < YUV_NB_CHANNELS; ++i)
		{
			nppiFree(this->CudaYUVImage[i]);
		}
	}

#define CUDA_CHECK(err) { assert(err == cudaSuccess); }
	virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[3]) override
	{

		// Copy host BGRA image to device
		CUDA_CHECK(cudaMemcpy2D(this->CudaBGRAImage,
			this->CudaBGRAPitch,
			SrcBgra,
			this->SrcPitch,
			this->Width,
			this->Height,
			cudaMemcpyHostToDevice));

		// Perform conversion
		{
			const NppiSize oSizeROI{ this->Width, this->Height };

			const NppStatus status = nppiBGRToYUV420_8u_AC4P3R(this->CudaBGRAImage,
				this->CudaBGRAPitch,
				this->CudaYUVImage,
				this->CudaYUVPitch,
				oSizeROI
			);

			if (status != NPP_SUCCESS)
			{
				return;
			}
		}

		// Copy back result to host
		// Y channel
		CUDA_CHECK(cudaMemcpy2D(DstYuv[YUV_Y_IDX],
			this->DstPitch[YUV_Y_IDX],
			this->CudaYUVImage[YUV_Y_IDX],
			this->CudaYUVPitch[YUV_Y_IDX],
			this->Width,
			this->Height,
			cudaMemcpyDeviceToHost));

		// U and V channels
		for (uint8_t i = YUV_U_IDX; i <= YUV_V_IDX; ++i)
		{
			CUDA_CHECK(cudaMemcpy2D(DstYuv[i],
				this->DstPitch[i],
				this->CudaYUVImage[i],
				this->CudaYUVPitch[i],
				this->HalfWidth,
				this->HalfHeight,
				cudaMemcpyDeviceToHost));
		}
	}

private:
	Npp8u* CudaBGRAImage;
	int CudaBGRAPitch;

	Npp8u* CudaYUVImage[YUV_NB_CHANNELS];
	int CudaYUVPitch[YUV_NB_CHANNELS];

	uint32_t HalfWidth;
	uint32_t HalfHeight;
};

// CPU version
// @See: https://github.com/sigint9/shadercap/blob/master/VideoEncoder.cpp
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))

struct CpuBgra8ToYuv420 : Bgra8ToYuv420Converter
{
	CpuBgra8ToYuv420(const ConverterConstructArgs& Data)
		: Bgra8ToYuv420Converter(Data)
	{
	}

	virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[3]) override
	{
		unsigned int i;

		// Y pass.
		for (i = 0; i < this->Width * this->Height; ++i)
		{
			unsigned int r = SrcBgra[4 * i + 2];
			unsigned int g = SrcBgra[4 * i + 1];
			unsigned int b = SrcBgra[4 * i + 0];

			unsigned int y = ((RY*r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
			unsigned int u = ((RU*r + GU * g + BU * b) >> RGB2YUV_SHIFT) + 128;
			unsigned int v = ((RV*r + GV * g + BV * b) >> RGB2YUV_SHIFT) + 128;

			SrcBgra[4 * i + 2] = y;
			SrcBgra[4 * i + 1] = u;
			SrcBgra[4 * i + 0] = v;

			DstYuv[YUV_Y_IDX][i] = y;
		}

		// UV pass, 4 x 4 downsampling.
		i = 0;
		for (unsigned int y = 0; y < this->Height; y += 2)
		{
			for (unsigned int x = 0; x < this->Width; x += 2)
			{
				unsigned int sumU = 0, sumV = 0;

				// Left Root.
				//
				sumU += SrcBgra[4 * (y * this->Width + x) + 1];
				sumV += SrcBgra[4 * (y * this->Width + x) + 0];

				// Right Root.
				sumU += SrcBgra[4 * (y * this->Width + x + 1) + 1];
				sumV += SrcBgra[4 * (y * this->Width + x + 1) + 0];

				// Left Top.
				sumU += SrcBgra[4 * ((y + 1) * this->Width + x) + 1];
				sumV += SrcBgra[4 * ((y + 1) * this->Width + x) + 0];

				// Right Top.
				sumU += SrcBgra[4 * ((y + 1) * this->Width + x + 1) + 1];
				sumV += SrcBgra[4 * ((y + 1) * this->Width + x + 1) + 0];

				// Get average.
				DstYuv[YUV_U_IDX][i] = sumU / 4;
				DstYuv[YUV_V_IDX][i] = sumV / 4;

				i += 1;
			}
		}
	}
};

///////////////////////////////////////////////////////////////////////////////////////////////////
//	LibVpx
//  @See: https://chromium.googlesource.com/webm/libvpx/+/master
///////////////////////////////////////////////////////////////////////////////////////////////////

#define ADDRESS_STORAGE_SIZE sizeof(size_t)
#define VPX_MAX_ALLOCABLE_MEMORY (1ULL << 40)

static uint64_t get_aligned_malloc_size(size_t size, size_t align) {
	return (uint64_t)size + align - 1 + ADDRESS_STORAGE_SIZE;
}

static size_t *get_malloc_address_location(void *const mem) {
	return ((size_t *)mem) - 1;
}

// Returns 0 in case of overflow of nmemb * size.
static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
	const uint64_t total_size = nmemb * size;
	if (nmemb == 0) return 1;
	if (size > VPX_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
	if (total_size != (size_t)total_size) return 0;

	return 1;
}

/*returns an addr aligned to the byte boundary specified by align*/
#define align_addr(addr, align) \
  (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1))

static void set_actual_malloc_address(void *const mem,
	const void *const malloc_addr) {
	size_t *const malloc_addr_location = get_malloc_address_location(mem);
	*malloc_addr_location = (size_t)malloc_addr;
}

void *vpx_memalign(size_t align, size_t size) {
	void *x = NULL, *addr;
	const uint64_t aligned_size = get_aligned_malloc_size(size, align);
	if (!check_size_argument_overflow(1, aligned_size)) return NULL;

	addr = malloc((size_t)aligned_size);
	if (addr) {
		x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
		set_actual_malloc_address(x, addr);
	}
	return x;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char *argv[])
{
	// TODO: Read BGRA image using FreeImage.
	// All these fields must be initialized.
	uint32_t ImageWidth = 0, ImageHeight = 0, SrcPitch = 0;
	unsigned char* Bgra = nullptr;

	// Allocate YUV VPX_IMG_FMT_I420 buffer.
	unsigned char* YUVBuffer = nullptr;
	uint32_t DstPitch[YUV_NB_CHANNELS];
	{
		// @See img_alloc_helper: https://chromium.googlesource.com/webm/libvpx/+/master/vpx/src/vpx_image.c
		const int buf_align = 1;
		const int stride_align = 1;
		const int bps = 12;

		unsigned int xcs = 1;
		unsigned int ycs = 1;

		int align = (1 << xcs) - 1;
		unsigned int w = (ImageWidth + align) & ~align;
		align = (1 << ycs) - 1;
		unsigned int h = (ImageHeight + align) & ~align;
		unsigned int s = w;
		s = (s + stride_align - 1) & ~(stride_align - 1);
		unsigned int stride_in_bytes = s;

		const uint64_t alloc_size = (uint64_t)h * s * bps / 8;

		// YUV memory buffer
		YUVBuffer = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size);

		// Strides
		DstPitch[YUV_Y_IDX] = stride_in_bytes;
		DstPitch[YUV_U_IDX] = DstPitch[YUV_V_IDX] = stride_in_bytes >> xcs;
	}

	// Create Image converter
	ConverterConstructArgs ConverterData;
	ConverterData.Width = ImageWidth;
	ConverterData.Height = ImageHeight;
	ConverterData.SrcPitch = SrcPitch;
	std::memcpy(ConverterData.DstPitch, DstPitch, sizeof(uint32_t) * YUV_NB_CHANNELS);

	// GPU Cuda converter.
	// For CPU: ImageConverter = std::make_unique<CpuBgra8ToYuv420>(ConverterData);
	std::unique_ptr<Bgra8ToYuv420Converter> ImageConverter = std::make_unique<GpuBgra8ToYuv420>(ConverterData);

	// Conversion
	const unsigned int PlaneSize = ImageWidth * ImageHeight;
	unsigned char* Yplane = YUVBuffer;
	unsigned char* Uplane = Yplane + PlaneSize;
	unsigned char* Vplane = Uplane + (PlaneSize >> 2);

	unsigned char* YUVChannels[YUV_NB_CHANNELS] = { Yplane, Uplane, Vplane };

	// Launch conversion.
	ImageConverter->Convert(Bgra, YUVChannels);

	// TODO: Check YUV buffer values. Only the 1/4 is converted which is not good.
	// Could be done manually or write to file to see.

exit(EXIT_SUCCESS);
}

And the reason you cannot complete this application yourself is?

Hello,

The real question should be how are these functions usually tested?
I will not complete the application because:

1- Instead of spending time writing another sample that does not work, I would prefer to make a pure Cuda implementation.

2- Performance is only 6% better on my hardware. Just stick to CPU since encoding is offline could do it.

3- If it is a library bug I would have wasted my time(hoping it is not :)).

Given this list of reasons, what might be potential reasons a random forum participant would complete this application? I can’t think of any.

I never asked any random person to complete it. The initial post is very well described. A simple viable example of the nppiBGRToYUV420_8u_AC4P3R function, that is even not related to my code, will be welcomed. Also, the main reason I posted is that it could help a future person who will encounter the same issue.

There’s no reason that this function should produce only 1/4 of the output data.

I’ve written applications recently that use both RGB->YUV and YUV->RGB transforms in NPP with no issues.

It’s always possible that there is a defect in the library, but anyway it is impossible (for me) to say without a test case.

I don’t see any evidence that only 1/4 of the output is generated:

$ cat t9.cu
#include <nppi.h>
#include <iostream>

const int w = 16;
const int h = 16;
const int B = 0;
const int G = 1;
const int R = 2;
const int A = 3;
int main(){

  Npp8u *pSrc, *hSrc, *pDst[3];
  int nSrcStep, rDstStep[3];
  NppiSize oSizeROI;

  // create test packed BGRA image
  hSrc = new Npp8u[h*w*4];
  for (int i = 0; i < h; i++)
    for (int j = 0; j < w/4; j++){
      // red bar
      hSrc[(i*w+j)*4+B] = 0;
      hSrc[(i*w+j)*4+G] = 0;
      hSrc[(i*w+j)*4+R] = 255;
      hSrc[(i*w+j)*4+A] = 255;
      // green bar
      hSrc[(i*w+j+(w/4))*4+B] = 0;
      hSrc[(i*w+j+(w/4))*4+G] = 255;
      hSrc[(i*w+j+(w/4))*4+R] = 0;
      hSrc[(i*w+j+(w/4))*4+A] = 255;
      // blue bar
      hSrc[(i*w+j+(w/2))*4+B] = 255;
      hSrc[(i*w+j+(w/2))*4+G] = 0;
      hSrc[(i*w+j+(w/2))*4+R] = 0;
      hSrc[(i*w+j+(w/2))*4+A] = 255;
      // white bar
      hSrc[(i*w+j+(3*w/4))*4+B] = 255;
      hSrc[(i*w+j+(3*w/4))*4+G] = 255;
      hSrc[(i*w+j+(3*w/4))*4+R] = 255;
      hSrc[(i*w+j+(3*w/4))*4+A] = 255;}
  cudaMalloc(&pSrc, h*w*4*sizeof(Npp8u));
  cudaMemcpy(pSrc, hSrc, h*w*4*sizeof(Npp8u), cudaMemcpyHostToDevice);
  nSrcStep = w*4*sizeof(Npp8u);
  cudaMalloc(pDst+0, h*w*sizeof(Npp8u));     // Y storage
  cudaMalloc(pDst+1, (h*w*sizeof(Npp8u))/4); // U storage
  cudaMalloc(pDst+2, (h*w*sizeof(Npp8u))/4); // V storage
  cudaMemset(pDst+0, 0,  h*w*sizeof(Npp8u));  
  cudaMemset(pDst+1, 0, (h*w*sizeof(Npp8u))/4); 
  cudaMemset(pDst+2, 0, (h*w*sizeof(Npp8u))/4);
  rDstStep[0] = w*sizeof(Npp8u);
  rDstStep[1] = (w/2)*sizeof(Npp8u);
  rDstStep[2] = (w/2)*sizeof(Npp8u);
  oSizeROI.width = w;
  oSizeROI.height = h;
  NppStatus stat = nppiBGRToYUV420_8u_AC4P3R(pSrc, nSrcStep, pDst, rDstStep, oSizeROI);
  if (stat != NPP_SUCCESS) std::cout << "NPP error: " << (int)stat << std::endl;
  cudaError_t err = cudaDeviceSynchronize();
  if (err != cudaSuccess) std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;
  Npp8u *hY, *hU, *hV;
  hY = new Npp8u[h*w];
  hU = new Npp8u[h*w/4];
  hV = new Npp8u[h*w/4];
  cudaMemcpy(hY, pDst[0], h*w*sizeof(Npp8u),     cudaMemcpyDeviceToHost);
  cudaMemcpy(hU, pDst[1], h*(w/4)*sizeof(Npp8u), cudaMemcpyDeviceToHost);
  cudaMemcpy(hV, pDst[2], h*(w/4)*sizeof(Npp8u), cudaMemcpyDeviceToHost);
  // from https://en.wikipedia.org/wiki/YUV
  std::cout << "Expected values: " << std::endl;
  std::cout << "color  Y   U   V" << std::endl;
  int Yred = 0.299*255;
  int Ured = (-0.147*255)+128;
  int Vred = (0.615*255)+128;
  if (Yred > 255) Yred = 255;
  if (Ured > 255) Ured = 255;
  if (Vred > 255) Vred = 255;
  if (Yred < 0) Yred = 0;
  if (Ured < 0) Ured = 0;
  if (Vred < 0) Vred = 0;
  std::cout << "RED:   " << Yred << " " << Ured << " " << Vred << std::endl;
  int Ygrn = 0.587*255;
  int Ugrn = (-0.289*255)+128;
  int Vgrn = (-0.515*255)+128;
  if (Ygrn > 255) Ygrn = 255;
  if (Ugrn > 255) Ugrn = 255;
  if (Vgrn > 255) Vgrn = 255;
  if (Ygrn < 0) Ygrn = 0;
  if (Ugrn < 0) Ugrn = 0;
  if (Vgrn < 0) Vgrn = 0;
  std::cout << "GREEN: " << Ygrn << " " << Ugrn << " " << Vgrn << std::endl;
  int Yblu = 0.114*255;
  int Ublu = (0.436*255)+128;
  int Vblu = (-0.100*255)+128;
  if (Yblu > 255) Yblu = 255;
  if (Ublu > 255) Ublu = 255;
  if (Vblu > 255) Vblu = 255;
  if (Yblu < 0) Yblu = 0;
  if (Ublu < 0) Ublu = 0;
  if (Vblu < 0) Vblu = 0;
  std::cout << "BLUE:  " << Yblu << " " << Ublu << " " << Vblu << std::endl;
  std::cout << "Y plane:" << std::endl;
  for (int i = 0; i < h; i++){
    for (int j = 0; j < w; j++)
      std::cout << (unsigned)hY[i*w+j] <<  " ";
    std::cout << std::endl;}
  std::cout << "U plane:" << std::endl;
  for (int i = 0; i < h/2; i++){
    for (int j = 0; j < w/2; j++)
      std::cout << (unsigned)hU[i*(w/2)+j] <<  " ";
    std::cout << std::endl;}
  std::cout << "V plane:" << std::endl;
  for (int i = 0; i < h/2; i++){
    for (int j = 0; j < w/2; j++)
      std::cout << (unsigned)hV[i*(w/2)+j] <<  " ";
    std::cout << std::endl;}
}

$ nvcc -o t9 t9.cu -lnppicc -lnppig
$ cuda-memcheck ./t9
========= CUDA-MEMCHECK
Expected values: 
color  Y   U   V
RED:   76 90 255
GREEN: 149 54 0
BLUE:  29 239 102
Y plane:
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
76 76 76 76 149 149 149 149 29 29 29 29 255 255 255 255 
U plane:
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
90 90 54 54 239 239 128 128 
V plane:
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
255 255 0 0 102 102 128 128 
========= ERROR SUMMARY: 0 errors
$

Thank you very much for the example. I made something wrong then.
I will look for my mistake using this snippet tomorrow.

Fixed! I had to make the following changes:

  1. Use cudaMalloc instead of the nppiMalloc_XXX functions.
    The weird thing is that my image has a pitch of 8192 but nppiMalloc_8u_C4 returns a pitch of 8704. So there could be an alignment issue.

  2. Replace the cudaMemcpy2D calls by cudaMemcpy. I don’t get why…

Btw is cudaDeviceSynchronize call really needed?

This doesn’t look right to me:

CUDA_CHECK(cudaMemcpy2D(this->CudaBGRAImage,
			this->CudaBGRAPitch,
			SrcBgra,
			this->SrcPitch,
			this->Width,  // ***********
			this->Height,
			cudaMemcpyHostToDevice));

The argument that I have marked with *********** is supposed to be the width of the transfer (line) in bytes. So if your Width is in BGRA “pixels” then that could explain the 1/4 issue.

You may wish to read the documentation for cudaMemcpy2D.
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g3a58270f6775efe56c65ac47843e7cee

My mistake. Yes, it should be 4 * Width and it makes sense now.

Thanks a lot for your help :)