Hello,
I am trying to convert a BGRA image to YUV420 using the nppiBGRToYUV420_8u_AC4P3R function.
Unfortunately, when the images are saved to disk (encoded with LibVpx), only a quarter of the result has been converted. The CPU version, on the contrary, works well. Here is the code snippet:
Conversion classes:
// Number of planes in the planar YUV output; used as the size of the
// per-plane pointer and pitch arrays.
#define YUV_NB_CHANNELS 3
// Index of the Y plane in the per-plane arrays.
#define YUV_Y_IDX 0
// Index of the U plane in the per-plane arrays.
#define YUV_U_IDX 1
// Index of the V plane in the per-plane arrays.
#define YUV_V_IDX 2
// Construction parameters shared by the BGRA8 -> YUV420 converters.
struct ConverterConstructArgs
{
// Image width, in pixels.
uint32 Width;
// Image height, in pixel rows.
uint32 Height;
// Pitch of the host BGRA source image; the GPU converter passes it to
// cudaMemcpy2D as the source pitch (i.e. bytes per source row).
uint32 SrcPitch;
// Pitch of each host destination plane, indexed by YUV_Y_IDX / YUV_U_IDX /
// YUV_V_IDX; the GPU converter passes these to cudaMemcpy2D as destination
// pitches.
uint32 DstPitch[YUV_NB_CHANNELS];
};
// Abstract interface for converting one BGRA8 image into three YUV planes.
//
// The constructor snapshots the image geometry (dimensions and pitches) from
// the construction arguments; concrete converters read it through the
// protected members below.
struct Bgra8ToYuv420Converter
{
    // Copies the dimensions, the source pitch and every per-plane destination
    // pitch out of Data.
    Bgra8ToYuv420Converter(const ConverterConstructArgs& Data)
        : Width(Data.Width)
        , Height(Data.Height)
        , SrcPitch(Data.SrcPitch)
    {
        for (int PlaneIdx = 0; PlaneIdx < YUV_NB_CHANNELS; ++PlaneIdx)
        {
            DstPitch[PlaneIdx] = Data.DstPitch[PlaneIdx];
        }
    }

    // Virtual so that concrete converters are destroyed correctly through a
    // base pointer.
    virtual ~Bgra8ToYuv420Converter() {}

    // Converts the BGRA image at SrcBgra into the planes pointed to by
    // DstYuv (indexed by YUV_Y_IDX, YUV_U_IDX, YUV_V_IDX).
    virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[YUV_NB_CHANNELS]) = 0;

protected:
    uint32 Width;                      // image width
    uint32 Height;                     // image height
    uint32 SrcPitch;                   // pitch of the source image
    uint32 DstPitch[YUV_NB_CHANNELS];  // pitch of each destination plane
};
struct GpuBgra8ToYuv420 : Bgra8ToYuv420Converter
{
GpuBgra8ToYuv420(const ConverterConstructArgs& Data)
: Bgra8ToYuv420Converter(Data)
{
// Compute half resolution
this->HalfWidth = Width >> 1;
this->HalfHeight = Height >> 1;
// Allocate bgra memory
this->CudaBGRAImage = nppiMalloc_8u_C4(Width, Height, &this->CudaBGRAPitch);
check(this->CudaBGRAImage);
// Allocate YUVs Y channel memory
this->CudaYUVImage[YUV_Y_IDX] = nppiMalloc_8u_C1(Width, Height, &this->CudaYUVPitch[YUV_Y_IDX]);
check(this->CudaYUVImage[YUV_Y_IDX]);
// Allocate subsampled YUVs U and V channel memory
for (uint8 i = YUV_U_IDX; i <= YUV_V_IDX; ++i)
{
this->CudaYUVImage[i] = nppiMalloc_8u_C1(this->HalfWidth, this->HalfHeight, &this->CudaYUVPitch[i]);
check(this->CudaYUVImage[i]);
}
};
~GpuBgra8ToYuv420()
{
nppiFree(this->CudaBGRAImage);
for (uint8 i = 0u; i < YUV_NB_CHANNELS; ++i)
{
nppiFree(this->CudaYUVImage[i]);
}
}
#define CUDA_CHECK(err) { check(err == cudaSuccess); }
virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[3]) override
{
// Copy host BGRA image to device
CUDA_CHECK(cudaMemcpy2D(this->CudaBGRAImage,
this->CudaBGRAPitch,
SrcBgra,
this->SrcPitch,
this->Width,
this->Height,
cudaMemcpyHostToDevice));
// Perform conversion
{
const NppiSize oSizeROI{ this->Width, this->Height };
const NppStatus status = nppiBGRToYUV420_8u_AC4P3R(this->CudaBGRAImage,
this->CudaBGRAPitch,
this->CudaYUVImage,
this->CudaYUVPitch,
oSizeROI
);
if (status != NPP_SUCCESS)
{
return;
}
}
// Copy back result to host
// Y channel
CUDA_CHECK(cudaMemcpy2D(DstYuv[YUV_Y_IDX],
this->DstPitch[YUV_Y_IDX],
this->CudaYUVImage[YUV_Y_IDX],
this->CudaYUVPitch[YUV_Y_IDX],
this->Width,
this->Height,
cudaMemcpyDeviceToHost));
// U and V channels
for (uint8 i = YUV_U_IDX; i <= YUV_V_IDX; ++i)
{
CUDA_CHECK(cudaMemcpy2D(DstYuv[i],
this->DstPitch[i],
this->CudaYUVImage[i],
this->CudaYUVPitch[i],
this->HalfWidth,
this->HalfHeight,
cudaMemcpyDeviceToHost));
}
}
private:
Npp8u* CudaBGRAImage;
int CudaBGRAPitch;
Npp8u* CudaYUVImage[YUV_NB_CHANNELS];
int CudaYUVPitch[YUV_NB_CHANNELS];
uint32 HalfWidth;
uint32 HalfHeight;
};
// CPU version
// @See: https://github.com/sigint9/shadercap/blob/master/VideoEncoder.cpp
// Fixed-point RGB -> YUV weights. Each macro is a real-valued weight scaled by
// (1 << RGB2YUV_SHIFT) and rounded to an int, so a weighted sum of 8-bit
// channels is brought back to 8-bit range with ">> RGB2YUV_SHIFT".
// Naming: the first letter is the input channel (R, G, B), the second the
// output component (Y, U, V).
// NOTE(review): the 0.299/0.587/0.114 weights with the 219/255 and 224/255
// range factors look like BT.601 limited-range coefficients - confirm.
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
struct CpuBgra8ToYuv420 : Bgra8ToYuv420Converter
{
CpuBgra8ToYuv420(const ConverterConstructArgs& Data)
: Bgra8ToYuv420Converter(Data)
{
}
virtual void Convert(unsigned char* SrcBgra, unsigned char* DstYuv[3]) override
{
unsigned int i;
// Y pass.
for (i = 0; i < this->Width * this->Height; ++i)
{
unsigned int r = SrcBgra[4 * i + 2];
unsigned int g = SrcBgra[4 * i + 1];
unsigned int b = SrcBgra[4 * i + 0];
unsigned int y = ((RY*r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
unsigned int u = ((RU*r + GU * g + BU * b) >> RGB2YUV_SHIFT) + 128;
unsigned int v = ((RV*r + GV * g + BV * b) >> RGB2YUV_SHIFT) + 128;
SrcBgra[4 * i + 2] = y;
SrcBgra[4 * i + 1] = u;
SrcBgra[4 * i + 0] = v;
DstYuv[YUV_Y_IDX][i] = y;
}
// UV pass, 4 x 4 downsampling.
i = 0;
for (unsigned int y = 0; y < this->Height; y += 2)
{
for (unsigned int x = 0; x < this->Width; x += 2)
{
unsigned int sumU = 0, sumV = 0;
// Left Root.
//
sumU += SrcBgra[4 * (y * this->Width + x) + 1];
sumV += SrcBgra[4 * (y * this->Width + x) + 0];
// Right Root.
sumU += SrcBgra[4 * (y * this->Width + x + 1) + 1];
sumV += SrcBgra[4 * (y * this->Width + x + 1) + 0];
// Left Top.
sumU += SrcBgra[4 * ((y + 1) * this->Width + x) + 1];
sumV += SrcBgra[4 * ((y + 1) * this->Width + x) + 0];
// Right Top.
sumU += SrcBgra[4 * ((y + 1) * this->Width + x + 1) + 1];
sumV += SrcBgra[4 * ((y + 1) * this->Width + x + 1) + 0];
// Get average.
DstYuv[YUV_U_IDX][i] = sumU / 4;
DstYuv[YUV_V_IDX][i] = sumV / 4;
i += 1;
}
}
}
};
Here is how the converter is called (not very important):
// Number of samples in the full-resolution Y plane (one byte per pixel).
const unsigned int PlaneSize = ImageWidth * ImageHeight;
// Source BGRA pixels.
unsigned char* Bgra = BGRABuffer->img_data;
// The three output planes live back to back in a single buffer: Y first,
// then U, then V. Each chroma plane is a quarter of the Y plane size.
unsigned char* Yplane = YUVBuffer->img_data;
unsigned char* Uplane = Yplane + PlaneSize;
unsigned char* Vplane = Uplane + (PlaneSize >> 2);
// Per-plane pointers in the order expected by Convert (Y, U, V).
unsigned char* YUV[YUV_NB_CHANNELS] = {Yplane, Uplane, Vplane};
ImageConverter->Convert(Bgra, YUV);// Launch conversion.
Images.
Source
https://imgshare.io/image/kTH4e
GPU converted (the resolution does not match because this is a screenshot of a video).
https://imgshare.io/image/gpu.kTe3y
You can clearly see that only a quarter of the image has been converted.
ConverterConstructArgs used.
Width = 2048
Height = 2048
SrcPitch = 8192
DstPitch[YUV_NB_CHANNELS] = {2048, 1024, 1024}
Tests made
- Upload the BGRA image to the GPU, then read it back to the CPU and call the CPU converter → Success. This means it is not a source transfer issue.
- Call the CPU converter first, then upload the YUV result to the GPU and read it back to the CPU → Success. This means it is not a result transfer issue.
This would mean that the following block causes the problem:
// Perform conversion
{
    // NppiSize holds int members; convert explicitly instead of relying on a
    // narrowing brace-initialisation from the unsigned Width / Height.
    const NppiSize oSizeROI{ static_cast<int>(this->Width), static_cast<int>(this->Height) };
    // Converts the packed device BGRA image into the three device planes
    // (Y, U, V), each with its own pitch.
    const NppStatus status = nppiBGRToYUV420_8u_AC4P3R(this->CudaBGRAImage,
                                                       this->CudaBGRAPitch,
                                                       this->CudaYUVImage,
                                                       this->CudaYUVPitch,
                                                       oSizeROI);
    if (status != NPP_SUCCESS)
    {
        // Anything other than NPP_SUCCESS aborts the conversion here.
        return;
    }
}
Is it an NPP issue, or am I missing something?
By the way, the GPU version is 6% faster, but this needs to be confirmed once the bug is fixed ;).
Thank you!