Hello Everyone,
I have working code for CPU implementation of this conversion, The GPU implementation builds but fails at runtime.
CPU Code:
cv::Mat NV12toRGB (uchar *pBuffer, int width, int height) {
cv::Mat result(height, width, CV_8CU3);
uchar y, cb, cr;
long ySize = width*height;
long uSize;
uSize = ySize >> 2;
long bufferSize = ySize + uSize*2;
uchar *output = result.data;
uchar *pY = pBuffer;
uchar *pUV = pY + ySize;
uchar r,g,b;
for (int i = 0; i < uSize; ++i)
{
for (int j = 0; j < 4; ++j)
{
y = pY[i*4 + j];
cb = uchar(pUV[(i*2)]);
cr = uchar(pUV[(i*2) + 1]);
b = saturate_cast<uchar>(y+2.032*(cb-128));
g = saturate_cast<uchar>(y-0.395*(cb-128)-0.581*(cr-128));
r = saturate_cast<uchar>(y+1.140*(cr-128));
*output++=b;
*output++=g;
*output++=r;
}
}
return result;
}
For the GPU code, I wanted to abstract out the cv::Mat to the CPU main. So the GPU code is purely moving buffers around.
Here it is:
cudaError_t cudaNV12toRGB(char* input, char* output, size_t width, size_t height)
{
if( !input || !output )
return cudaErrorInvalidDevicePointer;
size_t ySize = width*height;
char* pUV = input + ySize;
const dim3 blockDim(8, 8, 1);
const dim3 gridDim(iDivUp(width, blockDim.x), iDivUp(height, blockDim.y), 1);
NV12toRGB<<<gridDim, blockDim>>>(input, output, pUV, width, height);
return CUDA(cudaGetLastError());
}
Device Code:
__global__
void NV12toRGB(char* input,
char* output,
char* uvPtr,
size_t width,
size_t height)
{
size_t x, y, pixel, pxGroup, uvIndex, outIndex;
x = (blockIdx.x * blockDim.x) + threadIdx.x;
y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x >= width)
return;
if (y >= height)
return;
//pixel calculations
float r, g, b;
char cb, cr, yp;
pixel = y * width + x;
pxGroup = pixel/4;
uvIndex = pxGroup*2;
outIndex = pixel*3;
yp = input[pixel];
cb = uvPtr[uvIndex];
cr = uvPtr[uvIndex+1];
//calc rgb values
b = (yp+2.032*(cb-128)) + 0.5;
g = (yp-0.395*(cb-128)-0.581*(cr-128)) + 0.5;
r = (yp+1.140*(cr-128)) + 0.5;
//Make the pixel
output[outIndex] = (b < 0 ? 0 : (b > 0xff ? 0xff : b)); //saturate
output[outIndex+1] = (g < 0 ? 0 : (g > 0xff ? 0xff : g));
output[outIndex+2] = (r < 0 ? 0 : (r > 0xff ? 0xff : r));
}
Comments and Questions are appreciated!!