CUDA Image Processing on TX2, converting NV12 to RGB [TX2, OpenCV]

Hello Everyone,

I have working code for CPU implementation of this conversion, The GPU implementation builds but fails at runtime.

CPU Code:

cv::Mat NV12toRGB (uchar *pBuffer, int width, int height) {
    cv::Mat result(height, width, CV_8CU3);
    uchar y, cb, cr;

    long ySize = width*height;
    long uSize;
    uSize = ySize >> 2;

    long bufferSize = ySize + uSize*2;

    uchar *output = result.data;
    uchar *pY = pBuffer;
    uchar *pUV = pY + ySize;

    uchar r,g,b;
    for (int i = 0; i < uSize; ++i)
    {
        for (int j = 0; j < 4; ++j)
        {
            y = pY[i*4 + j];

            cb = uchar(pUV[(i*2)]);
            cr = uchar(pUV[(i*2) + 1]);

            b = saturate_cast<uchar>(y+2.032*(cb-128));
            g = saturate_cast<uchar>(y-0.395*(cb-128)-0.581*(cr-128));
            r = saturate_cast<uchar>(y+1.140*(cr-128));

            *output++=b;
            *output++=g;
            *output++=r;
        }
    }
    return result;
}

For the GPU code, I wanted to abstract out the cv::Mat to the CPU main. So the GPU code is purely moving buffers around.

Here it is:

cudaError_t cudaNV12toRGB(char* input, char* output, size_t width, size_t height)
{
    if( !input || !output )
        return cudaErrorInvalidDevicePointer;

    size_t ySize = width*height;
    char* pUV = input + ySize;

    const dim3 blockDim(8, 8, 1);
    const dim3 gridDim(iDivUp(width, blockDim.x), iDivUp(height, blockDim.y), 1);

    NV12toRGB<<<gridDim, blockDim>>>(input, output, pUV,  width, height);

    return CUDA(cudaGetLastError());
}

Device Code:

__global__
void NV12toRGB(char* input,
               char* output,
               char* uvPtr,
               size_t width,
               size_t height)
{
    size_t x, y, pixel, pxGroup, uvIndex, outIndex;

    x = (blockIdx.x * blockDim.x) + threadIdx.x;
    y = (blockIdx.y * blockDim.y) + threadIdx.y;

    if (x >= width)
        return;

    if (y >= height)
        return;

    //pixel calculations
    float r, g, b;
    char cb, cr, yp;

    pixel = y * width + x;
    pxGroup = pixel/4;
    uvIndex = pxGroup*2;
    outIndex = pixel*3;

    yp = input[pixel];
    cb = uvPtr[uvIndex];
    cr = uvPtr[uvIndex+1];

    //calc rgb values
    b = (yp+2.032*(cb-128)) + 0.5;
    g = (yp-0.395*(cb-128)-0.581*(cr-128)) + 0.5;
    r = (yp+1.140*(cr-128)) + 0.5;

    //Make the pixel
    output[outIndex]   = (b < 0 ? 0 : (b > 0xff ? 0xff : b)); //saturate
    output[outIndex+1] = (g < 0 ? 0 : (g > 0xff ? 0xff : g));
    output[outIndex+2] = (r < 0 ? 0 : (r > 0xff ? 0xff : r));
}

Comments and Questions are appreciated!!

You might get better help if you define what “fails at runtime” means. Do you get a reported error? If so, what? Or is it simply that the results are not correct? If so, in what way?

I usually also suggest that people asking questions like this provide a complete test case. Don’t make your test case dependent on OpenCV. You may simply have errors in your memory allocation.

I also recommend proper CUDA error checking, and running your code with cuda-memcheck, any time you are having trouble with a CUDA code.

Here’s a test case roughly hacked together around what you have shown. It runs without any runtime error for me:

$ cat t283.cu
#include <stdio.h>

__global__
void NV12toRGB(char* input,
               char* output,
               char* uvPtr,
               size_t width,
               size_t height)
{
    size_t x, y, pixel, pxGroup, uvIndex, outIndex;

    x = (blockIdx.x * blockDim.x) + threadIdx.x;
    y = (blockIdx.y * blockDim.y) + threadIdx.y;

    if (x >= width)
        return;

    if (y >= height)
        return;

    //pixel calculations
    float r, g, b;
    char cb, cr, yp;

    pixel = y * width + x;
    pxGroup = pixel/4;
    uvIndex = pxGroup*2;
    outIndex = pixel*3;

    yp = input[pixel];
    cb = uvPtr[uvIndex];
    cr = uvPtr[uvIndex+1];

    //calc rgb values
    b = (yp+2.032*(cb-128)) + 0.5;
    g = (yp-0.395*(cb-128)-0.581*(cr-128)) + 0.5;
    r = (yp+1.140*(cr-128)) + 0.5;

    //Make the pixel
    output[outIndex]   = (b < 0 ? 0 : (b > 0xff ? 0xff : b)); //saturate
    output[outIndex+1] = (g < 0 ? 0 : (g > 0xff ? 0xff : g));
    output[outIndex+2] = (r < 0 ? 0 : (r > 0xff ? 0xff : r));
}



cudaError_t cudaNV12toRGB(char* input, char* output, size_t width, size_t height)
{
    if( !input || !output )
        return cudaErrorInvalidDevicePointer;

    size_t ySize = width*height;
    char* pUV = input + ySize;

    const dim3 blockDim(8, 8, 1);
    const dim3 gridDim(width/blockDim.x, height/blockDim.y, 1);

    NV12toRGB<<<gridDim, blockDim>>>(input, output, pUV,  width, height);

    return cudaDeviceSynchronize();
}

int main(){

  const int w = 256;
  const int h = 512;
  char *i, *o;
  cudaMalloc(&i, 2*w*h);
  cudaMalloc(&o, 3*w*h);
  cudaNV12toRGB(i, o, w, h);
 }

$ nvcc -o t283 t283.cu
$ cuda-memcheck ./t283
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$