Here’s my working kernel for converting NV12 to RGBAf based on the kernels in jetson-inference:
#define COLOR_COMPONENT_MASK 0x3FF
#define COLOR_COMPONENT_BIT_SIZE 10
__global__ void myKernel(
cudaSurfaceObject_t surfLuma,
cudaSurfaceObject_t surfChroma,
float4* dstImage,
uint32_t width,
uint32_t height)
{
uint32_t yuv101010Pel[2];
uint32_t x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
if(x >= width) return;
if(y >= height) return;
yuv101010Pel[0] = surf2Dread<uint8_t>(surfLuma, x+0, y) << 2;
yuv101010Pel[1] = surf2Dread<uint8_t>(surfLuma, x+1, y) << 2;
int y_chroma = (y >> 1);
if(y & 1)
{
uint32_t chromaCb;
uint32_t chromaCr;
chromaCb = surf2Dread<uint8_t>(surfChroma, x, y_chroma);
chromaCr = surf2Dread<uint8_t>(surfChroma, x+1, y_chroma);
if(y_chroma < ((height >> 1) - 1))
{
chromaCb = (chromaCb + surf2Dread<uint8_t>(surfChroma, x, y_chroma+1) + 1) >> 1;
chromaCr = (chromaCr + surf2Dread<uint8_t>(surfChroma, x+1, y_chroma+1) + 1) >> 1;
}
yuv101010Pel[0] |= (chromaCb << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= (chromaCb << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
yuv101010Pel[0] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x+1, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 0) + 2));
yuv101010Pel[1] |= ((uint32_t)surf2Dread<uint8_t>(surfChroma, x+1, y_chroma) << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
// this steps performs the color conversion
float luma[2], u[2], v[2];
float red[2], green[2], blue[2];
luma[0] = float(yuv101010Pel[0] & COLOR_COMPONENT_MASK);
u[0] = float((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK) - 512.0f;
v[0] = float((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK) - 512.0f;
red[0] = luma[0] + 1.140f * v[0];
green[0] = luma[0] - 0.395f * u[0] - 0.581f * v[0];
blue[0] = luma[0] + 2.032f * u[0];
red[0] = min(max(red[0], 0.0f), 1023.f);
green[0] = min(max(green[0], 0.0f), 1023.f);
blue[0] = min(max(blue[0], 0.0f), 1023.f);
luma[1] = float(yuv101010Pel[1] & COLOR_COMPONENT_MASK);
u[1] = float((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK) - 512.0f;
v[1] = float((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK) - 512.0f;
red[1] = luma[1] + 1.140f * v[1];
green[1] = luma[1] - 0.395f * u[1] - 0.581f * v[1];
blue[1] = luma[1] + 2.032f * u[1];
red[1] = min(max(red[1], 0.0f), 1023.f);
green[1] = min(max(green[1], 0.0f), 1023.f);
blue[1] = min(max(blue[1], 0.0f), 1023.f);
const float s = 1.0f / 1024.0f * 255.0f;
dstImage[y * width + x] = make_float4(red[0] * s, green[0] * s, blue[0] * s, 1.0f);
dstImage[y * width + x + 1] = make_float4(red[1] * s, green[1] * s, blue[1] * s, 1.0f);
}
// cudaNV12ToRGBA
cudaError_t myNV12ToRGBAf(
cudaSurfaceObject_t surfLuma,
cudaSurfaceObject_t surfChroma,
float4* destDev,
uint32_t width,
uint32_t height )
{
if( !destDev )
return cudaErrorInvalidDevicePointer;
if( width == 0 || height == 0 )
return cudaErrorInvalidValue;
const dim3 blockDim(8,8,1);
//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1);
myKernel<<<gridDim, blockDim>>>(surfLuma, surfChroma, destDev, width, height );
return CUDA(cudaGetLastError());
}
Then I can pass in the luma/chroma surface objects from cuEGLStreamConsumer
csResource cudaResource = 0;
CUstream cudaStream = 0;
cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
if(cuResult != CUDA_SUCCESS) break;
// Get the CUDA EGL frame.
CUeglFrame cudaEGLFrame;
cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
if(cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to get the CUDA EGL frame (CUresult %s).",
getCudaErrorString(cuResult));
}
m_isConnected = true;
CUDA_RESOURCE_DESC cudaResourceDesc;
CUsurfObject cudaSurfLuma = 0;
CUsurfObject cudaSurfChroma = 0;
memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;
// Create a surface from the luminance plane
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
cuResult = cuSurfObjectCreate(&cudaSurfLuma, &cudaResourceDesc);
if(cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to create the luma surface object (CUresult %s)",
getCudaErrorString(cuResult));
}
// Create a surface from the chrominance plane
cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[1];
cuResult = cuSurfObjectCreate(&cudaSurfChroma, &cudaResourceDesc);
if(cuResult != CUDA_SUCCESS)
{
ORIGINATE_ERROR("Unable to create the chroma surface object (CUresult %s)",
getCudaErrorString(cuResult));
}
myNV12ToRGBAf(
cudaSurfLuma,
cudaSurfChroma,
frame[1],
shared->width,
shared->height);
Then I pass the cuda memory buffer (frame[1]) into one of the jetson-inference net objects.