Error running nppiRGBToYUV420_8u_C3P3R on discrete GPU

Good morning,
I have the following code

==============================================================================

// Excerpt: converts one W x H RGB frame to three YUV 4:2:0 planes with NPP.
// The buffers, `h`, `oSizeROI`, `stat` and `cerr` are declared outside this excerpt.
int W=1920;
int H=1200;
// Row stride (in bytes) used for the first destination plane; set equal to W.
int nSrcStep = W;

// Managed allocations: 3*W*H bytes for the RGB source and (3*W*H)/2 bytes shared by the three destination planes.
cudaMallocManaged(&ptr_RGBCUDA, (3 * W * H));
cudaMallocManaged(&ptr_YCbCrCUDA, (3 * W * H) / 2);

// Carve the single destination allocation into three plane pointers:
// plane 0 at offset 0, plane 1 after nSrcStep*h bytes, plane 2 a further nSrcStep*h/4 bytes on.
// NOTE(review): `h` is not declared in this excerpt — presumably equal to H; confirm.
// NOTE(review): the base address is round-tripped through `long`. Where `long` is 32 bits wide
// (e.g. on Windows) a 64-bit address is truncated by these casts — confirm a pointer-sized type is used.
Npp8u* pYCbCrDst[3];
pYCbCrDst[0] = (Npp8u*)ptr_YCbCrCUDA;
pYCbCrDst[1] = (Npp8u*)((long)ptr_YCbCrCUDA + (long)nSrcStep * (long)h);
pYCbCrDst[2] = (Npp8u*)((long)ptr_YCbCrCUDA + (long)nSrcStep * (long)h + (long)(nSrcStep * h / 4));

// Per-plane row strides: full stride for plane 0, half stride for planes 1 and 2.
int steps[3];
steps[0] = nSrcStep;
steps[1] = nSrcStep / 2;
steps[2] = nSrcStep / 2;

// The source stride is 3 * W because the source is passed as a single 3-channel buffer.
stat = nppiRGBToYUV420_8u_C3P3R((Npp8u*)ptr_RGBCUDA, 3 * W, pYCbCrDst, steps, oSizeROI);
if (stat == NPP_SUCCESS)
{
// Copy all three planes out in one transfer. A fault raised while the conversion executes
// can first be reported by this call rather than by the NPP call above.
cerr = cudaMemcpy(ptr_YCbCrHost, ptr_YCbCrCUDA, (3 * W * H) / 2, cudaMemcpyDeviceToHost);
printf(“\nnppiRGBToYUV420_8u_C3P3R result %i %i\n\n”, stat, cerr);
}
else printf(“\nnppiRGBToYUV420_8u_C3P3R error %i\n\n”, stat);

=====================================================================================

I’m testing the code above on two different hardware platforms:

  • Jetson Orin Nano and AGX with Jetpack 5.1.1 (Cuda version 11.4)
  • Laptop with NVIDIA GeForce RTX 3050, Windows 11, driver version 536.99, Cuda version 12.2

If I run the code on the Jetson Orin Nano, everything works correctly. If I run the code on the laptop, nppiRGBToYUV420_8u_C3P3R returns NPP_SUCCESS, but the following cudaMemcpy returns error 700 (cudaErrorIllegalAddress).

I tried to debug the code using compute-sanitizer and the result is that I had a lot of error lines like this

========= Invalid global write of size 1 bytes
========= at 0x910 in void ImageColorConversionKernel_4XX_8u<(NppColorModel)0, (NppPixelFormat)3, (NppColorModel)4, (NppPixelFormat)12>(const unsigned char *, const unsigned char *, const unsigned char *, const unsigned char *, int, int, int, unsigned char *, unsigned char *, unsigned char *, unsigned char *, int, int, int, unsigned int, unsigned int)
========= by thread (11,1,0) in block (1,2,0)
========= Address 0xa3462b is out of bounds

It seems to be something related to memory access, but I cannot understand where the issue is.

Regards

You have not provided a complete code that I can test, and omitted important items.

When I build a complete code like this around what you have shown, I get no errors:

# cat t88.cu
#include <npp.h>
#include <cstdio>

// Converts one 1920x1200 3-channel RGB frame to planar YUV 4:2:0 with
// nppiRGBToYUV420_8u_C3P3R, copies the result into a second buffer and prints
// the NPP and CUDA status codes.
//
// Returns 0 when allocation, conversion and copy all succeed, 1 otherwise.
int main(){

  const int W = 1920;
  const int H = 1200;
  const int nSrcStep = W;                                   // row stride of the Y plane, in bytes
  const size_t lumaBytes = (size_t)nSrcStep * (size_t)H;    // size of the Y plane
  const size_t rgbBytes  = (size_t)3 * (size_t)W * (size_t)H;
  const size_t yuvBytes  = rgbBytes / 2;                    // Y + U/4 + V/4 for 4:2:0

  int exitCode = 1;
  Npp8u *ptr_RGBCUDA = nullptr, *ptr_YCbCrCUDA = nullptr, *ptr_YCbCrHost = nullptr;

  // Stop at the first failed allocation so the reported error is the original one.
  cudaError_t cerr = cudaMallocManaged(&ptr_RGBCUDA, rgbBytes);
  if (cerr == cudaSuccess) cerr = cudaMallocManaged(&ptr_YCbCrCUDA, yuvBytes);
  if (cerr == cudaSuccess) cerr = cudaMallocManaged(&ptr_YCbCrHost, yuvBytes);

  if (cerr != cudaSuccess)
  {
    fprintf(stderr, "\ncudaMallocManaged error %i (%s)\n\n", cerr, cudaGetErrorString(cerr));
  }
  else
  {
    // Plane layout inside the single destination buffer: Y, then U, then V.
    // Offsets are size_t and are applied with pointer arithmetic, so the 64-bit
    // base address is never squeezed through a narrower integer type.
    Npp8u* pYCbCrDst[3];
    pYCbCrDst[0] = ptr_YCbCrCUDA;
    pYCbCrDst[1] = pYCbCrDst[0] + lumaBytes;
    pYCbCrDst[2] = pYCbCrDst[1] + lumaBytes / 4;

    // Chroma planes are half as wide as the luma plane.
    int steps[3] = {nSrcStep, nSrcStep / 2, nSrcStep / 2};

    NppiSize oSizeROI = {W, H};
    NppStatus stat = nppiRGBToYUV420_8u_C3P3R(ptr_RGBCUDA, 3 * W, pYCbCrDst, steps, oSizeROI);
    if (stat == NPP_SUCCESS)
    {
      // The blocking copy also reports any fault raised while the conversion executed.
      cerr = cudaMemcpy(ptr_YCbCrHost, ptr_YCbCrCUDA, yuvBytes, cudaMemcpyDeviceToHost);
      printf("\nnppiRGBToYUV420_8u_C3P3R result %i %i\n\n", stat, cerr);
      if (cerr == cudaSuccess) exitCode = 0;
    }
    else printf("\nnppiRGBToYUV420_8u_C3P3R error %i\n\n", stat);
  }

  // cudaFree accepts null pointers, so this is safe after a partial allocation.
  // Return codes are ignored here: the process is exiting, and after a device
  // fault these calls fail as well.
  cudaFree(ptr_YCbCrHost);
  cudaFree(ptr_YCbCrCUDA);
  cudaFree(ptr_RGBCUDA);
  return exitCode;
}
# nvcc -o t88 t88.cu -lnppicc
# compute-sanitizer ./t88
========= COMPUTE-SANITIZER

nppiRGBToYUV420_8u_C3P3R result 0 0

========= ERROR SUMMARY: 0 errors
#

CUDA 12.2, L40 GPU, Ubuntu 22.04

(note, for posterity I have edited the above code to include the “fix” from below, switching from (long) casting to (size_t) casting.)

Hi @Robert_Crovella ,
sorry for the late reply.

I took exactly the code you provided and used it in a “Cuda 12.1 Runtime Project” under Visual Studio 2022. When I run it, I get exactly the same error:

nppiRGBToYUV420_8u_C3P3R result 0 700

My settings are CUDA 12.1, NVIDIA GeForce RTX 3050 Laptop GPU, Windows 11Pro

Best regards

On Windows, long is a 32-bit type. Try casting to a 64-bit type.

// size_t is 64 bits wide in a 64-bit Windows build (unlike long), so the full address survives the cast.
pYCbCrDst[1] = (Npp8u*)((size_t)ptr_YCbCrCUDA + (size_t)nSrcStep * (size_t)h);
pYCbCrDst[2] = (Npp8u*)((size_t)ptr_YCbCrCUDA + (size_t)nSrcStep * (size_t)h + (size_t)(nSrcStep * h / 4));

A clue is in your compute-sanitizer output:

========= Address 0xa3462b is out of bounds

That is a suspiciously small value for an address/pointer, and if you compare that numerical value to the numerical value of the ptr_YCbCrCUDA pointer after it is allocated, I think you will find an obvious discrepancy.

Thank you @Robert_Crovella,
I finally fixed my issue.

Regards

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.