Cuda histogram sample consumes too much memory

Hello.

When I run the CUDA histogram sample of the Argus library at 4K resolution, the Argus daemon service consumes too much memory - about 1 GB - at the time of submitting the first request. The problem is worse for me because I need 3 parallel CUDA consumers for three different sensors, all running the same code. So for every thread, the Argus daemon service consumes about 1 GB of memory at the time of the first request, and then I get a segmentation fault due to lack of memory. I don't know whether this is normal behavior for the Argus daemon and, if so, what I have to do to fix it. Here is the code that every thread executes:

Initialization:

bool CudaConsumerThread::threadInitialize()
{
    CUresult cuResult;
    threadId = pthread_self();
    // Global variables
    g_cudaContext = 0;
    if(!initCUDA(&g_cudaContext)) {
        ACQUISITION_ERROR("Failed to initialize CUDA");
        return false;
    }
    ACQUISITION_PRINT("Creating stream settings");
    this->streamSettings = UniqueObj<OutputStreamSettings>(m_iCaptureSession->createOutputStreamSettings());
    this->iStreamSettings = interface_cast<IOutputStreamSettings>(streamSettings);
    if (this->iStreamSettings)
    {
        this->iStreamSettings->setPixelFormat(PIXEL_FMT_YCbCr_420_888);
        this->iStreamSettings->setResolution(m_streamSize);
    }

    outputStream = UniqueObj<OutputStream>
    (m_iCaptureSession->createOutputStream(streamSettings.get()));
    iStream = interface_cast<IStream>(outputStream);
    ACQUISITION_PRINT("Connecting CUDA to OutputStream as an EGLStream consumer");
    cuResult = cuEGLStreamConsumerConnect(&cudaConnection, iStream->getEGLStream());
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_ERROR("Unable to connect CUDA as a consumer from EGLStream (CUresult "
             + std::string(getCudaErrorString(cuResult)) + ").");
             return false;
    }
    ACQUISITION_PRINT("Create capture request");
    this->request = UniqueObj<Request>(m_iCaptureSession->createRequest());
    this->iRequest = interface_cast<IRequest>(request);
    if (!iRequest)
        ACQUISITION_ERROR("Failed to create Request");
    this->iRequest->enableOutputStream(outputStream.get());

    return true;
}

Submit request:

bool CudaConsumerThread::acquireFrame()
{
    /* Submit request */
    struct timespec tstart={0,0}, tend={0,0};
    double time;
    clock_gettime(CLOCK_REALTIME, &tstart);
    Argus::Status status;
    const uint64_t ONE_SECOND = 1000000000;
    uint32_t result = this->m_iCaptureSession->capture(request.get(), ONE_SECOND, &status);
    if (result == 0)
    {
        ORIGINATE_ERROR("Failed to submit capture request (status %x)", status);
        return false;
    }
    clock_gettime(CLOCK_REALTIME, &tend);
    time = ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) -
       ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec);

    ACQUISITION_TIME("Capture time", time);

    /* Acquire frame */
    CUresult cuResult;
    CUgraphicsResource cudaResource = 0;
    CUstream cudaStream = 0;
    cuResult = cuEGLStreamConsumerAcquireFrame(&cudaConnection, &cudaResource, &cudaStream, -1);
    clock_gettime(CLOCK_REALTIME, &tstart);
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_ERROR("Unable to acquire an image frame from the EGLStream with CUDA as a consumer (CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
        return false;
    }       
    clock_gettime(CLOCK_REALTIME, &tend);
    time = ((double)tend.tv_sec + 1.0e-9*tend.tv_nsec) -
       ((double)tstart.tv_sec + 1.0e-9*tstart.tv_nsec);

    ACQUISITION_TIME("Acquisition time", time);
    
    /* Move frame to GPU */
    clock_gettime(CLOCK_REALTIME, &tstart);
    CUeglFrame cudaEGLFrame;
    cuResult = cuGraphicsResourceGetMappedEglFrame(&cudaEGLFrame, cudaResource, 0, 0);
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_ERROR("Unable to get the CUDA EGL frame (CUresult "
                + std::string(getCudaErrorString(cuResult)) + ").");
        return false;
    }       
    
    // Print the information contained in the CUDA EGL frame structure.
    PROPAGATE_ERROR(printCUDAEGLFrame(cudaEGLFrame));
    
    if ((cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_PLANAR) &&
        (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR) &&
        (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_PLANAR) &&
        (cudaEGLFrame.eglColorFormat != CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR))
    {   
        ORIGINATE_ERROR("Only YUV color formats are supported");
    }   
    if (cudaEGLFrame.cuFormat != CU_AD_FORMAT_UNSIGNED_INT8)
        ORIGINATE_ERROR("Only 8-bit unsigned int formats are supported");
        
    CUDA_RESOURCE_DESC cudaResourceDesc;
    memset(&cudaResourceDesc, 0, sizeof(cudaResourceDesc));
    cudaResourceDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    cudaResourceDesc.res.array.hArray = cudaEGLFrame.frame.pArray[0];
    CUsurfObject cudaSurfObj = 0;
    cuResult = cuSurfObjectCreate(&cudaSurfObj, &cudaResourceDesc);
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_PRINT("Unable to create the surface object \
                (CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
        return false;
    }

    cuResult = cuSurfObjectDestroy(cudaSurfObj);
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_PRINT("Unable to destroy the surface object \
                (CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
    }

    cuResult = cuEGLStreamConsumerReleaseFrame(&cudaConnection, cudaResource, &cudaStream);
    if (cuResult != CUDA_SUCCESS)
    {
        ACQUISITION_PRINT("Unable to release the last frame acquired from the EGLStream  \
                (CUresult " + std::string(getCudaErrorString(cuResult)) + ").");
        return false;
    }

    return true;
}

Any help would be appreciated. Thank you.

Hi,

We are trying to reproduce this issue on our environment.

May I know the difference between your implementation and the cudaHistogram sample?
Could we reproduce this issue directly with the official sample?

Thanks.

Hi AastaLL,

The first part of my post concerns the official cudaHistogram sample itself:
When I run the CUDA histogram sample of the Argus library at 4K resolution, the Argus daemon service consumes too much memory - about 1 GB - at the time of submitting the first request.

Do you observe the same behavior?

Hi,

We are checking this internally.
Will update information with you later.

Thanks.

Update:

I tried the sample on a different camera (IMX214; the previous one was the IMX377). The memory problem remains, so we can assume that this is not a driver issue.

Hi,

We tried this with an IMX274 4K input and the memory taken is 579 MB, far from 1 GB.
There is some difference between our setups. Could you also test the unmodified cudaHistogram sample in your environment?

More, could you check your application with nvprof and share the data with us?
https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview

Thanks.

Hello and sorry for the delay.

I checked again the memory with the cudaHistogram sample and the memory taken is ~590MB. Still, the memory that the sample takes is too much.

I also checked the cudaHistogram sample with the nvprof tool using the argument --print-gpu-trace, and the results for the first frame are

Executing Argus Sample: argus_cudahistogram
Argus Version: 0.96.2 (multi-process)
Creating output stream
Initializing CUDA
==4332== NVPROF is profiling process 4332, command: ./argus_cudahistogram
==4332== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
Connecting CUDA to OutputStream as an EGLStream consumer

Submitting a capture request
Acquiring an image from the EGLStream
CUeglFrame:
 width: 4096
 height: 2304
 depth: 0
 pitch: 0
 planeCount: 2
 numChannels: 1
 frameType: array
 colorFormat: YUV420 semi-planar
 cuFormat: uint8
Calculating histogram with 64 bins...
Finished after 2.443386 ms.
Result 0:
 0:   197235   192853   199573   203184   206839   215397   225270   228707
 8:   237797   256634   275134   291443   300190   305268   318036   334733
16:   321257   285956   262867   240898   202215   126716    68228    39064
24:    26959    23373    19118    17701    19318    16849    13873    13229
32:    14123    15619    15845    16957    19431    19697    19384    22744
40:    28023    34379    39993    42523    45603    52921    64098    83242
48:   110477   133518   152565   169731   176417   183695   198226   205360
56:   222309   238742   240651   244449   251456   247803   229733   211876
==4332== Profiling application: ./argus_cudahistogram
==4332== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
37.2977s  2.3288ms            (16 16 1)        (32 4 1)        13      256B        0B         -           -           -           -  NVIDIA Tegra X1         1         7  void histogram_smem_atomics<int=1024, int=64>(unsigned __int64, unsigned int, unsigned int, unsigned int*) [121]
37.3000s  13.073us              (2 1 1)       (128 1 1)        28        0B        0B         -           -           -           -  NVIDIA Tegra X1         1         7  void histogram_smem_accum<int=1024, int=64>(unsigned int const *, int, unsigned int*) [126]
37.3001s  1.3030us                    -               -         -         -         -      256B  187.37MB/s      Device    Pageable  NVIDIA Tegra X1         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

And this is the output of the nvprof with no arguments

==4465== Profiling application: ./argus_cudahistogram
==4465== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.44%  2.5820ms         1  2.5820ms  2.5820ms  2.5820ms  void histogram_smem_atomics<int=1024, int=64>(unsigned __int64, unsigned int, unsigned int, unsigned int*)
                    0.50%  12.917us         1  12.917us  12.917us  12.917us  void histogram_smem_accum<int=1024, int=64>(unsigned int const *, int, unsigned int*)
                    0.06%  1.6140us         1  1.6140us  1.6140us  1.6140us  [CUDA memcpy DtoH]
      API calls:   59.70%  249.90ms         1  249.90ms  249.90ms  249.90ms  cuEGLStreamConsumerAcquireFrame
                   39.09%  163.64ms         1  163.64ms  163.64ms  163.64ms  cuCtxCreate
                    0.61%  2.5528ms         1  2.5528ms  2.5528ms  2.5528ms  cudaEventSynchronize
                    0.36%  1.4899ms         2  744.93us  337.60us  1.1523ms  cudaMalloc
                    0.06%  249.06us         2  124.53us  105.52us  143.54us  cudaFree
                    0.04%  171.40us         2  85.701us  15.312us  156.09us  cudaEventRecord
                    0.04%  170.31us         1  170.31us  170.31us  170.31us  cuEGLStreamConsumerConnectWithFlags
                    0.03%  111.40us         2  55.701us  38.280us  73.123us  cudaLaunch
                    0.02%  96.769us         1  96.769us  96.769us  96.769us  cudaMemcpy
                    0.02%  89.061us        94     947ns     416ns  30.104us  cuDeviceGetAttribute
                    0.01%  42.551us         1  42.551us  42.551us  42.551us  cuSurfObjectCreate
                    0.00%  15.000us         1  15.000us  15.000us  15.000us  cuSurfObjectDestroy
                    0.00%  12.760us         2  6.3800us     885ns  11.875us  cudaConfigureCall
                    0.00%  11.563us         2  5.7810us  3.2300us  8.3330us  cudaEventCreate
                    0.00%  10.313us         1  10.313us  10.313us  10.313us  cuEGLStreamConsumerReleaseFrame
                    0.00%  10.000us         1  10.000us  10.000us  10.000us  cuDeviceTotalMem
                    0.00%  9.9480us         7  1.4210us     364ns  6.4060us  cudaSetupArgument
                    0.00%  9.7910us         2  4.8950us  2.1870us  7.6040us  cudaEventDestroy
                    0.00%  5.8320us         4  1.4580us     885ns  2.9680us  cuDeviceGetCount
                    0.00%  5.5200us         1  5.5200us  5.5200us  5.5200us  cudaEventElapsedTime
                    0.00%  3.3850us         1  3.3850us  3.3850us  3.3850us  cuGraphicsResourceGetMappedEglFrame
                    0.00%  2.8630us         3     954ns     729ns  1.0930us  cuDeviceGet
                    0.00%  2.8120us         1  2.8120us  2.8120us  2.8120us  cuInit
                    0.00%  2.1870us         1  2.1870us  2.1870us  2.1870us  cuDriverGetVersion
                    0.00%  1.4580us         1  1.4580us  1.4580us  1.4580us  cuDeviceGetName

FYI The cudaHistogram consumes

~= 1GB with imx377 in full resolution and JetPack 3.1
~= 590MB with imx214 in full resolution and JetPack 3.3

Hi,

We don’t have imx377 sensor.
Could you help us test it with JetPack3.3?

We would like to narrow down whether the issue comes from the sensor or the package version.
Thanks.

Hi,

I am trying to understand the situation here. When I work without the cuda connection, everything is ok in 4k resolution, no memory consumption. The problem occurs when you connect the argus producer with the cuda consumer like the cudaHistogram sample does, whatever the sensor is. In my mind this can’t be a sensor issue.

However if you believe that any test in imx377 can help I am willing to help.

Hi,

After checking, this is a known problem.

Root cause is there are lots of buffers behind this sample.
We are discussing the possibility to lower the buffer amount.

Will update information with you if any progress.
Thanks.

Hi,

So currently, there is no way to use 3 cameras at 4K resolution with GPU processing on Jetson TX1. How do you propose to continue?

My plan was not only to use 3 cameras, but to go to 6 in the near future. If there is no solution soon I will have to redesign my system. Can you give me a timeline for fixing this bug?

Is there any chance to use Jetson TX2 for the project that I described ?

Hi,

It may help, since the TX2 has twice the memory.
We will check this internally and update information with you.

Thanks.

Hi,

TX1 doesn’t support (3x) 4K camera.
For your use-case, it’s recommended to use Jetson AGX Xavier, which supports (2x) 4Kp60.
https://developer.nvidia.com/embedded/buy/jetson-xavier-devkit

Thanks.