All CUDA-capable devices busy or unavailable

Hi,

I encountered this error when I executed cv::cuda::ORB::detectAndComputeAsync to do feature point detection.
$ cuda-memcheck ./test_orb
========= CUDA-MEMCHECK
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaMemcpyToSymbol.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:/opt/vision/opencv/lib/libopencv_cudafeatures2d.so.4.5 [0xa8314]
=========
terminate called after throwing an instance of 'cv::Exception'
what(): OpenCV(4.5.0) /home/lzq/disk/opencv-4.5.0/opencv_contrib-4.5.0/modules/cudafeatures2d/src/cuda/orb.cu:170: error: (-217:Gpu API call) all CUDA-capable devices are busy or unavailable in function 'loadUMax'

========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found

If I run the program directly ($ ./test_orb), it produces a result, but I don’t know whether that result is correct.

The environment configuration is as follows:
Jetson TX2 NX
JetPack 4.5
OpenCV 4.5.0 + opencv_contrib 4.5.0 (built from source with -DWITH_CUDA=ON)
Linux: Ubuntu 20.04

#include <stdio.h>
#include <chrono>
#include <vector>
#include <algorithm>
#include <iostream>
#include <memory>

#include <opencv2/opencv.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <cuda_runtime_api.h>

void detectGrid(cv::Ptr<cv::cuda::ORB> & d_detector, cv::Mat & image,
  std::vector<cv::KeyPoint> & keypoints, const int range, const int cell_x, const int cell_y)
{
  // (cols + cell_x - 1) / cell_x is already a ceiling division in integer
  // arithmetic, so ceil() is not needed here.
  int blockDimx = (image.cols + cell_x - 1) / cell_x;
  int blockDimy = (image.rows + cell_y - 1) / cell_y;

  std::vector<std::vector<cv::KeyPoint>> sub_keypoints(range);
  // for (int i = 0; i < range; i++) {
  //   sub_keypoints[i].reserve(1000);
  // }

  // Create an array of 4 CUDA streams. Constructing the vector with size 4
  // already default-constructs 4 streams, so the extra push_back loop
  // (which ended up producing 8 streams) is not needed.
  std::shared_ptr<std::vector<cv::cuda::Stream>> streamsArray =
    std::make_shared<std::vector<cv::cuda::Stream>>(4);

  #pragma omp parallel for num_threads(4)
  for (int cell_id = 0; cell_id < range; cell_id++) {
    int blockx = cell_id / cell_x;
    int blocky = cell_id % cell_x;
    int startx = blocky * blockDimx;
    int starty = blockx * blockDimy;
    int endx = (blocky + 1) * blockDimx;
    int endy = (blockx + 1) * blockDimy;
    endx = endx <= image.cols ? endx : image.cols;
    endy = endy <= image.rows ? endy : image.rows;

    cv::Range row_range(starty, endy);
    cv::Range col_range(startx, endx);
    cv::Mat sub_image = image(row_range, col_range);
    cv::cuda::GpuMat d_sub_image;
    d_sub_image.upload(sub_image);
    std::cout << sub_image.cols << " " << sub_image.rows << std::endl;
    cv::cuda::GpuMat d_keypoints;
    d_detector->detectAndCompute(d_sub_image, cv::cuda::GpuMat(),
      sub_keypoints[cell_id], cv::noArray(), false);
    // d_detector->detectAndComputeAsync(d_sub_image, cv::cuda::GpuMat(),
    //   d_keypoints, cv::noArray(), false, (*streamsArray)[cell_id%4]);
    // d_detector->convert(d_keypoints, sub_keypoints[cell_id]);
    std::cout << "cell_id: " << cell_id <<
      " | sub_keypoints size: " << sub_keypoints[cell_id].size() << std::endl;
  }

  for (int i = 0; i < range; i++) {
    keypoints.insert(
      keypoints.end(), sub_keypoints[i].begin(), sub_keypoints[i].end());
  }
}

int main ()
{
  int nums = 1;
  // CPU ORB 
  cv::Ptr<cv::ORB> detector = cv::ORB::create(
    1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
  // CUDA ORB
  cv::Ptr<cv::cuda::ORB> d_detector = cv::cuda::ORB::create(
    1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
  cv::cuda::Stream stream;

  std::cout << "------------------" << std::endl;
  // upload image to GPU
  std::vector<cv::Mat> image(nums);
  std::vector<cv::cuda::GpuMat> d_image(nums);
  for (int i = 0; i < nums; i++) {
    // The second argument of cv::imread is an ImreadModes flag, not a pixel
    // type; use cv::IMREAD_GRAYSCALE to load a single-channel image.
    image[i] = cv::imread("/home/lzq/code/test_orb_demo/images/" +
      std::to_string(i) + ".png", cv::IMREAD_GRAYSCALE);
    d_image[i].upload(image[i]);
  }

  std::vector<cv::KeyPoint> keypoints_GPU2;
  int block_num = 1;
  int cell_x = 1;
  int cell_y = 1;
  // only detect the first image
  detectGrid(d_detector, image[0], keypoints_GPU2, block_num, cell_x, cell_y);

  // Clone so that drawing keypoints does not modify the original image.
  cv::Mat image_gpu_grid = image[0].clone();
  cv::Scalar colorCircle1(0, 0, 255); // (B, G, R)
  for (size_t i = 0; i < keypoints_GPU2.size(); i++) {
    cv::circle(image_gpu_grid, cv::Point(keypoints_GPU2[i].pt.x, keypoints_GPU2[i].pt.y), 0, colorCircle1, 4);
  }
  cv::imwrite("/home/lzq/code/test_orb_demo/image_gpu_grid.png", image_gpu_grid);
  return 0;
}

I installed OpenCV in the following way (refer to lines 88 to 105):

sudo apt-get purge -y '*opencv*' || echo "previous OpenCV installation not found" && \
mkdir opencv && \
cd opencv
tar -xzvf OpenCV-4.5.0-aarch64.tar.gz && \
sudo dpkg -i --force-depends *.deb && \
sudo apt-get update && \
sudo apt-get install -y -f --no-install-recommends && \
sudo dpkg -i *.deb && \
sudo rm -rf /var/lib/apt/lists/* && \
sudo apt-get clean && \
cd ../ && \
sudo rm -rf opencv && \
sudo cp -r /usr/include/opencv4 /usr/local/include/opencv4 && \
sudo cp -r /usr/lib/python3.6/dist-packages/cv2 /usr/local/lib/python3.6/dist-packages/cv2 

Thanks!

I encountered the same error on a Nano. When I run the program in a non-Docker environment, the error appears, but when I run it inside Docker the error does not occur and the program runs normally.

$cd ~/code/NVIDIA_CUDA-10.1_Samples/0_Simple/simplePrintf
$make
$cuda-memcheck ./simplePrintf

Output:
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x4a834]
=========
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x332ec]
=========
========= ERROR SUMMARY: 2 errors

$docker start dusty_nv
$docker exec -it dusty_nv /bin/bash
$cd /home/lzq/code/NVIDIA_CUDA-10.1_Samples/0_Simple/simplePrintf
$make clean
$make
$cuda-memcheck ./simplePrintf

========= CUDA-MEMCHECK
GPU Device 0: "NVIDIA Tegra X1" with compute capability 5.3

Device 0: "NVIDIA Tegra X1" with Compute 5.3 capability
printf() is called. Output:
... ...
========= ERROR SUMMARY: 0 errors 

The same program behaves differently on the two platforms.

On the Nano, although "$ cuda-memcheck ./simplePrintf" reports this error, running the program directly with "$ ./simplePrintf" produces the correct result.

The behavior on the TX2 NX is the opposite of the Nano: neither "$ cuda-memcheck ./simplePrintf" nor "$ ./simplePrintf" produces results.

I started seeing the same issue just a few days ago. It can easily be reproduced on a fresh JetPack 4.6 install on a Nano 2GB by using cuda-memcheck to run a very simple CUDA program that just calls cudaMalloc.

nvcc --gpu-architecture=sm_53 simple.cu -o simple -I.
cuda-memcheck ./simple

As others have commented, tests used to validate the data read/written to the allocated device memory are passing.
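
For reference, a minimal simple.cu along these lines might look like the sketch below. This is a reconstruction under the assumption that the program only allocates device memory and round-trips a small buffer; it is not the exact file used.

#include <cstdio>
#include <cuda_runtime.h>

// Allocate device memory, round-trip a small buffer, and validate the
// data read back on the host. Every runtime call's return code is checked.
int main()
{
    const int n = 256;
    int h_in[n], h_out[n];
    for (int i = 0; i < n; ++i) h_in[i] = i;

    int *d_buf = nullptr;
    cudaError_t err = cudaMalloc(&d_buf, n * sizeof(int));
    if (err != cudaSuccess) {
        std::printf("cudaMalloc: %s\n", cudaGetErrorString(err));
        return 1;
    }

    err = cudaMemcpy(d_buf, h_in, n * sizeof(int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::printf("cudaMemcpy H2D: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaMemcpy(h_out, d_buf, n * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        std::printf("cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaFree(d_buf);

    // Validate the data read back from device memory.
    for (int i = 0; i < n; ++i) {
        if (h_out[i] != h_in[i]) {
            std::printf("mismatch at index %d\n", i);
            return 1;
        }
    }
    std::printf("PASS\n");
    return 0;
}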

Hi almstrand,
Thank you for your suggestion, but I checked the compilation command and the corresponding option was already there; it still didn’t work.

nvcc -ccbin g++ -I../../common/inc -m64 -gencode arch=compute_62,code=sm_62 -o simplePrintf.o -c simplePrintf.cu
cuda-memcheck ./simplePrintf
Output:
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x4a834]
=========
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x332ec]
=========
========= ERROR SUMMARY: 2 errors

Perhaps there is something specific to using cuda-memcheck with recent JetPack versions. You might get better help by posting on one of the Jetson forums for your device. You could also file a bug.

Hi, thank you for your advice. I want to confirm: if I run the program without cuda-memcheck (i.e. just "./simplePrintf"), it does produce results. Can I trust the correctness of those results?

I won’t be able to say anything about the correctness of results based on what is in this thread. Even cuda-memcheck does not guarantee correctness of results; it simply screens for certain types of errors.
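
One way to gain some confidence outside of cuda-memcheck is to check the return code of every CUDA runtime call inside the application itself. The following is a generic sketch, not code from this thread; CHECK_CUDA is a hypothetical helper macro.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message if any CUDA runtime call fails,
// independently of whether cuda-memcheck is attached.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            std::fprintf(stderr, "%s failed at %s:%d: %s\n",          \
                         #call, __FILE__, __LINE__,                   \
                         cudaGetErrorString(err_));                   \
            std::exit(EXIT_FAILURE);                                  \
        }                                                             \
    } while (0)

int main()
{
    void *d_ptr = nullptr;
    CHECK_CUDA(cudaMalloc(&d_ptr, 1 << 20)); // would report error 46 here if it occurred
    CHECK_CUDA(cudaDeviceSynchronize());     // surfaces errors from earlier asynchronous work
    CHECK_CUDA(cudaFree(d_ptr));
    std::puts("all CUDA API calls returned cudaSuccess");
    return 0;
}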

Changing the permissions on /dev/nvhost-dbg-gpu appears to resolve the issue. This may help unblock developers who come across this thread, but NVIDIA / @Robert_Crovella, could this be addressed in a future JetPack version?

sudo chmod a+rw /dev/nvhost-dbg-gpu

Hi almstrand,
You are right, it works for me. Running cuda-memcheck with sudo also works. Thank you very much!

sudo /usr/local/cuda/bin/cuda-memcheck ./simplePrintf
