All CUDA-capable devices busy or unavailable

Zhuoqian · December 11, 2021, 6:37am

Hi,

I encountered this error when I executed cv::cuda::ORB::detectAndComputeAsync to do feature point detection.

$ cuda-memcheck ./test_orb
========= CUDA-MEMCHECK
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaMemcpyToSymbol.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:/opt/vision/opencv/lib/libopencv_cudafeatures2d.so.4.5 [0xa8314]
=========
terminate called after throwing an instance of 'cv::Exception'
what(): OpenCV(4.5.0) /home/lzq/disk/opencv-4.5.0/opencv_contrib-4.5.0/modules/cudafeatures2d/src/cuda/orb.cu:170: error: (-217:Gpu API call) all CUDA-capable devices are busy or unavailable in function 'loadUMax'

========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found

If I execute the program directly ($ ./test_orb), I get a result, but I don’t know whether it is correct.

The environment configuration is as follows:
Jetson TX2 NX
JetPack 4.5
Opencv 4.5.0 opencv_contrib 4.5.0 (Install from source -DWITH_CUDA=ON)
Linux: Ubuntu 20.04

#include <stdio.h>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

#include <opencv2/opencv.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <cuda_runtime_api.h>

/**
 * Detects ORB keypoints cell by cell on a regular grid laid over the image,
 * using the CUDA detector.
 *
 * The image is split into cell_x columns and cell_y rows. Cells are numbered
 * row-major and the first `range` of them are processed: each cell is cut out
 * of the host image, uploaded to the GPU and handed to the detector on its own.
 * The keypoints of all cells are appended to `keypoints`, expressed in the
 * coordinate system of the full image.
 *
 * @param d_detector CUDA ORB detector used for every cell.
 * @param image      Host image to process.
 * @param keypoints  Output; detected keypoints are appended, existing entries are kept.
 * @param range      Number of cells to process (normally cell_x * cell_y).
 * @param cell_x     Number of grid columns.
 * @param cell_y     Number of grid rows.
 */
void detectGrid(cv::Ptr<cv::cuda::ORB> & d_detector, cv::Mat & image,
  std::vector<cv::KeyPoint> & keypoints, const int range, const int cell_x, const int cell_y)
{
  // Nothing to do for an empty image or a degenerate grid; this also protects
  // the divisions below against a zero divisor.
  if (range <= 0 || cell_x <= 0 || cell_y <= 0 || image.empty()) {
    return;
  }

  // Integer ceiling division: the cell size that covers the image with
  // cell_x columns and cell_y rows.
  const int cell_width = (image.cols + cell_x - 1) / cell_x;
  const int cell_height = (image.rows + cell_y - 1) / cell_y;

  std::vector<std::vector<cv::KeyPoint>> sub_keypoints(range);

  // The cells are processed one after another because every call goes through
  // the same detector object.
  // NOTE(review): cv::cuda::ORB appears to keep per-instance GPU work buffers,
  // so concurrent detectAndCompute calls on one instance are not safe. To
  // parallelise, give each thread its own detector — confirm against the
  // OpenCV version in use.
  for (int cell_id = 0; cell_id < range; cell_id++) {
    const int grid_row = cell_id / cell_x;
    const int grid_col = cell_id % cell_x;
    const int startx = grid_col * cell_width;
    const int starty = grid_row * cell_height;

    // With rounded-up cell sizes (or range > cell_x * cell_y) a cell can start
    // outside the image; slicing it would produce an invalid range.
    if (startx >= image.cols || starty >= image.rows) {
      continue;
    }

    // Clamp the last column/row of cells to the image border.
    const int endx = std::min(startx + cell_width, image.cols);
    const int endy = std::min(starty + cell_height, image.rows);

    const cv::Mat sub_image = image(cv::Range(starty, endy), cv::Range(startx, endx));
    cv::cuda::GpuMat d_sub_image;
    d_sub_image.upload(sub_image);
    std::cout << sub_image.cols << " " << sub_image.rows << std::endl;

    d_detector->detectAndCompute(d_sub_image, cv::cuda::GpuMat(),
      sub_keypoints[cell_id], cv::noArray(), false);

    // The detector reports positions relative to the cell; shift them by the
    // cell origin so that they refer to the full image.
    for (cv::KeyPoint & keypoint : sub_keypoints[cell_id]) {
      keypoint.pt.x += static_cast<float>(startx);
      keypoint.pt.y += static_cast<float>(starty);
    }

    std::cout << "cell_id: " << cell_id <<
      " | sub_keypoints size: " << sub_keypoints[cell_id].size() << std::endl;
  }

  // Merge the per-cell results in cell order.
  for (const std::vector<cv::KeyPoint> & cell_keypoints : sub_keypoints) {
    keypoints.insert(keypoints.end(), cell_keypoints.begin(), cell_keypoints.end());
  }
}

int main ()
{
  int nums = 1;
  // CPU ORB 
  cv::Ptr<cv::ORB> detector = cv::ORB::create(
    1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
  // CUDA ORB
  cv::Ptr<cv::cuda::ORB> d_detector = cv::cuda::ORB::create(
    1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
  cv::cuda::Stream stream;

  std::cout << "------------------" << std::endl;
  // upload image to GPU
  std::vector<cv::Mat> image(nums);
  std::vector<cv::cuda::GpuMat> d_image(nums);
  for (int i = 0; i < nums; i++) {
    image[i] = cv::imread("/home/lzq/code/test_orb_demo/images/" +
      std::to_string(i) + ".png", CV_8UC1);
    d_image[i].upload(image[i]);
  }

  std::vector<cv::KeyPoint> keypoints_GPU2;
  int block_num = 1;
  int cell_x = 1;
  int cell_y = 1;
  // only detect the first image
  detectGrid(d_detector, image[0], keypoints_GPU2, block_num, cell_x, cell_y);

  cv::Mat image_gpu_grid = image[0];
  cv::Scalar colorCircle1(0, 0, 255); // (B, G, R)
  for (int i = 0; i < keypoints_GPU2.size(); i++) {
    cv::circle(image_gpu_grid, cv::Point(keypoints_GPU2[i].pt.x, keypoints_GPU2[i].pt.y), 0, colorCircle1, 4);
  }
  cv::imwrite("/home/lzq/code/test_orb_demo/image_gpu_grid.png", image_gpu_grid);
  return 0;
}

I built OpenCV this way; refer to lines 88 to 105 of the following file:

github.com

dusty-nv/jetson-containers/blob/master/Dockerfile.ros.galactic

#
# this dockerfile roughly follows the 'Install ROS From Source' procedures from:
#   https://docs.ros.org/en/galactic/Installation/Ubuntu-Development-Setup.html
#
# Base image to build on; the default can be overridden with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=nvcr.io/nvidia/l4t-base:r32.5.0
FROM ${BASE_IMAGE}

# ROS settings. ROS_ROOT is derived from ROS_DISTRO, so it resolves to /opt/ros/galactic.
# NOTE(review): ROS_PKG is presumably the ROS variant to build; its use is not visible in this excerpt.
ARG ROS_PKG=ros_base
ENV ROS_DISTRO=galactic
ENV ROS_ROOT=/opt/ros/${ROS_DISTRO}
ENV ROS_PYTHON_VERSION=3

# Keep Debian package tooling from prompting during the build, and run the
# following RUN instructions through bash instead of the default shell.
ENV DEBIAN_FRONTEND=noninteractive
ENV SHELL /bin/bash
SHELL ["/bin/bash", "-c"] 

# Working directory for the instructions that follow.
WORKDIR /tmp

# change the locale from POSIX to UTF-8
RUN locale-gen en_US en_US.UTF-8 && update-locale LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8

This file has been truncated. show original

# Replace any installed OpenCV packages with the .deb packages contained in
# OpenCV-4.5.0-aarch64.tar.gz, then copy the headers and the Python bindings
# to /usr/local. Every step is chained with '&&' so that a failure (for
# example a failed 'cd') stops the sequence before dpkg or 'rm -rf' run in the
# wrong directory.
# NOTE(review): nothing shown here places OpenCV-4.5.0-aarch64.tar.gz into the
# newly created ./opencv directory; presumably a download step was omitted — confirm.
sudo apt-get purge -y '*opencv*' || echo "previous OpenCV installation not found" && \
mkdir opencv && \
cd opencv && \
tar -xzvf OpenCV-4.5.0-aarch64.tar.gz && \
sudo dpkg -i --force-depends *.deb && \
sudo apt-get update && \
sudo apt-get install -y -f --no-install-recommends && \
sudo dpkg -i *.deb && \
sudo rm -rf /var/lib/apt/lists/* && \
sudo apt-get clean && \
cd ../ && \
sudo rm -rf opencv && \
sudo cp -r /usr/include/opencv4 /usr/local/include/opencv4 && \
sudo cp -r /usr/lib/python3.6/dist-packages/cv2 /usr/local/lib/python3.6/dist-packages/cv2

Thanks!

Zhuoqian · December 11, 2021, 6:40am

I encountered the same error on a Nano. Running the program outside Docker produces the error, but running it inside a Docker container does not, and the program works normally.

$cd ~/code/NVIDIA_CUDA-10.1_Samples/0_Simple/simplePrintf
$make
$cuda-memcheck ./simplePrintf

Output:
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x4a834]
=========
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x332ec]
=========
========= ERROR SUMMARY: 2 errors

$docker start dusty_nv
$docker exec -it dusty_nv /bin/bash
$cd /home/lzq/code/NVIDIA_CUDA-10.1_Samples/0_Simple/simplePrintf
$make clean
$make
$cuda-memcheck ./simplePrintf

========= CUDA-MEMCHECK
GPU Device 0: "NVIDIA Tegra X1" with compute capability 5.3

Device 0: "NVIDIA Tegra X1" with Compute 5.3 capability
printf() is called. Output:
... ...
========= ERROR SUMMARY: 0 errors

The same program behaves differently on the two platforms.

On the Nano, running “$cuda-memcheck ./simplePrintf” reports this error, but executing the program directly with “$./simplePrintf” produces the correct result.

The TX2 NX behaves differently from the Nano: neither “$cuda-memcheck ./simplePrintf” nor “$./simplePrintf” produces a result.

almstrand · December 13, 2021, 6:27am

I started seeing the same issue just a few days back. The issue can easily be reproduced in a new JetPack 4.6 install on a Nano 2Gb, and using cuda-memcheck to run a very simple C program that just calls cudaMalloc.

nvcc --gpu-architecture=sm_53 simple.cu -o simple -I.
cuda-memcheck ./simple

As others have commented, tests used to validate the data read/written to the allocated device memory are passing.

Zhuoqian · December 13, 2021, 9:03am

Hi almstrand,
Thank you for your suggestion, but I checked the compilation command, and the corresponding option has been added before, but it didn’t work.

nvcc -ccbin g++ -I../../common/inc -m64 -gencode arch=compute_62,code=sm_62 -o simplePrintf.o -c simplePrintf.cu
cuda-memcheck ./simplePrintf

Output:
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x4a834]
=========
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:./simplePrintf [0x332ec]
=========
========= ERROR SUMMARY: 2 errors

Robert_Crovella · December 13, 2021, 3:05pm

Perhaps there is something specific to using cuda-memcheck with recent JetPack versions. You might get better help by posting on one of the Jetson forums for your device. You could also file a bug.

Zhuoqian · December 14, 2021, 2:03am

Hi, thank you for your advice. I want to confirm one thing: the program produces results when it is run without cuda-memcheck (i.e. plain “./simplePrintf”). Can I trust the correctness of those results?

Robert_Crovella · December 14, 2021, 3:54am

I won’t be able to say anything about correctness of results based on what is in this thread. Even cuda-memcheck does not guarantee correctness of results, it simply screens for certain types of errors.

almstrand · December 14, 2021, 6:36am

Changing the permissions for file /dev/nvhost-dbg-gpu appears to resolve the issue. This may help un-block developers who come across this thread, but NVIDIA, @Robert_Crovella could this be addressed in a future JetPack version?

sudo chmod a+rw /dev/nvhost-dbg-gpu

Zhuoqian · December 14, 2021, 7:31am

Hi almstrand,
You are right, it works for me. And executing with sudo also works. Thank you very much!

sudo /usr/local/cuda/bin/cuda-memcheck ./simplePrintf

system · December 28, 2021, 7:31am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
All CUDA-capable devices busy or unavailable Jetson TX2 cuda , jetson-inference	3	921	January 5, 2022
CUDA is installed and activated but echo $CUDA_VISIBLE_DEVICES returns nothing and can't use CUDA at all Jetson Xavier NX cuda	24	2639	November 7, 2023
runtime error with cuda 9.1 CUDA Programming and Performance	18	4132	April 24, 2018
VisionWorks+CUDA Segmentation Fault Jetson TX2	23	2220	October 25, 2017
Setting up CUDA/TensorRT manually on Jetson AGX Xavier Jetson AGX Xavier	13	20917	October 18, 2021
cufftPlan1D initialisation hides subsequence memory access errors CUDA Programming and Performance	8	653	November 24, 2020
Build OpenCV 3.4 with CUDA on NVIDIA Jetson TX2 Jetson TX2 opencv	14	9504	October 18, 2021
OpenCV : the function detectMultiScale hangs / crashes on Xavier Jetson AGX Xavier	14	3075	October 18, 2021
JETPACK 4.2 - JETSON TX2 - OpenCV 3.4.6 - GStreamer - Problem with on-board camera Jetson TX2	11	2035	October 18, 2021
How to run your own kernel in jetson nano Jetson Nano opencv	3	517	October 15, 2021

All CUDA-capable devices busy or unavailable

Related topics