All CUDA-capable devices busy or unavailable

Zhuoqian · December 14, 2021, 2:19am

Hi,
I described this problem on the CUDA forum before, but it has not been resolved yet.

The specific question is:

I encountered this error when I executed cv::cuda::ORB::detectAndComputeAsync to do feature point detection.

$ cuda-memcheck ./test_orb
========= CUDA-MEMCHECK
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaMemcpyToSymbol.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:/opt/vision/opencv/lib/libopencv_cudafeatures2d.so.4.5 [0xa8314]
=========
terminate called after throwing an instance of 'cv::Exception'
what(): OpenCV(4.5.0) /home/lzq/disk/opencv-4.5.0/opencv_contrib-4.5.0/modules/cudafeatures2d/src/cuda/orb.cu:170: error: (-217:Gpu API call) all CUDA-capable devices are busy or unavailable in function 'loadUMax'

========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found

If I execute the program directly ($ ./test_orb), I get a result, but I don’t know if it’s correct.

The environment configuration is as follows:
Jetson TX2 NX
JetPack 4.5
Opencv 4.5.0 opencv_contrib 4.5.0 (Install from source -DWITH_CUDA=ON)
Linux: Ubuntu 20.04

#include <stdio.h>
#include <chrono>
#include <vector>
#include <algorithm>

#include <opencv2/opencv.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <cuda_runtime_api.h>

/**
 * @brief Detect ORB keypoints cell-by-cell on a regular grid laid over an image.
 *
 * The image is divided into cells of size ceil(cols / cell_x) x ceil(rows / cell_y).
 * Cells are numbered row-major with cell_x cells per row; cells 0 .. range-1 are
 * cropped, uploaded to the GPU and passed to the CUDA ORB detector. The detected
 * keypoints are shifted back into full-image coordinates and appended to
 * @p keypoints in cell order.
 *
 * @param d_detector CUDA ORB detector shared by all cells.
 * @param image      Host image to process (not modified by this function).
 * @param keypoints  Output; keypoints of all processed cells are appended.
 * @param range      Number of cells to process, starting from cell 0.
 * @param cell_x     Number of grid cells along the x axis (columns).
 * @param cell_y     Number of grid cells along the y axis (rows).
 */
void detectGrid(cv::Ptr<cv::cuda::ORB> & d_detector, cv::Mat & image,
  std::vector<cv::KeyPoint> & keypoints, const int range, const int cell_x, const int cell_y)
{
  // Nothing to do for a missing detector, an empty image or a degenerate grid.
  // This also protects the divisions below from a zero divisor.
  if (!d_detector || image.empty() || range <= 0 || cell_x <= 0 || cell_y <= 0) {
    return;
  }

  // Cell size in pixels; the integer expression already rounds up.
  const int cell_width = (image.cols + cell_x - 1) / cell_x;
  const int cell_height = (image.rows + cell_y - 1) / cell_y;

  std::vector<std::vector<cv::KeyPoint>> sub_keypoints(range);

  #pragma omp parallel for num_threads(4)
  for (int cell_id = 0; cell_id < range; cell_id++) {
    const int grid_row = cell_id / cell_x;
    const int grid_col = cell_id % cell_x;
    const int startx = grid_col * cell_width;
    const int starty = grid_row * cell_height;
    // Skip cell ids that fall outside the image (e.g. range > cell_x * cell_y),
    // which would otherwise produce an invalid ROI.
    if (startx >= image.cols || starty >= image.rows) {
      continue;
    }
    const int endx = std::min(startx + cell_width, image.cols);
    const int endy = std::min(starty + cell_height, image.rows);

    cv::Mat sub_image = image(cv::Range(starty, endy), cv::Range(startx, endx));
    cv::cuda::GpuMat d_sub_image;
    d_sub_image.upload(sub_image);

    // NOTE(review): the detector object is shared between threads and appears to
    // keep internal working buffers, so concurrent calls on it are presumably not
    // safe. Serialise the call (and the log output, so lines do not interleave).
    #pragma omp critical(detect_grid_orb)
    {
      std::cout << sub_image.cols << " " << sub_image.rows << std::endl;
      d_detector->detectAndCompute(d_sub_image, cv::cuda::GpuMat(),
        sub_keypoints[cell_id], cv::noArray(), false);
      std::cout << "cell_id: " << cell_id <<
        " | sub_keypoints size: " << sub_keypoints[cell_id].size() << std::endl;
    }

    // The detector reports coordinates relative to the cropped cell; shift them
    // back into the coordinate frame of the full image.
    for (cv::KeyPoint & kp : sub_keypoints[cell_id]) {
      kp.pt.x += static_cast<float>(startx);
      kp.pt.y += static_cast<float>(starty);
    }
  }

  // Merge the per-cell results in cell order.
  std::size_t total = keypoints.size();
  for (const std::vector<cv::KeyPoint> & cell_keypoints : sub_keypoints) {
    total += cell_keypoints.size();
  }
  keypoints.reserve(total);
  for (const std::vector<cv::KeyPoint> & cell_keypoints : sub_keypoints) {
    keypoints.insert(keypoints.end(), cell_keypoints.begin(), cell_keypoints.end());
  }
}

int main ()
{
  int nums = 1;
  // CPU ORB 
  cv::Ptr<cv::ORB> detector = cv::ORB::create(
    1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
  // CUDA ORB
  cv::Ptr<cv::cuda::ORB> d_detector = cv::cuda::ORB::create(
    1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
  cv::cuda::Stream stream;

  std::cout << "------------------" << std::endl;
  // upload image to GPU
  std::vector<cv::Mat> image(nums);
  std::vector<cv::cuda::GpuMat> d_image(nums);
  for (int i = 0; i < nums; i++) {
    image[i] = cv::imread("/home/lzq/code/test_orb_demo/images/" +
      std::to_string(i) + ".png", CV_8UC1);
    d_image[i].upload(image[i]);
  }

  std::vector<cv::KeyPoint> keypoints_GPU2;
  int block_num = 1;
  int cell_x = 1;
  int cell_y = 1;
  // only detect the first image
  detectGrid(d_detector, image[0], keypoints_GPU2, block_num, cell_x, cell_y);

  cv::Mat image_gpu_grid = image[0];
  cv::Scalar colorCircle1(0, 0, 255); // (B, G, R)
  for (int i = 0; i < keypoints_GPU2.size(); i++) {
    cv::circle(image_gpu_grid, cv::Point(keypoints_GPU2[i].pt.x, keypoints_GPU2[i].pt.y), 0, colorCircle1, 4);
  }
  cv::imwrite("/home/lzq/code/test_orb_demo/image_gpu_grid.png", image_gpu_grid);
  return 0;
}

I want to confirm: since I do get results when I run “./test_orb” directly, without cuda-memcheck, can I trust the correctness of those results?

AastaLLL · December 14, 2021, 5:26am

Hi,

Since profiling needs root authority, could you try it with sudo again?

For example:

$ sudo /usr/local/cuda/bin/cuda-memcheck ./test_orb

Thanks.

Zhuoqian · December 14, 2021, 5:43am

Hi AastaLLL,
You are right, I tried it with sudo again and it works.

Thank you very much!

system · January 5, 2022, 5:04am

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.