Hi,
- I encountered this error when I executed cv::cuda::orb:detectAndComputeAsync to do feature point detection.
$ cuda-memcheck ./test_orb
========= CUDA-MEMCHECK
========= Program hit cudaErrorDevicesUnavailable (error 46) due to "all CUDA-capable devices are busy or unavailable" on CUDA API call to cudaMemcpyToSymbol.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/aarch64-linux-gnu/tegra/libcuda.so.1 [0x2fdb04]
========= Host Frame:/opt/vision/opencv/lib/libopencv_cudafeatures2d.so.4.5 [0xa8314]
=========
terminate called after throwing an instance of 'cv::Exception'
what(): OpenCV(4.5.0) /home/lzq/disk/opencv-4.5.0/opencv_contrib-4.5.0/modules/cudafeatures2d/src/cuda/orb.cu:170: error: (-217:Gpu API call) all CUDA-capable devices are busy or unavailable in function 'loadUMax'
========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found
If I execute the program directly ($ ./test_orb), I will get the result, but I don’t know if it’s correct
The environment configuration is as follows:
Jetson TX2 NX
JetPack 4.5
Opencv 4.5.0 opencv_contrib 4.5.0 (Install from source -DWITH_CUDA=ON)
Linux: Ubuntu 20.04
#include <stdio.h>
#include <chrono>
#include <vector>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <cuda_runtime_api.h>
void detectGrid(cv::Ptr<cv::cuda::ORB> & d_detector, cv::Mat & image,
std::vector<cv::KeyPoint> & keypoints, const int range, const int cell_x, const int cell_y)
{
int blockDimx = ceil((image.cols + cell_x - 1) / cell_x);
int blockDimy = ceil((image.rows + cell_y - 1) / cell_y);
std::vector<std::vector<cv::KeyPoint>> sub_keypoints(range);
// for (int i = 0; i < range; i++) {
// sub_keypoints[i].reserve(1000);
// }
//Create CUDA Streams Array
std::shared_ptr<std::vector<cv::cuda::Stream>> streamsArray =
std::make_shared<std::vector<cv::cuda::Stream>>(4);
for (int i = 0; i < 4; i++) {
cv::cuda::Stream stream;
streamsArray->push_back(stream);
}
#pragma omp parallel for num_threads(4)
for (int cell_id = 0; cell_id < range; cell_id++) {
int blockx = cell_id / cell_x;
int blocky = cell_id % cell_x;
int startx = blocky * blockDimx;
int starty = blockx * blockDimy;
int endx = (blocky + 1) * blockDimx;
int endy = (blockx + 1) * blockDimy;
endx = endx <= image.cols ? endx : image.cols;
endy = endy <= image.rows ? endy : image.rows;
cv::Range row_range(starty, endy);
cv::Range col_range(startx, endx);
cv::Mat sub_image = image(row_range, col_range);
cv::cuda::GpuMat d_sub_image;
d_sub_image.upload(sub_image);
std::cout << sub_image.cols << " " << sub_image.rows << std::endl;
cv::cuda::GpuMat d_keypoints;
d_detector->detectAndCompute(d_sub_image, cv::cuda::GpuMat(),
sub_keypoints[cell_id], cv::noArray(), false);
// d_detector->detectAndComputeAsync(d_sub_image, cv::cuda::GpuMat(),
// d_keypoints, cv::noArray(), false, (*streamsArray)[cell_id%4]);
// d_detector->convert(d_keypoints, sub_keypoints[cell_id]);
std::cout << "cell_id: " << cell_id <<
" | sub_keypoints size: " << sub_keypoints[cell_id].size() << std::endl;
}
for (int i = 0; i < range; i++) {
keypoints.insert(
keypoints.end(), sub_keypoints[i].begin(), sub_keypoints[i].end());
}
}
int main ()
{
int nums = 1;
// CPU ORB
cv::Ptr<cv::ORB> detector = cv::ORB::create(
1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
// CUDA ORB
cv::Ptr<cv::cuda::ORB> d_detector = cv::cuda::ORB::create(
1000, 1.2, 1, static_cast<int>(30u), 0, 2, cv::ORB::HARRIS_SCORE, 20, 0);
cv::cuda::Stream stream;
std::cout << "------------------" << std::endl;
// upload image to GPU
std::vector<cv::Mat> image(nums);
std::vector<cv::cuda::GpuMat> d_image(nums);
for (int i = 0; i < nums; i++) {
image[i] = cv::imread("/home/lzq/code/test_orb_demo/images/" +
std::to_string(i) + ".png", CV_8UC1);
d_image[i].upload(image[i]);
}
std::vector<cv::KeyPoint> keypoints_GPU2;
int block_num = 1;
int cell_x = 1;
int cell_y = 1;
// only detect the first image
detectGrid(d_detector, image[0], keypoints_GPU2, block_num, cell_x, cell_y);
cv::Mat image_gpu_grid = image[0];
cv::Scalar colorCircle1(0, 0, 255); // (B, G, R)
for (int i = 0; i < keypoints_GPU2.size(); i++) {
cv::circle(image_gpu_grid, cv::Point(keypoints_GPU2[i].pt.x, keypoints_GPU2[i].pt.y), 0, colorCircle1, 4);
}
cv::imwrite("/home/lzq/code/test_orb_demo/image_gpu_grid.png", image_gpu_grid);
return 0;
}
I build OpenCV by this way, refer to lines 88 to 105 :
sudo apt-get purge -y '*opencv*' || echo "previous OpenCV installation not found" && \
mkdir opencv && \
cd opencv
tar -xzvf OpenCV-4.5.0-aarch64.tar.gz && \
sudo dpkg -i --force-depends *.deb && \
sudo apt-get update && \
sudo apt-get install -y -f --no-install-recommends && \
sudo dpkg -i *.deb && \
sudo rm -rf /var/lib/apt/lists/* && \
sudo apt-get clean && \
cd ../ && \
sudo rm -rf opencv && \
sudo cp -r /usr/include/opencv4 /usr/local/include/opencv4 && \
sudo cp -r /usr/lib/python3.6/dist-packages/cv2 /usr/local/lib/python3.6/dist-packages/cv2
Thanks!