Hi all,
Currently I am working on a project that uses OpenCV with CUDA support to perform some operations (warpPerspective, warpAffine, resize, among others). I am trying to use Streams to execute several computations in parallel, the problem is that even when using streams the whole process runs serialized.
- OpenCV 4.1 with CUDA support
- CUDA 10.2
- GeForce GTX 1050 Ti Mobile (final program will also be used in Jetson platforms)
- NVIDIA Nsight Systems 2020.1.1
Here is a minimal example that can be used to reproduce the problem:
#include "opencv2/opencv.hpp"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/core/cuda.hpp>
#include <vector>
#include <memory>
#include <iostream>
#include <chrono>
#include <ctime>
cv::Mat doRotate(cv::Mat cvInput, double angle, cv::cuda::Stream stream){
cv::Point2f center((cvInput.cols - 1) / 2.0, (cvInput.rows - 1) / 2.0);
cv::Mat rot = cv::getRotationMatrix2D(center, angle, 1.0);
//Determine bounding rectangle, center not relevant.
cv::Rect2f bbox = cv::RotatedRect(cv::Point2f(), cvInput.size(),
angle).boundingRect2f();
//Adjust transformation matrix.
rot.at<double>(0, 2) += bbox.width / 2.0 - cvInput.cols / 2.0;
rot.at<double>(1, 2) += bbox.height / 2.0 - cvInput.rows / 2.0;
cv::Rect2f rBox = cv::RotatedRect(cv::Point2f(), cvInput.size(),angle).boundingRect2f();
cv::Mat output;
cv::cuda::HostMem cudaMemSrc = cv::cuda::HostMem(cvInput, cv::cuda::HostMem::PAGE_LOCKED);
cv::cuda::HostMem cudaMemDst = cv::cuda::HostMem(output, cv::cuda::HostMem::PAGE_LOCKED);
cv::cuda::GpuMat src, dst;
src.upload(cudaMemSrc,stream);
cv::cuda::warpAffine(src, dst, rot, rBox.size(),0,0,cv::Scalar(),stream);
dst.download(output,stream);
return output;
}
std::shared_ptr<std::vector<cv::Mat>> computeArray(std::shared_ptr<std::vector<cv::Mat>> inputArray,
std::shared_ptr<std::vector<cv::cuda::Stream>> streamsArray){
std::vector<cv::Mat> *array = inputArray.get();
std::shared_ptr<std::vector<cv::Mat>> outputArray = std::make_shared< std::vector<cv::Mat> >();
//#pragma omp parallel for
for(unsigned int i=0; i<array->size(); i++){
cv::Mat result = doRotate((*array)[i],180,(*streamsArray)[i]);
outputArray->push_back(result);
}
(*streamsArray)[0].waitForCompletion();
(*streamsArray)[1].waitForCompletion();
(*streamsArray)[2].waitForCompletion();
(*streamsArray)[3].waitForCompletion();
return outputArray;
}
int main (int argc, char* argv[]){
std::shared_ptr<std::vector<cv::cuda::Stream>> streamsArray = std::make_shared<std::vector<cv::cuda::Stream>>();
cv::cuda::Stream streamA, streamB, streamC, streamD;
streamsArray->push_back(streamA);
streamsArray->push_back(streamB);
streamsArray->push_back(streamC);
streamsArray->push_back(streamD);
for(int i=0; i<20; i++){
try{
cv::Mat src_host = cv::imread("1080.jpg");
std::shared_ptr<std::vector<cv::Mat>> images = std::make_shared<std::vector<cv::Mat>>();
images->push_back(src_host);
images->push_back(src_host);
images->push_back(src_host);
images->push_back(src_host);
std::shared_ptr<std::vector<cv::Mat>> result = std::make_shared<std::vector<cv::Mat>>();
result = computeArray(images, streamsArray);
//cv::imshow("Result", (*result)[0]);
//cv::waitKey(0);
}
catch(const cv::Exception& ex){
std::cout << "Error: " << ex.what() << std::endl;
}
}
return 0;
}
As seen in the previous image the 4 streams are detected by Nsight but they still execute in serial.
Also if the following line is uncommented:
#pragma omp parallel for
Result is different and HtoD operations seem to happen first, but the overall is still serialized.
Things I have tried so far:
1- Compile OpenCV with -DCUDA_NVCC_FLAGS=“–default-stream per-thread;” option
2- Use pinned vs shared memory
3- Use global allocated image (to avoid allocating memory each iteration)
Any help or hint will be appreciated
Regards,
Fabian