Hi,
I’m running some basic CPU-vs-GPU benchmarks with OpenCV 4.5.2 and CUDA 11.4 on Ubuntu 21.xx.
Each DFT is taking 80 ms, which is definitely wrong. I profiled with Nsight and everything looks fine at the hardware level, with no memory copies happening. On the API side, however, CUDA spends about 80 ms in a cuModuleLoad call on every single loop iteration, and I don’t understand why it’s doing that.
Any thoughts on how to proceed, or on what I may have gotten wrong? Thanks. I don’t think this happened in my previous environment, which got nuked.
Number of CUDA devices: 1
Total time for GPU-DFT: 0.94756
Total time for GPU-DFT: 0.822355
Total time for GPU-DFT: 0.817892
Total time for GPU-DFT: 0.81832
Total time for GPU-DFT: 0.815274
Total time for GPU-DFT: 0.811999
Total time for GPU-DFT: 0.804606
Total time for GPU-DFT: 0.796385
Total time for GPU-DFT: 0.798238
Total time for GPU-DFT: 0.792199
Total time for GPU-DFT: 0.791789
Total time for GPU-DFT: 0.828016
Total time for CPU-DFT: 0.0381671
Total time for CPU-DFT: 0.0340495
Total time for CPU-DFT: 0.0344006
Total time for CPU-DFT: 0.0340798
Total time for CPU-DFT: 0.0342437
Total time for CPU-DFT: 0.0343547
Total time for CPU-DFT: 0.0339979
Total time for CPU-DFT: 0.0341221
Total time for CPU-DFT: 0.0345065
Total time for CPU-DFT: 0.0341161
Total time for CPU-DFT: 0.034196
The standard tutorial code I’m running is below:
int main()
{
cv::Mat image =cv::imread("720p_ifft.png", cv::COLOR_BGR2GRAY);
cv::imshow("image",image);
int height, width;
height = image.rows;
width = image.cols;
cv::Size s = image.size();
height = s.height;
width = s.width;
//Convert to 32-bit floating point
image.convertTo(image,CV_32FC1);
//GPU-DFT
int device =cv::cuda::getCudaEnabledDeviceCount();
std::cout<<"Number of CUDA devices: "<< device << std::endl;
int getD = cv::cuda::getDevice();
cv::cuda::setDevice(getD);
//Get dft size
int h =cv:: getOptimalDFTSize( image.rows );
int w =cv:: getOptimalDFTSize( image.cols );
cv::Size dftsize(h,w);
cv::Mat sizedimage;
cv::Mat transform = cv::Mat(h,w/2+1,CV_32FC2);
//Resize Image
cv::resize(image,sizedimage,dftsize);
//Upload image to GpuMat
cv::cuda::GpuMat gputransform = cv::cuda::GpuMat(h,w/2+1,CV_32FC2);
cv::cuda::GpuMat gpuimage;
gpuimage.upload(sizedimage);
//DFT
for(int i=0;i<=11;i++){
double t = (double)cv::getTickCount();
cv::cuda::dft(gpuimage,gputransform,sizedimage.size());
t = ((double)cv::getTickCount() - t)/cv::getTickFrequency();
std::cout<<"Total time for GPU-DFT: "<<t << std::endl;
//Download transformed image to CPU
}
gputransform.download(transform);
//CPU-DFT
for(int j=0;j<=10;j++){
cv::Mat cputransform = cv::Mat(h,w/2+1,CV_32FC2);
double totalcputime = (double)cv::getTickCount();
cv::dft(sizedimage,cputransform);
totalcputime = ((double)cv::getTickCount() - totalcputime)/cv::getTickFrequency();
std::cout<<"Total time for CPU-DFT: "<<totalcputime<<std::endl;
}
return 0;
}