I’m computing the FFT of an image using OpenCV’s GPU (CUDA) module, but the time it takes to transfer the data from the CPU to the GPU is longer than the FFT itself, so transfer + FFT on the GPU takes longer than the FFT on the CPU. I don’t see any advantage to using the GPU in this case. Am I right? And how can this issue be solved? I have heard about shared memory… Has anyone had a similar experience, or does anyone have any ideas?
EDIT
This is a sample of my testing code:
#include <stdexcept>
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/cudaarithm.hpp"
#include <iostream>
int main (int argc, char *argv[])
{
cv::Mat image =cv::imread(argv[1],CV_LOAD_IMAGE_GRAYSCALE);
int height, width;
height = image.rows;
width = image.cols;
cv::Size s = image.size();
height = s.height;
width = s.width;
//Convert to 32-bit floating point
image.convertTo(image,CV_32FC1);
//GPU-DFT
int device =cv::cuda::getCudaEnabledDeviceCount();
std::cout<<"Number of CUDA devices: "<< device << std::endl;
int getD = cv::cuda::getDevice();
cv::cuda::setDevice(getD);
//Get dft size
int h =cv:: getOptimalDFTSize( image.rows );
int w =cv:: getOptimalDFTSize( image.cols );
cv::Size dftsize(h,w);
cv::Mat sizedimage;
cv::Mat transform = cv::Mat(h,w/2+1,CV_32FC2);
//Resize Image
cv::resize(image,sizedimage,dftsize);
//Upload image to GpuMat
cv::cuda::GpuMat gputransform = cv::cuda::GpuMat(h,w/2+1,CV_32FC2);
cv::cuda::GpuMat gpuimage;
gpuimage.upload(sizedimage);
//DFT
for(int i=0; i<3;i++)
{
double t = (double)cv::getTickCount();
cv::cuda::dft(gpuimage,gputransform,sizedimage.size());
t = ((double)cv::getTickCount() - t)/cv::getTickFrequency();
std::cout<<"Total time for GPU-DFT: "<<t << std::endl;
}
//Download transformed image to CPU
gputransform.download(transform);
//CPU-DFT
cv::Mat cputransform = cv::Mat(h,w/2+1,CV_32FC2);
double totalcputime = (double)cv::getTickCount();
cv::dft(sizedimage,cputransform);
totalcputime = ((double)cv::getTickCount() - totalcputime)/cv::getTickFrequency();
std::cout<<"\nTotal time for CPU-DFT: "<<totalcputime<<std::endl;
return 0;
}
And that’s what I get:
Number of CUDA devices: 1
Total time for GPU-DFT: 1.08107
Total time for GPU-DFT: 0.0637337
Total time for GPU-DFT: 0.0400113
Total time for CPU-DFT: 0.785276