Hi everyone, I implemented an image convolution twice on my Jetson Nano (JetPack 4.3): once as a hand-written CUDA kernel, and once using the OpenCV CUDA convolution, with OpenCV 4.0.0 recompiled with CUDA support after removing the OpenCV version shipped with JetPack. I paste my OpenCV code below, including the convolution matrix. I used the same matrix in the "hand-written" CUDA convolution (plain CUDA code, without OpenCV). The problem is that the OpenCV CUDA version of the convolution is about 200 times slower than the hand-written CUDA one. In the OpenCV version I am trying to use unified memory so that the same buffers can be accessed as both Mat and GpuMat by other OpenCV functions. Do you have any suggestions for speeding up this code on the Jetson Nano?
cudaSetDeviceFlags(cudaDeviceMapHost); //Support for mapped pinned allocations
int rows = 512;
int cols = 640;
int righekernel=7;
int colonnekernel=7;
float *h_a, *cu_kernel, *h_result;
QTime Timek1, Timek2;
int elaps;
float kernelmatr[righekernel*colonnekernel]= {
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04};
//Allocate memory for device pointers
cudaMallocManaged(&h_a, sizeof(float)*rows*cols);
cudaMallocManaged(&h_result, sizeof(float)*rows*cols);
cudaMallocManaged(&cu_kernel, sizeof(float)*righekernel*colonnekernel);
memcpy(cu_kernel,kernelmatr,righekernel*colonnekernel*sizeof(float));
//Mats (declaring them using pointers)
cv::Mat hmat_a(cv::Size(cols, rows), CV_32F, h_a);
cv::Mat hmat_orig(cv::Size(cols, rows), CV_32F, h_a);
cv::Mat hmat_result(cv::Size(cols, rows), CV_32F, h_result);
cv::Mat hmat_kernel(cv::Size(colonnekernel, righekernel), CV_32F, cu_kernel);
//Gpu Mats (declaring with the same pointers!)
cv::cuda::GpuMat dmat_a(cv::Size(cols, rows), CV_32F, h_a);
cv::cuda::GpuMat dmat_result(cv::Size(cols, rows), CV_32F, h_result);
cv::cuda::GpuMat dmat_kernel(cv::Size(colonnekernel, righekernel), CV_32F, cu_kernel);
hmat_orig = cv::imread("img_in.bmp",0);
hmat_orig.convertTo(hmat_a, CV_32FC1);
cv::Ptr<cv::cuda::Convolution> conv= cv::cuda::createConvolution(cv::Size(7, 7));
Timek1 = QTime::currentTime();
conv->convolve(dmat_a, dmat_kernel, dmat_result);
Timek2 = QTime::currentTime();
elaps = Timek1.msecsTo(Timek2);
ui->textEdit_UM->setText(QString::number(elaps));
cv::Mat dst;
dmat_result.download(dst);
cv::imwrite("out.bmp",dst);
cudaFree(&h_a);
cudaFree(&h_result);
cudaFree(&cu_kernel);