OpenCV CUDA convolution extremely slow compared to bare CUDA convolution on Jetson Nano using unified memory

Hi everyone, I wrote an image convolution directly as a CUDA kernel and then tried the OpenCV CUDA convolution on my Jetson Nano (JetPack 4.3), with CUDA and OpenCV 4.0.0 recompiled after removing the JetPack OpenCV version. Below is my OpenCV code with the convolution matrix; I used the same matrix in the "handwritten" CUDA convolution (plain CUDA code without OpenCV). The problem is that the OpenCV CUDA version of the convolution is about 200 times slower than the "handwritten" CUDA convolution. In the OpenCV version I am trying to use unified memory so that the same buffers can be accessed as Mat and GpuMat by other OpenCV functions. Do you have any suggestions to speed up this code on the Jetson Nano?

cudaSetDeviceFlags(cudaDeviceMapHost); //Support for mapped pinned allocations

int rows = 512;
int cols = 640;
int righekernel=7;
int colonnekernel=7;
float *h_a, *cu_kernel, *h_result;
QTime Timek1, Timek2;
int elaps;
float kernelmatr[righekernel*colonnekernel]= {
         0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
        0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
        0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
        0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
        0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
        0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
        0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04};

//Allocate memory for device pointers
cudaMallocManaged(&h_a, sizeof(float)*rows*cols);
cudaMallocManaged(&h_result, sizeof(float)*rows*cols);
cudaMallocManaged(&cu_kernel, sizeof(float)*righekernel*colonnekernel);
memcpy(cu_kernel,kernelmatr,righekernel*colonnekernel*sizeof(float));

//Mats (declaring them using pointers)
cv::Mat hmat_a(cv::Size(cols, rows), CV_32F, h_a);
cv::Mat hmat_orig(cv::Size(cols, rows), CV_32F, h_a);
cv::Mat hmat_result(cv::Size(cols, rows), CV_32F, h_result);
cv::Mat hmat_kernel(cv::Size(colonnekernel, righekernel), CV_32F, cu_kernel);

//Gpu Mats (declaring with the same pointers!)
cv::cuda::GpuMat dmat_a(cv::Size(cols, rows), CV_32F, h_a);
cv::cuda::GpuMat dmat_result(cv::Size(cols, rows), CV_32F, h_result);
cv::cuda::GpuMat dmat_kernel(cv::Size(colonnekernel, righekernel), CV_32F, cu_kernel);

hmat_orig = cv::imread("img_in.bmp",0);
hmat_orig.convertTo(hmat_a, CV_32FC1);
cv::Ptr<cv::cuda::Convolution> conv= cv::cuda::createConvolution(cv::Size(7, 7));

Timek1 = QTime::currentTime();
conv->convolve(dmat_a, dmat_kernel, dmat_result);
Timek2 = QTime::currentTime();
elaps = Timek1.msecsTo(Timek2);
ui->textEdit_UM->setText(QString::number(elaps));


cv::Mat dst;
dmat_result.download(dst);
cv::imwrite("out.bmp",dst);




cudaFree(h_a);
cudaFree(h_result);
cudaFree(cu_kernel);

Hi,

Have you built OpenCV with cuDNN support?

I'm not sure how you built OpenCV on the Jetson.
Here is a good example for your reference:

Thanks.

Yes, I ran exactly the same script when I configured the Jetson. I think the problem is in the UNIFIED MEMORY management in the OpenCV CUDA configuration. Is there any flag to select in the Tegra OpenCV setup/configuration?
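
As an aside, another way I could share a single buffer between a Mat and a GpuMat, without calling cudaMallocManaged myself, would be cv::cuda::HostMem with the SHARED allocation type (zero-copy mapped memory). This is only a minimal sketch of that alternative, not something I have benchmarked:

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

int main() {
    const int rows = 512, cols = 640;

    // SHARED = mapped/zero-copy allocation, visible to both CPU and GPU on Tegra
    cv::cuda::HostMem shared(rows, cols, CV_32FC1, cv::cuda::HostMem::SHARED);

    cv::Mat          h = shared.createMatHeader();     // CPU view, no copy
    cv::cuda::GpuMat d = shared.createGpuMatHeader();  // GPU view, same buffer

    h.setTo(cv::Scalar(1.0f));                          // write from the CPU

    cv::cuda::Stream stream;
    cv::cuda::add(d, cv::Scalar(1.0), d, cv::noArray(), -1, stream);  // GPU work in place
    stream.waitForCompletion();                         // make the result visible to the CPU view

    // h now holds the GPU result as well
    return 0;
}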

I followed this guide to write the CUDA code:

and this discussion for the code posted before, which uses the OpenCV CUDA implementation:

==> so maybe the problem is the OpenCV convolution implementation. Any suggestions?

Be aware that CUDA may take a long time to set up the first time, up to a few seconds.
Put the convolution in a loop and you'll probably find that the next iterations are much faster.

EDIT: I just checked with this code:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#include "cuda_runtime.h"

#include "opencv2/core.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/imgcodecs.hpp" 
#include "opencv2/highgui.hpp" 
 

int main() {
	/* Convolution kernel in unified memory */
	const int kern_height = 7;
	const int kern_width = 7;
	float * u_kern;
	cudaMallocManaged(&u_kern, sizeof(float) * kern_height * kern_width);
	cv::Mat          h_kernel(kern_height, kern_width, CV_32FC1, u_kern);
	cv::cuda::GpuMat d_kernel(kern_height, kern_width, CV_32FC1, u_kern);
	// Set kernel from CPU. Here flat kernel as example
	h_kernel.setTo(1.0/(kern_height * kern_width));   

	/* Prepare convolution filter */
	cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution(cv::Size(kern_height, kern_width));

	/* Read image and get resolution */
	cv::Mat readFrame = cv::imread("/usr/local/cuda/samples/3_Imaging/dct8x8/data/barbara.bmp", cv::IMREAD_GRAYSCALE);
	unsigned int height = readFrame.rows;
	unsigned int width = readFrame.cols;

	cv::imshow("Original", readFrame);
	//cv::waitKey(-1);

	/* Float frame in unified memory */
	float * u_fFrame;
	cudaMallocManaged(&u_fFrame, sizeof(float) * height * width);
	cv::Mat          h_fFrame(height, width, CV_32FC1, u_fFrame);
	cv::cuda::GpuMat d_fFrame(height, width, CV_32FC1, u_fFrame);

	/* Float result in unified memory */
	float * u_fResult;
	unsigned int res_height = height - kern_height + 1;
	unsigned int res_width = width - kern_width + 1;
	cudaMallocManaged(&u_fResult, sizeof(float) * res_height * res_width);
	cv::Mat          h_fResult(res_height, res_width, CV_32FC1, u_fResult);
	cv::cuda::GpuMat d_fResult(res_height, res_width, CV_32FC1, u_fResult);


	/* Process */
	std::cout<<"Starting..." << std::endl;
	readFrame.convertTo(h_fFrame, CV_32FC1);

	int loops = 10;
	while(loops--) {
		double prev = (double) cv::getTickCount();
		conv->convolve(d_fFrame, d_kernel, d_fResult);
		cudaDeviceSynchronize();
		double now = (double) cv::getTickCount();
		double delta = (now - prev) / cv::getTickFrequency();
		std::cout<<"convolution time =" << delta << std::endl;
	}

	cv::Mat h_result(res_height, res_width, CV_8UC1);
	h_fResult.convertTo(h_result, CV_8UC1);
	/* Processing done */


	cv::imshow("Filtered", h_result);
	cv::waitKey(-1);
	cv::imwrite("filtered.bmp", h_result);


	/* Clean up */
	cudaFree(u_fResult);
	cudaFree(u_fFrame);
	cudaFree(u_kern);

	return 0;
}

and got these timings:

Starting...
convolution time =2.04625
convolution time =0.00437133
convolution time =0.00437347
convolution time =0.0041271
convolution time =0.00897517
convolution time =0.00619217
convolution time =0.00867555
convolution time =0.00689813
convolution time =0.00498308
convolution time =0.00404592

Dear Honey_Patouceul, I tried your code but unfortunately the convolution results are images with artifacts (in the attachment you can see the original image and the convolved image with a large vertical line). This does not happen with the image you used in your example, which comes from the CUDA samples installed on the Jetson. Do I need to change anything in the image format? I am also attaching the original image for this case.

Original image (I load it in bmp format)

Convolved image (large black vertical line after convolution)

I also wrote the convolution directly in CUDA with unified memory (the handwritten CUDA version has no artifacts), but even in that case I have a very high startup cost and I can't understand why. Do you have any idea why the first iterations in CUDA are so slow?

The cause may be the improper size given to createConvolution. I did not pay attention and reproduced your original code, which only works up to 512 pixels.
Giving the frame size instead should solve your problem:

     ...
     unsigned int height = readFrame.rows;
     unsigned int width = readFrame.cols;

     /* Prepare convolution filter (note: cv::Size is (width, height)) */
     cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution(cv::Size(width, height));

Someone from NVIDIA may advise better, or at least explain the first-iteration setup time.
Obviously, the next iterations may benefit from caching, but there may be more to it that I'm not aware of.

Thanks, this solved the artifact problem. I just hope someone from NVIDIA can help us figure out how to improve the setup time. The setup time is high in OpenCV, but it is also high (although shorter) in the code I wrote directly in CUDA. I hope NVIDIA can clarify this for both the OpenCV approach and the CUDA approach. Thanks again Honey_Patouceul.

Hi,

Would you mind sharing a complete source to reproduce this?
Or can we use the Honey_Patouceul sample directly?

Thanks.

Dear AastaLLL, I apologize for the delay in replying.
To reproduce my situation you can use the Honey_Patouceul sample (which is the version tested with GPU-based OpenCV), while for the CUDA code I am attaching an example project with the convolution written directly in CUDA.

From my tests I found that the code written directly in CUDA executes the convolution faster than the GPU-based OpenCV version, but it is still slow during setup. How can I speed up the OpenCV version? And how can I speed up the CUDA one?

In the CUDA version I have doubts about whether the following are correct:

  1. grid and block sizing to maximize kernel execution speed (the target image size is 640x512 with a 7x7 convolution kernel)

  2. management of unified memory to ensure maximum access speed to the image data (see the sketch at the end of this post)

Thanks in advance. CUDA_CONV.zip (497.9 KB)
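
For point 2, this is roughly the access pattern I have in mind (a minimal sketch, not the attached project). On the Nano the CPU must not touch managed memory while GPU work that can use it is still in flight, so every CPU/GPU hand-over is separated by a stream synchronization:

#include <cuda_runtime.h>
#include <opencv2/core.hpp>

int main() {
    const int rows = 512, cols = 640;
    float *u_img = nullptr;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaMallocManaged(&u_img, sizeof(float) * rows * cols);
    // Restrict GPU access to this one stream, so kernels on other streams
    // no longer block CPU access to u_img
    cudaStreamAttachMemAsync(stream, u_img, 0, cudaMemAttachSingle);
    cudaStreamSynchronize(stream);

    cv::Mat h_img(rows, cols, CV_32FC1, u_img);   // CPU view of the managed buffer
    h_img.setTo(cv::Scalar(0));                   // safe: the stream is idle

    // ... launch the convolution kernel on `stream` using u_img here ...

    cudaStreamSynchronize(stream);                // required before reading u_img on the CPU again
    float first = h_img.at<float>(0, 0);
    (void) first;

    cudaStreamDestroy(stream);
    cudaFree(u_img);
    return 0;
}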

Hi,

Since we don't own the OpenCV implementation, you may want to check this issue with the OpenCV developers to get more information.

Based on their code, it seems that they implement the convolution through cuFFT rather than cuDNN.
Depending on the use case, converting the spatial signal to the Fourier domain may not always be a gain, due to the transformation overhead.
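
For a small 7x7 kernel, a purely spatial filter from the cudafilters module (cv::cuda::createLinearFilter) might be worth comparing against the FFT-based cv::cuda::Convolution. This is only a minimal sketch, not something we have benchmarked on the Nano:

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafilters.hpp>
#include <opencv2/imgcodecs.hpp>

int main() {
    cv::Mat img = cv::imread("img_in.bmp", cv::IMREAD_GRAYSCALE);
    img.convertTo(img, CV_32FC1);

    cv::Mat kernel = cv::Mat::ones(7, 7, CV_32FC1) * 0.04f;   // same flat kernel as above

    cv::cuda::GpuMat d_src, d_dst;
    d_src.upload(img);

    // Note: createLinearFilter computes a correlation (like cv::filter2D);
    // for a symmetric kernel this is identical to the convolution
    cv::Ptr<cv::cuda::Filter> filter =
        cv::cuda::createLinearFilter(CV_32FC1, CV_32FC1, kernel);

    filter->apply(d_src, d_dst);   // first run includes the one-time setup cost
    filter->apply(d_src, d_dst);   // subsequent runs should be faster

    cv::Mat result;
    d_dst.download(result);
    result.convertTo(result, CV_8UC1);
    cv::imwrite("out_filter.bmp", result);
    return 0;
}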

Regarding the slow cuDNN issue, this is a known regression in cuDNN v8.
https://forums.developer.nvidia.com/t/darknet-slower-using-jetpack-4-4-cudnn-8-0-0-cuda-10-2-than-jetpack-4-3-cudnn-7-6-3-cuda-10-0/
Our internal team is working on this. We will share the latest status once we get any update.

Thanks.


Thank you very much AastaLLL, I look forward to your feedback on the fix NVIDIA is working on and to your updates. In the meantime I have simplified the code written directly in CUDA (without cuDNN and without cuFFT) for the convolution so that it is easier to understand. This code still has the problem that, when calling the CUDA kernel in a loop, the first iterations are slower. Is there a way to make these first iterations faster? I would really appreciate your help.

#include <iostream>
#include <cstdlib>
#include <time.h>
#include <cuda_runtime.h>
#include <thread>         // std::this_thread::sleep_for
#include <chrono>
#include <math.h>
#include "opencv2/opencv.hpp"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/core/cuda.hpp"
#include "opencv2/highgui.hpp"
#include "opencv4/opencv2/core/cuda.hpp"
#include "opencv4/opencv2/cudaarithm.hpp"
#include "opencv4/opencv2/cudafilters.hpp"
#include "opencv4/opencv2/imgproc.hpp"

using namespace cv;
using namespace std;
//using namespace std:: chrono;


#define TILE_WIDTH 16
#define maskCols 7
#define maskRows 7
#define w (TILE_WIDTH + maskCols -1)

__global__ void myProcessing(float * InputImageData, const float *__restrict__ kernel, float* outputImageData, int channels, int width, int height){

	__shared__ float N_ds[w][w];  //block of image in shared memory


	// allocation in shared memory of image blocks
	int maskRadius = maskRows/2;
 	for (int k = 0; k <channels; k++) {
 		int dest = threadIdx.y * TILE_WIDTH + threadIdx.x;
 		int destY = dest/w;     //row of shared memory
 		int destX = dest%w;		//col of shared memory
 		int srcY = blockIdx.y *TILE_WIDTH + destY - maskRadius; // index to fetch data from input image
 		int srcX = blockIdx.x *TILE_WIDTH + destX - maskRadius; // index to fetch data from input image
 		int src = (srcY *width +srcX) * channels + k;   // index of input image
 		if(srcY>= 0 && srcY < height && srcX>=0 && srcX < width)
 			N_ds[destY][destX] = InputImageData[src];  // copy element of image in shared memory
 		else
 			N_ds[destY][destX] = 0;



 		dest = threadIdx.y * TILE_WIDTH+ threadIdx.x + TILE_WIDTH * TILE_WIDTH;
 		destY = dest/w;
		destX = dest%w;
		srcY = blockIdx.y *TILE_WIDTH + destY - maskRadius;
		srcX = blockIdx.x *TILE_WIDTH + destX - maskRadius;
		src = (srcY *width +srcX) * channels + k;
		if(destY < w){
			if(srcY>= 0 && srcY < height && srcX>=0 && srcX < width)
				N_ds[destY][destX] = InputImageData[src];
			else
				N_ds[destY][destX] = 0;
		}

 		__syncthreads();


 		//compute kernel convolution
 		float accum = 0;
 		int y, x;
 		for (y= 0; y < maskCols; y++)
 			for(x = 0; x<maskRows; x++)
 				accum += N_ds[threadIdx.y + y][threadIdx.x + x] *kernel[y * maskCols + x];

 		y = blockIdx.y * TILE_WIDTH + threadIdx.y;
 		x = blockIdx.x * TILE_WIDTH + threadIdx.x;
 		if(y < height && x < width)
 			outputImageData[(y * width + x) * channels + k] = accum;
 		__syncthreads();


 	}

}




int main(){

	int imageChannels;
	int imageHeight;
	int imageWidth;

	float* deviceInputImageData;
	float* deviceOutputImageData;
	float* deviceMaskData;
	float hostMaskData[maskRows * maskCols]={
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04,
			0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04

	};

	//______________________________________________________ unified memory setup
	Mat image, outputImage;

	imageWidth = 640;
	imageHeight = 480;
	imageChannels = 1;

	Mat imageresult = Mat::zeros(imageHeight, imageWidth, CV_32FC1);

	cudaDeviceReset();

	cudaMallocManaged(&deviceInputImageData, imageWidth * imageHeight * imageChannels * sizeof(float));
	cudaMallocManaged(&deviceOutputImageData, imageWidth * imageHeight * imageChannels * sizeof(float));
	cudaMallocManaged(&deviceMaskData, maskRows * maskCols * sizeof(float));

	// image1 wraps the managed input buffer, so convertTo() writes straight into unified memory
	// (the loaded image must be imageWidth x imageHeight, otherwise convertTo() reallocates
	// and the managed buffer is bypassed)
	Mat image1(imageHeight, imageWidth, CV_32FC1, deviceInputImageData);

	image = imread("img/img.jpg", IMREAD_GRAYSCALE);
	image.convertTo(image1, CV_32FC1);

	// for (int i = 0; i < maskRows * maskCols; i++)
	//     deviceMaskData[i] = 0.04;
	memcpy(deviceMaskData, hostMaskData, maskRows * maskCols * sizeof(float));
	//______________________________________________________ end of unified memory setup

	dim3 dimGrid(ceil((float) imageWidth / TILE_WIDTH), ceil((float) imageHeight / TILE_WIDTH));
	dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

	myProcessing<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, imageChannels, imageWidth, imageHeight);
	cudaDeviceSynchronize();

	memcpy(imageresult.data, (unsigned char*) deviceOutputImageData, imageresult.cols * imageresult.rows * sizeof(float));
	imageresult.convertTo(imageresult, CV_8UC1);
	imwrite("output/res.bmp", imageresult);

	cudaMemset(deviceInputImageData, 0, imageWidth * imageHeight * imageChannels * sizeof(float));
	cudaMemset(deviceOutputImageData, 0, imageWidth * imageHeight * imageChannels * sizeof(float));
	cudaMemset(deviceMaskData, 0, maskRows * maskCols * sizeof(float));

	cudaFree(deviceInputImageData);
	cudaFree(deviceOutputImageData);
	cudaFree(deviceMaskData);

	return 0;
}

Thanks again.

Hi,

It's known that the very first CUDA kernel launch takes longer because of GPU initialization.
So you can add a warm-up step to avoid that latency in the first run.
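
For example, a minimal sketch that reuses the myProcessing kernel, grid/block configuration and buffers from your posted code (launch once untimed, then measure the following iterations):

// Warm-up launch: pays the one-time GPU initialization cost, not timed
myProcessing<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData,
                                    deviceOutputImageData, imageChannels,
                                    imageWidth, imageHeight);
cudaDeviceSynchronize();

for (int i = 0; i < 10; i++) {
    double t0 = (double) cv::getTickCount();
    myProcessing<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData,
                                        deviceOutputImageData, imageChannels,
                                        imageWidth, imageHeight);
    cudaDeviceSynchronize();   // wait for the kernel before stopping the timer
    double dt = ((double) cv::getTickCount() - t0) / cv::getTickFrequency();
    std::cout << "convolution time = " << dt << std::endl;
}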

Thanks.