Image processing is faster on CPU than with CUDA

Hi,

I’m new to CUDA programming and I’m trying to write a program that takes a frame from a camera with OpenCV and gives me a mask of a ball (all the pixels where the ball is are white and the others are black).

I tried it with OpenCV and it takes around 20 ms to process one frame. So I tried with CUDA, but I get the same result. I hoped it would be much faster, and I can’t understand why it isn’t faster than the CPU.

Here is my program :

#include <cuda.h>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <ctime>
#include <chrono>

#define BLOCK_ROWS 32
#define BLOCK_COLS 16

using namespace std;
using namespace std::chrono;
using namespace cv;

// Wraps a CUDA runtime call: evaluates it and hands the returned status to
// gpuAssert together with the source location of the call site.
#define CHECKCUDAERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/*
 * Reports a failed CUDA runtime call on stderr and, by default, terminates
 * the process.
 *
 *   code  - status returned by the CUDA call
 *   file  - source file of the call site (normally __FILE__)
 *   line  - source line of the call site (normally __LINE__)
 *   abort - when true, exit with the numeric error code as the exit status
 *
 * `file` is a pointer to const because __FILE__ expands to a string literal,
 * which cannot legally bind to a plain `char *` in C++11 and later.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        cerr << "GPUAssert: " << cudaGetErrorString(code) << " in file : " << file << " at line " << line << endl;
        if (abort) exit(code);
    }
}


/*
 * Thresholds an interleaved 8-bit HSV image into a single-channel mask.
 *
 * A pixel is written as 255 when 17 < H < 73, S > 100 and V > 100, and as 0
 * otherwise.
 *
 *   input_data  - device pointer to the interleaved image; channels 0, 1, 2
 *                 of each pixel are read as H, S, V
 *   output_data - device pointer to the mask, tightly packed (row pitch is
 *                 exactly maxCol bytes)
 *   maxRow      - image height in pixels
 *   maxCol      - image width in pixels
 *   step        - input row pitch in bytes
 *   maxChannel  - number of interleaved channels per input pixel (>= 3)
 *
 * Launch layout: a 2-D grid of 2-D blocks in which x walks the columns and
 * y walks the rows. Putting columns on x makes consecutive threads touch
 * consecutive bytes of one image row, so global-memory accesses coalesce.
 * Both dimensions are strided by the grid extent, so any grid/block size
 * produces a complete mask. No shared memory is used.
 */
__global__ void cuda_hsv_ball_mask(const uchar * __restrict__ input_data, uchar * __restrict__ output_data, int maxRow, int maxCol, int step, int maxChannel) {
    // The work is purely two-dimensional: extra z-layers would only repeat
    // the same writes, so let them exit immediately.
    if (blockIdx.z != 0 || threadIdx.z != 0) return;

    const int col_stride = gridDim.x * blockDim.x;
    const int row_stride = gridDim.y * blockDim.y;
    const int first_col = blockIdx.x * blockDim.x + threadIdx.x;
    const int first_row = blockIdx.y * blockDim.y + threadIdx.y;

    for (int row = first_row; row < maxRow; row += row_stride) {
        // size_t offsets so large images cannot overflow 32-bit arithmetic.
        const uchar *in_row = input_data + (size_t)row * step;
        uchar *out_row = output_data + (size_t)row * maxCol;

        for (int col = first_col; col < maxCol; col += col_stride) {
            const uchar *px = in_row + (size_t)col * maxChannel;
            const uchar h = px[0];
            const uchar s = px[1];
            const uchar v = px[2];

            out_row[col] = (h > 17 && h < 73 && s > 100 && v > 100) ? 255 : 0;
        }
    }
}


/*
 * Computes the ball mask of `input` on the GPU and stores it in `output`.
 *
 *   input  - 8-bit image with at least 3 interleaved channels (H, S, V first)
 *   output - pre-allocated CV_8UC1 image of the same size as `input`; its
 *            pixel buffer is overwritten (the Mat header is const, the data
 *            it points to is not)
 *
 * Any CUDA failure is reported through CHECKCUDAERROR. An empty input is a
 * no-op.
 *
 * NOTE(review): device buffers are allocated and released on every call.
 * cudaMalloc/cudaFree are synchronous and comparatively slow, so a caller
 * processing a video stream should expect this overhead on each frame;
 * consider hoisting the allocations out of the frame loop.
 */
void cuda_main(const cv::Mat &input, const cv::Mat &output) {

    if (input.empty()) return;

    // The kernel reads channels 0..2 of every pixel as bytes and writes one
    // byte per pixel, so reject layouts that would make it run out of bounds.
    CV_Assert(input.depth() == CV_8U && input.channels() >= 3);
    CV_Assert(output.type() == CV_8UC1 && output.rows == input.rows && output.cols == input.cols);

    // Device images are tightly packed; the host Mats may carry row padding
    // or be ROIs, which the 2-D copies below account for via their pitch.
    const size_t in_row_bytes = (size_t)input.cols * input.elemSize();
    const size_t out_row_bytes = (size_t)output.cols;

    uchar *device_input = NULL, *device_output = NULL;

    CHECKCUDAERROR(cudaMalloc( (void **) &device_input, in_row_bytes * input.rows));
    CHECKCUDAERROR(cudaMalloc( (void **) &device_output, out_row_bytes * output.rows));

    CHECKCUDAERROR(cudaMemcpy2D(device_input, in_row_bytes, input.data, input.step, in_row_bytes, input.rows, cudaMemcpyHostToDevice));

    // x covers columns, y covers rows; ceil-division so partial tiles at the
    // right and bottom edges are still launched.
    const dim3 Threads(BLOCK_COLS, BLOCK_ROWS);
    const dim3 Blocks((input.cols + Threads.x - 1) / Threads.x, (input.rows + Threads.y - 1) / Threads.y);

    cuda_hsv_ball_mask<<<Blocks, Threads>>>(device_input, device_output, input.rows, input.cols, (int)in_row_bytes, input.channels());

    // Launch-configuration errors are only visible through cudaGetLastError,
    // so query it right after the launch; faults raised while the kernel
    // runs surface at the synchronize.
    CHECKCUDAERROR(cudaGetLastError());
    CHECKCUDAERROR(cudaDeviceSynchronize());

    CHECKCUDAERROR(cudaMemcpy2D(output.data, output.step, device_output, out_row_bytes, out_row_bytes, output.rows, cudaMemcpyDeviceToHost));

    CHECKCUDAERROR(cudaFree(device_input));
    CHECKCUDAERROR(cudaFree(device_output));
}



/*
 * Sets pixel (x, y) of `out` to 255 when the given HSV values satisfy
 * 17 < h < 73, s > 100 and v > 100. Pixels that do not match are left
 * untouched, so the caller is responsible for clearing `out` beforehand.
 *
 *   x, y    - column and row of the pixel to mark
 *   h, s, v - hue, saturation and value of that pixel
 *   out     - mask image, accessed as 8-bit single-channel data
 */
void getBallPic(int x, int y, int h, int s, int v, Mat &out) {
    const bool hueInRange = (h > 17) && (h < 73);
    const bool brightAndSaturated = (s > 100) && (v > 100);

    if (!(hueInRange && brightAndSaturated)) {
        return;
    }

    out.at<uint8_t>(y, x) = static_cast<uint8_t>(255);
}


/*
 * Reference implementation of the ball mask: the channel split runs on the
 * GPU through OpenCV's cuda module, the thresholding runs on the CPU.
 *
 *   frame - HSV image; channels 0, 1, 2 are taken as H, S, V
 *   out   - pre-allocated mask, accessed as 8-bit single-channel data. It is
 *           cleared to 0 and every pixel with 17 < H < 73, S > 100 and
 *           V > 100 is set to 255. The loops run over out's dimensions, so
 *           `frame` must be at least as large as `out`.
 */
void test(Mat frame, Mat &out) {
	cuda::GpuMat gpuFrame;
	gpuFrame.upload(frame);

	// Split the image into its three channels (h, s, v) on the device ...
	cuda::GpuMat channels[3];
	cuda::split(gpuFrame, channels);

	// ... and bring each plane back to host memory.
	Mat h, s, v;
	channels[0].download(h);
	channels[1].download(s);
	channels[2].download(v);

	// Build the ball mask: start from all-black and whiten matching pixels.
	out.setTo(Scalar(0));
	for (int y = 0; y < out.rows; y++) {
		// One pointer lookup per row instead of a bounds-computing at<>()
		// call and a helper call per pixel.
		const uint8_t *h_row = h.ptr<uint8_t>(y);
		const uint8_t *s_row = s.ptr<uint8_t>(y);
		const uint8_t *v_row = v.ptr<uint8_t>(y);
		uint8_t *out_row = out.ptr<uint8_t>(y);

		for (int x = 0; x < out.cols; x++) {
			if (h_row[x] > 17 && h_row[x] < 73 && s_row[x] > 100 && v_row[x] > 100) {
				out_row[x] = 255;
			}
		}
	}
}


/*
 * Captures frames from the camera through a GStreamer pipeline, computes the
 * ball mask of each (downscaled, HSV-converted) frame on the GPU, displays
 * the frame and its mask, and prints the processing time per frame.
 *
 * Runs until the stream ends or ESC is pressed. Returns -1 when the capture
 * device cannot be opened, 0 otherwise.
 */
int main(int argc, char *argv[]) {
	VideoCapture cap("v4l2src device=/dev/video0 ! video/x-raw,width=1920,height=1080,format=(string)UYVY ! nvvidconv ! video/x-raw(memory:NVMM),width=1920,height=1080,format=(string)I420 ! nvvidconv! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink");

	if( !cap.isOpened() )
	{
		cout << "***Could not initialize capturing...***\n";
		return -1;
	}

	Mat input;

	for(;;)	{
		cap >> input;

		// Check for end of stream before the frame is used: resize and
		// cvtColor cannot operate on an empty Mat.
		if( input.empty() )
			break;

		// Time only the processing itself. The blocking camera read above
		// and the display calls below are excluded so they do not mask the
		// cost of the mask computation.
		high_resolution_clock::time_point begin = high_resolution_clock::now();

		resize(input, input, Size(640, 360));
		Mat output(input.rows, input.cols, CV_8UC1);
		cvtColor(input, input, COLOR_BGR2HSV);

		cuda_main(input, output);
		//test(input, output);

		high_resolution_clock::time_point end = high_resolution_clock::now();

		imshow("input", input);
		imshow("output", output);

		duration<double, milli> time_span = end - begin;
		cout << "process time : " << time_span.count() << " ms" << endl;

		// ESC (27) quits; waitKey also services the HighGUI windows.
		char keypressed = (char)waitKey(10);
		if( keypressed == 27 )
			break;
	}
	return 0;
}

Hope someone can help me !
Emilien

Hi!
You are using the same components I am planning to use (CUDA, CV image processing, etc.).
Can you give details on how to set up a (free) build environment, so I can get a jump start?
And double check, and make those needed uhh… smart questions.
Usually it fix its.

Pm if you prefer it.

Thx Jack. Early retired senior ict-specialist
P.S. Hope we both will find this useful