Hi,
I’m new to CUDA programming and I am trying to write a program that takes a frame from a camera with OpenCV and gives me a mask of a ball (all the pixels where the ball is are white and the others are black).
I tried it with OpenCV on the CPU and it takes around 20 ms to process one frame. So I tried with CUDA, but I get the same result. I hoped it would be much faster, and I can’t understand why it isn’t faster than the CPU.
Here is my program:
#include <cuda.h>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <ctime>
#include <chrono>
#define BLOCK_ROWS 32
#define BLOCK_COLS 16
using namespace std;
using namespace std::chrono;
using namespace cv;
// Wraps a CUDA runtime call: forwards its cudaError_t result to gpuAssert()
// together with the source file and line of the call site.
#define CHECKCUDAERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**
 * Report a failed CUDA runtime call on stderr and optionally terminate.
 *
 * @param code   Status returned by the CUDA runtime call.
 * @param file   Source file of the call site (normally __FILE__).
 * @param line   Source line of the call site (normally __LINE__).
 * @param abort  When true (default), the process exits with `code` as its
 *               exit status after the message is printed.
 *
 * `file` is a pointer to const because __FILE__ is a string literal; binding a
 * literal to a plain `char *` is not valid C++11.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code == cudaSuccess) {
        return;
    }
    cerr << "GPUAssert: " << cudaGetErrorString(code) << " in file : " << file << " at line " << line << endl;
    if (abort) exit(code);
}
/**
 * Threshold an interleaved 8-bit image into a binary mask.
 *
 * A pixel is set to 255 in the output when its first channel is strictly
 * between 17 and 73 and its second and third channels are both above 100;
 * otherwise it is set to 0. The caller in this file passes an HSV image, so
 * the three channels are read as H, S and V.
 *
 * Launch layout: 2-D grid of 2-D blocks. The x axis walks image columns and
 * the y axis walks image rows, so neighbouring threads of a warp touch
 * neighbouring pixels of the same row. Both axes use striding loops, so any
 * grid size produces a complete result; a grid that covers the image simply
 * does it in one pass. No shared memory is used.
 *
 * Preconditions:
 *  - input_data holds maxRow rows of `step` bytes, each pixel being
 *    `maxChannel` bytes with maxChannel >= 3.
 *  - output_data holds maxRow * maxCol bytes, tightly packed (row pitch equal
 *    to maxCol).
 *
 * @param input_data   Device pointer to the source image.
 * @param output_data  Device pointer to the single-channel mask.
 * @param maxRow       Image height in pixels.
 * @param maxCol       Image width in pixels.
 * @param step         Source row pitch in bytes.
 * @param maxChannel   Bytes per source pixel.
 */
__global__ void cuda_hsv_ball_mask(uchar *input_data, uchar *output_data, int maxRow, int maxCol, int step, int maxChannel) {
    // The work is 2-D: any extra z layer would only recompute the same pixels.
    if (blockIdx.z != 0 || threadIdx.z != 0) {
        return;
    }

    const int colStride = gridDim.x * blockDim.x;
    const int rowStride = gridDim.y * blockDim.y;

    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < maxRow; row += rowStride) {
        // size_t offsets keep row * step from overflowing int on large images.
        const uchar *in_row = input_data + (size_t)row * step;
        uchar *out_row = output_data + (size_t)row * maxCol;

        for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < maxCol; col += colStride) {
            const uchar *px = in_row + (size_t)col * maxChannel;
            const uchar h = px[0];
            const uchar s = px[1];
            const uchar v = px[2];
            out_row[col] = (h > 17 && h < 73 && s > 100 && v > 100) ? 255 : 0;
        }
    }
}

/**
 * Compute the ball mask of `input` on the GPU and store it in `output`.
 *
 * Copies the image to the device, runs cuda_hsv_ball_mask over it, and copies
 * the mask back into output's existing buffer. Any CUDA failure is reported
 * through CHECKCUDAERROR.
 *
 * @param input   8-bit interleaved image with at least 3 channels.
 * @param output  Pre-allocated, continuous CV_8UC1 image with the same size
 *                as `input`; its pixel data is overwritten. (The Mat header is
 *                const, the buffer it points to is not.)
 *
 * An empty `input` is a no-op. A mismatching `output` raises a cv::Exception
 * through CV_Assert instead of letting the kernel write out of bounds.
 *
 * NOTE: device buffers are allocated and released on every call. cudaMalloc
 * and cudaFree are synchronous and comparatively slow, so a caller that
 * processes a video stream would benefit from keeping the buffers alive
 * between frames.
 */
void cuda_main(const cv::Mat &input, const cv::Mat &output) {
    if (input.empty()) {
        return;
    }
    // The kernel reads 3 bytes per pixel and writes the mask with a row pitch
    // equal to the image width; check this before touching device memory.
    CV_Assert(input.depth() == CV_8U && input.channels() >= 3);
    CV_Assert(output.type() == CV_8UC1 && output.isContinuous());
    CV_Assert(output.rows == input.rows && output.cols == input.cols);

    uchar *device_input = NULL;
    uchar *device_output = NULL;
    const size_t device_in_size = input.step * input.rows;
    const size_t device_out_size = output.step * output.rows;

    CHECKCUDAERROR(cudaMalloc( (void **) &device_input, device_in_size));
    CHECKCUDAERROR(cudaMalloc( (void **) &device_output, device_out_size));
    CHECKCUDAERROR(cudaMemcpy(device_input, input.data, device_in_size, cudaMemcpyHostToDevice));

    // x spans columns, y spans rows; ceil-divide so the grid covers the whole
    // image whatever its size.
    dim3 Threads(BLOCK_COLS, BLOCK_ROWS);
    dim3 Blocks((input.cols + Threads.x - 1) / Threads.x, (input.rows + Threads.y - 1) / Threads.y);

    cuda_hsv_ball_mask<<<Blocks, Threads>>>(device_input, device_output, input.rows, input.cols, input.step, input.channels());
    // Launch-configuration errors show up here; execution errors at the sync.
    CHECKCUDAERROR(cudaGetLastError());
    CHECKCUDAERROR(cudaDeviceSynchronize());

    CHECKCUDAERROR(cudaMemcpy(output.data, device_output, device_out_size, cudaMemcpyDeviceToHost));
    CHECKCUDAERROR(cudaFree(device_input));
    CHECKCUDAERROR(cudaFree(device_output));
}
/**
 * Mark pixel (x, y) of `out` as belonging to the ball when its values pass
 * the threshold.
 *
 * The pixel is set to 255 when h is strictly between 17 and 73 and both s and
 * v are above 100. Pixels that fail the test are left untouched.
 *
 * @param x, y     Column and row of the pixel in `out`.
 * @param h, s, v  Channel values of that pixel.
 * @param out      Single-channel 8-bit mask that is written to.
 */
void getBallPic(int x, int y, int h, int s, int v, Mat &out) {
    const bool hueInBand = (h > 17) && (h < 73);
    const bool brightEnough = (s > 100) && (v > 100);
    if (!hueInBand || !brightEnough) {
        return;
    }
    out.at<uint8_t>(y, x) = (uint8_t)255;
}
/**
 * Reference path: split the frame into its three channels with OpenCV's CUDA
 * module, then build the ball mask pixel by pixel on the CPU.
 *
 * @param frame  Three-channel 8-bit image; its channels are treated as H, S, V.
 * @param out    Single-channel 8-bit mask. It is cleared to 0 and every pixel
 *               passing the getBallPic() threshold is set to 255. Its size
 *               drives the loops.
 */
void test(Mat frame, Mat &out) {
    cuda::GpuMat deviceFrame;
    deviceFrame.upload(frame);

    out.setTo(Scalar(0));

    // Split the image into its 3 channels (h, s, v) on the device, then bring
    // each plane back to host memory.
    cuda::GpuMat planes[3];
    Mat h, s, v;
    cuda::split(deviceFrame, planes);
    planes[0].download(h);
    planes[1].download(s);
    planes[2].download(v);

    // Build the ball mask.
    const int rowCount = out.rows;
    const int colCount = out.cols;
    for (int y = 0; y < rowCount; ++y) {
        for (int x = 0; x < colCount; ++x) {
            const int hVal = (int)h.at<uint8_t>(y, x);
            const int sVal = (int)s.at<uint8_t>(y, x);
            const int vVal = (int)v.at<uint8_t>(y, x);
            getBallPic(x, y, hVal, sVal, vVal, out);
        }
    }
}
/**
 * Capture frames from the camera pipeline, compute the ball mask for each one
 * and display both the converted frame and the mask.
 *
 * The loop ends when the capture delivers an empty frame or when ESC (key
 * code 27) is pressed. Returns -1 if the capture cannot be opened, otherwise 0.
 */
int main(int argc, char *argv[]) {
    VideoCapture cap("v4l2src device=/dev/video0 ! video/x-raw,width=1920,height=1080,format=(string)UYVY ! nvvidconv ! video/x-raw(memory:NVMM),width=1920,height=1080,format=(string)I420 ! nvvidconv! video/x-raw, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink");
    char keypressed;
    Mat input;
    if( !cap.isOpened() )
    {
        cout << "***Could not initialize capturing...***\n";
        return -1;
    }
    for(;;) {
        // The measured span covers capture, resize, colour conversion, the
        // mask computation and both imshow calls - not only the GPU work.
        high_resolution_clock::time_point begin = high_resolution_clock::now();
        cap >> input;
        // Stop before any processing: resize() and cvtColor() reject an empty
        // frame, so the check has to come right after the capture.
        if( input.empty() )
            break;
        resize(input, input, Size(640, 360));
        Mat output(input.rows, input.cols, CV_8UC1);
        cvtColor(input, input, COLOR_BGR2HSV);
        cuda_main(input, output);
        //test(input, output);
        imshow("input", input);
        imshow("output", output);
        high_resolution_clock::time_point end = high_resolution_clock::now();
        duration<double, milli> time_span = end - begin;
        cout << "process time : " << time_span.count() << " ms" << endl;
        keypressed = (char)waitKey(10);
        if( keypressed == 27 )
            break;
    }
    return 0;
}
I hope someone can help me!
Emilien