Hello developers,
I’m writing a simple program that uses the CUDA-optimised functions in the OpenCV 3.2.0 library to do some image processing, but the performance on my TK1 is unexpectedly poor. A 5x5 box filter on a 4096x1024 image takes 0.017 seconds on the CPU, while the same filter on the GPU takes about 2 seconds. What’s worse, when I loop over 25 images, the GPU process gets killed. I have attached the code below. Could you help me figure out what is going wrong? I would also be grateful for any pointers to docs on GPU programming.
In my test program, SPImage is a class I wrote that generates images containing random salt-and-pepper noise. The full code can be found at https://github.com/sthy14/TK1CV
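For context, the Generate call conceptually does something like this (a rough sketch, not my exact implementation; the standalone function name and the noise ratio here are made up):

// Rough sketch of what SPImage's Generate produces: an 8-bit single-channel
// image with random salt (white) and pepper (black) pixels.
cv::Mat GenerateSketch(int width, int height) {
    cv::Mat img(height, width, CV_8UC1, cv::Scalar(128));  // mid-gray background
    cv::RNG rng(cv::getTickCount());
    int noisy = width * height / 20;  // roughly 5% noisy pixels (illustrative)
    for (int i = 0; i < noisy; i++) {
        int x = rng.uniform(0, width);
        int y = rng.uniform(0, height);
        img.at<uchar>(y, x) = rng.uniform(0, 2) ? 255 : 0;  // salt or pepper
    }
    return img;
}

Here is the test program itself: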
#include "SPImage.hpp"
#include "omp.h"
#include <vector>
#include <opencv2/opencv.hpp>
#include <string>
#include <iostream>
#include <fstream>
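// NUMBER controls how many images are generated and filtered;
// the arrays below are sized for up to 25.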
#define NUMBER 1
using namespace cv;
using namespace std;
int main(int argc, char* argv[]) {
    cuda::DeviceInfo gpu;
    if (!gpu.isCompatible()) {
        cout << "GPU is not compatible\n";
        exit(-1);
    }
    SPImage generator;
    Mat image[25];
    for (int i = 0; i < NUMBER; i++) {
        image[i] = generator.Generate(4096, 1024);
        string fileName = "imgO" + to_string(i) + ".bmp";
        imwrite(fileName, image[i]);
    }
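    // Upload the images to the GPU up front, so the timed loop below
    // measures only the filtering and not the host-to-device transfers.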
    cuda::GpuMat gImage[25];
    for (int i = 0; i < NUMBER; i++) {
        gImage[i].upload(image[i]);
    }
    Ptr<cuda::Filter> gpuBlur = cuda::createBoxFilter(CV_8UC1, CV_8UC1, Size(5, 5));
    Mat imageOut[25];
    cuda::GpuMat gOut[25];
    Mat gImageOut[25];
    Mat mpImageOut[25];
    // Time the single-threaded CPU box filter.
    double sStart = (double) getTickCount();
    for (int i = 0; i < NUMBER; i++) {
        blur(image[i], imageOut[i], Size(5, 5));
    }
    double sTime = ((double) getTickCount() - sStart) / getTickFrequency();
    cout << sTime << "\n";
    // Time the OpenMP-parallelised CPU box filter.
    double mpStart = (double) getTickCount();
    #pragma omp parallel for
    for (int i = 0; i < NUMBER; i++) {
        blur(image[i], mpImageOut[i], Size(5, 5));
    }
    double mpTime = ((double) getTickCount() - mpStart) / getTickFrequency();
    cout << mpTime << "\n";
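    // Time the GPU box filter. (I suspect the first apply() call may also pay
    // one-time CUDA initialization cost and inflate this number, but I am not sure.)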
    double gStart = (double) getTickCount();
    for (int i = 0; i < NUMBER; i++) {
        gpuBlur->apply(gImage[i], gOut[i]);
    }
    double gTime = ((double) getTickCount() - gStart) / getTickFrequency();
    cout << gTime << "\n";
    // Download the GPU results. Note: only the first NUMBER outputs are valid,
    // so loop to NUMBER here rather than 25 (looping to 25 touches empty GpuMats).
    for (int i = 0; i < NUMBER; i++) {
        gOut[i].download(gImageOut[i]);
    }
    // Output the processed images.
    for (int i = 0; i < NUMBER; i++) {
        string sName = "imgS" + to_string(i) + ".bmp";
        string gName = "imgG" + to_string(i) + ".bmp";
        string mpName = "imgMP" + to_string(i) + ".bmp";
        imwrite(sName, imageOut[i]);
        imwrite(gName, gImageOut[i]);
        imwrite(mpName, mpImageOut[i]);
    }
}
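One thing I was unsure about is whether my GPU measurement also includes the one-time CUDA context creation and kernel initialization triggered by the first apply() call. If so, here is a minimal sketch of how I could warm the filter up before timing (assuming a throwaway apply() is a valid warm-up; this is not in my current code):

// Hypothetical warm-up: run the filter once so one-time CUDA setup costs
// are paid outside the measured region.
cuda::GpuMat warm;
gpuBlur->apply(gImage[0], warm);
double gStart = (double) getTickCount();
for (int i = 0; i < NUMBER; i++) {
    gpuBlur->apply(gImage[i], gOut[i]);
}
double gTime = ((double) getTickCount() - gStart) / getTickFrequency();
cout << gTime << "\n";

Is that the right methodology, or am I missing something else?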
Kind regards,
Dawen