I am trying to compare some OpenCV operations on the Jetson TK1 and TX1 to understand the performance benefits of the TX1. Both devices are flashed with the latest version of JetPack and OpenCV4Tegra. I wrote some code (at the end of this post) to compare some simple image operations. From the Stack Overflow question “c++ - gpu::blur function takes significantly more time”, I understand that the non-gpu versions of the OpenCV functions I used are already optimized for the GPU and can therefore be faster than the cv::gpu versions. Strangely, some operations run faster with the “cpu” version (notably the blur operations), but others run faster with the “gpu” implementation.
Even more strangely, although the TX1 is faster in all ::gpu operations, it is about the same speed as the TK1 in the “cpu” operations, so I am wondering whether these functions are using the OpenCV4Tegra GPU optimizations at all.
Can someone give an interpretation of these results?
It would also be useful to be able to monitor GPU utilization during these operations. nvidia-smi is not available on the Tegras; are there any alternatives?
The image I am using is a 1920x1200 color image.
These are some results:
TX1:
Time elapsed cpu_complete : 0.070188
Time elapsed gpu_complete : 0.367516
Time elapsed gpu_upload : 0.006238
Time elapsed cpu_cvtColor : 0.072496
Time elapsed gpu_cvtColor : 0.000483
Time elapsed cpu_blur : 0.020984
Time elapsed gpu_blur : 0.263246
Time elapsed cpu_threshold : 0.017799
Time elapsed gpu_threshold : 0.000896
Time elapsed gpu_download : 0.004659
Time elapsed CPU findcontours : 0.039958
Time elapsed cpu_complete : 0.06975
Time elapsed gpu_complete : 0.302142
Time elapsed gpu_upload : 0.010906
Time elapsed cpu_cvtColor : 0.086576
Time elapsed gpu_cvtColor : 0.000595
Time elapsed cpu_blur : 0.019373
Time elapsed gpu_blur : 0.264425
Time elapsed cpu_threshold : 0.026459
Time elapsed gpu_threshold : 0.005357
Time elapsed gpu_download : 0.001322
TK1:
Time elapsed cpu_complete : 0.053434
Time elapsed gpu_complete : 2.60139
Time elapsed gpu_upload : 0.024251
Time elapsed cpu_cvtColor : 0.065067
Time elapsed gpu_cvtColor : 0.000307
Time elapsed cpu_blur : 0.041415
Time elapsed gpu_blur : 1.04086
Time elapsed cpu_threshold : 0.025452
Time elapsed gpu_threshold : 0.004709
Time elapsed gpu_download : 0.009877
Time elapsed CPU findcontours : 0.049267
Time elapsed cpu_complete : 0.05283
Time elapsed gpu_complete : 0.316723
Time elapsed gpu_upload : 0.011
Time elapsed cpu_cvtColor : 0.065437
Time elapsed gpu_cvtColor : 0.00057
Time elapsed cpu_blur : 0.035615
Time elapsed gpu_blur : 1.07537
Time elapsed cpu_threshold : 0.019534
Time elapsed gpu_threshold : 0.000268
Time elapsed gpu_download : 0.002389
The code is:
#include <iostream>
#include <sstream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "opencv2/opencv.hpp"
#include "opencv2/gpu/gpu.hpp"
using namespace std;
#include <ctime>
// Reports how much clock() time has passed since `begin`.
//
// begin: clock() reading taken when the measured section started.
// texto: label inserted into the printed line to identify the section.
//
// Writes "Time elapsed <texto> : <seconds>" to std::cout, terminated with
// std::endl. The value is the clock() difference divided by CLOCKS_PER_SEC.
void printtime(clock_t begin, std::string texto)
{
    const clock_t now = clock();
    const double elapsedSeconds = static_cast<double>(now - begin) / CLOCKS_PER_SEC;
    std::cout << "Time elapsed " << texto << " : ";
    std::cout << elapsedSeconds << std::endl;
}
int main(int /*argc*/, char** /*argv*/)
{
std::vector<std::vector<cv::Point> > contours;
std::vector<cv::Vec4i> hierarchy;
int blurwindow=55; // blur kernel size
int threshlimit=55; // threshold limit
clock_t begin, end, begin_partial,begin_1frame ;
double timeSec;
begin = clock();
cv::Mat orig_image,dst_host,grey,blurred,thresh;
// Stream and string to generate filenames
std::stringstream ss;
std::string filename;
char key = 0;
begin = clock();
// cv::Mat image;
unsigned int rowBytes;
// GPU definitions
cv::gpu::GpuMat gpu_src, gpu_grey,gpu_thresh,gpu_blurred;
// http://stackoverflow.com/a/19454917
// The first call of any gpu function is slow due to CUDA context initialization. All next calls wil be faster. Call some gpu function before time measurement:
cv::gpu::GpuMat test;
test.create(1, 1, CV_8U); // Just to initialize context
vector<int> compression_parms;
compression_parms.push_back(CV_IMWRITE_PNG_COMPRESSION);
compression_parms.push_back(1);
begin = clock();
while(key != 'q')
{
begin_1frame =clock();
orig_image = cv::imread("Test.png");
// CPU operations
begin_partial = clock();
cv::cvtColor(orig_image,grey,CV_BGR2GRAY);
cv::blur(grey,blurred,cv::Size(blurwindow,blurwindow));
cv::threshold(blurred, thresh, threshlimit, 255.0, CV_THRESH_BINARY_INV);
printtime(begin_partial,"cpu_complete");
// GPU operations
begin_partial = clock();
gpu_src.upload(orig_image);
cv::gpu::cvtColor(gpu_src,gpu_grey,CV_BGR2GRAY); // not good to optimize with
cv::gpu::blur(gpu_grey,gpu_blurred,cv::Size(blurwindow,blurwindow));
cv::gpu::threshold(gpu_grey, gpu_thresh, threshlimit, 255.0, CV_THRESH_BINARY_INV);
gpu_thresh.download(thresh);
printtime(begin_partial,"gpu_complete");
// STEP BY STEP CPU
// STEP BY STEP GPU
begin_partial = clock();
gpu_src.upload(orig_image);
printtime(begin_partial,"gpu_upload");
begin_partial = clock();
cv::cvtColor(orig_image,grey,CV_BGR2GRAY);
printtime(begin_partial,"cpu_cvtColor");
begin_partial = clock();
cv::gpu::cvtColor(gpu_src,gpu_grey,CV_BGR2GRAY); // not good to optimize with gpu
printtime(begin_partial,"gpu_cvtColor");
begin_partial = clock();
cv::blur(grey,blurred,cv::Size(blurwindow,blurwindow));
printtime(begin_partial,"cpu_blur");
begin_partial = clock();
cv::gpu::blur(gpu_grey,gpu_blurred,cv::Size(blurwindow,blurwindow));
printtime(begin_partial,"gpu_blur");
begin_partial = clock();
cv::threshold(blurred, thresh, threshlimit, 255.0, CV_THRESH_BINARY_INV);
printtime(begin_partial,"cpu_threshold");
begin_partial = clock();
cv::gpu::threshold(gpu_grey, gpu_thresh, threshlimit, 255.0, CV_THRESH_BINARY_INV);
printtime(begin_partial,"gpu_threshold");
begin_partial = clock();
gpu_thresh.download(thresh);
printtime(begin_partial,"gpu_download");
begin_partial = clock();
cv::findContours(thresh,contours,hierarchy,CV_RETR_EXTERNAL,CV_CHAIN_APPROX_SIMPLE);
printtime(begin_partial," CPU findcontours");
}
return 0;
}