I came back to this topic, so I'll share the findings I had at the time (Xavier NX; my last trial was in December 2020).
Back then I tried a simple example of Sobel filtering with OpenCV on a 1280x720 @ 30 fps stream, using POCL built with LLVM 11.
This may be a corner case; I'm not sure the results can be generalized.
The first 20 frames were excluded from the measurement; the timings below were measured over the next 500 frames:
- OpenCV CPU cv::Mat (or cv::UMat with POCL basic): ~25 ms per frame
- OpenCV CUDA GpuMat: 3 ms per frame
- OpenCV UMat with POCL CUDA: 2 ms per frame
- OpenCV UMat with POCL pthreads: 0.5 ms per frame
So the biggest improvement may be on the CPU side, with support for the Carmel CPU.
I am unable to retry now with R32.5.1, so this is only from what I remember.
Test code (use with caution; it was not tested before posting):
#include <signal.h>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <CL/cl.h>
#include <opencv2/opencv.hpp>
#include <opencv2/core/ocl.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafilters.hpp>
//#include <opencv2/cudaobjdetect.hpp>
//#include <opencv2/cudaimgproc.hpp>
// Number of initial frames handled before timing starts; they are excluded from the statistics.
#define IGNORE_FIRST_FRAMES 20
// Upper bound on the number of frames handled by the timed loop.
#define LOOP_MEASURE_FRAMES 500
// Back ends that can be benchmarked. The numeric value is what main() accepts
// as its first command-line argument.
enum test_case_t {
    test_no_opencl_cpu = 0,  // cv::Mat path with OpenCL disabled
    test_opencl_cpu,         // cv::UMat path, OpenCL device spec ending in ":CPU"
    test_opencl_gpu,         // cv::UMat path, OpenCL device spec ending in ":GPU"
    test_cuda,               // cv::cuda::GpuMat path
    test_unknown             // sentinel: first value that is not a valid test case
};
// Back end exercised by this run; main() overrides it from argv[1] when an argument is given.
test_case_t tcase = test_opencl_gpu;
// Capture shared with my_handler() so the handler can release it; assigned by main().
static cv::VideoCapture *capPtr = NULL;
void my_handler(int s){
std::cerr<< "Caught signal " << s << std::endl;
if(capPtr) {
capPtr->release();
capPtr = NULL;
}
exit(s);
}
/**
 * Applies a Sobel filter to a cv::Mat (CPU path).
 *
 * Uses first-order derivatives in x and y (dx = 1, dy = 1), ksize = 1, an
 * output depth equal to the input depth (ddepth = -1), scale 1 and the default
 * border mode, i.e. the same settings the CUDA path passes to
 * createSobelFilter().
 *
 * @param frameBGRin  source image.
 * @param frameBGRout destination image. It is a cv::Mat header taken by value:
 *                    NOTE(review) the caller only sees the result if this
 *                    buffer already has the size/type Sobel produces, so that
 *                    no reallocation happens - confirm at call sites.
 */
void Process_Sobel_CPU(cv::Mat frameBGRin, cv::Mat frameBGRout) {
    // cv::Sobel's trailing parameters are (ksize, scale, delta, borderType).
    // The border mode therefore has to go in the last slot; placed directly
    // after ksize it is taken as the scale factor instead.
    const int kernelSize = 1;
    const double scale = 1.0;
    const double delta = 0.0;
    cv::Sobel(frameBGRin, frameBGRout, -1, 1, 1, kernelSize, scale, delta, cv::BORDER_DEFAULT);
}
/**
 * Applies a Sobel filter to a cv::UMat (OpenCL / T-API path).
 *
 * Uses first-order derivatives in x and y (dx = 1, dy = 1), ksize = 1, an
 * output depth equal to the input depth (ddepth = -1), scale 1 and the default
 * border mode, i.e. the same settings the CUDA path passes to
 * createSobelFilter().
 *
 * @param frameBGRin  source image.
 * @param frameBGRout destination image. It is a cv::UMat header taken by
 *                    value: NOTE(review) the caller only sees the result if
 *                    this buffer already has the size/type Sobel produces, so
 *                    that no reallocation happens - confirm at call sites.
 */
void Process_Sobel_UMat(cv::UMat frameBGRin, cv::UMat frameBGRout) {
    // cv::Sobel's trailing parameters are (ksize, scale, delta, borderType).
    // The border mode therefore has to go in the last slot; placed directly
    // after ksize it is taken as the scale factor instead.
    const int kernelSize = 1;
    const double scale = 1.0;
    const double delta = 0.0;
    cv::Sobel(frameBGRin, frameBGRout, -1, 1, 1, kernelSize, scale, delta, cv::BORDER_DEFAULT);
}
/**
 * Applies a Sobel filter to a cv::cuda::GpuMat (CUDA path).
 *
 * The filter object is created on the first call and kept in a function-local
 * static, so later calls only run apply().
 *
 * @param frameBGRin  source image on the device.
 * @param frameBGRout destination image on the device (GpuMat header taken by value).
 */
void Process_Sobel_CUDA(cv::cuda::GpuMat frameBGRin, cv::cuda::GpuMat frameBGRout)
{
    // Arguments: srcType, dstType, dx, dy, ksize, scale, row border mode.
    static cv::Ptr<cv::cuda::Filter> sobel =
        cv::cuda::createSobelFilter(CV_8UC3, CV_8UC3, 1, 1, 1, 1, cv::BORDER_DEFAULT);

    sobel->apply(frameBGRin, frameBGRout);
}
/**
 * Prints the main properties of one OpenCL device to stdout, one
 * tab-indented "label: value" line per property.
 *
 * @param dev device to describe.
 */
void PrintOclDeviceInfo(cv::ocl::Device dev)
{
    // Renders a boolean capability the way the report expects it.
    const auto yesNo = [](bool flag) -> const char * { return flag ? "YES" : "NO"; };
    std::ostream &out = std::cout;

    out << "\tName: " << dev.name() << std::endl;
    out << "\tType: " << dev.type() << std::endl;
    out << "\tAvailable: " << yesNo(dev.available()) << std::endl;
    out << "\tOpenCL version: " << dev.OpenCLVersion() << std::endl;
    out << "\tVendor: " << dev.vendorName() << std::endl;
    out << "\tDriver version: " << dev.driverVersion() << std::endl;
    out << "\tVersion: " << dev.version() << std::endl;
    //out << "\tExtensions: " << dev.extensions() << std::endl;
    out << "\tHost unified memory: " << yesNo(dev.hostUnifiedMemory()) << std::endl;
    out << "\tCompiler available: " << yesNo(dev.compilerAvailable()) << std::endl;
    out << "\tLinker available: " << yesNo(dev.linkerAvailable()) << std::endl;
}
void ShowAllPlatformsInfo() {
std::vector< cv::ocl::PlatformInfo > platforms_info;
cv::ocl::getPlatfomsInfo(platforms_info);
for (auto platform : platforms_info) {
std::cout << "Platform: " << platform.name() << " Devices: " << platform.deviceNumber() << std::endl;
for (unsigned int devIdx = 0; devIdx < platform.deviceNumber(); ++devIdx) {
std::cout << " Device: " << devIdx << std::endl;
cv::ocl::Device dev;
platform.getDevice(dev, devIdx);
PrintOclDeviceInfo(dev);
}
std::cout << std::endl;
}
}
void DiscoverOpenCLDevices() {
//ShowAllPlatformsInfo();
cv::ocl::Context cpu_contexts;
cpu_contexts.create(cv::ocl::Device::TYPE_CPU);
std::cout << "CPU devices detected:" << cpu_contexts.ndevices() << std::endl;
for(unsigned int devIdx = 0; devIdx < cpu_contexts.ndevices(); ++devIdx) {
cv::ocl::Device dev = cpu_contexts.device(devIdx);
PrintOclDeviceInfo(dev);
}
cv::ocl::Context gpu_contexts;
gpu_contexts.create(cv::ocl::Device::TYPE_GPU);
std::cout << "GPU devices detected:" << gpu_contexts.ndevices() << std::endl;
for(unsigned int devIdx = 0; devIdx < gpu_contexts.ndevices(); ++devIdx) {
cv::ocl::Device dev = gpu_contexts.device(devIdx);
PrintOclDeviceInfo(dev);
}
}
int main (int argc, char **argv)
{
if (argc > 1) {
std::cout << "Trying to interpret code " << argv[1] << std::endl;
unsigned int code = (unsigned int)atoi(argv[1]);
if (code >= (unsigned int) test_unknown) {
std::cerr << "Unknown code " << code << std::endl;
return (-1);
}
tcase = (test_case_t) code;
}
std::cerr << "Main Starting: " << std::endl;
const char *gst =
"nvarguscamerasrc ! video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1 ! "
"nvvidconv ! video/x-raw, format=BGRx, width=1280, height=720 ! "
"videoconvert ! video/x-raw, format=BGR ! appsink";
capPtr = new cv::VideoCapture (gst, cv::CAP_GSTREAMER);
if (!capPtr || !capPtr->isOpened()) {
std::cerr << "Failed to open capture. Aborting." << std::endl;
return (-4);
}
switch (tcase) {
case test_no_opencl_cpu:
cv::ocl::setUseOpenCL(false);
cv::namedWindow ("FrameOut", cv::WINDOW_AUTOSIZE);
break;
case test_opencl_cpu:
if (!cv::ocl::haveOpenCL()) {
std::cerr << "No OpenCL support, aborting" << std::endl;
return (-2);
}
DiscoverOpenCLDevices();
putenv((char*)"OPENCV_OPENCL_DEVICE=Portable Computing Language:CPU");
cv::ocl::setUseOpenCL(true);
cv::namedWindow ("FrameOut", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
break;
case test_opencl_gpu:
if (!cv::ocl::haveOpenCL()) {
std::cerr << "No OpenCL support, aborting" << std::endl;
return (-3);
}
DiscoverOpenCLDevices();
putenv((char*)"OPENCV_OPENCL_DEVICE=Portable Computing Language:GPU");
cv::ocl::setUseOpenCL(true);
cv::namedWindow("FrameOut", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
break;
case test_cuda:
cv::ocl::setUseOpenCL(false);
cv::namedWindow("FrameOut", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
break;
default:
std::cerr << "Unknown mode " << (int)tcase << std::endl;
return (-4);
}
cv::Mat frameBGRin (720, 1280, CV_8UC3);
cv::Mat frameBGRout (720, 1280, CV_8UC3);
cv::UMat uframeBGRin (720, 1280, CV_8UC3);
cv::UMat uframeBGRout (720, 1280, CV_8UC3);
cv::cuda::GpuMat dframeBGRin (720, 1280, CV_8UC3);
cv::cuda::GpuMat dframeBGRout (720, 1280, CV_8UC3);
int nbFrames = 0;
for ( ; nbFrames < IGNORE_FIRST_FRAMES; ++nbFrames) {
switch (tcase) {
case test_no_opencl_cpu:
if (!capPtr->read(frameBGRin)) {
std::cerr << "Failed to read frame " << nbFrames << std::endl;
capPtr->release();
return (-5);
}
break;
case test_opencl_cpu:
case test_opencl_gpu:
if (!capPtr->read(uframeBGRin)) {
std::cerr << "Failed to read frame " << nbFrames << std::endl;
capPtr->release();
return (-6);
}
break;
case test_cuda:
if (!capPtr->read(frameBGRin)) {
std::cerr << "Failed to read frame " << nbFrames << std::endl;
capPtr->release();
return (-5);
}
break;
}
}
double startTime = 0.0;
double waitAndReadUsed_time = 0.0;
double processUsed_time = 0.0;
double displayUsed_time = 0.0;
nbFrames=0;
startTime = (double)cv::getTickCount();
for ( ; nbFrames < LOOP_MEASURE_FRAMES; ++nbFrames) {
double loop_start = cv::getTickCount ();
/****************************************
* Wait and read frame
***************************************/
switch (tcase) {
case test_no_opencl_cpu:
if (!capPtr->read(frameBGRin)) {
std::cerr << "Failed to read frame " << nbFrames << std::endl;
capPtr->release();
return (-6);
}
break;
case test_opencl_cpu:
case test_opencl_gpu:
if (!capPtr->read(uframeBGRin)) {
std::cerr << "Failed to read frame " << nbFrames << std::endl;
capPtr->release();
return (-6);
}
break;
case test_cuda:
if (!capPtr->read(frameBGRin)) {
std::cerr << "Failed to read frame " << nbFrames << std::endl;
capPtr->release();
return (-6);
}
dframeBGRin.upload(frameBGRin);
break;
}
double wait_read = (cv::getTickCount () - loop_start) / cv::getTickFrequency ();
waitAndReadUsed_time += wait_read;
/****************************************
* Process frame
***************************************/
double proc_start = cv::getTickCount ();
double proc_time = 0;
switch (tcase) {
case test_no_opencl_cpu:
Process_Sobel_CPU(frameBGRin, frameBGRout);
break;
case test_opencl_cpu:
case test_opencl_gpu:
Process_Sobel_UMat(uframeBGRin, uframeBGRout);
break;
case test_cuda:
Process_Sobel_CUDA(dframeBGRin, dframeBGRout);
break;
}
proc_time = ((cv::getTickCount () - proc_start) / cv::getTickFrequency () );
processUsed_time += proc_time;
/****************************************
* Display frame
***************************************/
double display_start = cv::getTickCount();
//std::stringstream ss;
//ss << "Processing: " << (nbFrames / processUsed_time) << " FPS - Frame: " << nbFrames;
//cv::putText (frameBGR, ss.str (), cv::Point (30, 30), cv::FONT_HERSHEY_SIMPLEX, 1.0, cv::Scalar (255));
//cv::imshow ("FrameOut", frameBGR);
switch (tcase) {
case test_no_opencl_cpu:
//cv::imshow ("FrameOut", frameBGRout);
break;
case test_opencl_cpu:
case test_opencl_gpu:
//cv::imshow ("FrameOut", uframeBGRout);
break;
case test_cuda:
//cv::imshow ("FrameOut", dframeBGRout);
break;
default:
std::cerr << "No display implemented for this case" << std::endl;
break;
}
char c = cv::waitKey (1);
if (c == 27) {
break;
}
double disp_time = (cv::getTickCount() - display_start) / cv::getTickFrequency ();
displayUsed_time += disp_time;
double loop_time = (cv::getTickCount() - loop_start) / cv::getTickFrequency ();
//std::cout << "Frame: " << nbFrames << " Wait & read: " << wait_read << " Process time: " << proc_time << " Display time: " << disp_time << " Loop:" << loop_time << std::endl;
}
std::cout << "Terminated\n";
double totalTime = (cv::getTickCount () - startTime) / cv::getTickFrequency ();
std::cout << "Total Time for " << nbFrames << " frames: " << totalTime << " s. average: " << 1000.0*totalTime/(double)nbFrames << " ms" << std::endl;
std::cout << "Wait & read time: " << waitAndReadUsed_time << " s. average: " << 1000.0*waitAndReadUsed_time/(double)nbFrames << " ms\n";
std::cout << "Process time: " << processUsed_time << " s. average: " << 1000.0*processUsed_time/(double)nbFrames << " ms\n";
std::cout << "Display time: " << displayUsed_time << " s. average: " << 1000.0*displayUsed_time/(double)nbFrames << " ms\n";
capPtr->release();
}