Can the Xavier run OpenCL applications?

I came back to this topic for another reason, so I may as well share what I found at the time (Xavier NX; my last trial was in December 2020).

Back then I tried a simple Sobel filtering example from OpenCV on a 1280x720 stream at 30 fps, using POCL built with LLVM 11.
This may be a corner case; I am not sure the results can be generalized.
The timings below exclude the first 20 frames and are measured over the next 500 frames:

  • OpenCV CPU cv::Mat (or cv::UMat with POCL basic): ~25 ms per frame
  • OpenCV CUDA GpuMat: 3 ms per frame
  • OpenCV UMat with POCL CUDA: 2 ms per frame
  • OpenCV UMat with POCL pthreads: 0.5 ms per frame

So the big improvement may be on the CPU side, with Carmel CPU support.

I am unable to retry this now with R32.5.1; the above is only from what I remember.

Test code (use with caution; it was not re-tested before posting):
#include <signal.h>

#include <cstdlib>
#include <iostream>
#include <vector>

#include <CL/cl.h>

#include <opencv2/opencv.hpp>
#include <opencv2/core/ocl.hpp>
#include <opencv2/imgproc.hpp>

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafilters.hpp>

//#include <opencv2/cudaobjdetect.hpp>
//#include <opencv2/cudaimgproc.hpp>


// Number of frames read at start-up before the timed loop begins; they are
// not included in any of the reported statistics.
#define IGNORE_FIRST_FRAMES 20
// Upper bound on the number of frames handled by the timed loop.
#define LOOP_MEASURE_FRAMES 500

// Back end exercised by the benchmark. The numeric values are the codes
// accepted as the first command-line argument of the program.
enum test_case_t {
	test_no_opencl_cpu = 0,	// cv::Mat processing with OpenCL disabled
	test_opencl_cpu    = 1,	// cv::UMat processing, OpenCL device filter ":CPU"
	test_opencl_gpu    = 2,	// cv::UMat processing, OpenCL device filter ":GPU"
	test_cuda          = 3,	// cv::cuda::GpuMat processing
	test_unknown       = 4	// sentinel: first invalid code
};

// Back end used when no command-line argument overrides it.
test_case_t tcase = test_opencl_gpu;




// Capture object shared between main() and my_handler(); NULL while no
// capture is held.
static cv::VideoCapture *capPtr = NULL;

/**
 * Handler taking a signal number: logs it, releases the shared capture if one
 * is currently held, then terminates the process with the signal number as
 * exit status.
 *
 * NOTE(review): iostream output and exit() are not async-signal-safe; this is
 * a pragmatic choice for a test program -- confirm it is acceptable.
 *
 * @param s signal number that was caught
 */
void my_handler(int s){
    std::cerr << "Caught signal " << s << std::endl;

    if (capPtr != NULL) {
        capPtr->release();
        capPtr = NULL;
    }

    exit(s);
}



/**
 * Applies a first-order mixed (dx = 1, dy = 1) Sobel derivative with
 * ksize = 1 to a host image, keeping the source depth (ddepth = -1).
 *
 * The scale (1.0) and delta (0.0) arguments are spelled out so that
 * cv::BORDER_DEFAULT lands in the borderType position; previously it was
 * passed as the 7th argument, i.e. as the scale factor.
 *
 * NOTE(review): both images are taken by value. This presumably relies on
 * cv::Mat copies sharing their pixel buffer, so the caller only sees the
 * result if frameBGRout is already allocated with the matching size and
 * type -- confirm.
 *
 * @param frameBGRin  source image
 * @param frameBGRout destination image
 */
void Process_Sobel_CPU(cv::Mat frameBGRin, cv::Mat frameBGRout) {
    const int    ddepth = -1;   // same depth as the source
    const int    dx     = 1;
    const int    dy     = 1;
    const int    ksize  = 1;
    const double scale  = 1.0;  // no scaling, consistent with the CUDA filter
    const double delta  = 0.0;
    cv::Sobel(frameBGRin, frameBGRout, ddepth, dx, dy, ksize, scale, delta, cv::BORDER_DEFAULT);
}

/**
 * Applies a first-order mixed (dx = 1, dy = 1) Sobel derivative with
 * ksize = 1 to a cv::UMat image, keeping the source depth (ddepth = -1).
 *
 * The scale (1.0) and delta (0.0) arguments are spelled out so that
 * cv::BORDER_DEFAULT lands in the borderType position; previously it was
 * passed as the 7th argument, i.e. as the scale factor.
 *
 * NOTE(review): both images are taken by value. This presumably relies on
 * cv::UMat copies sharing their underlying buffer, so the caller only sees
 * the result if frameBGRout is already allocated with the matching size and
 * type -- confirm.
 *
 * @param frameBGRin  source image
 * @param frameBGRout destination image
 */
void Process_Sobel_UMat(cv::UMat frameBGRin, cv::UMat frameBGRout) {
    const int    ddepth = -1;   // same depth as the source
    const int    dx     = 1;
    const int    dy     = 1;
    const int    ksize  = 1;
    const double scale  = 1.0;  // no scaling, consistent with the CUDA filter
    const double delta  = 0.0;
    cv::Sobel(frameBGRin, frameBGRout, ddepth, dx, dy, ksize, scale, delta, cv::BORDER_DEFAULT);
}

/**
 * Runs a CUDA Sobel filter (CV_8UC3 in, CV_8UC3 out, dx = 1, dy = 1,
 * ksize = 1, scale = 1, row border mode cv::BORDER_DEFAULT) on a device image.
 *
 * The filter object is created on the first call and reused afterwards.
 *
 * @param frameBGRin  source device image
 * @param frameBGRout destination device image
 */
void Process_Sobel_CUDA(cv::cuda::GpuMat frameBGRin, cv::cuda::GpuMat frameBGRout) {
    // Built once: function-local static, initialised on first use.
    static cv::Ptr<cv::cuda::Filter> sobel =
        cv::cuda::createSobelFilter(CV_8UC3, CV_8UC3,
                                    1, 1,   // dx, dy
                                    1,      // ksize
                                    1,      // scale
                                    cv::BORDER_DEFAULT);

    sobel->apply(frameBGRin, frameBGRout);
}


/**
 * Prints a tab-indented, one-property-per-line summary of an OpenCL device to
 * standard output: name, type, availability, OpenCL version, vendor, driver
 * version, version, host-unified-memory flag and compiler/linker availability.
 *
 * @param dev device to describe
 */
void PrintOclDeviceInfo(cv::ocl::Device dev) {
	const char *const yes = "YES";
	const char *const no  = "NO";

	std::cout
		<< "\tName: " << dev.name() << std::endl
		<< "\tType: " << dev.type() << std::endl
		<< "\tAvailable: " << (dev.available() ? yes : no) << std::endl
		<< "\tOpenCL version: " << dev.OpenCLVersion() << std::endl
		<< "\tVendor: " << dev.vendorName() << std::endl
		<< "\tDriver version: " << dev.driverVersion() << std::endl
		<< "\tVersion: " << dev.version() << std::endl
		//<< "\tExtensions: " << dev.extensions() << std::endl
		<< "\tHost unified memory: " << (dev.hostUnifiedMemory() ? yes : no) << std::endl
		<< "\tCompiler available: " << (dev.compilerAvailable() ? yes : no) << std::endl
		<< "\tLinker available: " << (dev.linkerAvailable() ? yes : no) << std::endl;
}

/**
 * Lists every OpenCL platform reported by cv::ocl::getPlatfomsInfo(): one
 * header line per platform (name and device count), then the index and the
 * PrintOclDeviceInfo() summary of each of its devices, then a blank line.
 */
void ShowAllPlatformsInfo() {
  std::vector< cv::ocl::PlatformInfo > platforms_info;
  cv::ocl::getPlatfomsInfo(platforms_info);

  for (auto &platform : platforms_info) {
    std::cout << "Platform: " << platform.name() << " Devices: " << platform.deviceNumber() << std::endl;

    const unsigned int deviceCount = platform.deviceNumber();
    unsigned int devIdx = 0;
    while (devIdx < deviceCount) {
      std::cout << "   Device: " << devIdx << std::endl;

      cv::ocl::Device dev;
      platform.getDevice(dev, devIdx);
      PrintOclDeviceInfo(dev);

      ++devIdx;
    }

    std::cout << std::endl;
  }
}

void DiscoverOpenCLDevices() {
  //ShowAllPlatformsInfo();

  cv::ocl::Context cpu_contexts;
  cpu_contexts.create(cv::ocl::Device::TYPE_CPU);
  std::cout << "CPU devices detected:" << cpu_contexts.ndevices() << std::endl;
  for(unsigned int devIdx = 0; devIdx < cpu_contexts.ndevices(); ++devIdx) {
	cv::ocl::Device dev = cpu_contexts.device(devIdx);
        PrintOclDeviceInfo(dev);
  }

  cv::ocl::Context gpu_contexts;
  gpu_contexts.create(cv::ocl::Device::TYPE_GPU);
  std::cout << "GPU devices detected:" << gpu_contexts.ndevices() << std::endl;
  for(unsigned int devIdx = 0; devIdx < gpu_contexts.ndevices(); ++devIdx) {
	cv::ocl::Device dev = gpu_contexts.device(devIdx);
        PrintOclDeviceInfo(dev);
  }
}


int main (int argc, char **argv)
{
  if (argc > 1) {
     std::cout << "Trying to interpret code " << argv[1] << std::endl;
     unsigned int code = (unsigned int)atoi(argv[1]);
     if (code >= (unsigned int) test_unknown) {
        std::cerr << "Unknown code " << code << std::endl;
	return (-1);
     }

     tcase = (test_case_t) code;
  }

  std::cerr << "Main Starting:  " << std::endl;

  const char *gst =
    "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1 ! "
    "nvvidconv ! video/x-raw, format=BGRx, width=1280, height=720 ! "
    "videoconvert ! video/x-raw, format=BGR ! appsink";
  capPtr = new cv::VideoCapture (gst, cv::CAP_GSTREAMER);
  if (!capPtr || !capPtr->isOpened()) {
        std::cerr << "Failed to open capture. Aborting." << std::endl;
        return (-4);
  }
 
  switch (tcase) {
     case test_no_opencl_cpu:
        cv::ocl::setUseOpenCL(false);
        cv::namedWindow ("FrameOut", cv::WINDOW_AUTOSIZE);
	break;

     case test_opencl_cpu:
  	if (!cv::ocl::haveOpenCL()) {
		std::cerr << "No OpenCL support, aborting" << std::endl;
		return (-2);
	}
	DiscoverOpenCLDevices();
        putenv((char*)"OPENCV_OPENCL_DEVICE=Portable Computing Language:CPU");
        cv::ocl::setUseOpenCL(true);
        cv::namedWindow ("FrameOut", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
	break;

     case test_opencl_gpu: 
  	if (!cv::ocl::haveOpenCL()) {
		std::cerr << "No OpenCL support, aborting" << std::endl;
		return (-3);
	}
	DiscoverOpenCLDevices();
        putenv((char*)"OPENCV_OPENCL_DEVICE=Portable Computing Language:GPU");
        cv::ocl::setUseOpenCL(true);
  	cv::namedWindow("FrameOut", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
        break;

     case test_cuda:
        cv::ocl::setUseOpenCL(false);
  	cv::namedWindow("FrameOut", cv::WINDOW_AUTOSIZE | cv::WINDOW_OPENGL);
	break;

     default:
        std::cerr << "Unknown mode " << (int)tcase << std::endl;
        return (-4);
  }


  cv::Mat frameBGRin (720, 1280, CV_8UC3);
  cv::Mat frameBGRout (720, 1280, CV_8UC3);

  cv::UMat uframeBGRin (720, 1280, CV_8UC3);
  cv::UMat uframeBGRout (720, 1280, CV_8UC3);

  cv::cuda::GpuMat dframeBGRin (720, 1280, CV_8UC3);
  cv::cuda::GpuMat dframeBGRout (720, 1280, CV_8UC3);



  int nbFrames = 0;
  for ( ; nbFrames < IGNORE_FIRST_FRAMES; ++nbFrames) {
      switch (tcase) {
         case test_no_opencl_cpu:
            if (!capPtr->read(frameBGRin)) {
               std::cerr << "Failed to read frame " << nbFrames << std::endl;
	       capPtr->release();
	       return (-5);
            }
            break;

         case test_opencl_cpu:
         case test_opencl_gpu:
            if (!capPtr->read(uframeBGRin)) {
               std::cerr << "Failed to read frame " << nbFrames << std::endl;
	       capPtr->release();
	       return (-6);
            }
            break;

     	case test_cuda:
            if (!capPtr->read(frameBGRin)) {
               std::cerr << "Failed to read frame " << nbFrames << std::endl;
	       capPtr->release();
	       return (-5);
            }
	    break;

     }
  }

  double startTime = 0.0;
  double waitAndReadUsed_time = 0.0;
  double processUsed_time = 0.0;
  double displayUsed_time = 0.0;

  nbFrames=0;
  startTime = (double)cv::getTickCount();
  for ( ; nbFrames < LOOP_MEASURE_FRAMES; ++nbFrames) {
      double loop_start = cv::getTickCount ();

      /****************************************
       * Wait and read frame
       ***************************************/
      switch (tcase) {
      case test_no_opencl_cpu:
         if (!capPtr->read(frameBGRin)) {
            std::cerr << "Failed to read frame " << nbFrames << std::endl;
	    capPtr->release();
	    return (-6);
         }
         break;

        case test_opencl_cpu:
        case test_opencl_gpu:
            if (!capPtr->read(uframeBGRin)) {
               std::cerr << "Failed to read frame " << nbFrames << std::endl;
	       capPtr->release();
	       return (-6);
            }
            break;

     	case test_cuda:
         if (!capPtr->read(frameBGRin)) {
            std::cerr << "Failed to read frame " << nbFrames << std::endl;
	    capPtr->release();
	    return (-6);
         }
	 dframeBGRin.upload(frameBGRin);
	 break;
      }
      double wait_read = (cv::getTickCount () - loop_start) / cv::getTickFrequency ();
      waitAndReadUsed_time += wait_read;

      
      /****************************************
       * Process frame
       ***************************************/
      double proc_start = cv::getTickCount ();
      double proc_time = 0;
      switch (tcase) {
     	case test_no_opencl_cpu:
		Process_Sobel_CPU(frameBGRin, frameBGRout);
       		break;


     	case test_opencl_cpu:
     	case test_opencl_gpu: 
		Process_Sobel_UMat(uframeBGRin, uframeBGRout);
       		break;


     	case test_cuda:
		Process_Sobel_CUDA(dframeBGRin, dframeBGRout);
 		break;

      }
      proc_time = ((cv::getTickCount () - proc_start) / cv::getTickFrequency () );
      processUsed_time += proc_time;



      /****************************************
       * Display frame
       ***************************************/
      double display_start = cv::getTickCount();
      //std::stringstream ss;
      //ss << "Processing: " << (nbFrames / processUsed_time) << " FPS - Frame: " << nbFrames;
      //cv::putText (frameBGR, ss.str (), cv::Point (30, 30), cv::FONT_HERSHEY_SIMPLEX, 1.0, cv::Scalar (255));
      //cv::imshow ("FrameOut", frameBGR);

      switch (tcase) {
         case test_no_opencl_cpu:
		//cv::imshow ("FrameOut", frameBGRout);
		break;

     	 case test_opencl_cpu:
         case test_opencl_gpu:
      		//cv::imshow ("FrameOut", uframeBGRout);
		break;
        
     	 case test_cuda:
      		//cv::imshow ("FrameOut", dframeBGRout);
		break;
 

     	 default:
		std::cerr << "No display implemented for this case" << std::endl;
		break;
      }

      char c = cv::waitKey (1);
      if (c == 27) {
	break;
      }
      double disp_time = (cv::getTickCount() - display_start) / cv::getTickFrequency ();
      displayUsed_time += disp_time;
  

      double loop_time = (cv::getTickCount() - loop_start) / cv::getTickFrequency ();
      //std::cout << "Frame: " << nbFrames << "   Wait & read: " << wait_read << "   Process time: " << proc_time << "    Display time: " << disp_time << "    Loop:" << loop_time << std::endl;
    
  }

      std::cout << "Terminated\n";
      double totalTime = (cv::getTickCount () - startTime) / cv::getTickFrequency ();
      std::cout << "Total Time for " << nbFrames << " frames: " << totalTime << " s. average: " << 1000.0*totalTime/(double)nbFrames << " ms" << std::endl;

      std::cout << "Wait & read time: " << waitAndReadUsed_time << " s. average: " << 1000.0*waitAndReadUsed_time/(double)nbFrames << " ms\n";
      std::cout << "Process time: " << processUsed_time << " s. average: " << 1000.0*processUsed_time/(double)nbFrames << " ms\n";
      std::cout << "Display time: " << displayUsed_time << " s. average: " << 1000.0*displayUsed_time/(double)nbFrames << " ms\n";

      capPtr->release();
}