Unable to run concurrent opencv cuda functions through Streams

Nvidia Driver installed: nvidia-driver-460
nvcc version: 10.0
GPU: 1050Ti

I have compiled OpenCV 4.5.1 using the CMake command as follow:
cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_C_COMPILER=/usr/bin/gcc-7 -D CMAKE_INSTALL_PREFIX=/usr/local -D INSTALL_PYTHON_EXAMPLES=ON -D INSTALL_C_EXAMPLES=OFF -D BUILD_SHARED_LIBRARY=OFF -D BUILD_TESTS=OFF -D JAVA_AWT_INCLUDE_PATH=/usr/lib/jvm/java-8-openjdk-amd64/include -D JAVA_AWT_LIBRARY=/usr/lib/jvm/java-8-openjdk-amd64/include/jawt.h -D JAVA_INCLUDE_PATH=/usr/lib/jvm/java-8-openjdk-amd64/include -D JAVA_INCLUDE_PATH2=/usr/lib/jvm/java-8-openjdk-amd64/include/linux -D JAVA_JVM_LIBRARY=/usr/lib/jvm/java-8-openjdk-amd64/include/jni.h -D ANT_EXECUTABLE=/usr/bin/ant -D WITH_TBB=ON -D WITH_CUDA=ON -D BUILD_opencv_cudacodec=OFF -D ENABLE_FAST_MATH=1 -D BUILD_PERF_TESTS=OFF -D CUDA_FAST_MATH=1 -D WITH_CUBLAS=1 -D WITH_V4L=ON -D WITH_QT=OFF -D WITH_OPENGL=ON -D WITH_GSTREAMER=ON -D OPENCV_GENERATE_PKGCONFIG=ON -D OPENCV_PC_FILE_NAME=opencv.pc -D OPENCV_ENABLE_NONFREE=ON -D OPENCV_PYTHON3_INSTALL_PATH=~/.virtualenvs/opencv_cuda/lib/python3.6/site-packages -D OPENCV_EXTRA_MODULES_PATH=~/opencv_contrib/modules -D PYTHON_EXECUTABLE=~/.virtualenvs/opencv_cuda/bin/python -D BUILD_EXAMPLES=ON -D WITH_CUDNN=ON -D OPENCV_DNN_CUDA=ON -D CUDA_ARCH_BIN=6.1

Using the above configuration, I was trying to run OpenCV CUDA functions concurrently in different streams. I have shown different variants of my function compute and the corresponding response of the NVVP profiler. I have compiled my code using nvcc --default-stream per-thread ./streamExample.cu -o streamExample `pkg-config opencv --cflags --libs`

My code is:

#include <unistd.h>

#include <chrono>
#include <cstdio>
#include <iostream>
#include <memory>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include </usr/local/cuda-10.0/include/cuda_profiler_api.h>

#include <opencv2/opencv.hpp>
#include <opencv2/core.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp"

cv::cuda::GpuMat diviGpu;

//Define test target size 
cv::Size rSize(256, 256);

cv::Rect rect;

// Processes a batch of 4 images: upload -> divide (normalize) -> resize ->
// download, assigning the whole chain for image i+j to stream j so the four
// per-image pipelines can overlap. Each image's operations stay on a single
// stream, so they execute in order without any cross-stream synchronization.
//
// srcMemArray / dstMemArray: page-locked host buffers (required for the async
//                            copies to actually be asynchronous)
// gpuSrcArray / gpuDstArray: device buffers, reused across calls
// outArray:                  receives cv::Mat headers aliasing dstMemArray
// streamsArray:              exactly 4 CUDA streams
// i:                         index of the first image of the batch (i .. i+3)
//
// Returns outArray. Entries [i, i+3] are only valid after this call returns:
// the function blocks on all four streams before publishing the host headers.
std::shared_ptr<std::vector<cv::Mat>> computeArray(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
                                                   std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
                                                   std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
                                                   std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
                                                   std::shared_ptr<std::vector< cv::Mat >> outArray,
                                                   std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
                                                   int i){

    for(int j=0; j<4; j++){
        // All four operations for image i+j are enqueued on stream j, so they
        // run in order relative to each other while other images proceed on
        // their own streams.
        (*gpuSrcArray)[i+j].upload((*srcMemArray)[i+j], (*streamsArray)[j]);
        cv::cuda::divide((*gpuSrcArray)[i+j], diviGpu, (*gpuSrcArray)[i+j], 1, -1, (*streamsArray)[j]);
        cv::cuda::resize((*gpuSrcArray)[i+j], (*gpuDstArray)[i+j], rSize, 0, 0, cv::INTER_AREA, (*streamsArray)[j]);
        (*gpuDstArray)[i+j].download((*dstMemArray)[i+j], (*streamsArray)[j]);
    }

    // Drain every stream before exposing the results: the pinned host memory
    // is only safe to read once the asynchronous downloads have completed.
    for(int j=0; j<4; j++){
        (*streamsArray)[j].waitForCompletion();
    }

    // createMatHeader() aliases the pinned HostMem buffer (no copy). Done
    // after the waits so the data behind the published headers is valid.
    for(int j=0; j<4; j++){
        (*outArray)[i+j] = (*dstMemArray)[i+j].createMatHeader();
    }

    return outArray;
}

// Driver: loads 20 grayscale test images into page-locked host memory, sets
// up 4 CUDA streams plus per-image device buffers, then runs the batched GPU
// pipeline (computeArray) 10 times over the full set and reports wall time.
int main (int argc, char* argv[]){

    std::shared_ptr<std::vector<cv::cuda::Stream>> streamsArray = std::make_shared<std::vector<cv::cuda::Stream>>();

    // cv::cuda::Stream is reference-counted, so copying into the vector
    // shares the underlying cudaStream_t handles.
    cv::cuda::Stream streamA, streamB, streamC, streamD;
    streamsArray->push_back(streamA);
    streamsArray->push_back(streamB);
    streamsArray->push_back(streamC);
    streamsArray->push_back(streamD);

    //Create Pinned Memory (PAGE_LOCKED) arrays - required so host<->device
    //copies can run asynchronously with respect to the host
    std::shared_ptr<std::vector<cv::cuda::HostMem >> srcMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>();
    std::shared_ptr<std::vector<cv::cuda::HostMem >> dstMemArray = std::make_shared<std::vector<cv::cuda::HostMem >>();

    //Create GpuMat arrays to use them on OpenCV CUDA Methods
    std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray = std::make_shared<std::vector<cv::cuda::GpuMat>>();
    std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray = std::make_shared<std::vector<cv::cuda::GpuMat>>();

    //Create Output array for CPU Mat
    std::shared_ptr<std::vector< cv::Mat >> outArray = std::make_shared<std::vector<cv::Mat>>();

    //Load the 20 test images (x25y20.jpg .. x25y39.jpg)
    for(int i=20;i<40;i++){
        char trainImg_path[1000];
        snprintf(trainImg_path, sizeof(trainImg_path), "/home/morphle/Desktop/MorphleWork/6/x25y%d.jpg", i);

        cv::Mat srcHostImage = cv::imread(trainImg_path, cv::IMREAD_COLOR);
        if(srcHostImage.empty()){
            // Bail out early instead of crashing inside cvtColor below.
            std::cerr << "Failed to load image: " << trainImg_path << std::endl;
            return 1;
        }
        std::cout<<srcHostImage.size()<<std::endl;
        cv::cvtColor(srcHostImage, srcHostImage, cv::COLOR_BGR2GRAY);

        cv::Mat outMat;

        cv::cuda::HostMem srcHostMem = cv::cuda::HostMem(srcHostImage, cv::cuda::HostMem::PAGE_LOCKED);
        cv::cuda::HostMem srcDstMem = cv::cuda::HostMem(outMat, cv::cuda::HostMem::PAGE_LOCKED);

        cv::cuda::GpuMat srcMat;
        cv::cuda::GpuMat dstMat;

        srcMemArray->push_back(srcHostMem);
        dstMemArray->push_back(srcDstMem);
        gpuSrcArray->push_back(srcMat);
        gpuDstArray->push_back(dstMat);
        // Bug fix: outMat was previously pushed twice per image, leaving
        // outArray with 40 entries for 20 images. Push exactly one slot.
        outArray->push_back(outMat);
    }

    //Dummy Divider (all-ones image, so divide() is numerically a no-op)
    cv::Mat dividier = cv::Mat::ones((*srcMemArray)[0].size(),(*srcMemArray)[0].type());
    diviGpu.upload(dividier, (*streamsArray)[0]);

    rect = cv::RotatedRect(cv::Point(), cv::Size((*srcMemArray)[0].cols+2*0-2, (*srcMemArray)[0].rows+2*0-2), 180).boundingRect();

    //Run the batched pipeline 10 times over all 20 images, in batches of 4
    auto start = std::chrono::high_resolution_clock::now();
    for(int k = 0; k<10; k++){
        for(int i=0; i<20; i=i+4){
            std::shared_ptr<std::vector<cv::Mat>> result =
                computeArray(srcMemArray, dstMemArray, gpuSrcArray, gpuDstArray, outArray, streamsArray, i);
        }
    }
    auto stop = std::chrono::high_resolution_clock::now();
    std::cout << "Processing took "
              << std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()
              << " ms" << std::endl;

    return 0;
}

However, when I try to run OpenCV Cuda functions in streams, the nvvp profiler shows:

Try 2:

// Variant 2: one stream per pipeline *stage* (stream 0 = uploads, 1 = divides,
// 2 = resizes, 3 = downloads), issued breadth-first.
//
// Bug fix: in this layout each stage consumes buffers produced on a DIFFERENT
// stream, so explicit inter-stream dependencies are mandatory — without them
// the divide kernels on stream 1 could start before the uploads on stream 0
// have landed (and likewise for the later stages). cv::cuda::Event records on
// the producer stream plus Stream::waitEvent on the consumer stream serialize
// the stages without blocking the host.
//
// Parameters and return contract are the same as the first variant: outArray
// entries [i, i+3] are valid once this function returns.
std::shared_ptr<std::vector<cv::Mat>> computeArray(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
                                                   std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
                                                   std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
                                                   std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
                                                   std::shared_ptr<std::vector< cv::Mat >> outArray,
                                                   std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
                                                   int i){

    cv::cuda::Event uploadsDone, dividesDone, resizesDone;

    (*gpuSrcArray)[i+0].upload((*srcMemArray)[i+0], (*streamsArray)[0]);
    (*gpuSrcArray)[i+1].upload((*srcMemArray)[i+1], (*streamsArray)[0]);
    (*gpuSrcArray)[i+2].upload((*srcMemArray)[i+2], (*streamsArray)[0]);
    (*gpuSrcArray)[i+3].upload((*srcMemArray)[i+3], (*streamsArray)[0]);
    uploadsDone.record((*streamsArray)[0]);

    // Stream 1 must not start dividing until all four uploads have finished.
    (*streamsArray)[1].waitEvent(uploadsDone);
    cv::cuda::divide((*gpuSrcArray)[i+0], diviGpu, (*gpuSrcArray)[i+0], 1, -1, (*streamsArray)[1]);
    cv::cuda::divide((*gpuSrcArray)[i+1], diviGpu, (*gpuSrcArray)[i+1], 1, -1, (*streamsArray)[1]);
    cv::cuda::divide((*gpuSrcArray)[i+2], diviGpu, (*gpuSrcArray)[i+2], 1, -1, (*streamsArray)[1]);
    cv::cuda::divide((*gpuSrcArray)[i+3], diviGpu, (*gpuSrcArray)[i+3], 1, -1, (*streamsArray)[1]);
    dividesDone.record((*streamsArray)[1]);

    // Stream 2 resizes only after the in-place divides are complete.
    (*streamsArray)[2].waitEvent(dividesDone);
    cv::cuda::resize((*gpuSrcArray)[i+0], (*gpuDstArray)[i+0], rSize, 0, 0, cv::INTER_AREA, (*streamsArray)[2]);
    cv::cuda::resize((*gpuSrcArray)[i+1], (*gpuDstArray)[i+1], rSize, 0, 0, cv::INTER_AREA, (*streamsArray)[2]);
    cv::cuda::resize((*gpuSrcArray)[i+2], (*gpuDstArray)[i+2], rSize, 0, 0, cv::INTER_AREA, (*streamsArray)[2]);
    cv::cuda::resize((*gpuSrcArray)[i+3], (*gpuDstArray)[i+3], rSize, 0, 0, cv::INTER_AREA, (*streamsArray)[2]);
    resizesDone.record((*streamsArray)[2]);

    // Stream 3 downloads only after the resized outputs exist.
    (*streamsArray)[3].waitEvent(resizesDone);
    (*gpuDstArray)[i+0].download((*dstMemArray)[i+0],(*streamsArray)[3]);
    (*gpuDstArray)[i+1].download((*dstMemArray)[i+1],(*streamsArray)[3]);
    (*gpuDstArray)[i+2].download((*dstMemArray)[i+2],(*streamsArray)[3]);
    (*gpuDstArray)[i+3].download((*dstMemArray)[i+3],(*streamsArray)[3]);

    // Drain all streams before publishing host headers over the pinned memory.
    (*streamsArray)[0].waitForCompletion();
    (*streamsArray)[1].waitForCompletion();
    (*streamsArray)[2].waitForCompletion();
    (*streamsArray)[3].waitForCompletion();

    (*outArray)[i+0] = (*dstMemArray)[i+0].createMatHeader();
    (*outArray)[i+1] = (*dstMemArray)[i+1].createMatHeader();
    (*outArray)[i+2] = (*dstMemArray)[i+2].createMatHeader();
    (*outArray)[i+3] = (*dstMemArray)[i+3].createMatHeader();

    return outArray;
}

Try 3:

// Variant 3: one stream per *image* (image i+j rides stream j end to end),
// with work issued breadth-first: all uploads, then all divides, then all
// resizes, then all downloads. Because each image's chain stays on a single
// stream, per-stream FIFO ordering provides the required dependencies and no
// cross-stream synchronization is needed.
//
// Same contract as the other variants: outArray entries [i, i+3] are valid
// once this function returns (all four streams are drained first).
std::shared_ptr<std::vector<cv::Mat>> computeArray(std::shared_ptr<std::vector< cv::cuda::HostMem >> srcMemArray,
                                                   std::shared_ptr<std::vector< cv::cuda::HostMem >> dstMemArray,
                                                   std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuSrcArray,
                                                   std::shared_ptr<std::vector< cv::cuda::GpuMat >> gpuDstArray,
                                                   std::shared_ptr<std::vector< cv::Mat >> outArray,
                                                   std::shared_ptr<std::vector< cv::cuda::Stream >> streamsArray,
                                                   int i){

    // Stage 1: async uploads from pinned host memory, image j on stream j.
    for(int j=0; j<4; j++){
        (*gpuSrcArray)[i+j].upload((*srcMemArray)[i+j], (*streamsArray)[j]);
    }

    // Stage 2: in-place normalization; ordered after the upload on the same stream.
    for(int j=0; j<4; j++){
        cv::cuda::divide((*gpuSrcArray)[i+j], diviGpu, (*gpuSrcArray)[i+j], 1, -1, (*streamsArray)[j]);
    }

    // Stage 3: resize to the fixed target size.
    for(int j=0; j<4; j++){
        cv::cuda::resize((*gpuSrcArray)[i+j], (*gpuDstArray)[i+j], rSize, 0, 0, cv::INTER_AREA, (*streamsArray)[j]);
    }

    // Stage 4: async downloads back into pinned host memory.
    for(int j=0; j<4; j++){
        (*gpuDstArray)[i+j].download((*dstMemArray)[i+j],(*streamsArray)[j]);
    }

    // Drain every stream before exposing the results: the pinned buffers are
    // only safe to read after the downloads have completed.
    for(int j=0; j<4; j++){
        (*streamsArray)[j].waitForCompletion();
    }

    // createMatHeader() aliases the pinned HostMem buffer (no copy); done
    // after the waits so the published headers point at valid data.
    for(int j=0; j<4; j++){
        (*outArray)[i+j] = (*dstMemArray)[i+j].createMatHeader();
    }

    return outArray;
}

I was able to get the results shown in this link: GPU Pro Tip: CUDA 7 Streams Simplify Concurrency | NVIDIA Developer Blog

Issue:
It can be observed in the screenshot that the compute kernels launched by the OpenCV CUDA functions are running sequentially, no matter how the calls are arranged across streams. I have also done the same exercise using multithreading; at best, only an overlap between a MemCpyAsync and a kernel can be seen.

Please help me figure out my mistakes. A few of my guesses are:
i.) It may be because of the missing -D CUDA_NVCC_FLAGS=--default-stream=per-thread flag: if this is the case, kindly advise me how it can be passed as an argument to the CMake command while compiling OpenCV
ii.) The individual kernel for a particular OpenCV CUDA function may be so large that other kernels are not able to run concurrently alongside it. If so, please suggest ways to run OpenCV CUDA functions in parallel streams.

My final aim is to achieve 4-way or more concurrency using in-built OpenCV CUDA functions through Streams.

So you must be using streams correctly to some degree.

That is quite possible. I think it is a likely explanation.

There is no supporting logic to your final aim, that I know of. If a kernel saturates the GPU:

  1. There is no reason to expect to witness kernel concurrency.
  2. Even if you could witness kernel concurrency, there is no reason to conclude the overall work would get done any quicker. You would be dividing resources between kernels.

The fundamental premise here is broken/invalid. If it were simply possible to take any arbitrary workload, and run 4 of them in parallel, then the GPU would by definition have infinite capacity.

I happen to know that GPUs don’t have infinite capacity. Therefore the underlying premise is flawed.

Unless you are certain that the kernels spun up by OpenCV are small enough that they do not saturate the GPU, there is no reason to expect to witness kernel concurrency. The profiler can help you estimate the size (i.e. resource consumption) of a particular kernel launch. It can report the grid configuration, shared memory usage, register usage, and other relevant data. This kind of estimate for concurrency requires analysis and data, and generally starts with an understanding of occupancy, and being able to estimate the occupancy of a particular kernel. NVIDIA provides tools to help with that also (e.g. occupancy API, occupancy calculator spreadsheet).

A 1050Ti has 6 SMs. That means it has an instantaneous maximum capacity of 6*2048 ~= 12,000 threads. Any kernel launch of 12,000 threads or more will saturate that GPU, and prevent you from witnessing kernel concurrency. A relatively small image of say 256x256 pixels could easily translate into an OpenCV image processing algorithm kernel launch of 256x256 = 65,536 threads. It’s entirely realistic to expect that typical image processing kernels for an image of that size will completely fill the GPU and prevent concurrency. And that is only a portion of the analysis that would be needed to conclusively confirm that concurrency should be possible. For the 256x256 example, if it were me, I would immediately conclude that it is not sensible to expect to witness kernel concurrency for a typical image processing kernel on a 1050Ti, and I would immediately move on to spend my valuable time on other tasks. I would conclude that unless your image size is significantly below 100x100 pixels, kernel concurrency is unlikely.

As an aside, NVIDIA doesn’t maintain or provide support for OpenCV, including OpenCV-CUDA. Likewise, NVIDIA doesn’t maintain or provide support for CMake.

Thank you @Robert_Crovella for a detailed answer.