How to run cv::cuda::Convolution concurrently?

Hi, I’m trying to run cv::cuda::Convolution concurrently.
Here is my test code, but the timeline in Nsight Performance Analysis shows that the streams are not running concurrently.

Thanks in advance!

int main()
  // Choose which GPU to run on, change this on a multi-GPU system.
  cudaError_t cudaStatus = cudaSetDevice(0);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    return 1;

  int num_kernels = 8;
  cv::Mat temp_image = cv::imread("test.bmp", 0);
  cv::Mat source_image;
  temp_image.convertTo(source_image, CV_32FC1);
  std::vector<cv::cuda::GpuMat> device_mats(num_kernels);
  std::vector<cv::cuda::HostMem> sources(num_kernels);
  std::vector<cv::cuda::GpuMat> device_kernels(num_kernels);
  std::vector<cv::cuda::GpuMat> device_results(num_kernels);
  std::vector<cv::Ptr<cv::cuda::Convolution>> convolvers(num_kernels);
  std::vector<cv::cuda::Stream> streams(num_kernels);
  for (int i = 0; i < num_kernels; i++) {
    sources[i] = cv::cuda::HostMem(source_image);
    device_mats[i].upload(sources[i], streams[i]);
    device_kernels[i].upload(sources[i], streams[i]);
    convolvers[i] = cv::cuda::createConvolution();
    convolvers[i]->convolve(device_mats[i], device_kernels[i], device_results[i], true, streams[i]);

  // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  cudaStatus = cudaDeviceReset();
  if (cudaStatus != cudaSuccess) {
      fprintf(stderr, "cudaDeviceReset failed!");
      return 1;

  return 0;

Sorry, I forgot to mention that I’m running this test on a Surface Book 2 (NVIDIA GeForce GTX 1050).