Can't overlap streams

My code cannot achieve concurrency. In Nsight Systems, it shows that any memory copies and kernels are not overlapped.

(N times of HostToDevice >> N times of Kernel execution >> N times of DeviceToHost)

I don’t understand why it’s not overlapped, because IT USED TO BE WORK. About months ago I tested my code on Nsight Systems and verified that it could achieve concurrency. I’ve changed CUDA kernels and barely changed overall structure of this code, and found this problem. Now old version of my code is also not working…

What can be the suspect of this problem?

CUDA Version : 10.0
GPU : 2080ti
CPU : Intel i9 9900k
OS : Windows 10
IDE : Visual Studio 2017

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <vector>
#include <time.h>
#include <opencv2/opencv.hpp>

#include <bitset>

#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
#include <helper_cuda.h>       // helper functions for CUDA error checking and initialization

using namespace std;
using namespace cv;

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

Mat read_BMP_opencv(char* filename, int& w, int& h);
__global__ void sinc2D7K(uchar* buff, uchar* buffer_out, int w, int h, float offset_xx, float offset_yy);


int main()
{
	cudaEvent_t start, stop;
	float  elapsedTime;
	CUDA_SAFE_CALL(cudaEventCreate(&start));
	CUDA_SAFE_CALL(cudaEventCreate(&stop));
	int f_width, f_height;

	float offset[2] = { 0.1f, 0.2f };

	int margin = 2;
	int crop_size = 2048;
	const int stream_num = 32; // ceil(16384.0f / float(crop_size)) * ceil(8192.0f / float(crop_size))
	int n_iter = 50;

	cout << "crop size = " << crop_size << endl;
	cout << "number of streams = " << stream_num << endl;
	cout << "repeat = " << n_iter << endl;

	cudaStream_t stream[stream_num];

	for (int n = 0; n < stream_num; n++)
	{
		CUDA_SAFE_CALL(cudaStreamCreate(&stream[n]));
	}

	cv::Mat::setDefaultAllocator(cv::cuda::HostMem::getAllocator(cv::cuda::HostMem::AllocType::PAGE_LOCKED));

	Mat* result = new Mat[stream_num];
	Mat* img = new Mat[stream_num];
	uchar* *data = new uchar*[stream_num];
	char buf[256];

	for (int i = 0; i < stream_num; i++)
	{
		//sprintf(buf, "test_13k.bmp", i + 1);
		//sprintf(buf, "test_%d.bmp", crop_size);
		sprintf(buf, "input_images/test_2048_%d.bmp", i);
		img[i] = read_BMP_opencv(buf, f_width, f_height);
		data[i] = img[i].data;
	}

	uchar* *h_shifted = new uchar*[stream_num];
	uchar* *d_data = new uchar*[stream_num];
	uchar* *d_shifted = new uchar*[stream_num];

	for (int i = 0; i < stream_num; i++)
	{
		CUDA_SAFE_CALL(cudaMalloc((void**)&d_data[i], sizeof(uchar) * f_width * f_height));
		CUDA_SAFE_CALL(cudaMalloc((void**)&d_shifted[i], sizeof(uchar) * f_width * f_height));
		CUDA_SAFE_CALL(cudaMallocHost((void**)&h_shifted[i], sizeof(uchar) * f_width * f_height));
	}

	dim3 threadsPerBlock(16, 16, 1);
	dim3 numBlocks(int(f_width/threadsPerBlock.x), int(f_height / threadsPerBlock.y), 1);

	CUDA_SAFE_CALL(cudaEventRecord(start, 0));

	for (int i = 0; i < stream_num; i++)
	{
		// CPU -> GPU
		cudaMemcpyAsync(d_data[i], data[i], sizeof(uchar) * f_width * f_height, cudaMemcpyHostToDevice, stream[i]);

		sinc2D7K <<<numBlocks, threadsPerBlock, 0, stream[i]>>> (d_data[i], d_shifted[i], f_width, f_height, offset[0], offset[1]);

		// GPU -> CPU
		cudaMemcpyAsync(h_shifted[i], d_shifted[i], sizeof(uchar) * f_height * f_width, cudaMemcpyDeviceToHost, stream[i]);
	}
	cudaDeviceSynchronize();

	CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
	CUDA_SAFE_CALL(cudaEventSynchronize(stop));
	CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));

	cout << "width : " << f_width << ", height : " << f_height << endl;
	cout << "CUDA stream " << stream_num << "-way result" << endl;
	printf("Average Shift Time: %3.1f ms\n", elapsedTime);

	for (int i = 0; i < stream_num; i++)
	{
		result[i] = Mat(f_height, f_width, CV_8UC1);
		result[i].data = h_shifted[i];
		result[i] = result[i](Range(20, f_height - 20), Range(20, f_width - 20)).clone();

		sprintf(buf, "output_images/test3_shift_%d.bmp", i);
		imwrite(buf, result[i]);
	}

	for (int i = 0; i < stream_num; i++)
	{
		CUDA_SAFE_CALL(cudaFree(d_data[i]));
		CUDA_SAFE_CALL(cudaFree(d_shifted[i]));
		CUDA_SAFE_CALL(cudaFreeHost(h_shifted[i]));

		CUDA_SAFE_CALL(cudaStreamDestroy(stream[i]));
	}

	//waitKey(5000);

	return 0;
}

Mat read_BMP_opencv(char* filename, int& w, int& h)
{
	Mat input_img = imread(filename, 0);
	if (input_img.empty())
		throw "Argument Exception";

	// extract image height and width from header
	int width = input_img.cols;
	int height = input_img.rows;

	//cout << endl;
	//cout << "  Name: " << filename << endl;
	//cout << " Width: " << width << endl;
	//cout << "Height: " << height << endl;

	w = width;
	h = height;

	return input_img;
}

__global__ void sinc2D7K(uchar* buff, uchar* buffer_out, int w, int h, float offset_xx, float offset_yy)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	int width = w, height = h;

	if ((x >= 3 && x < width - 3) && (y >= 3 && y < height - 3))
	{
		float val = buff[width*(y - 3) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +

			buff[width*(y - 2) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +

			buff[width*(y - 1) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +

			buff[width*y + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +

			buff[width*(y + 1) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +

			buff[width*(y + 2) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +

			buff[width*(y + 3) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi);

		buffer_out[y * width + x] = (uchar)val;
	}
}

Side remark: Unless you always compile with -use_fast_math, you would want to replace code like this:

with code like this:

sinpif (3.0f - offset_x)

It’s always on -use_fast_math. Thanks anyway. :)