Can't overlap streams

astroboykhw · December 6, 2021, 7:42am

My code cannot achieve concurrency. In Nsight Systems, it shows that any memory copies and kernels are not overlapped.

(N times of HostToDevice >> N times of Kernel execution >> N times of DeviceToHost)

I don’t understand why it’s not overlapped, because IT USED TO BE WORK. About months ago I tested my code on Nsight Systems and verified that it could achieve concurrency. I’ve changed CUDA kernels and barely changed overall structure of this code, and found this problem. Now old version of my code is also not working…

What can be the suspect of this problem?

CUDA Version : 10.0
GPU : 2080ti
CPU : Intel i9 9900k
OS : Windows 10
IDE : Visual Studio 2017

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <vector>
#include <time.h>
#include <opencv2/opencv.hpp>

#include <bitset>

#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
#include <helper_cuda.h>       // helper functions for CUDA error checking and initialization

using namespace std;
using namespace cv;

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

Mat read_BMP_opencv(char* filename, int& w, int& h);
__global__ void sinc2D7K(uchar* buff, uchar* buffer_out, int w, int h, float offset_xx, float offset_yy);


int main()
{
	cudaEvent_t start, stop;
	float  elapsedTime;
	CUDA_SAFE_CALL(cudaEventCreate(&start));
	CUDA_SAFE_CALL(cudaEventCreate(&stop));
	int f_width, f_height;

	float offset[2] = { 0.1f, 0.2f };

	int margin = 2;
	int crop_size = 2048;
	const int stream_num = 32; // ceil(16384.0f / float(crop_size)) * ceil(8192.0f / float(crop_size))
	int n_iter = 50;

	cout << "crop size = " << crop_size << endl;
	cout << "number of streams = " << stream_num << endl;
	cout << "repeat = " << n_iter << endl;

	cudaStream_t stream[stream_num];

	for (int n = 0; n < stream_num; n++)
	{
		CUDA_SAFE_CALL(cudaStreamCreate(&stream[n]));
	}

	cv::Mat::setDefaultAllocator(cv::cuda::HostMem::getAllocator(cv::cuda::HostMem::AllocType::PAGE_LOCKED));

	Mat* result = new Mat[stream_num];
	Mat* img = new Mat[stream_num];
	uchar* *data = new uchar*[stream_num];
	char buf[256];

	for (int i = 0; i < stream_num; i++)
	{
		//sprintf(buf, "test_13k.bmp", i + 1);
		//sprintf(buf, "test_%d.bmp", crop_size);
		sprintf(buf, "input_images/test_2048_%d.bmp", i);
		img[i] = read_BMP_opencv(buf, f_width, f_height);
		data[i] = img[i].data;
	}

	uchar* *h_shifted = new uchar*[stream_num];
	uchar* *d_data = new uchar*[stream_num];
	uchar* *d_shifted = new uchar*[stream_num];

	for (int i = 0; i < stream_num; i++)
	{
		CUDA_SAFE_CALL(cudaMalloc((void**)&d_data[i], sizeof(uchar) * f_width * f_height));
		CUDA_SAFE_CALL(cudaMalloc((void**)&d_shifted[i], sizeof(uchar) * f_width * f_height));
		CUDA_SAFE_CALL(cudaMallocHost((void**)&h_shifted[i], sizeof(uchar) * f_width * f_height));
	}

	dim3 threadsPerBlock(16, 16, 1);
	dim3 numBlocks(int(f_width/threadsPerBlock.x), int(f_height / threadsPerBlock.y), 1);

	CUDA_SAFE_CALL(cudaEventRecord(start, 0));

	for (int i = 0; i < stream_num; i++)
	{
		// CPU -> GPU
		cudaMemcpyAsync(d_data[i], data[i], sizeof(uchar) * f_width * f_height, cudaMemcpyHostToDevice, stream[i]);

		sinc2D7K <<<numBlocks, threadsPerBlock, 0, stream[i]>>> (d_data[i], d_shifted[i], f_width, f_height, offset[0], offset[1]);

		// GPU -> CPU
		cudaMemcpyAsync(h_shifted[i], d_shifted[i], sizeof(uchar) * f_height * f_width, cudaMemcpyDeviceToHost, stream[i]);
	}
	cudaDeviceSynchronize();

	CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
	CUDA_SAFE_CALL(cudaEventSynchronize(stop));
	CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));

	cout << "width : " << f_width << ", height : " << f_height << endl;
	cout << "CUDA stream " << stream_num << "-way result" << endl;
	printf("Average Shift Time: %3.1f ms\n", elapsedTime);

	for (int i = 0; i < stream_num; i++)
	{
		result[i] = Mat(f_height, f_width, CV_8UC1);
		result[i].data = h_shifted[i];
		result[i] = result[i](Range(20, f_height - 20), Range(20, f_width - 20)).clone();

		sprintf(buf, "output_images/test3_shift_%d.bmp", i);
		imwrite(buf, result[i]);
	}

	for (int i = 0; i < stream_num; i++)
	{
		CUDA_SAFE_CALL(cudaFree(d_data[i]));
		CUDA_SAFE_CALL(cudaFree(d_shifted[i]));
		CUDA_SAFE_CALL(cudaFreeHost(h_shifted[i]));

		CUDA_SAFE_CALL(cudaStreamDestroy(stream[i]));
	}

	//waitKey(5000);

	return 0;
}

Mat read_BMP_opencv(char* filename, int& w, int& h)
{
	Mat input_img = imread(filename, 0);
	if (input_img.empty())
		throw "Argument Exception";

	// extract image height and width from header
	int width = input_img.cols;
	int height = input_img.rows;

	//cout << endl;
	//cout << "  Name: " << filename << endl;
	//cout << " Width: " << width << endl;
	//cout << "Height: " << height << endl;

	w = width;
	h = height;

	return input_img;
}

__global__ void sinc2D7K(uchar* buff, uchar* buffer_out, int w, int h, float offset_xx, float offset_yy)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	int width = w, height = h;

	if ((x >= 3 && x < width - 3) && (y >= 3 && y < height - 3))
	{
		float val = buff[width*(y - 3) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +
			buff[width*(y - 3) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((3.0f - offset_y)*pi) / ((3.0f - offset_y)*pi) +

			buff[width*(y - 2) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +
			buff[width*(y - 2) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((2.0f - offset_y)*pi) / ((2.0f - offset_y)*pi) +

			buff[width*(y - 1) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +
			buff[width*(y - 1) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((1.0f - offset_y)*pi) / ((1.0f - offset_y)*pi) +

			buff[width*y + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +
			buff[width*y + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((0.0f - offset_y)*pi) / ((0.0f - offset_y)*pi) +

			buff[width*(y + 1) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +
			buff[width*(y + 1) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((-1.0f - offset_y)*pi) / ((-1.0f - offset_y)*pi) +

			buff[width*(y + 2) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +
			buff[width*(y + 2) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((-2.0f - offset_y)*pi) / ((-2.0f - offset_y)*pi) +

			buff[width*(y + 3) + (x - 3)] * sinf((3.0f - offset_x)*pi) / ((3.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x - 2)] * sinf((2.0f - offset_x)*pi) / ((2.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x - 1)] * sinf((1.0f - offset_x)*pi) / ((1.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + x] * sinf((0.0f - offset_x)*pi) / ((0.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x + 1)] * sinf((-1.0f - offset_x)*pi) / ((-1.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x + 2)] * sinf((-2.0f - offset_x)*pi) / ((-2.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi) +
			buff[width*(y + 3) + (x + 3)] * sinf((-3.0f - offset_x)*pi) / ((-3.0f - offset_x)*pi) * sinf((-3.0f - offset_y)*pi) / ((-3.0f - offset_y)*pi);

		buffer_out[y * width + x] = (uchar)val;
	}
}

njuffa · December 6, 2021, 7:59am

Side remark: Unless you always compile with -use_fast_math, you would want to replace code like this:

with code like this:

sinpif (3.0f - offset_x)

astroboykhw · December 16, 2021, 1:19pm

It’s always on -use_fast_math. Thanks anyway. :)

Topic		Replies	Views
Overlapping CPU and GPU operations using streams. Total failure. Any help? CUDA Programming and Performance	6	6175	April 2, 2013
Don't observe overlapping behavior in streams CUDA Programming and Performance cuda , kernel	3	98	July 15, 2025
Can't get any concurrency on simple vector add across multi-GPU and streams CUDA Programming and Performance	17	6010	April 28, 2012
Stream Concurrency (or lack thereof) on GTX 480 CUDA Programming and Performance	6	2621	July 15, 2010
streams not overlapping CUDA Programming and Performance	1	1604	May 23, 2011
Time intervals and non-concurrent in multi streaming CUDA Programming and Performance cuda	6	736	April 6, 2023
Concurrent Kernel executions & Data Transfers CUDA Programming and Performance cuda	3	743	March 8, 2023
How to Overlap Data Transfers in CUDA C/C++ Technical Blog	23	2592	January 18, 2023
Cannot get any stream parallelism. CUDA Programming and Performance	13	1486	December 31, 2019
about streaming style sample code in Programming Guide ... why such a style? CUDA Programming and Performance	5	1500	January 23, 2009

Can't overlap streams

Related topics