CUDA streams for image processing

I want to use a pipelined algorithm where I can transfer an image to the GPU using one stream while the other stream is doing the computation task in a CUDA kernel.

Based upon some suggestions, I have the following code which is running without any error or crash but it has two major problems. First of all, it is making the PC very slow and taking a lot of time. Secondly, I think that the resulting processed image is not proper. Sometimes, I think that there are some patches on it which means it could not be processed completely.

I am adding a completely working example of my code but it will need OpenCV libraries. Please let me know, if there is some conceptual problem with my code.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstring>
#include <iostream>
#include <stdio.h>
#include <utility>

#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

using namespace cv;
using namespace std;

RNG randg(123456);
const cv::Size IMG_SIZE(1024, 1024);
int cur_frame;

// Fill every in-bounds pixel of an interleaved (e.g. BGR) image with the
// constant colour (R_val, G_val, B_val).
//
// d_cur       : device buffer of iw*ih*numChannels bytes, interleaved
// iw, ih      : image width / height in pixels
// numChannels : bytes per pixel (3 expected; only 3 channels are written)
// Expected launch: 2D grid of 2D blocks covering at least iw x ih threads.
__global__ void proc_kernel(unsigned char *d_cur, int iw, int ih, int numChannels, unsigned char R_val, unsigned char G_val, unsigned char B_val)
{
	// Calculate our pixel's location
	int Row = blockIdx.y*blockDim.y + threadIdx.y;
	int Column = blockIdx.x*blockDim.x + threadIdx.x;

	// Guard ALL three channel stores. Without braces only the first store
	// was bounds-checked, so out-of-range threads scribbled past the end
	// of the buffer — a likely source of the "patches" in the output.
	// (Row/Column are built from unsigned counters, so no >= 0 test needed.)
	if (Column < iw && Row < ih)
	{
		int idx = numChannels * (iw * Row + Column);
		d_cur[idx + 0] = R_val;
		d_cur[idx + 1] = G_val;
		d_cur[idx + 2] = B_val;
	}
}

// Simulate a camera capture: build a solid random-colour frame, show it,
// and copy its pixels into the caller-supplied host buffer h_cur.
// h_cur must point to at least IMG_SIZE.area()*3 bytes.
// Returns false while frames keep coming (true would mean "capture done").
bool captureImg(unsigned char *h_cur)
{
	Mat img(IMG_SIZE, CV_8UC3);
	img = cv::Scalar(randg.uniform(100,150),randg.uniform(150,200), randg.uniform(10,100));
	imshow("Original image", img);
	cv::waitKey(1); // give HighGUI a chance to actually paint the window

	// Copy the pixel DATA into the buffer. The original `h_cur =;`
	// only reassigned the local pointer, which is lost on return — the
	// caller's buffer was never filled. A freshly constructed CV_8UC3 Mat
	// is continuous, so one flat memcpy is safe here.
	memcpy(h_cur,, (size_t)img.channels() * img.rows * img.cols);

	//cout<<(int)cur_frame<<". Frame captured"<<endl;
	return false;
}
// Display the processed frame held in `img` (IMG_SIZE.area()*3 bytes,
// interleaved 8-bit, 3 channels). Returns true on success.
bool validate_image(unsigned char *img)
{
	// Wrap the caller's buffer directly in a Mat header (no copy) instead
	// of the original garbled pointer juggling (` may not be
	// reassigned like that). The Mat does NOT own the memory, so `img`
	// must stay alive while the window uses it — true here because the
	// pinned host buffers live for the whole program.
	Mat processedImg(IMG_SIZE, CV_8UC3, img);
	imshow("Processed Image", processedImg);
	cv::waitKey(1); // let HighGUI repaint

	return true;
}
// Stream callback fired once the preceding D2H copy in `stream` finishes;
// `data` carries the host pointer of the frame just copied back.
//
// NOTE(review): the CUDA runtime forbids calling any CUDA API from inside a
// stream callback, and the stream blocks until the callback returns — so a
// slow callback (imshow/waitKey does GUI work!) serializes the pipeline and
// is the most likely cause of the reported slowdown. Most GUI backends are
// also not safe to drive from this driver-owned thread. Prefer recording a
// cudaEvent after the D2H copy and calling validate_image from the host
// thread after cudaEventSynchronize.
void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void* data)
{
	validate_image((unsigned char *)data);
}
// Two-stream ping-pong pipeline: while stream A copies/processes frame N,
// the host captures frame N+1 into the buffer pair owned by stream B.
int main()
{
	bool done = false;

	unsigned char *h_imgA, *h_imgB, *d_imgA, *d_imgB;
	//Define a template image to initilize paramters for testing
	Mat paramImg(IMG_SIZE, CV_8UC3);
	paramImg = cv::Scalar(randg.uniform(150,250),randg.uniform(150,250), randg.uniform(150,250));

	//Image parameters
	size_t dsize = paramImg.channels() * paramImg.rows * paramImg.cols*sizeof(unsigned char);
	int iw = paramImg.cols;
	int ih = paramImg.rows;
	int numChannels = paramImg.channels();

	// Pinned (page-locked) host buffers are REQUIRED for real async
	// transfers: with pageable memory (plain new/malloc, or the commented-
	// out version of this code) cudaMemcpyAsync silently degrades to a
	// synchronous copy and no copy/compute overlap happens.
	cudaHostAlloc(&h_imgA, dsize, cudaHostAllocDefault);
	cudaHostAlloc(&h_imgB, dsize, cudaHostAllocDefault);

	//Allocate memories at Device
	cudaMalloc(&d_imgA, dsize);
	cudaMalloc(&d_imgB, dsize);

	// Streams must be CREATED before use — the original never called
	// cudaStreamCreate, so every stream argument was an uninitialized
	// handle (undefined behaviour / runtime errors).
	cudaStream_t st1, st2;
	cudaStreamCreate(&st1);
	cudaStreamCreate(&st2);
	cudaStream_t *curst = &st1;
	cudaStream_t *nxtst = &st2;

	//Set the process: "cur" is being processed by the GPU, "nxt" is being
	//captured by the host; the pairs swap every iteration.
	unsigned char *h_cur = h_imgA;
	unsigned char *h_nxt = h_imgB;
	unsigned char *d_cur = d_imgA;
	unsigned char *d_nxt = d_imgB;

	done = captureImg(h_cur);

	// One thread per PIXEL: each thread writes all three channels itself,
	// so the original numChannels factor launched 9x too many blocks,
	// all of whose extra threads just failed the bounds check.
	dim3 numThreads(16, 16);
	dim3 numBlocks((iw + numThreads.x - 1) / numThreads.x,
	               (ih + numThreads.y - 1) / numThreads.y);

	//Start the process (forum formatting had stripped the loop braces)
	while (!done)
	{
		unsigned char R_val = randg.uniform(10,100), G_val = randg.uniform(100,200), B_val = randg.uniform(200,250);

		//Copy source image from host to device
		cudaMemcpyAsync(d_cur, h_cur, dsize, cudaMemcpyHostToDevice, *curst);

		//Call CUDA kernel
		proc_kernel<<<numBlocks, numThreads, 0, *curst>>>(d_cur, iw, ih, numChannels, R_val, G_val, B_val);

		//Copy current image back to Host from Device
		cudaMemcpyAsync(h_cur, d_cur, dsize, cudaMemcpyDeviceToHost, *curst);

		// insert a cuda stream callback here to copy the cur frame to output
		cudaStreamAddCallback(*curst, &my_callback, (void *)h_cur, 0);
		cudaStreamSynchronize(*nxtst); // prevent overrun
		done = captureImg(h_nxt); // capture nxt image while GPU is processing cur

		// Swap host, device, AND stream pairs. The original did
		// `d_nxt = h_tmp;`, assigning a HOST pointer into the device
		// pointer — the next iteration then passed host memory to the
		// kernel/copies, corrupting the pipeline.
		std::swap(h_cur, h_nxt);
		std::swap(d_cur, d_nxt);
		std::swap(curst, nxtst);
	}

	// Drain both streams before tearing anything down, then release
	// every resource (the original leaked streams and all buffers).
	cudaStreamSynchronize(st1);
	cudaStreamSynchronize(st2);
	cudaStreamDestroy(st1);
	cudaStreamDestroy(st2);
	cudaFree(d_imgA);
	cudaFree(d_imgB);
	cudaFreeHost(h_imgA);
	cudaFreeHost(h_imgB);

	return 0;
}
just check: if i am not mistaken, (multiple) issued stream callbacks may serialize streams and remove their asynchronous nature

rather replace the callbacks with ordinary functions, triggered by events: record events, synchronize on the events, and then call the callback function, as an ordinary function

if (run > 1st)

in your case: prev = nxt;

if the stream callbacks are not the culprit, the host may be struggling to adapt its pace to that of the device
you can note this with the profiler, and check for potentially significant gaps due to synchronization
increasing the pending work generally solves this (increase the number of streams you issue work in)

Sorry “little_jimmy”, I understood that you want to say that my streams might not be running in parallel. But I did not understand your solution to deal with this situation. I also did not understand what you mean by


. I am sorry for that but it’s because of my little knowledge about CUDA.

i noted 2 possible solutions, the one carrying precedence over the other
i assume you do not entirely follow the 1st

to clarify:

//s_cnt = stream_cnt = 2 (your case)
//s_ptr = stream_ptr = cur (your case)
//prev_s_ptr = previous stream pointer or simply stream pointer to process - nxt in your case
// cnt = count or counter - number of times you have run the stream loop already

cudaEvent_t e[s_cnt];
cudaStream_t s[s_cnt];

cnt = 0;

while ()

cudaEventRecord(e[s_ptr], s[s_ptr]);

if (cnt > 0)

validate_image((unsigned char *)data);