Can't achieve cudaMemcpyAsync and kernel concurrency

hi folks,

I’m writing a 360 video stitching application, and I’m running into a concurrency issue. The attached pic shows the serial version.

My plan is to assign the odd frames to one stream and the even frames to another, so that the memcpys and kernels can overlap between the two streams.

My code is below. The problem is that I can’t see the expected concurrency: all the memcpys (both H2D and D2H) and the kernels run serially. What did I miss?

cudaStream_t stream[2];

for (int i = 0; i < 2; ++i)
	cudaStreamCreate(&stream[i]);

...

// pin each source plane, then queue its async H2D copy on the current stream
for (int i = 0; i < src_n; ++i)
{
	h_arg.iimg[i] = inbuf[stream_idx][i];
	cudaHostRegister(src[i].y.data, src[i].y.width * src[i].y.height, cudaHostRegisterDefault);
	checkCudaErrors(cudaMemcpyAsync(h_arg.iimg[i].y.data, src[i].y.data, src[i].y.width * src[i].y.height, cudaMemcpyHostToDevice, stream[stream_idx]));
	cudaHostRegister(src[i].u.data, src[i].u.width * src[i].u.height, cudaHostRegisterDefault);
	checkCudaErrors(cudaMemcpyAsync(h_arg.iimg[i].u.data, src[i].u.data, src[i].u.width * src[i].u.height, cudaMemcpyHostToDevice, stream[stream_idx]));
	cudaHostRegister(src[i].v.data, src[i].v.width * src[i].v.height, cudaHostRegisterDefault);
	checkCudaErrors(cudaMemcpyAsync(h_arg.iimg[i].v.data, src[i].v.data, src[i].v.width * src[i].v.height, cudaMemcpyHostToDevice, stream[stream_idx]));
}
h_arg.oimg = outbuf[stream_idx];
cudaHostRegister(dst->y.data, dst->y.width * dst->y.height, cudaHostRegisterDefault);
cudaHostRegister(dst->u.data, dst->u.width * dst->u.height, cudaHostRegisterDefault);
cudaHostRegister(dst->v.data, dst->v.width * dst->v.height, cudaHostRegisterDefault);

kernel_stitch<<<dim_grid, dim_block, 0, stream[stream_idx]>>>(h_arg);

checkCudaErrors(cudaMemcpyAsync(dst->y.data, h_arg.oimg.y.data, dst->y.width * dst->y.height, cudaMemcpyDeviceToHost, stream[stream_idx]));
checkCudaErrors(cudaMemcpyAsync(dst->u.data, h_arg.oimg.u.data, dst->u.width * dst->u.height, cudaMemcpyDeviceToHost, stream[stream_idx]));
checkCudaErrors(cudaMemcpyAsync(dst->v.data, h_arg.oimg.v.data, dst->v.width * dst->v.height, cudaMemcpyDeviceToHost, stream[stream_idx]));

// sync and output previous frame (in previous stream)
uint32_t prev_idx = (stream_idx + stream_num - 1) % stream_num;
checkCudaErrors(cudaStreamSynchronize(stream[prev_idx]));

for (int i = 0; i < src_n; ++i)
{
	if (prev_inbuf[i].y.data)
		cudaHostUnregister(prev_inbuf[i].y.data);
	if (prev_inbuf[i].u.data)
		cudaHostUnregister(prev_inbuf[i].u.data);
	if (prev_inbuf[i].v.data)
		cudaHostUnregister(prev_inbuf[i].v.data);
	prev_inbuf[i] = src[i];
}
if (prev_outbuf.y.data)
	cudaHostUnregister(prev_outbuf.y.data);
if (prev_outbuf.u.data)
	cudaHostUnregister(prev_outbuf.u.data);
if (prev_outbuf.v.data)
	cudaHostUnregister(prev_outbuf.v.data);
prev_outbuf = *dst;

stream_idx = (stream_idx + 1) % stream_num;

I finally figured it out.

The TX1 has only one async copy engine, so the order in which work is issued to the streams is a little trickier than on devices with multiple copy engines.
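
You can confirm the number of copy engines at runtime by querying the device properties; a minimal sketch, assuming device 0:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	// 1 means a single copy engine shared by all streams (the TX1 case):
	// only one async memcpy can be in flight at a time, in either direction
	printf("asyncEngineCount = %d\n", prop.asyncEngineCount);
	return 0;
}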

The trick is well covered in this post:
https://devblogs.nvidia.com/parallelforall/how-overlap-data-transfers-cuda-cc/
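
In short: with a single copy engine, my original depth-first order (H2D, kernel, D2H per frame) puts frame N’s D2H in the one copy queue ahead of frame N+1’s H2D; that D2H can’t start until frame N’s kernel finishes, so everything serializes. Issuing breadth-first avoids this. A simplified sketch of the idea (process_frame, N_STREAMS, and the flat buffers are placeholders, not my real stitching code):

#include <cuda_runtime.h>

// placeholder kernel standing in for the real stitch kernel
__global__ void process_frame(const unsigned char* in, unsigned char* out, size_t n)
{
	size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
	if (i < n)
		out[i] = in[i];
}

int main()
{
	const int    N_STREAMS = 2;
	const size_t bytes     = 1920 * 1080;
	const int    threads   = 256;
	const int    blocks    = (int)((bytes + threads - 1) / threads);

	cudaStream_t  stream[N_STREAMS];
	unsigned char *h_in[N_STREAMS], *h_out[N_STREAMS];
	unsigned char *d_in[N_STREAMS], *d_out[N_STREAMS];

	for (int s = 0; s < N_STREAMS; ++s) {
		cudaStreamCreate(&stream[s]);
		cudaMallocHost(&h_in[s],  bytes);  // pinned host memory, required for
		cudaMallocHost(&h_out[s], bytes);  // cudaMemcpyAsync to be truly async
		cudaMalloc(&d_in[s],  bytes);
		cudaMalloc(&d_out[s], bytes);
	}

	// breadth-first issue order for a device with ONE copy engine:
	// all H2D copies first, then all kernels, then all D2H copies, so no
	// D2H sits in the single copy queue blocking a later stream's H2D
	for (int s = 0; s < N_STREAMS; ++s)
		cudaMemcpyAsync(d_in[s], h_in[s], bytes, cudaMemcpyHostToDevice, stream[s]);
	for (int s = 0; s < N_STREAMS; ++s)
		process_frame<<<blocks, threads, 0, stream[s]>>>(d_in[s], d_out[s], bytes);
	for (int s = 0; s < N_STREAMS; ++s)
		cudaMemcpyAsync(h_out[s], d_out[s], bytes, cudaMemcpyDeviceToHost, stream[s]);

	for (int s = 0; s < N_STREAMS; ++s)
		cudaStreamSynchronize(stream[s]);
	return 0;
}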

Hi,

Here is our CUDA documentation on concurrent kernel execution for your reference:
Programming Guide :: CUDA Toolkit Documentation
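
For illustration, here is a minimal sketch of the pattern that section describes (toy spin kernel, all names hypothetical): two independent kernels issued to different streams may run concurrently on devices that report concurrentKernels == 1.

#include <cuda_runtime.h>

// toy kernel that runs long enough for overlap to show up in a profiler
__global__ void spin(float* x, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n)
		for (int k = 0; k < 10000; ++k)
			x[i] = x[i] * 0.999f + 0.001f;
}

int main()
{
	const int n = 1 << 16;
	float *a, *b;
	cudaMalloc(&a, n * sizeof(float));
	cudaMalloc(&b, n * sizeof(float));

	cudaStream_t s0, s1;
	cudaStreamCreate(&s0);
	cudaStreamCreate(&s1);

	// no dependency between the streams, so the two launches are free to
	// execute concurrently on devices reporting concurrentKernels == 1
	spin<<<(n + 255) / 256, 256, 0, s0>>>(a, n);
	spin<<<(n + 255) / 256, 256, 0, s1>>>(b, n);

	cudaDeviceSynchronize();
	return 0;
}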