hi folks,
I’m writing a 360 video stitching application, and having a concurrency issue. The attatched pic shows the serial version.
My plan is to assign the odd frames to one stream, and even frames to another stream, thus the memcpy and kernel can be parallel between 2 streams.
My code is below. The problem is that I can’t see the expected concurrency, all the memcpy (both D2H, H2D) and kernel runs serially. What did I miss?
cudaStream_t stream[2];
for (int i = 0; i < 2; ++i)
cudaStreamCreate(&stream[i]);
...
for (int i = 0; i < src_n; ++i)
{
h_arg.iimg[i] = inbuf[stream_idx][i];
cudaHostRegister(src[i].y.data, src[i].y.width * src[i].y.height, cudaHostRegisterDefault);
checkCudaErrors(cudaMemcpyAsync(h_arg.iimg[i].y.data, src[i].y.data, src[i].y.width * src[i].y.height, cudaMemcpyHostToDevice, stream[stream_idx]));
cudaHostRegister(src[i].u.data, src[i].u.width * src[i].u.height, cudaHostRegisterDefault);
checkCudaErrors(cudaMemcpyAsync(h_arg.iimg[i].u.data, src[i].u.data, src[i].u.width * src[i].u.height, cudaMemcpyHostToDevice, stream[stream_idx]));
cudaHostRegister(src[i].v.data, src[i].v.width * src[i].v.height, cudaHostRegisterDefault);
checkCudaErrors(cudaMemcpyAsync(h_arg.iimg[i].v.data, src[i].v.data, src[i].v.width * src[i].v.height, cudaMemcpyHostToDevice, stream[stream_idx]));
}
h_arg.oimg = outbuf[stream_idx];
cudaHostRegister(dst->y.data, dst->y.width * dst->y.height, cudaHostRegisterDefault);
cudaHostRegister(dst->u.data, dst->u.width * dst->u.height, cudaHostRegisterDefault);
cudaHostRegister(dst->v.data, dst->v.width * dst->v.height, cudaHostRegisterDefault);
kernel_stitch << <dim_grid, dim_block, 0 , stream[stream_idx]>> > (h_arg);
checkCudaErrors(cudaMemcpyAsync(dst->y.data, h_arg.oimg.y.data, dst->y.width * dst->y.height, cudaMemcpyDeviceToHost, stream[stream_idx]));
checkCudaErrors(cudaMemcpyAsync(dst->u.data, h_arg.oimg.u.data, dst->u.width * dst->u.height, cudaMemcpyDeviceToHost, stream[stream_idx]));
checkCudaErrors(cudaMemcpyAsync(dst->v.data, h_arg.oimg.v.data, dst->v.width * dst->v.height, cudaMemcpyDeviceToHost, stream[stream_idx]));
// sync and output previous frame (in previous stream)
uint32_t prev_idx = (stream_idx + stream_num - 1) % stream_num;
checkCudaErrors(cudaStreamSynchronize(stream[prev_idx]));
for (int i = 0; i < src_n; ++i)
{
if (prev_inbuf[i].y.data)
cudaHostUnregister(prev_inbuf[i].y.data);
if (prev_inbuf[i].u.data)
cudaHostUnregister(prev_inbuf[i].u.data);
if (prev_inbuf[i].v.data)
cudaHostUnregister(prev_inbuf[i].v.data);
prev_inbuf[i] = src[i];
}
if (prev_outbuf.y.data)
cudaHostUnregister(prev_outbuf.y.data);
if (prev_outbuf.u.data)
cudaHostUnregister(prev_outbuf.u.data);
if (prev_outbuf.v.data)
cudaHostUnregister(prev_outbuf.v.data);
prev_outbuf = *dst;
stream_idx = (stream_idx + 1) % stream_num;