@Robert_Crovella Thanks for your quick reply. I’ve double-confirmed that both streams are created successfully as non-default streams. I will show the corresponding code snippets below along with the logs:
the 1st thread is running:
} else if (src_device.Type() == OrtDevice::GPU) {
if (dst_device.Type() == OrtDevice::CPU) {
// copying from GPU to pinned memory, this is non-blocking
std::cout<<"before cudaMemcpyAsync() src is GPU dst is CPU, dst:"<<dst_data<<", src:"<<src_data<<", stream:"<<stream.GetHandle()<<"\n";
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, static_cast<cudaStream_t>(stream.GetHandle())));
std::cout<<"after cudaMemcpyAsync() src is GPU dst is CPU\n";
}
} else {
the 2nd thread is running:
std::cout<<"after NonZeroInclusivePrefixSum(), dst:"<<&nonzero_elements<<", src:"<<prefix_counts + number_of_blocks - 1<<", stream:"<<Stream(context)<<"\n";
// cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(
&nonzero_elements, prefix_counts + number_of_blocks - 1,
sizeof(int), cudaMemcpyDeviceToHost, Stream(context)));
std::cout<<"before fast_divmod()\n";
And the logs:
…
before cudaMemcpyAsync() src is GPU dst is CPU, dst:0x561aef23cec0, src:0x7fce93499400, stream:0x561ad89648a0
after cudaMemcpyAsync() src is GPU dst is CPU
after NonZeroInclusivePrefixSum(), dst:0x7fcebdffab30, src:0x7fce9349a600, stream:0x561ad89648a0
after NonZeroInclusivePrefixSum(), dst:0x7fce55ffcb30, src:0x7fce3a0a8500, stream:0x7fce48005950
before fast_divmod()
before cudaMemcpyAsync() src is GPU dst is CPU, dst:0x561ade1fa680, src:0x7fce3a5a4400, stream:0x561ad89648a0
You can see that neither the 1st thread (0x561ad89648a0) nor the 2nd thread (0x7fce48005950) prints out the log after its corresponding cudaMemcpyAsync().
I have a way to repro this issue consistently.
I should also mention that if we call cudaStreamSynchronize(stream) before cudaMemcpyAsync(), there is no hang.