Recently I came across the following documentation for CUDA 11.4 regarding the synchronization behavior of cudaMemcpy* calls (CUDA Runtime API :: CUDA Toolkit Documentation), which mentions:
However, in my experience cudaMemcpyAsync for a host-to-device transfer of pageable memory always blocks the calling host thread until the transfer has finished. A simple test program with CUDA 11.4 does not show an asynchronous copy involving a worker thread.
#include <cstddef>
#include <vector>

// Empty kernel, used only to put some work on a stream.
__global__ void kernel2() {}

int main() {
    cudaStream_t stream1; cudaStreamCreate(&stream1);
    cudaStream_t stream2; cudaStreamCreate(&stream2);

    constexpr std::size_t size = 16 * 1024 * 1024;

    char* d_array; cudaMalloc(&d_array, size);      // device buffer
    char* h_array; cudaMallocHost(&h_array, size);  // pinned host buffer
    std::vector<char> vec(size);                    // pageable host buffer

    // Copy from pageable host memory.
    {
        cudaMemcpyAsync(d_array, vec.data(), size, cudaMemcpyHostToDevice, stream1);
        kernel2<<<1, 1, 0, stream2>>>();
        kernel2<<<1, 1, 0, stream1>>>();
        cudaDeviceSynchronize();
    }

    // Copy from pinned host memory.
    {
        cudaMemcpyAsync(d_array, h_array, size, cudaMemcpyHostToDevice, stream1);
        kernel2<<<1, 1, 0, stream2>>>();
        kernel2<<<1, 1, 0, stream1>>>();
        cudaDeviceSynchronize();
    }

    cudaDeviceReset();
}
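For reference, here is a minimal sketch of how one could measure the host-side behavior of the call itself. It times how long the cudaMemcpyAsync call takes to return versus how long it takes until the stream has finished, once for a pageable source and once for a pinned source. The buffer size and the time_copy helper are just illustrative choices, not anything from the documentation.

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Arbitrary size, chosen large enough that the timing difference is visible.
    constexpr std::size_t size = 64 * 1024 * 1024;

    cudaStream_t stream; cudaStreamCreate(&stream);

    char* d_array;  cudaMalloc(&d_array, size);       // device buffer
    char* h_pinned; cudaMallocHost(&h_pinned, size);  // pinned host buffer
    std::vector<char> h_pageable(size);               // pageable host buffer

    // Hypothetical helper: time the cudaMemcpyAsync call itself and the
    // total time until the stream has drained.
    auto time_copy = [&](const char* label, const char* src) {
        auto t0 = std::chrono::steady_clock::now();
        cudaMemcpyAsync(d_array, src, size, cudaMemcpyHostToDevice, stream);
        auto t1 = std::chrono::steady_clock::now();  // time spent inside the call
        cudaStreamSynchronize(stream);
        auto t2 = std::chrono::steady_clock::now();  // time until the copy completed
        std::printf("%s: call returned after %.3f ms, copy complete after %.3f ms\n",
                    label,
                    std::chrono::duration<double, std::milli>(t1 - t0).count(),
                    std::chrono::duration<double, std::milli>(t2 - t0).count());
    };

    time_copy("pageable", h_pageable.data());
    time_copy("pinned  ", h_pinned);

    cudaFreeHost(h_pinned);
    cudaFree(d_array);
    cudaStreamDestroy(stream);
}

If the call were handed off to a worker thread, I would expect the "call returned" time for the pageable case to be much smaller than the "copy complete" time; what I observe instead is that both are essentially equal for the pageable source, while the pinned source returns almost immediately.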
Do I misunderstand the document? Can someone show a case where an extra thread performs the copy?