I just started using CUDA, and I learned that cudaMemcpyAsync needs pinned (page-locked) host memory in order to overlap with kernel execution. However, I found that when my program only performs H2D (host-to-device) copies and kernel launches, the copies still overlap with the kernels even when the host buffer is pageable memory. I am wondering whether I have misunderstood something.
The code looks like this:
// Zeroes the host buffer `a`, then for each of the `nStreams` streams enqueues
// a host-to-device copy of that stream's chunk followed by a `kernel` launch
// on the same chunk, and finally waits for every stream to finish.
//
// Parameters:
//   a - host buffer; the first `bytes` bytes are cleared, and chunk i
//       (`streamBytes` bytes starting at element i * streamSize) is copied to
//       the same element offset in `d_a`.
//
// Returns:
//   cudaSuccess if every copy, launch and synchronization succeeded, otherwise
//   the first error reported by the CUDA runtime. Callers that ignore the
//   result behave as they did when this lambda returned nothing.
//
// NOTE(review): if `a` points to pageable (non-pinned) host memory, the CUDA
// runtime documentation describes the H2D copy as being staged through an
// internal buffer, so the transfer may still overlap with kernels running in
// other streams even though the buffer is not pinned. Confirm against the
// "API synchronization behavior" section for the CUDA version in use.
auto async_func1 = [&](float *a) -> cudaError_t
{
    cudaError_t status = cudaSuccess;

    memset(a, 0, bytes);

    // Issue copy + kernel per stream; stop enqueuing new work after the first
    // failure so later calls do not hide the original error.
    for (int i = 0; i < nStreams; ++i)
    {
        const int offset = i * streamSize;

        status = cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes,
                                 cudaMemcpyHostToDevice, stream[i]);
        if (status != cudaSuccess)
        {
            break;
        }

        // Presumably streamSize is a multiple of blockSize; otherwise the
        // integer division truncates and the launch has fewer than streamSize
        // threads. TODO confirm.
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);

        // A kernel launch returns no status itself; launch-configuration
        // errors are only visible through cudaGetLastError().
        status = cudaGetLastError();
        if (status != cudaSuccess)
        {
            break;
        }
    }

    // Always drain every stream, even after a failure, so that no work queued
    // by this call is still in flight when it returns. Keep the first error.
    for (int i = 0; i < nStreams; ++i)
    {
        const cudaError_t syncStatus = cudaStreamSynchronize(stream[i]);
        if (status == cudaSuccess)
        {
            status = syncStatus;
        }
    }

    return status;
};
