Hi,
I have some CUDA code which uploads data to the device, performs some calculations and copies the result back. To speed this up, I copy the data in chunks and overlap the transfers with the computation by issuing everything asynchronously on multiple streams. The host memory is allocated as pinned memory.
The basic loop looks like this:
Stream 1: copy HtoD - compute - copy DtoH
Stream 2: copy HtoD - compute - copy DtoH
Stream 3: copy HtoD - compute - copy DtoH
Stream 1: copy HtoD - compute - copy DtoH
…
(The actual code is at the bottom of this post.)
I tested the code on several machines, but on some of them all operations seem to get serialized, despite being issued on different streams.
I have tested the following configurations:
1) Windows 11, RTX 4090, driver version 552.44, CUDA 12.4
All operations are serialized, as shown in the following Nsight Systems screenshot.
Nsight confirms that the host memory is indeed pinned. (A programmatic check for this is sketched further down, after the list of configurations.)
2) Windows 10, RTX 3080, driver version 536.25, CUDA 12.2
Same, all operations serialized.
3) Two different PCs with Ubuntu 24, RTX 4090, driver version 550, CUDA 12.4
Both of these have hardware identical to the first Windows PC; in fact, one of them is a dual-boot installation on that same machine.
Here, the operations run in parallel, as shown in the following Nsight screenshot.
From these findings I suspected a difference between Windows and Linux, but then I found a Windows PC where it actually works as intended:
4) Windows 10, RTX 2070 Super, CUDA 12.3
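For reference, a programmatic pinned-memory check along the following lines should report the same thing Nsight shows (a minimal sketch using cudaPointerGetAttributes; I am assuming a CUDA version recent enough that the attributes struct has a type field):
#include <cuda_runtime.h>
// Returns true if the CUDA driver knows the pointer as page-locked (pinned) host memory.
static bool is_pinned_host_memory(const void* ptr)
{
    cudaPointerAttributes attributes = {};
    if (cudaPointerGetAttributes(&attributes, ptr) != cudaSuccess)
    {
        return false;
    }
    return attributes.type == cudaMemoryTypeHost;
}
In the sample at the bottom, this could be called on host_data right after the cudaHostAlloc.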
Can anyone point me towards a solution for this problem? Is there some Windows configuration which prevents asynchronous memcpy/compute overlap? How can I make sure that the code runs as expected on a customer’s PC?
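For what it’s worth, a small check like the sketch below should also show whether the device reports dedicated copy engines at all, which are needed to overlap transfers with kernels (I am assuming asyncEngineCount is the relevant property here):
#include <cuda_runtime.h>
#include <iostream>
int main()
{
    // Query device 0 and print how many copy engines it reports.
    // 0 = no copy/compute overlap, 1 = one copy direction can overlap with kernels,
    // 2 = copies in both directions can overlap with kernels.
    cudaDeviceProp prop = {};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
    {
        return 1;
    }
    std::cout << prop.name << ": asyncEngineCount = " << prop.asyncEngineCount << "\n";
    return 0;
}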
Thank you very much in advance.
- Philipp
Here is a minimal code sample to reproduce my results:
#include <cuda_runtime.h>
#include <iostream>

static void check_error_(cudaError_t error, int line)
{
    if (error != cudaSuccess)
    {
        std::cerr << "CUDA error in line " << line << ":\n";
        std::cerr << cudaGetErrorString(error) << "\n";
        exit(1);
    }
}

#define check_error(error) check_error_(error, __LINE__)

static __global__ void compute_kernel(float* data, size_t count)
{
    size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= count)
    {
        return;
    }
    // Waste some time.
    float v = data[tid];
    for (int i = 0; i < 2048; ++i)
    {
        v = sqrtf(v);
    }
    data[tid] = v;
}

void test_async_memcpy()
{
    const size_t size = (size_t)1024 * 1024 * 1024;   // 1 Gi floats = 4 GiB of pinned host memory.
    const size_t chunk_size = (size_t)1024 * 1024;    // 1 Mi floats = 4 MiB per chunk.
    const size_t chunk_count = (size + chunk_size - 1) / chunk_size;
    const size_t stream_count = 3;

    // Allocate pinned host memory and initialize.
    float* host_data = nullptr;
    check_error(cudaHostAlloc(&host_data, size * sizeof(float), cudaHostAllocDefault));
    for (size_t i = 0; i < size; ++i)
    {
        host_data[i] = (float)i;
    }

    // Allocate device memory and create streams.
    float* device_chunk_data[stream_count] = {};
    cudaStream_t streams[stream_count] = {};
    for (size_t i = 0; i < stream_count; ++i)
    {
        check_error(cudaMalloc(&device_chunk_data[i], chunk_size * sizeof(float)));
        check_error(cudaStreamCreate(&streams[i]));
    }

    // Main loop.
    for (size_t chunk_index = 0; chunk_index < chunk_count; ++chunk_index)
    {
        float* device_data = device_chunk_data[chunk_index % stream_count];
        cudaStream_t stream = streams[chunk_index % stream_count];

        // Copy chunk to device.
        size_t offset = chunk_index * chunk_size;
        check_error(cudaMemcpyAsync(device_data, host_data + offset, chunk_size * sizeof(float), cudaMemcpyHostToDevice, stream));

        // Compute.
        int block_size = 128;
        int grid_size = (int)((chunk_size + block_size - 1) / block_size);
        compute_kernel<<<grid_size, block_size, 0, stream>>>(device_data, chunk_size);

        // Copy chunk back to host.
        check_error(cudaMemcpyAsync(host_data + offset, device_data, chunk_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    }
    check_error(cudaDeviceSynchronize());

    // Cleanup.
    for (size_t i = 0; i < stream_count; ++i)
    {
        check_error(cudaStreamDestroy(streams[i]));
        check_error(cudaFree(device_chunk_data[i]));
    }
    check_error(cudaFreeHost(host_data));
}

int main()
{
    test_async_memcpy();
}