I am building a framework that processes data on the device on behalf of the host using CUDA graphs. To hide the copy latency of the processed data, a memcpy node copies the results from the device back into the corresponding host buffer as soon as possible.
While testing I have noticed that data copied back to the host from inside a CUDA graph is not always correct. My theory is that stale values are being served from the host-side cache, because the host does not know that its memory was changed behind its back. Here is a minimal example that reproduces the observed behavior:
#include <cuda_runtime.h>

#include <cstdio>
#include <sstream>
#include <stdexcept>

#define CUDA_CHECK(err) \
    do { \
        cudaError_t err_ = (err); \
        if(err_ != cudaSuccess) { \
            std::stringstream msg; \
            msg << "CUDA error at " << __FILE__ << ":" << __LINE__ \
                << " code=" << static_cast<unsigned int>(err_) << "(" << cudaGetErrorName(err_) \
                << ") " << #err << std::endl; \
            throw std::runtime_error(msg.str()); \
        } \
    } while(0)

// Report how many elements of data do not hold the expected value
static inline void check(int *data, int value, size_t size, const char *message)
{
    size_t failed = 0;
    for(size_t idx = 0; idx < size; idx++) {
        if(data[idx] != value) {
            failed++;
        }
    }
    if(failed > 0)
        printf("%s: %zu out of %zu elements\n", message, failed, size);
}

int main()
{
    int *cached_h, *cached_d;
    size_t size = 1024, sizeInBytes = size * sizeof(*cached_h);
    CUDA_CHECK(cudaMallocHost(&cached_h, sizeInBytes));
    CUDA_CHECK(cudaMalloc(&cached_d, sizeInBytes));
    CUDA_CHECK(cudaMemset(cached_d, 0, sizeInBytes)); // Device memory stays 0 for the whole run

    // Prepare copying device memory back to host memory in a way that the host does not know about
    cudaGraph_t covertCopyGraph;
    CUDA_CHECK(cudaGraphCreate(&covertCopyGraph, 0));
    cudaGraphNode_t covertCopy;
    CUDA_CHECK(cudaGraphAddMemcpyNode1D(
        &covertCopy,
        covertCopyGraph,
        nullptr, // no dependencies
        0,
        cached_h, // dst
        cached_d, // src
        sizeInBytes,
        cudaMemcpyDeviceToHost));
    cudaGraphExec_t covertCopyExec;
    CUDA_CHECK(cudaGraphInstantiate(&covertCopyExec, covertCopyGraph));

    // Change host memory, then pull device memory covertly for the host
    for(size_t idx = 0; idx < size; idx++)
        cached_h[idx] = 2;
    CUDA_CHECK(cudaGraphLaunch(covertCopyExec, 0));
    // Verify the copy; it sometimes does not match, presumably because of the host cache
    check(cached_h, 0, size, "Data invalid after graph copy");

    // Change host memory, then pull device memory triggered from the host directly
    for(size_t idx = 0; idx < size; idx++)
        cached_h[idx] = 2;
    CUDA_CHECK(cudaMemcpy(cached_h, cached_d, sizeInBytes, cudaMemcpyDeviceToHost));
    // Verify the copy; it matches every time
    check(cached_h, 0, size, "Data invalid after manual copy");

    // Free resources
    CUDA_CHECK(cudaGraphDestroy(covertCopyGraph));
    CUDA_CHECK(cudaGraphExecDestroy(covertCopyExec));
    CUDA_CHECK(cudaFreeHost(cached_h));
    CUDA_CHECK(cudaFree(cached_d));
}
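For reference, I build the repro with plain nvcc and no extra flags, roughly like this (assuming the source file is named main.cu; nvcc's default output name a.out matches the invocations below):
$ nvcc main.cu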
When I execute the program, most of the time I get the following output:
$ ./a.out
Data invalid after graph copy: 1024 out of 1024 elements
When I execute the program while a stress test is running on the host processor, the output changes slightly, which suggests to me that the host cache is causing the irregular behavior (because the stress test contends for the cache):
$ ./a.out
Data invalid after graph copy: 791 out of 1024 elements
$ ./a.out
Data invalid after graph copy: 916 out of 1024 elements
$ ./a.out
Data invalid after graph copy: 957 out of 1024 elements
$ ./a.out
Data invalid after graph copy: 872 out of 1024 elements
$ ./a.out
Data invalid after graph copy: 1022 out of 1024 elements
$ ./a.out
Data invalid after graph copy: 1000 out of 1024 elements
Is there any way to avoid reading stale cached data on the host processor while still being able to copy data from the device back to the host inside a CUDA graph?
The nvcc compiler version is 12.0.76, and the system I am currently testing on is Debian Bullseye with an Intel Skylake CPU and a Turing GPU.