Hello,
I am trying to use a cudaGraphExec_t to represent the computations of a single iteration of an iterative algorithm, and then launch that cudaGraphExec_t once for each iteration the algorithm should run.
Unfortunately, this only seems to work correctly if I call cudaStreamSynchronize(stream) on the stream that the graph exec is launched into between the graph launches, which causes unnecessary synchronization overhead. If I omit the synchronization, the results are incorrect. According to my understanding of the CUDA documentation, this synchronization should not be necessary: section 3.2.6.6.5 (Using Graph APIs) of the CUDA C Programming Guide states:
A cudaGraphExec_t cannot run concurrently with itself. A launch of a cudaGraphExec_t will be ordered after previous launches of the same executable graph.
Thus I would not expect any concurrency issues in this situation, since multiple launches of the same cudaGraphExec_t should be ordered to run one after another. However, the actual behavior seems to be that multiple launches of the same cudaGraphExec_t do run concurrently with each other, which causes the incorrect results.
Am I doing or understanding something wrong here, or is there an issue with CUDA?
Below is a full example code to demonstrate the problem. The code builds a graph with three nodes (see the stream-equivalent sketch after this list):
- memset() a float to zero.
- Run a kernel with 2 * 1024 threads, where each CUDA thread increments the float by 1, and one thread prints “This is a device printf() to make this kernel call take more time.”
- Run a kernel with a single CUDA thread that prints the float’s value.
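For reference, a single iteration of this graph is meant to correspond roughly to the following plain stream code (just a sketch, using the kernels and the deviceFloat allocation from the full example further below):

cudaMemsetAsync(deviceFloat, 0, sizeof(float), stream);   // node 1: zero the float
AccumulationKernel<<<2, 1024, 0, stream>>>(deviceFloat);  // node 2: 2 * 1024 increments
PrintKernel<<<1, 1, 0, stream>>>(deviceFloat);            // node 3: print the value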
Node 1 has no dependencies, while each following node depends on the previous node. The graph (exec) is launched twice. Given that a cudaGraphExec_t should not run concurrently with itself, I would expect that all nodes run in the order 1., 2., 3., 1., 2., 3. without overlap and the correct output should be:
This is a device printf() to make this kernel call take more time.
Value of deviceFloat: 2048.000000
This is a device printf() to make this kernel call take more time.
Value of deviceFloat: 2048.000000
However, without running cudaStreamSynchronize(stream) between the two launches of the graph exec, there are different (non-deterministic) outputs, for example:
This is a device printf() to make this kernel call take more time.
This is a device printf() to make this kernel call take more time.
Value of deviceFloat: 2048.000000
Value of deviceFloat: 2080.000000
or
This is a device printf() to make this kernel call take more time.
This is a device printf() to make this kernel call take more time.
Value of deviceFloat: 4064.000000
Value of deviceFloat: 4096.000000
So, the two launches of the graph exec are obviously running in parallel, which seems to contradict the documentation, if I understand it correctly.
Here is the code:
#include <iostream>
#include <memory>
// Standard CUDA error checking macro
#define CUDA_CHECKED_CALL(cuda_call) \
  do { \
    cudaError error = (cuda_call); \
    if (cudaSuccess != error) { \
      std::cout << "Cuda Error: " << cudaGetErrorString(error) << std::endl; \
    } \
  } while(false)
__global__ void AccumulationKernel(float* deviceFloat) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    printf("This is a device printf() to make this kernel call take more time.\n");
  }
  // Inefficient accumulation, just for testing
  atomicAdd(deviceFloat, 1.f);
}

__global__ void PrintKernel(float* deviceFloat) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    printf("Value of deviceFloat: %f\n", static_cast<double>(*deviceFloat));
  }
}
int main(int /*argc*/, char** /*argv*/) {
  // Allocate memory for a float number
  float* deviceFloat;
  CUDA_CHECKED_CALL(cudaMalloc(&deviceFloat, sizeof(float)));

  // Create a CUDA graph
  cudaGraph_t graph;
  CUDA_CHECKED_CALL(cudaGraphCreate(&graph, 0));

  // First graph node: Set deviceFloat to zero via a memset. No dependencies.
  cudaMemsetParams memsetParams{};
  memsetParams.dst = deviceFloat;
  memsetParams.elementSize = 1;
  memsetParams.width = 1 * sizeof(float);
  memsetParams.height = 1;
  cudaGraphNode_t memsetNode;
  CUDA_CHECKED_CALL(cudaGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &memsetParams));

  // Second graph node: Accumulate some values onto deviceFloat in a kernel call. Depends on memsetNode.
  const void* accumParams[] = {
      &deviceFloat};
  cudaKernelNodeParams params{};
  params.func = reinterpret_cast<void*>(&AccumulationKernel);
  params.gridDim = dim3(2);
  params.blockDim = dim3(1024);
  params.kernelParams = const_cast<void**>(accumParams);
  cudaGraphNode_t accumNode;
  CUDA_CHECKED_CALL(cudaGraphAddKernelNode(&accumNode, graph, /*dependencies*/ &memsetNode, /*dependenciesCount*/ 1, &params));

  // Third graph node: Print the value of deviceFloat. Depends on accumNode.
  const void* printParams[] = {
      &deviceFloat};
  params.func = reinterpret_cast<void*>(&PrintKernel);
  params.gridDim = dim3(1);
  params.blockDim = dim3(1);
  params.kernelParams = const_cast<void**>(printParams);
  cudaGraphNode_t printNode;
  CUDA_CHECKED_CALL(cudaGraphAddKernelNode(&printNode, graph, /*dependencies*/ &accumNode, /*dependenciesCount*/ 1, &params));

  // Create graph exec and destroy the graph
  cudaGraphExec_t graphExec;
  char errorLog[512] = {0};
  cudaGraphNode_t errorNode = nullptr;
  cudaError_t error = cudaGraphInstantiate(&graphExec, graph, &errorNode, errorLog, 512);
  if (error != cudaSuccess) {
    errorLog[511] = 0;
    std::cout << "Cuda error in cudaGraphInstantiate(): " << cudaGetErrorString(error) << "\n"
              << "Error log: " << errorLog << "\n"
              << "Error node: " << errorNode << std::endl;
  }
  CUDA_CHECKED_CALL(cudaGraphDestroy(graph));

  // Create a stream
  cudaStream_t stream;
  CUDA_CHECKED_CALL(cudaStreamCreate(&stream));

  // Launch the graph exec two times in the stream
  for (int i = 0; i < 2; ++i) {
    CUDA_CHECKED_CALL(cudaGraphLaunch(graphExec, stream));
    // Uncomment this to get correct behavior:
    // cudaStreamSynchronize(stream);
  }

  // Wait for all device operations to complete
  cudaDeviceSynchronize();

  // Clean up
  cudaGraphExecDestroy(graphExec);
  cudaStreamDestroy(stream);
  cudaFree(deviceFloat);
  return 0;
}
I am on Manjaro Linux, using a GeForce GTX 1080, and the NVIDIA driver version is 460.39.