@njuffa
This blog post reports speedups of up to a factor of 1.6: https://developer.nvidia.com/blog/constructing-cuda-graphs-with-dynamic-parameters/
However, CUDA graphs only make sense if the graph can be reused multiple times to amortize the initial setup costs.
In my toy example (below) with a simple linear graph, I see the launch overhead reduced by a factor of 2.3. (Device idle time between kernels is reduced as well, and the multi-GPU launch appears to be interleaved automatically.)
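The pattern that pays off is therefore: build (or capture) and instantiate the graph once, then relaunch the same executable graph every iteration. A minimal sketch of that reuse pattern (enqueueStep and numSteps are placeholders, not part of the example below):

cudaGraph_t graph;
cudaGraphExec_t execGraph;
cudaStream_t stream;
cudaStreamCreate(&stream);

cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
enqueueStep(stream); // hypothetical: enqueue one step's work into the capturing stream
cudaStreamEndCapture(stream, &graph);
cudaGraphInstantiate(&execGraph, graph, nullptr, nullptr, 0); // setup cost paid once
cudaGraphDestroy(graph);

for(int step = 0; step < numSteps; step++){
    cudaGraphLaunch(execGraph, stream); // cheap relaunch; this is what gets amortized
}
cudaStreamSynchronize(stream);
cudaGraphExecDestroy(execGraph);

The full example: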
// nvcc -arch=sm_70 -g -O3 main.cu -lnvToolsExt -o main
#include <thread>
#include <future>
#include <chrono>
#include <array>
#include <vector>
#include <cassert>
#include <iostream>
#include <cstring>
#include <string>
#include <cstdint>
#include <nvToolsExt.h>
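
// Toggle: when defined, each kernel launch is followed by a device-to-host copy
// and a host-function callback that records the result.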
//#define WITH_CALLBACK
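
// NVTX helpers: colored ranges make the three variants easy to tell apart in the profiler timeline.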
void push_range(const std::string& name, int cid){
    const uint32_t colors_[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff};
    const int num_colors_ = sizeof(colors_)/sizeof(uint32_t);
    const int color_id = cid % num_colors_;
    nvtxEventAttributes_t eventAttrib;
    std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
    eventAttrib.version = NVTX_VERSION;
    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    eventAttrib.colorType = NVTX_COLOR_ARGB;
    eventAttrib.color = colors_[color_id];
    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
    eventAttrib.message.ascii = name.c_str();
    nvtxRangePushEx(&eventAttrib);
}

void pop_range(){
    nvtxRangePop();
}
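
// Trivial payload kernel: with a single thread writing one int, launch overhead dominates.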
__global__
void kernel(int* data){
    *data = 42;
}
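
// Host callback used in the WITH_CALLBACK configuration: it reads the value that the
// preceding memcpy placed in the pinned buffer and appends it to a per-device vector.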
struct CallbackData{
    int* pinnedBuffer;
    std::vector<int>* vec;
};

void callback(void* args){
    push_range("callback", 3);
    CallbackData* data = static_cast<CallbackData*>(args);
    data->vec->push_back(*data->pinnedBuffer);
    pop_range();
}
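
// The benchmark: run numIterations tiny kernels per device three ways (plain streams,
// stream-captured graph, manually constructed graph) and compare via the NVTX ranges.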
int main(){
    constexpr int numDevices = 2;
    std::array<int, numDevices> deviceIds{0,1};
    constexpr int numIterations = 100;

    std::array<cudaStream_t, numDevices> streams{};
    std::array<cudaEvent_t, numDevices> events{};
    std::array<int*, numDevices> deviceBuffers{};
    std::array<int*, numDevices> pinnedBuffers{};
    std::array<std::vector<int>, numDevices> vectors{};
    std::array<CallbackData, numDevices> callbackArgs{};

    // Per-device resources: one stream, one event, one device int, one pinned host int.
    for(int i = 0; i < numDevices; i++){
        cudaSetDevice(deviceIds[i]);
        cudaStreamCreate(&streams[i]);
        cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming);
        cudaMalloc(&deviceBuffers[i], sizeof(int));
        cudaMallocHost(&pinnedBuffers[i], sizeof(int));
        vectors[i].reserve(numIterations);
        callbackArgs[i].pinnedBuffer = pinnedBuffers[i];
        callbackArgs[i].vec = &vectors[i];
    }

    // The main stream on device 0 forks work out to the per-device streams and joins it again.
    cudaSetDevice(deviceIds[0]);
    cudaStream_t mainstream;
    cudaStreamCreate(&mainstream);
    cudaEvent_t mainevent;
    cudaEventCreateWithFlags(&mainevent, cudaEventDisableTiming);
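
    // launch() enqueues the whole workload: an event recorded on mainstream gates every
    // device stream, each device stream runs its kernels, and mainstream then waits on a
    // per-device event, so fork and join happen without host-side synchronization.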
    auto launch = [&](){
        cudaEventRecord(mainevent, mainstream);
        for(int i = 0; i < numDevices; i++){
            cudaSetDevice(deviceIds[i]);
            auto& stream = streams[i];
            cudaStreamWaitEvent(stream, mainevent);
            for(int k = 0; k < numIterations; k++){
                kernel<<<1,1,0,stream>>>(deviceBuffers[i]);
#ifdef WITH_CALLBACK
                cudaMemcpyAsync(pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost, stream);
                cudaLaunchHostFunc(stream, callback, (void*)&callbackArgs[i]);
#endif
            }
            cudaEventRecord(events[i], stream);
            cudaStreamWaitEvent(mainstream, events[i]);
        }
        cudaSetDevice(deviceIds[0]);
    };
    // Variant 1: plain stream launches (baseline).
    push_range("no graph", 0);
    launch();
    cudaStreamSynchronize(mainstream);
    pop_range();

#ifdef WITH_CALLBACK
    for(int i = 0; i < numDevices; i++){
        assert(vectors[i].size() == numIterations);
        for(auto x : vectors[i]){
            assert(x == 42);
        }
        vectors[i].clear();
    }
#endif
    // Variant 2: record the same launch() calls into a graph via stream capture,
    // instantiate the graph once, then launch the executable graph.
    {
        push_range("stream capture graph", 1);
        cudaStreamBeginCapture(mainstream, cudaStreamCaptureModeRelaxed);
        launch();
        cudaGraph_t graph;
        cudaStreamEndCapture(mainstream, &graph);

        cudaGraphExec_t execGraph;
        cudaGraphNode_t errorNode;
        cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
        assert(status == cudaSuccess);
        cudaGraphDestroy(graph);

        cudaGraphLaunch(execGraph, mainstream);
        cudaStreamSynchronize(mainstream);

#ifdef WITH_CALLBACK
        for(int i = 0; i < numDevices; i++){
            assert(vectors[i].size() == numIterations);
            for(auto x : vectors[i]){
                assert(x == 42);
            }
            vectors[i].clear();
        }
#endif

        cudaGraphExecDestroy(execGraph);
        pop_range();
    }
    // Variant 3: build the same linear per-device chain explicitly with the graph API.
    {
        push_range("manual graph", 0);
        cudaGraph_t graph;
        cudaGraphCreate(&graph, 0);

        for(int i = 0; i < numDevices; i++){
            // Kernel nodes are bound to the device that is current when they are added.
            cudaSetDevice(deviceIds[i]);
            cudaGraphNode_t* prev = nullptr;
            cudaGraphNode_t kernelNode;
            cudaGraphNode_t memcpyNode;
            cudaGraphNode_t hostNode;

            cudaKernelNodeParams kernelNodeParams{};
            kernelNodeParams.func = (void*)kernel;
            kernelNodeParams.gridDim = dim3(1, 1, 1);
            kernelNodeParams.blockDim = dim3(1, 1, 1);
            kernelNodeParams.sharedMemBytes = 0;
            void* kernelArgs[1] = {(void*)&deviceBuffers[i]};
            kernelNodeParams.kernelParams = kernelArgs;
            kernelNodeParams.extra = NULL;

            cudaHostNodeParams hostNodeParams{};
            hostNodeParams.fn = callback;
            hostNodeParams.userData = &callbackArgs[i];

            // Chain the nodes: each new node depends on the previous one
            // (prev == nullptr for the first node, which has no dependencies).
            for(int k = 0; k < numIterations; k++){
                cudaGraphAddKernelNode(&kernelNode, graph, prev, (prev == nullptr ? 0 : 1), &kernelNodeParams);
                prev = &kernelNode;
#ifdef WITH_CALLBACK
                cudaGraphAddMemcpyNode1D(&memcpyNode, graph, &kernelNode, 1, pinnedBuffers[i], deviceBuffers[i], sizeof(int), cudaMemcpyDeviceToHost);
                cudaGraphAddHostNode(&hostNode, graph, &memcpyNode, 1, &hostNodeParams);
                prev = &hostNode;
#endif
            }
            cudaSetDevice(deviceIds[0]);
        }
        cudaGraphExec_t execGraph;
        cudaGraphNode_t errorNode;
        cudaError_t status = cudaGraphInstantiate(&execGraph, graph, &errorNode, nullptr, 0);
        assert(status == cudaSuccess);
        cudaGraphDestroy(graph);

        cudaGraphLaunch(execGraph, mainstream);
        cudaStreamSynchronize(mainstream);

#ifdef WITH_CALLBACK
        for(int i = 0; i < numDevices; i++){
            assert(vectors[i].size() == numIterations);
            for(auto x : vectors[i]){
                assert(x == 42);
            }
            vectors[i].clear();
        }
#endif

        cudaGraphExecDestroy(execGraph);
        pop_range();
    }
    cudaEventDestroy(mainevent);
    cudaStreamDestroy(mainstream);
    for(int i = 0; i < numDevices; i++){
        cudaSetDevice(deviceIds[i]);
        cudaStreamDestroy(streams[i]);
        cudaEventDestroy(events[i]);
        cudaFree(deviceBuffers[i]);
        cudaFreeHost(pinnedBuffers[i]);
    }
}