Nsight Compute (ncu) 2020.1.1 fails to profile a trivial CUDA graph

Any suggestions on how to fix this?
The same kernel, launched directly instead of through a CUDA graph, profiles just fine.
x64 Ubuntu 18.04
Driver Version: 440.82 CUDA Version: 10.2

//usr/local/cuda/bin/nvcc $0 --run; exit
#include <stdio.h>
// Number of int elements in each of the three arrays below (1<<20 = 1,048,576).
const int dataSetSize = 1<<20;
// Managed arrays: main() fills a and b on the host, AddOnTheGPU writes x on the
// device, and main() reads a, b and x back on the host.
__managed__ int x[dataSetSize], a[dataSetSize], b[dataSetSize];
// Element-wise add over the managed arrays: x[i] = a[i] + b[i].
// Expects a 1D launch; each thread handles the single element at
// threadIdx.x + blockIdx.x * blockDim.x, and threads whose index falls past
// dataSetSize do nothing, so the grid may be rounded up.
extern "C" __global__ void AddOnTheGPU() {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < dataSetSize)
        x[i] = a[i] + b[i];
}

// Reports a failed CUDA runtime call on stderr; returns true when the call succeeded.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills the inputs on the host, runs AddOnTheGPU once -- through a one-node
// CUDA graph, or through a plain launch when `if (1)` is changed to `if (0)` --
// and prints one sample sum. Returns 0 on success, 1 if any CUDA call fails.
int main() {
    for (int i=0; i<dataSetSize; i++) {
        a[i] = i;
        b[i] = dataSetSize - i;
    }
    int threadsPerBlock = 1024;
    // Ceil-divide so a partial last block is still launched.
    int numBlocks = (dataSetSize + threadsPerBlock-1)/threadsPerBlock;
    if (1) {
        cudaGraph_t graph = nullptr;
        cudaGraphExec_t graphExec = nullptr;
        cudaGraphNode_t node;
        // The kernel takes no arguments, so kernelParams and extra are both null.
        cudaKernelNodeParams nodeParams {(void*)AddOnTheGPU, dim3(numBlocks), dim3(threadsPerBlock), 0, nullptr, nullptr };
        // Short-circuit chain: the first failing step is reported and the rest are skipped.
        bool ok = cudaOk(cudaGraphCreate(&graph, 0), "cudaGraphCreate")
               && cudaOk(cudaGraphAddKernelNode(&node, graph, nullptr, 0, &nodeParams), "cudaGraphAddKernelNode")
               && cudaOk(cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0), "cudaGraphInstantiate")
               && cudaOk(cudaGraphLaunch(graphExec, 0), "cudaGraphLaunch")
               // The launch is asynchronous: wait before destroying the graph
               // objects and before the host reads x.
               && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        if (graphExec)
            cudaGraphExecDestroy(graphExec);
        if (graph)
            cudaGraphDestroy(graph);
        if (!ok)
            return 1;
    } else {
        // Plain launch of the same kernel, without a graph.
        AddOnTheGPU<<<numBlocks, threadsPerBlock>>>();
        if (!cudaOk(cudaGetLastError(), "AddOnTheGPU launch"))
            return 1;
        // Wait for the kernel to finish before the host reads x.
        if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
            return 1;
    }
    printf("%d + %d = %d\n", a[123456], b[123456], x[123456]);
    return 0;
}


sudo /usr/local/NVIDIA-Nsight-Compute/nv-nsight-cu-cli --target-processes all  ./a.out
==PROF== Connected to process 18636 (/home/cory/sim/ComputeGraph/standalone/a.out)
==PROF== Profiling "AddOnTheGPU" - 1: 0%.
==ERROR== Failed to profile kernel "AddOnTheGPU" in process 18636
==ERROR== The application returned an error code (11).
==ERROR== An error occurred while trying to profile.
==WARNING== No kernels were profiled.

For CUDA graph applications built with CUDA 10.2 and running on an r440 driver, please try Nsight Compute 2019.5, which ships with CUDA 10.2.

In general, using a newer Nsight Compute version than your toolkit and driver will work, but for CUDA graphs there are more limitations.

1 Like