Unknown error when attempting to profile a kernel with CUDA/OpenGL interop

Nsight Compute fails to profile a simple CUDA kernel when OpenGL interop is used.
Here is what it looks like in the user interface:

Here is a minimal example:


#include "gl.h"
#include <GLFW/glfw3.h>

#include <cuda_gl_interop.h>
#include <cstdio>
#include <iostream>
#include <stdexcept>

/**
 * Reports a failed CUDA call and terminates the process.
 *
 * Does nothing when `result` converts to false. Otherwise prints the call
 * site, the numeric code, the name returned by cudaGetErrorName(), and the
 * text of the failing expression to stderr, then exits immediately with
 * EXIT_FAILURE via _Exit().
 *
 * @param result Status value returned by the checked call.
 * @param func   Text of the checked expression.
 * @param file   Source file of the call site.
 * @param line   Source line of the call site.
 */
template <typename T>
void check(T result, char const *const func, const char *const file, int const line) {
    if (result) {
        // The code is cast to unsigned int, so it must be printed with %u
        // (the previous %d did not match the argument type).
        fprintf(stderr, "CUDA error at %s:%d code=%u(%s) \"%s\" \n", file, line,
                static_cast<unsigned int>(result), cudaGetErrorName(result), func);
        _Exit(EXIT_FAILURE);
    }
}

// Evaluates `val` and passes its result to check() together with the
// stringified expression and the file/line of the call site.
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

/**
 * Sets up the window, the OpenGL context and the CUDA device.
 *
 * Steps:
 *   1. Initializes GLFW and creates an 800x600 window requesting an
 *      OpenGL 4.6 core-profile context.
 *   2. Makes that context current and loads the GL entry points via glad.
 *   3. Prints the GL version/vendor/renderer/GLSL strings.
 *   4. Asks CUDA for a device associated with the current GL context,
 *      selects it with cudaSetDevice() and prints its name and SM count.
 *
 * The window handle is not returned or stored; the window stays alive until
 * GLFW is terminated or the process exits.
 *
 * @throws std::runtime_error if GLFW initialization, window creation or GL
 *         function loading fails, or if no CUDA device is reported for the
 *         GL context. CUDA API failures terminate the process through
 *         checkCudaErrors.
 */
void windowInit(){
    if (!glfwInit())
        throw std::runtime_error("Error while initializing GLFW");

    // GLFW_OPENGL_API is a value of the GLFW_CLIENT_API hint, not a hint name.
    glfwWindowHint(GLFW_CLIENT_API, GLFW_OPENGL_API);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6);
    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GLFW_FALSE);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwWindowHint(GLFW_MAXIMIZED, GLFW_TRUE);
    glfwWindowHint(GLFW_FOCUS_ON_SHOW, GLFW_TRUE);
    glfwWindowHint(GLFW_CONTEXT_RELEASE_BEHAVIOR, GLFW_RELEASE_BEHAVIOR_NONE);
    glfwWindowHint(GLFW_CONTEXT_ROBUSTNESS, GLFW_LOSE_CONTEXT_ON_RESET);
    glfwWindowHint(GLFW_CONTEXT_NO_ERROR, GLFW_FALSE);

    GLFWwindow* w = glfwCreateWindow(800, 600, "GLFW Window", NULL, NULL);
    if (!w) {
        glfwTerminate();
        throw std::runtime_error("Error while creating the window");
    }

    glfwMakeContextCurrent(w);
    // The loader returns 0 when no GL entry points could be resolved; calling
    // any gl* function after that would dereference null function pointers.
    if (!gladLoadGL(glfwGetProcAddress)) {
        glfwTerminate();
        throw std::runtime_error("Error while loading OpenGL functions");
    }
    gladInstallGLDebug();

    std::cout << "Version: " << glGetString(GL_VERSION) << std::endl;
    std::cout << "Vendor: " << glGetString(GL_VENDOR) << std::endl;
    std::cout << "Renderer: " << glGetString(GL_RENDERER) << std::endl;
    std::cout << "GLSL Version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl;

    // Initialize the outputs so a zero-device answer cannot leave garbage behind.
    unsigned int gl_device_count = 0;
    int gl_device_id = 0;
    checkCudaErrors(cudaGLGetDevices(&gl_device_count, &gl_device_id, 1, cudaGLDeviceListAll));
    if (gl_device_count == 0)
        throw std::runtime_error("No CUDA device is associated with the OpenGL context");

    int cuda_device_id = gl_device_id;
    checkCudaErrors(cudaSetDevice(cuda_device_id));

    // The GL device and the selected CUDA device are the same id, so a single
    // properties query serves both report lines.
    cudaDeviceProp props{};
    checkCudaErrors(cudaGetDeviceProperties(&props, cuda_device_id));
    printf("GL   : %-24s (%2d SMs)\n", props.name, props.multiProcessorCount);
    printf("CUDA : %-24s (%2d SMs)\n", props.name, props.multiProcessorCount);

}

/**
 * Creates an OpenGL buffer object sized for 141642 uint32_t elements and,
 * when requested, registers it with CUDA as a graphics resource.
 *
 * Neither the GL buffer name nor the CUDA resource handle is returned or
 * released here; both stay alive after the function returns.
 *
 * @param cudaGLInterop When true, the buffer is registered through
 *                      cudaGraphicsGLRegisterBuffer(); when false, only the
 *                      GL buffer is created.
 */
void makeGLBuffer(bool cudaGLInterop) {
    // using a smaller buffer does not trigger the error (e.g. 128), even with cudaGLInterop=true
    int elementCount = 141642;

    // Storage is allocated with no initial data and with flags set to 0.
    GLuint bufferName;
    glCreateBuffers(1, &bufferName);
    glNamedBufferStorage(bufferName, elementCount * sizeof(uint32_t), nullptr, 0);

    if (!cudaGLInterop)
        return;

    cudaGraphicsResource_t registeredResource = nullptr;
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&registeredResource, bufferName, cudaGraphicsRegisterFlagsNone));
}

// Trivial kernel with no parameters: every thread that executes it prints
// "Hello, World!" through device-side printf. It reads and writes no memory.
__global__ void simple_kernel() {
    printf("Hello, World!");
}

/**
 * Reproducer entry point: sets up the GL context and CUDA device, creates a
 * GL buffer registered with CUDA, then launches simple_kernel on a single
 * thread and waits for it to finish.
 *
 * @return 0 on success. CUDA failures terminate the process through
 *         checkCudaErrors.
 */
int main(){
    windowInit();
    makeGLBuffer(true); // calling with cudaGLInterop=false does not trigger the error.
    simple_kernel<<<1, 1>>>();
    // A kernel launch returns no status; launch errors are only visible here.
    checkCudaErrors(cudaGetLastError());
    // Wait for the kernel so execution errors are reported and the
    // device-side printf output is flushed before the process exits.
    checkCudaErrors(cudaDeviceSynchronize());
    return 0;
}

The code runs perfectly fine, but Nsight Compute fails to profile the kernel.
About a year ago, I managed to profile an app using CUDA/GL interop with an earlier version of Nsight Compute on a Linux machine, and there was no such problem.
What’s going on?

Here is a self-contained reproducer with build instructions:
MinimalReproducer.zip (375.8 KB)
To compile, just run the install.ps1 script from a Developer PowerShell prompt.

Environment:
Windows 11 24H2
Nsight Compute 2025.3.0
Visual Studio 2022 v17.7.6
CUDA Toolkit 12.6
NVIDIA driver 580.88
GPU: RTX 3060 Laptop

The issue is related to saving memory for workload replay. While this is investigated, you can consider using application replay (--replay-mode application) to profile your app, as it doesn’t require this operation.

Thanks for the workaround; it’s not ideal for GUI apps, but it works.