Nsight Compute fails to profile a simple CUDA kernel when OpenGL interop is used.
Here is what it looks like in the user interface:
Here is a minimal example:
#include "gl.h"
#include <GLFW/glfw3.h>
#include <cuda_gl_interop.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
/**
 * Reports a failed CUDA runtime call and terminates the process.
 *
 * @param result Status returned by the CUDA call; any non-zero value is treated as an error.
 * @param func   Text of the expression that produced `result` (stringified by checkCudaErrors).
 * @param file   Source file of the call site.
 * @param line   Source line of the call site.
 *
 * On error the message is written to stderr and the process ends through _Exit,
 * i.e. without running atexit handlers or static destructors.
 */
template <typename T>
void check(T result, char const *const func, const char *const file, int const line) {
    if (result) {
        // Cast to int so the argument type matches the %d conversion specifier.
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                static_cast<int>(result), cudaGetErrorName(result), func);
        _Exit(EXIT_FAILURE);
    }
}

// Wraps a CUDA runtime call so that a failure is reported together with its call site.
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
/**
 * Initializes GLFW, creates an 800x600 window with an OpenGL 4.6 core-profile
 * context, loads the OpenGL entry points, and selects the CUDA device that
 * is associated with that OpenGL context.
 *
 * Prints the OpenGL version/vendor/renderer strings and the name and SM count
 * of the selected device.
 *
 * @throws std::runtime_error if GLFW cannot be initialized, the window cannot
 *         be created, the OpenGL loader fails, or no CUDA device is reported
 *         for the current OpenGL context.
 *
 * CUDA runtime failures are handled by checkCudaErrors, which terminates the process.
 */
void windowInit() {
    if (!glfwInit())
        throw std::runtime_error("Error while initializing GLFW");

    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6);
    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GLFW_FALSE);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    // GLFW_OPENGL_API is a value of the GLFW_CLIENT_API hint, not a hint name of its own.
    glfwWindowHint(GLFW_CLIENT_API, GLFW_OPENGL_API);
    glfwWindowHint(GLFW_MAXIMIZED, GLFW_TRUE);
    glfwWindowHint(GLFW_FOCUS_ON_SHOW, GLFW_TRUE);
    glfwWindowHint(GLFW_CONTEXT_RELEASE_BEHAVIOR, GLFW_RELEASE_BEHAVIOR_NONE);
    glfwWindowHint(GLFW_CONTEXT_ROBUSTNESS, GLFW_LOSE_CONTEXT_ON_RESET);
    glfwWindowHint(GLFW_CONTEXT_NO_ERROR, GLFW_FALSE);

    GLFWwindow* w = glfwCreateWindow(800, 600, "GLFW Window", NULL, NULL);
    if (!w) {
        glfwTerminate();
        throw std::runtime_error("Error while creating the window");
    }
    glfwMakeContextCurrent(w);

    // The loader returns 0 when no OpenGL function could be loaded.
    if (!gladLoadGL(glfwGetProcAddress)) {
        glfwTerminate();
        throw std::runtime_error("Error while loading the OpenGL functions");
    }
    gladInstallGLDebug();

    std::cout << "Version: " << glGetString(GL_VERSION) << std::endl;
    std::cout << "Vendor: " << glGetString(GL_VENDOR) << std::endl;
    std::cout << "Renderer: " << glGetString(GL_RENDERER) << std::endl;
    std::cout << "GLSL Version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl;

    // Ask CUDA which device drives the current OpenGL context (at most one entry requested).
    unsigned int gl_device_count = 0;
    int gl_device_id = -1;
    checkCudaErrors(cudaGLGetDevices(&gl_device_count, &gl_device_id, 1, cudaGLDeviceListAll));
    if (gl_device_count == 0) {
        // Without this guard gl_device_id would be used without ever being written.
        glfwTerminate();
        throw std::runtime_error("No CUDA device is associated with the current OpenGL context");
    }

    int cuda_device_id = gl_device_id;
    checkCudaErrors(cudaSetDevice(cuda_device_id));

    cudaDeviceProp props{};
    checkCudaErrors(cudaGetDeviceProperties(&props, gl_device_id));
    printf("GL : %-24s (%2d SMs)\n", props.name, props.multiProcessorCount);
    checkCudaErrors(cudaGetDeviceProperties(&props, cuda_device_id));
    printf("CUDA : %-24s (%2d SMs)\n", props.name, props.multiProcessorCount);
}
/**
 * Creates an OpenGL buffer with immutable storage and, on request, registers
 * it with CUDA as a graphics resource.
 *
 * @param cudaGLInterop When true, the buffer is registered through
 *                      cudaGraphicsGLRegisterBuffer; when false, only the
 *                      OpenGL buffer is created.
 *
 * Neither the buffer nor the CUDA registration is released here.
 */
void makeGLBuffer(bool cudaGLInterop) {
    // Using a smaller buffer does not trigger the error (e.g. 128), even with cudaGLInterop=true.
    const int elementCount = 141642;
    const auto storageBytes = elementCount * sizeof(uint32_t);

    GLuint buffer;
    glCreateBuffers(1, &buffer);
    glNamedBufferStorage(buffer, storageBytes, nullptr, 0);

    // Nothing more to do when interop was not requested.
    if (!cudaGLInterop)
        return;

    cudaGraphicsResource_t registration = nullptr;
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&registration, buffer, cudaGraphicsRegisterFlagsNone));
}
// Minimal kernel used as the profiling target: its only action is a device-side printf.
// It takes no arguments and touches no memory; main launches it as <<<1, 1>>>.
__global__ void simple_kernel() {
printf("Hello, World!");
}
/**
 * Entry point of the reproducer: sets up the window and OpenGL context,
 * creates the interop buffer, then launches simple_kernel once.
 *
 * @return 0 on success, EXIT_FAILURE if setup threw an exception.
 */
int main() {
    try {
        windowInit();
        makeGLBuffer(true); // calling with cudaGLInterop=false does not trigger the error.
    } catch (const std::exception& e) {
        // Report setup failures instead of letting the exception reach std::terminate.
        std::cerr << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    simple_kernel<<<1, 1>>>();
    // A launch returns no status itself; fetch launch-configuration errors explicitly.
    checkCudaErrors(cudaGetLastError());
    // Wait for the kernel so execution errors surface and the device printf output
    // is flushed before the process exits.
    checkCudaErrors(cudaDeviceSynchronize());
    return 0;
}
The code runs perfectly fine, but Nsight Compute fails to profile the kernel.
About a year ago I profiled an app using CUDA/OpenGL interop with an earlier version of Nsight Compute on a Linux machine, and there was no such problem.
What’s going on?
Here is a self-contained reproducer with build instructions:
MinimalReproducer.zip (375.8 KB)
To compile, just run the install.ps1 script from a Developer PowerShell prompt.
Environment:
Windows 11 24H2
Nsight compute 2025.3.0
Visual Studio 2022 v17.7.6
Cuda toolkit 12.6
Nvidia driver 580.88
GPU: rtx 3060 laptop
