I’m trying to use OpenGL–CUDA interop to copy from an OpenGL renderbuffer to CUDA as part of a TensorFlow custom op. My code looks roughly like this:
GLuint render_buffer = 0;
glGenRenderbuffers(1, &render_buffer);
glBindRenderbuffer(GL_RENDERBUFFER, render_buffer);
glRenderbufferStorage(GL_RENDERBUFFER, GL_RGBA32F, width_pxl, height_pxl);
cudaGraphicsResource_t cuda_resource;
if (auto const err
= cudaGraphicsGLRegisterImage(&cuda_resource, render_buffer, GL_RENDERBUFFER, cudaGraphicsMapFlagsReadOnly))
{
VLOG(0) << "cudaGraphicsGLRegisterImage failed: " << cudaGetErrorName(err);
}
glBindRenderbuffer(GL_RENDERBUFFER, 0);
GLuint depth_buffer = 0;
glGenRenderbuffers(1, &depth_buffer);
glBindRenderbuffer(GL_RENDERBUFFER, depth_buffer);
glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8, width_pxl, height_pxl);
glBindRenderbuffer(GL_RENDERBUFFER, 0);
GLuint frame_buffer = 0;
bind_framebuffer(render_buffer, depth_buffer, frame_buffer);
glBindFramebuffer(GL_FRAMEBUFFER, frame_buffer);
GLuint attachment = GL_COLOR_ATTACHMENT0;
glDrawBuffers(1, &attachment);
// Draw with EGL here.
cudaGraphicsMapResources(1, &cuda_resource);
cudaArray_t pixel_array;
cudaGraphicsSubResourceGetMappedArray(&pixel_array, cuda_resource, 0, 0);
// This will "download" the pixels from openGL to CUDA.
launch_pixels_download(*output_tensor, pixel_array);
cudaGraphicsUnmapResources(1, &cuda_resource);
This runs fine if I run the custom op only once. However, during training, when I need to evaluate it more than once, I run into the following failure very quickly (after ~50 steps, apparently depending on how big my raster image is):
[I custom_ops/render.cc:123] cudaGraphicsGLRegisterImage failed: cudaErrorNotSupported
[F custom_ops/render.cu:62] cudaCreateSurfaceObject failed: cudaErrorUnknown
Aborted {
libpthread-2.27.so 12890 __restore_rt
libc-2.27.so 3ee97 raise.c:51 gsignal
libc-2.27.so 40801 abort.c:81 abort
libtensorflow_framework.so 6ec844
libtf_custom_ops_cu.so 1afe launch_pixels_download(tensorflow::Tensor&, cudaArray* const&)
libtf_custom_ops_py.so 1cb8c render_scene(tensorflow::OpKernelContext const*, int, path_rendering::scene const&, tensorflow::Tensor*)
libtf_custom_ops_py.so 21756 std::_Function_handler<bool (), GlDispatcher::GlThread::dispatch_blocking(std::function<void ()> const&)::{lambda()#1}>::_M_invoke(std::_Any_data const&)
libtf_custom_ops_py.so 211cc GlDispatcher::GlThread::thread_fn(moodycamel::BlockingConcurrentQueue<std::function<bool ()>, moodycamel::ConcurrentQueueDefaultTraits>&)
libstdc++.so.6.0.25 bd9e0
libpthread-2.27.so 76db start_thread
libc-2.27.so 12188f clone.S:97 clone
}
Some Google searching suggests that the reason might be a memory leak, or perhaps that TensorFlow pins all available GPU memory, leaving too little for this copying task. But if that were really the case, I would expect it to fail the very first time as well — unless I’m still missing something here.
I ran cuda-memcheck but it didn’t find any issues.
I’m wondering if there’s a more methodical way to understand what’s really going on here.