GL interop in multithreaded host app

Hello,

I am trying to write a multi-threaded C++ host application using CUDA-OpenGL interoperability. I want to use CUDA to write into a Buffer Object and then render this buffer object using OpenGL. I want one host thread controlling the CUDA kernel calls and another one controlling the OpenGL calls. My problem is that cudaGraphicsGLRegisterBuffer() always gives me cudaErrorUnknown. The problem can be reproduced on the minimal code example given below.

I understand that each host thread needs its own GL context (ctx1 and ctx2), plus I need an extra GL context (ctxShared) in which I will create the VBO, and then both ctx1 and ctx2 will share with ctxShared. First off, is this assumption/setup correct?

[codebox]glutInit(&argC,argV);

glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);

glutCreateWindow("WindowName");

glewInit();

ctx1=wglGetCurrentContext();

ctxShared=wglCreateContext(wglGetCurrentDC());

wglShareLists(ctxShared,ctx1);

ctx2=wglCreateContext(wglGetCurrentDC());

wglShareLists(ctxShared,ctx2);

wglMakeCurrent(wglGetCurrentDC(),ctxShared);

glGenBuffers(1,&positionsVBO);

glBindBuffer(GL_ARRAY_BUFFER,positionsVBO);

unsigned int size=width*height*4*sizeof(float);

glBufferData(GL_ARRAY_BUFFER,size,NULL,GL_DYNAMIC_DRAW);

glBindBuffer(GL_ARRAY_BUFFER,0);

wglMakeCurrent(wglGetCurrentDC(),ctx1);[/codebox]

Then, I create a new host thread and have it execute the following code:

[codebox]wglMakeCurrent(wglGetCurrentDC(),ctx2);

cudaGLSetGLDevice(0);

cudaGraphicsGLRegisterBuffer(&positionsVBO_CUDA,positionsVBO,cudaGraphicsMapFlagsWriteDiscard);[/codebox]

However, cudaGraphicsGLRegisterBuffer() always returns cudaErrorUnknown. Would anyone please have any idea what the problem might be?

If I do not use host threading and do it all in one thread, it works fine (even with the context switches as written above). I also tried calling cudaGLSetGLDevice(0) in the master thread before creating the second one, it didn’t help.

I am using a 32-bit build and 32-bit CUDA libraries, I need the app not to be 64-bit dependent.

My setup:

Windows Vista 64-bit

GeForce GTX 285

Driver version 197.13

CUDA toolkit 3.0

Visual Studio 2005

I had the same problems on a notebook with Vista 32-bit and a GeForce 9500M.

I will be most grateful for any piece of help or advice. Thank you in advance.

(The entire code, using parts of a CUDA example, is given below)

[codebox]//------------------------------------------------

//cuda.h:

//------------------------------------------------

#pragma once

#include <windows.h>

#include <stdio.h>

#include <cuda_runtime.h>

#include <gl/glew.h>

#include <cuda_gl_interop.h>

#include <gl/glut.h>

//------------------------------------------------

//data.h:

//------------------------------------------------

#pragma once

extern GLuint positionsVBO;

extern struct cudaGraphicsResource *positionsVBO_CUDA;

extern unsigned int width;

extern unsigned int height;

extern float tim;

//------------------------------------------------

//kernels.h:

//------------------------------------------------

#pragma once

void callKernel();

//------------------------------------------------

//kernels.cu:

//------------------------------------------------

#include "cuda.h"

#include "data.h"

#include "kernels.h"

// Kernel: writes one animated vertex per thread into the mapped VBO.
// Expects a 2D launch covering a width x height grid of points; each thread
// produces one float4 position on a sine/cosine height field.
// (Forum markup had eaten the underscores of __global__ in the original paste.)
__global__ void createVertices(float4* positions, float tim, unsigned int width, unsigned int height) {
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the tail: the grid may overshoot when width/height are not
    // exact multiples of the block dimensions.
    if (x >= width || y >= height)
        return;

    // Map integer thread coordinates to [-1, 1] plane coordinates.
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;

    // Time-animated wave: height w oscillates with position and time.
    float freq = 4.0f;
    float w = sinf(u * freq + tim) * cosf(v * freq + tim) * 0.5f;

    positions[y * width + x] = make_float4(u, w, v, 1.0f);
}

// Host wrapper: maps the CUDA-registered VBO, launches the vertex-generation
// kernel over a width x height grid, then unmaps so OpenGL can draw the buffer.
// Reads globals: positionsVBO_CUDA, width, height, tim.
// All CUDA calls are now checked; errors are reported instead of silently ignored.
void callKernel() {
    float4 *positions;
    cudaError_t err;

    err = cudaGraphicsMapResources(1, &positionsVBO_CUDA, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGraphicsMapResources failed: %s\n", cudaGetErrorString(err));
        return;
    }

    size_t num_bytes;
    err = cudaGraphicsResourceGetMappedPointer((void**)&positions, &num_bytes, positionsVBO_CUDA);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGraphicsResourceGetMappedPointer failed: %s\n", cudaGetErrorString(err));
        cudaGraphicsUnmapResources(1, &positionsVBO_CUDA, 0); // best-effort cleanup
        return;
    }

    // 16x16 blocks; width and height are assumed to be multiples of 16 here
    // (64x64 in this app), matching the original exact-division grid.
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, tim, width, height);

    // Kernel launches do not return errors directly; pick up config errors here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "createVertices launch failed: %s\n", cudaGetErrorString(err));
    }

    err = cudaGraphicsUnmapResources(1, &positionsVBO_CUDA, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGraphicsUnmapResources failed: %s\n", cudaGetErrorString(err));
    }
}

//------------------------------------------------

//main.cpp:

//------------------------------------------------

#include <boost/thread.hpp>

#include "cuda.h"

#include "data.h"

#include "kernels.h"

using namespace boost;

// Two-party barriers pairing the render (main) thread with the CUDA worker:
// sync1 = VBO registration finished, sync2 = kernel finished, sync3 = rendering finished.
barrier sync1(2),sync2(2),sync3(2);

// ctx1: the GLUT window's context (render thread); ctx2: the CUDA worker's
// context; ctxShared: the context the VBO is created in (shared with both).
HGLRC ctx1,ctx2,ctxShared;

// Buffer object written by CUDA and drawn by OpenGL.
GLuint positionsVBO;

// CUDA handle for the registered VBO.
struct cudaGraphicsResource *positionsVBO_CUDA;

// Grid dimensions of the generated point mesh.
unsigned int width=64;

unsigned int height=64;

// Animation time, advanced once per frame in display().
float tim=0.0f;

// GLUT reshape callback: resets the viewport to the new window size and
// rebuilds the projection matrix (60-degree vertical FOV, near 0.1, far 10).
void resize(int newWidth, int newHeight) {
    // A zero-height window would make the aspect ratio divide by zero.
    if (newHeight == 0)
        newHeight = 1;

    glViewport(0, 0, newWidth, newHeight);

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();

    double aspect = newWidth / double(newHeight);
    gluPerspective(60.0, aspect, 0.1, 10.0);
}

// GLUT display callback (render thread). Runs in lock-step with the CUDA
// worker thread via the sync2/sync3 barriers: wait for the kernel, draw the
// VBO as points, release the worker, then schedule the next frame.
void display() {

sync2.wait(); //app waits till other thread finishes kernel execution

glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

// Draw the CUDA-generated positions straight from the VBO as a point cloud.
glEnableClientState(GL_VERTEX_ARRAY);

glBindBuffer(GL_ARRAY_BUFFER, positionsVBO);

glVertexPointer(4, GL_FLOAT, 0, 0);

glDrawArrays(GL_POINTS, 0, width * height);

glDisableClientState(GL_VERTEX_ARRAY);

glutSwapBuffers();

// Advance the animation clock for the next kernel launch.
tim+=0.01f;

sync3.wait(); //app waits till this thread finishes GL rendering

glutPostRedisplay();

}

void threadFunc() {

wglMakeCurrent(wglGetCurrentDC(),ctx2);

cudaGLSetGLDevice(0);

cudaError_t errc=cudaGraphicsGLRegisterBuffer(&positionsVBO_CUDA,positionsVBO,cudaGraphicsMapFlagsWrite

Discard);

sync1.wait(); //app waits till registering is finished

for(;;) {

callKernel();

sync2.wait();	//app waits till this thread finishes kernel execution

sync3.wait();	//app waits till other thread finishes GL rendering

}

}

int main(int argC,char **argV) {

glutInit(&argC,argV);

glutInitWindowPosition(100,100);

glutInitWindowSize(512,512);

glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);

glutCreateWindow(“WindowName”);

glewInit();

glutDisplayFunc(&display);

glutReshapeFunc(&resize);

ctx1=wglGetCurrentContext();

ctxShared=wglCreateContext(wglGetCurrentDC());

wglShareLists(ctxShared,ctx1);

ctx2=wglCreateContext(wglGetCurrentDC());

wglShareLists(ctxShared,ctx2);

wglMakeCurrent(wglGetCurrentDC(),ctxShared);

glGenBuffers(1,&positionsVBO);

glBindBuffer(GL_ARRAY_BUFFER,positionsVBO);

unsigned int size=widthheight4*sizeof(float);

glBufferData(GL_ARRAY_BUFFER,size,NULL,GL_DYNAMIC_DRAW);

glBindBuffer(GL_ARRAY_BUFFER,0);

wglMakeCurrent(wglGetCurrentDC(),ctx1);

//cudaGLSetGLDevice(0); //Present or absent, makes no difference

thread th2(threadFunc);

sync1.wait(); //app waits till registering is finished

glutMainLoop();

return 0;

}

[/codebox]

I have the same issue too, and I am trying to resolve it now.

I also ran into this problem. Does anyone know how to solve it?

Leaving CUDA aside for a moment, does OpenGL itself work across multiple threads? If you create shared OpenGL contexts and two threads each create a VBO in their own context, are those VBOs accessible from every context and every thread?
(Perhaps the answer is obviously yes — I have no idea; I am just trying to learn something new!)

Yes, OpenGL does work with multiple threads. OpenGL threading has in fact been added to the QT 4.8 library; see http://labs.qt.nokia.com/2011/06/03/threaded-opengl-in-4-8/.

Great. Thanks for the link.

So why is it not the same when a cuda context try to work with an opengl context shared between threads?