Hello,
I am trying to write a multi-threaded C++ host application using CUDA-OpenGL interoperability. I want to use CUDA to write into a Buffer Object and then render this buffer object using OpenGL. I want one host thread controlling the CUDA kernel calls and another one controlling the OpenGL calls. My problem is that cudaGraphicsGLRegisterBuffer() always gives me cudaErrorUnknown. The problem can be reproduced on the minimal code example given below.
I understand that each host thread needs its own GL context (ctx1 and ctx2), plus I need an extra GL context (ctxShared) in which I will create the VBO, and then both ctx1 and ctx2 will share with ctxShared. First off, is this assumption/setup correct?
[codebox]glutInit(&argC,argV);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);
glutCreateWindow(“WindowName”);
glewInit();
ctx1=wglGetCurrentContext();
ctxShared=wglCreateContext(wglGetCurrentDC());
wglShareLists(ctxShared,ctx1);
ctx2=wglCreateContext(wglGetCurrentDC());
wglShareLists(ctxShared,ctx2);
wglMakeCurrent(wglGetCurrentDC(),ctxShared);
glGenBuffers(1,&positionsVBO);
glBindBuffer(GL_ARRAY_BUFFER,positionsVBO);
unsigned int size=widthheight4*sizeof(float);
glBufferData(GL_ARRAY_BUFFER,size,NULL,GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER,0);
wglMakeCurrent(wglGetCurrentDC(),ctx1);[/codebox]
Then, I create a new host thread and have it execute the following code:
[codebox]wglMakeCurrent(wglGetCurrentDC(),ctx2);
cudaGLSetGLDevice(0);
cudaGraphicsGLRegisterBuffer(&positionsVBO_CUDA,positionsVBO,cudaGraphicsMapFlagsWrite
Discard);[/codebox]
However, cudaGraphicsGLRegisterBuffer() always returns cudaErrorUnknown. Would anyone please have any idea what the problem might be?
If I do not use host threading and do it all in one thread, it works fine (even with the context switches as written above). I also tried calling cudaGLSetGLDevice(0) in the master thread before creating the second one, it didn’t help.
I am using a 32-bit build and 32-bit CUDA libraries, I need the app not to be 64-bit dependent.
My setup:
Windows Vista 64-bit
GeForce GTX 285
Driver version 197.13
CUDA toolkit 3.0
Visual Studio 2005
I had the same problems on a notebook with Vista 32-bit and a GeForce 9500M.
I will be most grateful for any piece of help or advice. Thank you in advance.
(The entire code, using parts of a CUDA example, is given below)
[codebox]//------------------------------------------------
//------------------------------------------------
#pragma once
#include <windows.h>
#include <cuda_runtime.h>
#include <gl/glew.h>
#include <cuda_gl_interop.h>
#include <gl/glut.h>
//------------------------------------------------
//------------------------------------------------
#pragma once
extern GLuint positionsVBO;
extern struct cudaGraphicsResource *positionsVBO_CUDA;
extern unsigned int width;
extern unsigned int height;
extern float tim;
//------------------------------------------------
//------------------------------------------------
#pragma once
void callKernel();
//------------------------------------------------
//------------------------------------------------
#include “cuda.h”
#include “data.h”
#include “kernels.h”
global void createVertices(float4* positions, float tim, unsigned int width, unsigned int height) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = x / (float)width;
float v = y / (float)height;
u = u * 2.0f - 1.0f;
v = v * 2.0f - 1.0f;
float freq = 4.0f;
float w = sinf(u * freq + tim) * cosf(v * freq + tim) * 0.5f;
positions[y * width + x] = make_float4(u, w, v, 1.0f);
}
void callKernel() {
float4 *positions;
cudaGraphicsMapResources(1,&positionsVBO_CUDA,0);
size_t num_bytes;
cudaGraphicsResourceGetMappedPointer((void**)&positions,&num_bytes,positionsVBO_CUDA);
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
createVertices<<<dimGrid, dimBlock>>>(positions, tim, width, height);
cudaGraphicsUnmapResources(1, &positionsVBO_CUDA, 0);
}
//------------------------------------------------
//------------------------------------------------
#include <boost/thread.hpp>
#include “cuda.h”
#include “data.h”
#include “kernels.h”
using namespace boost;
barrier sync1(2),sync2(2),sync3(2);
HGLRC ctx1,ctx2,ctxShared;
GLuint positionsVBO;
struct cudaGraphicsResource *positionsVBO_CUDA;
unsigned int width=64;
unsigned int height=64;
float tim=0.0f;
void resize(int newWidth,int newHeight) {
if (newHeight==0) { //prevent division by 0
newHeight=1;
}
glViewport(0,0,newWidth,newHeight);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(
60.0
,newWidth/double(newHeight)
,0.1
,10.0
);
}
void display() {
sync2.wait(); //app waits till other thread finishes kernel execution
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glEnableClientState(GL_VERTEX_ARRAY);
glBindBuffer(GL_ARRAY_BUFFER, positionsVBO);
glVertexPointer(4, GL_FLOAT, 0, 0);
glDrawArrays(GL_POINTS, 0, width * height);
glDisableClientState(GL_VERTEX_ARRAY);
glutSwapBuffers();
tim+=0.01f;
sync3.wait(); //app waits till this thread finishes GL rendering
glutPostRedisplay();
}
void threadFunc() {
wglMakeCurrent(wglGetCurrentDC(),ctx2);
cudaGLSetGLDevice(0);
cudaError_t errc=cudaGraphicsGLRegisterBuffer(&positionsVBO_CUDA,positionsVBO,cudaGraphicsMapFlagsWrite
Discard);
sync1.wait(); //app waits till registering is finished
for(;;) {
callKernel();
sync2.wait(); //app waits till this thread finishes kernel execution
sync3.wait(); //app waits till other thread finishes GL rendering
}
}
int main(int argC,char **argV) {
glutInit(&argC,argV);
glutInitWindowPosition(100,100);
glutInitWindowSize(512,512);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);
glutCreateWindow(“WindowName”);
glewInit();
glutDisplayFunc(&display);
glutReshapeFunc(&resize);
ctx1=wglGetCurrentContext();
ctxShared=wglCreateContext(wglGetCurrentDC());
wglShareLists(ctxShared,ctx1);
ctx2=wglCreateContext(wglGetCurrentDC());
wglShareLists(ctxShared,ctx2);
wglMakeCurrent(wglGetCurrentDC(),ctxShared);
glGenBuffers(1,&positionsVBO);
glBindBuffer(GL_ARRAY_BUFFER,positionsVBO);
unsigned int size=widthheight4*sizeof(float);
glBufferData(GL_ARRAY_BUFFER,size,NULL,GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER,0);
wglMakeCurrent(wglGetCurrentDC(),ctx1);
//cudaGLSetGLDevice(0); //Present or absent, makes no difference
thread th2(threadFunc);
sync1.wait(); //app waits till registering is finished
glutMainLoop();
return 0;
}
[/codebox]