I have a very strange bug with cuda3.2
The same binary runs fine on some machines, and produce errors on others (range from “the launch timed out and was terminated”, to blue screen).
The exact same code compiled with cuda3.1 works fine on all machines.
I am compiling with vs2005, and cuda3.2 (cudart32_32_16.dll) and 3.1 (cudart32_31_9.dll).
All machines are running drivers 266.58.
OS are xp32, win7, and win7x64. Graphics range from GeForce220M, GeForce285 to GeForce480,TeslaC20x0.
The machines where it “fails” are xp32 (so far), HP xw8600 (GeForce460) and Dell T3500 (tried both Quadro4000 and GeForce470).
I spent some time trying to narrow down the smallest kernel that would reproduce this problem (because using a very simple kernel is working fine on all machines, with both version of cuda) … and make a self-contained example.
I reused the “volumeRender” SDK example (because my original code is using OpenGL interop), so you can just replace the files and it should compile.
volumeRender_kernel.cu
#include <cutil_inline.h>
dim3 blockSize(16, 16);
dim3 gridSize;
texture<float, 2> tex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void d_render(float *od, size_t ostride, int w, int h, float sx, float sy)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if ((x >= w) || (y >= h)) return;
// // Note: this one works just fine on any hardware, with both versions of cuda
// float dx = x/(float)w - 0.5f;
// float dy = y/(float)h - 0.5f;
// od[y*ostride + x] = (dx*dx + dy*dy);
int i1 = (int)((x+1)*sx);
int j1 = (int)((y+1)*sy);
float accum = 0.0f;
int nb_contrib = 0;
for (int j=(int)(y*sy); j<=j1; ++j)
{
for (int i=(int)(x*sx); i<=i1; ++i)
{
accum += tex2D(tex, i,j); ++nb_contrib;
}
}
od[y*ostride + x] = (nb_contrib <= 0 ? 0.0f : accum/float(nb_contrib));
}
void render_kernel(float *od, size_t ostride, int w, int h, float sx, float sy)
{
gridSize = dim3((w+15)/16, (h+15)/16, 1);
d_render<<<gridSize, blockSize>>>(od, ostride, w, h, sx, sy);
}
void initCudaTexture()
{
int w = 1200, h = 1200;
float* h_buffer = new float[w*h];
for (int j=0, idx=0; j<h; ++j)
{
float dy2 = j/(float)(h-1) - 0.5f; dy2 = dy2*dy2;
for (int i=0; i<w; ++i, ++idx)
{
float dx = i/(float)(w-1) - 0.5f;
h_buffer[idx] = 1.0f - sqrtf(dx*dx + dy2);
}
}
cudaArray* d_array = 0;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cutilSafeCall(cudaMallocArray(&d_array, &channelDesc, w, h));
cutilSafeCall(cudaMemcpyToArray(d_array, 0,0, h_buffer, w*h*sizeof(float), cudaMemcpyHostToDevice));
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
cutilSafeCall(cudaBindTextureToArray(tex, d_array, channelDesc));
}
volumeRender.cpp
// Graphics includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif
// Utilities and System includes
#include <shrUtils.h>
#include <cutil_inline.h>
#include <cutil_gl_inline.h>
#include <cuda_gl_interop.h>
extern void initCudaTexture();
extern void render_kernel(float *od, size_t ostride, int w, int h, float sx, float sy);
int width = 512, height = 512;
int texw = 1600, texh = 1200;
GLuint pbo = 0; // OpenGL pixel buffer object
GLuint tex = 0; // OpenGL texture object
struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
// render image using CUDA
void render()
{
float *d_output;
cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
size_t num_bytes;
cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
cutilSafeCall(cudaMemset(d_output, 0, width*height*4));
render_kernel(d_output, texw, width, height, 1200.0f/width, 1200.0f/height);
cutilCheckMsg("kernel failed");
cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
}
// display results using OpenGL (called by GLUT)
void display()
{
glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);
if (pbo == 0)
{
cudaGLSetGLDevice( 0 );
initCudaTexture();
glDisable(GL_LIGHTING);
glDisable(GL_DEPTH_TEST);
glGenBuffersARB(1, &pbo);
// create texture for display
glGenTextures(1, &tex);
glBindTexture(GL_TEXTURE_2D, tex);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, texw, texh, 0, GL_LUMINANCE, GL_FLOAT, NULL);
glBindTexture(GL_TEXTURE_2D, 0);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, texw*texh*sizeof(float), 0, GL_STREAM_DRAW_ARB);
// register this buffer object with CUDA
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
}
render();
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glBindTexture(GL_TEXTURE_2D, tex);
// glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texw, texh, GL_LUMINANCE, GL_FLOAT, 0);
glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, texw, texh, 0, GL_LUMINANCE, GL_FLOAT, 0);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
glEnable(GL_TEXTURE_2D);
float tw = width / float(texw);
float th = height / float(texh);
glActiveTexture(GL_TEXTURE0);
glBegin(GL_QUADS);
glTexCoord2f(0, 0); glVertex2f(0, 0);
glTexCoord2f(tw, 0); glVertex2f(1, 0);
glTexCoord2f(tw, th); glVertex2f(1, 1);
glTexCoord2f(0, th); glVertex2f(0, 1);
glEnd();
glDisable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D, 0);
glutSwapBuffers();
glutReportErrors();
}
void initGL(int *argc, char **argv)
{
// initialize GLUT callback functions
glutInit(argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
glutInitWindowSize(width, height);
glutCreateWindow("test 31vs32");
glewInit();
if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object")) {
shrLog("Required OpenGL extensions missing.");
exit(-1);
}
}
void idle()
{
glutPostRedisplay();
}
void keyboard(unsigned char key, int x, int y)
{
switch(key) {
case 27:
exit(0);
break;
}
glutPostRedisplay();
}
void reshape(int w, int h)
{
width = w; height = h;
glViewport(0, 0, w, h);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
//start logs
shrSetLogFileName ("volumeRender.txt");
shrLog("%s Starting...\n\n", argv[0]);
initGL(&argc, argv);
// This is the normal rendering path for VolumeRender
glutDisplayFunc(display);
glutKeyboardFunc(keyboard);
glutReshapeFunc(reshape);
glutIdleFunc(idle);
glutMainLoop();
cudaThreadExit();
shrEXIT(argc, (const char**)argv);
}
Now, I don’t really know how to proceed / what to expect.
-
can anyone reproduce the failure?
-
is this already a know “bug” (nvcc?) … and I missed it?
-
is it already fixed in the newcoming 4.0 version? … and I should just wait march 4th
-
should I post that some place else? … to get it looked at / resolved?
Any advice welcomed, thanks :)