Strange bug with cuda 3.2 same code fails on some hardware, runs fine with cuda 3.1

I have a very strange bug with cuda3.2

The same binary runs fine on some machines, and produce errors on others (range from “the launch timed out and was terminated”, to blue screen).

The exact same code compiled with cuda3.1 works fine on all machines.

I am compiling with vs2005, and cuda3.2 (cudart32_32_16.dll) and 3.1 (cudart32_31_9.dll).

All machines are running drivers 266.58.

OS are xp32, win7, and win7x64. Graphics range from GeForce220M, GeForce285 to GeForce480,TeslaC20x0.

The machines where it “fails” are xp32 (so far), HP xw8600 (GeForce460) and Dell T3500 (tried both Quadro4000 and GeForce470).

I spent some time trying to narrow down the smallest kernel that would reproduce this problem (because using a very simple kernel is working fine on all machines, with both version of cuda) … and make a self-contained example.

I reused the “volumeRender” SDK example (because my original code is using OpenGL interop), so you can just replace the files and it should compile.

volumeRender_kernel.cu

#include <cutil_inline.h>

dim3 blockSize(16, 16);

dim3 gridSize;

texture<float, 2> tex(0, cudaFilterModePoint, cudaAddressModeClamp);

__global__ void d_render(float *od, size_t ostride, int w, int h, float sx, float sy)

{

	int x = blockIdx.x*blockDim.x + threadIdx.x;

    int y = blockIdx.y*blockDim.y + threadIdx.y;

    if ((x >= w) || (y >= h)) return;

//      // Note: this one works just fine on any hardware, with both versions of cuda

//      float dx = x/(float)w - 0.5f;

//      float dy = y/(float)h - 0.5f;

//      od[y*ostride + x] = (dx*dx + dy*dy);

 	

    int i1 = (int)((x+1)*sx);

    int j1 = (int)((y+1)*sy);

float accum = 0.0f;

    int nb_contrib = 0;

    for (int j=(int)(y*sy); j<=j1; ++j)

    {

        for (int i=(int)(x*sx); i<=i1; ++i)

        {

            accum += tex2D(tex, i,j); ++nb_contrib;

        }

    }

    od[y*ostride + x] = (nb_contrib <= 0 ? 0.0f : accum/float(nb_contrib));

}

void render_kernel(float *od, size_t ostride, int w, int h, float sx, float sy)

{

    gridSize = dim3((w+15)/16, (h+15)/16, 1);

	d_render<<<gridSize, blockSize>>>(od, ostride, w, h, sx, sy);

}

void initCudaTexture()

{

	int w = 1200, h = 1200;

	float* h_buffer = new float[w*h];

	for (int j=0, idx=0; j<h; ++j)

	{

		float dy2 = j/(float)(h-1) - 0.5f; dy2 = dy2*dy2;

		for (int i=0; i<w; ++i, ++idx)

		{

			float dx = i/(float)(w-1) - 0.5f;

			h_buffer[idx] = 1.0f - sqrtf(dx*dx + dy2);

		}

	}

    cudaArray* d_array = 0;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cutilSafeCall(cudaMallocArray(&d_array, &channelDesc, w, h));

	cutilSafeCall(cudaMemcpyToArray(d_array, 0,0, h_buffer, w*h*sizeof(float), cudaMemcpyHostToDevice));

	

	tex.addressMode[0] = cudaAddressModeClamp;

	tex.addressMode[1] = cudaAddressModeClamp;

	tex.filterMode = cudaFilterModePoint;

	cutilSafeCall(cudaBindTextureToArray(tex, d_array, channelDesc));

}

volumeRender.cpp

// Graphics includes

#include <GL/glew.h>

#if defined (__APPLE__) || defined(MACOSX)

#include <GLUT/glut.h>

#else

#include <GL/glut.h>

#endif

// Utilities and System includes

#include <shrUtils.h>

#include <cutil_inline.h>

#include <cutil_gl_inline.h>

#include <cuda_gl_interop.h>

extern void initCudaTexture();

extern void render_kernel(float *od, size_t ostride, int w, int h, float sx, float sy);

int width = 512, height = 512;

int texw = 1600, texh = 1200;

GLuint pbo = 0; 	// OpenGL pixel buffer object

GLuint tex = 0; 	// OpenGL texture object

struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)

// render image using CUDA

void render()

{

    float *d_output;

	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));

    size_t num_bytes; 

    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));

    cutilSafeCall(cudaMemset(d_output, 0, width*height*4));

    render_kernel(d_output, texw, width, height, 1200.0f/width, 1200.0f/height);

    cutilCheckMsg("kernel failed");

    cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

}

// display results using OpenGL (called by GLUT)

void display()

{

	glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);

	if (pbo == 0)

	{

		cudaGLSetGLDevice( 0 );

		initCudaTexture();

		glDisable(GL_LIGHTING);

		glDisable(GL_DEPTH_TEST);

		glGenBuffersARB(1, &pbo);

		// create texture for display

		glGenTextures(1, &tex);

		glBindTexture(GL_TEXTURE_2D, tex);

		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);

		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

		glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, texw, texh, 0, GL_LUMINANCE, GL_FLOAT, NULL);

		glBindTexture(GL_TEXTURE_2D, 0);

		

		glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);

		glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, texw*texh*sizeof(float), 0, GL_STREAM_DRAW_ARB);

		// register this buffer object with CUDA

		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));	

	}

render();

glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);

    glBindTexture(GL_TEXTURE_2D, tex);

    // glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texw, texh, GL_LUMINANCE, GL_FLOAT, 0);

	glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, texw, texh, 0, GL_LUMINANCE, GL_FLOAT, 0);

    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    glEnable(GL_TEXTURE_2D);

	float tw = width / float(texw);

	float th = height / float(texh);

	glActiveTexture(GL_TEXTURE0);

    glBegin(GL_QUADS);

    glTexCoord2f(0, 0); glVertex2f(0, 0);

    glTexCoord2f(tw, 0); glVertex2f(1, 0);

    glTexCoord2f(tw, th); glVertex2f(1, 1);

    glTexCoord2f(0, th); glVertex2f(0, 1);

    glEnd();

    glDisable(GL_TEXTURE_2D);

    glBindTexture(GL_TEXTURE_2D, 0);

glutSwapBuffers();

    glutReportErrors();

}

void initGL(int *argc, char **argv)

{

    // initialize GLUT callback functions

    glutInit(argc, argv);

    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);

    glutInitWindowSize(width, height);

    glutCreateWindow("test 31vs32");

glewInit();

    if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object")) {

        shrLog("Required OpenGL extensions missing.");

        exit(-1);

    }

}

void idle()

{

    glutPostRedisplay();

}

void keyboard(unsigned char key, int x, int y)

{

    switch(key) {

        case 27:

            exit(0);

            break;

    }

    glutPostRedisplay();

}

void reshape(int w, int h)

{

    width = w; height = h;

    glViewport(0, 0, w, h);

    glMatrixMode(GL_MODELVIEW);

    glLoadIdentity();

    glMatrixMode(GL_PROJECTION);

    glLoadIdentity();

    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);

}

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main( int argc, char** argv) 

{

    //start logs

    shrSetLogFileName ("volumeRender.txt");

    shrLog("%s Starting...\n\n", argv[0]); 

	initGL(&argc, argv);

// This is the normal rendering path for VolumeRender

    glutDisplayFunc(display);

    glutKeyboardFunc(keyboard);

    glutReshapeFunc(reshape);

    glutIdleFunc(idle);

    glutMainLoop();

cudaThreadExit();

    shrEXIT(argc, (const char**)argv);

}

Now, I don’t really know how to proceed / what to expect.

  • can anyone reproduce the failure?

  • is this already a know “bug” (nvcc?) … and I missed it?

  • is it already fixed in the newcoming 4.0 version? … and I should just wait march 4th

  • should I post that some place else? … to get it looked at / resolved?

Any advice welcomed, thanks :)

dumb question: where/how do I report this as a problem that really matters to me (production code, stuck with cuda3.1)

I thought I followed the directions provided in the sticky post “Reporting a problem with CUDA”, but got no response. Any other location I should post/ask?
– did I miss the sticky post about that?

Compiler was reworked in 3.2 and now it either contains new bugs, or your bugs at your program start to appears cause of different compilation. There are a lot of similar threads when program stopped working with 3.2 and work with 3.0-3.1, esp with using 2.0. Try use cuda 4.0 rc also use memcheck ocelot to hadle and locates errors.