Strange bug with cuda 3.2 same code fails on some hardware, runs fine with cuda 3.1

Xvv · March 3, 2011, 3:38pm

I have a very strange bug with cuda3.2

The same binary runs fine on some machines, and produce errors on others (range from “the launch timed out and was terminated”, to blue screen).

The exact same code compiled with cuda3.1 works fine on all machines.

I am compiling with vs2005, and cuda3.2 (cudart32_32_16.dll) and 3.1 (cudart32_31_9.dll).

All machines are running drivers 266.58.

OS are xp32, win7, and win7x64. Graphics range from GeForce220M, GeForce285 to GeForce480,TeslaC20x0.

The machines where it “fails” are xp32 (so far), HP xw8600 (GeForce460) and Dell T3500 (tried both Quadro4000 and GeForce470).

I spent some time trying to narrow down the smallest kernel that would reproduce this problem (because using a very simple kernel is working fine on all machines, with both version of cuda) … and make a self-contained example.

I reused the “volumeRender” SDK example (because my original code is using OpenGL interop), so you can just replace the files and it should compile.

volumeRender_kernel.cu

#include <cutil_inline.h>

dim3 blockSize(16, 16);

dim3 gridSize;

texture<float, 2> tex(0, cudaFilterModePoint, cudaAddressModeClamp);

__global__ void d_render(float *od, size_t ostride, int w, int h, float sx, float sy)

{

	int x = blockIdx.x*blockDim.x + threadIdx.x;

    int y = blockIdx.y*blockDim.y + threadIdx.y;

    if ((x >= w) || (y >= h)) return;

//      // Note: this one works just fine on any hardware, with both versions of cuda

//      float dx = x/(float)w - 0.5f;

//      float dy = y/(float)h - 0.5f;

//      od[y*ostride + x] = (dx*dx + dy*dy);

 	

    int i1 = (int)((x+1)*sx);

    int j1 = (int)((y+1)*sy);

float accum = 0.0f;

    int nb_contrib = 0;

    for (int j=(int)(y*sy); j<=j1; ++j)

    {

        for (int i=(int)(x*sx); i<=i1; ++i)

        {

            accum += tex2D(tex, i,j); ++nb_contrib;

        }

    }

    od[y*ostride + x] = (nb_contrib <= 0 ? 0.0f : accum/float(nb_contrib));

}

void render_kernel(float *od, size_t ostride, int w, int h, float sx, float sy)

{

    gridSize = dim3((w+15)/16, (h+15)/16, 1);

	d_render<<<gridSize, blockSize>>>(od, ostride, w, h, sx, sy);

}

void initCudaTexture()

{

	int w = 1200, h = 1200;

	float* h_buffer = new float[w*h];

	for (int j=0, idx=0; j<h; ++j)

	{

		float dy2 = j/(float)(h-1) - 0.5f; dy2 = dy2*dy2;

		for (int i=0; i<w; ++i, ++idx)

		{

			float dx = i/(float)(w-1) - 0.5f;

			h_buffer[idx] = 1.0f - sqrtf(dx*dx + dy2);

		}

	}

    cudaArray* d_array = 0;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cutilSafeCall(cudaMallocArray(&d_array, &channelDesc, w, h));

	cutilSafeCall(cudaMemcpyToArray(d_array, 0,0, h_buffer, w*h*sizeof(float), cudaMemcpyHostToDevice));

	

	tex.addressMode[0] = cudaAddressModeClamp;

	tex.addressMode[1] = cudaAddressModeClamp;

	tex.filterMode = cudaFilterModePoint;

	cutilSafeCall(cudaBindTextureToArray(tex, d_array, channelDesc));

}

volumeRender.cpp

// Graphics includes

#include <GL/glew.h>

#if defined (__APPLE__) || defined(MACOSX)

#include <GLUT/glut.h>

#else

#include <GL/glut.h>

#endif

// Utilities and System includes

#include <shrUtils.h>

#include <cutil_inline.h>

#include <cutil_gl_inline.h>

#include <cuda_gl_interop.h>

extern void initCudaTexture();

extern void render_kernel(float *od, size_t ostride, int w, int h, float sx, float sy);

int width = 512, height = 512;

int texw = 1600, texh = 1200;

GLuint pbo = 0; 	// OpenGL pixel buffer object

GLuint tex = 0; 	// OpenGL texture object

struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)

// render image using CUDA

void render()

{

    float *d_output;

	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));

    size_t num_bytes; 

    cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));

    cutilSafeCall(cudaMemset(d_output, 0, width*height*4));

    render_kernel(d_output, texw, width, height, 1200.0f/width, 1200.0f/height);

    cutilCheckMsg("kernel failed");

    cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));

}

// display results using OpenGL (called by GLUT)

void display()

{

	glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);

	if (pbo == 0)

	{

		cudaGLSetGLDevice( 0 );

		initCudaTexture();

		glDisable(GL_LIGHTING);

		glDisable(GL_DEPTH_TEST);

		glGenBuffersARB(1, &pbo);

		// create texture for display

		glGenTextures(1, &tex);

		glBindTexture(GL_TEXTURE_2D, tex);

		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);

		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

		glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, texw, texh, 0, GL_LUMINANCE, GL_FLOAT, NULL);

		glBindTexture(GL_TEXTURE_2D, 0);

		

		glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);

		glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, texw*texh*sizeof(float), 0, GL_STREAM_DRAW_ARB);

		// register this buffer object with CUDA

		cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));	

	}

render();

glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);

    glBindTexture(GL_TEXTURE_2D, tex);

    // glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, texw, texh, GL_LUMINANCE, GL_FLOAT, 0);

	glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, texw, texh, 0, GL_LUMINANCE, GL_FLOAT, 0);

    glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    glEnable(GL_TEXTURE_2D);

	float tw = width / float(texw);

	float th = height / float(texh);

	glActiveTexture(GL_TEXTURE0);

    glBegin(GL_QUADS);

    glTexCoord2f(0, 0); glVertex2f(0, 0);

    glTexCoord2f(tw, 0); glVertex2f(1, 0);

    glTexCoord2f(tw, th); glVertex2f(1, 1);

    glTexCoord2f(0, th); glVertex2f(0, 1);

    glEnd();

    glDisable(GL_TEXTURE_2D);

    glBindTexture(GL_TEXTURE_2D, 0);

glutSwapBuffers();

    glutReportErrors();

}

void initGL(int *argc, char **argv)

{

    // initialize GLUT callback functions

    glutInit(argc, argv);

    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);

    glutInitWindowSize(width, height);

    glutCreateWindow("test 31vs32");

glewInit();

    if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object")) {

        shrLog("Required OpenGL extensions missing.");

        exit(-1);

    }

}

void idle()

{

    glutPostRedisplay();

}

void keyboard(unsigned char key, int x, int y)

{

    switch(key) {

        case 27:

            exit(0);

            break;

    }

    glutPostRedisplay();

}

void reshape(int w, int h)

{

    width = w; height = h;

    glViewport(0, 0, w, h);

    glMatrixMode(GL_MODELVIEW);

    glLoadIdentity();

    glMatrixMode(GL_PROJECTION);

    glLoadIdentity();

    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);

}

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main( int argc, char** argv) 

{

    //start logs

    shrSetLogFileName ("volumeRender.txt");

    shrLog("%s Starting...\n\n", argv[0]); 

	initGL(&argc, argv);

// This is the normal rendering path for VolumeRender

    glutDisplayFunc(display);

    glutKeyboardFunc(keyboard);

    glutReshapeFunc(reshape);

    glutIdleFunc(idle);

    glutMainLoop();

cudaThreadExit();

    shrEXIT(argc, (const char**)argv);

}

Now, I don’t really know how to proceed / what to expect.

can anyone reproduce the failure?
is this already a know “bug” (nvcc?) … and I missed it?
is it already fixed in the newcoming 4.0 version? … and I should just wait march 4th
should I post that some place else? … to get it looked at / resolved?

Any advice welcomed, thanks :)

Xvv · April 14, 2011, 6:42pm

dumb question: where/how do I report this as a problem that really matters to me (production code, stuck with cuda3.1)

I thought I followed the directions provided in the sticky post “Reporting a problem with CUDA”, but got no response. Any other location I should post/ask?
– did I miss the sticky post about that?

Lev · April 14, 2011, 7:00pm

Compiler was reworked in 3.2 and now it either contains new bugs, or your bugs at your program start to appears cause of different compilation. There are a lot of similar threads when program stopped working with 3.2 and work with 3.0-3.1, esp with using 2.0. Try use cuda 4.0 rc also use memcheck ocelot to hadle and locates errors.

Topic		Replies	Views
CUDA 3.2 results different from 3.1 CUDA Programming and Performance	13	1150	November 26, 2010
CUDA 3.2 Driver BROKE ? Oops.... CUDA Programming and Performance	20	11400	December 22, 2010
CUDA 3.2.16 not work on GTX 460 with xorg-1.9.2 possible CUDA driver bug CUDA Programming and Performance	0	6360	February 14, 2011
New runtime issues on cuda 3.2? CUDA Programming and Performance	3	11996	December 3, 2010
cuda 2.2 bug? CUDA Programming and Performance	29	19679	May 3, 2010
CUDA Toolkit 3.2 release candidate available to registered developers CUDA Programming and Performance	68	63242	December 3, 2010
problems with some cuda tk 3.0 sdk (VolumeRender and others) starting with cuda CUDA Programming and Performance	1	1343	March 28, 2010
CUDA 3.2 RC issues on 13 in macbook pro 2010 (320m) Upgrade to 3.2 RC breaks cuda on macbook pro 201 CUDA Programming and Performance	16	40886	November 18, 2010
CUDA 3.2 RC2 CUDA Programming and Performance	21	3213	November 10, 2010
Compiler bug on cuda 3.2 CUDA Programming and Performance	4	5341	December 29, 2010

Strange bug with cuda 3.2 same code fails on some hardware, runs fine with cuda 3.1

Related topics