Very strange behaviour in very simple application

Hello folks,

lately I’ve been working on a very simple CUDA/OpenGL program, and I’m going mad to figure out what’s wrong with it.

If I run my program in device emulation, I get the correct result:

[attachment=6317:attachment]

But when I run in hardware mode (i.e. non emulated), I get this seemingly senseless output:

[attachment=6318:attachment]

As I said, the code is very, very simple:

#include <GL/glew.h>

#include <GL/glut.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <cuda_gl_interop.h>

#define RESOLUTION_X    800

#define RESOLUTION_Y    600

typedef unsigned int    uint;

uint    buffer;

uint    tex;

uchar4  *data;

void idle();

void draw();

void initGLUT(int argc, char** argv)

{

    glutInit( &argc, argv );

    glutInitDisplayMode( GLUT_RGBA | GLUT_DOUBLE );

    glutInitWindowSize( RESOLUTION_X, RESOLUTION_Y );

    glutCreateWindow( "AlphaVox" );

    

    glutIdleFunc( idle );

    glutDisplayFunc( draw );

}

void initGLEW()

{

    glewInit();

    

    if ( !glewIsSupported( "GL_EXT_pixel_buffer_object" ) )

        exit(1);

}

void initGL()

{

    int     size;

    

    glGenBuffers( 1, &buffer );

    glGenTextures( 1, &tex );

    

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, buffer );

    glBufferData( GL_PIXEL_UNPACK_BUFFER, RESOLUTION_X * RESOLUTION_Y * 4, NULL, GL_DYNAMIC_DRAW );

    glGetBufferParameteriv( GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &size );

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, 0 );

    

    glBindTexture( GL_TEXTURE_2D, tex );

    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );

    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );

    

    glEnable( GL_TEXTURE_2D );

    

    cudaGLRegisterBufferObject( buffer );

}

void idle() {

    glutPostRedisplay();

}

__global__ void sampleDrawing(uchar4* output)

{

    int     y = blockIdx.y * blockDim.y + threadIdx.y;

    int     x = blockIdx.x * blockDim.x + threadIdx.x;

    

    __syncthreads();

    

    output[ y * RESOLUTION_X + x ] = make_uchar4( threadIdx.x * 16, threadIdx.y * 32, 0, 1 );

}

void draw()

{

    dim3    blockSize( 16, 8 );

    dim3    gridSize( RESOLUTION_X / 16, RESOLUTION_Y / 8 );

    

    cudaGLMapBufferObject( (void**) &data, buffer );

    sampleDrawing<<< gridSize, blockSize >>>( data );

    cudaGLUnmapBufferObject( buffer );

    

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, buffer );

    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA, RESOLUTION_X, RESOLUTION_Y, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0 );

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, 0 );

    

    glBegin( GL_QUADS );

    glTexCoord2i( 0, 0 ); glVertex2i( -1, -1 );

    glTexCoord2i( 1, 0 ); glVertex2i( 1, -1 );

    glTexCoord2i( 1, 1 ); glVertex2i( 1, 1 );

    glTexCoord2i( 0, 1 ); glVertex2i( -1, 1 );

    glEnd();

   glutSwapBuffers();

}

void shutdown()

{

    cudaGLUnregisterBufferObject( buffer );

    

    glDeleteBuffers( 1, &buffer );

}

int main(int argc, char** argv)

{

    initGLUT( argc, argv );

    initGLEW();

    initGL();

    

    glutMainLoop();

    

    shutdown();

}

My computer is a Sony Vaio laptop with a 8400M GT device, with 128 MB of dedicated memory. I know it’s quite a low amount of memory (no wonder the majority of SDK samples doesn’t run properly), but it shouldn’t affect this very basic application.
nonemulated.png
emulation.png

Hello, I’ve tried to “restrict” the context of the error and I’ve found the following interesting result.

As I pasted above, I have a simple kernel that fills a pixel buffer object, that then is rendered onto the screen through a screen quad. I create one thread per output pixel. If I use the following kernel:

__global__ void sampleDrawing(uchar4* output)

{

    int    y = blockIdx.y * blockDim.y + threadIdx.y;

    int    x = blockIdx.x * blockDim.x + threadIdx.x;

	

    __syncthreads();

	

    if ( ( x == y ) || ( x == RESOLUTION_Y - y ) )

        output[ y * RESOLUTION_X + x ] = make_uchar4( 0, 0, 255, 1 );

    else

        output[ y * RESOLUTION_X + x ] = make_uchar4( x / 2, y / 2, 0, 1 );

}

I have the following CORRECT result:

[attachment=6319:attachment]

However, if I use the following kernel (the only modification is the “|| true” in the if):

__global__ void sampleDrawing(uchar4* output)

{

    int    y = blockIdx.y * blockDim.y + threadIdx.y;

    int    x = blockIdx.x * blockDim.x + threadIdx.x;

	

    __syncthreads();

	

    if ( ( x == y ) || ( x == RESOLUTION_Y - y ) || true )

        output[ y * RESOLUTION_X + x ] = make_uchar4( 0, 0, 255, 1 );

    else

        output[ y * RESOLUTION_X + x ] = make_uchar4( x / 2, y / 2, 0, 1 );

}

I have the following completely wrong output:

[attachment=6320:attachment]

Now, what the hell is this? :( :( :(
wrong.png
cross.png

Solved!!!

I’ve just installed driver, toolkit and sdk of CUDA 2.0, and now that’s all OK… So it probably was a toolkit bug.