Very strange behaviour in very simple application

Alessandro · May 8, 2008, 1:31pm

Hello folks,

lately I’ve been working on a very simple CUDA/OpenGL program, and I’m going mad to figure out what’s wrong with it.

If I run my program in device emulation, I get the correct result:

[attachment=6317:attachment]

But when I run in hardware mode (i.e. non emulated), I get this seemingly senseless output:

[attachment=6318:attachment]

As I said, the code is very, very simple:

#include <GL/glew.h>

#include <GL/glut.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <cuda_gl_interop.h>

#define RESOLUTION_X    800

#define RESOLUTION_Y    600

typedef unsigned int    uint;

uint    buffer;

uint    tex;

uchar4  *data;

void idle();

void draw();

void initGLUT(int argc, char** argv)

{

    glutInit( &argc, argv );

    glutInitDisplayMode( GLUT_RGBA | GLUT_DOUBLE );

    glutInitWindowSize( RESOLUTION_X, RESOLUTION_Y );

    glutCreateWindow( "AlphaVox" );

    

    glutIdleFunc( idle );

    glutDisplayFunc( draw );

}

void initGLEW()

{

    glewInit();

    

    if ( !glewIsSupported( "GL_EXT_pixel_buffer_object" ) )

        exit(1);

}

void initGL()

{

    int     size;

    

    glGenBuffers( 1, &buffer );

    glGenTextures( 1, &tex );

    

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, buffer );

    glBufferData( GL_PIXEL_UNPACK_BUFFER, RESOLUTION_X * RESOLUTION_Y * 4, NULL, GL_DYNAMIC_DRAW );

    glGetBufferParameteriv( GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE, &size );

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, 0 );

    

    glBindTexture( GL_TEXTURE_2D, tex );

    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );

    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );

    

    glEnable( GL_TEXTURE_2D );

    

    cudaGLRegisterBufferObject( buffer );

}

void idle() {

    glutPostRedisplay();

}

__global__ void sampleDrawing(uchar4* output)

{

    int     y = blockIdx.y * blockDim.y + threadIdx.y;

    int     x = blockIdx.x * blockDim.x + threadIdx.x;

    

    __syncthreads();

    

    output[ y * RESOLUTION_X + x ] = make_uchar4( threadIdx.x * 16, threadIdx.y * 32, 0, 1 );

}

void draw()

{

    dim3    blockSize( 16, 8 );

    dim3    gridSize( RESOLUTION_X / 16, RESOLUTION_Y / 8 );

    

    cudaGLMapBufferObject( (void**) &data, buffer );

    sampleDrawing<<< gridSize, blockSize >>>( data );

    cudaGLUnmapBufferObject( buffer );

    

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, buffer );

    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA, RESOLUTION_X, RESOLUTION_Y, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0 );

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, 0 );

    

    glBegin( GL_QUADS );

    glTexCoord2i( 0, 0 ); glVertex2i( -1, -1 );

    glTexCoord2i( 1, 0 ); glVertex2i( 1, -1 );

    glTexCoord2i( 1, 1 ); glVertex2i( 1, 1 );

    glTexCoord2i( 0, 1 ); glVertex2i( -1, 1 );

    glEnd();

   glutSwapBuffers();

}

void shutdown()

{

    cudaGLUnregisterBufferObject( buffer );

    

    glDeleteBuffers( 1, &buffer );

}

int main(int argc, char** argv)

{

    initGLUT( argc, argv );

    initGLEW();

    initGL();

    

    glutMainLoop();

    

    shutdown();

}

My computer is a Sony Vaio laptop with a 8400M GT device, with 128 MB of dedicated memory. I know it’s quite a low amount of memory (no wonder the majority of SDK samples doesn’t run properly), but it shouldn’t affect this very basic application.

Alessandro · May 8, 2008, 2:20pm

Hello, I’ve tried to “restrict” the context of the error and I’ve found the following interesting result.

As I pasted above, I have a simple kernel that fills a pixel buffer object, that then is rendered onto the screen through a screen quad. I create one thread per output pixel. If I use the following kernel:

__global__ void sampleDrawing(uchar4* output)

{

    int    y = blockIdx.y * blockDim.y + threadIdx.y;

    int    x = blockIdx.x * blockDim.x + threadIdx.x;

	

    __syncthreads();

	

    if ( ( x == y ) || ( x == RESOLUTION_Y - y ) )

        output[ y * RESOLUTION_X + x ] = make_uchar4( 0, 0, 255, 1 );

    else

        output[ y * RESOLUTION_X + x ] = make_uchar4( x / 2, y / 2, 0, 1 );

}

I have the following CORRECT result:

[attachment=6319:attachment]

However, if I use the following kernel (the only modification is the “|| true” in the if):

__global__ void sampleDrawing(uchar4* output)

{

    int    y = blockIdx.y * blockDim.y + threadIdx.y;

    int    x = blockIdx.x * blockDim.x + threadIdx.x;

	

    __syncthreads();

	

    if ( ( x == y ) || ( x == RESOLUTION_Y - y ) || true )

        output[ y * RESOLUTION_X + x ] = make_uchar4( 0, 0, 255, 1 );

    else

        output[ y * RESOLUTION_X + x ] = make_uchar4( x / 2, y / 2, 0, 1 );

}

I have the following completely wrong output:

[attachment=6320:attachment]

Now, what the hell is this? :( :( :(

Alessandro · May 8, 2008, 2:42pm

Solved!!!

I’ve just installed driver, toolkit and sdk of CUDA 2.0, and now that’s all OK… So it probably was a toolkit bug.