PBO/VBO Derived memory + CUFFT errors

Hi,

It seems that under certain circumstances, performing a C2C 1D FFT with memory derived from a mapped PBO/VBO leads to sometimes bogus results. Attached is code that initializes GL, starts the CUDA GL interop, and performs a series of FFTs while verfying the results. The code stops and reports an error on the detection of any incorrect output from cuFFT. I have tested this code under Fedora 7 32bit on two different machines each running driver version 169.12 with GeForce 8700M GT cards and CUDA 1.1. I have tested a similar setup with the CUDA BLAS library and have seen no such errors.

To compile and run the code, do “sh test_fft.cu; ./test_fft” It assumes there is a working installation of GLEW, GLUT, CUDA, etc. etc.

Interestingly enough, if buffer objects are continuously unmapped after each FFT, (see lines 150-152) the problem is solved. Perhaps something is happening behind the scenes with respect to book keeping in either GL or the CUDA GL interop when the FFT code is involved? I might have also made a mistake somewhere in the code…

I’d be grateful for any help anyone could give!

Thanks!

#if 0

nvcc test_fft.cu -o test_fft -lcufft -lglut -lGLEW

exit

#endif

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <cuda.h>

#include <cufft.h>

#include <GL/glew.h>

#include <GL/gl.h>

#include <GL/glu.h>

#include <GL/glut.h>

#include <cuda_gl_interop.h>

enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 };

#define CUTIL_API

#define CUDA_SAFE_CALL_NO_SYNC(call) do {                               \

        cudaError err = call;                                           \

        if( cudaSuccess != err) {                                       \

            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.n", \

                    __FILE__, __LINE__, cudaGetErrorString( err) );     \

            exit(EXIT_FAILURE);                                         \

        } } while (0)

#define CUDA_SAFE_CALL(call) do {                                       \

        CUDA_SAFE_CALL_NO_SYNC(call);                                   \

        cudaError err = cudaThreadSynchronize();                        \

        if( cudaSuccess != err) {                                       \

            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.n", \

                    __FILE__, __LINE__, cudaGetErrorString( err) );     \

            exit(EXIT_FAILURE);                                         \

        } } while (0)

#define CUFFT_SAFE_CALL(call) do {                                      \

        cufftResult err = call;                                         \

        if( CUFFT_SUCCESS != err) {                                     \

            fprintf(stderr, "CUFFT error in file '%s' in line %i.n",    \

                    __FILE__, __LINE__);                                \

            exit(EXIT_FAILURE);                                         \

        } } while (0)

CUTBoolean CUTIL_API cutCheckErrorGL( const char* file, const int line) {

    CUTBoolean ret_val = CUTTrue;

    GLenum gl_error = glGetError();

    if (gl_error != GL_NO_ERROR)

        {

            fprintf(stderr, "GL Error in file '%s' in line %d :n", file, line);

            fprintf(stderr, "%sn", gluErrorString(gl_error));

            ret_val = CUTFalse;

        }

    return ret_val;

}

#define CUT_CHECK_ERROR_GL()                                \

    if( CUTFalse == cutCheckErrorGL( __FILE__, __LINE__)) { \

        exit(EXIT_FAILURE);                                 \

    }

/* computes simple C2C FFT of size s from in to out */

void fft_test( uint s, float *in, float *out ) {

   /* populate memory */

    uint bytes = sizeof( float ) * s * 2;

    float *h_in = (float *)malloc( bytes );

    h_in[ 0 ] = 0.0f; h_in[ 1 ] = 1.0f;

    for( uint j = 2; j < s * 2; j++ ) h_in[ j ] = 0.0f;

    CUDA_SAFE_CALL(cudaMemcpy( in, h_in, bytes, cudaMemcpyHostToDevice ));

    free( h_in );

   /* compute FFT */

    cufftHandle plan;

    CUFFT_SAFE_CALL(cufftPlan1d( &plan, s, CUFFT_C2C, 1 ));

    CUFFT_SAFE_CALL(cufftExecC2C( plan, (cufftComplex *)in,

                                  (cufftComplex *)out,

                                  CUFFT_FORWARD ));

    CUFFT_SAFE_CALL(cufftDestroy( plan ));

}

/* verifies correct result of FFT */

uint fft_verify( uint s, float *out ) {

   /* verify results */

    uint bytes = sizeof( float ) * s * 2;

    float *h_in = (float *)malloc( bytes );

    CUDA_SAFE_CALL(cudaMemcpy( h_in, out, bytes, cudaMemcpyDeviceToHost ));

    for( uint j = 0; j < s * 2; j++ ) {

        if( fabs( h_in[ j ] - j % 2 ) > 1e-3 ) {

            printf( "error: %2.2f at %d should be %2.2f\n",

                    h_in[ j ], j, (float)(j % 2) );

            return 0;

        }

    }

    free( h_in );

    return 1;

}

float *alloc_pbo( GLuint* pbo, int size )

{

    /* create buffer object */

    float *out;

    glGenBuffers(1, pbo);

    glBindBuffer(GL_ARRAY_BUFFER, *pbo);

    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_ARRAY_BUFFER, 0);

    CUDA_SAFE_CALL(cudaGLRegisterBufferObject( *pbo ));

    CUDA_SAFE_CALL(cudaGLMapBufferObject( (void**)&out, *pbo ));

    CUT_CHECK_ERROR_GL();

   return out;

}

void free_pbo( GLuint *pbo ) {

   CUDA_SAFE_CALL(cudaGLUnmapBufferObject( *pbo ));

    CUDA_SAFE_CALL(cudaGLUnregisterBufferObject( *pbo ));

    glDeleteBuffers( 1, (const GLuint *)pbo );

    CUT_CHECK_ERROR_GL();

}

int main( int argc,

          char *argv[] ) {

   /* create GL context, CUDA interop */

    glutInit( &argc, argv );

    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);

    glutInitWindowSize( 128, 128 );

    glutCreateWindow("CUFFT PBO Verification");

    glewInit();

    CUT_CHECK_ERROR_GL();

   /* run a few FFTs of various sizes */

    for( uint i = 200; i < 210; i++ ) {

       /* allocate PBO memory for FFT work */

        float *in, *out;

        GLuint in_pbo, out_pbo;

        uint bytes = sizeof( float ) * i * 2;

        in = alloc_pbo( &in_pbo, bytes );

        out = alloc_pbo( &out_pbo, bytes );

       /* compute and verify fft */

        fft_test( i, in, out );

        if( !fft_verify( i, out ) ) {

            printf( "Error computing FFT %d\n", i );

            break;

        }

       /* wipe pbo's - - if these are not commented out, no errors occur */

/*         free_pbo( &in_pbo ); */

/*         free_pbo( &out_pbo ); */

    }

}

On the contrary, “continuously unmapping after each FFT” is exactly what you should be doing. Mapping/unmapping is intended to be in the render loop. In fact, the buffer objects should not be accessed through OpenGL while mapped with CUDA; and the pointer passed back from cudaGLMapBufferObject should not be cached. It may change from one invocation to the next.

Reviewing the programming guide, it certainly could be more clear, but the simpleGL source code does the right thing.

////////////////////////////////////////////////////////////////////////////////

//! Run the Cuda part of the computation

////////////////////////////////////////////////////////////////////////////////

void runCuda( GLuint vbo)

{

    // map OpenGL buffer object for writing from CUDA

    float4 *dptr;

    CUDA_SAFE_CALL(cudaGLMapBufferObject( (void**)&dptr, vbo));

   // execute the kernel

    dim3 block(8, 8, 1);

    dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);

    kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, anim);

   // unmap buffer object

    CUDA_SAFE_CALL(cudaGLUnmapBufferObject( vbo));

}