Hi,
It seems that under certain circumstances, performing a C2C 1D FFT with memory derived from a mapped PBO/VBO leads to sometimes bogus results. Attached is code that initializes GL, starts the CUDA GL interop, and performs a series of FFTs while verfying the results. The code stops and reports an error on the detection of any incorrect output from cuFFT. I have tested this code under Fedora 7 32bit on two different machines each running driver version 169.12 with GeForce 8700M GT cards and CUDA 1.1. I have tested a similar setup with the CUDA BLAS library and have seen no such errors.
To compile and run the code, do “sh test_fft.cu; ./test_fft” It assumes there is a working installation of GLEW, GLUT, CUDA, etc. etc.
Interestingly enough, if buffer objects are continuously unmapped after each FFT, (see lines 150-152) the problem is solved. Perhaps something is happening behind the scenes with respect to book keeping in either GL or the CUDA GL interop when the FFT code is involved? I might have also made a mistake somewhere in the code…
I’d be grateful for any help anyone could give!
Thanks!
#if 0
nvcc test_fft.cu -o test_fft -lcufft -lglut -lGLEW
exit
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cufft.h>
#include <GL/glew.h>
#include <GL/gl.h>
#include <GL/glu.h>
#include <GL/glut.h>
#include <cuda_gl_interop.h>
enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 };
#define CUTIL_API
#define CUDA_SAFE_CALL_NO_SYNC(call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
#define CUDA_SAFE_CALL(call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
#define CUFFT_SAFE_CALL(call) do { \
cufftResult err = call; \
if( CUFFT_SUCCESS != err) { \
fprintf(stderr, "CUFFT error in file '%s' in line %i.n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} } while (0)
CUTBoolean CUTIL_API cutCheckErrorGL( const char* file, const int line) {
CUTBoolean ret_val = CUTTrue;
GLenum gl_error = glGetError();
if (gl_error != GL_NO_ERROR)
{
fprintf(stderr, "GL Error in file '%s' in line %d :n", file, line);
fprintf(stderr, "%sn", gluErrorString(gl_error));
ret_val = CUTFalse;
}
return ret_val;
}
#define CUT_CHECK_ERROR_GL() \
if( CUTFalse == cutCheckErrorGL( __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
/* computes simple C2C FFT of size s from in to out */
void fft_test( uint s, float *in, float *out ) {
/* populate memory */
uint bytes = sizeof( float ) * s * 2;
float *h_in = (float *)malloc( bytes );
h_in[ 0 ] = 0.0f; h_in[ 1 ] = 1.0f;
for( uint j = 2; j < s * 2; j++ ) h_in[ j ] = 0.0f;
CUDA_SAFE_CALL(cudaMemcpy( in, h_in, bytes, cudaMemcpyHostToDevice ));
free( h_in );
/* compute FFT */
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan1d( &plan, s, CUFFT_C2C, 1 ));
CUFFT_SAFE_CALL(cufftExecC2C( plan, (cufftComplex *)in,
(cufftComplex *)out,
CUFFT_FORWARD ));
CUFFT_SAFE_CALL(cufftDestroy( plan ));
}
/* verifies correct result of FFT */
uint fft_verify( uint s, float *out ) {
/* verify results */
uint bytes = sizeof( float ) * s * 2;
float *h_in = (float *)malloc( bytes );
CUDA_SAFE_CALL(cudaMemcpy( h_in, out, bytes, cudaMemcpyDeviceToHost ));
for( uint j = 0; j < s * 2; j++ ) {
if( fabs( h_in[ j ] - j % 2 ) > 1e-3 ) {
printf( "error: %2.2f at %d should be %2.2f\n",
h_in[ j ], j, (float)(j % 2) );
return 0;
}
}
free( h_in );
return 1;
}
float *alloc_pbo( GLuint* pbo, int size )
{
/* create buffer object */
float *out;
glGenBuffers(1, pbo);
glBindBuffer(GL_ARRAY_BUFFER, *pbo);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
CUDA_SAFE_CALL(cudaGLRegisterBufferObject( *pbo ));
CUDA_SAFE_CALL(cudaGLMapBufferObject( (void**)&out, *pbo ));
CUT_CHECK_ERROR_GL();
return out;
}
void free_pbo( GLuint *pbo ) {
CUDA_SAFE_CALL(cudaGLUnmapBufferObject( *pbo ));
CUDA_SAFE_CALL(cudaGLUnregisterBufferObject( *pbo ));
glDeleteBuffers( 1, (const GLuint *)pbo );
CUT_CHECK_ERROR_GL();
}
int main( int argc,
char *argv[] ) {
/* create GL context, CUDA interop */
glutInit( &argc, argv );
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutInitWindowSize( 128, 128 );
glutCreateWindow("CUFFT PBO Verification");
glewInit();
CUT_CHECK_ERROR_GL();
/* run a few FFTs of various sizes */
for( uint i = 200; i < 210; i++ ) {
/* allocate PBO memory for FFT work */
float *in, *out;
GLuint in_pbo, out_pbo;
uint bytes = sizeof( float ) * i * 2;
in = alloc_pbo( &in_pbo, bytes );
out = alloc_pbo( &out_pbo, bytes );
/* compute and verify fft */
fft_test( i, in, out );
if( !fft_verify( i, out ) ) {
printf( "Error computing FFT %d\n", i );
break;
}
/* wipe pbo's - - if these are not commented out, no errors occur */
/* free_pbo( &in_pbo ); */
/* free_pbo( &out_pbo ); */
}
}