in-place FFT fails on 2D real transform

dear all:

I try to do in-place FFT (real to complex, complex to real), 

It is O.K. for 1D, but fails on 2D and 3D.

moreover complex-to-complex in-place transform is also O.K.

The following code is in-place forward transform of 2D

[codebox]#ifdef DO_DOUBLE

typedef double doublereal;

typedef cufftDoubleComplex Complex; 

#else

typedef float doublereal;

typedef cufftComplex  Complex; 

#endif

void lsc_2DFFTF_R2C( const unsigned int Nx, const unsigned int Ny,

				 doublereal *h_idata )

{

cufftHandle plan ;

doublereal  *d_idata ;

size_t   N     = Nx * Ny ;

size_t   N_pad = Nx * ((Ny >> 1) + 1) ;

cufftResult  flag ;

// step 1: transfer data to device

cutilSafeCall( cudaMalloc((void**)&d_idata, sizeof(Complex   )*N_pad ) );

CUDA_SAFE_CALL(cudaMemcpy(d_idata, h_idata, sizeof(doublereal)*N, cudaMemcpyHostToDevice) );

// step 2: Create a 2D FFT plan.

#if defined (DO_DOUBLE)

cufftPlan2d(&plan, Nx, Ny, CUFFT_D2Z );

#else

cufftPlan2d(&plan, Nx, Ny, CUFFT_R2C );

#endif

// step 3: Use the CUFFT plan to transform the signal in-place.

#if defined (DO_DOUBLE)

flag = cufftExecD2Z( plan, d_idata, (cufftDoubleComplex *)d_idata );

#else

flag = cufftExecR2C( plan, d_idata, (cufftComplex *) d_idata ) ;

#endif

if ( CUFFT_SUCCESS != flag ){

	printf("2D: cufftExecR2C fails\n");

}

// make sure that all threads are done

cudaThreadSynchronize();

// step 4: copy data to host

CUDA_SAFE_CALL(cudaMemcpy(h_idata, d_idata, sizeof(Complex)*N_pad, cudaMemcpyDeviceToHost) );

// Destroy the CUFFT plan.

cufftDestroy(plan);

cudaFree(d_idata);

}

[/codebox]

However if I use out-place, it is O.K.

The following code is out-place forward transform of 2D

[codebox]void lsc_2DFFTF_R2C( const unsigned int Nx, const unsigned int Ny,

				 doublereal *h_idata )

{

cufftHandle plan ;

doublereal  *d_idata ;

Complex  *d_odata ;

size_t   N     = Nx * Ny ;

size_t   N_pad = Nx * ((Ny >> 1) + 1) ;

cufftResult  flag ;

// step 1: transfer data to device

cutilSafeCall( cudaMalloc((void**)&d_idata, sizeof(Complex   )*N_pad ) );

CUDA_SAFE_CALL(cudaMemcpy(d_idata, h_idata, sizeof(doublereal)*N, cudaMemcpyHostToDevice) );

cutilSafeCall( cudaMalloc((void**)&d_odata, sizeof(Complex)*N_pad ) );

// step 2: Create a 2D FFT plan.

#if defined (DO_DOUBLE)

cufftPlan2d(&plan, Nx, Ny, CUFFT_D2Z );

#else

cufftPlan2d(&plan, Nx, Ny, CUFFT_R2C );

#endif

// step 3: Use the CUFFT plan to transform the signal in-place.

#if defined (DO_DOUBLE)

flag = cufftExecD2Z( plan, d_idata, (cufftDoubleComplex *)d_odata );

#else

flag = cufftExecR2C( plan, d_idata, (cufftComplex *) d_odata ) ;

#endif

if ( CUFFT_SUCCESS != flag ){

	printf("2D: cufftExecR2C fails\n");

}

// make sure that all threads are done

cudaThreadSynchronize();

// step 4: copy data to host

CUDA_SAFE_CALL(cudaMemcpy(h_idata, d_odata, sizeof(Complex)*N_pad, cudaMemcpyDeviceToHost) );

// Destroy the CUFFT plan.

cufftDestroy(plan);

cudaFree(d_idata);

}

[/codebox]

does someone use in-place real transform successfully?

ps: my platform is winxp pro64, vc2005, cuda 2.3, driver 190.38, GTX295