You need to be a registered developer to file a bug. If you are not, you can apply at http://developer.nvidia.com/join-nvidia-registered-developer-program
Going back to the bug, the error is probably in your memory allocation.
This example does both an out-of-place and an in-place transform, and it gives the correct results on both CUDA 3.2 and 4.0 RC2:
$ cat rfft.cu
#include <stdio.h>
#include <stdlib.h>

#include "cufft.h"
/* Abort with a diagnostic if a CUDA runtime call does not return cudaSuccess. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Abort with a diagnostic if a cuFFT call does not return CUFFT_SUCCESS. */
#define CUFFT_CHECK(call)                                                   \
    do {                                                                    \
        cufftResult res_ = (call);                                          \
        if (res_ != CUFFT_SUCCESS) {                                        \
            fprintf(stderr, "cuFFT error %s:%d: code %d\n", __FILE__,       \
                    __LINE__, (int)res_);                                   \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Round-trips an N-point real ramp signal (0, 1, ..., N-1) through a 1D
 * real-to-complex FFT followed by a complex-to-real FFT, first out of place
 * and then in place, and prints the raw and the 1/N-normalized result of each.
 *
 * The transforms are unnormalized, so the raw round-trip result is N times the
 * original signal; dividing by N recovers it.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation, CUDA or
 * cuFFT error.
 */
int main()
{
    const int N = 8;
    /* An N-point R2C transform produces N/2+1 complex values. */
    const int numComplex = N / 2 + 1;
    /* An in-place real buffer must be able to hold the complex output,
       i.e. 2*(N/2+1) reals (N+2 when N is even). */
    const int paddedReal = 2 * numComplex;

    cufftReal *input, *input_d;
    cufftReal *inplace, *inplace_d;
    cufftComplex *outofplace_d;
    cufftHandle r2cplan, c2rplan;
    int i;

    // Allocate memory
    input = (cufftReal*) malloc(sizeof(cufftReal) * N);
    inplace = (cufftReal*) malloc(sizeof(cufftReal) * paddedReal);
    if (input == NULL || inplace == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(input);
        free(inplace);
        return EXIT_FAILURE;
    }
    CUDA_CHECK(cudaMalloc((void**)&input_d, sizeof(cufftReal) * N));
    CUDA_CHECK(cudaMalloc((void**)&inplace_d, sizeof(cufftReal) * paddedReal));
    CUDA_CHECK(cudaMalloc((void**)&outofplace_d, sizeof(cufftComplex) * numComplex));

    // Out of place transform
    for (i = 0; i < N; i++) input[i] = (float)i;
    printf (" Initial signal\n");
    for (i = 0; i < N; i++) printf("%d %f\n", i, input[i]);
    CUDA_CHECK(cudaMemcpy(input_d, input, sizeof(cufftReal) * N, cudaMemcpyHostToDevice));
    // Clear the host copy so the values printed later can only come from the device.
    for (i = 0; i < N; i++) input[i] = 0.0f;

    CUFFT_CHECK(cufftPlan1d(&r2cplan, N, CUFFT_R2C, 1));
    CUFFT_CHECK(cufftPlan1d(&c2rplan, N, CUFFT_C2R, 1));

    CUFFT_CHECK(cufftExecR2C(r2cplan, input_d, outofplace_d));
    CUFFT_CHECK(cufftExecC2R(c2rplan, outofplace_d, input_d));
    CUDA_CHECK(cudaMemcpy(input, input_d, sizeof(cufftReal) * N, cudaMemcpyDeviceToHost));

    printf (" Out of place transform \n");
    printf (" Signal Normalized signal\n");
    for (i = 0; i < N; i++) printf("%d %f %f\n", i, input[i], input[i] / N);

    // Inplace transform
    for (i = 0; i < N; i++) inplace[i] = (float)i;
    // Only the first N reals carry signal; the padding is just room for the complex output.
    CUDA_CHECK(cudaMemcpy(inplace_d, inplace, sizeof(cufftReal) * N, cudaMemcpyHostToDevice));
    for (i = 0; i < N; i++) inplace[i] = 0.0f;

    // The same device buffer is passed as both input and output of each transform.
    CUFFT_CHECK(cufftExecR2C(r2cplan, inplace_d, (cufftComplex*)inplace_d));
    CUFFT_CHECK(cufftExecC2R(c2rplan, (cufftComplex*)inplace_d, inplace_d));
    CUDA_CHECK(cudaMemcpy(inplace, inplace_d, sizeof(cufftReal) * N, cudaMemcpyDeviceToHost));

    printf (" In place transform \n");
    printf (" Signal Normalized signal\n");
    for (i = 0; i < N; i++) printf("%d %f %f\n", i, inplace[i], inplace[i] / N);

    // Release plans, host buffers and device buffers
    CUFFT_CHECK(cufftDestroy(r2cplan));
    CUFFT_CHECK(cufftDestroy(c2rplan));
    free(input);
    free(inplace);
    CUDA_CHECK(cudaFree(input_d));
    CUDA_CHECK(cudaFree(inplace_d));
    CUDA_CHECK(cudaFree(outofplace_d));

    return 0;
}
If you compile the code with
nvcc rfft.cu -L/usr/local/cuda/lib64 -lcufft
and run it, the output is correct:
Initial signal
0 0.000000
1 1.000000
2 2.000000
3 3.000000
4 4.000000
5 5.000000
6 6.000000
7 7.000000
Out of place transform
Signal Normalized signal
0 0.000000 0.000000
1 8.000000 1.000000
2 16.000000 2.000000
3 24.000000 3.000000
4 32.000000 4.000000
5 40.000000 5.000000
6 48.000000 6.000000
7 56.000000 7.000000
In place transform
Signal Normalized signal
0 0.000000 0.000000
1 8.000000 1.000000
2 16.000000 2.000000
3 24.000000 3.000000
4 32.000000 4.000000
5 40.000000 5.000000
6 48.000000 6.000000
7 56.000000 7.000000