CUDA FFT: non-deterministic behavior and segmentation fault

I have this code to compute the FFT and the inverse FFT of a float signal:

cufftHandle plan;
float *src = new float[N];
float *fft = new float[N];
float *ifft = new float[N];
cufftComplex *dev_src, *dev_fft, *dev_ifft;
int size = N * sizeof(cufftComplex);

//Allocate memory on GPU
cudaMalloc((void**)&dev_src,size);
cudaMalloc((void**)&dev_fft,size);
cudaMalloc((void**)&dev_ifft,size);

// Initalize signal
for (int i = 0; i < N; ++i) {
src[i] = rand() / (float)RAND_MAX;
    //src[i].y = 0;
}

// Copy host memory to device
cudaMemcpy(dev_src,src,size,cudaMemcpyHostToDevice);

// CUFFT plan
cufftPlan1d(&plan,N,CUFFT_C2C,BATCH);

// Transform signal
printf(“Transforming signal cufftExecC2C\n”);
cufftExecC2C(plan, dev_src, dev_fft, CUFFT_FORWARD);

// Transform signal back
printf(“Transforming signal back cufftExecC2C\n”);
cufftExecC2C(plan, dev_fft, dev_ifft, CUFFT_INVERSE);

// Copy device memory to host
cudaMemcpy(fft,dev_fft,size,cudaMemcpyDeviceToHost);
cudaMemcpy(ifft,dev_ifft,size,cudaMemcpyDeviceToHost);

// Display results
for (int i = 0; i < N; ++i) {
ifft[i] = ifft[i] / (float) N;
printf(“Signal = %f \t FFT = %f \t IFFT = %f \n”, src[i],fft[i],ifft[i]);
}

//Destroy CUFFT context
cufftDestroy(plan);

// cleanup memory
free(src);
free(fft);
free(ifft);
cudaFree(dev_src);
cudaFree(dev_fft);
cudaFree(dev_ifft);

I have 2 main problems:

  • the execution is not deterministic: sometimes the signal and the IFFT output are equal and sometimes they are not
  • sometimes I get “Segmentation fault: 11”

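The problem is a type mismatch. The byte count passed to cudaMemcpy must match the element type of the host buffer: copying N * sizeof(cufftComplex) bytes to or from a host array of N floats transfers twice as many bytes as the array holds and runs past its end, which explains both the non-deterministic results and the segmentation fault. Either size the copy with the float type, or declare the host buffers as cufftComplex so that they match the device buffers.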