I am currently struggling to get cuFFT complex-to-complex (C2C) transforms to work. Real-to-complex (R2C) FFTs work fine, but when I run C2C FFTs the result is not always the same (it is non-deterministic).
To be more precise, sometimes the result is correct, and sometimes not!
What am I doing wrong?
Any help would be greatly appreciated,
Here is my code:
// Devices taking part in the multi-GPU FFT: one device id per entry.
// Both entries must be initialized; the original assigned index 0 twice,
// leaving whichGPUs[1] indeterminate, so the id handed to cufftXtSetGPUs
// could change from run to run.
int nGPUs = 2;
int whichGPUs[2];
whichGPUs[0] = 0;
whichGPUs[1] = 1;
cufftHandle plan_fft_F1_2_gpus;
result = cufftCreate(&plan_fft_F1_2_gpus);
result = cufftXtSetGPUs (plan_fft_F1_2_gpus, nGPUs, whichGPUs);
size_t F1_worksize[2];
if (cufftMakePlan1d(plan_fft_F1_2_gpus, apod_factor_F1*F1_NPoints, CUFFT_C2C,
apod_factor_F2*F2_NPoints/2+1, F1_worksize)!=CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error : Plan creation failed");
cufftComplex *h_data_output_F1_F2; // the final result
cudaLibXtDesc *d_data_input_F1, *d_data_output_F1;
h_data_output_F1_F2 = (cufftComplex*) malloc(sizeof(cufftComplex)*
//Malloc datas on GPU
if(cufftXtMalloc (plan_fft_F1_2_gpus, &d_data_input_F1, CUFFT_XT_FORMAT_INPLACE)
fprintf(stderr, "CUFFT error1 : Malloc failed");
return 1;
if(cufftXtMalloc (plan_fft_F1_2_gpus, &d_data_output_F1, CUFFT_XT_FORMAT_INPLACE)
fprintf(stderr, "CUFFT error2 : Malloc failed");
return 1;
//F1_data_input comes from previously executed code (not displayed here) and contains the data
if (cufftXtMemcpy (plan_fft_F1_2_gpus, d_data_input_F1,
fprintf(stderr, "CUFFT error : Memcopy failed");
return 1;
if(cufftXtExecDescriptorC2C(plan_fft_F1_2_gpus, d_data_input_F1,
d_data_output_F1, CUFFT_FORWARD)!= CUFFT_SUCCESS){
fprintf(stderr, "Cuda error : failed to execute plan\n");
return 1;
if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return 1;
if (cufftXtMemcpy(plan_fft_F1_2_gpus, h_data_output_F1_F2, d_data_output_F1,
fprintf(stderr, "CUFFT error : Memcpy failed");
return 1;
//...(Displaying the results in h_data_output_F1_F2 and more)...