CUFFT on two GPUs (bug)

Hi,

I am actually struggling to make cufft C2C work, and even if I am able to make FFTs from real to complex (R2C), when I try FFTs from complex to complex the result isn’t always the same (non deterministic).

To be more precise, sometimes the result is correct, and sometimes not!

What am I doing wrong?

Any help would be greatly appreciated,

Here is my code :

//...

int nGPUs = 2;
int whichGPUs[2];
whichGPUs[0] = 0;
whichGPUs[0] = 1;

cufftHandle plan_fft_F1_2_gpus;
result = cufftCreate(&plan_fft_F1_2_gpus);

result = cufftXtSetGPUs (plan_fft_F1_2_gpus, nGPUs, whichGPUs);

size_t F1_worksize[2];

if (cufftMakePlan1d(plan_fft_F1_2_gpus, apod_factor_F1*F1_NPoints, CUFFT_C2C, 
apod_factor_F2*F2_NPoints/2+1, F1_worksize)!=CUFFT_SUCCESS){
                        fprintf(stderr, "CUFFT error : Plan creation failed");
                }

cufftComplex *h_data_output_F1_F2; // the final result
cudaLibXtDesc *d_data_input_F1, *d_data_output_F1;

h_data_output_F1_F2 = (cufftComplex*) malloc(sizeof(cufftComplex)*
                        apod_factor_F1*F1_NPoints*((apod_factor_F2*F2_NPoints)/2+1));

//Malloc datas on GPU

if(cufftXtMalloc (plan_fft_F1_2_gpus, &d_data_input_F1, CUFFT_XT_FORMAT_INPLACE)
                           !=CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error1 : Malloc failed");
        return 1;
}

if(cufftXtMalloc (plan_fft_F1_2_gpus, &d_data_output_F1, CUFFT_XT_FORMAT_INPLACE)
                           !=CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error2 : Malloc failed");
        return 1;
}

//F1_data_input comes from previously executed code (not displayed here) and contains the data

if (cufftXtMemcpy (plan_fft_F1_2_gpus, d_data_input_F1,
        F1_data_input, CUFFT_COPY_HOST_TO_DEVICE) != CUFFT_SUCCESS) {
                        fprintf(stderr, "CUFFT error : Memcopy failed");
                        return 1;
}

if(cufftXtExecDescriptorC2C(plan_fft_F1_2_gpus, d_data_input_F1,
                            d_data_output_F1, CUFFT_FORWARD)!= CUFFT_SUCCESS){
        fprintf(stderr, "Cuda error : failed to execute plan\n");
        return 1;
}

if (cudaDeviceSynchronize() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to synchronize\n");
        return 1;
}

if (cufftXtMemcpy(plan_fft_F1_2_gpus, h_data_output_F1_F2, d_data_output_F1, 
CUFFT_COPY_DEVICE_TO_HOST)!=CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error : Memcpy failed");
        return 1;
}

cufftDestroy(plan_fft_F1_2_gpus);
cufftXtFree(d_data_input_F1);
cufftXtFree(d_data_output_F1);

//...(Displaying the results in h_data_output_F1_F2 and more)...

cross posted (problem seems to be resolved?):

http://stackoverflow.com/questions/43830132/cufft-xt-why-does-the-result-changes-each-time