The multi-gpu fft 3D R2C problem

Sorry to disturb, but recently I want to try multi-GPU FFT on two 1080GPU cards.
Before, I have tried the sample program named simpleCUFFT_2d_MGPU.cu.
Now I want to do myself problem that is a 3D R2C FFT, and I have read the CUFFT LIBRARY USER’S GUIDE many times, but can not run the program.

The code is there :

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufftXt.h>

#define N 512
#define NSUM N*N*N
#define NSUM2 N*N*(N/2+1)

int main(int argc, char **argv)
{

	int GPU_N;
	cudaGetDeviceCount(&GPU_N);
	int nGPUs = 2;
	int *whichGPUs;
	whichGPUs = (int*) malloc(sizeof(int) * nGPUs);
	cufftReal *f = (cufftReal*) malloc(sizeof(cufftReal) * NSUM);
	cufftComplex *h_d_out = (cufftComplex *) malloc(sizeof(cufftComplex) * NSUM2);
	for (int i = 0; i < NSUM; i++) {
		f[i] = i % 5000 ;
	}

	cufftResult result;
	cufftHandle planComplex;
	size_t* worksize;
	cudaLibXtDesc *d_f , *d_out;
	worksize = (size_t*) malloc(sizeof(size_t) * nGPUs);

	result = cufftCreate(&planComplex);
	result = cufftXtSetGPUs(planComplex, nGPUs, whichGPUs);
	result = cufftMakePlan3d(planComplex, N, N, N, CUFFT_R2C, worksize);
	printf("check 1 : %d \n",result);
	result = cufftXtMalloc(planComplex, (cudaLibXtDesc **) &d_f, CUFFT_XT_FORMAT_INPUT);
	result = cufftXtMalloc(planComplex, (cudaLibXtDesc **) &d_out, CUFFT_XT_FORMAT_OUTPUT);
	printf("check 2 : %d \n",result);

	result = cufftXtMemcpy(planComplex, d_f, f, CUFFT_COPY_HOST_TO_DEVICE);
	printf("check 3 : %d \n",result);
	result = cufftXtExecDescriptorR2C(planComplex, d_f, d_out);
	printf("check 4 : %d \n",result);

	result = cufftXtMemcpy(planComplex, h_d_out, d_out, CUFFT_COPY_DEVICE_TO_HOST);
	printf("check 5 : %d \n",result);
	result = cufftXtFree(d_f);
	result = cufftDestroy(planComplex);

	free(h_d_out);
	free(worksize);

	return 0;
}

The print info about the result is :

check 1 : 0 
check 2 : 0 
check 3 : 5 
check 4 : 5 
check 5 : 5

I find the error code means : Driver or internal cuFFT library error
And I also can not find the detail info about the Parameter of cufftXtSubFormat.
Can anyone give detail information about the four types of format :
CUFFT_XT_FORMAT_INPUT
CUFFT_XT_FORMAT_OUTPUT
CUFFT_XT_FORMAT_INPLACE
CUFFT_XT_FORMAT_INPLACE_SHUFFLED

One problem is you’re not setting whichGPUs anywhere. It’s not enough to just allocate space. You have to provide numerical values.

there is a fully worked 3D 2-gpu example in the cufft documentation:

https://docs.nvidia.com/cuda/cufft/index.html#threed-complex-to-complex-xt-transforms

when I build a complete code out of that, it runs without error for me:

$ cat t1503.cu
#include <cufftXt.h>
#include <cufft.h>
#include <stdio.h>

void test(){
// Demonstrate how to use CUFFT to perform 3-d FFTs using 2 GPUs
//
// cufftCreate() - Create an empty plan


    cufftHandle plan_input; cufftResult result;
    result = cufftCreate(&plan_input);
    if (result != CUFFT_SUCCESS) { printf ("*Create failed\n"); return; }
//
// cufftXtSetGPUs() - Define which GPUs to use
    int nGPUs = 2, whichGPUs[2];
    whichGPUs[0] = 0; whichGPUs[1] = 1;
    result = cufftXtSetGPUs (plan_input, nGPUs, whichGPUs);
    if (result != CUFFT_SUCCESS) { printf ("*XtSetGPUs failed\n"); return; }
//
// Initialize FFT input data
    size_t worksize[2];
    cufftComplex *host_data_input, *host_data_output;
    int nx = 512, ny = 512, nz = 512;
    int size_of_data = sizeof(cufftComplex) * nx * ny * nz;
    host_data_input = (cufftComplex *)malloc(size_of_data);
    if (host_data_input == NULL) { printf ("malloc failed\n"); return; }
    host_data_output = (cufftComplex *)malloc(size_of_data);
    if (host_data_output == NULL) { printf ("malloc failed\n"); return; }
  //  initialize_3d_data (nx, ny, nz, host_data_input, host_data_output);
//
// cufftMakePlan3d() - Create the plan
    result = cufftMakePlan3d (plan_input, nz, ny, nx, CUFFT_C2C, worksize);
    if (result != CUFFT_SUCCESS) { printf ("*MakePlan* failed\n"); return; }
//
// cufftXtMalloc() - Malloc data on multiple GPUs
    cudaLibXtDesc *device_data_input;
    result = cufftXtMalloc (plan_input, &device_data_input,
        CUFFT_XT_FORMAT_INPLACE);
    if (result != CUFFT_SUCCESS) { printf ("*XtMalloc failed\n"); return; }
//
// cufftXtMemcpy() - Copy data from host to multiple GPUs
    result = cufftXtMemcpy (plan_input, device_data_input,
        host_data_input, CUFFT_COPY_HOST_TO_DEVICE);
    if (result != CUFFT_SUCCESS) { printf ("*XtMemcpy failed\n"); return; }
//
// cufftXtExecDescriptorC2C() - Execute FFT on multiple GPUs
    result = cufftXtExecDescriptorC2C (plan_input, device_data_input,
        device_data_input, CUFFT_FORWARD);
    if (result != CUFFT_SUCCESS) { printf ("*XtExec* failed\n"); return; }
//
// cufftXtMemcpy() - Copy data from multiple GPUs to host
    result = cufftXtMemcpy (plan_input, host_data_output,
        device_data_input, CUFFT_COPY_DEVICE_TO_HOST);
    if (result != CUFFT_SUCCESS) { printf ("*XtMemcpy failed\n"); return; }
//
// Print output and check results
    int output_return = 0; // output_3d_results (nx, ny, nz, host_data_input, host_data_output);
    if (output_return != 0) { return; }
//
// cufftXtFree() - Free GPU memory
    result = cufftXtFree(device_data_input);
    if (result != CUFFT_SUCCESS) { printf ("*XtFree failed\n"); return; }
//
// cufftDestroy() - Destroy FFT plan
    result = cufftDestroy(plan_input);
    if (result != CUFFT_SUCCESS) { printf ("*Destroy failed: code\n"); return; }
    free(host_data_input); free(host_data_output);
}

int main(){

   test();
}
$ nvcc -o t1503 t1503.cu -lcufft
$ CUDA_VISIBLE_DEVICES="2,3" ./t1503
$

Thank you very much and I am sorry for the mistake that not setting whichGPUs.
I changed my code and try your sample code of the inplace 3D multi-gpu FFT. It is worked. I also try the sample code on the guide manual. It is also worked, but these codes all have the same style is that all in-place transform.
In a real application, I want to do the R2C FFT, so I also try out-of-place 3D C2C FFT firstly.
But it still has some problem:

[As for the hardware information, I am using two GTX1080 GPU which has 12GB memory for each card.]

The code like this :

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufftXt.h>

#define N 722
#define NSUM N*N*N
#define NSUM2 N*N*(N/2+1)

void testC2C_outofplace_FFT()
{

	printf("Test1: out of place of 3D FFT \n");
	int GPU_N;
	cudaGetDeviceCount(&GPU_N);
	int nGPUs = 2;
	int *whichGPUs;
	whichGPUs = (int*) malloc(sizeof(int) * nGPUs);
	cufftComplex *f = (cufftComplex*) malloc(sizeof(cufftComplex) * NSUM);
	cufftComplex *h_d_out = (cufftComplex *) malloc(sizeof(cufftComplex) * NSUM);
	for (int i = 0; i < NSUM; i++) {
		f[i].x = i % 5000 ;
	}
	whichGPUs[0] = 0; whichGPUs[1] = 1;

	cufftResult result;
	cufftHandle planComplex;
	size_t* worksize;
	cudaLibXtDesc *d_f , *d_out;

	worksize = (size_t*) malloc(sizeof(size_t) * nGPUs);
	result = cufftCreate(&planComplex);
	result = cufftXtSetGPUs(planComplex, nGPUs, whichGPUs);
	result = cufftMakePlan3d(planComplex, N, N, N, CUFFT_C2C, worksize);
	printf("Input : CUFFT_XT_FORMAT_INPUT and Output : CUFFT_XT_FORMAT_OUTPUT\n");
	printf("check 1 : %d \n",result);
	result = cufftXtMalloc(planComplex, (cudaLibXtDesc **) &d_f, CUFFT_XT_FORMAT_INPUT);
	//CUFFT_XT_FORMAT_INPUT
	//CUFFT_XT_FORMAT_OUTPUT
	//CUFFT_XT_FORMAT_INPLACE
	//CUFFT_XT_FORMAT_INPLACE_SHUFFLED
	//CUFFT_FORMAT_UNDEFINED
	result = cufftXtMalloc(planComplex, (cudaLibXtDesc **) &d_out, CUFFT_XT_FORMAT_OUTPUT);
	printf("check 2 : %d \n",result);

	result = cufftXtMemcpy(planComplex, d_f, f, CUFFT_COPY_HOST_TO_DEVICE);
	printf("check 3 : %d \n",result);
	result = cufftXtExecDescriptorC2C(planComplex, d_f, d_out,CUFFT_FORWARD);

	printf("check 4 : %d \n",result);

	result = cufftXtMemcpy(planComplex, h_d_out, d_f, CUFFT_COPY_DEVICE_TO_HOST);
	printf("check 5 : %d \n",result);
	result = cufftXtFree(d_f);
	result = cufftXtFree(d_out);
	result = cufftDestroy(planComplex);


	// cleanup memory

	free(h_d_out);
	free(worksize);

}


int main(){
	testC2C_outofplace_FFT();
}
Test1: out of place of 3D FFT 
Input : CUFFT_XT_FORMAT_INPLACE and Output : CUFFT_XT_FORMAT_OUTPUT 
check 1 : 0 
check 2 : 0 
check 3 : 0 
check 4 : 6 
check 5 : 0 


Test1: out of place of 3D FFT 
Input : CUFFT_XT_FORMAT_INPUT and Output : CUFFT_XT_FORMAT_OUTPUT 
check 1 : 0 
check 2 : 0 
check 3 : 5 
check 4 : 5 
check 5 : 5 

Test1: out of place of 3D FFT 
Input : CUFFT_XT_FORMAT_INPUT and Output : CUFFT_XT_FORMAT_INPUT 
check 1 : 0 
check 2 : 0 
check 3 : 5 
check 4 : 5 
check 5 : 5

I try a lot of combinations with the input data and output data format, but almost all have a problem.
I want to ask if I want to do a C2C out-of-place transform, what should I do?
Thank you again!

I don’t have an answer for you right now. My suggestion would be to file a bug using the instructions linked to the sticky post at the top of the CUDA programming sub-forum.

OK, thank you very much and I have submitted the problem as a bug.

Currently it seems that CUFFT requires these multi-GPU transforms to be in-place.

I expect a future documentation update will clarify this.