Complex-to-Complex Convolution using Two GPUs Example


I am using the cuFFT documentation to get a convolution working using two GPUs. I can't compile the code below because it seems I am missing an include for initialize_1d_data and output_1d_results.

What do I need to include to use initialize_1d_data and output_1d_results?

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include <cstdlib>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

using namespace std;

// Utility routine to perform complex pointwise multiplication with scaling.
//
// For every index i in [0, size) this computes
//     b[i] = (a[i] * b[i]) * (1 / size)
// and stores the result back into b (a is only read).
//
// Parameters:
//   a    - first operand array, at least `size` elements
//   b    - second operand array, at least `size` elements; overwritten with the result
//   size - number of complex elements to process; also the divisor of the scale factor
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so every element is
// processed regardless of how many blocks/threads are launched.
__global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size)
{
    const int numThreads = blockDim.x * gridDim.x;
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    const float scale = 1.0f / (float)size;

    for (int i = threadID; i < size; i += numThreads)
    {
        // Both statements belong to the loop body: multiply, then scale and store.
        const cufftComplex c = cuCmulf(a[i], b[i]);
        b[i] = make_cuFloatComplex(scale * cuCrealf(c), scale * cuCimagf(c));
    }
}

// Report a failed cuFFT call on stderr. Returns true when `status` is an error.
static bool cufftCallFailed(cufftResult status, const char *what)
{
    if (status == CUFFT_SUCCESS)
        return false;
    fprintf(stderr, "cuFFT error %d in %s\n", (int)status, what);
    return true;
}

// Report a failed CUDA runtime call on stderr. Returns true when `status` is an error.
static bool cudaCallFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Demonstrate how to use CUFFT to perform a convolution using 1-d FFTs and
// 2 GPUs. The forward FFTs use both GPUs, while the inverse FFT uses one.
//
// initialize_1d_data() and output_1d_results() are NOT part of CUDA or cuFFT;
// they must be written by the user and declared before main().
//
// Returns 0 on success and 1 as soon as any allocation, CUDA or cuFFT call
// fails. On an early failure the process exits without releasing resources.
int main(void)
{
    // cufftCreate() - Create empty plans
    cufftHandle plan_forward_2_gpus, plan_inverse_1_gpu;
    if (cufftCallFailed(cufftCreate(&plan_forward_2_gpus), "cufftCreate (forward plan)"))
        return 1;
    if (cufftCallFailed(cufftCreate(&plan_inverse_1_gpu), "cufftCreate (inverse plan)"))
        return 1;

    // cufftXtSetGPUs() - Define which GPUs to use
    int nGPUs = 2, whichGPUs[2];
    whichGPUs[0] = 0; whichGPUs[1] = 1;
    if (cufftCallFailed(cufftXtSetGPUs(plan_forward_2_gpus, nGPUs, whichGPUs), "cufftXtSetGPUs"))
        return 1;

    // Initialize FFT input data
    size_t worksize[2];
    size_t inverse_worksize;
    cufftComplex *host_data_input, *host_data_output;
    int nx = 1048576, batch = 2, rank = 1, n[1];
    int inembed[1], istride, idist, onembed[1], ostride, odist;
    n[0] = nx;
    // Byte counts are kept in size_t so larger nx/batch values cannot overflow int.
    const size_t size_of_one_set = sizeof(cufftComplex) * (size_t)nx;
    const size_t size_of_data = size_of_one_set * (size_t)batch;
    host_data_input = (cufftComplex*)malloc(size_of_data);
    host_data_output = (cufftComplex*)malloc(size_of_one_set);
    if (host_data_input == NULL || host_data_output == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(host_data_input);
        free(host_data_output);
        return 1;
    }
    // User-supplied routine; presumably fills the layout parameters and the
    // host input buffer - confirm against your own implementation.
    initialize_1d_data (nx, batch, rank, n, inembed, &istride, &idist,
        onembed, &ostride, &odist, host_data_input, host_data_output);

    // cufftMakePlanMany(), cufftMakePlan1d() - Create the plans
    if (cufftCallFailed(cufftMakePlanMany (plan_forward_2_gpus, rank, n, inembed,
            istride, idist, onembed, ostride, odist, CUFFT_C2C, batch, worksize),
            "cufftMakePlanMany"))
        return 1;
    // The inverse handle already exists (cufftCreate above), so it is configured
    // in place instead of calling cufftPlan1d, which would allocate a second
    // handle and leak the first one.
    if (cufftCallFailed(cufftMakePlan1d (plan_inverse_1_gpu, nx, CUFFT_C2C, 1,
            &inverse_worksize), "cufftMakePlan1d"))
        return 1;

    // cufftXtMalloc(), cudaMallocHost - Allocate data for GPUs
    cudaLibXtDesc *device_data_input; cufftComplex *GPU0_data_from_GPU1;
    if (cufftCallFailed(cufftXtMalloc (plan_forward_2_gpus, &device_data_input,
            CUFFT_XT_FORMAT_INPLACE), "cufftXtMalloc"))
        return 1;
    int device0 = device_data_input->descriptor->GPUs[0];
    if (cudaCallFailed(cudaSetDevice(device0), "cudaSetDevice"))
        return 1;
    // NOTE(review): this buffer comes from cudaMallocHost, yet the copies below
    // use device-to-device / device-to-host kinds and it is passed to a kernel
    // and to cufftExecC2C. This mirrors the original listing - confirm it
    // behaves as intended on the target platform.
    if (cudaCallFailed(cudaMallocHost ((void**)&GPU0_data_from_GPU1, size_of_one_set),
            "cudaMallocHost"))
        return 1;

    // cufftXtMemcpy() - Copy data from host to multiple GPUs
    if (cufftCallFailed(cufftXtMemcpy (plan_forward_2_gpus, device_data_input,
            host_data_input, CUFFT_COPY_HOST_TO_DEVICE), "cufftXtMemcpy"))
        return 1;

    // cufftXtExecDescriptorC2C() - Execute forward FFTs on multiple GPUs
    if (cufftCallFailed(cufftXtExecDescriptorC2C (plan_forward_2_gpus, device_data_input,
            device_data_input, CUFFT_FORWARD), "cufftXtExecDescriptorC2C"))
        return 1;

    // cudaMemcpy result from GPU1 to GPU0
    cufftComplex *device_data_on_GPU1;
    device_data_on_GPU1 = (cufftComplex*)
        (device_data_input->descriptor->data[1]);
    if (cudaCallFailed(cudaMemcpy (GPU0_data_from_GPU1, device_data_on_GPU1,
            size_of_one_set, cudaMemcpyDeviceToDevice), "cudaMemcpy (GPU1 -> GPU0)"))
        return 1;

    // Multiply results and scale output
    cufftComplex *device_data_on_GPU0;
    device_data_on_GPU0 = (cufftComplex*)
        (device_data_input->descriptor->data[0]);
    if (cudaCallFailed(cudaSetDevice(device0), "cudaSetDevice"))
        return 1;
    ComplexPointwiseMulAndScale<<<32, 256>>>((cufftComplex*)device_data_on_GPU0,
        (cufftComplex*) GPU0_data_from_GPU1, nx);
    // A kernel launch returns no status itself; launch errors are read back here.
    if (cudaCallFailed(cudaGetLastError(), "ComplexPointwiseMulAndScale launch"))
        return 1;

    // cufftExecC2C() - Execute inverse FFT on one GPU
    if (cufftCallFailed(cufftExecC2C (plan_inverse_1_gpu, GPU0_data_from_GPU1,
            GPU0_data_from_GPU1, CUFFT_INVERSE), "cufftExecC2C"))
        return 1;

    // cudaMemcpy() - Copy results from GPU0 to host
    if (cudaCallFailed(cudaMemcpy(host_data_output, GPU0_data_from_GPU1,
            size_of_one_set, cudaMemcpyDeviceToHost), "cudaMemcpy (GPU0 -> host)"))
        return 1;

    // Print output and check results (user-supplied routine; the meaning of its
    // return value is defined by that routine, so it is not interpreted here).
    int output_return = output_1d_results (nx, batch,
        host_data_input, host_data_output);
    (void)output_return;

    // cufftDestroy() - Destroy FFT plans
    bool cleanup_failed = false;
    cleanup_failed |= cufftCallFailed(cufftDestroy(plan_forward_2_gpus), "cufftDestroy (forward plan)");
    cleanup_failed |= cufftCallFailed(cufftDestroy(plan_inverse_1_gpu), "cufftDestroy (inverse plan)");

    // cufftXtFree(), cudaFreeHost(), free() - Free GPU and host memory
    cleanup_failed |= cufftCallFailed(cufftXtFree(device_data_input), "cufftXtFree");
    cleanup_failed |= cudaCallFailed(cudaFreeHost (GPU0_data_from_GPU1), "cudaFreeHost");
    free(host_data_input); free(host_data_output);

    return cleanup_failed ? 1 : 0;
}

These aren’t standard routines - they are not provided for you anywhere. You need to write those routines yourself.

If you want a complete cuFFT multi-GPU code, the following cuFFT sample code is available:


simpleCUFFT_2d_MGPU - SimpleCUFFT_2d_MGPU