Complex-to-Complex Convolution using Two GPUs Example

Hello,

I am using the cuFFT documentation to get a convolution working using two GPUs. I can't compile the code below because it seems I am missing an include for initialize_1d_data and output_1d_results.

What do I need to include to use initialize_1d_data and output_1d_results?

``````#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include <cstdlib>
#include <math.h>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>

using namespace std;

// Utility routine to perform complex pointwise multiplication with scaling
// Pointwise complex multiply with scaling: b[i] = (a[i] * b[i]) / size.
// Uses a grid-stride loop, so any <<<grid, block>>> configuration covers all
// 'size' elements; both pointers must reference device-accessible memory.
__global__ void ComplexPointwiseMulAndScale(cufftComplex *a, cufftComplex *b, int size)
{
const int stride = blockDim.x * gridDim.x;
const float norm = 1.0f / (float)size;
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
{
cufftComplex prod = cuCmulf(a[idx], b[idx]);
b[idx] = make_cuFloatComplex(norm * cuCrealf(prod), norm * cuCimagf(prod));
}
}

// ---------------------------------------------------------------------------
// Helper routines referenced by the cuFFT documentation sample but not
// provided by any header or library: the user must supply them. Simple
// implementations are given here so the example compiles and runs.
// ---------------------------------------------------------------------------

// Abort with a diagnostic if a cuFFT call failed.
static void check_cufft(cufftResult r, const char *what)
{
if (r != CUFFT_SUCCESS) {
fprintf(stderr, "cuFFT error %d in %s\n", (int)r, what);
exit(EXIT_FAILURE);
}
}

// Abort with a diagnostic if a CUDA runtime call failed.
static void check_cuda(cudaError_t e, const char *what)
{
if (e != cudaSuccess) {
fprintf(stderr, "CUDA error '%s' in %s\n", cudaGetErrorString(e), what);
exit(EXIT_FAILURE);
}
}

// Fill the advanced-data-layout parameters for a contiguous batched 1-D
// transform and generate deterministic test input. 'output' is zeroed.
static void initialize_1d_data(int nx, int batch, int rank, int *n,
int *inembed, int *istride, int *idist,
int *onembed, int *ostride, int *odist,
cufftComplex *input, cufftComplex *output)
{
(void)rank; // rank is fixed at 1 by the caller
// Contiguous layout: unit stride, one signal of length nx per batch entry.
inembed[0] = nx; *istride = 1; *idist = nx;
onembed[0] = nx; *ostride = 1; *odist = nx;
n[0] = nx;
for (int i = 0; i < nx * batch; ++i) {
input[i].x = sinf((float)i); // arbitrary, reproducible test signal
input[i].y = 0.0f;
}
for (int i = 0; i < nx; ++i) {
output[i].x = 0.0f;
output[i].y = 0.0f;
}
}

// Print a few samples of input and result so they can be inspected.
// Returns 0 (reserved for a pass/fail status if a reference check is added).
static int output_1d_results(int nx, int batch,
cufftComplex *input, cufftComplex *output)
{
(void)batch;
int n_print = (nx < 8) ? nx : 8;
for (int i = 0; i < n_print; ++i) {
printf("in[%d] = (%g, %g)  out[%d] = (%g, %g)\n",
i, input[i].x, input[i].y, i, output[i].x, output[i].y);
}
return 0;
}

int main(void)
{
//
// Demonstrate how to use cuFFT to perform a convolution using 1-D FFTs and
// 2 GPUs. The forward FFTs use both GPUs, while the inverse FFT uses one.
// Every cuFFT/CUDA return code is checked; any failure aborts the program.
//
// cufftCreate() - create empty plan handles.
cufftHandle plan_forward_2_gpus, plan_inverse_1_gpu;
check_cufft(cufftCreate(&plan_forward_2_gpus), "cufftCreate(forward)");
check_cufft(cufftCreate(&plan_inverse_1_gpu), "cufftCreate(inverse)");

// cufftXtSetGPUs() - split the forward plan's work across two GPUs.
int nGPUs = 2, whichGPUs[2];
whichGPUs[0] = 0; whichGPUs[1] = 1;
check_cufft(cufftXtSetGPUs(plan_forward_2_gpus, nGPUs, whichGPUs),
"cufftXtSetGPUs");

// Host buffers: 'batch' signals of nx complex samples in, one signal out.
size_t worksize[2];
int nx = 1048576, batch = 2, rank = 1, n[1];
int inembed[1], istride, idist, onembed[1], ostride, odist;
n[0] = nx;
size_t size_of_one_set = sizeof(cufftComplex) * (size_t)nx;
size_t size_of_data = size_of_one_set * (size_t)batch;
cufftComplex *host_data_input = (cufftComplex *)malloc(size_of_data);
cufftComplex *host_data_output = (cufftComplex *)malloc(size_of_one_set);
if (host_data_input == NULL || host_data_output == NULL) {
fprintf(stderr, "host malloc failed\n");
return EXIT_FAILURE;
}
initialize_1d_data(nx, batch, rank, n, inembed, &istride, &idist,
onembed, &ostride, &odist, host_data_input, host_data_output);

// cufftMakePlanMany(), cufftPlan1d() - create the two plans.
check_cufft(cufftMakePlanMany(plan_forward_2_gpus, rank, n, inembed,
istride, idist, onembed, ostride, odist,
CUFFT_C2C, batch, worksize),
"cufftMakePlanMany");
check_cufft(cufftPlan1d(&plan_inverse_1_gpu, nx, CUFFT_C2C, 1),
"cufftPlan1d");

// cufftXtMalloc() - distributed device buffer; cudaMallocHost() - pinned
// buffer on GPU0's side for the half of the data computed on GPU1.
cudaLibXtDesc *device_data_input;
cufftComplex *GPU0_data_from_GPU1;
check_cufft(cufftXtMalloc(plan_forward_2_gpus, &device_data_input,
CUFFT_XT_FORMAT_INPLACE),
"cufftXtMalloc");
int device0 = device_data_input->descriptor->GPUs[0];
check_cuda(cudaSetDevice(device0), "cudaSetDevice(device0)");
check_cuda(cudaMallocHost((void **)&GPU0_data_from_GPU1, size_of_one_set),
"cudaMallocHost");

// cufftXtMemcpy() - scatter the host input across both GPUs.
check_cufft(cufftXtMemcpy(plan_forward_2_gpus, device_data_input,
host_data_input, CUFFT_COPY_HOST_TO_DEVICE),
"cufftXtMemcpy H2D");

// cufftXtExecDescriptorC2C() - forward FFTs on both GPUs, in place.
check_cufft(cufftXtExecDescriptorC2C(plan_forward_2_gpus, device_data_input,
device_data_input, CUFFT_FORWARD),
"cufftXtExecDescriptorC2C");

// Bring GPU1's result over to the buffer reachable from GPU0.
cufftComplex *device_data_on_GPU1 =
(cufftComplex *)(device_data_input->descriptor->data[1]);
check_cuda(cudaMemcpy(GPU0_data_from_GPU1, device_data_on_GPU1,
size_of_one_set, cudaMemcpyDeviceToDevice),
"cudaMemcpy GPU1->GPU0");

// Multiply the two spectra pointwise and scale by 1/nx on GPU0.
cufftComplex *device_data_on_GPU0 =
(cufftComplex *)(device_data_input->descriptor->data[0]);
check_cuda(cudaSetDevice(device0), "cudaSetDevice(device0)");
ComplexPointwiseMulAndScale<<<32, 256>>>(device_data_on_GPU0,
GPU0_data_from_GPU1, nx);
check_cuda(cudaGetLastError(), "kernel launch");

// cufftExecC2C() - inverse FFT of the product on one GPU, in place.
check_cufft(cufftExecC2C(plan_inverse_1_gpu, GPU0_data_from_GPU1,
GPU0_data_from_GPU1, CUFFT_INVERSE),
"cufftExecC2C");

// Copy the convolution result back to the host (blocking; synchronizes).
check_cuda(cudaMemcpy(host_data_output, GPU0_data_from_GPU1,
size_of_one_set, cudaMemcpyDeviceToHost),
"cudaMemcpy D2H");

// Print a few output samples for inspection.
int output_return = output_1d_results(nx, batch,
host_data_input, host_data_output);
(void)output_return;

// cufftDestroy(), cufftXtFree(), cudaFreeHost(), free() - release resources.
check_cufft(cufftDestroy(plan_forward_2_gpus), "cufftDestroy(forward)");
check_cufft(cufftDestroy(plan_inverse_1_gpu), "cufftDestroy(inverse)");
check_cufft(cufftXtFree(device_data_input), "cufftXtFree");
check_cuda(cudaFreeHost(GPU0_data_from_GPU1), "cudaFreeHost");
free(host_data_input);
free(host_data_output);

return 0;
}
``````

These aren’t standard routines — they are not provided by any CUDA header or library. You need to write those routines yourself.

If you want a complete CUFFT Multi-GPU code, there are the following cufft sample codes available:

simpleCUFFT_MGPU - Simple CUFFT_MGPU
http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-cufft_mgpu

simpleCUFFT_2d_MGPU - SimpleCUFFT_2d_MGPU
http://docs.nvidia.com/cuda/cuda-samples/index.html#simplecufft_2d_mgpu