I’m having some problems with cuFFT, specifically in performing an FFT, modifying the result while it’s still in GPU memory, then performing an IFFT on the modified data.
I’m attempting to do the modification using a kernel which is executed in the same stream as the two fourier transforms. If I attempt to modify the data in place, the IFFT appears to ignore the modified data and use the original unmodified data despite it no longer existing. If the modified data is placed into a new array, the IFFT returns zeroes.
This example shows the scenario where the modified data is placed into a new array, the pointer for which is passed to the IFFT.
#include <complex>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cuda_runtime.h>
#include <cufft.h>
#define cuFFTFORWARD -1
#define cuFFTINVERSE 1
#define pi 3.141593
// cufft API error chekcing
#ifndef CUFFT_CALL
#define CUFFT_CALL( call ) \
{ \
auto status = static_cast<cufftResult>( call ); \
if ( status != CUFFT_SUCCESS ) \
fprintf( stderr, \
"ERROR: CUFFT call \"%s\" in line %d of file %s failed " \
"with " \
"code (%d).\n", \
#call, \
__LINE__, \
__FILE__, \
status ); \
}
#endif
__global__ void kernelHilbert(cufftComplex* fft_out, cufftComplex* ifft_in)
{
int i = threadIdx.x;
cufftComplex temp = fft_out[i];
float arg = atan2(temp.y, temp.x);
float mag = sqrt((temp.x * temp.x) + (temp.y * temp.y));
if (arg < 0)
{
arg += (-1 * pi / 4);
}
else if (arg > 0)
{
arg += (pi / 4);
}
float x = mag * cos(arg);
float y = mag * sin(arg);
ifft_in[i].x = x;
ifft_in[i].y = y;
}
int main()
{
cudaError_t cudaStatus;
cufftHandle plan_FFT;
cufftHandle plan_IFFT;
cudaStream_t stream = NULL;
int batch_size = 1;
int fft_length = 16;
using complex_type = std::complex<float>;
std::vector<complex_type> fft_in(fft_length);
std::vector<complex_type> fft_out(fft_length);
// input to ifft is fft_out
std::vector<complex_type> ifft_in(fft_length);
std::vector<complex_type> ifft_out(fft_length);
cufftComplex* d_fft_in = nullptr;
cufftComplex* d_fft_out = nullptr;
cufftComplex* d_ifft_in = nullptr;
cufftComplex* d_ifft_out = nullptr;
for (int i = 0; i < fft_length; i++)
{
fft_in[i] = static_cast<complex_type>(i);
}
std::printf("Input array:\n");
for (auto& i : fft_in) {
std::printf("%f + %fj\n", i.real(), i.imag());
}
std::printf("=====\n");
// Creation of stream
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// Creation of plan for FFT
cufftCreate(&plan_FFT);
cufftPlan1d(&plan_FFT, fft_in.size(), CUFFT_C2C, batch_size);
// Creation of plan for IFFT
cufftCreate(&plan_IFFT);
cufftPlan1d(&plan_IFFT, fft_out.size(), CUFFT_C2C, batch_size);
// Allocation of memory for FFT
cudaMalloc((void**)(&d_fft_in), sizeof(complex_type) * fft_in.size());
cudaMalloc((void**)(&d_fft_out), sizeof(complex_type) * fft_out.size());
// Allocation of memory for IFFT
cudaMalloc((void**)(&d_ifft_in), sizeof(complex_type) * ifft_in.size());
cudaMalloc((void**)(&d_ifft_out), sizeof(complex_type) * ifft_out.size());
// Copy input array to device
cudaMemcpyAsync(d_fft_in, fft_in.data(), sizeof(complex_type) * fft_in.size(), cudaMemcpyHostToDevice, stream);
// Perform FFT
CUFFT_CALL(cufftExecC2C(plan_FFT, d_fft_in, d_fft_out, cuFFTFORWARD));
// Setup and start kernel for modifying the FMC output
dim3 threads(16);
dim3 blocks(1);
kernelHilbert <<<blocks, threads, 0, stream>>> (d_fft_out, d_ifft_in);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
return 0;
}
// Perform IFFT
CUFFT_CALL(cufftExecC2C(plan_IFFT, d_ifft_in, d_ifft_out, cuFFTINVERSE));
// Copy out the results
cudaMemcpyAsync(fft_out.data(), d_fft_out, sizeof(complex_type) * fft_in.size(), cudaMemcpyDeviceToHost, stream); // copy out the fft result
cudaMemcpyAsync(ifft_in.data(), d_ifft_in, sizeof(complex_type) * fft_in.size(), cudaMemcpyDeviceToHost, stream); // copy out the modified fft result
cudaMemcpyAsync(ifft_out.data(), d_ifft_out, sizeof(complex_type) * fft_in.size(), cudaMemcpyDeviceToHost, stream); // copy out the ifft result
cudaStreamSynchronize(stream);
// Normalise the result of the IFFT
std::transform(ifft_out.begin(), ifft_out.end(), ifft_out.begin(), [](complex_type a) {float arg = std::arg(a); float mag = std::abs(a) * ((float)1 / 16); return std::polar(mag, arg); });
// Display results
std::printf("FFT array:\n");
for (auto& i : fft_out) {
std::printf("%f + %fj\n", i.real(), i.imag());
}
std::printf("=====\n");
std::printf("Modified FFT array:\n");
for (auto& i : ifft_in) {
std::printf("%f + %fj\n", i.real(), i.imag());
}
std::printf("=====\n");
std::printf("IFFT array:\n");
for (auto& i : ifft_out) {
std::printf("%f + %fj\n", i.real(), i.imag());
}
std::printf("=====\n");
// Tidy up
cudaFree(d_fft_in);
cudaFree(d_fft_out);
cudaFree(d_ifft_out);
cufftDestroy(plan_FFT);
cufftDestroy(plan_IFFT);
cudaStreamDestroy(stream);
}
I’m unsure why this is happening, so any insight would be very helpful.