Hello @Robert_Crovella,
This is the full code example:
#include <cuda_runtime.h>
#include <complex>
#include <cufft.h>
#include <vector>
void fft(float* d_input, int inputSize, cufftComplex** d_output, int& outputSize, cudaStream_t stream)
{
constexpr int batchSize = 1;
outputSize = inputSize / 2 + 1;
cudaMalloc(reinterpret_cast<void**>(d_output), sizeof(std::complex<float>) * outputSize);
cufftHandle plan;
cufftCreate(&plan);
cufftPlan1d(&plan, inputSize, CUFFT_R2C, batchSize);
cufftSetStream(plan, stream);
cufftExecR2C(plan, d_input, *d_output);
cufftDestroy(plan);
}
void ifft(cufftComplex* d_input, int inputSize, float** d_output, int& outputSize, cudaStream_t stream)
{
constexpr int batchSize = 1;
outputSize = (inputSize - 1) * 2;
cudaMalloc(reinterpret_cast<void**>(d_output), sizeof(float) * outputSize);
cufftHandle plan;
cufftCreate(&plan);
cufftPlan1d(&plan, inputSize, CUFFT_C2R, batchSize);
cufftSetStream(plan, stream);
cufftExecC2R(plan, d_input, *d_output);
cufftDestroy(plan);
}
int main() {
cudaStream_t stream = nullptr;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
constexpr int input_length = 5;
float h_input[input_length]{ 1, 2, 3, 4, 5 };
std::printf("Input array:\n");
for (auto& i : h_input) {
std::printf("%f\n", i);
}
std::printf("=====\n");
float* d_input = nullptr;
auto error = cudaMalloc(reinterpret_cast<void**>(&d_input), sizeof(float) * input_length);
error = cudaMemcpyAsync(d_input, h_input, sizeof(float) * input_length, cudaMemcpyHostToDevice, stream);
cufftComplex* d_fft_output = nullptr;
int fft_output_size = 0;
// FFT
fft(d_input, input_length, &d_fft_output, fft_output_size, stream);
std::vector<cufftComplex> h_fft_output(fft_output_size);
cudaMemcpyAsync(h_fft_output.data(), d_fft_output, sizeof(cufftComplex) * fft_output_size,
cudaMemcpyDeviceToHost, stream);
std::printf("FFT output:\n");
for (auto& i : h_fft_output) {
std::printf("%f + %fj\n", i.x, i.y);
}
std::printf("=====\n");
float* d_ifft_output = nullptr;
int ifft_output_size = 0;
//IFFT
ifft(d_fft_output, fft_output_size, &d_ifft_output, ifft_output_size, stream);
float h_ifft[input_length];
cudaMemcpyAsync(h_ifft, d_ifft_output, sizeof(float) * ifft_output_size,
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
std::printf("IFFT output:\n");
for (auto& i : h_ifft) {
std::printf("%f\n", i);
}
std::printf("=====\n");
cudaFree(d_input);
cudaFree(d_fft_output);
cudaFree(d_ifft_output);
cudaStreamDestroy(stream);
cudaDeviceReset();
return EXIT_SUCCESS;
}
It produces the following output:
Input array:
1.000000
2.000000
3.000000
4.000000
5.000000
=====
FFT output:
15.000000 + 0.000000j
-2.500000 + 3.440955j
-2.500000 + 0.812299j
=====
IFFT output:
10.000000
11.540092
23.459909
0.000000
-107374176.000000
=====
At the same time, in Matlab, I get the following results:
X = [1 2 3 4 5];
Y = fft(X)
Y = 1×5 complex
15.0000 + 0.0000i -2.5000 + 3.4410i -2.5000 + 0.8123i -2.5000 - 0.8123i -2.5000 - 3.4410i
ifft(Y)
ans = 1×5
1 2 3 4 5
FFT real to complex output transform is correct (array of non-redundant complex elements).
I also tried to inverse transform the full (restored) complex number array (with redundant numbers), i.e. vector<cufftComplex> { {15.f, 0.f}, {-2.5f, 3.441f}, {-2.5f, 0.8123f}, {-2.5f, -0.8123f}, {-2.5f, -3.441f} }
(also changed the outputSize
to be equal to inputSize
inside the ifft()
function, and have got [5,10,15,20,25]
. It appears, from your answer, that now I need to divide this result by the size of the transform :).
But would it be possible somehow to get the correct inverse transform data from an array of non-redundant complex elements without redundant data restoration? Is there some built-in functionality in cuFFT? Or if there isn’t, it appears that according to your answer on StackOverflow it would be better for me to use C2C in fft()
.