Performing an FFT, modifying the result, then performing an IFFT

I’m having some problems with cuFFT, specifically in performing an FFT, modifying the result while it’s still in GPU memory, then performing an IFFT on the modified data.

I’m attempting to do the modification using a kernel which is executed in the same stream as the two fourier transforms. If I attempt to modify the data in place, the IFFT appears to ignore the modified data and use the original unmodified data despite it no longer existing. If the modified data is placed into a new array, the IFFT returns zeroes.

This example shows the scenario where the modified data is placed into a new array, the pointer for which is passed to the IFFT.

#include <complex>
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>

#include <cuda_runtime.h>
#include <cufft.h>


#define cuFFTFORWARD -1
#define cuFFTINVERSE 1

#define pi 3.141593

// cufft API error chekcing
#ifndef CUFFT_CALL
#define CUFFT_CALL( call )                                                                                             \
    {                                                                                                                  \
        auto status = static_cast<cufftResult>( call );                                                                \
        if ( status != CUFFT_SUCCESS )                                                                                 \
            fprintf( stderr,                                                                                           \
                     "ERROR: CUFFT call \"%s\" in line %d of file %s failed "                                          \
                     "with "                                                                                           \
                     "code (%d).\n",                                                                                   \
                     #call,                                                                                            \
                     __LINE__,                                                                                         \
                     __FILE__,                                                                                         \
                     status );                                                                                         \
    }
#endif 

__global__ void kernelHilbert(cufftComplex* fft_out, cufftComplex* ifft_in)
{
	int i = threadIdx.x;

	cufftComplex temp = fft_out[i];
	float arg = atan2(temp.y, temp.x);
	float mag = sqrt((temp.x * temp.x) + (temp.y * temp.y));

	if (arg < 0)
	{
		arg += (-1 * pi / 4);
	}
	else if (arg > 0)
	{
		arg += (pi / 4);
	}
		
	float x = mag * cos(arg);
	float y = mag * sin(arg);

	ifft_in[i].x = x;
	ifft_in[i].y = y;
}

int main()
{
	cudaError_t cudaStatus;

	cufftHandle plan_FFT;
	cufftHandle plan_IFFT;
	cudaStream_t stream = NULL;

	int batch_size = 1;

	int fft_length = 16;

	using complex_type = std::complex<float>;

	std::vector<complex_type> fft_in(fft_length);
	std::vector<complex_type> fft_out(fft_length);

	// input to ifft is fft_out
	std::vector<complex_type> ifft_in(fft_length);
	std::vector<complex_type> ifft_out(fft_length);

	cufftComplex* d_fft_in = nullptr;
	cufftComplex* d_fft_out = nullptr;

	cufftComplex* d_ifft_in = nullptr;
	cufftComplex* d_ifft_out = nullptr;


	for (int i = 0; i < fft_length; i++)
	{
		fft_in[i] = static_cast<complex_type>(i);
	}

	std::printf("Input array:\n");
	for (auto& i : fft_in) {
		std::printf("%f + %fj\n", i.real(), i.imag());
	}
	std::printf("=====\n");

	// Creation of stream
	cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);


	// Creation of plan for FFT
	cufftCreate(&plan_FFT);
	cufftPlan1d(&plan_FFT, fft_in.size(), CUFFT_C2C, batch_size);

	// Creation of plan for IFFT
	cufftCreate(&plan_IFFT);
	cufftPlan1d(&plan_IFFT, fft_out.size(), CUFFT_C2C, batch_size);


	// Allocation of memory for FFT
	cudaMalloc((void**)(&d_fft_in), sizeof(complex_type) * fft_in.size());
	cudaMalloc((void**)(&d_fft_out), sizeof(complex_type) * fft_out.size());

	// Allocation of memory for IFFT
	cudaMalloc((void**)(&d_ifft_in), sizeof(complex_type) * ifft_in.size());
	cudaMalloc((void**)(&d_ifft_out), sizeof(complex_type) * ifft_out.size());


	// Copy input array to device
	cudaMemcpyAsync(d_fft_in, fft_in.data(), sizeof(complex_type) * fft_in.size(), cudaMemcpyHostToDevice, stream);


	// Perform FFT
	CUFFT_CALL(cufftExecC2C(plan_FFT, d_fft_in, d_fft_out, cuFFTFORWARD)); 

	// Setup and start kernel for modifying the FMC output
	dim3 threads(16);
	dim3 blocks(1);

	kernelHilbert <<<blocks, threads, 0, stream>>> (d_fft_out, d_ifft_in);
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess)
	{
		return 0;
	}

	// Perform IFFT
	CUFFT_CALL(cufftExecC2C(plan_IFFT, d_ifft_in, d_ifft_out, cuFFTINVERSE));

	
	// Copy out the results
	cudaMemcpyAsync(fft_out.data(), d_fft_out, sizeof(complex_type) * fft_in.size(), cudaMemcpyDeviceToHost, stream); // copy out the fft result
	cudaMemcpyAsync(ifft_in.data(), d_ifft_in, sizeof(complex_type) * fft_in.size(), cudaMemcpyDeviceToHost, stream); // copy out the modified fft result
	cudaMemcpyAsync(ifft_out.data(), d_ifft_out, sizeof(complex_type) * fft_in.size(), cudaMemcpyDeviceToHost, stream); // copy out the ifft result

	cudaStreamSynchronize(stream);

	// Normalise the result of the IFFT
	std::transform(ifft_out.begin(), ifft_out.end(), ifft_out.begin(), [](complex_type a) {float arg = std::arg(a); float mag = std::abs(a) * ((float)1 / 16); return std::polar(mag, arg); });

	// Display results
	std::printf("FFT array:\n");
	for (auto& i : fft_out) {
		std::printf("%f + %fj\n", i.real(), i.imag());
	}
	std::printf("=====\n");
	std::printf("Modified FFT array:\n");
	for (auto& i : ifft_in) {
		std::printf("%f + %fj\n", i.real(), i.imag());
	}
	std::printf("=====\n");

	std::printf("IFFT array:\n");
	for (auto& i : ifft_out) {
		std::printf("%f + %fj\n", i.real(), i.imag());
	}
	std::printf("=====\n");

	// Tidy up
	cudaFree(d_fft_in);
	cudaFree(d_fft_out);
	cudaFree(d_ifft_out);

	cufftDestroy(plan_FFT);
	cufftDestroy(plan_IFFT);

	cudaStreamDestroy(stream);
}

I’m unsure why this is happening, so any insight would be very helpful.

I don’t see any evidence of that in your code. Why did you choose the non-blocking flag? That is not the same stream that your CUFFT calls are using.

Why not start by just launching everything into the ordinary, unmodifed, null stream.

Thankyou very much for pointing out the issue with streams - I completely missed the cufftSetStream function. I’ve removed all the references to streams in the code and it now functions correctly.

I’ll confess I’m not very familiar with how to use streams properly and I haven’t been able to find much reference to them in the CUDA programming guide, would you happen to know of any good resources for learning how to use them?

streams are covered in some detail in the CUDA programming guide here (in fact you may want to back up and start at section 3.2.8, Asynchronous Concurrent Execution). I also generally point folks to this online training series, the section 7 there (“Concurrency”) has some coverage of stream usage. You can also find cuda sample codes that use and demonstrate stream usage, such as simpleStreams.

Thank you for the recommendations!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.