I am implementing some signal handling functions and many of them are FFT-related.
One I am having trouble with is the Hilbert Transform, which I implemented after Matlab/Octave hilbert (sort of).
The problem is that, since I don’t know how cuFFT stores the positive/negative frequencies, it is possible that my function is zeroing the wrong elements.
I had a look at the documentation and specially this thread, https://devtalk.nvidia.com/default/topic/793692/gpu-accelerated-libraries/how-cufft-complex-data-is-stored-/, and Njuffa’s explanation here, https://stackoverflow.com/questions/13535182/copying-data-to-cufftcomplex-data-struct
However, his answer addresses the raw copy of data. My case is: where in the array are the negative frequencies?
Below is a complete, fully working program that demonstrates my line of thought. An input array is filled with sine, then the fwd cuFFT is computed, then the “Hilbert” func is run on the complex, then inv FFT is computed and the output is scaled. Both input and output arrays are printed to disk so they can be plotted:
#include <iostream> // Standard I/O
#include <fstream> // For file writing
#include <cufft.h>
#ifndef __CUDA_CALL
#define __CUDA_CALL(call) do {\
cudaError_t cuda_error = call;\
if(cuda_error != cudaSuccess) {\
std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl;\
return -1;}\
} while(0)
#endif
using namespace std;
// Helper function to write input and output data
__host__ void Write_File(float *output, const int LENGTH, const char *file_name)
{
ofstream out_file(file_name);
for(int i = 0; i < LENGTH; i++)
out_file << output[i] << endl;
out_file.close();
}
__device__ const float PI = 3.141592654f;
// Fills the array in GPU
__global__ void Populate(float *input, const int LENGTH)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x,
offset = gridDim.x * blockDim.x;
for(int global_idx = tid; global_idx < LENGTH; global_idx += offset)
input[global_idx] = __sinf(global_idx * 2 * PI * 0.01);
}
// Hilbert Transform programmed to resemble that of Matlab/Octave
__global__ void Hilbert(cufftComplex *dev_complex, const int COMPLEX_LENGTH, const int LIMIT_1, const int LIMIT_2)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x,
offset = gridDim.x * blockDim.x;
if((tid > 0) && (tid < LIMIT_1))
for(int global_idx = tid; global_idx < LIMIT_1; global_idx += offset)
{
dev_complex[global_idx].x *= 2;
dev_complex[global_idx].y *= 2;
}
else
for(int global_idx = LIMIT_2; global_idx < COMPLEX_LENGTH; global_idx += offset)
{
dev_complex[global_idx].x = 0;
dev_complex[global_idx].y = 0;
}
}
int main(void)
{
const int LENGTH = 1024,
G_SIZE = 200,
B_SIZE = 256,
CPX_LEN = LENGTH / 2 + 1, // Defines the complex array length according to cuFFT documentation
LIMIT_1 = CPX_LEN % 2 == 0 ? CPX_LEN / 2 : (CPX_LEN + 1) / 2, // Determines the loop limits according to
LIMIT_2 = CPX_LEN % 2 == 0 ? LIMIT_1 + 1 : LIMIT_1; // the complex length being even or odd
float *input,
*output;
cufftComplex *complex;
cufftHandle fwd_plan,
inv_plan;
cufftPlan1d(&fwd_plan, LENGTH, CUFFT_R2C, 1); // Creates forward and inverse plans
cufftPlan1d(&inv_plan, LENGTH, CUFFT_C2R, 1);
// Allocates and initializes the managed arrays
__CUDA_CALL(cudaMallocManaged(&input, LENGTH * sizeof(float)));
__CUDA_CALL(cudaMallocManaged(&output, LENGTH * sizeof(float)));
__CUDA_CALL(cudaMallocManaged(&complex, (CPX_LEN) * sizeof(cufftComplex)));
__CUDA_CALL(cudaMemset(input, 0, LENGTH * sizeof(float)));
__CUDA_CALL(cudaMemset(output, 0, LENGTH * sizeof(float)));
__CUDA_CALL(cudaMemset(complex, 0, (CPX_LEN) * sizeof(cufftComplex)));
Populate <<< G_SIZE, B_SIZE >>> (input, LENGTH);
__CUDA_CALL(cudaDeviceSynchronize());
cufftExecR2C(fwd_plan, (cufftReal *)input, complex); // Forward FFT
__CUDA_CALL(cudaDeviceSynchronize());
Hilbert <<< G_SIZE, B_SIZE >>> (complex, CPX_LEN, LIMIT_1, LIMIT_2);
__CUDA_CALL(cudaDeviceSynchronize());
cufftExecC2R(inv_plan, complex, (cufftReal *)output); // Inverse FFT
__CUDA_CALL(cudaDeviceSynchronize());
for(int i = 0; i < LENGTH; i++) // Scales the iFFT
output[i] = output[i] / LENGTH;
Write_File(input, LENGTH, "Input.txt");
Write_File(output, LENGTH, "Output.txt");
__CUDA_CALL(cudaFree(input));
__CUDA_CALL(cudaFree(output));
__CUDA_CALL(cudaFree(complex));
return 0;
}
You can copy/paste this code and compile with -lcufft. When you run, it will output 2 text files, which you can then inspect in Matlab/Octave:
load Input.txt;
load Output.txt;
plot(Input)
hold on;
plot(Output, "color", "red")
You will notice that the output (red line) isn’t really applying any phase shift, it just has a different amplitude, which makes me believe that cuFFT stores the elements in another way, so the function operates on the wrong elements.
Any light on it?