I’m trying to write a simple code using cufft library. After the inverse transformam aren’t same. Someone can help me to understand why this is happening?? I’m using Visual Studio
My code
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex *, const Complex *, int, float);
// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);
// Padding functions
int PadData(const Complex *, Complex **, int,
const Complex *, Complex **, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 16
#define FILTER_KERNEL_SIZE 11
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
runTest(argc, argv);
system("Pause");
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
printf("[simpleCUFFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE,
cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE,
cudaMemcpyDeviceToHost));
// check result
bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
}
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
My result
[simpleCUFFT] is starting...
GPU Device 0: "GeForce GTX 570" with compute capability 2.0
Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
ERROR, l2-norm error 1 is greater than epsilon 1e-005
first : 0.001251 0.000000 after 0.000000 0.000000
first : 0.563585 0.000000 after 8388608.000000 16895.835938
first : 0.193304 0.000000 after -26975130.000000 -26975130.000000
first : 0.808740 0.000000 after -26975130.000000 -26975130.000000
first : 0.585009 0.000000 after -26975130.000000 -26975130.000000
first : 0.479873 0.000000 after -26975130.000000 -26975130.000000
first : 0.350291 0.000000 after -26975130.000000 -26975130.000000
first : 0.895962 0.000000 after -26975130.000000 -26975130.000000
first : 0.822840 0.000000 after -26975130.000000 -26975130.000000
first : 0.746605 0.000000 after -26975130.000000 -26975130.000000
first : 0.174108 0.000000 after -26975130.000000 -26975130.000000
first : 0.858943 0.000000 after -26975130.000000 -26975130.000000
first : 0.710501 0.000000 after -26975130.000000 -26975130.000000
first : 0.513535 0.000000 after -26975130.000000 -26975130.000000
first : 0.303995 0.000000 after -26975130.000000 -26975130.000000
first : 0.014985 0.000000 after -26975130.000000 -26975130.000000