Cufft 1D transform

I’m trying to write a simple code using cufft library. After the inverse transformam aren’t same. Someone can help me to understand why this is happening?? I’m using Visual Studio
My code

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>

// Complex data type
typedef float2 Complex;
static __device__ __host__ inline Complex ComplexAdd(Complex, Complex);
static __device__ __host__ inline Complex ComplexScale(Complex, float);
static __device__ __host__ inline Complex ComplexMul(Complex, Complex);
static __global__ void ComplexPointwiseMulAndScale(Complex *, const Complex *, int, float);

// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);

// Padding functions
int PadData(const Complex *, Complex **, int,
            const Complex *, Complex **, int);

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE        16
#define FILTER_KERNEL_SIZE 11
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    runTest(argc, argv);
	system("Pause");
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    printf("[simpleCUFFT] is starting...\n");

    findCudaDevice(argc, (const char **)argv);

    // Allocate host memory for the signal
	cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);

	cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);

    // Initalize the memory for the signal
    for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
    {
        h_signal[i].x = rand() / (float)RAND_MAX;
        h_signal[i].y = 0;
    }
    cufftComplex *d_signal;
    checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE));
    // Copy host memory to device
	checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE,
                               cudaMemcpyHostToDevice));
    cufftHandle plan;
    checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));

    // Transform signal and kernel
    printf("Transforming signal cufftExecC2C\n");
    checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
    getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
    // Transform signal back
    printf("Transforming signal back cufftExecC2C\n");
    checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));

   // Copy device memory to host
    checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE,
                               cudaMemcpyDeviceToHost));
    // check result
    bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
	for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
	{
		h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
		h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
		printf("first : %f %f  after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
	}
    //Destroy CUFFT context
    checkCudaErrors(cufftDestroy(plan));
    // cleanup memory
    free(h_signal);
    free(h_reversed_signal);
    checkCudaErrors(cudaFree(d_signal));
    cudaDeviceReset();
}

// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
            const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
    int minRadius = filter_kernel_size / 2;
    int maxRadius = filter_kernel_size - minRadius;
    int new_size = signal_size + maxRadius;

    // Pad signal
    Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
    memcpy(new_data +           0, signal,              signal_size * sizeof(Complex));
    memset(new_data + signal_size,      0, (new_size - signal_size) * sizeof(Complex));
    *padded_signal = new_data;

    // Pad filter
    new_data = (Complex *)malloc(sizeof(Complex) * new_size);
    memcpy(new_data +                    0, filter_kernel + minRadius,                       maxRadius * sizeof(Complex));
    memset(new_data +            maxRadius,                         0, (new_size - filter_kernel_size) * sizeof(Complex));
    memcpy(new_data + new_size - minRadius,             filter_kernel,                       minRadius * sizeof(Complex));
    *padded_filter_kernel = new_data;

    return new_size;
}

My result

[simpleCUFFT] is starting...
GPU Device 0: "GeForce GTX 570" with compute capability 2.0

Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
ERROR, l2-norm error 1 is greater than epsilon 1e-005
first : 0.001251 0.000000  after 0.000000 0.000000
first : 0.563585 0.000000  after 8388608.000000 16895.835938
first : 0.193304 0.000000  after -26975130.000000 -26975130.000000
first : 0.808740 0.000000  after -26975130.000000 -26975130.000000
first : 0.585009 0.000000  after -26975130.000000 -26975130.000000
first : 0.479873 0.000000  after -26975130.000000 -26975130.000000
first : 0.350291 0.000000  after -26975130.000000 -26975130.000000
first : 0.895962 0.000000  after -26975130.000000 -26975130.000000
first : 0.822840 0.000000  after -26975130.000000 -26975130.000000
first : 0.746605 0.000000  after -26975130.000000 -26975130.000000
first : 0.174108 0.000000  after -26975130.000000 -26975130.000000
first : 0.858943 0.000000  after -26975130.000000 -26975130.000000
first : 0.710501 0.000000  after -26975130.000000 -26975130.000000
first : 0.513535 0.000000  after -26975130.000000 -26975130.000000
first : 0.303995 0.000000  after -26975130.000000 -26975130.000000
first : 0.014985 0.000000  after -26975130.000000 -26975130.000000

This has been asked numerous times before. Here is a short answer:
https://devtalk.nvidia.com/default/topic/489258/cuda-programming-and-performance/1d-cufft-results-not-matching-fftwf-results-source-code-attached/post/3507338/#3507338

Yes, thanks but my results are completely wrong apparently and it’s not a scale problem, like sqrt(2). I’m trying but I’m new on CUDA. If someone could take a look on my code. Thanks.

Your cudaMalloc call on line 62, and cudaMemcpy on line 64 are only allocating SIGNAL_SIZE bytes, which I am assuming is wrong.

I think if you change the code in 3 places (lines 62, 64, and 78), along the lines of what Tiomat said, it will be basically correct. At least it works for me. I still get the ERROR l2-norm error printout, but the before and after data is identical. I’m not sure calling that error checking function at that point in the code makes sense, because the output signal has not been scaled down by SIGNAL_SIZE, yet.

Here’s a fully worked example with the 3 changes I mentioned above (now at lines 57, 59, and 73 below). I’ve also moved the sdk error checking function to after the scaling of the result by the data set size, so that it will give a proper result also.

$ cat t653.cu
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>

// Complex data type
typedef float2 Complex;

// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);

// Padding functions
int PadData(const Complex *, Complex **, int,
            const Complex *, Complex **, int);

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE        16
#define FILTER_KERNEL_SIZE 11
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    printf("[simpleCUFFT] is starting...\n");

    findCudaDevice(argc, (const char **)argv);

    // Allocate host memory for the signal
        cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);

        cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);

    // Initalize the memory for the signal
    for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
    {
        h_signal[i].x = rand() / (float)RAND_MAX;
        h_signal[i].y = 0;
    }
    cufftComplex *d_signal;
    checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
    // Copy host memory to device
        checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex),
                               cudaMemcpyHostToDevice));
    cufftHandle plan;
    checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));

    // Transform signal and kernel
    printf("Transforming signal cufftExecC2C\n");
    checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
    getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
    // Transform signal back
    printf("Transforming signal back cufftExecC2C\n");
    checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));

   // Copy device memory to host
    checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex),
                               cudaMemcpyDeviceToHost));
    // check result
        for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
        {
                h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
                h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
                printf("first : %f %f  after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
        }
    bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
    //Destroy CUFFT context
    checkCudaErrors(cufftDestroy(plan));
    // cleanup memory
    free(h_signal);
    free(h_reversed_signal);
    checkCudaErrors(cudaFree(d_signal));
    cudaDeviceReset();
}

// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
            const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
    int minRadius = filter_kernel_size / 2;
    int maxRadius = filter_kernel_size - minRadius;
    int new_size = signal_size + maxRadius;

    // Pad signal
    Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
    memcpy(new_data +           0, signal,              signal_size * sizeof(Complex));
    memset(new_data + signal_size,      0, (new_size - signal_size) * sizeof(Complex));
    *padded_signal = new_data;

    // Pad filter
    new_data = (Complex *)malloc(sizeof(Complex) * new_size);
    memcpy(new_data +                    0, filter_kernel + minRadius,                       maxRadius * sizeof(Complex));
    memset(new_data +            maxRadius,                         0, (new_size - filter_kernel_size) * sizeof(Complex));
    memcpy(new_data + new_size - minRadius,             filter_kernel,                       minRadius * sizeof(Complex));
    *padded_filter_kernel = new_data;

    return new_size;
}
$ nvcc -D_DEBUG -arch=sm_20 -I/usr/local/cuda/samples/common/inc -o t653 t653.cu -lcufft
$ ./t653
[simpleCUFFT] is starting...
GPU Device 0: "Quadro 5000" with compute capability 2.0

Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
first : 0.840188 0.000000  after 0.840188 0.000000
first : 0.394383 0.000000  after 0.394383 -0.000000
first : 0.783099 0.000000  after 0.783099 0.000000
first : 0.798440 0.000000  after 0.798440 0.000000
first : 0.911647 0.000000  after 0.911647 -0.000000
first : 0.197551 0.000000  after 0.197551 0.000000
first : 0.335223 0.000000  after 0.335223 0.000000
first : 0.768230 0.000000  after 0.768230 -0.000000
first : 0.277775 0.000000  after 0.277775 -0.000000
first : 0.553970 0.000000  after 0.553970 0.000000
first : 0.477397 0.000000  after 0.477397 0.000000
first : 0.628871 0.000000  after 0.628871 -0.000000
first : 0.364784 0.000000  after 0.364785 0.000000
first : 0.513401 0.000000  after 0.513401 -0.000000
first : 0.952230 0.000000  after 0.952230 -0.000000
first : 0.916195 0.000000  after 0.916195 -0.000000
$

I used linux instead of windows, but that should not matter from a code correctness standpoint.

Thank you txbob, this code will help me a lot in my work. And one more time, I’m sorry the hassle.