CUFFT Library

Hello, everyone

I am new to both CUDA and FFT.
In Matlab when, I enter a one dimensional array of complex numbers, I have an output of arrays with real numbers of same size and same dimension.
Trying to repeat this in CUDA C, but have different output.
Can you please help? In Matlab, when I enter ifft(array)

My arrayOfComplexNmbers:
[4.6500 + 0.0000i 0.5964 - 1.4325i 0.4905 - 0.5637i 0.4286 - 0.2976i 0.4345 - 0.1512i 0.4500 + 0.0000i 0.4345 + 0.1512i 0.4286 + 0.2976i 0.4905 + 0.5637i 0.5964 + 1.4325i]

My arrayOfRealNumbers
[ 0.9000 0.8000 0.7000 0.6000 0.5000 0.4000 0.3000 0.2000 0.1500 0.1000]

When I enter ifft(arrayOfComplexNmbers) in Matlab, my output is arrayOfRealNumbers.
This is my CUDA code for this:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define NX 256
#define NY 128
#define NRANK 2
#define BATCH 1
#define SIGNAL_SIZE 10

typedef float2 Complex;
__global__ void printCUDAVariables_1(cufftComplex *cudaSignal){
	int index = threadIdx.x + blockIdx.x*blockDim.x;	
	printf("COMPLEX CUDA %d %f %f \n", index, cudaSignal[index].x, cudaSignal[index].y);
}

__global__ void printCUDAVariables_2(cufftReal *cudaSignal){
	int index = threadIdx.x + blockIdx.x*blockDim.x;
	printf("REAL CUDA %d %f \n", index, cudaSignal);
}


int main() {
	cufftHandle plan;
	//int n[NRANK] = { NX, NY };
	Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE);
	float *r_signal = 0;
	if (r_signal != 0){
		r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float));
	}
	else{
		r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float));
	}
	int mem_size = sizeof(Complex)* SIGNAL_SIZE * 2;

	h_signal[0].x = (float)4.65;
	h_signal[0].y = (float)0;

	h_signal[1].x = (float)0.5964;
	h_signal[1].y = (float)0;

	h_signal[2].x = (float)4.65;
	h_signal[2].y = (float)-1.4325;

	h_signal[3].x = (float)0.4905;
	h_signal[3].y = (float)0.5637;

	h_signal[4].x = (float)0.4286;
	h_signal[4].y = (float)-0.2976;

	h_signal[5].x = (float)0.4345;
	h_signal[5].y = (float)-0.1512;

	h_signal[6].x = (float)0.45;
	h_signal[6].y = (float)0;

	h_signal[7].x = (float)0.4345;
	h_signal[7].y = (float)-0.1512;

	h_signal[8].x = (float)0.4286;
	h_signal[8].y = (float)0.2976;

	h_signal[9].x = (float)0.4905;
	h_signal[9].y = (float)-0.5637;

	h_signal[10].x = (float)0.5964;
	h_signal[10].y = (float)1.4325;

	//for (int i = 0; i < SIGNAL_SIZE; i++){
	//	printf("RAW %f %f\n", h_signal[i].x, h_signal[i].y);
	//}
	//allocate device memory for signal
	cufftComplex *d_signal, *d_signal_out;
	cudaMalloc(&d_signal, mem_size);	
	cudaMalloc(&d_signal_out, mem_size);
	cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);
	printCUDAVariables_1 << <10, 1 >> >(d_signal);
	//cufftReal *odata;
	//cudaMalloc((void **)&odata, sizeof(cufftReal)*NX*(NY / 2 + 1));
	
	//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);	
	cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
	cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE);
	//cufftExecC2R(plan, d_signal, odata);
	cudaDeviceSynchronize();
	printCUDAVariables_1 << <10, 1 >> >(d_signal_out);
	//printCUDAVariables_2 << <10, 1 >> >(odata);
	//cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*2*sizeof(float), cudaMemcpyDeviceToHost);
	
	/*
	for (int i = 0; i < SIGNAL_SIZE; ++i){
		printf("IFFT HOST %f %f \n", &h_signal[i].x, &h_signal[i].y);
	}
	*/
	cufftDestroy(plan);
	cudaFree(d_signal);
	cudaFree(d_signal_out);

	return 0;
}

This question is replied in this https://stackoverflow.com/questions/46562575/how-to-cuda-ifft.