half-empty array after ifft function

I am new to FFT; could you please help me?
I have a

FFT_size=512 
signalArray[FFT_size]

and I send it to

getiFFT_main(cuDoubleComplex *inputSignal)

function:

cuDoubleComplex * iFFTsignalArray = getiFFT_main(signalArray);

// Runs a cuFFT inverse transform of inputSignal on the GPU, copies the device
// output back over inputSignal, and returns that same host pointer.
//
// NOTE(review): this is the version that produces the half-empty result. The
// host data is cuDoubleComplex (double precision), but the device buffers are
// cufftComplex and the plan/exec calls are C2C (single precision). cuFFT then
// treats the buffer as FFT_size single-precision values and writes only the
// first half of d_signal_out; the remaining bytes copied back to the host were
// never written by the transform. Double-precision data needs
// cufftDoubleComplex, CUFFT_Z2Z and cufftExecZ2Z.
// NOTE(review): none of the CUDA/cuFFT return codes are checked.
cuDoubleComplex * getiFFT_main(cuDoubleComplex *inputSignal)
{
	// Byte size of FFT_size double-precision complex samples.
	int mem_size = sizeof(cuDoubleComplex)*FFT_size;
	cufftHandle plan;
	// Single-precision pointer type, although mem_size is computed from cuDoubleComplex.
	cufftComplex *d_signal_in, *d_signal_out;
	cudaMalloc(&d_signal_in, mem_size);
	cudaMalloc(&d_signal_out, mem_size);
	cudaMemcpy(d_signal_in, inputSignal, mem_size, cudaMemcpyHostToDevice);

	// Single-precision complex-to-complex plan with BATCH transforms of length FFT_size.
	cufftPlan1d(&plan, FFT_size, CUFFT_C2C, BATCH);
	cufftExecC2C(plan, d_signal_in, d_signal_out, CUFFT_INVERSE);
	cudaDeviceSynchronize();

	// Copies the whole output buffer back, including the part the transform did not fill.
	cudaMemcpy(inputSignal, d_signal_out, FFT_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
	cufftDestroy(plan);
	cudaFree(d_signal_in);
	cudaFree(d_signal_out);

	return inputSignal;
}

Later, when I print iFFTsignalArray, only the elements up to and including index 255 contain values:

// Excerpt from main(): prints the real and imaginary parts of every element
// returned by getiFFT_main(). `a` is the index of the enclosing loop in main()
// and is not defined within this excerpt.
for (int b = 0; b < FFT_size; b++)
{
printf("generated ifft signal: [%d] [%d] %.2f, %.2f\n", a, b, cuCreal(iFFTsignalArray[b]), cuCimag(iFFTsignalArray[b]));
}

Could you please take a look at the getiFFT_main() function?

I suggest providing complete code, including your input data and your actual test results.

Thank you txbob,
I have four files:
Header, cpp file, cu file and makefile.

My make file is:

# Build rules for the OFDM example: OFDM.cu and Generate.cpp are compiled
# separately with nvcc and linked against cuFFT.
NVCC=nvcc
CUDAFLAGS= -arch=sm_30
OPT= -g -G
# Libraries belong on the link line only; they have no effect with -c.
LDLIBS= -lcufft
RM=/bin/rm -f

# all and clean are commands, not files.
.PHONY: all clean

all: OFDM

# Alternative output name for the same program; it needs the same
# architecture flags and cuFFT library as the OFDM target in order to link.
main: OFDM.o Generate.o
	${NVCC} ${OPT} ${CUDAFLAGS} -o main OFDM.o Generate.o ${LDLIBS}

Generate.o: Header.cuh Generate.cpp
	${NVCC} ${OPT} ${CUDAFLAGS} -std=c++11 -c Generate.cpp

OFDM.o: Header.cuh OFDM.cu
	${NVCC} ${OPT} ${CUDAFLAGS} -std=c++11 -c OFDM.cu

OFDM: OFDM.o Generate.o
	${NVCC} ${OPT} ${CUDAFLAGS} -o OFDM OFDM.o Generate.o ${LDLIBS}

clean:
	${RM} *.o OFDM main

Header:

#define BATCH 1
#define FFT_size 512
#define numberOfUEs 10
#define numberOfBlocks 64
#define numberOfThreads 64

double getGeneratedRandom();
cuDoubleComplex getModulatedSignal();
cuDoubleComplex *getiFFT_main(cuDoubleComplex *inputSignal);

Generate.cpp:

#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include <time.h>
#include <iostream>
#include <cuComplex.h>
#include "Header.cuh"

// Returns a pseudo-random multiple of 0.01 in the closed range [-1.00, 1.00].
// Consumes exactly one value from rand(); seeding is left to the caller.
double getGeneratedRandom()
{
	const int lowest = -100;
	const int highest = 100;
	const int bucketCount = (highest - lowest) + 1;

	// Integer in [-100, 100], interpreted as hundredths.
	const int hundredths = rand() % bucketCount - 100;
	return (double)hundredths / 100;
}

// Builds one random complex symbol whose real and imaginary parts are each
// either +1 or -1, chosen by comparing a fresh getGeneratedRandom() draw
// against 0.5.
//
// The result is returned by value from a plain local; the previous function-
// level static made the function non-reentrant for no benefit.
//
// NOTE(review): getGeneratedRandom() yields values in [-1, 1], so the "> 0.5"
// test selects +1 for only about a quarter of the draws. Confirm whether this
// matches the intended QPSK_mod(rand(1), rand(1)) behaviour.
cuDoubleComplex getModulatedSignal()
{
	// Draw order is kept (real first, then imaginary) so a fixed seed yields
	// the same sequence as before.
	const double realDraw = getGeneratedRandom();
	const double imagDraw = getGeneratedRandom();

	const double realPart = (realDraw > 0.5) ? 1 : -1;
	const double imagPart = (imagDraw > 0.5) ? 1 : -1;

	return make_cuDoubleComplex(realPart, imagPart);
}

// Runs a cuFFT inverse transform of inputSignal on the GPU, copies the device
// output back over inputSignal, and returns that same host pointer.
//
// NOTE(review): this is the version that produces the half-empty result. The
// host data is cuDoubleComplex (double precision), but the device buffers are
// cufftComplex and the plan/exec calls are C2C (single precision). cuFFT then
// treats the buffer as FFT_size single-precision values and writes only the
// first half of d_signal_out; the remaining bytes copied back to the host were
// never written by the transform. Double-precision data needs
// cufftDoubleComplex, CUFFT_Z2Z and cufftExecZ2Z.
// NOTE(review): none of the CUDA/cuFFT return codes are checked.
cuDoubleComplex * getiFFT_main(cuDoubleComplex *inputSignal)
{
	// Byte size of FFT_size double-precision complex samples.
	int mem_size = sizeof(cuDoubleComplex)*FFT_size;
	cufftHandle plan;
	// Single-precision pointer type, although mem_size is computed from cuDoubleComplex.
	cufftComplex *d_signal_in, *d_signal_out;
	cudaMalloc(&d_signal_in, mem_size);
	cudaMalloc(&d_signal_out, mem_size);
	cudaMemcpy(d_signal_in, inputSignal, mem_size, cudaMemcpyHostToDevice);

	// Single-precision complex-to-complex plan with BATCH transforms of length FFT_size.
	cufftPlan1d(&plan, FFT_size, CUFFT_C2C, BATCH);
	cufftExecC2C(plan, d_signal_in, d_signal_out, CUFFT_INVERSE);
	cudaDeviceSynchronize();

	// Copies the whole output buffer back, including the part the transform did not fill.
	cudaMemcpy(inputSignal, d_signal_out, FFT_size * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
	cufftDestroy(plan);
	cudaFree(d_signal_in);
	cudaFree(d_signal_out);

	return inputSignal;
}

OFDM.cu:

#include "device_launch_parameters.h"
#include "device_functions.h"

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <time.h>
#include "Header.cuh"

/*
for n = 1:N
		for carrier_index = 1:FFT_size
			 Tx_carrier_signal(carrier_index) = QPSK_mod(rand(1),rand(1));
		end
		Time_signal(n,:) = sqrt(coeffs(n))*ifft(Tx_carrier_signal, FFT_size).* sqrt(FFT_size);
end
*/

int main()
{
	srand((unsigned)time(0));
	double coefficientsArray[numberOfUEs] = { 0.01, 0.05, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.15, 0.16 };
	cuDoubleComplex signalArray[FFT_size];
	cuDoubleComplex timeSignalArray[numberOfUEs][FFT_size];
	for (int a = 0; a < numberOfUEs; a++)
	{
		for (int carrier_index = 0; carrier_index < FFT_size; carrier_index++)
		{
			signalArray[carrier_index] = getModulatedSignal();
			//printf("generated signal: %d %.2f, %.2f\n", carrier_index, cuCreal(signalArray[carrier_index]), cuCimag(signalArray[carrier_index]));
		}
		cuDoubleComplex * iFFTsignalArray = getiFFT_main(signalArray);
		for (int b = 0; b < FFT_size; b++)
		{
			printf("generated ifft signal: [%d] [%d] %.2f, %.2f\n", a, b, cuCreal(iFFTsignalArray[b]), cuCimag(iFFTsignalArray[b]));
		}
		for (int b = 0; b < FFT_size; b++)
		{
			timeSignalArray[a][b] = cuCmul(iFFTsignalArray[b], make_cuDoubleComplex(sqrt((double)coefficientsArray[a])*sqrt((double)FFT_size), 0));
			//printf("generated time signal array: [%d][%d] %.2f, %.2f\n", a, b , cuCreal(timeSignalArray[a][b]), cuCimag(timeSignalArray[a][b]));
		}
	}
		//printf("generated ifft signal: %d %.2f, %.2f\n", a, cuCreal(iFFTsignalArray[a]), cuCimag(iFFTsignalArray[a]));
/*
		for (int b = 0; b < FFT_size; b++)
		{
			timeSignalArray[a][b] = cuCmul(iFFTsignalArray[b], make_cuDoubleComplex(sqrt((double)coefficientsArray[a])*sqrt((double)FFT_size), 0));

			//printf("generated time signal array: [%d][%d] %.2f, %.2f\n", a, b , cuCreal(timeSignalArray[a][b]), cuCimag(timeSignalArray[a][b]));
		}

*/
	return 0;
}

The output is a large two-dimensional array (10x512). I expect every value to be filled, but in each row all the values after index 255 are 0:

generated ifft signal: [0] [0] -16285016252571713536.00, -14810003504.49
generated ifft signal: [0] [1] 1995785480.20, 1096887.76
generated ifft signal: [0] [2] 1305770.25, -7722831.03
generated ifft signal: [0] [3] -56837415062.17, 1400109.75
generated ifft signal: [0] [4] -0.00, -375201729.35
generated ifft signal: [0] [5] -32483348621.50, -27009093713.53
generated ifft signal: [0] [6] -482210497.30, -7249689.01
generated ifft signal: [0] [7] 0.23, 9407738.01
generated ifft signal: [0] [8] -29685420161.93, 166338848.64
generated ifft signal: [0] [9] -26213.95, -88940.27
generated ifft signal: [0] [10] -142348.84, 909034109.31
generated ifft signal: [0] [11] 122958160.49, 1111066.26

and after the 255th

generated ifft signal: [0] [253] -56837398678.17, -7722833.03
generated ifft signal: [0] [254] 1305767.75, 1096887.26
generated ifft signal: [0] [255] 1995784968.20, -14809999408.49
generated ifft signal: [0] [256] 0.00, 0.00
generated ifft signal: [0] [257] 0.00, 0.00
generated ifft signal: [0] [258] 0.00, 0.00
generated ifft signal: [0] [259] 0.00, 0.00
generated ifft signal: [0] [260] 0.00, 0.00
generated ifft signal: [0] [261] 0.00, 0.00
generated ifft signal: [0] [262] 0.00, 0.00
generated ifft signal: [0] [263] 0.00, 0.00
generated ifft signal: [0] [264] 0.00, 0.00

I don’t understand the term batch; I noticed that when I change it to 2, the output no longer contains zeros.
My main idea was to convert this MATLAB code:

% Reference implementation: builds N time-domain rows of length FFT_size.
for n = 1:N
		% Fill every carrier with a symbol produced from two rand(1) draws.
		for carrier_index = 1:FFT_size
			 Tx_carrier_signal(carrier_index) = QPSK_mod(rand(1),rand(1));
		end
		% Row n is the inverse FFT scaled by sqrt(coeffs(n)) and sqrt(FFT_size).
		% MATLAB's ifft already includes the 1/FFT_size normalization.
		Time_signal(n,:) = sqrt(coeffs(n))*ifft(Tx_carrier_signal, FFT_size).* sqrt(FFT_size);
end

to CUDA. I implemented the ifft(Tx_carrier_signal, FFT_size) part as
the function cuDoubleComplex * getiFFT_main(cuDoubleComplex *inputSignal) in Generate.cpp, and
the rest of the loop in the main function in OFDM.cu.

Please look through and help
Thank you

BATCH should be 1 for a single transform.

Your usage of CUFFT is not consistent for double complex types.

Z2Z is the correct transform type for double complex transforms. You are using C2C.

https://docs.nvidia.com/cuda/cufft/index.html#cufft-transform-types
https://docs.nvidia.com/cuda/cufft/index.html#function-cufftexecc2c-cufftexecz2z

cufftDoubleComplex is the correct data type for Z2Z, you are using cufftComplex.

I get a fully populated array when I make those corrections:

/*
 * Computes the inverse FFT of FFT_size double-precision complex samples with
 * cuFFT and writes the result back over the input.
 *
 * inputSignal: host array of FFT_size cuDoubleComplex values. It is written
 *              only by the final device-to-host copy.
 * Returns inputSignal on success. Returns NULL, after reporting to stderr,
 * if inputSignal is NULL or any CUDA/cuFFT call fails.
 *
 * The output is not normalized: cuFFT does not apply the 1/FFT_size factor,
 * so callers that want MATLAB-style ifft() values must divide by FFT_size.
 * The device buffers hold exactly one transform, so BATCH must be 1.
 */
cuDoubleComplex * getiFFT_main(cuDoubleComplex *inputSignal)
{
        const size_t mem_size = sizeof(cuDoubleComplex) * FFT_size;
        cufftDoubleComplex *d_signal_in = NULL, *d_signal_out = NULL;  // fix
        cufftHandle plan;
        bool plan_created = false;
        bool succeeded = false;
        cudaError_t cuda_status = cudaSuccess;
        cufftResult fft_status = CUFFT_SUCCESS;

        if (inputSignal == NULL) {
                fprintf(stderr, "getiFFT_main: inputSignal is NULL\n");
                return NULL;
        }

        // Single pass; any failing step breaks out to the shared cleanup below.
        do {
                cuda_status = cudaMalloc(&d_signal_in, mem_size);
                if (cuda_status != cudaSuccess) break;
                cuda_status = cudaMalloc(&d_signal_out, mem_size);
                if (cuda_status != cudaSuccess) break;
                cuda_status = cudaMemcpy(d_signal_in, inputSignal, mem_size, cudaMemcpyHostToDevice);
                if (cuda_status != cudaSuccess) break;

                fft_status = cufftPlan1d(&plan, FFT_size, CUFFT_Z2Z, BATCH);  // fix
                if (fft_status != CUFFT_SUCCESS) break;
                plan_created = true;

                fft_status = cufftExecZ2Z(plan, d_signal_in, d_signal_out, CUFFT_INVERSE);  // fix
                if (fft_status != CUFFT_SUCCESS) break;

                // Surfaces any asynchronous execution error from the transform.
                cuda_status = cudaDeviceSynchronize();
                if (cuda_status != cudaSuccess) break;

                cuda_status = cudaMemcpy(inputSignal, d_signal_out, mem_size, cudaMemcpyDeviceToHost);
                if (cuda_status != cudaSuccess) break;

                succeeded = true;
        } while (0);

        if (!succeeded) {
                fprintf(stderr, "getiFFT_main failed: CUDA status \"%s\", cuFFT status %d\n",
                        cudaGetErrorString(cuda_status), (int)fft_status);
        }

        // Only destroy a plan that was actually created; cudaFree(NULL) is a no-op.
        if (plan_created) {
                cufftDestroy(plan);
        }
        cudaFree(d_signal_in);
        cudaFree(d_signal_out);

        return succeeded ? inputSignal : NULL;
}

In your main routine, I would also generally recommend against using large stack-based arrays:

cuDoubleComplex signalArray[FFT_size];
	cuDoubleComplex timeSignalArray[numberOfUEs][FFT_size];

but that is not the issue here.

Thank you very much, txbob.
May I ask you about the correctness of my makefile? I couldn’t find any CUDA makefile tutorials, so I am not sure about it. I would also like to ask about your comment regarding large stack-based arrays.

Could you please explain if the answer is short, or otherwise send a link to related resources?

Thank you