Printing out values after cuFFT

I am trying to run a 1D cuFFT implementation. I want to take in a vector of real or complex-valued entries and transform it, nothing more. What I have right now compiles with the correct arch types (I borrowed the Makefile from the samples). I want to print out the results of the transformed signal, but I have no idea how to do it: I keep getting seg faults. I just want to verify the result using another program such as Maple or MATLAB. Code:

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"

#define SIGNAL_SIZE 16

// Complex data type
typedef float2 Complex;

//--------------------------------------------------------------------------------
// Pad data function
// Copies `signal` into a newly malloc'ed host buffer and zero-fills whatever
// lies beyond `signal_size` in that buffer.
//
// The padding amount is currently zero (new_size == signal_size), so the
// result is a plain copy of the input; the radius-based padding is left
// commented out below.
//
// Parameters:
//   signal        - input samples, `signal_size` elements
//   padded_signal - out: receives the new buffer; the caller must free() it
//   signal_size   - number of input samples, must be >= 0
//
// Returns the number of elements in the new buffer.
// Prints a message to stderr and exits if the size is negative or the
// allocation fails, instead of copying through a NULL pointer.
int PadData(const Complex *signal, Complex **padded_signal, int signal_size) {
  if (signal_size < 0) {
    // A negative count would wrap to a huge size_t in the malloc below.
    fprintf(stderr, "PadData: invalid signal size %d\n", signal_size);
    exit(EXIT_FAILURE);
  }

//  int minRadius = filter_kernel_size / 2;
//  int maxRadius = filter_kernel_size - minRadius;
  int new_size = signal_size; //+ maxRadius;

  // Pad signal
  Complex *new_data =
      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * new_size));
  if (new_data == NULL && new_size > 0) {
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // is treated as an allocation failure.
    fprintf(stderr, "PadData: failed to allocate %d elements\n", new_size);
    exit(EXIT_FAILURE);
  }

  if (signal_size > 0) {
    memcpy(new_data, signal, signal_size * sizeof(Complex));
  }
  if (new_size > signal_size) {
    memset(new_data + signal_size, 0,
           (new_size - signal_size) * sizeof(Complex));
  }
  *padded_signal = new_data;

// Pad filter
//  new_data = reinterpret_cast<Complex *>(malloc(sizeof(Complex) * new_size));
//  memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
//  memset(new_data + maxRadius, 0,
//         (new_size - filter_kernel_size) * sizeof(Complex));
//  memcpy(new_data + new_size - minRadius, filter_kernel,
//         minRadius * sizeof(Complex));
//  *padded_filter_kernel = new_data;

  return new_size;
}
// Runs an in-place forward 1D complex-to-complex cuFFT on a SIGNAL_SIZE-point
// signal and prints the transformed values, one "real imag" pair per line.
//
// The result lives in device memory after cufftExecC2C; it is copied back to
// the host with cudaMemcpy and only the host copy is read when printing.
int main() {
  // Allocate host memory for the input signal
  Complex *h_signal =
      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
  if (h_signal == NULL) {
    fprintf(stderr, "Failed to allocate host memory for the signal\n");
    return EXIT_FAILURE;
  }

  // Initialize the signal: real part is i / RAND_MAX, imaginary part is 0.
  // With a large RAND_MAX these values are tiny.
  for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
    h_signal[i].x = i / static_cast<float>(RAND_MAX);
    h_signal[i].y = 0;
  }

  // Copy the signal into the (currently unpadded) working buffer
  Complex *h_padded_signal = NULL;
  int new_size = PadData(h_signal, &h_padded_signal, SIGNAL_SIZE);
  if (h_padded_signal == NULL) {
    fprintf(stderr, "Failed to allocate host memory for the padded signal\n");
    free(h_signal);
    return EXIT_FAILURE;
  }
  size_t mem_size = sizeof(Complex) * new_size;

  // Allocate device memory for the signal
  Complex *d_signal = NULL;
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_signal), mem_size));

  // Copy host memory to device
  checkCudaErrors(
      cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice));

  // CUFFT plan, simple API: one batch of new_size points
  cufftHandle plan;
  checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1));

  // Transform the signal in place (input and output are the same buffer)
  printf("Transforming signal cufftExecC2C\n");
  checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
                               reinterpret_cast<cufftComplex *>(d_signal),
                               CUFFT_FORWARD));

  // Copy the result back to the host, reusing the padded host buffer
  Complex *h_transformed = h_padded_signal;
  checkCudaErrors(
      cudaMemcpy(h_transformed, d_signal, mem_size, cudaMemcpyDeviceToHost));

  // Print the transformed signal from the host copy. %e is used because the
  // scaled inputs are small enough that %f may show them all as 0.000000.
  for (int i = 0; i < new_size; ++i) {
    printf("%e %e\n", h_transformed[i].x, h_transformed[i].y);
  }

  // Destroy CUFFT context
  checkCudaErrors(cufftDestroy(plan));

  // Clean up memory
  free(h_signal);
  free(h_padded_signal);
  checkCudaErrors(cudaFree(d_signal));

  return EXIT_SUCCESS;
}

Also, how do you use the result? Do you need to cast it to a normal C++ data type, and is that possible? My thought is to plot it.

Also, CUDA 10 is being used.

I was trying to print the memory allocated for the device :)

h_signal[i].x = i / static_cast<float>(RAND_MAX);
This also makes the values go to zero. I am not sure why; maybe it is for the padding used in the convolution, but I didn't want that.

I am currently working with this version, which prints out the correct values for the transform of the vector:

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include "common/inc/helper_cuda.h"
#include "common/inc/helper_functions.h"

#define SIGNAL_SIZE 16

// Complex data type
typedef float2 Complex;

//--------------------------------------------------------------------------------

// Runs an in-place forward 1D complex-to-complex cuFFT on the ramp
// 1, 2, ..., SIGNAL_SIZE (imaginary parts 0). Prints the input and then the
// transformed values, one "real imag" pair per line.
//
// The loops are 0-based so every access stays inside the SIGNAL_SIZE-element
// buffer. Indexing from 1 to SIGNAL_SIZE would write one element past the end
// of the allocation and leave element 0 uninitialized.
int main() {
  // Allocate host memory for the signal
  Complex *h_signal =
      reinterpret_cast<Complex *>(malloc(sizeof(Complex) * SIGNAL_SIZE));
  if (h_signal == NULL) {
    fprintf(stderr, "Failed to allocate host memory for the signal\n");
    return EXIT_FAILURE;
  }

  // Initialize the signal: element i holds the value i + 1
  for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
    h_signal[i].x = static_cast<float>(i + 1);
    h_signal[i].y = 0;
  }

  // Print the input signal
  for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
    printf("%f %f\n", h_signal[i].x, h_signal[i].y);
  }

  // No padding is applied; the transform length equals the signal length
  int new_size = SIGNAL_SIZE;
  size_t mem_size = sizeof(Complex) * new_size;

  // Allocate device memory for the signal
  Complex *d_signal = NULL;
  checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_signal), mem_size));

  // Copy host memory to device
  checkCudaErrors(
      cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice));

  // CUFFT plan, simple API: one batch of new_size points
  cufftHandle plan;
  checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1));

  // Transform the signal in place (input and output are the same buffer)
  printf("Transforming signal cufftExecC2C\n");
  checkCudaErrors(cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(d_signal),
                               reinterpret_cast<cufftComplex *>(d_signal),
                               CUFFT_FORWARD));

  // Copy the result back to the host, overwriting the input buffer
  Complex *h_transformed = h_signal;
  checkCudaErrors(
      cudaMemcpy(h_transformed, d_signal, mem_size, cudaMemcpyDeviceToHost));

  // Print the transformed signal from the host copy
  for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
    printf("%f %f\n", h_transformed[i].x, h_transformed[i].y);
  }

  // Destroy CUFFT context
  checkCudaErrors(cufftDestroy(plan));

  // Clean up memory
  free(h_signal);
  checkCudaErrors(cudaFree(d_signal));

  return EXIT_SUCCESS;
}