Question about the CUFFT sample code

I’ve taken the sample code and stripped out most of the non-essential parts, which leaves me with:

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

#include <cutil.h>

#include <cufft.h>

#include <cutil_inline.h>

// Complex data type

typedef float2 Complex; 


// declaration, forward

void runTest(int argc, char** argv);

#define SIGNAL_SIZE		100


// Program entry point: calls runTest with the command-line arguments, then
// cutilExit with the same arguments.
// NOTE(review): the braces around this function body are missing from the
// listing — presumably lost when the code was pasted.


int main(int argc, char** argv) 


	runTest(argc, argv);

	cutilExit(argc, argv);


// Runs the CUFFT example end to end: fills a host signal with (1, 0) samples,
// prints it, copies it to the device, executes an in-place forward C2C
// transform, copies the result back to the host and prints it.
// NOTE(review): this listing contains no braces, so the block structure
// (function body, if/else, loop bodies) has to be inferred from the
// indentation — presumably the braces were lost when the code was pasted.
void runTest(int argc, char** argv) 


	// Device selection: cutilDeviceInit is called when cutCheckCmdLineFlag
	// reports the "device" command-line flag.
	// NOTE(review): the cudaSetDevice call below presumably belonged to an
	// else branch whose keyword is missing from this listing — confirm.
	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

		cutilDeviceInit(argc, argv);


		cudaSetDevice( cutGetMaxGflopsDeviceId() );

	// Allocate host memory for the signal (SIGNAL_SIZE Complex elements)

	Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);


	// Initialize the memory for the signal: real part 1, imaginary part 0

	for (unsigned int i = 0; i < SIGNAL_SIZE; i++) {

	h_signal[i].x = 1;

		h_signal[i].y = 0;


	// Print signal (real part only)

	for (unsigned int i = 0; i < SIGNAL_SIZE; i++) {

	printf("%f\n", h_signal[i].x);


	printf("end of signal\n");

	// Allocate device memory for signal
	// NOTE(review): the size argument is SIGNAL_SIZE (the element count), not
	// SIGNAL_SIZE * sizeof(Complex) bytes; the reply below the listing
	// identifies this as the mistake.

	Complex* d_signal;

	cutilSafeCall(cudaMalloc((void**)&d_signal, SIGNAL_SIZE));

	// Copy host memory to device
	// NOTE(review): same issue — the copy size is given as SIGNAL_SIZE rather
	// than SIGNAL_SIZE * sizeof(Complex).

	cutilSafeCall(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE, cudaMemcpyHostToDevice));

	// CUFFT plan: 1D complex-to-complex, SIGNAL_SIZE points, batch of 1

	cufftHandle plan;

	cufftSafeCall(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));

	// Transform signal: forward FFT, in place (input and output are both d_signal)

	cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));

	// Copy device memory to host
	// h_convolved_signal aliases h_signal, so the result overwrites the input
	// buffer on the host.
	// NOTE(review): the copy size is again SIGNAL_SIZE rather than
	// SIGNAL_SIZE * sizeof(Complex).

	Complex* h_convolved_signal = h_signal;

	cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal, SIGNAL_SIZE, cudaMemcpyDeviceToHost));

	// Print result (real and imaginary part of each element)

	for (unsigned int i = 0; i < SIGNAL_SIZE; i++) {

	printf("%f %f\n", h_convolved_signal[i].x, h_convolved_signal[i].y);


	printf("end of result\n");

	//Destroy CUFFT context
	// NOTE(review): no call follows this comment in the listing; the plan is
	// not destroyed in the visible code.


	// cleanup memory
	// NOTE(review): no call follows this comment in the listing; h_signal and
	// d_signal are not freed in the visible code.





Basically, I create the data to be transformed, print it, let CUFFT do the transform, and print the results. The output, however, is not a correct Fourier transform, and I can’t find my mistake.

From a quick scan, your cudaMalloc is incorrect:
cudaMalloc((void**)&d_signal, SIGNAL_SIZE)

It should be:
cudaMalloc((void**)&d_signal, SIGNAL_SIZE*sizeof(Complex))

The same problem applies to both cudaMemcpy calls: the size argument should be SIGNAL_SIZE*sizeof(Complex), not SIGNAL_SIZE.

You are confusing the number of elements with the size in bytes.