Problem with cufft test (computing forward and inverse fft)

I am trying out the cufft library. As a test, I wanted to compute the 2d forward and inverse FFT of the array
0, 1, 2,
3, 4, 5,
6, 7, 8.

The problem is that I am getting back the array
0, 9, 2,
3, 4, 5,
6, 7, 8.

I can’t find the problem. Here is the code; any help is appreciated.
[checkCudaErrors() is the macro from helper_cuda.h]

void test_fft() 
{
	int input_width = 3;
	int input_height = 3;

	float input_f[9] = {
		0, 1, 2,
		3, 4, 5,
		6, 7, 8
	};

	// Allocate fft input buffer on device
	float *d_data;
	size_t d_data_size = input_height * input_width * sizeof(float);
	checkCudaErrors(cudaMalloc((void**) &d_data, d_data_size));

	// Allocate fft output buffer on device
	cufftComplex *d_fft_output;
	size_t d_fft_output_size = input_height * (input_width / 2 + 1) * sizeof(cufftComplex);
	checkCudaErrors(cudaMalloc((void**) &d_fft_output, d_fft_output_size));
	
	// Make forward and inverse plans
	cufftHandle fft_plan_forward, fft_plan_inverse;
	checkCudaErrors(cufftPlan2d(&fft_plan_forward, input_height, input_width, CUFFT_R2C));
	checkCudaErrors(cufftPlan2d(&fft_plan_inverse, input_height, input_width, CUFFT_C2R));

	// Copy input data on device
	checkCudaErrors(cudaMemcpy(d_data, input_f, d_data_size, cudaMemcpyHostToDevice));

	// Compute forward FFT
	checkCudaErrors(cufftExecR2C(fft_plan_forward, d_data, d_fft_output));
	checkCudaErrors(cudaDeviceSynchronize());

	// Allocate fft output buffer on host
	float *h_fft_output;
	size_t h_fft_output_size = input_height * input_width * sizeof(float);
	h_fft_output = (float*) malloc(h_fft_output_size);

	// Copy fft output data to host
	checkCudaErrors(cudaMemcpy(h_fft_output, d_fft_output, h_fft_output_size, cudaMemcpyDeviceToHost));

	// Compute inverse FFT
	checkCudaErrors(cufftExecC2R(fft_plan_inverse, d_fft_output, d_data));
	checkCudaErrors(cudaDeviceSynchronize());

	// Copy inverse fft output data to host
	checkCudaErrors(cudaMemcpy(input_f, d_data, input_height * input_width, cudaMemcpyDeviceToHost));

	cout << endl << "forward fft real: " << endl;
	for (int i = 0; i < h_fft_output_size / sizeof(float); i++) {
		cout << h_fft_output[2 * i] << " ";
	}
	
	cout << endl << "forward fft imaginary: " << endl;
	for (int i = 0; i < h_fft_output_size / sizeof(float); i++) {
		cout << h_fft_output[2 * i + 1] << " ";
	}

	cout << endl << "inverse fft: " << endl;
	for (int i = 0; i < input_height; i++) {
		for (int j = 0; j < input_width; j++)
			cout << input_f[i * input_width + j] << " ";
		cout << endl;
	}
}

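OK, the cause was the final cudaMemcpy (device to host): it copied input_height * input_width bytes instead of input_height * input_width * sizeof(float) bytes, so only the first 9 bytes of the result were transferred. Note that cuFFT transforms are unnormalized, so after this fix the inverse output still has to be divided by input_height * input_width to match the original array.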
Sorry!