Help cufft execution

Hello !

I am a beginner on cuda and I expect to realize some fast fourier transform whith cufft.h. For that, I made the following code in order to get an image, make the fft transform, reverse it and display the raw_image and the IFFT(FFT(raw_image)) to verify if the images are the same.

And… it for now doesn’t work. ^^

I guess this is the way I stored the data input to make the fft through the GPU but I do not see where is the bug. :/

The datas I collected come from a CImg. For this reason the image data are linearly dispatch along widthheightdepth*spectrum.
The link to the definition : http://cimg.eu/reference/group__cimg__storage.html

The code :

#include <iostream>
#include <stdio.h>
#include "CImg.h"
#include "cuda_runtime.h"
#include "cufft.h"
#include <cassert>

//g++ cufft_ok.cpp -I/usr/local/cuda-9.0/include -L/usr/local/cuda-9.0/lib64 -o cufft_ok -lX11 -lpthread -lcudart -lcufft
using namespace cimg_library;

int main(){
	CImg<float>* p_image  = new CImg<float>("image.bmp"); //the base image
	CImg<float> &image = (*p_image);

	
	float* data = image.data();
	int size = image.size();
	int width = image.width();
	int height = image.height();
	int depth = image.depth();
	int spectrum = image.spectrum();

	CImg<float> test_img(width,height,depth,1); 
	memcpy(test_img._data,data,width*height*depth*sizeof(float));


	size_t work_size;
	int fft_sz = test_img.size();            // Size of each FFT
	int fft_nx = width;
	int fft_ny = height;
	int num_ffts = 1;         // How many FFTs to do
	cufftComplex *out_buf_d;
	cufftReal *in_buf_h, *in_buf_d;

	// Allocate buffers on host and device
	in_buf_h = new float;
	cudaMalloc(&in_buf_d, fft_sz*num_ffts*sizeof(float));
	if (cudaGetLastError() != cudaSuccess){
		fprintf(stderr, "Cuda error: Failed to allocate\n");
		return -1;	
	}
	
	cudaMalloc(&out_buf_d, fft_sz*num_ffts*sizeof(cufftComplex));
	if (cudaGetLastError() != cudaSuccess){
		fprintf(stderr, "Cuda error: Failed to allocate\n");
		return -1;	
	}

	// Fill input buffer with zeros and copy to device
	if(cudaMemset(out_buf_d, 0, fft_sz*num_ffts*sizeof(cufftComplex)) != cudaSuccess){
		fprintf(stderr,"Cuda error : Failed to set memory");
		return -1;
	}

	// Copy image.data() into in_buf_h
	memcpy(in_buf_h,image._data,sizeof(float)*fft_sz*num_ffts);

	// Copy the data from host to device	
	if(cudaMemcpy(in_buf_d, in_buf_h, fft_sz*num_ffts*sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess){
		fprintf(stderr,"Cuda error : Failed to copy memory");
		return -1;
	}
	

	// Plan num_ffts of size fft_sz
	cufftHandle plan;
	cufftCreate(&plan);
	if(cufftMakePlan2d(plan, fft_nx, fft_ny, CUFFT_R2C, &work_size)!= CUFFT_SUCCESS){
		fprintf(stderr,"Cufft error : Failed to make plan");	
		return -1;
	}

	//execute the plan
	if(cufftExecR2C(plan, in_buf_d, out_buf_d) != CUFFT_SUCCESS){
		fprintf(stderr,"Cufft error : Failed to execute the cufft");	
		return -1;
	}

	// Sync the device to flush the output
	if(cudaDeviceSynchronize() != cudaSuccess){
		fprintf(stderr,"Cuda error : Failed to synchronize memory");
		return -1;
	}

	//Plan for ifft
	if(cufftMakePlan2d(plan, fft_nx, fft_ny, CUFFT_C2R, &work_size) != CUFFT_SUCCESS){
		fprintf(stderr,"Cufft error : Failed to make the plan");	
		return -1;
	}

	//execute the plan
	if(cufftExecC2R(plan, out_buf_d, in_buf_d) != CUFFT_SUCCESS){
		fprintf(stderr,"Cufft error : Failed to execute the cufft");	
		return -1;
	}

	// Sync the device to flush the output
	if(cudaDeviceSynchronize() != cudaSuccess){
		fprintf(stderr,"Cuda error : Failed to synchronize memory");
		return -1;
	}

	// Bring back the final data on host
	if(cudaMemcpy(in_buf_h,in_buf_d, fft_sz*num_ffts*sizeof(float), cudaMemcpyDeviceToHost) != cudaSuccess){
		fprintf(stderr,"Cuda error : Failed to copy memory");
		return -1;
	}
	
	//store of ifft(fft(image))
	CImg<float> final(width,height,depth,1); 
	memcpy(final._data,in_buf_h,fft_sz*num_ffts*sizeof(float));

	//Display images to compare visualy
	(test_img,final).display("Gauche : image de départ ; Droite : IFFT(FFT(image))");

	image.~CImg();	
	cufftDestroy(plan); 
	delete in_buf_h;
	cudaFree(in_buf_d);
	cudaFree(out_buf_d);

	return 0;
}

I thank you for your time !
image.bmp (201 KB)

I would try to aim lower first. For example fill a 2D array (as in a linear array with size nx*ny) with a single mode:

dataRealSpace[i+nx*j] = cos(2pikxi/nx)cos(2piky*j/ny);

And see that after the FFT only the mode (kx,ky) is non zero (or at least there is a peak and the rest is numerical noise).
This mode will be in dataFourierSpace[kx+kynx] and it should appear with a value of nxny/4 if I am not mistaken.
Then maybe you can reverse the FFT and see if you get the original data. Seems easier to debug!

I think the FFT format is very tricky, so I would try to get this right before. Then you can see if the problem is with the format of image loading library you are using or the way you are setting up FFT, or whatever…

Good luck!