strange results of convolutionFFT2D

Hello, world!

Merry Christmas!

I have some problems with the convolution, based on cufft. I used the sample code from cuda (cuda/samples/3_Imaging/convolutionFFT2D) as a base and changed it a little bit - just to see the difference with FFTW routines. I have never used CUDA before.

However, I had strange (from my point of view) results. I present the code below. If somebody of you had a deal with this sample, please, help me to understand, what’s going on… :)

Convolution function (changed sample)

double *cuda_convolution_2D(double *h_Data, double *h_Kernel, 
		int dataW, int dataH, 
		int kernelW, int kernelH,
		int kx, int ky)
{
	float *h_ResultGPU;
	float *d_Data, *d_PaddedData, *d_Kernel, *d_PaddedKernel;
	fComplex *d_DataSpectrum, *d_KernelSpectrum;
	cufftHandle fftPlanFwd, fftPlanInv;

	bool bRetVal;
	StopWatchInterface *hTimer = NULL;
	sdkCreateTimer(&hTimer);

	printf("Testing built-in R2C / C2R FFT-based convolution\n");
	const int kernelY = kx;	
	const int kernelX = ky;	
	const int    fftH = snapTransformSize(dataH + kernelH - 1);
	const int    fftW = snapTransformSize(dataW + kernelW - 1);

	printf("...allocating memory\n");
	h_ResultGPU = (float *)malloc(fftH    * fftW * sizeof(float));

	checkCudaErrors(cudaMalloc((void **)&d_Data,   dataH   * dataW   * sizeof(float)));
	checkCudaErrors(cudaMalloc((void **)&d_Kernel, kernelH * kernelW * sizeof(float)));

	checkCudaErrors(cudaMalloc((void **)&d_PaddedData,   fftH * fftW * sizeof(float)));
	checkCudaErrors(cudaMalloc((void **)&d_PaddedKernel, fftH * fftW * sizeof(float)));

	checkCudaErrors(cudaMalloc((void **)&d_DataSpectrum,   fftH * (fftW / 2 + 1) * sizeof(fComplex)));
	checkCudaErrors(cudaMalloc((void **)&d_KernelSpectrum, fftH * (fftW / 2 + 1) * sizeof(fComplex)));

	printf("...creating R2C & C2R FFT plans for %i x %i\n", fftH, fftW);
	checkCudaErrors(cufftPlan2d(&fftPlanFwd, fftW, fftH, CUFFT_R2C));
	checkCudaErrors(cufftPlan2d(&fftPlanInv, fftW, fftH, CUFFT_C2R));

	printf("...uploading to GPU and padding convolution kernel and input data\n");
	checkCudaErrors(cudaMemcpy(d_Kernel, h_Kernel, kernelH * kernelW * sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemcpy(d_Data,   h_Data,   dataH   * dataW *   sizeof(float), cudaMemcpyHostToDevice));
	checkCudaErrors(cudaMemset(d_PaddedKernel, 0, fftH * fftW * sizeof(float)));
	checkCudaErrors(cudaMemset(d_PaddedData,   0, fftH * fftW * sizeof(float)));

	padKernel(
		d_PaddedKernel,
		d_Kernel,
		fftH,
		fftW,
		kernelH,
		kernelW,
		kernelY,
		kernelX
	);

	padDataClampToBorder(
		d_PaddedData,
		d_Data,
		fftH,
		fftW,
		dataH,
		dataW,
		kernelH,
		kernelW,
		kernelY,
		kernelX
	);

	//Not including kernel transformation into time measurement,
	//since convolution kernel is not changed very frequently
	printf("...transforming convolution kernel\n");
	checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedKernel, (cufftComplex *)d_KernelSpectrum));

	printf("...running GPU FFT convolution: ");
	checkCudaErrors(cudaDeviceSynchronize());
	sdkResetTimer(&hTimer);
	sdkStartTimer(&hTimer);
	checkCudaErrors(cufftExecR2C(fftPlanFwd, (cufftReal *)d_PaddedData, (cufftComplex *)d_DataSpectrum));
	modulateAndNormalize(d_DataSpectrum, d_KernelSpectrum, fftW, fftH, 1);
	checkCudaErrors(cufftExecC2R(fftPlanInv, (cufftComplex *)d_DataSpectrum, (cufftReal *)d_PaddedData));

	checkCudaErrors(cudaDeviceSynchronize());
	sdkStopTimer(&hTimer);
	double gpuTime = sdkGetTimerValue(&hTimer);
	printf("%f MPix/s (%f ms)\n", (double)dataH * (double)dataW * 1e-6 / (gpuTime * 0.001), gpuTime);

	printf("...reading back GPU convolution results\n");
	checkCudaErrors(cudaMemcpy(h_ResultGPU, d_PaddedData, fftH * fftW * sizeof(float), cudaMemcpyDeviceToHost));

	double sum_delta2 = 0;
	double sum_ref2   = 0;
	double max_delta_ref = 0;

	double *res = new double[fftW*fftH];
	for (int y = 0; y < fftH; y++) {
		for (int x = 0; x < fftW; x++) {
			double  rGPU = (double)h_ResultGPU[y * fftW  + x];
			res[fftW*y + x] = rGPU;
		}
	}

	printf("...shutting down\n");
	sdkDeleteTimer(&hTimer);

	checkCudaErrors(cufftDestroy(fftPlanInv));
	checkCudaErrors(cufftDestroy(fftPlanFwd));

	checkCudaErrors(cudaFree(d_DataSpectrum));
	checkCudaErrors(cudaFree(d_KernelSpectrum));
	checkCudaErrors(cudaFree(d_PaddedData));
	checkCudaErrors(cudaFree(d_PaddedKernel));
	checkCudaErrors(cudaFree(d_Data));
	checkCudaErrors(cudaFree(d_Kernel));

	free(h_ResultGPU);
	return res;
}

And main function is:

int N=1024, M=1024;
	int r_sq = 10.0;
	int r_kern = N/10;
	double *data = new double[N*M];
	int Nk = 1024, Mk = 1024;
	double *kernel = new double[Nk*Mk];

// Initializing sample array	
	for(int i = 0; i<N; i++) {
		for(int j=0; j<M; j++) {
			if((i>2*N/3-r_sq && i<2*N/3+r_sq && j<M/2+r_sq && j>M/2-r_sq) ||
			(i>N/3-r_sq && i<N/3+r_sq && j<M/2+r_sq && j>M/2-r_sq) ||
			(i>N/3-r_sq && i<N/3+r_sq && j<M/3+r_sq && j>M/3-r_sq) ){
				data[j*N+i] = 1.0;
			} else {
				data[j*N+i] = 0.0;
			}
		}
	}

//Initializing blurring kernel
	for(int i = 0; i<Nk; i++) {
		for(int j=0; j<Mk; j++) {
			if((i-Nk/2)*(i-Nk/2) + (j-Mk/2)*(j-Mk/2) < r_kern*r_kern) {
// 			if(i*i + j*j < r_kern*r_kern) {
				kernel[j*Nk+i] = 1.0;
			} else {
				kernel[j*Nk+i] = 0.0;
			}
		}
	}
	double *res = cuda_convolution_2D(data, kernel, N, M, Nk, Mk, Nk/2, Mk/2);

This is data (1024 X 1024):
http://postimg.org/image/dvs9yp99d/

Kernel (1024 X 1024):
http://postimg.org/image/afgl6k4xl/

And the result… s\
http://postimg.org/image/jm3h7kzjt/

Instead of assumed (obtained using FFTW):
http://postimg.org/image/y32rwk1l1/

Since I’m not a professional programmer, I assume that I don’t understand something important about the process of convolution with cuda r2c and c2r transforms and padding of kernel or data. So, I will be grateful for any explanations :) Thank you!