# The Output From CUFFT Is Extremely Large

I am recently using cuFFT for development on a GPU accelerated Poisson equation solver. Unfortunately, the results returning from the algorithm were extremely large. I thought there might be something went wrong in the function for solving Poisson equation so I disable it and simply did a forward and inverse transformation. However, the results were still extremely large. Can anybody tell me what mistake was I make? The GPU I am using is Tesla K80, double precision and compute 3.5, and the system is CentOS, version 2.10 and 64 bit.

``````#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#define TPBx	32  // TPBx * TPBy = number of threads per block
#define TPBy 	32

__global__ void real2complex(cufftDoubleComplex *c, double *a, int n);
__global__ void complex2real_scaled(double *a, cufftDoubleComplex *c, double scale, int n);
__global__ void solve_poisson(cufftDoubleComplex *c, double *kx, double *ky, int n);
void exportData(const char *file, const double *X, const double *Y, const double *Z, const int n);
void gaussian(double *bin, const double *X, const double *Y, const int n);

int main(){
///////////////////////////// INITIZALIZATION ////////////////////////////
int N, R;
printf("Phase 1: Set Up The Environment for Testing\n");
printf("Input the range of x and y: ");		// the range of x and y will be from -R to R
scanf("%d", &R);
printf("Input the number of samples: ");	// the number of samples will be N * N
scanf("%d", &N);
printf("Allocating memory...\n");
clock_t startTime11 = clock();

char *uFile = (char *)"u_data.dat";
char *rFile = (char *)"r_data.dat";
char *rfftFile = (char *)"rfft_data.dat";

double *X = (double *)malloc(sizeof(double) * N);
double *Y = (double *)malloc(sizeof(double) * N);
double *kx = (double *)malloc(sizeof(double) * N);
double *ky = (double *)malloc(sizeof(double) * N);
double *r = (double *)malloc(sizeof(double) * N * N);
double *rfft = (double *)malloc(sizeof(double) * N * N);
double *u = (double *)malloc(sizeof(double) * N * N);

const double EPSILON = 8.85418782 * pow(10, -12); // Permitivity of free space
const double PI = 4 * atan(1);

double *kx_d, *ky_d, *r_d;
cufftDoubleComplex *r_complex_d;
cudaMalloc((void **)&kx_d, sizeof(double) * N);
cudaMalloc((void **)&ky_d, sizeof(double) * N);
cudaMalloc((void **)&r_d, sizeof(double) * N * N);
cudaMalloc((void **)&r_complex_d, sizeof(cufftDoubleComplex) * N * N);

int m = 0;
double deltaX = (double)R / (N / 2);
double deltaK = 1.0 / 60;
for(int i = N/-2; i < N/2; i++){
if(m < N){
X[m] = i * deltaX;
Y[m] = i * deltaX;
}
m += 1;
}
m = 0;
for(int i = 0; i < N/2; i++){
if(m < N/2){
kx[m] = i * deltaK;
kx[m+N/2] = (double)(i - N / 2) * deltaK;
ky[m] = i * deltaK;
ky[m+N/2] = (double)(i - N / 2) * deltaK;
}
m += 1;
}
clock_t endTime11 = clock();

clock_t startTime12 = clock();
gaussian(r, X, Y, N);	// Generate a Gaussian Distribution for r
clock_t endTime12 = clock();

for (int i = 0; i < N * N; i++){
u[i] = 0.0;
}

double totalTime11 = (double)(endTime11 - startTime11) / CLOCKS_PER_SEC;
double totalTime12 = (double)(endTime12 - startTime12) / CLOCKS_PER_SEC;

printf("Phase 1 ended\n");
printf("Time spent on allocating memory: %lf sec\n", totalTime11);
printf("Time spent on generating function: %lf sec\n\n", totalTime12);
//////////////////////////////////////////////////////////////////////////

printf("Phase 2: Evaluation\n");
printf("Copying data from the host to the device...\n");
clock_t startTime21 = clock();

cudaMemcpy(kx_d, kx, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(ky_d, ky, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(r_d, r, sizeof(double) * N * N, cudaMemcpyHostToDevice);

cufftHandle plan;
cufftPlan2d(&plan, N, N, CUFFT_Z2Z);

// Compute the execution configuration
dim3 dimBlock(TPBx, TPBy);
dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);

// Handle N not multiple of TPBx or TPBy
if(N % TPBx != 0){
dimGrid.x += 1;
}
if(N % TPBy != 0){
dimGrid.y += 1;
}

clock_t endTime21 = clock();
printf("Start to solve the Poisson equation...\n");
clock_t startTime22 = clock();

double scale = 1.0 / (EPSILON * N * N);
real2complex<<<dimGrid, dimBlock>>>(r_complex_d, r_d, N);
cufftExecZ2Z(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
complex2real_scaled<<<dimGrid, dimBlock>>>(r_d, r_complex_d, scale, N);
cudaMemcpy(rfft, r_d, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
// solve_poisson<<<dimGrid, dimBlock>>>(r_complex_d, kx_d, ky_d, N);
cufftExecZ2Z(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);
complex2real_scaled<<<dimGrid, dimBlock>>>(r_d, r_complex_d, scale, N);

clock_t endTime22 = clock();
clock_t startTime23 = clock();

cudaMemcpy(u, r_d, sizeof(double) * N * N, cudaMemcpyDeviceToHost);

clock_t endTime23 = clock();
printf("Phase 2 ended\n");

double totalTime21 = (double)(endTime22 - startTime22) / CLOCKS_PER_SEC;
double totalTime22 = (double)(endTime21 + endTime23 - startTime21 - endTime23) / CLOCKS_PER_SEC;

printf("Time spent on calculation: %lf sec\n", totalTime21);
printf("Data spent on data transfer: %lf sec\n\n", totalTime22);

printf("Phase 3: Data Exportation\n");
printf("Exporting data...\n");

clock_t startTime31 = clock();
exportData(rFile, X, Y, r, N);
exportData(uFile, X, Y, u, N);
exportData(rfftFile, kx, ky, rfft, N);
clock_t endTime31 = clock();
printf("Finish!\n");
printf("Phase 3 ended\n");

double totalTime31 = (double)(endTime31 - startTime31) / CLOCKS_PER_SEC;
printf("Time spent on exporting files: %lf sec\n", totalTime31);

// Destroy plan and clean up memory on device
free(kx);
free(ky);
free(X);
free(Y);
free(r);
free(rfft);
free(u);
cufftDestroy(plan);
cudaFree(r_d);
cudaFree(r_complex_d);
cudaFree(kx_d);
cudaFree(ky_d);

return 0;

}

__global__ void real2complex(cufftDoubleComplex *c, double *a, int n){
/* compute idx and idy, the location of the element in the original NxN array */
int idxX = blockIdx.x * blockDim.x + threadIdx.x;
int idxY = blockIdx.y * blockDim.y + threadIdx.y;

if(idxX < n && idxY < n){
int idx = idxX + idxY * n;
c[idx].x = a[idx];
c[idx].y = 0.0;
}

}

__global__ void complex2real_scaled(double *a, cufftDoubleComplex *c, double scale, int n){
/* Compute index X and index Y, the location of the element in the original NxN array */
int idxX = blockIdx.x * blockDim.x + threadIdx.x;
int idxY = blockIdx.y * blockDim.y + threadIdx.y;

if(idxX < n && idxY < n){
int idx = idxX + idxY * n;
a[idx] = scale * c[idx].x;
}

}

__global__ void solve_poisson(cufftDoubleComplex *c, double *kx, double *ky, int n){
/* compute idxX and idxY, the location of the element in the original NxN array */
int idxX = blockIdx.x * blockDim.x + threadIdx.x;
int idxY = blockIdx.y * blockDim.y + threadIdx.y;

if (idxX < n && idxY < n){
int idx = idxX + idxY * n;
double scale = -(kx[idxX] * kx[idxX] + ky[idxY] * ky[idxY]);

if(idxX == 0 && idxY == 0){
scale = 1.0;
}

scale = 1.0 / scale;
c[idx].x *= scale;
c[idx].y *= scale;
}

}

void exportData(const char *file, const double *X, const double *Y, const double *Z, const int n){

FILE *dataFile = fopen(file, "w");

if(dataFile != NULL){
for(int j = 0; j < n ; j++){
for(int i = 0; i < n; i++){
fprintf(dataFile, "%lf\t%lf\t%lf\n", X[i], Y[j], Z[i+j*n]);
}
}
printf("All data have been stored in \"%s\".\n", file);
fclose(dataFile);
}else{
}

}

void gaussian(double *bin, const double *X, const double *Y, const int n){

int sNum;		// Number of signal
int dim = 2;
const double PI = 4 * atan(1);
double x, y;

printf("Number of signal: ");
scanf("%d", &sNum);

double *sPos = (double *)malloc(sizeof(double) * dim * sNum);	// Position of signal
double *scale = (double *)malloc(sizeof(double) * sNum);		// Normalization factor
double *var = (double *)malloc(sizeof(double) * sNum);			// Variances
for(int s = 0; s < sNum; s++){
printf("Position of signal %d(e.g. 1.2 -3): ", s+1);
scanf("%lf %lf", &sPos[0+s*dim], &sPos[1+s*dim]);
printf("Value of variance %d: ", s+1);
scanf("%lf", &var[s]);
}
for(int s = 0; s < sNum; s++){
printf("Position %d = (%lf, %lf); Variance %d = %lf\n", s+1, sPos[0+s*dim],
sPos[1+s*dim], s+1, var[s]);
}

// Generate required function
printf("Generating density distribution...\n");
for(int s = 0; s < sNum; s++){
scale[s] = 1.0 / sqrt(2 * PI * var[s]);
}
for(int j = 0; j < n; j++){
for(int i = 0; i < n; i++){
bin[i+j*n] = 0;
for(int s = 0; s < sNum; s++){
x = X[i] - sPos[0+s*dim];
y = Y[j] - sPos[1+s*dim];
bin[i+j*n] += scale[s] * exp(-(x * x + y * y)/(2 * var[s]));
}
}
}
// Fix boundary
for(int i = 0; i < n; i++){
bin[i+(n-1)*n] = bin[i];
bin[(n-1)+i*n] = bin[i*n];
}

// Clean up
free(sPos);
free(scale);
free(var);

}
``````

Since the data set are really numerous, I list out some of the abnormal data only. The original function generated by the program is a 2-D Guassian distribution located at the origin where sigma = 1. These three columns are representing x, y and z value respectively. Thus, the z-values are expected to be smaller than 1. As you can see, the data around the maxima are crazily large.

``````-3.457031	0.000000	114451247.457633
-3.398438	0.000000	139909076.347347
-3.339844	0.000000	170443410.849554
-3.281250	0.000000	206930025.781379
-3.222656	0.000000	250366244.751041
-3.164062	0.000000	301881853.033815
-3.105469	0.000000	362749821.268040
-3.046875	0.000000	434396557.928537
-2.988281	0.000000	518411362.454462
-2.929688	0.000000	616554705.230938
-2.871094	0.000000	730764916.972234
-2.812500	0.000000	863162830.303096
-2.753906	0.000000	1016053882.581461
-2.695312	0.000000	1191927163.527911
-2.636719	0.000000	1393450876.424615
-2.578125	0.000000	1623463679.964595
-2.519531	0.000000	1884961391.671168
-2.460938	0.000000	2181078565.416275
-2.402344	0.000000	2515064506.924731
-2.343750	0.000000	2890253363.842965
-2.285156	0.000000	3310028022.033909
-2.226562	0.000000	3777777657.638298
-2.167969	0.000000	4296848934.746574
-2.109375	0.000000	4870491000.006508
-2.050781	0.000000	5501794605.938879
-1.992188	0.000000	6193625890.925258
-1.933594	0.000000	6948555551.515974
-1.875000	0.000000	7768784356.631359
-1.816406	0.000000	8656066167.217058
-1.757812	0.000000	9611629831.985989
-1.699219	0.000000	10636101522.432798
-1.640625	0.000000	11729429240.339199
-1.582031	0.000000	12890811370.370258
-1.523438	0.000000	14118631251.140060
-1.464844	0.000000	15410399792.862606
-1.406250	0.000000	16762708171.823931
-1.347656	0.000000	18171192576.051701
-1.289062	0.000000	19630512858.900551
-1.230469	0.000000	21134346775.842808
-1.171875	0.000000	22675401234.675877
-1.113281	0.000000	24245441683.027573
-1.054688	0.000000	25835340394.234440
-0.996094	0.000000	27435144000.546642
-0.937500	0.000000	29034160170.631523
-0.878906	0.000000	30621062848.057468
-0.820312	0.000000	32184014972.187828
-0.761719	0.000000	33710807107.449856
-0.703125	0.000000	35189009926.930473
-0.644531	0.000000	36606138047.723518
-0.585938	0.000000	37949822314.215302
-0.527344	0.000000	39207987286.500740
-0.468750	0.000000	40369030427.856606
-0.410156	0.000000	41421999309.099648
-0.351562	0.000000	42356763067.583939
-0.292969	0.000000	43164174380.394531
-0.234375	0.000000	43836218337.480751
-0.175781	0.000000	44366144830.009468
-0.117188	0.000000	44748581397.475403
-0.058594	0.000000	44979623895.938049
0.000000	0.000000	45056902847.745636
0.058594	0.000000	44979623895.938049
0.117188	0.000000	44748581397.475403
0.175781	0.000000	44366144830.009468
0.234375	0.000000	43836218337.480751
0.292969	0.000000	43164174380.394531
0.351562	0.000000	42356763067.583939
0.410156	0.000000	41421999309.099648
0.468750	0.000000	40369030427.856606
0.527344	0.000000	39207987286.500740
0.585938	0.000000	37949822314.215302
0.644531	0.000000	36606138047.723518
0.703125	0.000000	35189009926.930473
0.761719	0.000000	33710807107.449856
0.820312	0.000000	32184014972.187828
0.878906	0.000000	30621062848.057468
0.937500	0.000000	29034160170.631523
0.996094	0.000000	27435144000.546638
1.054688	0.000000	25835340394.234436
1.113281	0.000000	24245441683.027573
1.171875	0.000000	22675401234.675877
1.230469	0.000000	21134346775.842808
1.289062	0.000000	19630512858.900547
1.347656	0.000000	18171192576.051704
1.406250	0.000000	16762708171.823931
1.464844	0.000000	15410399792.862606
1.523438	0.000000	14118631251.140060
1.582031	0.000000	12890811370.370256
1.640625	0.000000	11729429240.339201
1.699219	0.000000	10636101522.432795
1.757812	0.000000	9611629831.985991
1.816406	0.000000	8656066167.217060
1.875000	0.000000	7768784356.631359
1.933594	0.000000	6948555551.515972
1.992188	0.000000	6193625890.925258
2.050781	0.000000	5501794605.938882
2.109375	0.000000	4870491000.006509
2.167969	0.000000	4296848934.746575
2.226562	0.000000	3777777657.638298
2.285156	0.000000	3310028022.033912
2.343750	0.000000	2890253363.842964
2.402344	0.000000	2515064506.924731
2.460938	0.000000	2181078565.416274
2.519531	0.000000	1884961391.671168
2.578125	0.000000	1623463679.964595
2.636719	0.000000	1393450876.424614
2.695312	0.000000	1191927163.527912
2.753906	0.000000	1016053882.581464
2.812500	0.000000	863162830.303096
2.871094	0.000000	730764916.972234
2.929688	0.000000	616554705.230938
2.988281	0.000000	518411362.454461
3.046875	0.000000	434396557.928536
3.105469	0.000000	362749821.268038
3.164062	0.000000	301881853.033814
3.222656	0.000000	250366244.751041
3.281250	0.000000	206930025.781378
3.339844	0.000000	170443410.849553
3.398438	0.000000	139909076.347345
3.457031	0.000000	114451247.457632
3.515625	0.000000	93304833.018316
3.574219	0.000000	75804801.975180
3.632812	0.000000	61375955.092698
3.691406	0.000000	49523208.901802
3.750000	0.000000	39822476.045855
3.808594	0.000000	31912197.433483
3.867188	0.000000	25485556.950012
3.925781	0.000000	20283388.817909
3.984375	0.000000	16087770.826154
4.042969	0.000000	12716283.288637
``````

I would be very appreciate for your help as my code are quite long and complex.

I haven’t studied your code, however, when doing a forward an inverse transform in CUFFT, to preserve the original data scale, it is necessary to divide the results by the size of the transform.

https://docs.nvidia.com/cuda/cufft/index.html#cufft-transform-directions

I tried to do so but I am not sure if I did it correctly. Can you check it for me?

I wouldn’t be able to go through 300 lines of code.

You said this:

“simply did a forward and inverse transformation. However, the results were still extremely large.”

It should be possible demonstrate a forward transform, followed by inverse transform, with proper scaling to restore the input, in 30 lines of code or less. If you want to provide a concise example like that, I will check it for you as time permits.