I am recently using cuFFT for development on a GPU accelerated Poisson equation solver. Unfortunately, the results returning from the algorithm were extremely large. I thought there might be something went wrong in the function for solving Poisson equation so I disable it and simply did a forward and inverse transformation. However, the results were still extremely large. Can anybody tell me what mistake was I make? The GPU I am using is Tesla K80, double precision and compute 3.5, and the system is CentOS, version 2.10 and 64 bit.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define TPBx 32 // TPBx * TPBy = number of threads per block
#define TPBy 32
__global__ void real2complex(cufftDoubleComplex *c, double *a, int n);
__global__ void complex2real_scaled(double *a, cufftDoubleComplex *c, double scale, int n);
__global__ void solve_poisson(cufftDoubleComplex *c, double *kx, double *ky, int n);
void exportData(const char *file, const double *X, const double *Y, const double *Z, const int n);
void gaussian(double *bin, const double *X, const double *Y, const int n);
int main(){
///////////////////////////// INITIZALIZATION ////////////////////////////
int N, R;
printf("Phase 1: Set Up The Environment for Testing\n");
printf("Input the range of x and y: "); // the range of x and y will be from -R to R
scanf("%d", &R);
printf("Input the number of samples: "); // the number of samples will be N * N
scanf("%d", &N);
printf("Allocating memory...\n");
clock_t startTime11 = clock();
char *uFile = (char *)"u_data.dat";
char *rFile = (char *)"r_data.dat";
char *rfftFile = (char *)"rfft_data.dat";
double *X = (double *)malloc(sizeof(double) * N);
double *Y = (double *)malloc(sizeof(double) * N);
double *kx = (double *)malloc(sizeof(double) * N);
double *ky = (double *)malloc(sizeof(double) * N);
double *r = (double *)malloc(sizeof(double) * N * N);
double *rfft = (double *)malloc(sizeof(double) * N * N);
double *u = (double *)malloc(sizeof(double) * N * N);
const double EPSILON = 8.85418782 * pow(10, -12); // Permitivity of free space
const double PI = 4 * atan(1);
double *kx_d, *ky_d, *r_d;
cufftDoubleComplex *r_complex_d;
cudaMalloc((void **)&kx_d, sizeof(double) * N);
cudaMalloc((void **)&ky_d, sizeof(double) * N);
cudaMalloc((void **)&r_d, sizeof(double) * N * N);
cudaMalloc((void **)&r_complex_d, sizeof(cufftDoubleComplex) * N * N);
int m = 0;
double deltaX = (double)R / (N / 2);
double deltaK = 1.0 / 60;
for(int i = N/-2; i < N/2; i++){
if(m < N){
X[m] = i * deltaX;
Y[m] = i * deltaX;
}
m += 1;
}
m = 0;
for(int i = 0; i < N/2; i++){
if(m < N/2){
kx[m] = i * deltaK;
kx[m+N/2] = (double)(i - N / 2) * deltaK;
ky[m] = i * deltaK;
ky[m+N/2] = (double)(i - N / 2) * deltaK;
}
m += 1;
}
clock_t endTime11 = clock();
clock_t startTime12 = clock();
gaussian(r, X, Y, N); // Generate a Gaussian Distribution for r
clock_t endTime12 = clock();
for (int i = 0; i < N * N; i++){
u[i] = 0.0;
}
double totalTime11 = (double)(endTime11 - startTime11) / CLOCKS_PER_SEC;
double totalTime12 = (double)(endTime12 - startTime12) / CLOCKS_PER_SEC;
printf("Phase 1 ended\n");
printf("Time spent on allocating memory: %lf sec\n", totalTime11);
printf("Time spent on generating function: %lf sec\n\n", totalTime12);
//////////////////////////////////////////////////////////////////////////
printf("Phase 2: Evaluation\n");
printf("Copying data from the host to the device...\n");
clock_t startTime21 = clock();
cudaMemcpy(kx_d, kx, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(ky_d, ky, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(r_d, r, sizeof(double) * N * N, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan2d(&plan, N, N, CUFFT_Z2Z);
// Compute the execution configuration
dim3 dimBlock(TPBx, TPBy);
dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
// Handle N not multiple of TPBx or TPBy
if(N % TPBx != 0){
dimGrid.x += 1;
}
if(N % TPBy != 0){
dimGrid.y += 1;
}
clock_t endTime21 = clock();
printf("Start to solve the Poisson equation...\n");
clock_t startTime22 = clock();
double scale = 1.0 / (EPSILON * N * N);
real2complex<<<dimGrid, dimBlock>>>(r_complex_d, r_d, N);
cufftExecZ2Z(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
complex2real_scaled<<<dimGrid, dimBlock>>>(r_d, r_complex_d, scale, N);
cudaMemcpy(rfft, r_d, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
// solve_poisson<<<dimGrid, dimBlock>>>(r_complex_d, kx_d, ky_d, N);
cufftExecZ2Z(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);
complex2real_scaled<<<dimGrid, dimBlock>>>(r_d, r_complex_d, scale, N);
clock_t endTime22 = clock();
clock_t startTime23 = clock();
cudaMemcpy(u, r_d, sizeof(double) * N * N, cudaMemcpyDeviceToHost);
clock_t endTime23 = clock();
printf("Phase 2 ended\n");
double totalTime21 = (double)(endTime22 - startTime22) / CLOCKS_PER_SEC;
double totalTime22 = (double)(endTime21 + endTime23 - startTime21 - endTime23) / CLOCKS_PER_SEC;
printf("Time spent on calculation: %lf sec\n", totalTime21);
printf("Data spent on data transfer: %lf sec\n\n", totalTime22);
printf("Phase 3: Data Exportation\n");
printf("Exporting data...\n");
clock_t startTime31 = clock();
exportData(rFile, X, Y, r, N);
exportData(uFile, X, Y, u, N);
exportData(rfftFile, kx, ky, rfft, N);
clock_t endTime31 = clock();
printf("Finish!\n");
printf("Phase 3 ended\n");
double totalTime31 = (double)(endTime31 - startTime31) / CLOCKS_PER_SEC;
printf("Time spent on exporting files: %lf sec\n", totalTime31);
// Destroy plan and clean up memory on device
free(kx);
free(ky);
free(X);
free(Y);
free(r);
free(rfft);
free(u);
cufftDestroy(plan);
cudaFree(r_d);
cudaFree(r_complex_d);
cudaFree(kx_d);
cudaFree(ky_d);
return 0;
}
__global__ void real2complex(cufftDoubleComplex *c, double *a, int n){
/* compute idx and idy, the location of the element in the original NxN array */
int idxX = blockIdx.x * blockDim.x + threadIdx.x;
int idxY = blockIdx.y * blockDim.y + threadIdx.y;
if(idxX < n && idxY < n){
int idx = idxX + idxY * n;
c[idx].x = a[idx];
c[idx].y = 0.0;
}
}
__global__ void complex2real_scaled(double *a, cufftDoubleComplex *c, double scale, int n){
/* Compute index X and index Y, the location of the element in the original NxN array */
int idxX = blockIdx.x * blockDim.x + threadIdx.x;
int idxY = blockIdx.y * blockDim.y + threadIdx.y;
if(idxX < n && idxY < n){
int idx = idxX + idxY * n;
a[idx] = scale * c[idx].x;
}
}
__global__ void solve_poisson(cufftDoubleComplex *c, double *kx, double *ky, int n){
/* compute idxX and idxY, the location of the element in the original NxN array */
int idxX = blockIdx.x * blockDim.x + threadIdx.x;
int idxY = blockIdx.y * blockDim.y + threadIdx.y;
if (idxX < n && idxY < n){
int idx = idxX + idxY * n;
double scale = -(kx[idxX] * kx[idxX] + ky[idxY] * ky[idxY]);
if(idxX == 0 && idxY == 0){
scale = 1.0;
}
scale = 1.0 / scale;
c[idx].x *= scale;
c[idx].y *= scale;
}
}
void exportData(const char *file, const double *X, const double *Y, const double *Z, const int n){
FILE *dataFile = fopen(file, "w");
if(dataFile != NULL){
for(int j = 0; j < n ; j++){
for(int i = 0; i < n; i++){
fprintf(dataFile, "%lf\t%lf\t%lf\n", X[i], Y[j], Z[i+j*n]);
}
}
printf("All data have been stored in \"%s\".\n", file);
fclose(dataFile);
}else{
printf("File not found!");
}
}
void gaussian(double *bin, const double *X, const double *Y, const int n){
int sNum; // Number of signal
int dim = 2;
const double PI = 4 * atan(1);
double x, y;
// Ask for essential parameters
printf("Number of signal: ");
scanf("%d", &sNum);
double *sPos = (double *)malloc(sizeof(double) * dim * sNum); // Position of signal
double *scale = (double *)malloc(sizeof(double) * sNum); // Normalization factor
double *var = (double *)malloc(sizeof(double) * sNum); // Variances
for(int s = 0; s < sNum; s++){
printf("Position of signal %d(e.g. 1.2 -3): ", s+1);
scanf("%lf %lf", &sPos[0+s*dim], &sPos[1+s*dim]);
printf("Value of variance %d: ", s+1);
scanf("%lf", &var[s]);
}
for(int s = 0; s < sNum; s++){
printf("Position %d = (%lf, %lf); Variance %d = %lf\n", s+1, sPos[0+s*dim],
sPos[1+s*dim], s+1, var[s]);
}
// Generate required function
printf("Generating density distribution...\n");
for(int s = 0; s < sNum; s++){
scale[s] = 1.0 / sqrt(2 * PI * var[s]);
}
for(int j = 0; j < n; j++){
for(int i = 0; i < n; i++){
bin[i+j*n] = 0;
for(int s = 0; s < sNum; s++){
x = X[i] - sPos[0+s*dim];
y = Y[j] - sPos[1+s*dim];
bin[i+j*n] += scale[s] * exp(-(x * x + y * y)/(2 * var[s]));
}
}
}
// Fix boundary
for(int i = 0; i < n; i++){
bin[i+(n-1)*n] = bin[i];
bin[(n-1)+i*n] = bin[i*n];
}
// Clean up
free(sPos);
free(scale);
free(var);
}
Since the data set are really numerous, I list out some of the abnormal data only. The original function generated by the program is a 2-D Guassian distribution located at the origin where sigma = 1. These three columns are representing x, y and z value respectively. Thus, the z-values are expected to be smaller than 1. As you can see, the data around the maxima are crazily large.
-3.457031 0.000000 114451247.457633
-3.398438 0.000000 139909076.347347
-3.339844 0.000000 170443410.849554
-3.281250 0.000000 206930025.781379
-3.222656 0.000000 250366244.751041
-3.164062 0.000000 301881853.033815
-3.105469 0.000000 362749821.268040
-3.046875 0.000000 434396557.928537
-2.988281 0.000000 518411362.454462
-2.929688 0.000000 616554705.230938
-2.871094 0.000000 730764916.972234
-2.812500 0.000000 863162830.303096
-2.753906 0.000000 1016053882.581461
-2.695312 0.000000 1191927163.527911
-2.636719 0.000000 1393450876.424615
-2.578125 0.000000 1623463679.964595
-2.519531 0.000000 1884961391.671168
-2.460938 0.000000 2181078565.416275
-2.402344 0.000000 2515064506.924731
-2.343750 0.000000 2890253363.842965
-2.285156 0.000000 3310028022.033909
-2.226562 0.000000 3777777657.638298
-2.167969 0.000000 4296848934.746574
-2.109375 0.000000 4870491000.006508
-2.050781 0.000000 5501794605.938879
-1.992188 0.000000 6193625890.925258
-1.933594 0.000000 6948555551.515974
-1.875000 0.000000 7768784356.631359
-1.816406 0.000000 8656066167.217058
-1.757812 0.000000 9611629831.985989
-1.699219 0.000000 10636101522.432798
-1.640625 0.000000 11729429240.339199
-1.582031 0.000000 12890811370.370258
-1.523438 0.000000 14118631251.140060
-1.464844 0.000000 15410399792.862606
-1.406250 0.000000 16762708171.823931
-1.347656 0.000000 18171192576.051701
-1.289062 0.000000 19630512858.900551
-1.230469 0.000000 21134346775.842808
-1.171875 0.000000 22675401234.675877
-1.113281 0.000000 24245441683.027573
-1.054688 0.000000 25835340394.234440
-0.996094 0.000000 27435144000.546642
-0.937500 0.000000 29034160170.631523
-0.878906 0.000000 30621062848.057468
-0.820312 0.000000 32184014972.187828
-0.761719 0.000000 33710807107.449856
-0.703125 0.000000 35189009926.930473
-0.644531 0.000000 36606138047.723518
-0.585938 0.000000 37949822314.215302
-0.527344 0.000000 39207987286.500740
-0.468750 0.000000 40369030427.856606
-0.410156 0.000000 41421999309.099648
-0.351562 0.000000 42356763067.583939
-0.292969 0.000000 43164174380.394531
-0.234375 0.000000 43836218337.480751
-0.175781 0.000000 44366144830.009468
-0.117188 0.000000 44748581397.475403
-0.058594 0.000000 44979623895.938049
0.000000 0.000000 45056902847.745636
0.058594 0.000000 44979623895.938049
0.117188 0.000000 44748581397.475403
0.175781 0.000000 44366144830.009468
0.234375 0.000000 43836218337.480751
0.292969 0.000000 43164174380.394531
0.351562 0.000000 42356763067.583939
0.410156 0.000000 41421999309.099648
0.468750 0.000000 40369030427.856606
0.527344 0.000000 39207987286.500740
0.585938 0.000000 37949822314.215302
0.644531 0.000000 36606138047.723518
0.703125 0.000000 35189009926.930473
0.761719 0.000000 33710807107.449856
0.820312 0.000000 32184014972.187828
0.878906 0.000000 30621062848.057468
0.937500 0.000000 29034160170.631523
0.996094 0.000000 27435144000.546638
1.054688 0.000000 25835340394.234436
1.113281 0.000000 24245441683.027573
1.171875 0.000000 22675401234.675877
1.230469 0.000000 21134346775.842808
1.289062 0.000000 19630512858.900547
1.347656 0.000000 18171192576.051704
1.406250 0.000000 16762708171.823931
1.464844 0.000000 15410399792.862606
1.523438 0.000000 14118631251.140060
1.582031 0.000000 12890811370.370256
1.640625 0.000000 11729429240.339201
1.699219 0.000000 10636101522.432795
1.757812 0.000000 9611629831.985991
1.816406 0.000000 8656066167.217060
1.875000 0.000000 7768784356.631359
1.933594 0.000000 6948555551.515972
1.992188 0.000000 6193625890.925258
2.050781 0.000000 5501794605.938882
2.109375 0.000000 4870491000.006509
2.167969 0.000000 4296848934.746575
2.226562 0.000000 3777777657.638298
2.285156 0.000000 3310028022.033912
2.343750 0.000000 2890253363.842964
2.402344 0.000000 2515064506.924731
2.460938 0.000000 2181078565.416274
2.519531 0.000000 1884961391.671168
2.578125 0.000000 1623463679.964595
2.636719 0.000000 1393450876.424614
2.695312 0.000000 1191927163.527912
2.753906 0.000000 1016053882.581464
2.812500 0.000000 863162830.303096
2.871094 0.000000 730764916.972234
2.929688 0.000000 616554705.230938
2.988281 0.000000 518411362.454461
3.046875 0.000000 434396557.928536
3.105469 0.000000 362749821.268038
3.164062 0.000000 301881853.033814
3.222656 0.000000 250366244.751041
3.281250 0.000000 206930025.781378
3.339844 0.000000 170443410.849553
3.398438 0.000000 139909076.347345
3.457031 0.000000 114451247.457632
3.515625 0.000000 93304833.018316
3.574219 0.000000 75804801.975180
3.632812 0.000000 61375955.092698
3.691406 0.000000 49523208.901802
3.750000 0.000000 39822476.045855
3.808594 0.000000 31912197.433483
3.867188 0.000000 25485556.950012
3.925781 0.000000 20283388.817909
3.984375 0.000000 16087770.826154
4.042969 0.000000 12716283.288637
I would be very appreciate for your help as my code are quite long and complex.