Using cufft to solve Poisson equation (C++)

Yen-Chi · December 29, 2015, 6:05am

Hi all,

I’m using the cuFFTt to solve the Poisson equation.
Here ,I have done the 2D discrete sine transform by cuFFTT and slove the Poisson equation.
But it’s not powerful enough.
When the matrix dimension comes to 2^12 x 2^12, it’s only fifth times faster than cpu.
But the cuFFT is 125 times faster than cpu when the vector length is 2^24.
I think the data communication have spent so many times in gpu computing.
(my enviroment : I7-4770, GTX-960 2G, ram 12G)
compile command: pgc++ -acc -ta=tesla -Minfo -Mcudalib=cufft

Question:
1.How can I improve my code?
2.Does there have discrete sine transform in CUDA library?
3.Can I use double precision in cuFFT?

Here’s my code:

/*
Using cufft to do the discrete sine transform and solve the Poisson equation.
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cufft.h>
#include "openacc.h"

void fast_poisson_solver_gpu(float *b, float *x, float *data2, float *data3, int Nx, int Ny, int Lx);
void Exact_Solution(float *u, int Nx);
void Exact_Source(float *b, int Nx);
float Error(float *x, float *u, int Nx);
void print_matrix(float *x, int N);

int main()
{
	int p, Nx, Ny, Lx; 
	float *x, *u, *b, *data2, *data3;
	clock_t t1, t2;
	
	// Initialize the numbers of discrete points.
	// Here we consider Nx = Ny. 
	printf(" Please input (Nx = 2^p - 1) p = ");
	scanf("%d",&p);
	Nx = pow(2, p) - 1;
	Ny = Nx;
	printf(" Nx = %d \n", Nx);
	// Prepare the expanded length for discrete sine transform.
	Lx = 2*Nx + 2 ;
	// Create memory for solving Ax = b, where r = b-Ax is the residue.
	// M is the total number of unknowns.
	// Prepare for two dimensional unknown F
	// where b is the one dimensional vector and 
	// F[i][j] = F[j+i*(N-1)];
	b = (float *) malloc(Nx*Ny*sizeof(float));
	x = (float *) malloc(Nx*Ny*sizeof(float));
	
	// data2 : Prepare for dst.
	// data3 : Prepare for complex value to data2 and do the cufft. 
	data2 = (float *) malloc(Lx*Ny*sizeof(float));
	data3 = (float *) malloc(2*Lx*Ny*sizeof(float));

	// Prepare for two dimensional unknowns U
	// where u is the one dimensional vector and
	// U[i][j] = u[j+i*(N-1)] 
	u = (float *) malloc(Nx*Ny*sizeof(float));
		
	Exact_Solution(u, Nx);
	Exact_Source(b, Nx);
	
	t1 = clock();
	fast_poisson_solver_gpu(b, x, data2, data3, Nx, Ny, Lx);
	t2 = clock();
	
	printf(" Fast Poisson Solver: %f secs\n", 1.0*(t2-t1)/CLOCKS_PER_SEC);
	printf(" For N = %d error = %e \n", Nx, Error(x, u, Nx));
	
/*	printf(" \n \n");
	printf(" u matrix \n");
	print_matrix(u, Nx);
	printf(" \n \n");
	printf(" x matrix \n");
	print_matrix(x, Nx);
*/	return 0;
}

void print_matrix(float *x, int N)
{
	int i, j;
	for(i=0;i<N;i++)
	{
		for (j=0;j<N;j++) printf(" %f ", x[N*i+j]);
		printf("\n");
	}
}

void Exact_Solution(float *u, int Nx)
{
	// put the exact solution 
	int i, j;
	float x, y, h;
	h = 1.0/(Nx+1);
	for(i=0;i<Nx;++i)
	{
		x = (i + 1)*h;
		for(j=0;j<Nx;++j)
		{
			//k = j + i*(N-1);
			y = (j + 1)*h;
			u[Nx*i+j] = sin(M_PI*x)*sin(2*M_PI*y);
		}
	}
}

void Exact_Source(float *b, int Nx)
{
	int i,j;
	float x, y, h;
	h = 1.0/(Nx+1);
	for(i=0;i<Nx;++i)
	{
		x = (i+1)*h;
		for(j=0;j<Nx;++j)
		{
			//k = j + i*(N-1);
			y = (j+1)*h;
			b[Nx*i+j] = -(1.0+4.0)*h*h*M_PI*M_PI*sin(M_PI*x)*sin(2*M_PI*y);
		}
	}	
}

float Error(float *x, float *u, int Nx)
{
	// return max_i |x[i] - u[i]|
	int i, j;
	float v, e;
	v = 0.0;
	
	for(i=0;i<Nx;++i)
	{
		for(j=0;j<Nx;j++)
		{
			e = fabs(x[Nx*i+j] - u[Nx*i+j]);
			if(e > v) v = e;		
		}
	}
	return v;
}

void expand_data(float *data, float *data2, int Nx, int Ny, int Lx)
{
	// expand data to 2N + 2 length
	#pragma acc data copyin(data[0:Nx*Ny]), copyout(data2[0:Lx*Ny]) 
	#pragma acc parallel loop independent
	for(int i=0;i<Ny;i++)
	{
		data2[Lx*i] = data2[Lx*i+Nx+1] = 0.0;
		#pragma acc loop independent
		for(int j=0;j<Nx;j++)
		{
			data2[Lx*i+j+1] = data[Nx*i+j];
			data2[Lx*i+Nx+j+2] = -1.0*data[Nx*i+Nx-1-j];
		}
	}
}

void expand_idata(float *data2, float *data3, int Nx, int Ny, int Lx)
{
	#pragma acc data copyin(data2[0:Lx*Ny]), copyout(data3[0:2*Lx*Ny])
	#pragma acc parallel loop independent
	for (int i=0;i<Ny;i++)
	{
		#pragma acc loop independent
		for (int j=0;j<Lx;j++)
		{
			data3[2*Lx*i+2*j] = data2[Lx*i+j];
			data3[2*Lx*i+2*j+1] = 0.0;
		}
	}
}

extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
	cufftHandle plan;
	cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny);
	cufftSetStream(plan, (cudaStream_t)stream);
	cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
	cufftDestroy(plan);
}

void fdst_gpu(float *data, float *data2, float *data3, int Nx, int Ny, int Lx)
{
	float s;
	s = sqrt(2.0/(Nx+1));

	expand_data(data, data2, Nx, Ny, Lx);
	expand_idata(data2, data3, Nx, Ny, Lx);

	#pragma acc data copy(data3[0:2*Lx*Ny])
	// Copy data to device at start of region and back to host and end of region
	// Inside this region the device data pointer will be used
	#pragma acc host_data use_device(data3)
	{
		void *stream = acc_get_cuda_stream(acc_async_sync);
		cuda_fft(data3, Lx, Ny, stream);
	}
	
	#pragma acc data copyin(data3[0:2*Lx*Ny]), copyout(data[0:Nx*Ny])
	#pragma acc parallel loop independent
	for (int i=0;i<Ny;i++)
	{
		#pragma acc loop independent
		for (int j=0;j<Nx;j++)	data[Nx*i+j] = -1.0*s*data3[2*Lx*i+2*j+3]/2;
	}

}

void transpose(float *data_in, float *data_out, int Nx, int Ny)
{
	int i, j;
	#pragma acc data copyin(data_in[0:Nx*Ny]), copyout(data_out[0:Ny*Nx])
	#pragma acc parallel loop independent
	for(i=0;i<Ny;i++)
	{
		#pragma acc loop independent
		for(j=0;j<Nx;j++)
		{
			data_out[i+j*Ny] = data_in[i*Nx+j];
		}
	}
}

void fast_poisson_solver_gpu(float *b, float *x, float *data2, float *data3, int Nx, int Ny, int Lx)
{
	int i, j;
	float h, *lamda, *temp;
	
	temp = (float *) malloc(Nx*Ny*sizeof(float));
	lamda = (float *) malloc(Nx*sizeof(float));
	h = 1.0/(Nx+1);
	
	#pragma acc data copyout(lamda[0:Nx])
	#pragma acc parallel loop independent
	for(i=0;i<Nx;i++)
	{
		lamda[i] = 2 - 2*cos((i+1)*M_PI*h);
	}
	
	fdst_gpu(b, data2, data3, Nx, Ny, Lx);
	transpose(b, temp, Nx, Ny);
	fdst_gpu(temp, data2, data3, Nx, Ny, Lx);
	transpose(temp, b, Ny, Nx);
	
	#pragma acc data copyin(b[0:Nx*Ny]), copyout(x[0:Nx*Ny])
	#pragma acc parallel loop independent
	for(i=0;i<Ny;i++)
	{
		#pragma acc loop independent
		for(j=0;j<Nx;j++) 
		{
			x[Nx*i+j] = -b[Nx*i+j]/(lamda[i] + lamda[j]);
		}
	}

	fdst_gpu(x, data2, data3, Nx, Ny, Lx);
	transpose(x, temp, Nx, Ny);
	fdst_gpu(temp, data2, data3, Nx, Ny, Lx);
	transpose(temp, x, Ny, Nx);
}

MatColgrove · December 29, 2015, 6:54pm

Hi Yan-Chi,

You have a lot of extra data movement since you’re going in and out of data regions. Your next strategy is to expand the data regions. In your case since very little computation is done on the host, you can have a data region that spans almost the entire program.

Below is a suggested update to your code which nearly eliminates all data movement. Please double check me for any errors.

Hope this helps,
Mat

% cat test.cpp
/*
Using cufft to do the discrete sine transform and solve the Poisson equation.
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cufft.h>
#include "openacc.h"

void fast_poisson_solver_gpu(float *b, float *x, float *data2, float *data3, int Nx, int Ny, int Lx);
void Exact_Solution(float *u, int Nx);
void Exact_Source(float *b, int Nx);
float Error(float *x, float *u, int Nx);
void print_matrix(float *x, int N);

int main()
{
   int p, Nx, Ny, Lx;
   float *x, *u, *b, *data2, *data3;
   clock_t t1, t2;

   // Initialize the numbers of discrete points.
   // Here we consider Nx = Ny.
   printf(" Please input (Nx = 2^p - 1) p = ");
   scanf("%d",&p);
   Nx = pow(2, p) - 1;
   Ny = Nx;
   printf(" Nx = %d \n", Nx);
   // Prepare the expanded length for discrete sine transform.
   Lx = 2*Nx + 2 ;
   // Create memory for solving Ax = b, where r = b-Ax is the residue.
   // M is the total number of unknowns.
   // Prepare for two dimensional unknown F
   // where b is the one dimensional vector and
   // F[i][j] = F[j+i*(N-1)];
   b = (float *) malloc(Nx*Ny*sizeof(float));
   x = (float *) malloc(Nx*Ny*sizeof(float));

   // data2 : Prepare for dst.
   // data3 : Prepare for complex value to data2 and do the cufft.
   data2 = (float *) malloc(Lx*Ny*sizeof(float));
   data3 = (float *) malloc(2*Lx*Ny*sizeof(float));

#pragma acc enter data create(b[0:Nx*Ny], x[0:Nx*Ny], data2[0:Lx*Ny], data3[0:2*Lx*Ny])
   // Prepare for two dimensional unknowns U
   // where u is the one dimensional vector and
   // U[i][j] = u[j+i*(N-1)]
   u = (float *) malloc(Nx*Ny*sizeof(float));

   Exact_Solution(u, Nx);
   Exact_Source(b, Nx);
   t1 = clock();
   fast_poisson_solver_gpu(b, x, data2, data3, Nx, Ny, Lx);
#pragma acc update host(x[0:Nx*Ny])
   t2 = clock();

   printf(" Fast Poisson Solver: %f secs\n", 1.0*(t2-t1)/CLOCKS_PER_SEC);
   printf(" For N = %d error = %e \n", Nx, Error(x, u, Nx));

/*   printf(" \n \n");
   printf(" u matrix \n");
   print_matrix(u, Nx);
   printf(" \n \n");
   printf(" x matrix \n");
   print_matrix(x, Nx);
#pragma acc exit data delete(b[0:Nx*Ny], x[0:Nx*Ny], data2[0:Lx*Ny], data3[0:2*Lx*Ny])
*/   return 0;
}

void print_matrix(float *x, int N)
{
   int i, j;
   for(i=0;i<N;i++)
   {
      for (j=0;j<N;j++) printf(" %f ", x[N*i+j]);
      printf("\n");
   }
}

void Exact_Solution(float *u, int Nx)
{
   // put the exact solution
   int i, j;
   float x, y, h;
   h = 1.0/(Nx+1);
   for(i=0;i<Nx;++i)
   {
      x = (i + 1)*h;
      for(j=0;j<Nx;++j)
      {
         //k = j + i*(N-1);
         y = (j + 1)*h;
         u[Nx*i+j] = sin(M_PI*x)*sin(2*M_PI*y);
      }
   }
}

void Exact_Source(float *b, int Nx)
{
   int i,j;
   float x, y, h;
   h = 1.0/(Nx+1);
#pragma acc parallel loop gang present(b)
   for(i=0;i<Nx;++i)
   {
      x = (i+1)*h;
#pragma acc loop vector
      for(j=0;j<Nx;++j)
      {
         //k = j + i*(N-1);
         y = (j+1)*h;
         b[Nx*i+j] = -(1.0+4.0)*h*h*M_PI*M_PI*sin(M_PI*x)*sin(2*M_PI*y);
      }
   }
}

float Error(float *x, float *u, int Nx)
{
   // return max_i |x[i] - u[i]|
   int i, j;
   float v, e;
   v = 0.0;

   for(i=0;i<Nx;++i)
   {
      for(j=0;j<Nx;j++)
      {
         e = fabs(x[Nx*i+j] - u[Nx*i+j]);
         if(e > v) v = e;
      }
   }
   return v;
}

void expand_data(float *data, float *data2, int Nx, int Ny, int Lx)
{
   // expand data to 2N + 2 length
   #pragma acc parallel loop independent present(data[0:Nx*Ny],data2[0:Lx*Ny])
   for(int i=0;i<Ny;i++)
   {
      data2[Lx*i] = data2[Lx*i+Nx+1] = 0.0;
      #pragma acc loop independent
      for(int j=0;j<Nx;j++)
      {
         data2[Lx*i+j+1] = data[Nx*i+j];
         data2[Lx*i+Nx+j+2] = -1.0*data[Nx*i+Nx-1-j];
      }
   }
}

void expand_idata(float *data2, float *data3, int Nx, int Ny, int Lx)
{
   #pragma acc parallel loop independent present(data2[0:Lx*Ny],data3[0:2*Lx*Ny])
   for (int i=0;i<Ny;i++)
   {
      #pragma acc loop independent
      for (int j=0;j<Lx;j++)
      {
         data3[2*Lx*i+2*j] = data2[Lx*i+j];
         data3[2*Lx*i+2*j+1] = 0.0;
      }
   }
}

extern "C" void cuda_fft(float *d_data, int Lx, int Ny, void *stream)
{
   cufftHandle plan;
   cufftPlan1d(&plan, Lx, CUFFT_C2C, Ny);
   cufftSetStream(plan, (cudaStream_t)stream);
   cufftExecC2C(plan, (cufftComplex*)d_data, (cufftComplex*)d_data,CUFFT_FORWARD);
   cufftDestroy(plan);
}

void fdst_gpu(float *data, float *data2, float *data3, int Nx, int Ny, int Lx)
{
   float s;
   s = sqrt(2.0/(Nx+1));
   #pragma acc data present(data3[0:2*Lx*Ny],data[0:Nx*Ny],data2[0:Lx*Ny])
   {
   expand_data(data, data2, Nx, Ny, Lx);
   expand_idata(data2, data3, Nx, Ny, Lx);

   // Copy data to device at start of region and back to host and end of region
   // Inside this region the device data pointer will be used
   #pragma acc host_data use_device(data3)
   {
      void *stream = acc_get_cuda_stream(acc_async_sync);
      cuda_fft(data3, Lx, Ny, stream);
   }

   #pragma acc parallel loop independent
   for (int i=0;i<Ny;i++)
   {
      #pragma acc loop independent
      for (int j=0;j<Nx;j++)   data[Nx*i+j] = -1.0*s*data3[2*Lx*i+2*j+3]/2;
   }
   }
}

void transpose(float *data_in, float *data_out, int Nx, int Ny)
{
   int i, j;
   #pragma acc parallel loop independent present(data_in[0:Nx*Ny],data_out[0:Ny*Nx])
   for(i=0;i<Ny;i++)
   {
      #pragma acc loop independent
      for(j=0;j<Nx;j++)
      {
         data_out[i+j*Ny] = data_in[i*Nx+j];
      }
   }
}

void fast_poisson_solver_gpu(float *b, float *x, float *data2, float *data3, int Nx, int Ny, int Lx)
{
   int i, j;
   float h, *lamda, *temp;

   temp = (float *) malloc(Nx*Ny*sizeof(float));
   lamda = (float *) malloc(Nx*sizeof(float));
   h = 1.0/(Nx+1);

   #pragma acc data create(lamda[0:Nx],temp[0:Nx*Ny]), present(b[0:Nx*Ny],x[0:Nx*Ny])
   {

   #pragma acc parallel loop independent
   for(i=0;i<Nx;i++)
   {
      lamda[i] = 2 - 2*cos((i+1)*M_PI*h);
   }

   fdst_gpu(b, data2, data3, Nx, Ny, Lx);
   transpose(b, temp, Nx, Ny);
   fdst_gpu(temp, data2, data3, Nx, Ny, Lx);
   transpose(temp, b, Ny, Nx);

   #pragma acc parallel loop independent
   for(i=0;i<Ny;i++)
   {
      #pragma acc loop independent
      for(j=0;j<Nx;j++)
      {
         x[Nx*i+j] = -b[Nx*i+j]/(lamda[i] + lamda[j]);
      }
   }
   fdst_gpu(x, data2, data3, Nx, Ny, Lx);
   transpose(x, temp, Nx, Ny);
   fdst_gpu(temp, data2, data3, Nx, Ny, Lx);
   transpose(temp, x, Ny, Nx);
   } // end data region
}

% pgc++ -acc -ta=tesla -Mcudalib=cufft -fast testorg.cpp -o testorg.out
% pgc++ -acc -ta=tesla -Mcudalib=cufft -fast test.cpp -o testopt.out
% ./testorg.out
 Please input (Nx = 2^p - 1) p = 12
 Nx = 4095
 Fast Poisson Solver: 2.020000 secs
 For N = 4095 error = 5.364418e-07
% ./testopt.out
 Please input (Nx = 2^p - 1) p = 12
 Nx = 4095
 Fast Poisson Solver: 0.400000 secs
 For N = 4095 error = 5.364418e-07

Yen-Chi · January 7, 2016, 7:27am

Hi Mat,

I’m very appreciate your suggestion, and it’s very helpful.
I’ve learned a lot from your code.
It improves my code and get the 50 times fast than cpu.( compiled with pgc++ -fast )

Topic		Replies	Views
The Output From CUFFT Is Extremely Large GPU-Accelerated Libraries	3	732	June 23, 2018
my speedy FFT 3x faster than CUFFT CUDA Programming and Performance	139	241200	November 16, 2011
accuracy of CUFFT under double precision CUDA Programming and Performance	9	4162	September 18, 2009
Trouble with cuFFT on multiple GPUs GPU-Accelerated Libraries	13	3743	August 26, 2017
cufft1D and MemcpyAsync = gain? optimization of Host/Device transfert CUDA Programming and Performance	2	766	March 13, 2011
cufft doubt comparing r2c and c2c 2D FFTs CUDA Programming and Performance	28	13561	October 27, 2010
cufft questions CUDA Programming and Performance	5	1470	April 2, 2012
Is CUDA right for me? (FDTD) FDTD user needs fast computations while handling massive 3-D arrays CUDA Programming and Performance	17	22442	December 29, 2008
cufftXt batch 1D GPU-Accelerated Libraries	12	2211	October 15, 2019
2D CUFFT wrong result GPU-Accelerated Libraries cufft	8	3110	November 7, 2023

Using cufft to solve Poisson equation (C++)

Related topics