Using OpenAcc to accelerate GMRES(c++).

Hi,

I’m here again.
This time I’m going to write an iteration solver GMRES to solve the linear system.
I know that the “iteration” is very hard to parallel.
Here is my situation.
I can’t get the correct data in the iteration, so the answer was always wrong.
Without openacc, the code is correct and the iteration will converges.

2 points:

  1. How to get the correct data in the iteration solver?
    2.If we get the correct data, will it faster than cpu?
    (Maybe for N is very large.)

Compile command : pgc++ -acc -ta=tesla -Minfo filename.cpp
Here’s my code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

void print_vector(double *x, int N);
void matrix_vector(double *A, double *x, double *b, int N);
void print_matrixH(double *x, int N, int k);
void exact_solution(double *u, int N);
void initial(double *A, double *b, double *x0, int N);
void gmres(double *A, double *x, double *b, int N, int max_restart, int max_iter, double tol);
double error(double *x, double *y, int N);

int main()
{
	int N, max_restart, max_iter;
	clock_t t1, t2;
	printf(" Please input N =  ");
	scanf("%d",&N);
	printf(" Please input max restart times max_restart = ");
	scanf("%d",&max_restart);
	printf(" Please input max iteration times max_iter = ");
	scanf("%d",&max_iter);
	printf(" N = %d , max_restart = %d , max_iter = %d \n \n", N, max_restart, max_iter);
	
	double *A, *x, *b, *u, tol;
	A = (double *) malloc(N*N*sizeof(double));
	x = (double *) malloc(N*sizeof(double));
	b = (double *) malloc(N*sizeof(double));
	u = (double *) malloc(N*sizeof(double));
	
	exact_solution(u, N);
	initial(A, b, x, N);	
	tol = 1.0e-4;
	
	t1 = clock();
	gmres(A, x, b, N, max_restart, max_iter, tol);
	t2 = clock();
	
	printf(" error = %e \n", error(x, u, N));
	printf(" times = %f \n", 1.0*(t2-t1)/CLOCKS_PER_SEC);
	
	return 0;
}

double error(double *x, double *y, int N)
{
	int i;
	double temp, error;
	error = 0.0;
	for(i=0; i<N; i++)
	{
		temp = fabs(x[i] - y[i]);
		if(temp > error) error = temp;
	}
	return error;
}

void print_vector(double *x, int N)
{
	int i;
	for(i=0; i<N; i++)
	{
		printf(" %f ", x[i]);
	}
	printf("\n");
}

void print_matrix(double *x, int N)
{
	int i, j;
	for(i=0;i<N;i++)
	{
		for (j=0;j<N;j++) printf(" %f ", x[N*i+j]);
		printf("\n");
	}
	printf("\n");
}

void print_matrixH(double *x, int N, int k)
{
	int i, j;
	for(i=0;i<=k;i++)
	{
		for (j=0;j<=k;j++) printf(" %f ", x[N*i+j]);
		printf("\n");
	}
	printf("\n");
}

void exact_solution(double *u, int N)
{
	int i;
	double h, x;
	h = M_PI/(N+1);
	for(i=0; i<N; i++)
	{
		x = (1+i)*h;
		u[i] = x*sin(x);
	}
}

void initial(double *A, double *b, double *x0, int N)
{
	int i, j;
	double h, h2, temp, x;
	
	h = M_PI/(N+1);
	h2 = h*h;
	
	for(i=0; i<N; i++)
	{
		for(j=0; j<N; j++)
		{
			A[N*i+j] = 0.0;
		}
	}
	temp = -2.0/h2;
	for(i=0; i<N; i++)
	{
		x0[i] = 0.0;
		x = (1+i)*h;
		b[i] = 2*cos(x) - x*sin(x);
			
		A[N*i+i] = temp;
	}
	temp = 1.0/h2;
	for(i=0; i<N-1; i++)
	{
		A[N*(i+1)+i] = temp;
		A[N*i+(i+1)] = temp;
	}
}

double norm(double *x, int N)
{
	int i;
	double norm;
	norm=0.0;
	#pragma acc parallel loop seq
	for(i=0; i<N; i++)
	{
		norm += x[i]*x[i];
	}
	norm = sqrt(norm);
	return norm;
}

double inner_product(double *x, double *y, int N)
{
	int i;
	double temp;
	temp = 0.0;
	#pragma acc parallel loop seq present(x[0:N], y[0:N])
	for(i=0; i<N; i++)
	{
		temp += x[i]*y[i];
	}
	return temp;
}

void matrix_vector(double *A, double *x, double *b, int N)
{
	int i, j;
	#pragma acc parallel loop independent present(A[0:N*N], x[0:N], b[0:N])
	for(i=0; i<N; i++)
	{
		b[i] = 0.0;
		#pragma acc loop seq
		for(j=0; j<N; j++)
		{
			b[i] += A[N*i+j]*x[j];
		}
	}
}

void q_subQ(double *q, double *Q, int N, int iter)
{
	int i;
	#pragma acc parallel loop independent present(q[0:N], Q[0:(N+1)*N])
	for(i=0; i<N; i++)
	{
		q[i] = Q[(N+1)*i + iter];
	}
}

void subQ_v(double *Q, double *v, int N, int iter, double norm_v)
{
	int i;
	#pragma acc parallel loop independent present(Q[0:(N+1)*N], v[0:N])
	for(i=0; i<N; i++)
	{
		Q[(N+1)*i + iter] = v[i]/norm_v;
	}
}

void v_shift(double *v, double *q, double h, int N)
{
	int i;
	#pragma acc parallel loop independent present(v[0:N], q[0:N])
	for(i=0; i<N; i++)
	{
		v[i] -= h*q[i];
	}
}

void backsolve(double *H, double *s, double *y, int N, int i)
{
	// i = iter
	int j, k;
	double temp;
	
	for(j=i; j>=0; j--)
	{
		temp = s[j];
		for(k=j+1; k<=i; k++)
		{
			temp -= y[k]*H[N*j+k];
		}
		y[j] = temp/H[N*j+j];
	}

}

void GeneratePlaneRotation(double dx, double dy, double *cs, double *sn, int i)
{
	double temp;
	if (dy == 0.0) 
	{
		cs[i] = 1.0;
		sn[i] = 0.0;
	} 
	else if (fabs(dy) > fabs(dx)) 
	{
		temp = dx / dy;
		sn[i] = 1.0 / sqrt( 1.0 + temp*temp );
		cs[i] = temp * sn[i];
	} 
	else 
	{
		temp = dy / dx;
		cs[i] = 1.0 / sqrt( 1.0 + temp*temp );
		sn[i] = temp * cs[i];
	}
}

/*void ApplyPlaneRotation(Real &dx, Real &dy, Real &cs, Real &sn)
{
	Real temp  =  cs * dx + sn * dy;
	dy = -sn * dx + cs * dy;
	dx = temp;
}
*/

void gmres(double *A, double *x, double *b, int N, int max_restart, int max_iter, double tol)
{
	int i, j, k, m;
	double resid, normb, beta, temp, *tempv, *r, *q, *v, *cs, *sn, *s, *y, *Q, *H;
	
	Q = (double *) malloc(N*(N+1)*sizeof(double));
	H = (double *) malloc((N+1)*N*sizeof(double));
	tempv = (double *) malloc(N*sizeof(double));
	r = (double *) malloc(N*sizeof(double));
	q = (double *) malloc(N*sizeof(double));
	v = (double *) malloc(N*sizeof(double));
	cs = (double *) malloc((N+1)*sizeof(double));
	sn = (double *) malloc((N+1)*sizeof(double));
	s = (double *) malloc((N+1)*sizeof(double));
	y = (double *) malloc(N*sizeof(double));
	
/*	for(i=0; i<N+1; i++)
	{
		cs[i] = 0.0;
		sn[i] = 0.0;
		s[i] = 0.0;
		for(k=0; k<N; k++)
		{
			Q[N*i+k] = 0.0;
			H[N*i+k] = 0.0;
		}
	}
*/	
	normb = norm(b, N);
	
	#pragma acc data create(Q[0:(N+1)*N], H[0:(N+1)*N], tempv[0:N], r[0:N], q[0:N], v[0:N], cs[0:N+1], sn[0:N+1], s[0:N+1], y[0:N]), copyin(A[0:N*N], b[0:N]), copyout(x[0:N])
	{
	matrix_vector(A, x, tempv, N);
	#pragma acc parallel loop independent
	for (i=0; i<N; i++)	r[i] = b[i] - tempv[i];
	beta = norm(r, N);
	
	for (m=0; m<max_restart; m++)
	{
		#pragma acc parallel loop independent
		for (i=0; i<N; i++)
		{
			s[i+1] = 0.0;
			Q[(N+1)*i+0] = r[i]/beta;
		}
		s[0] = beta;
		
		for (i = 0; i<max_iter; i++) 
		{
	  		q_subQ(q, Q, N, i);
	  		matrix_vector(A, q, v, N);
	  		for (k=0; k<=i; k++) 
			{
				q_subQ(q, Q, N, k);
	    		H[N*k+i] = inner_product(q, v, N);
	    		v_shift(v, q, H[N*k+i], N);
	  		}
	  		
			H[N*(i+1)+i] = norm(v, N);
			subQ_v(Q, v, N, i+1, H[N*(i+1)+i]);
			
			#pragma acc parallel loop independent
	    	for (k = 0; k < i; k++)
	      	{
	      		//ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k))
	      		temp = cs[k]*H[N*k+i] + sn[k]*H[N*(k+1)+i];
				H[N*(k+1)+i] = -1.0*sn[k]*H[N*k+i] + cs[k]*H[N*(k+1)+i];
				H[N*k+i] = temp;
			}
			
	      	GeneratePlaneRotation(H[N*i+i], H[N*(i+1)+i], cs, sn, i);
	      	//ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i))
			H[N*i+i] = cs[i]*H[N*i+i] + sn[i]*H[N*(i+1)+i];
			H[N*(i+1)+i] = 0.0;
	      	//ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
	      	temp = cs[i]*s[i];
	      	s[i+1] = -1.0*sn[i]*s[i];
	      	s[i] = temp;
	      	resid = fabs(s[i+1]/beta);
	     	
	     	if (resid < tol) 
			{
				printf(" resid = %e \n", resid);
				printf(" Converges at %d step ", i+1);
				backsolve(H, s, y, N, i);
				#pragma acc parallel loop independent
				for(j=0; j<N; j++)
				{
					#pragma acc loop seq
					for(k=0; k<=i; k++)
					{
						x[j] += Q[(N+1)*j+k]*y[k];
					}
				}
				break;
	      	}
		}//end inside for
		// Caution : i = i + 1.
		if (resid < tol)	
		{
			printf(" %d cycle \n", m);
			break;
		}
		

		backsolve(H, s, y, N, i-1);
		#pragma acc parallel loop independent
		for(j=0; j<N; j++)
		{
			#pragma acc loop seq
			for(k=0; k<=i-1; k++)
			{
				x[j] += Q[(N+1)*j+k]*y[k];
			}
		}
		matrix_vector(A, x, tempv, N);
		#pragma acc parallel loop independent
		for (j=0; j<N; j++)	r[j] = b[j] - tempv[j];
		beta = norm(r, N);
		s[i] = beta;
		resid = s[i]/normb;
		if ( resid < tol)	
		{
			printf(" resid = %e \n", resid);
			printf(" Converges at %d cycle %d step. \n", m, i);
			break;
		}
	}//end outside for
	}//end pragma acc 
}

Hi Yan-Chi,

Follow your data flow on what copy of a variable is being updated on the device versus where it’s updated on the host. For the “s” and “H” arrays, you assign part on the device and part of the host and then use the data on both the host and device.

Since the the data may be discrete between the host and device, your variables may have two separate copies. Hence you need to ensure that the copies are synchronized (via the “update” clause) or only assigned and used on the device or host.

Note you can try using CUDA Unified Memory (-ta=tesla:managed) in which the CUDA driver manages the dynamic memory and thus eliminates your synchronization issue. However it can be quite slow if you’re accessing data back-and-forth on the device and host. There are other restrictions as well. For details please see: https://www.pgroup.com/lit/articles/insider/v6n2a4.htm

I also see a forward dependency in the following loop:

         #pragma acc parallel loop independent
          for (k = 0; k < i; k++)
            {
               //ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k))
               temp = cs[k]*H[N*k+i] + sn[k]*H[N*(k+1)+i];
            H[N*(k+1)+i] = -1.0*sn[k]*H[N*k+i] + cs[k]*H[N*(k+1)+i];
            H[N*k+i] = temp;
         }

You can either fix the code so that it can parallelize or run the loop sequentially on the host. Given you’re already updating the H array on the host before and after this loop, I’d simply remove the parallel pragma and run it sequentially.

2.If we get the correct data, will it faster than cpu?
(Maybe for N is very large.)

What input values do you consider large?

Typically the more work you can move to the device, the faster it will be relative to a host version.

  • Mat

Hi Mat

I’m thinking about the things that your talking about.
I’ll try the “update” clause and CUDA Undefined Memory.

2.If we get the correct data, will it faster than cpu?
(Maybe for N is very large.)

I think GMRES in 1D is not complex enough.
I should try 2D programming.

Thanks a lot Mat.
You let me know some new way to try.

Hi,

I’ve done the 2D GMRES iteration solver.
And got the correct data and result in the parallel computing.
But the result is not faster than cpu. (For N = 100)
How can I improve my code?

Compile command : pgc++ -acc -ta=tesla:managed -Minfo filename.cpp

Here’s my code:

//*****************************************************************
// Iterative template routine -- GMRES
//
// GMRES solves the unsymmetric linear system Ax = b using the 
// Generalized Minimum Residual method
//
// The return value indicates convergence within max_iter (input)
// iterations (0), or no convergence within max_iter iterations (1).
//
// Upon successful return, output arguments have the following values:
//  
//        x  --  approximate solution to Ax = b
// max_iter  --  the number of iterations performed before the
//               tolerance was reached
//      tol  --  the residual after the final iteration
//  
//*****************************************************************

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

void print_vector(double *x, int N);
void matrix_vector(double *A, double *x, double *b, int N);
void print_matrixH(double *x, int N, int k);
void initial(double *A, double *b, double *x0, double *u, int N);
void gmres(double *A, double *x, double *b, int N, int max_restart, int max_iter, double tol);
double error(double *x, double *y, int N);

int main()
{
	int N, max_restart, max_iter;
	clock_t t1, t2;
	printf(" Please input N =  ");
	scanf("%d",&N);
	printf(" Please input max restart times max_restart = ");
	scanf("%d",&max_restart);
	printf(" Please input max iteration times max_iter = ");
	scanf("%d",&max_iter);
	printf(" N = %d , max_restart = %d , max_iter = %d \n \n", N, max_restart, max_iter);
	
	double *A, *x, *b, *u, tol;
	A = (double *) malloc(N*N*sizeof(double));
	x = (double *) malloc(N*N*sizeof(double));
	b = (double *) malloc(N*N*sizeof(double));
	u = (double *) malloc(N*N*sizeof(double));
	
	initial(A, b, x, u, N);	
	tol = 1.0e-4;
	t1 = clock();
	gmres(A, x, b, N, max_restart, max_iter, tol);
	t2 = clock();
	
	printf(" error = %e \n", error(x, u, N*N));
	printf(" times = %f \n", 1.0*(t2-t1)/CLOCKS_PER_SEC);
	
	return 0;
}

double error(double *x, double *y, int N)
{
	int i;
	double temp, error;
	error = 0.0;
	for(i=0; i<N; i++)
	{
		temp = fabs(x[i] - y[i]);
		if(temp > error) error = temp;
	}
	return error;
}

void print_vector(double *x, int N)
{
	int i;
	for(i=0; i<N; i++)
	{
		printf(" %f ", x[i]);
	}
	printf("\n");
}

void print_matrix(double *x, int N)
{
	int i, j;
	for(i=0;i<N;i++)
	{
		for (j=0;j<N;j++) printf(" %f ", x[N*i+j]);
		printf("\n");
	}
	printf("\n");
}

void print_matrixH(double *x, int max_iter, int k)
{
	int i, j;
	for(i=0;i<=k;i++)
	{
		for (j=0;j<=k;j++) printf(" %f ", x[max_iter*i+j]);
		printf("\n");
	}
	printf("\n");
}

void initial(double *A, double *b, double *x0, double *u, int N)
{
	int i, j;
	double h, h2, temp, x, y;
	
	h = M_PI/(N+1);
	h2 = h*h;
	
	for(i=0; i<N; i++)
	{
		y = (1+i)*h;
		for(j=0; j<N; j++)
		{
			x = (1+j)*h;
			x0[N*i+j] = 0.0;
			A[N*i+j] = 0.0;
			u[N*i+j] = x*y*sin(x)*sin(y);
			b[N*i+j] = x*sin(x)*(2*cos(y) - y*sin(y)) + y*sin(y)*(2*cos(x) - x*sin(x));
		}
	}
	temp = -2.0/h2;
	for(i=0; i<N; i++)
	{			
		A[N*i+i] = temp;
	}
	temp = 1.0/h2;
	for(i=0; i<N-1; i++)
	{
		A[N*(i+1)+i] = temp;
		A[N*i+(i+1)] = temp;
	}
}

double norm(double *x, int N)
{
	int i;
	double norm;
	norm=0.0;
	#pragma acc parallel loop seq present(x[0:N])
	for(i=0; i<N; i++)
	{
		norm += x[i]*x[i];
	}
	norm = sqrt(norm);
	return norm;
}

double inner_product(double *x, double *y, int N)
{
	int i;
	double temp;
	temp = 0.0;
	#pragma acc parallel loop seq present(x, y)
	for(i=0; i<N; i++)
	{
		temp += x[i]*y[i];
	}
	return temp;
}

void matrix_matrix(double *A, double *x, double *b, int N)
{
	int i, j, k;
	#pragma acc parallel loop independent present(A, x, b)
	for (i=0; i<N; i++)
	{
		#pragma acc loop independent
		for (j=0; j<N; j++)
		{
			b[N*i+j] = 0.0;
			#pragma acc loop seq
			for (k=0; k<N; k++)
			{
				b[N*i+j] += A[N*i+k]*x[N*k+j];
			}
		}
	}
}

void q_subQ(double *q, double *Q, int N, int iter)
{
	int i;
	#pragma acc parallel loop independent present(Q, q)
	for (i=0; i<N; i++)	q[i] = Q[N*iter+i];
}

void subQ_v(double *Q, double *v, int N, int iter, double norm_v)
{
	int i, N2;
	N2 = N*iter;
	#pragma acc parallel loop independent present(Q, v)
	for (i=0; i<N; i++)	Q[N2+i] = v[i]/norm_v;
}

void w_shift(double *v, double *q, double h, int N)
{
	int i;
	#pragma acc parallel loop independent present(v, q)
	for(i=0; i<N; i++)
	{
		v[i] -= h*q[i];
	}
}

void backsolve(double *H, double *s, double *y, int N, int max_iter, int i)
{
	// i = iter
	int j, k;
	double temp;
	
	for(j=i; j>=0; j--)
	{
		temp = s[j];
		for(k=j+1; k<=i; k++)
		{
			temp -= y[k]*H[max_iter*j+k];
		}
		y[j] = temp/H[max_iter*j+j];
	}

}

void GeneratePlaneRotation(double dx, double dy, double *cs, double *sn, int i)
{
	double temp;
	if (dy == 0.0) 
	{
		cs[i] = 1.0;
		sn[i] = 0.0;
	} 
	else if (fabs(dy) > fabs(dx)) 
	{
		temp = dx / dy;
		sn[i] = 1.0 / sqrt( 1.0 + temp*temp );
		cs[i] = temp * sn[i];
	} 
	else 
	{
		temp = dy / dx;
		cs[i] = 1.0 / sqrt( 1.0 + temp*temp );
		sn[i] = temp * cs[i];
	}
}

void gmres(double *A, double *x, double *b, int N, int max_restart, int max_iter, double tol)
{
	int i, j, k, l, m, N2;
	double resid, normb, beta, temp, *r, *q, *v, *z, *w, *cs, *sn, *s, *y, *Q, *H;
	
	N2 = N*N;
	Q = (double *) malloc(N2*(max_iter+1)*sizeof(double));
	H = (double *) malloc((N+1)*max_iter*sizeof(double));
	r = (double *) malloc(N2*sizeof(double));
	q = (double *) malloc(N2*sizeof(double));
	v = (double *) malloc(N2*sizeof(double));
	z = (double *) malloc(N2*sizeof(double));
	w = (double *) malloc(N2*sizeof(double));
	cs = (double *) malloc((max_iter+1)*sizeof(double));
	sn = (double *) malloc((max_iter+1)*sizeof(double));
	s = (double *) malloc((max_iter+1)*sizeof(double));
	y = (double *) malloc((max_iter+1)*sizeof(double));
	
	#pragma acc data create(Q[0:N2*(max_iter+1)], H[0:(N+1)*max_iter], r[0:N2], q[0:N2], v[0:N2], z[0:N2], w[0:N2], cs[0:max_iter+1], sn[0:max_iter+1], s[0:max_iter+1], y[0:max_iter+1])
	{
	normb = norm(b, N2);
	
	matrix_matrix(A, x, v, N);
	matrix_matrix(x, A, z, N);
	#pragma acc parallel loop independent
	for (i=0; i<N2; i++)	r[i] = b[i] - (v[i] + z[i]);
	beta = norm(r, N2);
	
	for (m=0; m<max_restart; m++)
	{
		#pragma acc parallel loop independent
		for (i=0; i<N2; i++)	Q[i] = r[i]/beta;
		#pragma acc parallel loop independent
		for (i=0; i<max_iter; i++)	s[i+1] = 0.0;
		s[0] = beta;
		
		for (i = 0; i<max_iter; i++) 
		{
	  		q_subQ(q, Q, N2, i);	
	  		matrix_matrix(A, q, v, N);
	  		matrix_matrix(q, A, z, N);
	  		#pragma acc parallel loop independent
	  		for (j=0; j<N2; j++)	w[j] = v[j] + z[j];
	  		
	  		for (k=0; k<=i; k++) 
			{
				q_subQ(q, Q, N2, k);
	    		H[max_iter*k+i] = inner_product(q, w, N2);
	    		w_shift(w, q, H[max_iter*k+i], N2);
	  		}
	  		
			H[max_iter*(i+1)+i] = norm(w, N2);
			subQ_v(Q, w, N2, i+1, H[max_iter*(i+1)+i]);
			
			#pragma acc kernels
	    	for (k = 0; k < i; k++)
	      	{
	      		//ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k))
	      		temp = cs[k]*H[max_iter*k+i] + sn[k]*H[max_iter*(k+1)+i];
				H[max_iter*(k+1)+i] = -1.0*sn[k]*H[max_iter*k+i] + cs[k]*H[max_iter*(k+1)+i];
				H[max_iter*k+i] = temp;
			}
			
	      	GeneratePlaneRotation(H[max_iter*i+i], H[max_iter*(i+1)+i], cs, sn, i);
	      	
	      	//ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i))
			H[max_iter*i+i] = cs[i]*H[max_iter*i+i] + sn[i]*H[max_iter*(i+1)+i];
			H[max_iter*(i+1)+i] = 0.0;
			
	      	//ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
	      	temp = cs[i]*s[i];
	      	s[i+1] = -1.0*sn[i]*s[i];
	      	s[i] = temp;
	      	resid = fabs(s[i+1]/beta);
	     	
	     	if (resid < tol) 
			{
				printf(" resid = %e \n", resid);
				printf(" Converges at %d step ", i+1);
				backsolve(H, s, y, N, max_iter, i);
				#pragma acc parallel loop independent
				for(j=0; j<N; j++)
				{
					#pragma acc loop independent
					for (l=0; l<N; l++)
					{
						#pragma acc loop seq
						for(k=0; k<=i; k++)
						{
							x[N*j+l] += Q[N2*k+N*j+l]*y[k];
						}
					}
				}
				break;
	      	}
		}//end inside for
		
		if (resid < tol)	
		{
			printf(" %d cycle \n", m);
			break;
		}
		
		// Caution : i = i + 1.
		i = i - 1;
		backsolve(H, s, y, N, max_iter, i);
		#pragma acc parallel loop independent
		for(j=0; j<N; j++)
		{
			#pragma acc loop independent
			for (l=0; l<N; l++)
			{
				#pragma acc loop seq
				for(k=0; k<=i; k++)
				{
					x[N*j+l] += Q[N2*k+N*j+l]*y[k];
				}
			}
		}
		matrix_matrix(A, x, v, N);
		matrix_matrix(x, A, z, N);
		#pragma acc parallel loop independent
		for (j=0; j<N2; j++)	r[j] = b[j] - (v[j] + z[j]);
		beta = norm(r, N2);
		s[i+1] = beta;
		resid = s[i+1]/normb;
		if ( resid < tol)	
		{
			printf(" resid = %e \n", resid);
			printf(" Converges at %d cycle %d step. \n", m, i);
			break;
		}
	}//end outside for
	}//end pragma acc data
}

Using CUDA Unified Memory can have a severe performance especially when a program is using data back and forth on the device and host, as is the case here. You next step is to manually manage your data.

Also, N=100 is fairly small. Try running a much larger value for N, such as 1024. This put put more computation on the device relative to the number of time it’s copied between the host and device.

  • Mat

Hi Mat,

Thanks for replying, and your suggestions are useful.
Yeah, maybe for N = 100 is still too small.
I’ve watched some CUDA Unified Memory references.
It seems writing with CUDA code.
Do I need to write the CUDA code in the program?
Or just manage my data with OpenACC and compile with -ta=tesla:managed?

You can continue to use CUDA Unified Memory (all you need is the -ta=tesla:managed flag and the PGI runtime will automatically use unified memory for all dynamic allocations), but since you use the data on both the host and device, you get a “Ping-Pong” effect where data is being copied back and forth. This can be quite slow. You can still execute on the host, but so long as you don’t touch any of the dynamic memory, no copies will be performed.

If you want to improve your performance you’ll need to do one or more of the following:

  • Within your main loop, only touch dynamic data on device even if it means running sequential kernels on the device. The caveat being that sequential execution on the device can be slow as well. Hence you’ll need to experiment if the cost of running sequential kernels is less than the cost of data movement done by the CUDA runtime.

  • Don’t use CUDA Unified Memory and instead optimize your code to use OpenACC data regions and update directives so that you control when the data is moved.

  • Wait. CUDA Unified Memory is currently considered a beta feature and not expected to be performant. However, future hardware and software version are expected to help.

Hope this helps,
Mat