GPU parallelize a code C that performs calculation of conjugate gradient

Good Morning
who can help me to parallelize the next code C on GPU

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define fabs(a) ((a)>0?(a):(-a))

void scalarProd(double *res,
double *x,
double *y,
int n
){
int i;
*res=0;
for(i=0;i<n;i++) {
*res+=x[i]*y[i];
}

}

void saxpy(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]=a
x[i]+y[i];
}

void xplus_axy(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]=x[i]+a
y[i];
}

void pequalaxx(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]+=a
x[i];
}

void mequalaxx(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]-=a
x[i];
}

double comp_norm(int n, double *x)
{
int i;
double norm=0;
for(i=0;i<n;i++) {
if(fabs(x[i])>norm)
norm=fabs(x[i]);
}

 return norm;

}

void yminusx(int n, double *x, double *y)
{
int i;
for(i=0;i<n;i++)
y[i]=x[i]-y[i];
}

void yequalx(double *y, double *x, int n)
{
int i;
for(i=0;i<n;i++)
y[i]=x[i];
}

void csrmul(double *y,unsigned int *Ap, unsigned int *Aj, double *Av,
unsigned int num_rows, double *x)
{
unsigned int row,col;
for(row=0; row<num_rows; row++) {
unsigned int row_begin=Ap[row];
unsigned int row_end = Ap[row+1];
double sum=0;
for(col=row_begin;col<row_end; col++) {
sum+=Av[col]*x[Aj[col]];
}
y[row]=sum;
}

}

int main( int argc, char** argv)
{

unsigned int timer_cpu = 0;
unsigned int timer_gpu = 0;

 double norm=0, norm_gpu;
 double rho=0, rho_old=1, beta, dqp, alpha;
 double rho_gpu=0, rho_old_gpu=1, beta_gpu, dqp_gpu, alpha_gpu;
 int iter=0,iter_gpu=1;

double* h_Av = NULL;
uint * h_Aj=NULL;
uint * h_Ap=NULL;
double* h_x = NULL;
double* h_b = NULL;
double* h_y = NULL;
double* h_y2 = NULL;
double* h_r = NULL;
double* h_p = NULL;
double* h_q = NULL;

 int nb,ii,nb_ele,k,ind, diag;
 int nb_diag=10;
 int nb_per_row=nb_diag+3;
const int size=2000000;

int n = size;
int nb_non_zero=nb_per_row;
int sizeloc=size;
double sum;
int ecart=size/2;
double diagonal_value=1.001+10.;
int ecart_diag=ecart/(nb_diag+3)*2;
int nb_ele_loc=0;

printf("%d \n",ecart_diag);



 int i,j;

// allocate host memory
h_Av = (double*) malloc(sizeof(double)nnb_non_zero);
h_Aj = (uint*) malloc(sizeof(uint)nnb_non_zero);
h_Ap = (uint*) malloc(sizeof(uint)(n+1));
h_x = (double
) malloc(sizeof(double)n);
h_b = (double
) malloc(sizeof(double)n);
h_y = (double
) malloc(sizeof(double)n);
h_y2 = (double
) malloc(sizeof(double)n);
h_r = (double
) malloc(sizeof(double)n);
h_p = (double
) malloc(sizeof(double)n);
h_q = (double
) malloc(sizeof(double)*n);

 for(i=0;i<n;i++) {
   h_x[i]=1;
	 h_y[i]=33;
	 h_b[i]=18;

}

nb=0;

nb_ele_loc=0;
h_Ap[0]=0;

for(ii=0;ii<sizeloc;ii++) {
i=ii;

	sum=0;
nb_ele=0;
for(k=nb_diag/2;k>=1;k--) {//on parcourt les elements de la diagonale gauche
	ind=-k*ecart_diag+i;
	if(ind>=0 && ind<size) {
		h_Aj[nb]=ind;
		h_Av[nb++]=-1;
		sum+=-h_Av[nb-1];
		nb_ele++;
	}
}
if(i>0) {
	h_Aj[nb]=i-1; //on met un element sur la tridiag gauche
	h_Av[nb++]=-1;
	nb_ele++;
}

h_Aj[nb]=i; //on met un       element sur la diagonale
diag=nb;
h_Av[nb++]=0;
nb_ele++;
if(i<size-1) {
	h_Aj[nb]=i+1; //on met un element sur la tridiag droite
	h_Av[nb++]=-1;
	nb_ele++;
}

for(k=1;k<=nb_diag/2;k++) {//on parcourt les elements de la diagonale droite
	ind=k*ecart_diag+i;
	if(ind>=0 && ind<size) {
		h_Aj[nb]=ind;
		h_Av[nb++]=-1;
		sum+=-h_Av[nb-1];
		nb_ele++;
	}
}
h_Av[diag]=sum+diagonal_value;
h_Ap[ii+1]=h_Ap[ii]+nb_ele;
//printf("%d %f %f %d\n",i,sum,val[diag],rowptr[i]);
nb_ele_loc+=nb_ele;

}

csrmul(h_r,h_Ap,h_Aj,h_Av,n,h_x);


yminusx(n, h_b, h_r);

norm=comp_norm(n, h_r); 

while(norm>1e-8 && iter<2000) 

{
scalarProd(&rho,h_r,h_r,n);

	if(iter==1) {
		yequalx(h_p, h_r, n);		  
}
  else {
	  beta=rho/rho_old;
	  xplus_axy(n, beta, h_r, h_p);
}

	csrmul(h_q,h_Ap,h_Aj,h_Av,n,h_p);
	scalarProd(&dqp,h_p,h_q,n);
	printf("dqp %e\n",dqp);		

alpha=rho/dqp;
	pequalaxx(n, alpha, h_p, h_x);
	mequalaxx(n, alpha, h_q, h_r);

	rho_old=rho;
iter++;		
	//comp_norm(n, &norm, h_r); 										
	norm=rho;
	//printf("norm %f\n",norm);

}
printf(“iter %d\n”,iter);

//verif
csrmul(h_y,h_Ap,h_Aj,h_Av,n,h_x);
for(i=0;i<10;i++)
  printf("%f ",h_y[i]);
printf("\n");

}

  1. Learn to use code tags. On this site, it’s just square brackets with code and /code writen in the beginning and ending tags.

  2. How do you think you should parallelize your algorithm? What data can calculate independently of each other? Tracking data dependencies will help you the most in terms of parallelizing a single-threaded algorithm.

You appear to be doing dense and sparse matrix operations.

Consider using CUBLAS and CUSPARSE to replace these operations.

http://docs.nvidia.com/cuda/cublas/index.html#abstract

http://docs.nvidia.com/cuda/cusparse/index.html#abstract

If the overall goal is solution of tridiagonal system, or something like that, there are various solver approaches that could be considered as well, either within CUSPARSE and CUBLAS, or at a higher level, such as using a LAPACK-like API such as MAGMA:

http://icl.cs.utk.edu/magma/

Sorry, I missed the “conjugate gradient” in the title.

You may want to adapt the cuda conjugate gradient sample code for your use:

http://docs.nvidia.com/cuda/cuda-samples/index.html#conjugategradient