Good Morning
who can help me to parallelize the next code C on GPU
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define fabs(a) ((a)>0?(a):(-a))
void scalarProd(double *res,
double *x,
double *y,
int n
){
int i;
*res=0;
for(i=0;i<n;i++) {
*res+=x[i]*y[i];
}
}
void saxpy(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]=ax[i]+y[i];
}
void xplus_axy(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]=x[i]+ay[i];
}
void pequalaxx(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]+=ax[i];
}
void mequalaxx(int n, double a, double *x, double y)
{
int i;
for(i=0;i<n;i++)
y[i]-=ax[i];
}
double comp_norm(int n, double *x)
{
int i;
double norm=0;
for(i=0;i<n;i++) {
if(fabs(x[i])>norm)
norm=fabs(x[i]);
}
return norm;
}
void yminusx(int n, double *x, double *y)
{
int i;
for(i=0;i<n;i++)
y[i]=x[i]-y[i];
}
void yequalx(double *y, double *x, int n)
{
int i;
for(i=0;i<n;i++)
y[i]=x[i];
}
void csrmul(double *y,unsigned int *Ap, unsigned int *Aj, double *Av,
unsigned int num_rows, double *x)
{
unsigned int row,col;
for(row=0; row<num_rows; row++) {
unsigned int row_begin=Ap[row];
unsigned int row_end = Ap[row+1];
double sum=0;
for(col=row_begin;col<row_end; col++) {
sum+=Av[col]*x[Aj[col]];
}
y[row]=sum;
}
}
int main( int argc, char** argv)
{
unsigned int timer_cpu = 0;
unsigned int timer_gpu = 0;
double norm=0, norm_gpu;
double rho=0, rho_old=1, beta, dqp, alpha;
double rho_gpu=0, rho_old_gpu=1, beta_gpu, dqp_gpu, alpha_gpu;
int iter=0,iter_gpu=1;
double* h_Av = NULL;
uint * h_Aj=NULL;
uint * h_Ap=NULL;
double* h_x = NULL;
double* h_b = NULL;
double* h_y = NULL;
double* h_y2 = NULL;
double* h_r = NULL;
double* h_p = NULL;
double* h_q = NULL;
int nb,ii,nb_ele,k,ind, diag;
int nb_diag=10;
int nb_per_row=nb_diag+3;
const int size=2000000;
int n = size;
int nb_non_zero=nb_per_row;
int sizeloc=size;
double sum;
int ecart=size/2;
double diagonal_value=1.001+10.;
int ecart_diag=ecart/(nb_diag+3)*2;
int nb_ele_loc=0;
printf("%d \n",ecart_diag);
int i,j;
// allocate host memory
h_Av = (double*) malloc(sizeof(double)nnb_non_zero);
h_Aj = (uint*) malloc(sizeof(uint)nnb_non_zero);
h_Ap = (uint*) malloc(sizeof(uint)(n+1));
h_x = (double) malloc(sizeof(double)n);
h_b = (double) malloc(sizeof(double)n);
h_y = (double) malloc(sizeof(double)n);
h_y2 = (double) malloc(sizeof(double)n);
h_r = (double) malloc(sizeof(double)n);
h_p = (double) malloc(sizeof(double)n);
h_q = (double) malloc(sizeof(double)*n);
for(i=0;i<n;i++) {
h_x[i]=1;
h_y[i]=33;
h_b[i]=18;
}
nb=0;
nb_ele_loc=0;
h_Ap[0]=0;
for(ii=0;ii<sizeloc;ii++) {
i=ii;
sum=0;
nb_ele=0;
for(k=nb_diag/2;k>=1;k--) {//on parcourt les elements de la diagonale gauche
ind=-k*ecart_diag+i;
if(ind>=0 && ind<size) {
h_Aj[nb]=ind;
h_Av[nb++]=-1;
sum+=-h_Av[nb-1];
nb_ele++;
}
}
if(i>0) {
h_Aj[nb]=i-1; //on met un element sur la tridiag gauche
h_Av[nb++]=-1;
nb_ele++;
}
h_Aj[nb]=i; //on met un element sur la diagonale
diag=nb;
h_Av[nb++]=0;
nb_ele++;
if(i<size-1) {
h_Aj[nb]=i+1; //on met un element sur la tridiag droite
h_Av[nb++]=-1;
nb_ele++;
}
for(k=1;k<=nb_diag/2;k++) {//on parcourt les elements de la diagonale droite
ind=k*ecart_diag+i;
if(ind>=0 && ind<size) {
h_Aj[nb]=ind;
h_Av[nb++]=-1;
sum+=-h_Av[nb-1];
nb_ele++;
}
}
h_Av[diag]=sum+diagonal_value;
h_Ap[ii+1]=h_Ap[ii]+nb_ele;
//printf("%d %f %f %d\n",i,sum,val[diag],rowptr[i]);
nb_ele_loc+=nb_ele;
}
csrmul(h_r,h_Ap,h_Aj,h_Av,n,h_x);
yminusx(n, h_b, h_r);
norm=comp_norm(n, h_r);
while(norm>1e-8 && iter<2000)
{
scalarProd(&rho,h_r,h_r,n);
if(iter==1) {
yequalx(h_p, h_r, n);
}
else {
beta=rho/rho_old;
xplus_axy(n, beta, h_r, h_p);
}
csrmul(h_q,h_Ap,h_Aj,h_Av,n,h_p);
scalarProd(&dqp,h_p,h_q,n);
printf("dqp %e\n",dqp);
alpha=rho/dqp;
pequalaxx(n, alpha, h_p, h_x);
mequalaxx(n, alpha, h_q, h_r);
rho_old=rho;
iter++;
//comp_norm(n, &norm, h_r);
norm=rho;
//printf("norm %f\n",norm);
}
printf(“iter %d\n”,iter);
//verif
csrmul(h_y,h_Ap,h_Aj,h_Av,n,h_x);
for(i=0;i<10;i++)
printf("%f ",h_y[i]);
printf("\n");
}