Guys,I am struggling to overcome a problem in CUDA programming. It is related to the cyclic reduction algorithm. I have two ways of solving a tri-diagonal system of equations:either a serial algorithm (i.e. Gaussian elimination,the Thomas algorithm) performed by CPU after memcopying data from the GPU to CPU or a parallel algorithm performed by the GPU with specific kernels. The CPU computation gives the correct results, while the GPU computation is clearly unstable. Sometimes it gives the correct values (this happens for large time steps of my integration procedure), sometimes it is highly unstable, and numerical errors seem to propagate. I tried to solve in double precision (enabling the sm_13 option) only the kernels related to the cyclic reduction but I did not get much difference at all. Where is the problems? Below I put the coding of the kernel functions.Can somebody please help me out?
global void CR_reduction(double *LOLD,double *DOLD,double *UOLD,double *BOLD,double *LNEW,double *DNEW,double *UNEW,double *BNEW,float *RESPR,int NMAX){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int k = (x >> 1) ;
//Remember : NMAX refers to the previous reduction phase
//if(x>=NMAX)return;
if(x<NMAX){
if(!x){
//FIRST EQUATION OF REDUCTION PHASE
LNEW[x]=0.0;//west coefficient is nil
DNEW[x]=DOLD[x]-(UOLD[x]*LOLD[x+1])/DOLD[x+1];
UNEW[x]=-(UOLD[x]*UOLD[x+1])/DOLD[x+1];
BNEW[x]=BOLD[x]-(UOLD[x]*BOLD[x+1])/DOLD[x+1];
if(RESPR[x]<RESPR[x+1])RESPR[x]=RESPR[x+1];
}
//else if((x%2)==0 && x){
//if((x%2)==0 && x){
//else if((x & (2-1))==0 && x){
if((x & (2-1))==0 && x){
//even number
if((x+1)==(NMAX-1)){
//if((x+1)==7){
//LAST EQUATION OF REDUCTION PHASE
/*
LNEW[x/2]=-LOLD[x]*LOLD[x-1]/DOLD[x-1];
DNEW[x/2]=DOLD[x]-(LOLD[x]*UOLD[x-1]/DOLD[x-1]);
UNEW[x/2]=0.0;
BNEW[x/2]=BOLD[x]-(LOLD[x]*BOLD[x-1]/DOLD[x-1]);
*/
LNEW[k]=-(LOLD[x]*LOLD[x-1])/DOLD[x-1];
DNEW[k]=DOLD[x]-((LOLD[x]*UOLD[x-1])/DOLD[x-1])-((UOLD[x]*LOLD[x+1])/DOLD[x+1]);
UNEW[k]=0.0;
BNEW[k]=BOLD[x]-((LOLD[x]*BOLD[x-1])/DOLD[x-1])-((UOLD[x]*BOLD[x+1])/DOLD[x+1]);
if(RESPR[x]<RESPR[x+1])RESPR[k]=RESPR[x+1];
else RESPR[k]=RESPR[x];
}
else{
LNEW[k]=-(LOLD[x]*LOLD[x-1])/DOLD[x-1];
DNEW[k]=DOLD[x]-((LOLD[x]*UOLD[x-1])/DOLD[x-1])-((UOLD[x]*LOLD[x+1])/DOLD[x+1]);
UNEW[k]=-(UOLD[x]*UOLD[x+1])/DOLD[x+1];
BNEW[k]=BOLD[x]-((LOLD[x]*BOLD[x-1])/DOLD[x-1])-((UOLD[x]*BOLD[x+1])/DOLD[x+1]);
if(RESPR[x]<RESPR[x+1])RESPR[k]=RESPR[x+1];
else RESPR[k]=RESPR[x];
}
}
__syncthreads();
}
}
global void CR_substitution(double *L,double *D,double *U,double *B,int *J,int NMAX,double *X,bool *X_COMPUTED){
//L->lower diagonal
//D->main diagonal
//U->upper diagonal
//B->source term
//J->index of array
//NMAX->number of unknowns for given backsubstitution phase
//X->solution vector
int x = threadIdx.x + blockIdx.x * blockDim.x;
int k = 0;
int kp = 0;
int km = 0;
//Remember : NMAX refers to the previous reduction phase
//if(x>=NMAX)return;
if(x<NMAX){
if(!x && !X_COMPUTED[J[x]]){
//first node where equation is solved
k=J[x];
X[k]=B[x]/D[x];
X_COMPUTED[k]=true;
}
//else if(x && x!=(NMAX-1) && X[J[x]]==0.0){
//else if(x && (x!=(NMAX-1) && (J[x]+1)!=7) && X[J[x]]==0.0){
if(x && (x!=(NMAX-1)) && !X_COMPUTED[J[x]]){
//else if(x && (x!=(NMAX-1)) && X[J[x]]==0.0){
//else if(x && (J[x]+1)!=7 && X[J[x]]==0.0){
//else if(x && ((J[x]+1)!=7) && X[J[x]]==0.0){
//any other equation
//X[J[x]]=(B[x]-U[x]*X[J[x+1]]-L[x]*X[J[x-1]])/D[x];
k=J[x];
kp=J[x+1];
km=J[x-1];
X[k]=(B[x]/D[x])-((U[x]*X[kp])/D[x])-((L[x]*X[km])/D[x]);
X_COMPUTED[k]=true;
}
//else if(x==(NMAX-1) && X[J[x]]==0.0){
//else if(((J[x]+1)==7 || (x==(NMAX-1))) && X[J[x]]==0.0){
if(((x==(NMAX-1))) && !X_COMPUTED[J[x]]){
//else if(((x==(NMAX-1))) && X[J[x]]==0.0){
//else if(((J[x]+1)==7) && X[J[x]]==0.0){
//last node
//X[J[x]]=(B[x]-L[x]*X[J[x-1]])/D[x];
k=J[x];
km=J[x-1];
X[k]=(B[x]/D[x])-((L[x]*X[km])/D[x]);
X_COMPUTED[k]=true;
}
__syncthreads();
}
}