CUDA issue : cyclic reduction (number crunching routine)-HELP ME PLEASE CUDA issue : cyclic reductio

Guys,I am struggling to overcome a problem in CUDA programming. It is related to the cyclic reduction algorithm. I have two ways of solving a tri-diagonal system of equations:either a serial algorithm (i.e. Gaussian elimination,the Thomas algorithm) performed by CPU after memcopying data from the GPU to CPU or a parallel algorithm performed by the GPU with specific kernels. The CPU computation gives the correct results, while the GPU computation is clearly unstable. Sometimes it gives the correct values (this happens for large time steps of my integration procedure), sometimes it is highly unstable, and numerical errors seem to propagate. I tried to solve in double precision (enabling the sm_13 option) only the kernels related to the cyclic reduction but I did not get much difference at all. Where is the problems? Below I put the coding of the kernel functions.Can somebody please help me out?

global void CR_reduction(double *LOLD,double *DOLD,double *UOLD,double *BOLD,double *LNEW,double *DNEW,double *UNEW,double *BNEW,float *RESPR,int NMAX){

int x = threadIdx.x + blockIdx.x * blockDim.x;

int k = (x >> 1) ;

//Remember : NMAX refers to the previous reduction phase
//if(x>=NMAX)return;

if(x<NMAX){

if(!x){
	//FIRST EQUATION OF REDUCTION PHASE
	LNEW[x]=0.0;//west coefficient is nil
	DNEW[x]=DOLD[x]-(UOLD[x]*LOLD[x+1])/DOLD[x+1];
	UNEW[x]=-(UOLD[x]*UOLD[x+1])/DOLD[x+1];
	BNEW[x]=BOLD[x]-(UOLD[x]*BOLD[x+1])/DOLD[x+1];
	if(RESPR[x]<RESPR[x+1])RESPR[x]=RESPR[x+1];
}
//else if((x%2)==0 && x){
//if((x%2)==0 && x){
//else if((x & (2-1))==0 && x){
if((x & (2-1))==0 && x){
	//even number
	if((x+1)==(NMAX-1)){
	//if((x+1)==7){
		//LAST EQUATION OF REDUCTION PHASE
		/*
		LNEW[x/2]=-LOLD[x]*LOLD[x-1]/DOLD[x-1];
		DNEW[x/2]=DOLD[x]-(LOLD[x]*UOLD[x-1]/DOLD[x-1]);
		UNEW[x/2]=0.0;
		BNEW[x/2]=BOLD[x]-(LOLD[x]*BOLD[x-1]/DOLD[x-1]);
		*/
		LNEW[k]=-(LOLD[x]*LOLD[x-1])/DOLD[x-1];
		DNEW[k]=DOLD[x]-((LOLD[x]*UOLD[x-1])/DOLD[x-1])-((UOLD[x]*LOLD[x+1])/DOLD[x+1]);
		UNEW[k]=0.0;
		BNEW[k]=BOLD[x]-((LOLD[x]*BOLD[x-1])/DOLD[x-1])-((UOLD[x]*BOLD[x+1])/DOLD[x+1]);
		if(RESPR[x]<RESPR[x+1])RESPR[k]=RESPR[x+1];
		else RESPR[k]=RESPR[x];
	}
	else{
		LNEW[k]=-(LOLD[x]*LOLD[x-1])/DOLD[x-1];
		DNEW[k]=DOLD[x]-((LOLD[x]*UOLD[x-1])/DOLD[x-1])-((UOLD[x]*LOLD[x+1])/DOLD[x+1]);
		UNEW[k]=-(UOLD[x]*UOLD[x+1])/DOLD[x+1];
		BNEW[k]=BOLD[x]-((LOLD[x]*BOLD[x-1])/DOLD[x-1])-((UOLD[x]*BOLD[x+1])/DOLD[x+1]);
		if(RESPR[x]<RESPR[x+1])RESPR[k]=RESPR[x+1];
		else RESPR[k]=RESPR[x];
	}
}

__syncthreads();

}

}

global void CR_substitution(double *L,double *D,double *U,double *B,int *J,int NMAX,double *X,bool *X_COMPUTED){
//L->lower diagonal
//D->main diagonal
//U->upper diagonal
//B->source term
//J->index of array
//NMAX->number of unknowns for given backsubstitution phase
//X->solution vector

int x = threadIdx.x + blockIdx.x * blockDim.x;

int k = 0;
int kp = 0;
int km = 0;

//Remember : NMAX refers to the previous reduction phase
//if(x>=NMAX)return;

if(x<NMAX){

if(!x && !X_COMPUTED[J[x]]){
	//first node where equation is solved
	k=J[x];
	X[k]=B[x]/D[x];
	X_COMPUTED[k]=true;
}
//else if(x && x!=(NMAX-1) && X[J[x]]==0.0){

//else if(x && (x!=(NMAX-1) && (J[x]+1)!=7) && X[J[x]]==0.0){

if(x && (x!=(NMAX-1)) && !X_COMPUTED[J[x]]){
//else if(x && (x!=(NMAX-1)) && X[J[x]]==0.0){

	//else if(x && (J[x]+1)!=7 && X[J[x]]==0.0){
//else if(x && ((J[x]+1)!=7) && X[J[x]]==0.0){
	//any other equation
	//X[J[x]]=(B[x]-U[x]*X[J[x+1]]-L[x]*X[J[x-1]])/D[x];
	k=J[x];
	kp=J[x+1];
	km=J[x-1];
	X[k]=(B[x]/D[x])-((U[x]*X[kp])/D[x])-((L[x]*X[km])/D[x]);
	X_COMPUTED[k]=true;
}
//else if(x==(NMAX-1) && X[J[x]]==0.0){

//else if(((J[x]+1)==7 || (x==(NMAX-1))) && X[J[x]]==0.0){

if(((x==(NMAX-1))) && !X_COMPUTED[J[x]]){
//else if(((x==(NMAX-1))) && X[J[x]]==0.0){

//else if(((J[x]+1)==7) && X[J[x]]==0.0){
//last node
	//X[J[x]]=(B[x]-L[x]*X[J[x-1]])/D[x];
	k=J[x];
	km=J[x-1];
	X[k]=(B[x]/D[x])-((L[x]*X[km])/D[x]);
	X_COMPUTED[k]=true;
}

__syncthreads();

}

}

Its hard to tell which branches are commented out and which aren’t without syntax highlighting or formatting…but I think you have a __syncthreads inside an if block that not all threads will take…that can cause problems. Also, YOU DON’T HAVE TO USE ALL CAPS FOR VARIABLE NAMES. We’re not using FORTRAN 77.

Edited to add: I don’t think the syncthreads is doing anything anyway - you don’t use shared memory…so why bother syncing?

Thanks for your reply. Below the source again without any comments. You are right, __syncthreads does nothing at all. Do you have any idea what’s going wrong in my kernels?

global void CR_reduction(double *LOLD,double *DOLD,double *UOLD,double *BOLD,double *LNEW,double *DNEW,double *UNEW,double *BNEW,float *RESPR,int NMAX){

int x = threadIdx.x + blockIdx.x * blockDim.x;

int k = (x >> 1) ;

if(x<NMAX){

	if(!x){

		

		LNEW[x]=0.0;//west coefficient is nil

		DNEW[x]=DOLD[x]-(UOLD[x]*LOLD[x+1])/DOLD[x+1];

		UNEW[x]=-(UOLD[x]*UOLD[x+1])/DOLD[x+1];

		BNEW[x]=BOLD[x]-(UOLD[x]*BOLD[x+1])/DOLD[x+1];

		if(RESPR[x]<RESPR[x+1])RESPR[x]=RESPR[x+1];

	}

	

	if((x & (2-1))==0 && x){

		

		if((x+1)==(NMAX-1)){

		

			LNEW[k]=-(LOLD[x]*LOLD[x-1])/DOLD[x-1];

			DNEW[k]=DOLD[x]-((LOLD[x]*UOLD[x-1])/DOLD[x-1])-((UOLD[x]*LOLD[x+1])/DOLD[x+1]);

			UNEW[k]=0.0;

			BNEW[k]=BOLD[x]-((LOLD[x]*BOLD[x-1])/DOLD[x-1])-((UOLD[x]*BOLD[x+1])/DOLD[x+1]);

			if(RESPR[x]<RESPR[x+1])RESPR[k]=RESPR[x+1];

			else RESPR[k]=RESPR[x];

		}

		else{

			LNEW[k]=-(LOLD[x]*LOLD[x-1])/DOLD[x-1];

			DNEW[k]=DOLD[x]-((LOLD[x]*UOLD[x-1])/DOLD[x-1])-((UOLD[x]*LOLD[x+1])/DOLD[x+1]);

			UNEW[k]=-(UOLD[x]*UOLD[x+1])/DOLD[x+1];

			BNEW[k]=BOLD[x]-((LOLD[x]*BOLD[x-1])/DOLD[x-1])-((UOLD[x]*BOLD[x+1])/DOLD[x+1]);

			if(RESPR[x]<RESPR[x+1])RESPR[k]=RESPR[x+1];

			else RESPR[k]=RESPR[x];

		}

	}

}

}

global void CR_substitution(double *L,double *D,double *U,double *B,int *J,int NMAX,double *X,bool *X_COMPUTED){

int x = threadIdx.x + blockIdx.x * blockDim.x;

int k = 0;

int kp = 0;

int km = 0;

if(x<NMAX){

	if(!x && !X_COMPUTED[J[x]]){

		k=J[x];

		X[k]=B[x]/D[x];

		X_COMPUTED[k]=true;

	}

	if(x && (x!=(NMAX-1)) && !X_COMPUTED[J[x]]){

		k=J[x];

		kp=J[x+1];

		km=J[x-1];

		X[k]=(B[x]/D[x])-((U[x]*X[kp])/D[x])-((L[x]*X[km])/D[x]);

		X_COMPUTED[k]=true;

	}

	if(((x==(NMAX-1))) && !X_COMPUTED[J[x]]){

		k=J[x];

		km=J[x-1];

		X[k]=(B[x]/D[x])-((L[x]*X[km])/D[x]);

		X_COMPUTED[k]=true;

	}

}

}