cuda syncthreads fail

Hi there, I know there are a lot of topics about this, but I can't find a solution for my code. The problem is that in a kernel which calls a device function, which in turn calls lower-level device functions, I get incorrect synchronization. I have tried to debug it, but when compiling with -G the synchronization is correct, as I was expecting. The memcheck tool doesn't give me any error and I don't see any mistake in the code: all threads of a block (64) reach the __syncthreads(), but they don't synchronize correctly. So I am looking for any help here.

I copy the code below (what it does is not important, because it is a reduced version of the original intended to make the mistake easier to find; I have deleted as many lines as possible):

#include<stdio.h>

#define lowlim_gpu 4.0
#define UL_gpu 10000    
#define tiny 1.0e-5    
#define beta2f_gpu(beta) (pow(sin(double(beta)),2.0))
#define l12k1_gpu(l1) (pow(sin(l1),2.0)*UL_gpu+lowlim_gpu)
#define INV3 0.333333333333333   // 1/3
#define SQRT3 1.732050807568877 //sqrt(3)
#define INV54 0.018518518518519 // 1/54
#define min3(a,b,c)  (a < b ? min(a,c) : min(b,c) )

__device__ double croot(const double x){
    	double res;
    	if (x>=0) 
      		res=pow(x,INV3);
    	else
      		res=-(pow(-x,INV3));
    	return res;
}

__device__ double find_t(const double *x){

    	double l1=-x[0], l2=-x[1], l3=-x[2];
    	double a0,a1,a2,a3,ee,tmp,z1,z2,z3,p,q,D,offset;

    	a3=l1*l2+l2*l3+l1*l3;
    	a2=1.5-l1-l2-l3;
    	a1=a3-l1-l2-l3;
    	a0=0.5*(a3-2*l1*l2*l3);

    	p=(a1-a2*a2*INV3)*INV3;
    	q=(-9*a2*a1+27*a0+2*a2*a2*a2)*INV54;
    	D=q*q+p*p*p;
    	offset=a2*INV3;
    	if (D>0){
      		ee=sqrt(D);
      		tmp=-q+ee; z1=croot(tmp);
      		tmp=-q-ee; z1=z1+croot(tmp);
      		z1=z1-offset; z2=z1; z3=z1;
    	}else if (D<0){
      		ee=sqrt(-D);
      		double tmp2=-q; 
      		double angle=2*INV3*atan(ee/(sqrt(tmp2*tmp2+ee*ee)+tmp2));
      		tmp=cos(angle);
      		tmp2=sin(angle);
      		ee=sqrt(-p);
      		z1=2*ee*tmp-offset; 
      		z2=-ee*(tmp+SQRT3*tmp2)-offset; 
      		z3=-ee*(tmp-SQRT3*tmp2)-offset; 
    	}else{
      		tmp=-q;
      		tmp=croot(tmp);
      		z1=2*tmp-offset; z2=z1; z3=z1;
      		if (p!=0 || q!=0){
      			z2=-tmp-offset;
      			z3=z2;
      		}
    	}

    	z1=min3(z1,z2,z3);  
    	return z1;
}

__device__ double hyp(	//INPUT
						double* xL,
						const double* denomvals)
{
    double c,Norm1,t1,T1,R,K2,K3,K4,tmp,tmp2,tmp3;

    if (xL[0]==0 && xL[1]==0){
     	Norm1=1; t1=0; T1=0; 
    }else{
   	if(xL[0]<xL[1]){ tmp=xL[0]; xL[0]=xL[1]; xL[1]=tmp;} //SortDescending(xL);
	if(xL[1]<xL[2]){ tmp=xL[1]; xL[1]=xL[2]; xL[2]=tmp;} //SortDescending(xL);
	if(xL[0]<xL[1]){ tmp=xL[0]; xL[0]=xL[1]; xL[1]=tmp;} //SortDescending(xL);
	 
      	t1=find_t(xL);
      	R=1; K2=0; K3=0; K4=0; 
      	for (int i=0; i<3; i++){
		tmp=xL[i]+t1;
		tmp2=tmp*tmp; tmp3=tmp2*tmp;
        	R=-R*tmp;
        	K2=K2+0.5/tmp2;
        	K3=K3-1.0/tmp3;
        	K4=K4+3.0/(tmp3*tmp);
	}
      	T1=K4/(8.0*K2*K2)-5.0*K3*K3/(24.0*K2*K2*K2);  
      	Norm1=0.125/(K2*R);
    }
    c=sqrt(Norm1/denomvals[2])*exp(double(-t1+T1+denomvals[0]-denomvals[1]));
    return c;
}
__device__ 
inline double iso(const int pt,const double _d,const double bv){
  	return exp(double(-bv*_d));
}

//Applies the forward model and gets a model predicted signal using the parameter values in p (transformed parameter space)  
__device__ inline void forwardModel_stepA(	//INPUT
							const double* 		p,
							const int		nfib,
							const int		idB,	//this must be computed only by NFIBRES threads
							//OUTPUT
							double*			Vs,
							double*			Ks)
{
	int nparams_per_fibre=4;
  
  	int nn = 2+nparams_per_fibre*idB;

    	double cosph, sinph,costh,sinth;
	
    	costh=cos(p[nn+1]); 
	sinth=sin(p[nn+1]); 
	cosph=cos(p[nn+2]); 
	sinph=sin(p[nn+2]);
    	Vs[idB*3+0] = cosph*sinth; 
	Vs[idB*3+1] = sinph*sinth; 
	Vs[idB*3+2] = costh;
	Ks[idB]=l12k1_gpu(p[nn+3]);
    		
}
	
__device__ inline double forwardModel_stepB(	//INPUT
  							const double* 		p,
							const double		bv1, 
							const double		bv2,
							const double*		Vs,
							const double*		Ks,
							const double*		approx,
							const double*		fs,
							const double		_d,
							const double		sumf,
							const int		nfib,
							const int 		nparams,
							const bool 		m_include_f0,
							const int		dir)
{
	double A[9];
	double predicted_signal;
	double partial_fsum;
	double val = 0.0;
	double bd= bv2*_d;
	double temp_vec[3];
	
	for(int n=0;n<nfib;n++){
		
		A[0]=Vs[n*3+0]*bv1; A[1]=Vs[n*3+0]*bv1; A[2]=Vs[n*3+0]*bv1;
		A[3]=Vs[n*3+1]*bv1; A[4]=Vs[n*3+1]*bv1; A[5]=Vs[n*3+1]*bv1;
		A[6]=Vs[n*3+2]*bv1; A[7]=Vs[n*3+2]*bv1; A[8]=Vs[n*3+2]*bv1;

      		double Q=-2*bd*Ks[n]*(pow(A[0]+A[4]+A[8],2.0) - pow(A[3]-A[1],2.0) - pow(A[2]-A[6],2.0)-pow(A[5]-A[7],2.0)); 
		
      		Q=sqrt(Ks[n]*Ks[n]+bd*bd+Q);
      		temp_vec[0]=0.5*(Ks[n]-bd+Q);
		temp_vec[1]=0.5*(Ks[n]-bd-Q);
		temp_vec[2]=0;
      		val+= fs[n]*hyp(temp_vec,&approx[n*3]);
    	}
    	if (m_include_f0){
		///////////
		partial_fsum=1.0;
		for(int j=0;j<3;j++)
			partial_fsum-=fs[j];
		if (partial_fsum==0) 
    			partial_fsum=tiny;
    		//////////////////////////
      		double temp_f0=beta2f_gpu(p[nparams-1])*partial_fsum;
      		predicted_signal = p[0]*(temp_f0+(1-sumf-temp_f0)*iso(dir,_d,bv2)+val);
    	}else{
      		predicted_signal = p[0]*((1-sumf)*iso(dir,_d,bv2)+val); 
	}
	return predicted_signal;
}
__device__ inline void hess(		//INPUT
					const double*		params,
					const double		bv1, 
					const double		bv2,
					const int 		np,
					const int 		np2,
					const bool 		include,
					double*			shared,		
					double*			fs,		
					double*			Vs,		
					double*			Ks,		
					double*			approx,	
					const int		idB,	
					//OUTPUT
					double*			hess)
{
	double sumf=0.7656576576;
	double _d=0.545645645645645;

	__syncthreads();	

	if(idB==0){
		double partial_fsum;
  		for (int k=0; k<3; k++){
    			int kk = 2+np2*k; 
			partial_fsum=1.0;
			for(int j=0;j<k;j++)
				partial_fsum-=fs[j];
			if (partial_fsum==0)
    				partial_fsum=tiny;
    			fs[k] = beta2f_gpu(params[kk])*partial_fsum;
  		}
	}
	if(idB==0) printf("CONTROL 1 THREAD 0\n");
	if(idB==2) printf("CONTROL 1 THREAD 2\n");
	if(idB==20) printf("CONTROL 1 THREAD 20\n");
	if(idB==33) printf("CONTROL 1 THREAD 33\n");
	__syncthreads();

	if(idB==0) forwardModel_stepA(params,3,idB,&Vs[0],&Ks[0]);

	__syncthreads();
	if(idB==0) printf("CONTROL 2 THREAD 0\n");
	if(idB==2) printf("CONTROL 2 THREAD 2\n");
	if(idB==20) printf("CONTROL 2 THREAD 20\n");
	if(idB==33) printf("CONTROL 2 THREAD 33\n");
	__syncthreads();

	if(idB==0) forwardModel_stepA(params,3,idB,&Vs[9],&Ks[3]);

	if(idB==0) printf("CONTROL 3 THREAD 0\n");
	if(idB==2) printf("CONTROL 3 THREAD 2\n");
	if(idB==20) printf("CONTROL 3 THREAD 20\n");
	if(idB==33) printf("CONTROL 3 THREAD 33\n");
	__syncthreads();

	if(idB==0){ 
		double p_plus_h[14];
		for (int p=0; p<np; p++) p_plus_h[p] = params[p];		
    		p_plus_h[1]=params[1]+7;
	
		forwardModel_stepA(params,3,idB,&Vs[9],&Ks[3]);
	}
	__syncthreads();
	double S_trial;

	if(idB==0) printf("CONTROL 4 THREAD 0\n");
	if(idB==2) printf("CONTROL 4 THREAD 2\n");
	if(idB==20) printf("CONTROL 4 THREAD 20\n");
	if(idB==33) printf("CONTROL 4 THREAD 33\n");
	__syncthreads();

	S_trial= forwardModel_stepB(params,bv1,bv2,&Vs[9],&Ks[3],&approx[9],fs,_d,sumf,3,np,include,idB); 
	__syncthreads();
	S_trial= forwardModel_stepB(params,bv1,bv2,&Vs[9],&Ks[3],&approx[9],fs,_d,sumf,3,np,include,idB); 

	if(idB==0) printf("CONTROL 5 THREAD 0\n");
	if(idB==2) printf("CONTROL 5 THREAD 2\n");
	if(idB==20) printf("CONTROL 5 THREAD 20\n");
	if(idB==33) printf("CONTROL 5 THREAD 33\n");
	__syncthreads();
}

//in diffmodel.cc
extern "C" __global__ void fit()
{
	int idA = threadIdx.x;
	int idB = blockIdx.x;
	__shared__ double m[14];
	__shared__ double fs[3];

	__shared__ double shared[64];
	__shared__ double fs2[3];
	__shared__ double Vs[14*9];
	__shared__ double Ks[14*3];
	__shared__ double approx[14*9];

	int np=4;
	double hessd[14*14]; 	
   	
	hess(m,2.445,5.55,14,4,false,shared,fs2,Vs,Ks,approx,idA,hessd);  
	__syncthreads();
	hess(m,2.445,5.55,14,4,false,shared,fs2,Vs,Ks,approx,idA,hessd);  
}

int main( int argc, const char* argv[] ){
	int blocks = 1;
   	dim3 Dim_Grid(blocks,1);
  	dim3 Dim_Block(64,1);

	fit<<<Dim_Grid, Dim_Block>>>();
	cudaDeviceSynchronize();
}

If you run the code you will see that the control messages in the second execution of the hess function are not printed in order:

CONTROL 1 THREAD 33
CONTROL 1 THREAD 0
CONTROL 1 THREAD 2
CONTROL 1 THREAD 20
CONTROL 2 THREAD 0
CONTROL 2 THREAD 33
CONTROL 2 THREAD 2
CONTROL 2 THREAD 20
CONTROL 3 THREAD 33
CONTROL 3 THREAD 0
CONTROL 3 THREAD 2
CONTROL 3 THREAD 20
CONTROL 4 THREAD 0
CONTROL 4 THREAD 33
CONTROL 4 THREAD 2
CONTROL 4 THREAD 20
CONTROL 5 THREAD 0
CONTROL 5 THREAD 33
CONTROL 5 THREAD 2
CONTROL 5 THREAD 20
CONTROL 1 THREAD 20
CONTROL 1 THREAD 33
CONTROL 1 THREAD 0
CONTROL 1 THREAD 2
CONTROL 2 THREAD 20
CONTROL 3 THREAD 20
CONTROL 4 THREAD 20
CONTROL 5 THREAD 20
CONTROL 2 THREAD 33
CONTROL 3 THREAD 33
CONTROL 4 THREAD 33
CONTROL 5 THREAD 33
CONTROL 2 THREAD 2
CONTROL 3 THREAD 2
CONTROL 4 THREAD 2
CONTROL 5 THREAD 2
CONTROL 2 THREAD 0
CONTROL 3 THREAD 0
CONTROL 4 THREAD 0
CONTROL 5 THREAD 0

To compile it I am using:
nvcc -o test test.cu -O3 -arch sm_20 -lcudart -lcuda -lcurand

And I am running it on a Tesla C2050.

I hope someone can help me with this.

Thank you very much!!

Moises.

I expect that __syncthreads() is working. Possibly, as more text gets added to the printf buffer, it is being flushed in batches per thread.
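(For what it is worth, device-side printf output goes into a FIFO that is only flushed at certain points, e.g. at kernel completion or at a cudaDeviceSynchronize(), and the flush order is not guaranteed to match the order in which the printf calls executed, so the ordering by itself does not prove a barrier failure. If output ever looks truncated or oddly batched, the FIFO can be enlarged from the host before the launch; the 1 MB size below is just an example:)

	size_t fifoSize = 1 << 20;                              // example value: 1 MB
	cudaDeviceSetLimit(cudaLimitPrintfFifoSize, fifoSize);  // enlarge the device printf buffer
	fit<<<Dim_Grid, Dim_Block>>>();
	cudaDeviceSynchronize();                                // flushes any buffered device printf output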

A few comments (see also the CUDA Best Practices Guide); a rough sketch applying several of these follows the list:

(1) You really don’t want to square by calling pow(x,2.0), that is very slow. Use x*x instead.

(2) CUDA provides cube root and reciprocal cube root functions: cbrt(), rcbrt(); much faster (and likely more accurate) than using pow()

(3) CUDA has built-in, overloaded, min() and max() functions that map directly to hardware instructions.

(4) Instead of x / sqrt(y) I would suggest using x * rsqrt(y) unless you need the exact IEEE-754 rounding semantics.

(5) When computing sin() and cos() of the same argument, use sincos() which will be significantly faster.

(6) If hyp() computes the hypotenuse (i.e. sqrt(x*x+y*y)), you can use the standard function hypot() instead, which is supported by CUDA

(7) I can’t tell right away what the computation involving atan() does, but given that it computes an angle from two source operands the use of atan2() may be indicated?
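For example, a rough, untested sketch of what (2), (3), (5) and (7) could look like in croot() and in the D<0 branch of find_t() ((1) is simply x*x instead of pow(x,2.0) wherever squares are taken). This only illustrates the substitutions and is not meant as a drop-in fix:

__device__ double croot(const double x){
	return cbrt(x);    // (2) cbrt() accepts negative arguments, so the sign branch and pow() disappear
}

// inside find_t(), the D<0 branch:
ee=sqrt(-D);
double tmp2=-q;
double s, c;
double angle=2.0*INV3*atan2(ee, sqrt(tmp2*tmp2+ee*ee)+tmp2);   // (7) atan2() instead of atan() of a quotient
sincos(angle, &s, &c);                                         // (5) one sincos() call instead of separate sin() and cos()
ee=sqrt(-p);
z1=2.0*ee*c-offset;
z2=-ee*(c+SQRT3*s)-offset;
z3=-ee*(c-SQRT3*s)-offset;

// and later, instead of the min3 macro:
z1=fmin(z1, fmin(z2, z3));                                     // (3) fmin()/fmax() are the double-precision built-ins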

Thank you very much kbam and njuffa.

Yes kbam, I also expect it to work, but in this particular case it doesn't. I reduced the problem to only these prints and it showed the same behaviour, and looking at the results, they weren't correct. But thanks for your comment.

About your comments, njuffa:

(1) You really don’t want to square by calling pow(x,2.0), that is very slow. Use x*x instead.

Yes, good point. I have changed them.

(2) CUDA provides cube root and reciprocal cube root functions: cbrt(), rcbrt(); much faster (and likely more accurate) than using pow()

Very good point. I didn't know about these functions.

(3) CUDA has built-in, overloaded, min() and max() functions that map directly to hardware instructions.

I am not sure I understood this. Do you mean I should use them or not? I didn't find documentation about these functions for double precision.
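(For doubles there are at least the fmin() and fmax() math functions; assuming those are acceptable, the min3 macro could be written as below, though I haven't benchmarked it:)

#define min3(a,b,c)  (fmin((a), fmin((b), (c))))    // double-precision min via the CUDA math API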

(4) Instead of x / sqrt(y) I would suggest using x * rsqrt(y) unless you need the exact IEEE-754 rounding semantics.
Well, I prefer precision over speed; I've checked that rsqrt() has less precision.

(5) When computing sin() and cos() of the same argument, use sincos() which will be significantly faster.
Yes, I've changed it. Good point :)

(6) If hyp() computes the hypotenuse (i.e. sqrt(x*x+y*y)), you can use the standard function hypot() instead, which is supported by CUDA
As in (4), I've checked that hypot() has less precision, so I am going to keep sqrt(x*x+y*y).

(7) I can’t tell right away what the computation involving atan() does, but given that it computes an angle from two source operands the use of atan2() may be indicated?
Yes, I agree; I will probably change it.

Thanks for taking the time to look at these operations :).

The good news is that after changing (1), (2) and (5), the threads now perform the syncthreads correctly.
The bad news is that I don't know why, and I want to know. As njuffa pointed out, I was using inefficient functions, but they weren't wrong.

So, does anyone have an idea why, in this code (without changing anything), the threads are not performing the syncthreads correctly?
Is there a timeout while waiting at the barrier?