cuda syncthreads fail

Hi there, I know there are a lot of topics about this, but I can't find a solution for my code. The problem is that in a kernel which calls a device function, which in turn calls lower-level device functions, I get incorrect synchronization. I have tried to debug it, but when compiling with -G the synchronization is correct, as I was expecting. The memcheck tool doesn't give me any error and I don't see any mistake in the code: all threads of a block (64) reach the __syncthreads(), but they don't synchronize correctly. So I am looking for any help here.

I copy the code below (what it does is not important, because it is a reduced version of the original intended to make the mistake easier to find; I have deleted as many lines as possible):

#include<stdio.h>

#define lowlim_gpu 4.0
#define UL_gpu 10000    
#define tiny 1.0e-5    
#define beta2f_gpu(beta) (pow(sin(double(beta)),2.0))
#define l12k1_gpu(l1) (pow(sin(l1),2.0)*UL_gpu+lowlim_gpu)
#define INV3 0.333333333333333   // 1/3
#define SQRT3 1.732050807568877 //sqrt(3)
#define INV54 0.018518518518519 // 1/54
#define min3(a,b,c)  (a < b ? min(a,c) : min(b,c) )

__device__ double croot(const double x){
    	double res;
    	if (x>=0) 
      		res=pow(x,INV3);
    	else
      		res=-(pow(-x,INV3));
    	return res;
}

__device__ double find_t(const double *x){

    	double l1=-x[0], l2=-x[1], l3=-x[2];
    	double a0,a1,a2,a3,ee,tmp,z1,z2,z3,p,q,D,offset;

    	a3=l1*l2+l2*l3+l1*l3;
    	a2=1.5-l1-l2-l3;
    	a1=a3-l1-l2-l3;
    	a0=0.5*(a3-2*l1*l2*l3);

    	p=(a1-a2*a2*INV3)*INV3;
    	q=(-9*a2*a1+27*a0+2*a2*a2*a2)*INV54;
    	D=q*q+p*p*p;
    	offset=a2*INV3;
    	if (D>0){
      		ee=sqrt(D);
      		tmp=-q+ee; z1=croot(tmp);
      		tmp=-q-ee; z1=z1+croot(tmp);
      		z1=z1-offset; z2=z1; z3=z1;
    	}else if (D<0){
      		ee=sqrt(-D);
      		double tmp2=-q; 
      		double angle=2*INV3*atan(ee/(sqrt(tmp2*tmp2+ee*ee)+tmp2));
      		tmp=cos(angle);
      		tmp2=sin(angle);
      		ee=sqrt(-p);
      		z1=2*ee*tmp-offset; 
      		z2=-ee*(tmp+SQRT3*tmp2)-offset; 
      		z3=-ee*(tmp-SQRT3*tmp2)-offset; 
    	}else{
      		tmp=-q;
      		tmp=croot(tmp);
      		z1=2*tmp-offset; z2=z1; z3=z1;
      		if (p!=0 || q!=0){
      			z2=-tmp-offset;
      			z3=z2;
      		}
    	}

    	z1=min3(z1,z2,z3);  
    	return z1;
}

__device__ double hyp(	//INPUT
						double* xL,
						const double* denomvals)
{
    double c,Norm1,t1,T1,R,K2,K3,K4,tmp,tmp2,tmp3;

    if (xL[0]==0 && xL[1]==0){
     	Norm1=1; t1=0; T1=0; 
    }else{
   	if(xL[0]<xL[1]){ tmp=xL[0]; xL[0]=xL[1]; xL[1]=tmp;} //SortDescending(xL);
	if(xL[1]<xL[2]){ tmp=xL[1]; xL[1]=xL[2]; xL[2]=tmp;} //SortDescending(xL);
	if(xL[0]<xL[1]){ tmp=xL[0]; xL[0]=xL[1]; xL[1]=tmp;} //SortDescending(xL);
	 
      	t1=find_t(xL);
      	R=1; K2=0; K3=0; K4=0; 
      	for (int i=0; i<3; i++){
		tmp=xL[i]+t1;
		tmp2=tmp*tmp; tmp3=tmp2*tmp;
        	R=-R*tmp;
        	K2=K2+0.5/tmp2;
        	K3=K3-1.0/tmp3;
        	K4=K4+3.0/(tmp3*tmp);
	}
      	T1=K4/(8.0*K2*K2)-5.0*K3*K3/(24.0*K2*K2*K2);  
      	Norm1=0.125/(K2*R);
    }
    c=sqrt(Norm1/denomvals[2])*exp(double(-t1+T1+denomvals[0]-denomvals[1]));
    return c;
}
__device__ 
inline double iso(const int pt,const double _d,const double bv){
  	return exp(double(-bv*_d));
}

//Applies the forward model and gets a model predicted signal using the parameter values in p (transformed parameter space)  
__device__ inline void forwardModel_stepA(	//INPUT
							const double* 		p,
							const int		nfib,
							const int		idB,	//this must be computed only by NFIBRES threads
							//OUTPUT
							double*			Vs,
							double*			Ks)
{
	int nparams_per_fibre=4;
  
  	int nn = 2+nparams_per_fibre*idB;

    	double cosph, sinph,costh,sinth;
	
    	costh=cos(p[nn+1]); 
	sinth=sin(p[nn+1]); 
	cosph=cos(p[nn+2]); 
	sinph=sin(p[nn+2]);
    	Vs[idB*3+0] = cosph*sinth; 
	Vs[idB*3+1] = sinph*sinth; 
	Vs[idB*3+2] = costh;
	Ks[idB]=l12k1_gpu(p[nn+3]);
    		
}
	
__device__ inline double forwardModel_stepB(	//INPUT
  							const double* 		p,
							const double		bv1, 
							const double		bv2,
							const double*		Vs,
							const double*		Ks,
							const double*		approx,
							const double*		fs,
							const double		_d,
							const double		sumf,
							const int		nfib,
							const int 		nparams,
							const bool 		m_include_f0,
							const int		dir)
{
	double A[9];
	double predicted_signal;
	double partial_fsum;
	double val = 0.0;
	double bd= bv2*_d;
	double temp_vec[3];
	
	for(int n=0;n<nfib;n++){
		
		A[0]=Vs[n*3+0]*bv1; A[1]=Vs[n*3+0]*bv1; A[2]=Vs[n*3+0]*bv1;
		A[3]=Vs[n*3+1]*bv1; A[4]=Vs[n*3+1]*bv1; A[5]=Vs[n*3+1]*bv1;
		A[6]=Vs[n*3+2]*bv1; A[7]=Vs[n*3+2]*bv1; A[8]=Vs[n*3+2]*bv1;

      		double Q=-2*bd*Ks[n]*(pow(A[0]+A[4]+A[8],2.0) - pow(A[3]-A[1],2.0) - pow(A[2]-A[6],2.0)-pow(A[5]-A[7],2.0)); 
		
      		Q=sqrt(Ks[n]*Ks[n]+bd*bd+Q);
      		temp_vec[0]=0.5*(Ks[n]-bd+Q);
		temp_vec[1]=0.5*(Ks[n]-bd-Q);
		temp_vec[2]=0;
      		val+= fs[n]*hyp(temp_vec,&approx[n*3]);
    	}
    	if (m_include_f0){
		///////////
		partial_fsum=1.0;
		for(int j=0;j<3;j++)
			partial_fsum-=fs[j];
		if (partial_fsum==0) 
    			partial_fsum=tiny;
    		//////////////////////////
      		double temp_f0=beta2f_gpu(p[nparams-1])*partial_fsum;
      		predicted_signal = p[0]*(temp_f0+(1-sumf-temp_f0)*iso(dir,_d,bv2)+val);
    	}else{
      		predicted_signal = p[0]*((1-sumf)*iso(dir,_d,bv2)+val); 
	}
	return predicted_signal;
}
__device__ inline void hess(		//INPUT
					const double*		params,
					const double		bv1, 
					const double		bv2,
					const int 		np,
					const int 		np2,
					const bool 		include,
					double*			shared,		
					double*			fs,		
					double*			Vs,		
					double*			Ks,		
					double*			approx,	
					const int		idB,	
					//OUTPUT
					double*			hess)
{
	double sumf=0.7656576576;
	double _d=0.545645645645645;

	__syncthreads();	

	if(idB==0){
		double partial_fsum;
  		for (int k=0; k<3; k++){
    			int kk = 2+np2*k; 
			partial_fsum=1.0;
			for(int j=0;j<k;j++)
				partial_fsum-=fs[j];
			if (partial_fsum==0)
    				partial_fsum=tiny;
    			fs[k] = beta2f_gpu(params[kk])*partial_fsum;
  		}
	}
	if(idB==0) printf("CONTROL 1 THREAD 0\n");
	if(idB==2) printf("CONTROL 1 THREAD 2\n");
	if(idB==20) printf("CONTROL 1 THREAD 20\n");
	if(idB==33) printf("CONTROL 1 THREAD 33\n");
	__syncthreads();

	if(idB==0) forwardModel_stepA(params,3,idB,&Vs[0],&Ks[0]);

	__syncthreads();
	if(idB==0) printf("CONTROL 2 THREAD 0\n");
	if(idB==2) printf("CONTROL 2 THREAD 2\n");
	if(idB==20) printf("CONTROL 2 THREAD 20\n");
	if(idB==33) printf("CONTROL 2 THREAD 33\n");
	__syncthreads();

	if(idB==0) forwardModel_stepA(params,3,idB,&Vs[9],&Ks[3]);

	if(idB==0) printf("CONTROL 3 THREAD 0\n");
	if(idB==2) printf("CONTROL 3 THREAD 2\n");
	if(idB==20) printf("CONTROL 3 THREAD 20\n");
	if(idB==33) printf("CONTROL 3 THREAD 33\n");
	__syncthreads();

	if(idB==0){ 
		double p_plus_h[14];
		for (int p=0; p<np; p++) p_plus_h[p] = params[p];		
    		p_plus_h[1]=params[1]+7;
	
		forwardModel_stepA(params,3,idB,&Vs[9],&Ks[3]);
	}
	__syncthreads();
	double S_trial;

	if(idB==0) printf("CONTROL 4 THREAD 0\n");
	if(idB==2) printf("CONTROL 4 THREAD 2\n");
	if(idB==20) printf("CONTROL 4 THREAD 20\n");
	if(idB==33) printf("CONTROL 4 THREAD 33\n");
	__syncthreads();

	S_trial= forwardModel_stepB(params,bv1,bv2,&Vs[9],&Ks[3],&approx[9],fs,_d,sumf,3,np,include,idB); 
	__syncthreads();
	S_trial= forwardModel_stepB(params,bv1,bv2,&Vs[9],&Ks[3],&approx[9],fs,_d,sumf,3,np,include,idB); 

	if(idB==0) printf("CONTROL 5 THREAD 0\n");
	if(idB==2) printf("CONTROL 5 THREAD 2\n");
	if(idB==20) printf("CONTROL 5 THREAD 20\n");
	if(idB==33) printf("CONTROL 5 THREAD 33\n");
	__syncthreads();
}

//in diffmodel.cc
extern "C" __global__ void fit()
{
	int idA = threadIdx.x;
	int idB = blockIdx.x;
	__shared__ double m[14];
	__shared__ double fs[3];

	__shared__ double shared[64];
	__shared__ double fs2[3];
	__shared__ double Vs[14*9];
	__shared__ double Ks[14*3];
	__shared__ double approx[14*9];

	int np=4;
	double hessd[14*14]; 	
   	
	hess(m,2.445,5.55,14,4,false,shared,fs2,Vs,Ks,approx,idA,hessd);  
	__syncthreads();
	hess(m,2.445,5.55,14,4,false,shared,fs2,Vs,Ks,approx,idA,hessd);  
}

int main( int argc, const char* argv[] ){
	int blocks = 1;
   	dim3 Dim_Grid(blocks,1);
  	dim3 Dim_Block(64,1);

	fit<<<Dim_Grid, Dim_Block>>>();
	cudaDeviceSynchronize();
}

If you run the code you will see that the control messages in the second execution of the hess function are not printed in order:

CONTROL 1 THREAD 33
CONTROL 1 THREAD 0
CONTROL 1 THREAD 2
CONTROL 1 THREAD 20
CONTROL 2 THREAD 0
CONTROL 2 THREAD 33
CONTROL 2 THREAD 2
CONTROL 2 THREAD 20
CONTROL 3 THREAD 33
CONTROL 3 THREAD 0
CONTROL 3 THREAD 2
CONTROL 3 THREAD 20
CONTROL 4 THREAD 0
CONTROL 4 THREAD 33
CONTROL 4 THREAD 2
CONTROL 4 THREAD 20
CONTROL 5 THREAD 0
CONTROL 5 THREAD 33
CONTROL 5 THREAD 2
CONTROL 5 THREAD 20
CONTROL 1 THREAD 20
CONTROL 1 THREAD 33
CONTROL 1 THREAD 0
CONTROL 1 THREAD 2
CONTROL 2 THREAD 20
CONTROL 3 THREAD 20
CONTROL 4 THREAD 20
CONTROL 5 THREAD 20
CONTROL 2 THREAD 33
CONTROL 3 THREAD 33
CONTROL 4 THREAD 33
CONTROL 5 THREAD 33
CONTROL 2 THREAD 2
CONTROL 3 THREAD 2
CONTROL 4 THREAD 2
CONTROL 5 THREAD 2
CONTROL 2 THREAD 0
CONTROL 3 THREAD 0
CONTROL 4 THREAD 0
CONTROL 5 THREAD 0

To compile it I am using:
nvcc -o test test.cu -O3 -arch sm_20 -lcudart -lcuda -lcurand

And I am running it on a Tesla C2050.

I hope someone can help me with this.

Thank you very much!!

Moises.

I expect that __syncthreads() is working. Possibly, as more text gets added to the printf buffer, it is being flushed in batches per thread.
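(For what it is worth, device-side printf output goes into a FIFO that is only flushed at certain points, e.g. at kernel completion or at a cudaDeviceSynchronize(), and the flush order is not guaranteed to match the order in which the printf calls executed, so the ordering by itself does not prove a barrier failure. If output ever looks truncated or oddly batched, the FIFO can be enlarged from the host before the launch; the 1 MB size below is just an example:)

	size_t fifoSize = 1 << 20;                              // example value: 1 MB
	cudaDeviceSetLimit(cudaLimitPrintfFifoSize, fifoSize);  // enlarge the device printf buffer
	fit<<<Dim_Grid, Dim_Block>>>();
	cudaDeviceSynchronize();                                // flushes any buffered device printf output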

A few comments (see also the CUDA Best Practices Guide); a rough sketch applying several of these follows the list:

(1) You really don’t want to square by calling pow(x,2.0), that is very slow. Use x*x instead.

(2) CUDA provides cube root and reciprocal cube root functions: cbrt(), rcbrt(); much faster (and likely more accurate) than using pow()

(3) CUDA has built-in, overloaded, min() and max() functions that map directly to hardware instructions.

(4) Instead of x / sqrt(y) I would suggest using x * rsqrt(y) unless you need the exact IEEE-754 rounding semantics.

(5) When computing sin() and cos() of the same argument, use sincos() which will be significantly faster.

(6) If hyp() computes the hypotenuse (i.e. sqrt(x*x+y*y)), you can use the standard function hypot() instead, which is supported by CUDA

(7) I can’t tell right away what the computation involving atan() does, but given that it computes an angle from two source operands the use of atan2() may be indicated?
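For example, a rough, untested sketch of what (2), (3), (5) and (7) could look like in croot() and in the D<0 branch of find_t() ((1) is simply x*x instead of pow(x,2.0) wherever squares are taken). This only illustrates the substitutions and is not meant as a drop-in fix:

__device__ double croot(const double x){
	return cbrt(x);    // (2) cbrt() accepts negative arguments, so the sign branch and pow() disappear
}

// inside find_t(), the D<0 branch:
ee=sqrt(-D);
double tmp2=-q;
double s, c;
double angle=2.0*INV3*atan2(ee, sqrt(tmp2*tmp2+ee*ee)+tmp2);   // (7) atan2() instead of atan() of a quotient
sincos(angle, &s, &c);                                         // (5) one sincos() call instead of separate sin() and cos()
ee=sqrt(-p);
z1=2.0*ee*c-offset;
z2=-ee*(c+SQRT3*s)-offset;
z3=-ee*(c-SQRT3*s)-offset;

// and later, instead of the min3 macro:
z1=fmin(z1, fmin(z2, z3));                                     // (3) fmin()/fmax() are the double-precision built-ins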

Thank you very much kbam and njuffa.

Yes kbam, I also expect it to work, but in this particular case it doesn't. I reduced the problem to only these prints and it showed the same behaviour, and looking at the results, they weren't correct. But thanks for your comment.

About your comments, njuffa:

(1) You really don’t want to square by calling pow(x,2.0), that is very slow. Use x*x instead.

Yes, good point. I have changed them.

(2) CUDA provides cube root and reciprocal cube root functions: cbrt(), rcbrt(); much faster (and likely more accurate) than using pow()

Very good point. I didn't know about these functions.

(3) CUDA has built-in, overloaded, min() and max() functions that map directly to hardware instructions.

I am not sure I understood this. Do you mean I should use them or not? I didn't find documentation about these functions for double precision.
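(For doubles there are at least the fmin() and fmax() math functions; assuming those are acceptable, the min3 macro could be written as below, though I haven't benchmarked it:)

#define min3(a,b,c)  (fmin((a), fmin((b), (c))))    // double-precision min via the CUDA math API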

(4) Instead of x / sqrt(y) I would suggest using x * rsqrt(y) unless you need the exact IEEE-754 rounding semantics.
Well, I prefer precision over speed; I've checked that rsqrt() has less precision.

(5) When computing sin() and cos() of the same argument, use sincos() which will be significantly faster.
Yes, I've changed it. Good point :)

(6) If hyp() computes the hypotenuse (i.e. sqrt(x*x+y*y)), you can use the standard function hypot() instead, which is supported by CUDA
As in (4), I've checked that hypot() has less precision, so I am going to keep sqrt(x*x+y*y).

(7) I can’t tell right away what the computation involving atan() does, but given that it computes an angle from two source operands the use of atan2() may be indicated?
Yes, I agree; I will probably change it.

Thanks for taking the time to look at these operations :).

The good news is that after changing (1), (2) and (5), the threads now perform the syncthreads correctly.
The bad news is that I don't know why, and I want to know. As njuffa pointed out, I was using inefficient functions, but they weren't wrong.

So, does anyone have an idea why, in this code (without changing anything), the threads are not performing the syncthreads correctly?
Is there a timeout while waiting at the barrier?