Hi there.
I have a complex code where I need to use several times __syncthreads() with a block of 64 threads (2 warps). But it seems that it doesn’t work some times. And the more curious thing is that all the threads reach the barrier except one (thread 0) and I think that this is not possible because all threads of a warp should execute the same instruction.
I have tried to simplify the code and reproduce the error but when I delete any part of the code it run correctly. So I am going to write here only a part of the code, but it doesn’t fail if it is alone (in my code it is inside loops and it’s called from a global function):
reduction array is in shared memory
result is a float in shared memory
signalsA, signalsB, dataA, dataB are in global memory
S1,S2,t1 and t2 arealso in shared memory
and the other arguments are given by value
__device__ inline void func(int id,float *result,double *signalsA,double *signalsB,const double *dataA,const double *dataB,double *reduction,float S1,float* S2,bool con,float t1,float* t2,int m1,int m2,int threadsBlock,int n2,int n2,int HS){
double a;
double b;
double c;
float H= 1.0/(float)HS;
float* count = result;
if (!con){
reduction[id]=0;
for(int i=0; i<m1; i++){
a=0;
int dir;
for (int m=0; m<HR; m++){
dir= m*n1 +id+i*threadsBlock;
a= a+(H*signalsA[dir]/S2[m]);
}
dir=id+i*threadsBlock;
a=S1*a;
a=dataA[dir]-a;
reduction[id]+=a*a;
}
__syncthreads();
for(unsigned int s=threadsBlock>>1; s>0; s>>=1) {
if (id<s){
reduction[id] = reduction[id] + reduction[id+s];
}
__syncthreads();
}
if(id==0){
b=0.5*n1*log(0.5*reduction[0]);
c=0;
}
*count=0;
__syncthreads();
atomicAdd(count, 1);
__syncthreads();
printf("CHECK: %i...%.20f\n",id,*count);
for (int m=0; m<HR; m++){
reduction[id]=0;
for(int i=0; i<m2; i++){
int dir=m*n2+id+i*threadsBlock;
a=dataB[dir]-signalsB[dir];
reduction[id]+=a*a;
}
__syncthreads();
for(unsigned int s=threadsBlock>>1; s>0; s>>=1) {
if (id<s){
reduction[id] = reduction[id] + reduction[id+s];
}
__syncthreads();
}
c+=log(0.5*reduction[0]);
}
if(id==0){
c*=0.5*n2;
*result=b+c;
}
}else{
reduction[id]=0;
for(int i=0; i<m1; i++){
pred=0;
int dir;
for (int m=0; m<HRsize; m++){
dir=m*n1+id+i*threadsBlock;
a= a+(H*signalsA[dir]/S2[m]);
}
dir=id+i*threadsBlock;
pred=S0LR*pred;
reduction[id]+=log(dataA[dir])-0.5*t1*(dataA[dir]*dataA[dir]+a*a)+fun2(t1*dataA[dir]*a);
}
__syncthreads();
for(unsigned int s=threadsBlock>>1; s>0; s>>=1) {
if (id<s){
reduction[id] = reduction[id] + reduction[id+s];
}
__syncthreads();
}
if(id==0){
b=-n1*log(t1);
b-=reduction[0];
likHR=0;
}
for (int m=0; m<HR; m++){
reduction[id]=0;
for(int i=0; i<m2; i++){
int dir=m*n2+id+i*threadsBlock;
a = dataB[dir]*dataB[dir]+signalsB[dir]*signalsB[dir];
reduction[id]+=log(dataB[dir])-0.5*t2[m]*a+fun2(t2[m]*dataB[dir]*signalsB[dir]);
}
__syncthreads();
for(unsigned int s=threadsBlock>>1; s>0; s>>=1) {
if (id<s){
reduction[id] = reduction[id] + reduction[id+s];
}
__syncthreads();
}
if(id==0){
c+=n2*log(t2[m]);
c+=reduction[0];
}
}
if(id==0){
*result=b-c;
}
}
}