As far as I remember the following strange behaviour of __syncthreads() was already partially mentioned in the forum. However I haven’t seen any constructive advices regarding it.

So I have the kernel with the given loop inside it:

**global** void MultiDistance(const int point_size, const int bd_faces_size, FTYPE* glob_dist, const int offset)

{

const int i = blockIdx.x*blockDim.x+threadIdx.x+offset;

float3 pt_cur = make_float3(0,0,0);

```
if(i < point_size)
pt_cur = make_float3(tex1Dfetch(texPt, texPtOfs+i));
FTYPE dist = 1.e6;
//define minimal distance between the current point and the external boundary
for(int j=0; j<bd_faces_size; j++)
{
__shared__ int4 cur_face;
cur_face = tex1Dfetch(texBdFaces, texBdFacesOfs+j);
__shared__ float3 pt[4];
pt[0] = make_float3(tex1Dfetch(texPt, texPtOfs+cur_face.x));
pt[1] = make_float3(tex1Dfetch(texPt, texPtOfs+cur_face.y));
pt[2] = make_float3(tex1Dfetch(texPt, texPtOfs+cur_face.z));
pt[3] = make_float3(tex1Dfetch(texPt, texPtOfs+cur_face.w));
//find a distance from the point to a face only if point's projection belongs to the face
__shared__ float3 a1;
a1 = pt[1] - pt[0];
__shared__ float3 a2;
a2 = pt[2] - pt[0];
__shared__ float3 norm;
norm = vec(a1,a2); //a normal to the face's plane
norm /= length(norm);
const FTYPE proj_dist = fabs(dot(pt_cur-pt[0], norm));
//project all points at the plane
__shared__ float3 vx;
vx = perpendicular(norm.x, norm.y, norm.z);
__shared__ float3 vy;
vy = vec(norm,vx);
__shared__ float2 proj_pt[4], conv_pt[4];
for(volatile int k=0; k<4; k++)
{
proj_pt[k].x = dot(vx,pt[k]);
proj_pt[k].y = dot(vy,pt[k]);
}
//construct a normal polygon on the plane
convexPolygon<4>(proj_pt, conv_pt);
float2 proj_cur;
proj_cur.x = dot(vx,pt_cur);
proj_cur.y = dot(vy,pt_cur);
const FTYPE plane_dist = tempDist2(4, conv_pt, proj_cur);
if(j == -1)
__syncthreads();
if(plane_dist <= 0)
dist = min(dist, proj_dist);
else
dist = min(dist, sqrt(proj_dist*proj_dist + plane_dist));
}
if(i < point_size)
glob_dist[i-offset] = dist;
```

}

There is only one __syncthreads in the kernel particularly

if(j == -1)

__syncthreads();

As one can easily see __syncthreads will never be called. However if I comment it a number of kernel registers will grow from 20 to 38!

The second strange fact is that putting syncthreads without if statement

// if(j == -1)

__syncthreads();

results in cudaErrorLaunchTimeout error. I should mention here that it’s 100% true that all block threads reach this __syncthreads() call (it’s also obvious just from the kernel’s listing). Moreover if I change the cycle count to

for(int j=0; j<2; j++)

the kernel will work with no problems.

However

for(int j=0; j<10; j++)

gives cudaErrorLaunchTimeout error again! :wacko:

The last fact to mention is that watchdog still exists despite I compute on Tesla D870 which is the second card in my Dell Precision 690 under Windows Server 2003 x64 and CUDA 2.0.

Any ideas how to handle these problems?

Thanks in advance.