Is __syncthreads() abnormal in for loop?

Hello, I have a problem problem about __syncthreads() in for loop.

I write two cumulative example
which reference nvidia document(Optimizing Parallel Reduction in CUDA),
The result is correct in running 1 time ,
but the result is wrong in many times using for loop.
It seem that __syncthreads() is abnormal in for loop
can anyone know the reason?
Thank you

reference
Optimizing Parallel Reduction in CUDA
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf

///////////////////////////////////////////////////////////////////////////////////////////////////
// case 1

#include “cuda_runtime.h”
#include <stdio.h>

#define arraySize 1024 // power of 2

global void addKernel()
{

int tid = threadIdx.x;
int dataSize = arraySize;
__shared__ int sdata[arraySize];

for(int d=0;d<10;d++)
{	
	sdata[tid] =1; 
	__syncthreads();
	
	if(dataSize>=1024){if(tid < 512){sdata[tid] += sdata[tid+512];	__syncthreads();} }
	if(dataSize>=512) {if(tid < 256){sdata[tid] += sdata[tid+256];	__syncthreads();} }
	if(dataSize>=256) {if(tid < 128){sdata[tid] += sdata[tid+128];	__syncthreads();} }
	if(dataSize>=128) {if(tid <  64){sdata[tid] += sdata[tid+ 64];	__syncthreads();} }	
	
	if(tid < 32)	
	{
		volatile int *smem = sdata;
		if (dataSize >=  64){smem[tid]+=smem[tid+32];}
		if (dataSize >=  32){smem[tid]+=smem[tid+16];}
		if (dataSize >=  16){smem[tid]+=smem[tid+8]; }
		if (dataSize >=  8) {smem[tid]+=smem[tid+4]; }
		if (dataSize >=  4) {smem[tid]+=smem[tid+2]; }	
		if (dataSize >=  2) {smem[tid]+=smem[tid+1]; }
	}
	__syncthreads();
	
	if(tid==0) 	printf("d=%d sdata[0]=%d\n",d,sdata[0]);

	__syncthreads();
}

}

int main()
{
addKernel<<<1, arraySize>>>();
cudaDeviceSynchronize();

return 0;

}
///////////////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////////////////////////
//case 2

#include “cuda_runtime.h”
#include <stdio.h>

#define arraySize 1024

global void addKernel()
{

int tid = threadIdx.x;
int dataSize = arraySize;
__shared__ int sdata[arraySize];

for(int d=0;d<10;d++)
{	
	sdata[tid] =1; 
	int mySum = sdata[tid];
	__syncthreads();
	
	if(dataSize>=1024){if(tid < 512){sdata[tid] = mySum = mySum + sdata[tid+512];	__syncthreads();} }
	if(dataSize>=512) {if(tid < 256){sdata[tid] = mySum = mySum + sdata[tid+256];	__syncthreads();} }
	if(dataSize>=256) {if(tid < 128){sdata[tid] = mySum = mySum + sdata[tid+128];	__syncthreads();} }
	if(dataSize>=128) {if(tid <  64){sdata[tid] = mySum = mySum + sdata[tid+ 64];	__syncthreads();} }	
	
	if(tid < 32)	
	{
		volatile int *smem = sdata;
		if (dataSize >=  64){smem[tid]= mySum = mySum +smem[tid+32];}
		if (dataSize >=  32){smem[tid]= mySum = mySum +smem[tid+16];}
		if (dataSize >=  16){smem[tid]= mySum = mySum +smem[tid+8]; }
		if (dataSize >=  8) {smem[tid]= mySum = mySum +smem[tid+4]; }
		if (dataSize >=  4) {smem[tid]= mySum = mySum +smem[tid+2]; }	
		if (dataSize >=  2) {smem[tid]= mySum = mySum +smem[tid+1]; }
	}
	__syncthreads();
	
	if(tid==0) 	printf("--d=%d sdata[0]=%d\n",d,sdata[0]);

	__syncthreads();
}

}

int main()
{
addKernel<<<1, arraySize>>>();
cudaDeviceSynchronize();

return 0;

}

///////////////////////////////////////////////////////////////////////////////////////////////////

Hi

Cuda programming manual says
“__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.”

In lines like one below you need to move the syncthreads outside the } so that all threads in the block execute it.
if(dataSize>=1024){if(tid < 512){sdata[tid] = mySum = mySum + sdata[tid+512]; __syncthreads();} }

Dear kbam:
Problem solved using your suggested approach,
thank you very much.

Nelson

Also the CUDA Programming guide suggest to rather use the threadsfence().