Hello, I have a problem problem about __syncthreads() in for loop.
I write two cumulative example
which reference nvidia document(Optimizing Parallel Reduction in CUDA),
The result is correct in running 1 time ,
but the result is wrong in many times using for loop.
It seem that __syncthreads() is abnormal in for loop
can anyone know the reason?
Thank you
reference
Optimizing Parallel Reduction in CUDA
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
///////////////////////////////////////////////////////////////////////////////////////////////////
// case 1
#include “cuda_runtime.h”
#include <stdio.h>
#define arraySize 1024 // power of 2
global void addKernel()
{
int tid = threadIdx.x;
int dataSize = arraySize;
__shared__ int sdata[arraySize];
for(int d=0;d<10;d++)
{
sdata[tid] =1;
__syncthreads();
if(dataSize>=1024){if(tid < 512){sdata[tid] += sdata[tid+512]; __syncthreads();} }
if(dataSize>=512) {if(tid < 256){sdata[tid] += sdata[tid+256]; __syncthreads();} }
if(dataSize>=256) {if(tid < 128){sdata[tid] += sdata[tid+128]; __syncthreads();} }
if(dataSize>=128) {if(tid < 64){sdata[tid] += sdata[tid+ 64]; __syncthreads();} }
if(tid < 32)
{
volatile int *smem = sdata;
if (dataSize >= 64){smem[tid]+=smem[tid+32];}
if (dataSize >= 32){smem[tid]+=smem[tid+16];}
if (dataSize >= 16){smem[tid]+=smem[tid+8]; }
if (dataSize >= 8) {smem[tid]+=smem[tid+4]; }
if (dataSize >= 4) {smem[tid]+=smem[tid+2]; }
if (dataSize >= 2) {smem[tid]+=smem[tid+1]; }
}
__syncthreads();
if(tid==0) printf("d=%d sdata[0]=%d\n",d,sdata[0]);
__syncthreads();
}
}
int main()
{
addKernel<<<1, arraySize>>>();
cudaDeviceSynchronize();
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
//case 2
#include “cuda_runtime.h”
#include <stdio.h>
#define arraySize 1024
global void addKernel()
{
int tid = threadIdx.x;
int dataSize = arraySize;
__shared__ int sdata[arraySize];
for(int d=0;d<10;d++)
{
sdata[tid] =1;
int mySum = sdata[tid];
__syncthreads();
if(dataSize>=1024){if(tid < 512){sdata[tid] = mySum = mySum + sdata[tid+512]; __syncthreads();} }
if(dataSize>=512) {if(tid < 256){sdata[tid] = mySum = mySum + sdata[tid+256]; __syncthreads();} }
if(dataSize>=256) {if(tid < 128){sdata[tid] = mySum = mySum + sdata[tid+128]; __syncthreads();} }
if(dataSize>=128) {if(tid < 64){sdata[tid] = mySum = mySum + sdata[tid+ 64]; __syncthreads();} }
if(tid < 32)
{
volatile int *smem = sdata;
if (dataSize >= 64){smem[tid]= mySum = mySum +smem[tid+32];}
if (dataSize >= 32){smem[tid]= mySum = mySum +smem[tid+16];}
if (dataSize >= 16){smem[tid]= mySum = mySum +smem[tid+8]; }
if (dataSize >= 8) {smem[tid]= mySum = mySum +smem[tid+4]; }
if (dataSize >= 4) {smem[tid]= mySum = mySum +smem[tid+2]; }
if (dataSize >= 2) {smem[tid]= mySum = mySum +smem[tid+1]; }
}
__syncthreads();
if(tid==0) printf("--d=%d sdata[0]=%d\n",d,sdata[0]);
__syncthreads();
}
}
int main()
{
addKernel<<<1, arraySize>>>();
cudaDeviceSynchronize();
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////