I know `__syncwarp()` is a new feature in CUDA 9 and, according to the programming guide, it should only work on Volta, not on the P100. However, when I implement a reduction operation within a warp:

```
// Tree-style minimum reduction over vector[], carrying the matching index[]
// entry along with each value. The step offsets are WARP_SIZE/2, WARP_SIZE/4,
// ..., WARP_SIZE/32. In each step, every thread whose threadIdx.x is below the
// offset compares its slot with the slot `offset` positions higher and, if the
// higher slot holds a strictly smaller value, copies that value and its index
// down. Ties keep the lower slot's entry.
// NOTE(review): assumes WARP_SIZE is an integer constant (presumably 32) and
// that vector[]/index[] have at least WARP_SIZE elements - confirm at the
// declaration site.
#pragma unroll
for (int divisor = 2; divisor <= 32; divisor *= 2)
{
    // A warp barrier separates consecutive steps so that the writes of the
    // previous step are visible before they are read here. There is no
    // barrier before the first step and none after the last one.
    if (divisor > 2)
    {
        __syncwarp();
    }

    const int offset = WARP_SIZE / divisor;

    // Short-circuit && keeps the range check ahead of the array reads.
    if (threadIdx.x < offset &&
        vector[threadIdx.x] > vector[threadIdx.x + offset])
    {
        vector[threadIdx.x] = vector[threadIdx.x + offset];
        index[threadIdx.x] = index[threadIdx.x + offset];
    }
}
```

When I delete the `__syncwarp()` calls, the results are incorrect on the P100 (CUDA 9.0), but they are correct on the K80 and 780 Ti (CUDA 8.0). In my opinion, `__syncwarp()` should be unnecessary on the P100, so why does this happen? Thank you for your help.