You need a `__syncthreads()` at line 13. As written, warp 0 can proceed to line 14 before the other warps have had a chance to deposit their partial reduction results into shared memory, so warp 0 may read stale (zeroed) values.
cuda-memcheck has a subtool (`--tool racecheck`) that can discover shared-memory race conditions, and it will flag this one. (On newer CUDA toolkits the equivalent is `compute-sanitizer --tool racecheck`.)
Here is a fully worked example, around what you have shown:
$ cat t99.cu
#include <stdio.h>
const int block_size = 1024;   // threads per block (32 warps — the max on most devices)
const int nblk = 1;            // number of blocks in the grid
const int n = nblk*block_size; // total number of input elements
// Block-wide sum reduction using the classic two-level warp-shuffle scheme:
// each warp reduces its 32 lanes with shuffles, lane 0 of each warp stores its
// partial into shared memory, and warp 0 then reduces those partials.
// Only thread 0's return value is the full block sum; other threads return
// their own partially-reduced values.
//
// NOTE: `len` is unused in this body. `__shfl_down` (mask-less) is deprecated
// and removed on Volta+; modern code should use __shfl_down_sync(0xffffffff, ...).
//
// DELIBERATE BUG for demonstration: unless USE_FIX is defined, there is no
// barrier between the shared-memory writes (partials) and warp 0's reads,
// which is exactly the race the surrounding answer discusses.
__device__ float reduce(float val, int tid, int len) {
const int warpSize = 32;
// one slot per warp (1024 threads / 32 lanes = 32 partials)
__shared__ float shm[32];
// level 1: intra-warp reduction; lane 0 of each warp ends with the warp sum
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
// warp 0 zeroes the partials array (covers warps that may not contribute)
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
// lane 0 of every warp deposits its partial sum
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
#ifdef USE_FIX
// the missing barrier: without it, warp 0 can read shm before other
// warps have written their partials
__syncthreads();
#endif
// level 2: warp 0 reduces the per-warp partials; thread 0 gets the total
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
}
return val;
}
// Reduction driver kernel: each thread loads one element of `data`, the block
// reduces them via reduce(), and thread 0 writes the block sum to
// result[blockIdx.x]. Expects a 1-D launch with blockDim.x == block_size.
__global__ void r(float *data, float *result){
    int gidx = threadIdx.x + blockDim.x * blockIdx.x;
    float elem = data[gidx];
    float blocksum = reduce(elem, threadIdx.x, block_size);
    if (threadIdx.x == 0) {
        result[blockIdx.x] = blocksum;
    }
}
// Host driver: fills the input with 1.0f so the expected block sum equals n,
// runs the reduction kernel, and prints the result next to the expected value
// followed by the last CUDA error string. Fix over the original: host and
// device buffers are now released before exit (the original leaked all four).
int main(){
    float *h_data, *d_data, *h_result, *d_result;
    h_data = (float *)malloc(n*sizeof(float));
    h_result = (float *)malloc(nblk*sizeof(float));
    cudaMalloc(&d_data, n*sizeof(float));
    cudaMalloc(&d_result, nblk*sizeof(float));
    // all-ones input makes the correct total exactly n
    for (int i = 0; i < n; i++) h_data[i] = 1.0f;
    cudaMemcpy(d_data, h_data, n*sizeof(float), cudaMemcpyHostToDevice);
    r<<<nblk, block_size>>>(d_data, d_result);
    // blocking D2H copy also synchronizes with the kernel
    cudaMemcpy(h_result, d_result, nblk*sizeof(float), cudaMemcpyDeviceToHost);
    float result = 0.0f;
    for (int i = 0; i < nblk; i++) result += h_result[i];
    printf(" result: %f should be: %f\n", result, (float)n);
    // reports launch/async errors (e.g. if block_size exceeds the device limit)
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    // release device and host buffers (missing in the original)
    cudaFree(d_result);
    cudaFree(d_data);
    free(h_result);
    free(h_data);
    return 0;
}
$ nvcc -Wno-deprecated-declarations -lineinfo t99.cu -o t99
$ ./t99
result: 928.000000 should be: 1024.000000
no error
$ cuda-memcheck --tool racecheck ./t99
========= CUDA-MEMCHECK
result: 1024.000000 should be: 1024.000000
no error
========= ERROR: Race reported between Write access at 0x00000148 in /home/user2/misc/t99.cu:15:r(float*, float*)
========= and Read access at 0x00000158 in /home/user2/misc/t99.cu:21:r(float*, float*) [124 hazards]
=========
========= RACECHECK SUMMARY: 1 hazard displayed (1 error, 0 warnings)
$ nvcc -Wno-deprecated-declarations -lineinfo t99.cu -o t99 -DUSE_FIX
$ ./t99
result: 1024.000000 should be: 1024.000000
no error
$ cuda-memcheck --tool racecheck ./t99
========= CUDA-MEMCHECK
result: 1024.000000 should be: 1024.000000
no error
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$
We can observe that your code as-is produces an incorrect result. When we run it with cuda-memcheck with the racecheck subtool, it points out that there is a race condition. When we add the necessary synchronization, then the results are correct and the racecheck tool reports no issues.