In case anyone was wondering (if there are any code errors, they are due to hand-jamming from my development system which is not attached to the Internet, i.e., no cut and pasting)…
template inline device float floatSum(float val, float *sharedVal);
// sharedVal has at least 8 floats, though it only needs log2(blockDim)
template<> inline device floatSum<32>(float val, float *sharedVal) {
val += __shfl_down_sync(0xFFFFFFFF, val, 16);
val += __shfl_down_sync(0xFFFF, val, 8);
val += __shfl_down_sync(0xFF, val, 4);
val += __shfl_down_sync(0x3, val, 2);
return __shfl_sync(0xFFFFFFFF, val, 0);
}
template<> inline device floatSum<256>(float val, float *sharedVal) {
int warp = threadIdx.x >> 5; // 8 warps
int lane = threadIdx.x % 32; // 32 lanes
val = floatSum<32>(val, sharedVal);
if (lane == 0)
sharedVal[warp] = val;
__syncthreads();
if (warp == 0) {
if (lane < 8)
val = sharedVal[lane];
else
val = 0.0f;
val += __shfl_down_sync(0xFF, val, 4); // <<=== illegal instruction
// compute-sanitizer fails on this line as well, and reports illegal instruction
val += __shfl_down_sync(0xF, val, 2);
val += __shfl_down_sync(0x3, val, 1);
}
__syncthreads();
return __shfl_sync(0xFFFFFFFF, val, 0);
}
I changed the code to the following (which is probably just as fast):
// sharedVal has at least 8 floats
template<> inline device floatSum<256>(float val, float *sharedVal) {
int warp = threadIdx.x >> 5; // 8 warps
int lane = threadIdx.x % 32; // 32 lanes
val = floatSum<32>(val, sharedVal);
if (lane == 0)
sharedVal[warp] = val;
__syncthreads();
if (threadIdx.x == 0) {
val = sharedVal[0];
for (int ii = 1; ii < 8; ii++)
val += sharedVal[ii];
}
__syncthreads();
return __shfl_sync(0xFFFFFFFF, val, 0);
}