I am trying to accelerate my reduce kernel using warp shuffle.

When the array size is a multiple of 32，the code works well. When it comes to general cases, I need to check the boundary condition. I made my test case as below:

the input_data length is 10, using a block length = 1024, so only partial threads (10) of the first warp will participate in the shuffle.

```
for (size_t i = 16; i > 0; i = i >> 1) {
if ((i + laneIdx) < count1) {
printf("delta= %d, threadx = %d, thread_sum = %f\n", i, threadIdx.x, thread_sum);
thread_sum = CudaReduceOperate<T, reduce_type>(thread_sum, __shfl_down_sync(mask, thread_sum, i));
}
}
```

count1 is calculated as the number of active thread in the wap and in the case, count1 == 10. In my view, when the delta = 16, no threads would do the reduction op. delta = 8, the thread 0 and 1 would do the reduction op…etc.

However, I find them all equals to 0 in the outputs. Here are the complete code and outputs:

```
const int kBlockDimReduce = 1024;
template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceOperate(T in1, T in2) -> enable_if_t<reduce_type == ReduceType::kSum, T> {
return in1 + in2;
}
template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceOperate(T in1, T in2) -> enable_if_t<reduce_type == ReduceType::kMax, T> {
return in1 > in2 ? in1 : in2;
}
template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceOperate(T in1, T in2) -> enable_if_t<reduce_type == ReduceType::kMin, T> {
return in1 > in2 ? in2 : in1;
}
template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceAtomicOperate(T *addr, T value)
-> enable_if_t<reduce_type == ReduceType::kSum, void> {
atomicAdd(addr, value);
}
template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceAtomicOperate(T *addr, T value)
-> enable_if_t<reduce_type == ReduceType::kMax, void> {
atomicMax(addr, value);
}
template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceAtomicOperate(T *addr, T value)
-> enable_if_t<reduce_type == ReduceType::kMin, void> {
atomicMin(addr, value);
}
template <typename T, ReduceType reduce_type>
__forceinline__ __device__ T warpReduce(unsigned int mask, T localSum) {
#pragma unroll
for (size_t i = 16; i > 0; i = i >> 1) {
localSum = CudaReduceOperate<T, reduce_type>(__shfl_down_sync(mask, localSum, i), localSum);
printf("fullwarp round = %d threadx = %d, local = %f\n", i, threadIdx.x, localSum);
}
return localSum;
}
template <typename T, ReduceType reduce_type>
__global__ void CudaReduceKernel(void *input_data, int length, void *output_data) {
__shared__ T sdata[kBlockDimReduce];
int idx = 8 * blockIdx.x * blockDim.x + threadIdx.x;
// shuffle init (must be done first, otherwise the maskWindUp won't take effect)
const int laneIdx = threadIdx.x % 32;
const int warpIdx = threadIdx.x / 32;
unsigned int maskLength;
__shared__ unsigned int maskWindUp;
bool is_full_warp = false;
int remain_data = length - (8 * blockIdx.x * blockDim.x + 32 * (warpIdx + 1));
if (remain_data >= 0) {
maskLength = 31; // 31 = warpSize-1
is_full_warp = true;
maskWindUp |= (1 << warpIdx);
} else {
if (remain_data > -32) {
maskLength = 32 + remain_data;
maskWindUp |= (1 << warpIdx);
} else {
maskLength = 0;
// this warp is invalid
maskWindUp &= ~(1 << warpIdx);
}
}
printf("threadx = %d, remain_data = %d, maskWindUp = %d\n", threadIdx.x, remain_data, maskWindUp);
if (idx >= length) {
return;
}
sdata[threadIdx.x] = static_cast<T *>(input_data)[idx];
// unrolling 8
if (idx + 7 * blockDim.x < length) {
#pragma unroll
for (int i = 1; i < 8; i++) {
sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(sdata[threadIdx.x],
static_cast<T *>(input_data)[idx + i * blockDim.x]);
}
} else {
// the last block (boundary condition)
const int remain_data = length - 8 * blockIdx.x * blockDim.x;
const int n_unrolling_need = (remain_data + blockDim.x - 1) / blockDim.x;
#pragma unroll
for (size_t i = 1; i < n_unrolling_need; i++) {
if (idx + i * blockDim.x < length) {
sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(
sdata[threadIdx.x], static_cast<T *>(input_data)[idx + i * blockDim.x]);
}
}
}
__syncthreads();
// shuffle inside warp
maskLength = maskLength & length;
maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength;
const unsigned int mask = (0xffffffff) >> maskLength;
maskWindUp = 1;
printf("mask --> %d maskWindUp -->%d\n", mask, maskWindUp);
__shared__ T warps_sum[32];
T thread_sum = sdata[threadIdx.x];
__syncthreads();
printf("reduce result before %f, laneID = %d\n", thread_sum, laneIdx);
// 1. warp shuffle
if (is_full_warp) {
thread_sum = warpReduce<T, reduce_type>(mask, thread_sum);
} else {
int count1 = 0;
unsigned int value = mask;
while (value) {
value &= value - 1;
count1++;
}
printf("threadx = %d, count1 = %d, laneID = %d\n", threadIdx.x, count1, laneIdx);
for (size_t i = 16; i > 0; i = i >> 1) {
if ((i + laneIdx) < count1) {
thread_sum = CudaReduceOperate<T, reduce_type>(thread_sum, __shfl_down_sync(mask, thread_sum, i));
}
}
}
printf("threadx = %d, reduce result %f\n", threadIdx.x, thread_sum);
if (threadIdx.x == 0) {
CudaReduceAtomicOperate<T, reduce_type>(static_cast<T *>(output_data), thread_sum);
}
return;
}
template <DataType data_type, ReduceType reduce_type>
void CudaReduce(void *input_data, int length, void *output_data, cudaStream_t stream) {
#if __CUDA_ARCH__ < 700
static_assert(data_type != DataType::kHalf, "CudaReduce does not support kHalf with __CUDA_ARCH__ < 700");
#endif
// forbid call for kInt8 kSum
static_assert(data_type != DataType::kInt8 || reduce_type != ReduceType::kSum,
"CudaReduce does not support sum of int8");
dim3 block(cudaop_constant::kBlockDimReduce);
const int gridsize = (length + 8 * block.x - 1) / (8 * block.x);
dim3 grid(gridsize);
// std::cout << "gridsize " << gridsize << "length " << length << std::endl;
CudaReduceKernel<typename DataTypeTraits<data_type>::Type, reduce_type>
<<<grid, block, 0, stream>>>(input_data, length, output_data);
}
```

the output is:

```
reduce result before 0.333333, laneID = 0
reduce result before 0.247059, laneID = 1
reduce result before 0.600000, laneID = 2
reduce result before 0.658824, laneID = 3
reduce result before 0.992157, laneID = 4
reduce result before 0.760784, laneID = 5
reduce result before 0.368627, laneID = 6
reduce result before 0.156863, laneID = 7
reduce result before 0.509804, laneID = 8
reduce result before 0.635294, laneID = 9
threadx = 0, count1 = 10, laneID = 0
threadx = 1, count1 = 10, laneID = 1
threadx = 2, count1 = 10, laneID = 2
threadx = 3, count1 = 10, laneID = 3
threadx = 4, count1 = 10, laneID = 4
threadx = 5, count1 = 10, laneID = 5
threadx = 6, count1 = 10, laneID = 6
threadx = 7, count1 = 10, laneID = 7
threadx = 8, count1 = 10, laneID = 8
threadx = 9, count1 = 10, laneID = 9
income round = 8 threadx = 0, thread_sum = 0.000000
income round = 8 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
threadx = 9, reduce result 0.635294
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
threadx = 8, reduce result 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
threadx = 6, reduce result 0.000000
threadx = 7, reduce result 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
threadx = 2, reduce result 0.000000
threadx = 3, reduce result 0.000000
threadx = 4, reduce result 0.000000
threadx = 5, reduce result 0.000000
threadx = 0, reduce result 0.000000
threadx = 1, reduce result 0.000000
```

all the thread_sum == 0 after the reduce, which is weird. Moreover, the printf seems to work abnormal near the shuffle.

Thanks in advance for the reply!