__shfl_down_sync weird behavior

I am trying to accelerate my reduce kernel using warp shuffle.

When the array size is a multiple of 32, the code works well. For the general case I need to check the boundary condition, so I made the following test case: the input_data length is 10 and the block size is 1024, so only some threads (10) of the first warp participate in the shuffle.

        for (size_t i = 16; i > 0; i = i >> 1) {
            if ((i + laneIdx) < count1) {
                printf("delta= %d, threadx = %d, thread_sum = %f\n", i, threadIdx.x, thread_sum);
                thread_sum = CudaReduceOperate<T, reduce_type>(thread_sum, __shfl_down_sync(mask, thread_sum, i));
            }
        }

count1 is calculated as the number of active threads in the warp; in this case, count1 == 10. In my view, when delta = 16 no thread does the reduction op; when delta = 8, threads 0 and 1 do the reduction op, and so on.
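To make that concrete, here is a quick host-side enumeration (just an illustration, not part of the kernel) of which lanes pass the guard (i + laneIdx) < count1 for count1 == 10:

#include <cstdio>

int main() {
    const int count1 = 10;
    for (int i = 16; i > 0; i >>= 1) {
        std::printf("delta = %2d: lanes", i);
        for (int lane = 0; lane < 32; ++lane) {
            if (i + lane < count1) std::printf(" %d", lane);
        }
        std::printf("\n");
    }
    return 0;
}
// delta = 16: no lanes
// delta =  8: lanes 0 1
// delta =  4: lanes 0..5
// delta =  2: lanes 0..7
// delta =  1: lanes 0..8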

However, I find that they all equal 0 in the output. Here is the complete code and output:

const int kBlockDimReduce = 1024;

template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceOperate(T in1, T in2) -> enable_if_t<reduce_type == ReduceType::kSum, T> {
    return in1 + in2;
}

template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceOperate(T in1, T in2) -> enable_if_t<reduce_type == ReduceType::kMax, T> {
    return in1 > in2 ? in1 : in2;
}

template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceOperate(T in1, T in2) -> enable_if_t<reduce_type == ReduceType::kMin, T> {
    return in1 > in2 ? in2 : in1;
}

template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceAtomicOperate(T *addr, T value)
    -> enable_if_t<reduce_type == ReduceType::kSum, void> {
    atomicAdd(addr, value);
}

template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceAtomicOperate(T *addr, T value)
    -> enable_if_t<reduce_type == ReduceType::kMax, void> {
    atomicMax(addr, value);
}

template <typename T, ReduceType reduce_type>
static inline __device__ auto CudaReduceAtomicOperate(T *addr, T value)
    -> enable_if_t<reduce_type == ReduceType::kMin, void> {
    atomicMin(addr, value);
}

template <typename T, ReduceType reduce_type>
__forceinline__ __device__ T warpReduce(unsigned int mask, T localSum) {
#pragma unroll
    for (size_t i = 16; i > 0; i = i >> 1) {
        localSum = CudaReduceOperate<T, reduce_type>(__shfl_down_sync(mask, localSum, i), localSum);
        printf("fullwarp round = %d threadx = %d, local = %f\n", i, threadIdx.x, localSum);
    }
    return localSum;
}

template <typename T, ReduceType reduce_type>
__global__ void CudaReduceKernel(void *input_data, int length, void *output_data) {
    __shared__ T sdata[kBlockDimReduce];

    int idx = 8 * blockIdx.x * blockDim.x + threadIdx.x;
    // shuffle init (must be done first, otherwise the maskWindUp won't take effect)
    const int laneIdx = threadIdx.x % 32;
    const int warpIdx = threadIdx.x / 32;
    unsigned int maskLength;
    __shared__ unsigned int maskWindUp;
    bool is_full_warp = false;
    int remain_data = length - (8 * blockIdx.x * blockDim.x + 32 * (warpIdx + 1));

    if (remain_data >= 0) {
        maskLength = 31;  // 31 = warpSize-1
        is_full_warp = true;
        maskWindUp |= (1 << warpIdx);
    } else {
        if (remain_data > -32) {
            maskLength = 32 + remain_data;
            maskWindUp |= (1 << warpIdx);
        } else {
            maskLength = 0;
            // this warp is invalid
            maskWindUp &= ~(1 << warpIdx);
        }
    }
    printf("threadx = %d, remain_data = %d, maskWindUp = %d\n", threadIdx.x, remain_data, maskWindUp);
    if (idx >= length) {
        return;
    }

    sdata[threadIdx.x] = static_cast<T *>(input_data)[idx];

    // unrolling 8
    if (idx + 7 * blockDim.x < length) {
#pragma unroll
        for (int i = 1; i < 8; i++) {
            sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(sdata[threadIdx.x],
                                                                   static_cast<T *>(input_data)[idx + i * blockDim.x]);
        }
    } else {
        // the last block (boundary condition)
        const int remain_data = length - 8 * blockIdx.x * blockDim.x;
        const int n_unrolling_need = (remain_data + blockDim.x - 1) / blockDim.x;
#pragma unroll
        for (size_t i = 1; i < n_unrolling_need; i++) {
            if (idx + i * blockDim.x < length) {
                sdata[threadIdx.x] = CudaReduceOperate<T, reduce_type>(
                    sdata[threadIdx.x], static_cast<T *>(input_data)[idx + i * blockDim.x]);
            }
        }
    }

    __syncthreads();
    // shuffle inside warp
    maskLength = maskLength & length;
    maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength;
    const unsigned int mask = (0xffffffff) >> maskLength;
    maskWindUp = 1;
    printf("mask --> %d maskWindUp -->%d\n", mask, maskWindUp);

    __shared__ T warps_sum[32];

    T thread_sum = sdata[threadIdx.x];
    __syncthreads();
    printf("reduce result before %f, laneID = %d\n", thread_sum, laneIdx);
    // 1. warp shuffle
    if (is_full_warp) {
        thread_sum = warpReduce<T, reduce_type>(mask, thread_sum);
    } else {
        int count1 = 0;
        unsigned int value = mask;
        while (value) {
            value &= value - 1;
            count1++;
        }
        printf("threadx = %d, count1 = %d, laneID = %d\n", threadIdx.x, count1, laneIdx);
        for (size_t i = 16; i > 0; i = i >> 1) {
            if ((i + laneIdx) < count1) {
                thread_sum = CudaReduceOperate<T, reduce_type>(thread_sum, __shfl_down_sync(mask, thread_sum, i));
            }
        }
    }

    printf("threadx = %d, reduce result %f\n", threadIdx.x, thread_sum);
   
    if (threadIdx.x == 0) {
        CudaReduceAtomicOperate<T, reduce_type>(static_cast<T *>(output_data), thread_sum);
    }
    return;
}

template <DataType data_type, ReduceType reduce_type>
void CudaReduce(void *input_data, int length, void *output_data, cudaStream_t stream) {
#if __CUDA_ARCH__ < 700
    static_assert(data_type != DataType::kHalf, "CudaReduce does not support kHalf with __CUDA_ARCH__ < 700");
#endif

    // forbid call for kInt8 kSum
    static_assert(data_type != DataType::kInt8 || reduce_type != ReduceType::kSum,
                  "CudaReduce does not support sum of int8");
    dim3 block(cudaop_constant::kBlockDimReduce);
    const int gridsize = (length + 8 * block.x - 1) / (8 * block.x);
    dim3 grid(gridsize);
    // std::cout << "gridsize " << gridsize << "length " << length << std::endl;
    CudaReduceKernel<typename DataTypeTraits<data_type>::Type, reduce_type>
        <<<grid, block, 0, stream>>>(input_data, length, output_data);
}

the output is:

reduce result before 0.333333, laneID = 0
reduce result before 0.247059, laneID = 1
reduce result before 0.600000, laneID = 2
reduce result before 0.658824, laneID = 3
reduce result before 0.992157, laneID = 4
reduce result before 0.760784, laneID = 5
reduce result before 0.368627, laneID = 6
reduce result before 0.156863, laneID = 7
reduce result before 0.509804, laneID = 8
reduce result before 0.635294, laneID = 9
threadx = 0, count1 = 10, laneID = 0
threadx = 1, count1 = 10, laneID = 1
threadx = 2, count1 = 10, laneID = 2
threadx = 3, count1 = 10, laneID = 3
threadx = 4, count1 = 10, laneID = 4
threadx = 5, count1 = 10, laneID = 5
threadx = 6, count1 = 10, laneID = 6
threadx = 7, count1 = 10, laneID = 7
threadx = 8, count1 = 10, laneID = 8
threadx = 9, count1 = 10, laneID = 9
income round = 8 threadx = 0, thread_sum = 0.000000
income round = 8 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
threadx = 9, reduce result 0.635294
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 4 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
threadx = 8, reduce result 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
threadx = 6, reduce result 0.000000
threadx = 7, reduce result 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 2 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
income round = 1 threadx = 0, thread_sum = 0.000000
threadx = 2, reduce result 0.000000
threadx = 3, reduce result 0.000000
threadx = 4, reduce result 0.000000
threadx = 5, reduce result 0.000000
threadx = 0, reduce result 0.000000
threadx = 1, reduce result 0.000000

All the thread_sum values are 0 after the reduce, which is weird. Moreover, the printf output near the shuffle looks abnormal.
Thanks in advance for the reply!

Shared variables are uninitialized. You have to initialize maskWindUp before using it.
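A minimal sketch of what I mean (reusing your laneIdx/warpIdx; note that a plain |= from many threads on a shared variable is also a race, so I would set the bit with atomicOr):

__shared__ unsigned int maskWindUp;
if (threadIdx.x == 0) {
    maskWindUp = 0;  // shared memory is not zero-initialized
}
__syncthreads();     // make the initialization visible to the whole block
// later, one lane per warp sets its bit atomically:
if (laneIdx == 0) {
    atomicOr(&maskWindUp, 1u << warpIdx);
}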

The shuffle mask is probably incorrect. Did you verify that only the threads for which the surrounding if is true are present in the mask of the shuffle call?
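One way to check that at runtime is a diagnostic like this (a sketch; __activemask() reports which lanes are currently active, which is enough to spot a mismatch):

// right before the shuffle, inside the same branch:
unsigned int active = __activemask();
if ((active & mask) != mask && laneIdx == 0) {
    printf("warp %d: mask 0x%08x names lanes that are not active (0x%08x)\n",
           warpIdx, mask, active);
}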

Thanks for the reminder. However, in this case maskWindUp has nothing to do with the shuffle. It was designed to mark which warps are valid after the shuffle; the first warp then does a "wind up" over the warp sums. Since only the first warp takes effect in this test, that mask is never used.
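For context, the wind-up I have in mind is the usual two-level pattern, roughly like this (a simplified sketch assuming a sum reduction, so padding with 0 is safe):

if (laneIdx == 0) {
    warps_sum[warpIdx] = thread_sum;  // one partial sum per warp
}
__syncthreads();
if (warpIdx == 0) {
    T s = (laneIdx < blockDim.x / 32) ? warps_sum[laneIdx] : T(0);
    s = warpReduce<T, reduce_type>(0xffffffff, s);  // reduce the warp sums
    if (laneIdx == 0) {
        CudaReduceAtomicOperate<T, reduce_type>(static_cast<T *>(output_data), s);
    }
}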

On the second question: yes, I verified it through the printf. There are exactly 10 threads in the shuffle call, and those lanes are marked as active in the mask.

My problem is that thread_sum becomes 0 after the shuffle. It should hold at least one of the input values, but it does not.

I think there are fewer than 10 threads executing the shuffle call, and a different number of threads in each iteration.

count1 is 10. For i = 16, no thread will shuffle. For i = 8, only laneIdx 0 and 1 will shuffle. That iteration would require the mask 0b11 instead of 0b1111111111 (all 10 lanes).
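One more detail: the lanes you read from must also participate in the shuffle, otherwise the returned value is undefined. A common alternative (my own sketch, not verified against your full kernel) keeps one constant mask over all count1 lanes, lets all of them execute every shuffle, and gates only the accumulation:

const unsigned int group_mask = (count1 >= 32) ? 0xffffffffu : ((1u << count1) - 1u);
if (laneIdx < count1) {
    for (int i = 16; i > 0; i >>= 1) {
        T other = __shfl_down_sync(group_mask, thread_sum, i);
        if (laneIdx + i < count1) {  // only accumulate values from in-range lanes
            thread_sum = CudaReduceOperate<T, reduce_type>(thread_sum, other);
        }
    }
}
// lane 0 now holds the reduction over lanes 0..count1-1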

I see, I will give it a try. Thanks a lot!

For the sake of completeness, there are also simpler ways to achieve your goal.

  1. Always use a full warp and pad out-of-bounds elements with the neutral element, i.e. 0 for a sum reduction (threads 0-9 contribute values from the array, threads 10-31 contribute 0); see the sketch after this list.

  2. Use the cooperative groups API for the reduction:

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

if (is_full_warp) {
    auto group = cg::tiled_partition<32>(cg::this_thread_block());
    thread_sum = cg::reduce(group, thread_sum, cg::plus<T>());
} else {
    if (laneIdx < count1) {
        auto group = cg::coalesced_threads();  // exactly the currently active threads
        thread_sum = cg::reduce(group, thread_sum, cg::plus<T>());
    }
}
  3. Use a library like Thrust or CUB; a CUB sketch follows below.
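A sketch of option 1 (assuming a sum reduction, so 0 is the neutral element; idx, length, and input_data as in your kernel):

// every lane executes with the full mask; out-of-range lanes contribute 0
T thread_sum = (idx < length) ? static_cast<T *>(input_data)[idx] : T(0);
for (int offset = 16; offset > 0; offset >>= 1) {
    thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset);
}
// lane 0 now holds the warp's sum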
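And a sketch of option 3 with CUB's device-wide reduction (d_in and d_out stand for your typed device pointers; the first call only queries the temporary-storage size):

#include <cub/cub.cuh>

void *d_temp = nullptr;
size_t temp_bytes = 0;
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, length, stream);  // size query
cudaMalloc(&d_temp, temp_bytes);
cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, length, stream);  // reduction
cudaFree(d_temp);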