Re: Cooperative groups are much slower than CUB

Curefab · December 16, 2024, 1:36pm

@Yuki_Ni So it would not just use a numbered cta barrier? But would need additional software overhead?

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar

striker159 · December 16, 2024, 2:09pm

The cooperative groups api is open source. Syncs are in include/cooperative_groups/details/sync.h
multi-warp groups do not seem to use the barrier ptx, but busy waiting on a memory location.

/* - Multi warp groups synchronization routines - */

#ifdef _CG_CPP11_FEATURES
// Need both acquire and release for the last warp, since it won't be able to acquire with red.and
_CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
    unsigned int old;
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
    (asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
    ,
    (__threadfence_block();
    old = atomicOr(addr, val);)
    );
    return old;
}

// Special case where barrier is arrived, but not waited on
_CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
    (asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
    ,
    (__threadfence_block();
    atomicOr(addr, val);)
    );
}

// Usually called by last arriving warp to released other warps, can be relaxed, since or was already acq_rel
_CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
    (asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
    ,
    (atomicAnd(addr, val);)
    );
}

// Special case of release, where last warp was doing extra work before releasing others, need to be release
//  to ensure that extra work is visible
_CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
    (asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
    ,
    (__threadfence_block();
    atomicAnd(addr, val);)
    );
}

// Read the barrier, acquire to ensure all memory operations following the sync are correctly performed after it is released
_CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
    unsigned int val;
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
    (asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");)
    ,
    (val = *((volatile unsigned int*) addr);
    __threadfence_block();)
    );
    return val;
}

// Get synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them,
// thread ranks 32..63 second etc 
// Bit masks are unique for each group, groups of the same size will have the same number of bits set, but on different positions 
_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
    return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32)));
}

_CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) {
    while(ld_acquire_cta(arrived) & warp_bit);
}

// Default blocking sync.
_CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
    unsigned int warp_id = thread_rank / 32;
    bool warp_master = (thread_rank % 32 == 0);
    unsigned int warp_bit = 1 << warp_id;
    unsigned int group_mask = get_group_mask(thread_rank, num_warps);

    __syncwarp(0xFFFFFFFF);

    if (warp_master) {
        unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit);
        if (((old | warp_bit) & group_mask) == group_mask) {
            red_and_relaxed_cta(arrived, ~group_mask);
        }
        else {
            barrier_wait(arrived, warp_bit);
        }
    }

    __syncwarp(0xFFFFFFFF);
}

// Blocking sync, except the last arriving warp, that releases other warps, returns to do other stuff first.
// Warp returning true from this function needs to call sync_warps_release.
_CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
    unsigned int warp_id = thread_rank / 32;
    bool warp_master = (thread_rank % 32 == 0);
    unsigned int warp_bit = 1 << warp_id;
    unsigned int group_mask = get_group_mask(thread_rank, num_warps);

    __syncwarp(0xFFFFFFFF);

    unsigned int old = 0;
    if (warp_master) {
        old = atom_or_acq_rel_cta(arrived, warp_bit);
    }
    old = __shfl_sync(0xFFFFFFFF, old, 0);
    if (((old | warp_bit) & group_mask) == group_mask) {
        return true;
    }
    barrier_wait(arrived, warp_bit);

    return false;
}

// Release my group from the barrier.
_CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
    if (is_master) {
        red_and_release_cta(arrived, ~group_mask);
    }
}

// Arrive at my group barrier, but don't block or release the barrier, even if every one arrives.
// sync_warps_release needs to be called by some warp after this one to reset the barrier.
_CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
    unsigned int warp_id = thread_rank / 32;
    bool warp_master = (thread_rank % 32 == 0);
    unsigned int warp_bit = 1 << warp_id;
    unsigned int group_mask = get_group_mask(thread_rank, num_warps);

    __syncwarp(0xFFFFFFFF);

    if (warp_master) {
        red_or_release_cta(arrived, warp_bit);
    }
}
// Wait for my warp to be released from the barrier. Warp must have arrived first.
_CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
    unsigned int warp_id = thread_rank / 32;
    unsigned int warp_bit = 1 << warp_id;

    barrier_wait(arrived, warp_bit);
}

// Wait for specific warp to arrive at the barrier
_CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
    unsigned int wait_mask = 1 << wait_warp_id;
    while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
}

// Initialize the bit corresponding to my warp in the barrier
_CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
    unsigned int warp_id = thread_rank / 32;
    unsigned int warp_bit = 1 << warp_id;

    __syncwarp(0xFFFFFFFF);

    if (thread_rank % 32 == 0) {
        red_and_release_cta(arrived, ~warp_bit);
    }
    // No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
}

Topic		Replies	Views
Cooperative groups are much slower than CUB CUDA Programming and Performance cuda	6	148	December 16, 2024
Cooperative Groups: Flexible CUDA Thread Programming Technical Blog	32	12397	February 7, 2023
What is the usage of cooperative group? CUDA Programming and Performance	10	1174	September 19, 2023
Race condition within warp CUDA Programming and Performance	9	2868	September 20, 2016
Warp-level reduction before atomic accumulation CUDA Programming and Performance	7	1582	December 13, 2022
Best way to pack bits into words for global memory Better than reduce in shared memory? CUDA Programming and Performance	17	6675	June 2, 2012
about the __syncwarp() in P100 CUDA Programming and Performance	11	4037	June 6, 2018
CUDA Kernel self-suspension ? Can a CUDA Kernel conditionally suspend its execution ? CUDA Programming and Performance	46	45178	April 17, 2011
Controlling context switching in CUDA CUDA Programming and Performance	16	4436	April 29, 2013
How to launch CUDA Cooperative Groups Standard Deviation example kernel? CUDA Programming and Performance	11	2558	February 12, 2023

Re: Cooperative groups are much slower than CUB

Related topics