@Yuki_Ni So it would not just use a numbered cta barrier? But would need additional software overhead?
The cooperative groups api is open source. Syncs are in include/cooperative_groups/details/sync.h
multi-warp groups do not seem to use the barrier ptx, but busy waiting on a memory location.
/* - Multi warp groups synchronization routines - */
#ifdef _CG_CPP11_FEATURES
// Need both acquire and release for the last warp, since it won't be able to acquire with red.and
_CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
unsigned int old;
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
(asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
,
(__threadfence_block();
old = atomicOr(addr, val);)
);
return old;
}
// Special case where barrier is arrived, but not waited on
_CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
(asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
,
(__threadfence_block();
atomicOr(addr, val);)
);
}
// Usually called by last arriving warp to released other warps, can be relaxed, since or was already acq_rel
_CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
(asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
,
(atomicAnd(addr, val);)
);
}
// Special case of release, where last warp was doing extra work before releasing others, need to be release
// to ensure that extra work is visible
_CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
(asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");)
,
(__threadfence_block();
atomicAnd(addr, val);)
);
}
// Read the barrier, acquire to ensure all memory operations following the sync are correctly performed after it is released
_CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
unsigned int val;
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
(asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");)
,
(val = *((volatile unsigned int*) addr);
__threadfence_block();)
);
return val;
}
// Get synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them,
// thread ranks 32..63 second etc
// Bit masks are unique for each group, groups of the same size will have the same number of bits set, but on different positions
_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32)));
}
_CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) {
while(ld_acquire_cta(arrived) & warp_bit);
}
// Default blocking sync.
_CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
unsigned int warp_id = thread_rank / 32;
bool warp_master = (thread_rank % 32 == 0);
unsigned int warp_bit = 1 << warp_id;
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
__syncwarp(0xFFFFFFFF);
if (warp_master) {
unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit);
if (((old | warp_bit) & group_mask) == group_mask) {
red_and_relaxed_cta(arrived, ~group_mask);
}
else {
barrier_wait(arrived, warp_bit);
}
}
__syncwarp(0xFFFFFFFF);
}
// Blocking sync, except the last arriving warp, that releases other warps, returns to do other stuff first.
// Warp returning true from this function needs to call sync_warps_release.
_CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
unsigned int warp_id = thread_rank / 32;
bool warp_master = (thread_rank % 32 == 0);
unsigned int warp_bit = 1 << warp_id;
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
__syncwarp(0xFFFFFFFF);
unsigned int old = 0;
if (warp_master) {
old = atom_or_acq_rel_cta(arrived, warp_bit);
}
old = __shfl_sync(0xFFFFFFFF, old, 0);
if (((old | warp_bit) & group_mask) == group_mask) {
return true;
}
barrier_wait(arrived, warp_bit);
return false;
}
// Release my group from the barrier.
_CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
if (is_master) {
red_and_release_cta(arrived, ~group_mask);
}
}
// Arrive at my group barrier, but don't block or release the barrier, even if every one arrives.
// sync_warps_release needs to be called by some warp after this one to reset the barrier.
_CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
unsigned int warp_id = thread_rank / 32;
bool warp_master = (thread_rank % 32 == 0);
unsigned int warp_bit = 1 << warp_id;
unsigned int group_mask = get_group_mask(thread_rank, num_warps);
__syncwarp(0xFFFFFFFF);
if (warp_master) {
red_or_release_cta(arrived, warp_bit);
}
}
// Wait for my warp to be released from the barrier. Warp must have arrived first.
_CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
unsigned int warp_id = thread_rank / 32;
unsigned int warp_bit = 1 << warp_id;
barrier_wait(arrived, warp_bit);
}
// Wait for specific warp to arrive at the barrier
_CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
unsigned int wait_mask = 1 << wait_warp_id;
while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
}
// Initialize the bit corresponding to my warp in the barrier
_CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
unsigned int warp_id = thread_rank / 32;
unsigned int warp_bit = 1 << warp_id;
__syncwarp(0xFFFFFFFF);
if (thread_rank % 32 == 0) {
red_and_release_cta(arrived, ~warp_bit);
}
// No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
}