TMA async bulk tensor copy memory consistency

Hi,

I'm testing out TMA for bulk tensor copies from shared to global memory, which requires the cp.async.bulk.wait_group instruction:

https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-wait-group

Judging by its use (in a wrapper) in the CUDA Programming Guide,

  // Initiate TMA transfer to copy shared memory to global memory
  if (threadIdx.x == 0) {
    cde::cp_async_bulk_tensor_2d_shared_to_global(&tensor_map, x, y, &smem_buffer);
    // Wait for TMA transfer to have finished reading shared memory.
    // Create a "bulk async-group" out of the previous bulk copy operation.
    cde::cp_async_bulk_commit_group();
    // Wait for the group to have completed reading from shared memory.
    cde::cp_async_bulk_wait_group_read<0>();
  }

CUDA C++ Programming Guide (see near the bottom of the TMA section)

it would appear that thread 0 issues the copy and waits for it, after which all threads in the block should be able to see the change to global memory. This should especially be true after a __syncthreads() call, which should act as an implicit memory fence.

However, it seems that only the warp containing thread 0 is able to see the change to global memory. Am I missing something with regard to the memory consistency model here? The PTX doc above lists “Writes being made visible to the executing thread.”, but I would have assumed that the __syncthreads() after the wait makes those writes visible to the rest of the block.
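To spell out the pattern I expect to be valid, here is a sketch using the guide's cde:: wrappers (namespace cuda::device::experimental from <cuda/barrier>) rather than my own PTX wrappers below; the assert at the end is the property I assume should hold for every thread in the block:

#include <cassert>
#include <cstdint>
#include <cuda.h>          // CUtensorMap
#include <cuda/barrier>
namespace cde = cuda::device::experimental;

static __global__ void expected_pattern(const __grid_constant__ CUtensorMap tensor_map,
                                        const int32_t *a) {
  __shared__ alignas(128) int32_t smem_buffer[16][16];
  int32_t tid = (int)threadIdx.x + (int)threadIdx.y * 16;
  smem_buffer[threadIdx.y][threadIdx.x] = 1;
  // Make the shared-memory writes visible to the async (TMA) proxy.
  cde::fence_proxy_async_shared_cta();
  __syncthreads();
  if (tid == 0) {
    // Thread 0 issues the S2G copy and waits on the bulk async-group.
    cde::cp_async_bulk_tensor_2d_shared_to_global(&tensor_map, 0, 0, &smem_buffer);
    cde::cp_async_bulk_commit_group();
    cde::cp_async_bulk_wait_group_read<0>();
  }
  __syncthreads();
  // My expectation: after the barrier, every thread in the block sees the
  // values the TMA copy wrote to global memory.
  assert(a[tid] == 1);
}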

The actual test code (using my own PTX wrappers, listed further down) creates a 16x16 thread block to copy 16x16 ints from shared to global memory.

static __global__ void __launch_bounds__(256) cp_async_bulk_tensor_s2g(const __grid_constant__ CUtensorMap tensor_map, int32_t * __restrict__ a) {
  int32_t tid = ((int)threadIdx.x + ((int)threadIdx.y * 16));
  extern __align__(128) __shared__ int32_t smem_a[];
  smem_a[tid] = 1;
  assert((smem_a[tid] == 1));
  cuda_fence_view_async_shared();
  __syncthreads();
  for (int32_t i = 0; (i < 100); i = (i + 1)) {
    if (tid == 0) {
      cuda_copy_tma_2d_s2g((&tensor_map), smem_a, 0, 0);
      cuda_copy_bulk_commit_group();
      cuda_copy_bulk_wait_group_0();
    } 
    cuda_fence_view_async_shared();
    __syncthreads();
    assert((a[tid] == 1)); // this is the assert that trips for threads outside the first warp
    a[tid] = 0;
    assert((a[tid] == 0));
    cuda_fence_view_async_shared();
    __syncthreads();
  } 
}

DLL void launch(int32_t * __restrict__ a) {
  void *func_ptr = cuda_get_tensor_map_encode_tiled_func_ptr();
  CUtensorMap tensor_map;
  uint64_t size[2];
  size[0] = uint64_t(16ull);
  size[1] = uint64_t(16ull);
  uint64_t stride[1];
  stride[0] = uint64_t(64ull);
  uint32_t box_size[2];
  box_size[0] = uint32_t(16u);
  box_size[1] = uint32_t(16u);
  uint32_t elem_stride[2];
  elem_stride[0] = uint32_t(1u);
  elem_stride[1] = uint32_t(1u);
  
  reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(func_ptr)(
      (&tensor_map),
      CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
      2,
      a,
      size,
      stride,
      box_size,
      elem_stride,
      CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
      CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
      CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
      CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
  );
      
  cp_async_bulk_tensor_s2g<<<dim3(1, 1, 1), dim3(16, 16, 1), 1024, (cudaStream_t)get_cuda_stream()>>>(tensor_map, a);
  {cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) LOG(ERROR) << "CUDA error: " << cudaGetErrorString(err) << "\n";}
}
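
In case it matters, the host side that drives the repro looks roughly like this (hypothetical sketch, error checking elided; launch() is the function above):

#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical driver: allocate the 16x16 int32 destination buffer,
// run the repro, and surface any device-side assert failures.
int main() {
  int32_t *a = nullptr;
  cudaMalloc(&a, 16 * 16 * sizeof(int32_t));
  cudaMemset(a, 0, 16 * 16 * sizeof(int32_t));
  launch(a);                 // launch() as defined above
  cudaDeviceSynchronize();   // device-side asserts are reported here
  cudaFree(a);
  return 0;
}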

Might it be that re-using the tensor map across iterations of the for-loop breaks it?
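
For reference, the single-shot variant I would compare against (loop removed, everything else identical) would look roughly like this; if it passes while the looped version fails, the re-use would seem to be implicated:

static __global__ void __launch_bounds__(256) cp_async_bulk_tensor_s2g_once(const __grid_constant__ CUtensorMap tensor_map, int32_t * __restrict__ a) {
  int32_t tid = ((int)threadIdx.x + ((int)threadIdx.y * 16));
  extern __align__(128) __shared__ int32_t smem_a[];
  smem_a[tid] = 1;
  cuda_fence_view_async_shared();
  __syncthreads();
  if (tid == 0) {
    // Single issue of the S2G copy with the same tensor map.
    cuda_copy_tma_2d_s2g((&tensor_map), smem_a, 0, 0);
    cuda_copy_bulk_commit_group();
    cuda_copy_bulk_wait_group_0();
  }
  cuda_fence_view_async_shared();
  __syncthreads();
  assert((a[tid] == 1));
}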

My (autogenerated) PTX wrappers:

// Order this CTA's prior shared-memory writes before subsequent async-proxy (TMA) operations.
static __device__ __forceinline__ void cuda_fence_view_async_shared() {
  asm ("fence.proxy.async.shared::cta;" :  : );
}

// Convert a generic pointer to a 32-bit shared-memory address.
static __device__ __forceinline__ uint32_t cuda_cvta_generic_to_shared(void * __restrict__ src) {
  uint32_t ret = 0;
  asm ("{.reg.u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr;}" : "=r"(ret) : "l"(src));
  return ret;
}

// Issue a 2D bulk tensor copy from shared to global memory at tile coordinates (crd0, crd1).
static __device__ __forceinline__ void cuda_copy_tma_2d_s2g(const CUtensorMap * __restrict__ dst, void * __restrict__ src, int32_t crd0, int32_t crd1) {
  uint32_t smem_int_ptr = cuda_cvta_generic_to_shared(src);
  asm ("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%2, %3}], [%1];" :  : "l"(((uint64_t)(dst))), "r"(smem_int_ptr), "r"(crd0), "r"(crd1));
}

// Commit the previously initiated bulk copies into a bulk async-group.
static __device__ __forceinline__ void cuda_copy_bulk_commit_group() {
  asm ("cp.async.bulk.commit_group;" :  : );
}

// Wait until all committed bulk async-groups have completed (0 groups left pending).
static __device__ __forceinline__ void cuda_copy_bulk_wait_group_0() {
  asm ("cp.async.bulk.wait_group 0;" :  : );
}
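
One difference from the guide snippet above: my wait wrapper emits cp.async.bulk.wait_group 0, whereas the guide calls cde::cp_async_bulk_wait_group_read<0>(), which I believe maps to the .read form of the instruction (it waits only for the reads from shared memory to complete). A wrapper for that variant would presumably look like this (untested):

static __device__ __forceinline__ void cuda_copy_bulk_wait_group_read_0() {
  // Waits only until the committed bulk operations have finished reading
  // shared memory, not for full completion of the group.
  asm ("cp.async.bulk.wait_group.read 0;" :  : );
}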

Any advice or pointers much appreciated. Thanks!