Hi,
Testing out TMA for bulk tensor copies from shared to global memory, which requires the wait_group command:
Judging by its use (in a wrapper) in the CUDA Programming Guide,
// Initiate TMA transfer to copy shared memory to global memory
if (threadIdx.x == 0) {
cde::cp_async_bulk_tensor_2d_shared_to_global(&tensor_map, x, y, &smem_buffer);
// Wait for TMA transfer to have finished reading shared memory.
// Create a "bulk async-group" out of the previous bulk copy operation.
cde::cp_async_bulk_commit_group();
// Wait for the group to have completed reading from shared memory.
cde::cp_async_bulk_wait_group_read<0>();
}
CUDA C++ Programming Guide (see near bottom of section)
it would appear that thread 0 issues the command and waits, after which all threads in the block can then see the change to global memory. This should especially be true after a syncthreads() call, which should have an implicit memory fence.
However, it seems that only the warp containing thread 0 is able to see the change to global memory. Am I missing something as regards to the memory consistency model here? The PTX doc above lists “Writes being made visible to the executing thread.” - I would assume the syncthreads after this would be effective.
This test code creates a 16x16 thread block to copy 16x16 ints from shared to global memory.
static __global__ void __launch_bounds__(256) cp_async_bulk_tensor_s2g(const __grid_constant__ CUtensorMap tensor_map, int32_t * __restrict__ a) {
int32_t tid = ((int)threadIdx.x + ((int)threadIdx.y * 16));
extern __align__(128) __shared__ int32_t smem_a[];
smem_a[tid] = 1;
assert((smem_a[tid] == 1));
cuda_fence_view_async_shared();
__syncthreads();
for (int32_t i = 0; (i < 100); i = (i + 1)) {
if (tid == 0) {
cuda_copy_tma_2d_s2g((&tensor_map), smem_a, 0, 0);
cuda_copy_bulk_commit_group();
cuda_copy_bulk_wait_group_0();
}
cuda_fence_view_async_shared();
__syncthreads();
assert((a[tid] == 1));
a[tid] = 0;
assert((a[tid] == 0));
cuda_fence_view_async_shared();
__syncthreads();
}
}
DLL void launch(int32_t * __restrict__ a) {
void *func_ptr = cuda_get_tensor_map_encode_tiled_func_ptr();
CUtensorMap tensor_map;
uint64_t size[2];
size[0] = uint64_t(16ull);
size[1] = uint64_t(16ull);
uint64_t stride[1];
stride[0] = uint64_t(64ull);
uint32_t box_size[2];
box_size[0] = uint32_t(16u);
box_size[1] = uint32_t(16u);
uint32_t elem_stride[2];
elem_stride[0] = uint32_t(1u);
elem_stride[1] = uint32_t(1u);
reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(func_ptr)(
(&tensor_map),
CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
2,
a,
size,
stride,
box_size,
elem_stride,
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
);
cp_async_bulk_tensor_s2g<<<dim3(1, 1, 1), dim3(16, 16, 1), 1024, (cudaStream_t)get_cuda_stream()>>>(tensor_map, a);
{cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) LOG(ERROR) << "CUDA error: " << cudaGetErrorString(err) << "\n";}
}
Might it be that re-using the tensor map breaks it in the for-loop?
My (autogenerated) PTX wrappers:
static __device__ __forceinline__ void cuda_fence_view_async_shared() {
asm ("fence.proxy.async.shared::cta;" : : );
}
static __device__ __forceinline__ uint32_t cuda_cvta_generic_to_shared(void * __restrict__ src) {
uint32_t ret = 0;
asm ("{.reg.u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr;}" : "=r"(ret) : "l"(src));
return ret;
}
static __device__ __forceinline__ void cuda_copy_tma_2d_s2g(const CUtensorMap * __restrict__ dst, void * __restrict__ src, int32_t crd0, int32_t crd1) {
uint32_t smem_int_ptr = cuda_cvta_generic_to_shared(src);
asm ("cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%0, {%2, %3}], [%1];" : : "l"(((uint64_t)(dst))), "r"(smem_int_ptr), "r"(crd0), "r"(crd1));
}
static __device__ __forceinline__ void cuda_copy_bulk_commit_group() {
asm ("cp.async.bulk.commit_group;" : : );
}
static __device__ __forceinline__ void cuda_copy_bulk_wait_group_0() {
asm ("cp.async.bulk.wait_group 0;" : : );
}
Any advice or pointers much appreciated. Thanks!