Hello.
When using the inline “cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes” instruction on the H100, it reports an illegal instruction, even though it is listed as a supported architecture and it compiles fine when using nvcc -arch=sm_90.
A provided code example is as follows:
#include <cuda.h>
#include <cudaTypedefs.h>
#include <cuda_runtime.h>
#include <cuda/barrier>
#include <cuda/ptx>
#include <iostream>
#include <vector>
#include <cassert>
const int DEPTH = 8;
const int HEIGHT = 16;
const int WIDTH = 16;
const int TILE_D = 2;
const int TILE_H = 4;
const int TILE_W = 4;
using barrier = cuda::barrier<cuda::thread_scope_block>;
#define CUDA_CHECK(call) { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \
exit(1); \
} \
}
#define CHECK_DRV(call) { \
CUresult err = call; \
if (err != CUDA_SUCCESS) { \
const char* errStr; \
cuGetErrorName(err, &errStr); \
std::cerr << "Driver API Error: " << errStr << " at line " << __LINE__ << std::endl; \
exit(1); \
} \
}
__device__ __forceinline__ void cp_async_bulk_tensor_3d_im2col(
void* dstMem,
const void* tensorMap,
const int32_t tensorCoords[3],
uint64_t* smem_bar,
const uint16_t offset_w)
{
asm volatile(
"cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes "
"[%0], [%1, {%2, %3, %4}], [%5], {%6};"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(dstMem))),
"l"(tensorMap),
"r"(tensorCoords[0]),
"r"(tensorCoords[1]),
"r"(tensorCoords[2]),
"r"(static_cast<uint32_t>(__cvta_generic_to_shared(smem_bar))),
"h"(offset_w)
: "memory");
}
__global__ void tma_copy_kernel_3d(const __grid_constant__ CUtensorMap tensor_map, uint64_t* d_out) {
extern __shared__ uint64_t smem_buffer[TILE_W * TILE_H * TILE_D];
#pragma nv_diag_suppress static_var_with_dynamic_init
__shared__ cuda::barrier<cuda::thread_scope_block> bar;
if (threadIdx.x == 0) init(&bar, blockDim.x);
__syncthreads();
int tensor_coords[3] = {0, 0, 0};
if (threadIdx.x == 0) {
cp_async_bulk_tensor_3d_im2col(
smem_buffer,
&tensor_map,
tensor_coords,
cuda::device::barrier_native_handle(bar),
0
);
}
bar.arrive_and_wait();
if (threadIdx.x == 0) {
printf("3D Data copied to Shared Memory:\n");
for (int d = 0; d < TILE_D; d++) {
printf("Depth %d:\n", d);
for (int h = 0; h < TILE_H; h++) {
for (int w = 0; w < TILE_W; w++) {
printf("%llu ", smem_buffer[d * TILE_H * TILE_W + h * TILE_W + w]);
}
printf("\n");
}
}
}
}
int main() {
size_t size_bytes = WIDTH * HEIGHT * DEPTH * sizeof(uint64_t);
std::vector<uint64_t> h_data(WIDTH * HEIGHT * DEPTH);
for (uint64_t i = 0; i < WIDTH * HEIGHT * DEPTH; ++i) h_data[i] = i;
uint64_t* d_input;
CUDA_CHECK(cudaMalloc(&d_input, size_bytes));
CUDA_CHECK(cudaMemcpy(d_input, h_data.data(), size_bytes, cudaMemcpyHostToDevice));
CUtensorMap tensor_map;
constexpr int rank = 3;
uint64_t globalDim[rank] = {WIDTH, HEIGHT, DEPTH};
uint64_t globalStrides[rank - 1] = {WIDTH * sizeof(uint64_t), WIDTH * HEIGHT * sizeof(uint64_t)};
uint32_t boxDim[rank] = {TILE_W, TILE_H, TILE_D};
uint32_t elementStrides[rank] = {1, 1, 1};
CHECK_DRV(cuTensorMapEncodeTiled(
&tensor_map,
CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_UINT64,
rank,
d_input,
globalDim,
globalStrides,
boxDim,
elementStrides,
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
));
size_t smem_size = TILE_W * TILE_H * TILE_D * sizeof(uint64_t);
tma_copy_kernel_3d<<<1, 32, smem_size>>>(tensor_map, nullptr);
CUDA_CHECK(cudaDeviceSynchronize());
cudaFree(d_input);
return 0;
}
I checked with the docs here and I do think it should be supported.