Im2col Illegal Instruction Encounterd on Supported Architecture (H100)

Hello.

When using the inline “cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes” instruction on the H100, it reports an illegal instruction, even though it is listed as a supported architecture and it compiles fine when using nvcc -arch=sm_90.

A provided code example is as follows:

#include <cuda.h>
#include <cudaTypedefs.h>
#include <cuda_runtime.h>
#include <cuda/barrier>
#include <cuda/ptx>
#include <iostream>
#include <vector>
#include <cassert>

const int DEPTH = 8;
const int HEIGHT = 16;
const int WIDTH = 16;

const int TILE_D = 2;
const int TILE_H = 4;
const int TILE_W = 4;

using barrier = cuda::barrier<cuda::thread_scope_block>;

#define CUDA_CHECK(call) { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at line " << __LINE__ << std::endl; \
        exit(1); \
    } \
}

#define CHECK_DRV(call) { \
    CUresult err = call; \
    if (err != CUDA_SUCCESS) { \
        const char* errStr; \
        cuGetErrorName(err, &errStr); \
        std::cerr << "Driver API Error: " << errStr << " at line " << __LINE__ << std::endl; \
        exit(1); \
    } \
}

__device__ __forceinline__ void cp_async_bulk_tensor_3d_im2col(
    void* dstMem,
    const void* tensorMap,
    const int32_t tensorCoords[3],
    uint64_t* smem_bar,
    const uint16_t offset_w)
{
    asm volatile(
        "cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes "
        "[%0], [%1, {%2, %3, %4}], [%5], {%6};"
        :
        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(dstMem))),
          "l"(tensorMap),
          "r"(tensorCoords[0]),
          "r"(tensorCoords[1]),
          "r"(tensorCoords[2]),
          "r"(static_cast<uint32_t>(__cvta_generic_to_shared(smem_bar))),
          "h"(offset_w)
        : "memory");
}

__global__ void tma_copy_kernel_3d(const __grid_constant__ CUtensorMap tensor_map, uint64_t* d_out) {
    extern __shared__ uint64_t smem_buffer[TILE_W * TILE_H * TILE_D];

    #pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ cuda::barrier<cuda::thread_scope_block> bar;

    if (threadIdx.x == 0) init(&bar, blockDim.x);
    __syncthreads();

    int tensor_coords[3] = {0, 0, 0};

    if (threadIdx.x == 0) {
        cp_async_bulk_tensor_3d_im2col(
            smem_buffer, 
            &tensor_map, 
            tensor_coords,
            cuda::device::barrier_native_handle(bar),
            0
        );
    }

    bar.arrive_and_wait();

    if (threadIdx.x == 0) {
        printf("3D Data copied to Shared Memory:\n");
        for (int d = 0; d < TILE_D; d++) {
            printf("Depth %d:\n", d);
            for (int h = 0; h < TILE_H; h++) {
                for (int w = 0; w < TILE_W; w++) {
                    printf("%llu ", smem_buffer[d * TILE_H * TILE_W + h * TILE_W + w]);
                }
                printf("\n");
            }
        }
    }
}

int main() {
    size_t size_bytes = WIDTH * HEIGHT * DEPTH * sizeof(uint64_t);
    std::vector<uint64_t> h_data(WIDTH * HEIGHT * DEPTH);
    for (uint64_t i = 0; i < WIDTH * HEIGHT * DEPTH; ++i) h_data[i] = i;

    uint64_t* d_input;
    CUDA_CHECK(cudaMalloc(&d_input, size_bytes));
    CUDA_CHECK(cudaMemcpy(d_input, h_data.data(), size_bytes, cudaMemcpyHostToDevice));

    CUtensorMap tensor_map;
    constexpr int rank = 3;
    uint64_t globalDim[rank] = {WIDTH, HEIGHT, DEPTH};
    uint64_t globalStrides[rank - 1] = {WIDTH * sizeof(uint64_t), WIDTH * HEIGHT * sizeof(uint64_t)};

    uint32_t boxDim[rank] = {TILE_W, TILE_H, TILE_D};
    uint32_t elementStrides[rank] = {1, 1, 1};

    CHECK_DRV(cuTensorMapEncodeTiled(
        &tensor_map,
        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_UINT64,
        rank,
        d_input,
        globalDim,
        globalStrides,
        boxDim,
        elementStrides,
        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    ));

    size_t smem_size = TILE_W * TILE_H * TILE_D * sizeof(uint64_t);
    tma_copy_kernel_3d<<<1, 32, smem_size>>>(tensor_map, nullptr);
    CUDA_CHECK(cudaDeviceSynchronize());

    cudaFree(d_input);
    return 0;
}

I checked with the docs here and I do think it should be supported.

When I tried to compile your code, a warning is shown regarding the shared memory declaration.

It should be declared as either extern __shared__ uint64_t smem_buffer[] or __shared__ uint64_t smem_buffer[TILE_W * TILE_H * TILE_D], not extern __shared__ uint64_t smem_buffer[TILE_W * TILE_H * TILE_D]

In any case, the illegal instruction remains.

I noticed you used cuTensorMapEncodeTiled but tried to access it with im2col. Can you try to use cuTensorMapEncodeIm2col instead?

Thanks a lot striker.

As you mentioned, the issue probably lies with me not using cuTensorMapEncodeIm2col. I will figure out how to use it and then try to report back.

On Hopper, cp.async.bulk.tensor.*.im2col is only legal when the tensor map is encoded with the matching im2col semantics.
Using cuTensorMapEncodeTiled can compile for sm_90 but still trap at runtime as an illegal instruction.