Some questions about creating a CUtensorMap and using it

I have some questions about the following code.

   // Host-side setup: fill an 8 x 64 int buffer, copy it to device memory, and
   // encode a rank-2 tiled CUtensorMap over it with a 128-byte swizzle pattern.
   constexpr int row = 8;
    constexpr int col = 64;
    // Total buffer size: 8 * 64 * sizeof(int) bytes (2048 bytes for a 4-byte int).
    size_t byteSize = row * col * sizeof(int);
    int* h_data = (int*)malloc(byteSize);
    // Host buffer of the same size; it is not written anywhere in this snippet
    // (presumably it receives the kernel output later — TODO confirm).
    int* h_result_data = (int*)malloc(byteSize);
    // Integer division: every value is repeated four times (0,0,0,0,1,1,1,1,...).
    for (int i = 0, e = row * col; i < e; ++i) {
      h_data[i] = i / 4;
    }

    // NOTE(review): the return values of cudaMalloc / cudaMemcpy are not checked here.
    int* d_data = nullptr;
    cudaMalloc(&d_data, byteSize);
    cudaMemcpy(d_data, h_data, byteSize, cudaMemcpyHostToDevice);

    CUtensorMap tensor_map{};
    // Second map; per the original note it is meant to be the one that does not
    // swizzle. It is only zero-initialized in this snippet, not encoded.
    CUtensorMap result_tensor_map{};
    // rank is the number of dimensions of the array.
    constexpr uint32_t rank = 2;
    // Tensor extent in elements (passed as globalDim): {row * 4, col / 4} = {32, 16},
    // i.e. the 512 ints are described as 16 rows of 32 elements.
    uint64_t size[rank] = {row * 4, col / 4};
    // Byte stride between consecutive rows (passed as globalStrides); must be a
    // multiple of 16. Here it is col * sizeof(int) = 256 bytes.
    // NOTE(review): each row above is 32 ints = 128 bytes, so 16 rows at a
    // 256-byte stride span 15 * 256 + 128 = 3968 bytes, more than byteSize
    // (2048). Confirm whether size[0] * sizeof(int) was intended.
    uint64_t stride[rank - 1] = {col * sizeof(int)};
    // Shared-memory box extent in elements (passed as boxDim). The inner extent is
    // row * 4 = 32 ints = 128 bytes, which equals the 128-byte swizzle span
    // selected by CU_TENSOR_MAP_SWIZZLE_128B below.
    uint32_t box_size[rank] = {row * 4, col / 4};

    // Per-dimension element stride (passed as elementStrides); 1 in both dimensions.
    uint32_t elem_stride[rank] = {1, 1};

    // Get the function pointer from the driver API (the helper is defined
    // outside this snippet).
    auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();

    // NOTE(review): res is not inspected in this snippet; it should be compared
    // against CUDA_SUCCESS before tensor_map is used.
    CUresult res = cuTensorMapEncodeTiled(
        &tensor_map,                // CUtensorMap *tensorMap,
        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,  // 32-bit int elements
        rank,                       // cuuint32_t tensorRank,
        d_data,                     // void *globalAddress,
        size,                       // const cuuint64_t *globalDim,
        stride,                     // const cuuint64_t *globalStrides,
        box_size,                   // const cuuint32_t *boxDim,
        elem_stride,                // const cuuint32_t *elementStrides,
        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
        // Using a swizzle pattern of 128 bytes.
        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    );
  • CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B: the swizzle size is 128 bytes. Where is that 128 reflected in the code above?
  • Why can’t I write code like this?
    // Tensor extent in elements (passed as globalDim): {row, col} = {8, 64}.
    // NOTE(review): if element [0] is the innermost (contiguous) dimension, as the
    // cuTensorMapEncodeTiled documentation describes, this declares rows of
    // 8 elements — confirm the intended ordering.
    uint64_t size[rank] = {row , col };
    // Byte stride between consecutive rows (passed as globalStrides); must be a
    // multiple of 16. Here it is col * sizeof(int) = 256 bytes.
    uint64_t stride[rank - 1] = {col * sizeof(int)};
    // Shared-memory box extent in elements (passed as boxDim). The inner extent
    // here is row = 8 ints = 32 bytes, which no longer equals the 128-byte span
    // of CU_TENSOR_MAP_SWIZZLE_128B (assuming the same encode call is used).
    uint32_t box_size[rank] = {row , col };

Everyone is welcome to leave a message below for discussion. If necessary, I will provide my complete code. Thanks, all.