I have some questions about the following code.
// Sets up the host buffers, the device buffer, and a tiled tensor map that uses
// the 128-byte swizzle mode for a TMA copy of a row x col int32 matrix.
constexpr int row = 8;
constexpr int col = 64;
size_t byteSize = row * col * sizeof(int);
int* h_data = (int*)malloc(byteSize);
int* h_result_data = (int*)malloc(byteSize);
// Fail loudly instead of writing through a null host pointer below.
if (h_data == nullptr || h_result_data == nullptr) {
  abort();
}
// Each group of four consecutive ints holds the same value: 0,0,0,0,1,1,1,1,...
for (int i = 0, e = row * col; i < e; ++i) {
  h_data[i] = i / 4;
}
int* d_data = nullptr;
if (cudaMalloc(&d_data, byteSize) != cudaSuccess) {
  abort();
}
if (cudaMemcpy(d_data, h_data, byteSize, cudaMemcpyHostToDevice) != cudaSuccess) {
  abort();
}
CUtensorMap tensor_map{};
// Second map, declared but not encoded in this snippet; per the original note
// it is meant to be the map that does not swizzle.
CUtensorMap result_tensor_map{};
// rank is the number of dimensions of the array.
constexpr uint32_t rank = 2;
// Global tensor extent in elements, innermost (contiguous) dimension first.
// The row * col ints are viewed as (col / 4) rows of (row * 4) ints each.
uint64_t size[rank] = {row * 4, col / 4};
// This is where the "128" of CU_TENSOR_MAP_SWIZZLE_128B shows up: the inner
// box dimension measured in bytes (elements * element size) must fit in one
// 128-byte swizzle span. Here it is 32 ints * 4 bytes = 128 bytes.
static_assert(row * 4 * sizeof(int) <= 128,
              "inner box dimension must not exceed the 128-byte swizzle span");
static_assert((row * 4) * (col / 4) == row * col,
              "the reshaped view must cover exactly row * col elements");
// Byte stride between consecutive rows of the view; must be a multiple of 16.
// It has to be the row length of the *view* (row * 4 ints = 128 bytes), not
// the width of the original matrix: with col * sizeof(int) = 256 bytes the 16
// rows would span 3968 bytes and run past the 2048-byte allocation.
uint64_t stride[rank - 1] = {row * 4 * sizeof(int)};
// Shared-memory box (tile) extent in elements, innermost dimension first.
// The inner dimension covers exactly one swizzle span (32 ints = 128 bytes).
uint32_t box_size[rank] = {row * 4, col / 4};
// Traverse every element in both dimensions (no sub-sampling).
uint32_t elem_stride[rank] = {1, 1};
// Get the function pointer from the driver API.
auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();
CUresult res = cuTensorMapEncodeTiled(
    &tensor_map,  // CUtensorMap *tensorMap,
    CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
    rank,         // cuuint32_t tensorRank,
    d_data,       // void *globalAddress,
    size,         // const cuuint64_t *globalDim,
    stride,       // const cuuint64_t *globalStrides,
    box_size,     // const cuuint32_t *boxDim,
    elem_stride,  // const cuuint32_t *elementStrides,
    CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
    // Using a swizzle pattern of 128 bytes.
    CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B,
    CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
    CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
);
// An invalid dimension/stride/swizzle combination is reported here, not at
// kernel launch, so stop before the map is ever used.
if (res != CUDA_SUCCESS) {
  abort();
}
- `CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B`: the swizzle size is 128 bytes. Where is that 128 reflected in the code above?
- Why can’t I write code like this?
// NOTE(review): dimension 0 is the innermost (fastest-varying) dimension, so
// {row, col} describes col rows of row ints each rather than the row x col
// matrix — confirm this is what is intended. The direct description
// {col, row} would make the inner box col * sizeof(int) = 256 bytes, which is
// larger than the 128-byte span allowed with CU_TENSOR_MAP_SWIZZLE_128B.
// global memory size, in elements
uint64_t size[rank] = {row , col };
// global memory stride in bytes, must be a multiple of 16.
uint64_t stride[rank - 1] = {col * sizeof(int)};
// Shared memory box dimensions, in elements. With a swizzle mode, the inner
// dimension times the element size must not exceed the swizzle span.
uint32_t box_size[rank] = {row , col };
- Is the content of "1. Introduction — PTX ISA 8.7 documentation" related to what I’m asking?
Everyone is welcome to leave a message below for discussion. If necessary, I will provide my complete code. Thanks, all.