I’m trying to run the cublasLtIgemm example in the document but get CUBLAS_STATUS_INVALID_VALUE error in cublasLtMatrixTransform for B . I copy and paste the example code here
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cstdint>
int
roundoff(int v, int d) {
return (v + d - 1) / d * d;
}
// Use cublasLtMatmul to perform the tensor op Igemm with the memory
// order transforms on all buffers.
//
// For better performance the data order transforms should be offline
// as much as possible.
//
// Transa, transb assumed N; alpha, beta are host pointers; Tensor ops
// allowed. Alpha assumed 1, beta assumed 0, and stream assumed 0.
int
LtIgemmTensor(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const int8_t *A,
int lda,
const int8_t *B,
int ldb,
int32_t *C,
int ldc) {
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDesc_t matmulDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
int32_t alpha = 1, beta = 0;
cublasOperation_t opTranspose = CUBLAS_OP_T;
// The tensor op igemm kernels require specialized memory order of
// data.
cublasLtMatrixTransformDesc_t transformDesc = NULL;
int8_t *Atransform = NULL, *Btransform = NULL;
int32_t *Ctransform = NULL;
cublasLtMatrixLayout_t AtransformDesc = NULL, BtransformDesc = NULL, CtransformDesc = NULL;
float transformAlpha = 1.0f, transformBeta = 0.0f;
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C;
int ldatransform = 32 * m;
int ldbtransform = 32 * roundoff(n, 8);
int ldctransform = 32 * m;
cudaMalloc(&Atransform, sizeof(int8_t) * roundoff(k, 32) / 32 * ldatransform);
if (!Atransform) goto CLEANUP;
cudaMalloc(&Btransform, sizeof(int8_t) * roundoff(k, 32) / 32 * ldbtransform);
if (!Btransform) goto CLEANUP;
cudaMalloc(&Ctransform, sizeof(int32_t) * roundoff(n, 32) / 32 * ldctransform);
if (!Ctransform) goto CLEANUP;
status = cublasLtMatrixTransformDescCreate(&transformDesc, CUDA_R_32F);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// B matrix is non-transposed, but transposed matrix is needed - add transpose operation in matrix transform.
status = cublasLtMatrixTransformDescSetAttribute(transformDesc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB, &opTranspose, sizeof(opTranspose));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatmulDescCreate(&matmulDesc, CUDA_R_32I);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// Tensor op igemm kernels only support NT gemm
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(opTranspose));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// --------------------------------------
// Create descriptors for the original matrices
status = cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8I, m, k, lda);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8I, k, n, ldb);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &rowOrder, sizeof(rowOrder));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32I, m, n, ldc);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// -----------------------------------------------------------
// Create descriptors for the transformed matrices
status = cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldatransform);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbtransform);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL4_4R2_8C, sizeof(order_COL4_4R2_8C));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldctransform);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// --------------------------------------------------------
// Transforms and computation
status = cublasLtMatrixTransform(
ltHandle, transformDesc, &transformAlpha, A, Adesc, &transformBeta, NULL, NULL, Atransform, AtransformDesc, 0);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
status = cublasLtMatrixTransform(
ltHandle, transformDesc, &transformAlpha, B, Bdesc, &transformBeta, NULL, NULL, Btransform, BtransformDesc, 0);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// No need to transform C matrix as beta is assumed to be 0
status = cublasLtMatmul(ltHandle,
matmulDesc,
&alpha,
Atransform,
AtransformDesc,
Btransform,
BtransformDesc,
&beta,
Ctransform,
CtransformDesc,
Ctransform,
CtransformDesc,
NULL,
NULL,
0,
0);
if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
// Transform the outputs to COL order
status = cublasLtMatrixTransform(
ltHandle, transformDesc, &transformAlpha, Ctransform, CtransformDesc, &transformBeta, NULL, NULL, C, Cdesc, 0);
CLEANUP:
// Descriptors are no longer needed as all GPU work was already
// enqueued.
if (CtransformDesc) cublasLtMatrixLayoutDestroy(CtransformDesc);
if (BtransformDesc) cublasLtMatrixLayoutDestroy(BtransformDesc);
if (AtransformDesc) cublasLtMatrixLayoutDestroy(AtransformDesc);
if (Cdesc) cublasLtMatrixLayoutDestroy(Cdesc);
if (Bdesc) cublasLtMatrixLayoutDestroy(Bdesc);
if (Adesc) cublasLtMatrixLayoutDestroy(Adesc);
if (matmulDesc) cublasLtMatmulDescDestroy(matmulDesc);
if (transformDesc) cublasLtMatrixTransformDescDestroy(transformDesc);
// Wait until device is done before freeing transformed buffers
cudaDeviceSynchronize();
if (Ctransform) cudaFree(Ctransform);
if (Btransform) cudaFree(Btransform);
if (Atransform) cudaFree(Atransform);
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
Note that in the original code, they forget to define rowOrder, so I assume cublasLtOrder_t rowOrder = CUBLASLT_ORDER_ROW;
The error happened in this line
status = cublasLtMatrixTransform(
ltHandle, transformDesc, &transformAlpha, B, Bdesc, &transformBeta, NULL, NULL, Btransform, BtransformDesc, 0);
And the error message is
Check failed: error == CUBLAS_STATUS_SUCCESS (7 vs. 0) : CUBLAS_STATUS_INVALID_VALUE
I’m running on Tesla T4 GPU with compute capability 7.5. And OS is Ubuntu 16.04.