CublasLtIgemm example error in the CUDA toolkit documentation

I’m trying to run the cublasLtIgemm example from the documentation, but I get a CUBLAS_STATUS_INVALID_VALUE error in cublasLtMatrixTransform for B. I have copied and pasted the example code here:

#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cstdint>
 
int
roundoff(int v, int d) {
   return (v + d - 1) / d * d;
}
 
// Use cublasLtMatmul to perform the tensor op Igemm with the memory 
// order transforms on all buffers.
//
// For better performance the data order transforms should be offline 
// as much as possible.
//
// Transa, transb assumed N; alpha, beta are host pointers; Tensor ops 
// allowed. Alpha assumed 1, beta assumed 0, and stream assumed 0.

int
LtIgemmTensor(cublasLtHandle_t ltHandle,
             int m,
             int n,
             int k,
             const int8_t *A,
             int lda,
             const int8_t *B,
             int ldb,
             int32_t *C,
             int ldc) {
   cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
 
   cublasLtMatmulDesc_t matmulDesc = NULL;
   cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
   int32_t alpha = 1, beta = 0;
   cublasOperation_t opTranspose = CUBLAS_OP_T;
 
  // The tensor op igemm kernels require specialized memory order of 
  // data.
   cublasLtMatrixTransformDesc_t transformDesc = NULL;
   int8_t *Atransform = NULL, *Btransform = NULL;
   int32_t *Ctransform                   = NULL;
   cublasLtMatrixLayout_t AtransformDesc = NULL, BtransformDesc = NULL, CtransformDesc = NULL;
   float transformAlpha = 1.0f, transformBeta = 0.0f;
   cublasLtOrder_t rowOrder          = CUBLASLT_ORDER_ROW; // missing from the documentation example; assumed here
   cublasLtOrder_t order_COL32       = CUBLASLT_ORDER_COL32;
   cublasLtOrder_t order_COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C;
 
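   // Leading dimensions of the transformed buffers: COL32 order uses ld = 32 * rows,
   // COL4_4R2_8C order uses ld = 32 * roundoff(rows, 8).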
   int ldatransform = 32 * m;
   int ldbtransform = 32 * roundoff(n, 8);
   int ldctransform = 32 * m;
 
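   // Each transformed buffer stores roundoff(cols, 32) / 32 groups of 32 columns, ld elements per group.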
   cudaMalloc(reinterpret_cast<void**>(&Atransform), sizeof(int8_t) * roundoff(k, 32) / 32 * ldatransform);
   if (!Atransform) goto CLEANUP;
   cudaMalloc(reinterpret_cast<void**>(&Btransform), sizeof(int8_t) * roundoff(k, 32) / 32 * ldbtransform);
   if (!Btransform) goto CLEANUP;
   cudaMalloc(reinterpret_cast<void**>(&Ctransform), sizeof(int32_t) * roundoff(n, 32) / 32 * ldctransform);
   if (!Ctransform) goto CLEANUP;
 
   status = cublasLtMatrixTransformDescCreate(&transformDesc, CUDA_R_32F);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

   // B matrix is non-transposed, but transposed matrix is needed - add transpose operation in matrix transform.
   status = cublasLtMatrixTransformDescSetAttribute(transformDesc, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB, &opTranspose, sizeof(opTranspose));
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   status = cublasLtMatmulDescCreate(&matmulDesc, CUDA_R_32I);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   // Tensor op igemm kernels only support NT gemm
   status = cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(opTranspose));
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   // --------------------------------------
   // Create descriptors for the original matrices
 
   status = cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8I, m, k, lda);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   status = cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8I, k, n, ldb);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

   status = cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &rowOrder, sizeof(rowOrder));
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   status = cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32I, m, n, ldc);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   // -----------------------------------------------------------
   // Create descriptors for the transformed matrices
 
   status = cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldatransform);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   status = cublasLtMatrixLayoutSetAttribute(
       AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   status = cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbtransform);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   status = cublasLtMatrixLayoutSetAttribute(
       BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL4_4R2_8C, sizeof(order_COL4_4R2_8C));
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   status = cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldctransform);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   status = cublasLtMatrixLayoutSetAttribute(
       CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   // --------------------------------------------------------
   // Transforms and computation
 
   status = cublasLtMatrixTransform(
       ltHandle, transformDesc, &transformAlpha, A, Adesc, &transformBeta, NULL, NULL, Atransform, AtransformDesc, 0);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   status = cublasLtMatrixTransform(
       ltHandle, transformDesc, &transformAlpha, B, Bdesc, &transformBeta, NULL, NULL, Btransform, BtransformDesc, 0);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
   // No need to transform C matrix as beta is assumed to be 0
   status = cublasLtMatmul(ltHandle,
                           matmulDesc,
                           &alpha,
                           Atransform,
                           AtransformDesc,
                           Btransform,
                           BtransformDesc,
                           &beta,
                           Ctransform,
                           CtransformDesc,
                           Ctransform,
                           CtransformDesc,
                           NULL,
                           NULL,
                           0,
                           0);
   if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
 
   // Transform the outputs to COL order
   status = cublasLtMatrixTransform(
       ltHandle, transformDesc, &transformAlpha, Ctransform, CtransformDesc, &transformBeta, NULL, NULL, C, Cdesc, 0);
 
CLEANUP:
  // Descriptors are no longer needed as all GPU work was already 
  // enqueued.
   if (CtransformDesc) cublasLtMatrixLayoutDestroy(CtransformDesc);
   if (BtransformDesc) cublasLtMatrixLayoutDestroy(BtransformDesc);
   if (AtransformDesc) cublasLtMatrixLayoutDestroy(AtransformDesc);
   if (Cdesc) cublasLtMatrixLayoutDestroy(Cdesc);
   if (Bdesc) cublasLtMatrixLayoutDestroy(Bdesc);
   if (Adesc) cublasLtMatrixLayoutDestroy(Adesc);
   if (matmulDesc) cublasLtMatmulDescDestroy(matmulDesc);
   if (transformDesc) cublasLtMatrixTransformDescDestroy(transformDesc);
 
   // Wait until device is done before freeing transformed buffers
   cudaDeviceSynchronize();
   if (Ctransform) cudaFree(Ctransform);
   if (Btransform) cudaFree(Btransform);
   if (Atransform) cudaFree(Atransform);
 
   return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

Note that the original code forgets to define rowOrder; I assumed cublasLtOrder_t rowOrder = CUBLASLT_ORDER_ROW; and added that declaration above.
The error occurs on this line:

status = cublasLtMatrixTransform(
       ltHandle, transformDesc, &transformAlpha, B, Bdesc, &transformBeta, NULL, NULL, Btransform, BtransformDesc, 0);

And the error message is

Check failed: error == CUBLAS_STATUS_SUCCESS (7 vs. 0) : CUBLAS_STATUS_INVALID_VALUE

I’m running on a Tesla T4 GPU with compute capability 7.5, and the OS is Ubuntu 16.04.

I read a previous post https://devtalk.nvidia.com/default/topic/1049211/gpu-accelerated-libraries/where-can-i-find-working-examples-for-the-new-cublaslt-library-/2 concerning a ticket on cublasLtMatrixTransform; I wonder if that has been fixed yet.

Never mind, I found the problem: the code actually assumes m = n = k, which should have been noted in the documentation as well.
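
For anyone else who hits this: below is a minimal driver sketch (my own, not from the documentation) showing how I call LtIgemmTensor under that square-matrix assumption (m = n = k). The sizes, the all-ones test data, and the column-major leading dimensions are my own choices.

// Minimal driver sketch for the LtIgemmTensor example above.
// Assumes m == n == k, since that is what the example relies on.
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

int LtIgemmTensor(cublasLtHandle_t ltHandle, int m, int n, int k,
                  const int8_t *A, int lda, const int8_t *B, int ldb,
                  int32_t *C, int ldc); // defined above

int main() {
    const int m = 64, n = 64, k = 64;       // square problem, per the assumption above
    const int lda = m, ldb = k, ldc = m;    // column-major leading dimensions

    // All-ones host data so the expected result is easy to check.
    std::vector<int8_t> hA(lda * k, 1), hB(ldb * n, 1);
    std::vector<int32_t> hC(ldc * n, 0);

    int8_t *dA = NULL, *dB = NULL;
    int32_t *dC = NULL;
    cudaMalloc(reinterpret_cast<void**>(&dA), hA.size() * sizeof(int8_t));
    cudaMalloc(reinterpret_cast<void**>(&dB), hB.size() * sizeof(int8_t));
    cudaMalloc(reinterpret_cast<void**>(&dC), hC.size() * sizeof(int32_t));
    cudaMemcpy(dA, hA.data(), hA.size() * sizeof(int8_t), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), hB.size() * sizeof(int8_t), cudaMemcpyHostToDevice);

    cublasLtHandle_t ltHandle;
    cublasLtCreate(&ltHandle);

    int rc = LtIgemmTensor(ltHandle, m, n, k, dA, lda, dB, ldb, dC, ldc);

    cudaMemcpy(hC.data(), dC, hC.size() * sizeof(int32_t), cudaMemcpyDeviceToHost);
    // With all-ones inputs every entry of C should equal k (64 here).

    cublasLtDestroy(ltHandle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return rc;
}

Error checking is omitted here for brevity; in real code every CUDA and cuBLASLt call should be checked.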