Errors Occurred When Using CUDNN Matmul on V100

When I use cuDNN's matmul on a V100, I run into a problem: for certain shapes (M, N, K), cudnnBackendFinalize(plan) fails with CUDNN_STATUS_NOT_SUPPORTED. The following is a failing example with M=4096, N=32, K=1:

#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <vector>
#include <cuda_fp16.h>
#include <cuda.h>
#include <cudnn.h>


int main() {
  cudnnStatus_t status;
  cudnnHandle_t handle;
  cudnnCreate(&handle);
  cudnnDataType_t dtype = CUDNN_DATA_FLOAT;

  cudnnDataType_t dataType = CUDNN_DATA_HALF;
  // input
  const int batch_size = 1;
  const int M = 4096;
  const int N = 32;
  const int K = 1;
  std::cout << "batch_size: " << batch_size << std::endl;
  std::cout << "M: " << M << std::endl;
  std::cout << "N: " << N << std::endl;
  std::cout << "K: " << K << std::endl;
  std::cout << std::endl;
  int64_t alignment = 16;
  // dyMatrixTensor descriptor
  cudnnBackendDescriptor_t dy_desc;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &dy_desc);
  cudnnBackendSetAttribute(dy_desc,
                                    CUDNN_ATTR_TENSOR_DATA_TYPE,
                                    CUDNN_TYPE_DATA_TYPE,
                                    1,
                                    &dataType);

  int64_t dyDim[] = {batch_size, M, K};
  int64_t dyStr[] = {K*M, K, 1};
  int64_t dyUi = 'y';
  cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                          CUDNN_TYPE_INT64, 3, dyDim);
  cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_STRIDES,
                          CUDNN_TYPE_INT64, 3, dyStr);
  cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                          CUDNN_TYPE_INT64, 1, &dyUi);
  cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                          CUDNN_TYPE_INT64, 1, &alignment);
  status = cudnnBackendFinalize(dy_desc);
  if (status != CUDNN_STATUS_SUCCESS){
    std::cout << "dyMatrixTensor error" << std::endl;
    return -1;
  }
  // wMatrixTensor descriptor
  cudnnBackendDescriptor_t w_desc;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &w_desc);
  cudnnBackendSetAttribute(w_desc,
                                    CUDNN_ATTR_TENSOR_DATA_TYPE,
                                    CUDNN_TYPE_DATA_TYPE,
                                    1,
                                    &dataType);

  int64_t wDim[] = {batch_size, K, N};
  int64_t wStr[] = {K*N, N, 1};
  int64_t wUi = 'w';
  cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                          CUDNN_TYPE_INT64, 3, wDim);
  cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_STRIDES,
                          CUDNN_TYPE_INT64, 3, wStr);
  cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                          CUDNN_TYPE_INT64, 1, &wUi);
  cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                          CUDNN_TYPE_INT64, 1, &alignment);
  status = cudnnBackendFinalize(w_desc);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "wMatrixTensor error" << std::endl;
    return -1;
  }
  // resultMatrixTensor descriptor
  cudnnBackendDescriptor_t r_desc;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &r_desc);
  cudnnBackendSetAttribute(r_desc,
                                    CUDNN_ATTR_TENSOR_DATA_TYPE,
                                    CUDNN_TYPE_DATA_TYPE,
                                    1,
                                    &dataType);

  int64_t rDim[] = {batch_size, M, N};
  int64_t rStr[] = {N*M, N, 1};
  int64_t rUi = 'r';
  cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                          CUDNN_TYPE_INT64, 3, rDim);
  cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_STRIDES,
                          CUDNN_TYPE_INT64, 3, rStr);
  cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                          CUDNN_TYPE_INT64, 1, &rUi);
  cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                          CUDNN_TYPE_INT64, 1, &alignment);
  status = cudnnBackendFinalize(r_desc);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "resultMatrixTensor error" << std::endl;
    return -1;
  }
  // matmul descriptor
  cudnnBackendDescriptor_t matmul_Desc;

  cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &matmul_Desc);

  cudnnBackendSetAttribute(matmul_Desc,
                          CUDNN_ATTR_MATMUL_COMP_TYPE,
                          CUDNN_TYPE_DATA_TYPE,
                          1,
                          &dtype);

  status = cudnnBackendFinalize(matmul_Desc);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "matmul descriptor error" << std::endl;
    return -1;
  }
  // operater
  cudnnBackendDescriptor_t matmulOp;

  cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &matmulOp);

  cudnnBackendSetAttribute(matmulOp,
                CUDNN_ATTR_OPERATION_MATMUL_ADESC,
                CUDNN_TYPE_BACKEND_DESCRIPTOR,
                1,
                &dy_desc);

  cudnnBackendSetAttribute(matmulOp,
                CUDNN_ATTR_OPERATION_MATMUL_BDESC,
                CUDNN_TYPE_BACKEND_DESCRIPTOR,
                1,
                &w_desc);
  
  cudnnBackendSetAttribute(matmulOp,
                CUDNN_ATTR_OPERATION_MATMUL_CDESC,
                CUDNN_TYPE_BACKEND_DESCRIPTOR,
                1,
                &r_desc);
  cudnnBackendSetAttribute(matmulOp,
                CUDNN_ATTR_OPERATION_MATMUL_DESC,
                CUDNN_TYPE_BACKEND_DESCRIPTOR,
                1,
                &matmul_Desc);

  status = cudnnBackendFinalize(matmulOp);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "matmul operater error" << std::endl;
    return -1;
  }
  // op graph
  cudnnBackendDescriptor_t op_graph;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph);
  cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS,
                          CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &matmulOp);
  cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                          CUDNN_TYPE_HANDLE, 1, &handle);
  status = cudnnBackendFinalize(op_graph);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "op graph error" << std::endl;
    return -1;
  }
  // Create, set, and finalize an engine descriptor.
  cudnnBackendDescriptor_t engine;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine);
  cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH,
                          CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph);
  int64_t gidx = 0;
  cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX,
                          CUDNN_TYPE_INT64, 1, &gidx);
  status = cudnnBackendFinalize(engine);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "engine error" << std::endl;
    return -1;
  }
  // Create, set, and finalize an enginecfg descriptor.
  cudnnBackendDescriptor_t engcfg;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg);
  cudnnBackendSetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_ENGINE,
                          CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine);
  status = cudnnBackendFinalize(engcfg);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "engine cfg error" << std::endl;
    return -1;
  }
  // plan
  cudnnBackendDescriptor_t plan;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan);
  cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
                          CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg);
  cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);

  status = cudnnBackendFinalize(plan);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "Plan error  " << std::endl;
    return -1;
  }
    

  int64_t workspaceSize=100*1024*1024;
  // cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
  //                         CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);

  // Create, set and finalize a variant pack descriptor
  void  *dyData;
  cudaMalloc(
      &dyData, batch_size * M * K * sizeof(half));

  void  *wData;
  cudaMalloc(
      &wData, batch_size * K * N * sizeof(half));

  void *rData;
  cudaMalloc(
      &rData, batch_size * M * N * sizeof(half));


  void *dev_ptrs[3] = {rData, dyData, wData}; // device pointer
  int64_t uids[3] = {'r', 'y', 'w'};
  void *workspace;
  cudaMalloc(&workspace, workspaceSize);

  cudnnBackendDescriptor_t varpack;
  cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack);
  cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS,
                          CUDNN_TYPE_VOID_PTR, 3, dev_ptrs);
  cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS,
                          CUDNN_TYPE_INT64, 3, uids);
  cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE,
                          CUDNN_TYPE_VOID_PTR, 1, &workspace);
  status = cudnnBackendFinalize(varpack);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "Varpack error  " << std::endl;
    return -1;
  }
  // run
  status = cudnnBackendExecute(handle, plan, varpack);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cout << "Execute error "<< status << std::endl;
    return -1;
  }
  std::cout << "successful end " << std::endl;
  return 0;
}


The API documentation I followed is the cuDNN Backend API Reference on NVIDIA Docs. The cuDNN C++ frontend exhibits the same issue.

My cuDNN version is 8.8.0, and the same program runs normally when M=4096, N=32, K=1000. How can I get cuDNN matmul to run for shapes like K=1 on a V100?

Hi, @AakankshaS, Can you help take a look at this problem?

Hi, @840241309, did you find what was causing the error and if so, were you able to solve it? I am having the same exact one with cudnn 9.0.0.