When I use cuDNN's backend matmul on a V100, I run into a problem: for some shapes (M, N, K), cudnnBackendFinalize(plan) returns CUDNN_STATUS_NOT_SUPPORTED. The following is a reproducing example with M=4096, N=32, K=1:
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <vector>
#include <cuda_fp16.h>
#include <cuda.h>
#include <cudnn.h>
int main() {
cudnnStatus_t status;
cudnnHandle_t handle;
cudnnCreate(&handle);
cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
cudnnDataType_t dataType = CUDNN_DATA_HALF;
// input
const int batch_size = 1;
const int M = 4096;
const int N = 32;
const int K = 1;
std::cout << "batch_size: " << batch_size << std::endl;
std::cout << "M: " << M << std::endl;
std::cout << "N: " << N << std::endl;
std::cout << "K: " << K << std::endl;
std::cout << std::endl;
int64_t alignment = 16;
// dyMatrixTensor descriptor
cudnnBackendDescriptor_t dy_desc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &dy_desc);
cudnnBackendSetAttribute(dy_desc,
CUDNN_ATTR_TENSOR_DATA_TYPE,
CUDNN_TYPE_DATA_TYPE,
1,
&dataType);
int64_t dyDim[] = {batch_size, M, K};
int64_t dyStr[] = {K*M, K, 1};
int64_t dyUi = 'y';
cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
CUDNN_TYPE_INT64, 3, dyDim);
cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_STRIDES,
CUDNN_TYPE_INT64, 3, dyStr);
cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
CUDNN_TYPE_INT64, 1, &dyUi);
cudnnBackendSetAttribute(dy_desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
CUDNN_TYPE_INT64, 1, &alignment);
status = cudnnBackendFinalize(dy_desc);
if (status != CUDNN_STATUS_SUCCESS){
std::cout << "dyMatrixTensor error" << std::endl;
return -1;
}
// wMatrixTensor descriptor
cudnnBackendDescriptor_t w_desc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &w_desc);
cudnnBackendSetAttribute(w_desc,
CUDNN_ATTR_TENSOR_DATA_TYPE,
CUDNN_TYPE_DATA_TYPE,
1,
&dataType);
int64_t wDim[] = {batch_size, K, N};
int64_t wStr[] = {K*N, N, 1};
int64_t wUi = 'w';
cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
CUDNN_TYPE_INT64, 3, wDim);
cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_STRIDES,
CUDNN_TYPE_INT64, 3, wStr);
cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
CUDNN_TYPE_INT64, 1, &wUi);
cudnnBackendSetAttribute(w_desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
CUDNN_TYPE_INT64, 1, &alignment);
status = cudnnBackendFinalize(w_desc);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "wMatrixTensor error" << std::endl;
return -1;
}
// resultMatrixTensor descriptor
cudnnBackendDescriptor_t r_desc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &r_desc);
cudnnBackendSetAttribute(r_desc,
CUDNN_ATTR_TENSOR_DATA_TYPE,
CUDNN_TYPE_DATA_TYPE,
1,
&dataType);
int64_t rDim[] = {batch_size, M, N};
int64_t rStr[] = {N*M, N, 1};
int64_t rUi = 'r';
cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
CUDNN_TYPE_INT64, 3, rDim);
cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_STRIDES,
CUDNN_TYPE_INT64, 3, rStr);
cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
CUDNN_TYPE_INT64, 1, &rUi);
cudnnBackendSetAttribute(r_desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
CUDNN_TYPE_INT64, 1, &alignment);
status = cudnnBackendFinalize(r_desc);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "resultMatrixTensor error" << std::endl;
return -1;
}
// matmul descriptor
cudnnBackendDescriptor_t matmul_Desc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &matmul_Desc);
cudnnBackendSetAttribute(matmul_Desc,
CUDNN_ATTR_MATMUL_COMP_TYPE,
CUDNN_TYPE_DATA_TYPE,
1,
&dtype);
status = cudnnBackendFinalize(matmul_Desc);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "matmul descriptor error" << std::endl;
return -1;
}
// operater
cudnnBackendDescriptor_t matmulOp;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &matmulOp);
cudnnBackendSetAttribute(matmulOp,
CUDNN_ATTR_OPERATION_MATMUL_ADESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR,
1,
&dy_desc);
cudnnBackendSetAttribute(matmulOp,
CUDNN_ATTR_OPERATION_MATMUL_BDESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR,
1,
&w_desc);
cudnnBackendSetAttribute(matmulOp,
CUDNN_ATTR_OPERATION_MATMUL_CDESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR,
1,
&r_desc);
cudnnBackendSetAttribute(matmulOp,
CUDNN_ATTR_OPERATION_MATMUL_DESC,
CUDNN_TYPE_BACKEND_DESCRIPTOR,
1,
&matmul_Desc);
status = cudnnBackendFinalize(matmulOp);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "matmul operater error" << std::endl;
return -1;
}
// op graph
cudnnBackendDescriptor_t op_graph;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph);
cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &matmulOp);
cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
CUDNN_TYPE_HANDLE, 1, &handle);
status = cudnnBackendFinalize(op_graph);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "op graph error" << std::endl;
return -1;
}
// Create, set, and finalize an engine descriptor.
cudnnBackendDescriptor_t engine;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine);
cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph);
int64_t gidx = 0;
cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX,
CUDNN_TYPE_INT64, 1, &gidx);
status = cudnnBackendFinalize(engine);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "engine error" << std::endl;
return -1;
}
// Create, set, and finalize an enginecfg descriptor.
cudnnBackendDescriptor_t engcfg;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg);
cudnnBackendSetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_ENGINE,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine);
status = cudnnBackendFinalize(engcfg);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "engine cfg error" << std::endl;
return -1;
}
// plan
cudnnBackendDescriptor_t plan;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan);
cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg);
cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);
status = cudnnBackendFinalize(plan);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "Plan error " << std::endl;
return -1;
}
int64_t workspaceSize=100*1024*1024;
// cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
// CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);
// Create, set and finalize a variant pack descriptor
void *dyData;
cudaMalloc(
&dyData, batch_size * M * K * sizeof(half));
void *wData;
cudaMalloc(
&wData, batch_size * K * N * sizeof(half));
void *rData;
cudaMalloc(
&rData, batch_size * M * N * sizeof(half));
void *dev_ptrs[3] = {rData, dyData, wData}; // device pointer
int64_t uids[3] = {'r', 'y', 'w'};
void *workspace;
cudaMalloc(&workspace, workspaceSize);
cudnnBackendDescriptor_t varpack;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS,
CUDNN_TYPE_VOID_PTR, 3, dev_ptrs);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS,
CUDNN_TYPE_INT64, 3, uids);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE,
CUDNN_TYPE_VOID_PTR, 1, &workspace);
status = cudnnBackendFinalize(varpack);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "Varpack error " << std::endl;
return -1;
}
// run
status = cudnnBackendExecute(handle, plan, varpack);
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "Execute error "<< status << std::endl;
return -1;
}
std::cout << "successful end " << std::endl;
return 0;
}
The API documentation I followed is the cuDNN Backend API Reference (NVIDIA Docs), and the cuDNN C++ frontend exhibits the same issue.
My cuDNN version is 8.8.0, and the same program runs normally when M=4096, N=32, K=1000. How can I use cuDNN to run matmul reliably on a V100?