CUDNN_STATUS_NOT_SUPPORTED when finalizing engine config for simple ReLU graph

I’m roughly following the “Setting Up An Operation Graph For A Grouped Convolution” example in order to set up a simple ReLU forward pass, but I’m getting CUDNN_STATUS_NOT_SUPPORTED when trying to finalize the engine config.

Here’s my code:

#include <cuda_runtime.h>
#include <cudnn.h>
#include <iostream>
#include <cstdio>

// Evaluates a cuDNN API call and aborts the process with a diagnostic
// (source line + cudnnGetErrorString) if it did not return
// CUDNN_STATUS_SUCCESS.
//
// Wrapped in do { ... } while(0) so the macro expands to a single
// statement: a bare { ... } block breaks when the macro is used as the
// `if` branch of an if/else (the caller's trailing `;` ends the `if`
// and orphans the `else`).
#define CHECK_CUDNN(expression) \
    do { \
        cudnnStatus_t status = (expression); \
        if(status != CUDNN_STATUS_SUCCESS) \
        { \
            std::cerr << "Error on line " << __LINE__ << ": " \
                << cudnnGetErrorString(status) << std::endl; \
            std::exit(EXIT_FAILURE); \
        } \
    } while(0)

// Creates and finalizes a rank-2, row-major, FP32 cuDNN backend tensor
// descriptor of shape (dim0, dim1) with 16-byte alignment.
//
// Parameters:
//   dim0, dim1     - tensor dimensions; strides are {dim1, 1} (packed rows).
//   tensor_count   - in/out counter used to assign a unique UID to each
//                    tensor; incremented once per call.
//   desc           - out: the finalized CUDNN_BACKEND_TENSOR_DESCRIPTOR.
//
// NOTE(review): the cuDNN graph API's support surface for pointwise (and
// most other) operations generally requires tensors of rank >= 3 — a
// rank-2 tensor here is the likely reason engine-config finalization later
// fails with CUDNN_STATUS_NOT_SUPPORTED. Consider creating (1, dim0, dim1)
// or (1, 1, dim0, dim1) shaped tensors instead; confirm against the
// "Support Surface" section of the cuDNN Graph API documentation.
void tensor_2d_create(
    int64_t dim0, int64_t dim1, int64_t* tensor_count, cudnnBackendDescriptor_t* desc
){
    int64_t n_dims = 2;
    int64_t shape[] = {dim0, dim1};
    int64_t strides[] = {dim1, 1};
    int64_t alignment = 16;
    // UID must be unique across all tensors in one operation graph;
    // post-increment hands out 0, 1, 2, ... (stray ";;" removed).
    int64_t uid = (*tensor_count)++;

    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, desc));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        *desc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &data_type
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        *desc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, n_dims, shape
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        *desc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, n_dims, strides
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        *desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        *desc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &uid
    ));
    // Descriptor is immutable after finalize; all attributes must be set first.
    CHECK_CUDNN(cudnnBackendFinalize(*desc));
}

// Repro: build a single-op (pointwise ReLU forward) cuDNN backend graph,
// then walk engine -> engine config. Fails at cudnnBackendFinalize(engine_cfg)
// with CUDNN_STATUS_NOT_SUPPORTED.
int main()
{
    cudnnHandle_t cudnn;
    CHECK_CUDNN(cudnnCreate(&cudnn));
    printf("Initialized cuDNN\n");
    printf("cuDNN version: %zu\n", cudnnGetVersion());

    // Rank-2 (1, 32) input/output tensors with UIDs 0 and 1.
    // NOTE(review): the cuDNN graph-API support surface for pointwise ops
    // appears to require rank >= 3 tensors — this is the most likely cause
    // of the NOT_SUPPORTED below (the thread's reply suggests 3D tensors);
    // confirm against the Graph API "Support Surface" docs.
    int64_t tensor_count = 0;
    cudnnBackendDescriptor_t input_desc;
    tensor_2d_create(1, 32, &tensor_count, &input_desc);
    
    cudnnBackendDescriptor_t output_desc;
    tensor_2d_create(1, 32, &tensor_count, &output_desc);

    // Pointwise descriptor: ReLU forward, FP32 compute precision.
    cudnnPointwiseMode_t act_mode = CUDNN_POINTWISE_RELU_FWD;
    cudnnDataType_t act_data_type = CUDNN_DATA_FLOAT;
    cudnnBackendDescriptor_t relu_desc;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &relu_desc));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        relu_desc, CUDNN_ATTR_POINTWISE_MODE, CUDNN_TYPE_POINTWISE_MODE, 1, &act_mode
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        relu_desc, CUDNN_ATTR_POINTWISE_MATH_PREC, CUDNN_TYPE_DATA_TYPE, 1, &act_data_type
    ));
    CHECK_CUDNN(cudnnBackendFinalize(relu_desc));

    // Pointwise *operation*: binds the ReLU descriptor to X (input) and
    // Y (output) tensor descriptors.
    cudnnBackendDescriptor_t relu_op_desc;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, &relu_op_desc));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        relu_op_desc, CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &relu_desc
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        relu_op_desc, CUDNN_ATTR_OPERATION_POINTWISE_XDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &input_desc
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        relu_op_desc, CUDNN_ATTR_OPERATION_POINTWISE_YDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &output_desc
    ));
    CHECK_CUDNN(cudnnBackendFinalize(relu_op_desc));
    // NOTE(review): "%ld" for int64_t is not portable; PRId64 from
    // <cinttypes> would be — fine on LP64 Linux, though.
    printf("Final tensor_count: %ld\n", tensor_count);

    // Create op graph.
    // Graph holds the single ReLU op and is bound to the handle.
    cudnnBackendDescriptor_t op_graph;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
    CHECK_CUDN(cudnnBackendSetAttribute(
        op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &relu_op_desc
    ));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE, CUDNN_TYPE_HANDLE, 1, &cudnn
    ));
    CHECK_CUDNN(cudnnBackendFinalize(op_graph));
    printf("Created graph\n");

    // Create engine.
    // NOTE(review): hard-coding global index 0 picks an arbitrary engine;
    // even with valid tensors, engine 0 may not support this graph.
    // Iterating indices (CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT) or
    // using the heuristics descriptor is the usual approach — TODO confirm.
    cudnnBackendDescriptor_t engine;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph
    ));
    int64_t gidx = 0;
    CHECK_CUDNN(cudnnBackendSetAttribute(
        engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &gidx
    ));
    CHECK_CUDNN(cudnnBackendFinalize(engine));
    printf("Created engine\n");

    // Create engine config.
    cudnnBackendDescriptor_t engine_cfg;
    CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engine_cfg));
    CHECK_CUDNN(cudnnBackendSetAttribute(
        engine_cfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine
    ));
    // Error here.
    CHECK_CUDNN(cudnnBackendFinalize(engine_cfg));
    int64_t workspace_size;
    // NULL elementCount is allowed: requested count is 1, actual count not needed.
    CHECK_CUDNN(cudnnBackendGetAttribute(
        engine_cfg, CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspace_size
    ));
    printf("Created engine config\n");

    // NOTE(review): none of the backend descriptors created above are
    // released with cudnnBackendDestroyDescriptor — fine for a repro,
    // a leak in real code.
    CHECK_CUDNN(cudnnDestroy(cudnn));
}
# CMAKE_CUDA_ARCHITECTURES requires CMake >= 3.18; the previous minimum of
# 3.10 silently ignored it on older CMake versions.
cmake_minimum_required(VERSION 3.18)
project(mlp CUDA CXX)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Drives the -arch/-gencode flags; no need to also pass -arch=sm_86 manually.
set(CMAKE_CUDA_ARCHITECTURES "86")

# project(... CUDA ...) above already enables first-class CUDA language
# support; the deprecated FindCUDA module (find_package(CUDA),
# cuda_add_executable, CUDA_NVCC_FLAGS) conflicts with it and is removed.
add_executable(mlp_cudnn_iso ./src/mlp_cudnn_iso.cu)
target_compile_options(mlp_cudnn_iso PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
set_target_properties(mlp_cudnn_iso PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_include_directories(mlp_cudnn_iso PRIVATE ./src/)
target_link_libraries(mlp_cudnn_iso cuda cudart cudnn cublas)

And here’s the logs with CUDNN_LOGLEVEL_DBG=2:

Initialized cuDNN
cuDNN version: 90300
Final tensor_count: 2
Created graph
Created engine

W! CuDNN (v90300 75) function cudnnBackendFinalize() called:
w!         Warning: CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH; Reason: fortBackend.is_cutlass_sm7x() && ((this->getDeviceProp()->deviceVer < 700) || (this->getDeviceProp()->deviceVer > 800))
w!         Warning: CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH; Reason: init_kernelgen_backend()
w!         Warning: CUDNN_STATUS_NOT_SUPPORTED; Reason: ptr->isSupported()
w!         Warning: CUDNN_STATUS_NOT_SUPPORTED; Reason: finalize_internal()
w!         Warning: CUDNN_STATUS_NOT_SUPPORTED; Reason: ptrDesc->finalize()
w! Time: 2024-11-01T13:17:40.153604 (0d+0h+0m+1s since start)
w! Process=3161219; Thread=3161219; GPU=NULL; Handle=NULL; StreamId=NULL.

Error on line 117: CUDNN_STATUS_NOT_SUPPORTED

My CUDA version is 12.4, my cuDNN version is 9.3, and I’m running this on a 3070 Ti.

The logs seem to indicate that the problem is related to the target architecture but I’m not sure how to interpret it. Any help would be appreciated.

Hi,
Checking this with Engineering and will update the thread.
Thanks

Hi,
Try a 3D tensor instead of 2D tensors.

Maybe check the documentation on the Support Surface and the constraints for certain nodes/operations: Graph API — NVIDIA cuDNN