Reduction Max Operation Execution Plan Fails Stating Reduction Isn't Over Columns or Rows

I am implementing a reduction operator that performs a max reduction over the last dimension (W) using cuDNN 9.1.1. The tensors are described with NCHW shapes but use a packed NHWC memory layout, i.e. the strides are those of a packed NHWC buffer expressed against the NCHW shape. My Y tensor has shape [x_shape[0], x_shape[1], x_shape[2], 1]; however, when finalizing execution plans with this operation I get the warning

CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN; Reason: !(reduction_op->isColumnReduction(currBackboneOpType) || reduction_op->isRowReduction(currBackboneOpType) || reduction_op->isAllReduction())

for all four engine configurations provided by the heuristic.
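To be explicit about the layout described above, here is a small sketch of the stride convention (the helper nhwc_packed_strides is only for illustration and is not part of the implementation below): for an NCHW shape [N, C, H, W], a packed NHWC buffer has strides [H*W*C, 1, W*C, C].

#include <cstdint>
#include <vector>

// Strides of a packed NHWC buffer, expressed against an NCHW shape.
// C is the fastest-moving dimension in memory.
std::vector<int64_t> nhwc_packed_strides(const std::vector<int64_t> &nchw_shape) {
    const int64_t c = nchw_shape[1];
    const int64_t h = nchw_shape[2];
    const int64_t w = nchw_shape[3];
    return {h * w * c, 1, w * c, c}; // strides for {N, C, H, W}
}

// nhwc_packed_strides({2, 4, 10, 10}) -> {400, 1, 40, 4}  (X below)
// nhwc_packed_strides({2, 4, 10, 1})  -> {40, 1, 4, 4}    (Y below)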

The graph I am implementing is a single reduction operation, which matches the fifth graph pattern listed here.

I have pasted the implementation below. Additionally, I found that if I reduce over the channel dimension, or over all dimensions (y_shape = [1,1,1,1]), the warning does not occur. I am not sure why reducing over the channel dimension succeeds, since a channel reduction is not one of the cases listed in the warning, but I thought it might be useful to include.

What is incorrect with this implementation? I believe I am following the specifications outlined in the documentation and am running on supported hardware.

#include <cudnn.h>
#include <cstdlib> // std::exit
#include <iostream>
#include <vector>

// Evaluate the cuDNN call exactly once so a failing call is not re-executed
// while building the error message.
#define CUDNN_CHECK(expr)                                                      \
    do {                                                                       \
        cudnnStatus_t check_status = (expr);                                   \
        if (check_status != CUDNN_STATUS_SUCCESS) {                            \
            std::cerr << "cuDNN error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudnnGetErrorString(check_status)             \
                      << std::endl;                                            \
            char error_message[256];                                           \
            cudnnGetLastErrorString(error_message, sizeof(error_message));     \
            std::cerr << "reason: " << error_message << std::endl;             \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

cudnnBackendDescriptor_t tensor_descriptor(const std::vector<int64_t> &shape,
                                           const std::vector<int64_t> &strides,
                                           int64_t id,
                                           cudnnDataType_t data_type,
                                           int64_t byte_alignment,
                                           bool is_virtual) {
    cudnnBackendDescriptor_t desc;
    CUDNN_CHECK(
        cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc));
    CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                                         CUDNN_TYPE_INT64, 1, &id));
    CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DATA_TYPE,
                                         CUDNN_TYPE_DATA_TYPE, 1, &data_type));
    CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                                         CUDNN_TYPE_INT64, 1, &byte_alignment));
    CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                                         CUDNN_TYPE_INT64,
                                         (int64_t)shape.size(), shape.data()));
    CUDNN_CHECK(cudnnBackendSetAttribute(
        desc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64,
        (int64_t)strides.size(), strides.data()));
    CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_IS_VIRTUAL,
                                         CUDNN_TYPE_BOOLEAN, 1, &is_virtual));
    CUDNN_CHECK(cudnnBackendFinalize(desc));
    return desc;
}

int main() {
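    // X: NCHW shape [2, 4, 10, 10] with packed NHWC strides
    // (C is the fastest-moving dimension in memory).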
    std::vector<int64_t> shape_x = {2, 4, 10, 10};
    std::vector<int64_t> stride_x = {400, 1, 40, 4};

    // std::vector<int64_t> shape_y = {1, 4, 10, 1}; // fails
    // std::vector<int64_t> shape_y = {2, 1, 10, 10}; // success
    // std::vector<int64_t> shape_y = {2, 4, 1, 10}; // fails
    std::vector<int64_t> shape_y = {2, 4, 10, 1}; // fails
    // std::vector<int64_t> shape_y = {2, 4, 1, 1}; // fails
    // std::vector<int64_t> shape_y = {1, 1, 1, 1}; // success
    std::vector<int64_t> stride_y = {40, 1, 4, 4};

    cudnnHandle_t handle;
    CUDNN_CHECK(cudnnCreate(&handle));
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
    int64_t data_type_byte_alignment = 4;

    cudnnBackendDescriptor_t x = tensor_descriptor(
        shape_x, stride_x, 'x', data_type, data_type_byte_alignment, false);
    cudnnBackendDescriptor_t y = tensor_descriptor(
        shape_y, stride_y, 'y', data_type, data_type_byte_alignment, false);

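    // Reduction descriptor: MAX reduction with FP32 compute type.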
    cudnnBackendDescriptor_t reduce_max;
    CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
                                             &reduce_max));
    cudnnReduceTensorOp_t max_mode = CUDNN_REDUCE_TENSOR_MAX;
    CUDNN_CHECK(cudnnBackendSetAttribute(
        reduce_max, CUDNN_ATTR_REDUCTION_OPERATOR,
        CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, 1, &max_mode));
    CUDNN_CHECK(cudnnBackendSetAttribute(reduce_max,
                                         CUDNN_ATTR_REDUCTION_COMP_TYPE,
                                         CUDNN_TYPE_DATA_TYPE, 1, &data_type));
    CUDNN_CHECK(cudnnBackendFinalize(reduce_max));
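    // Reduction operation node: reduces X into Y using the descriptor above.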
    cudnnBackendDescriptor_t op_reduce_max;
    CUDNN_CHECK(cudnnBackendCreateDescriptor(
        CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, &op_reduce_max));
    CUDNN_CHECK(cudnnBackendSetAttribute(
        op_reduce_max, CUDNN_ATTR_OPERATION_REDUCTION_DESC,
        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &reduce_max));
    CUDNN_CHECK(cudnnBackendSetAttribute(op_reduce_max,
                                         CUDNN_ATTR_OPERATION_REDUCTION_XDESC,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &x));
    CUDNN_CHECK(cudnnBackendSetAttribute(op_reduce_max,
                                         CUDNN_ATTR_OPERATION_REDUCTION_YDESC,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &y));
    CUDNN_CHECK(cudnnBackendFinalize(op_reduce_max));

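    // Operation graph containing the single reduction node.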
    cudnnBackendDescriptor_t op_graph;
    cudnnBackendDescriptor_t ops[] = {op_reduce_max};
    CUDNN_CHECK(cudnnBackendCreateDescriptor(
        CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
    CUDNN_CHECK(cudnnBackendSetAttribute(
        op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
        sizeof(ops) / sizeof(ops[0]), ops));
    CUDNN_CHECK(cudnnBackendSetAttribute(op_graph,
                                         CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                                         CUDNN_TYPE_HANDLE, 1, &handle));
    CUDNN_CHECK(cudnnBackendFinalize(op_graph));

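    // Query heuristic mode A for candidate engine configurations.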
    cudnnBackendDescriptor_t heur_desc;
    CUDNN_CHECK(cudnnBackendCreateDescriptor(
        CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &heur_desc));
    CUDNN_CHECK(cudnnBackendSetAttribute(
        heur_desc, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph));
    cudnnBackendHeurMode_t heur_mode = CUDNN_HEUR_MODE_A;
    CUDNN_CHECK(cudnnBackendSetAttribute(heur_desc, CUDNN_ATTR_ENGINEHEUR_MODE,
                                         CUDNN_TYPE_HEUR_MODE, 1, &heur_mode));
    CUDNN_CHECK(cudnnBackendFinalize(heur_desc));
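    // Two-step query: first get the number of engine configs, then fetch them
    // into pre-created ENGINECFG descriptors.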
    int64_t count = 0;
    CUDNN_CHECK(cudnnBackendGetAttribute(
        heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
        0, &count, NULL));
    std::cout << "engines: " << count << "\n";
    std::vector<cudnnBackendDescriptor_t> eng_cfgs(count);
    for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
        CUDNN_CHECK(cudnnBackendCreateDescriptor(
            CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &cfg));
    }
    CUDNN_CHECK(cudnnBackendGetAttribute(
        heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
        count, nullptr, eng_cfgs.data()));

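    // Try to finalize an execution plan for each candidate config; this is
    // where the NOT_SUPPORTED warning quoted above is reported.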
    for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
        cudnnBackendDescriptor_t exec_plan;
        CUDNN_CHECK(cudnnBackendCreateDescriptor(
            CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &exec_plan));
        CUDNN_CHECK(cudnnBackendSetAttribute(
            exec_plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
            CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cfg));
        CUDNN_CHECK(cudnnBackendSetAttribute(exec_plan,
                                             CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
                                             CUDNN_TYPE_HANDLE, 1, &handle));
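        // Retrieve the engine behind this config so it can be identified.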
        cudnnBackendDescriptor_t engine;
        CUDNN_CHECK(cudnnBackendCreateDescriptor(
            CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine));
        CUDNN_CHECK(cudnnBackendGetAttribute(cfg, CUDNN_ATTR_ENGINECFG_ENGINE,
                                             CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
                                             nullptr, &engine));
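        // Print the engine's global index to identify which engines the
        // heuristic proposed (debugging aid added for this post; uses the
        // backend attribute CUDNN_ATTR_ENGINE_GLOBAL_INDEX).
        int64_t engine_index = -1;
        CUDNN_CHECK(cudnnBackendGetAttribute(engine,
                                             CUDNN_ATTR_ENGINE_GLOBAL_INDEX,
                                             CUDNN_TYPE_INT64, 1, nullptr,
                                             &engine_index));
        std::cout << "engine global index: " << engine_index << "\n";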
        cudnnStatus_t status = cudnnBackendFinalize(exec_plan);
        std::cout << cudnnGetErrorString(status) << "\n";
        if (status == CUDNN_STATUS_SUCCESS) {
            std::cout << "success\n";
        } else {
            char error_message[256];
            cudnnGetLastErrorString(error_message, sizeof(error_message));
            std::cout << "reason: " << error_message << std::endl;
        }
    }

    // To be filled in: variant pack setup and cudnnBackendExecute
    // (never reached here, since plan finalization already fails).

    return 0;
}