I am implementing a reduction operator that performs a max reduction over the last dimension (W) using cuDNN 9.1.1. The tensors I am working with have NCHW shapes but a packed NHWC memory layout; that is, the strides are those of a packed NHWC tensor, expressed in NCHW dimension order. My Y tensor has shape [x_shape[0], x_shape[1], x_shape[2], 1].
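For concreteness, this is how I derive the strides (a minimal sketch; nhwc_strides is just an illustrative helper for this post and does not appear in the repro below):

#include <cstdint>
#include <vector>

// Illustrative helper (not part of the repro): given an NCHW shape
// {N, C, H, W}, return the strides of a packed NHWC layout expressed in
// NCHW order:
//   stride[C] = 1, stride[W] = C, stride[H] = W * C, stride[N] = H * W * C.
std::vector<int64_t> nhwc_strides(const std::vector<int64_t> &nchw_shape) {
  const int64_t c = nchw_shape[1], h = nchw_shape[2], w = nchw_shape[3];
  return {h * w * c, 1, w * c, c};  // {N, C, H, W} order
}

For example, nhwc_strides({2, 4, 10, 10}) gives {400, 1, 40, 4}, matching stride_x in the repro, and nhwc_strides({2, 4, 10, 1}) gives {40, 1, 4, 4}, matching stride_y.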
However, when finalizing execution plans for this operation, all four engine configurations returned by the heuristic fail with the warning

CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN; Reason: !(reduction_op->isColumnReduction(currBackboneOpType) || reduction_op->isRowReduction(currBackboneOpType) || reduction_op->isAllReduction())
The graph I am implementing is a single reduction operation, which follows the fifth graph pattern listed here.
I have pasted the implementation below. Additionally, I found that the warning does not occur if I reduce over the channel dimension or over all dimensions (y_shape = [1,1,1,1]). I am not sure why reducing over the channel dimension succeeds, since that case does not match any of the patterns named in the warning, but I thought it might be useful to include.
What is incorrect in this implementation? I believe I am following the specifications outlined in the documentation and am running on supported hardware.
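For reference, the computation I am trying to express is equivalent to this plain CPU loop (a sketch only, using the packed-NHWC indexing described above; reduce_max_w is not part of the repro):

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch of the intended semantics: max over W.
// x is packed NHWC with NCHW shape {n, c, h, w}; y has NCHW shape
// {n, c, h, 1}, also packed NHWC.
void reduce_max_w(const std::vector<float> &x, std::vector<float> &y,
                  int64_t n, int64_t c, int64_t h, int64_t w) {
  for (int64_t in = 0; in < n; ++in)
    for (int64_t ih = 0; ih < h; ++ih)
      for (int64_t ic = 0; ic < c; ++ic) {
        // Packed NHWC index: ((in * h + ih) * w + iw) * c + ic.
        float m = x[((in * h + ih) * w + 0) * c + ic];
        for (int64_t iw = 1; iw < w; ++iw)
          m = std::max(m, x[((in * h + ih) * w + iw) * c + ic]);
        y[(in * h + ih) * c + ic] = m;
      }
}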
#include <cudnn.h>
#include <cstdlib>  // std::exit
#include <iostream>
#include <vector>
#define CUDNN_CHECK(status) \
  { \
    if (status != CUDNN_STATUS_SUCCESS) { \
      std::cerr << "cuDNN error at " << __FILE__ << ":" << __LINE__ \
                << ": " << cudnnGetErrorString(status) << std::endl; \
      char error_message[256]; \
      cudnnGetLastErrorString(error_message, sizeof(error_message)); \
      std::cerr << "reason: " << error_message << std::endl; \
      std::exit(EXIT_FAILURE); \
    } \
  }
// Creates and finalizes a backend tensor descriptor with the given shape,
// strides, unique id, data type, byte alignment, and virtualness.
cudnnBackendDescriptor_t tensor_descriptor(const std::vector<int64_t> &shape,
                                           const std::vector<int64_t> &strides,
                                           int64_t id,
                                           cudnnDataType_t data_type,
                                           int64_t byte_alignment,
                                           bool is_virtual) {
  cudnnBackendDescriptor_t desc;
  CUDNN_CHECK(
      cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                                       CUDNN_TYPE_INT64, 1, &id));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DATA_TYPE,
                                       CUDNN_TYPE_DATA_TYPE, 1, &data_type));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                                       CUDNN_TYPE_INT64, 1, &byte_alignment));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                                       CUDNN_TYPE_INT64,
                                       (int64_t)shape.size(), shape.data()));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_STRIDES,
                                       CUDNN_TYPE_INT64,
                                       (int64_t)strides.size(), strides.data()));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_IS_VIRTUAL,
                                       CUDNN_TYPE_BOOLEAN, 1, &is_virtual));
  CUDNN_CHECK(cudnnBackendFinalize(desc));
  return desc;
}
int main() {
  // X: NCHW shape with packed NHWC strides ({H*W*C, 1, W*C, C}).
  std::vector<int64_t> shape_x = {2, 4, 10, 10};
  std::vector<int64_t> stride_x = {400, 1, 40, 4};
  // Output shapes tried so far:
  // std::vector<int64_t> shape_y = {1, 4, 10, 1}; // fails
  // std::vector<int64_t> shape_y = {2, 1, 10, 10}; // success
  // std::vector<int64_t> shape_y = {2, 4, 1, 10}; // fails
  std::vector<int64_t> shape_y = {2, 4, 10, 1}; // fails
  // std::vector<int64_t> shape_y = {2, 4, 1, 1}; // fails
  // std::vector<int64_t> shape_y = {1, 1, 1, 1}; // success
  std::vector<int64_t> stride_y = {40, 1, 4, 4};

  cudnnHandle_t handle;
  CUDNN_CHECK(cudnnCreate(&handle));

  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
  int64_t data_type_byte_alignment = 4;
  cudnnBackendDescriptor_t x = tensor_descriptor(
      shape_x, stride_x, 'x', data_type, data_type_byte_alignment, false);
  cudnnBackendDescriptor_t y = tensor_descriptor(
      shape_y, stride_y, 'y', data_type, data_type_byte_alignment, false);
  // Reduction descriptor: max reduction with float compute type.
  cudnnBackendDescriptor_t reduce_max;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
                                           &reduce_max));
  cudnnReduceTensorOp_t max_mode = CUDNN_REDUCE_TENSOR_MAX;
  CUDNN_CHECK(cudnnBackendSetAttribute(
      reduce_max, CUDNN_ATTR_REDUCTION_OPERATOR,
      CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, 1, &max_mode));
  CUDNN_CHECK(cudnnBackendSetAttribute(reduce_max,
                                       CUDNN_ATTR_REDUCTION_COMP_TYPE,
                                       CUDNN_TYPE_DATA_TYPE, 1, &data_type));
  CUDNN_CHECK(cudnnBackendFinalize(reduce_max));

  // Reduction operation node: Y = max-reduce(X) over the dimensions where
  // shape_y is 1 and shape_x is not.
  cudnnBackendDescriptor_t op_reduce_max;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, &op_reduce_max));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_reduce_max, CUDNN_ATTR_OPERATION_REDUCTION_DESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &reduce_max));
  CUDNN_CHECK(cudnnBackendSetAttribute(op_reduce_max,
                                       CUDNN_ATTR_OPERATION_REDUCTION_XDESC,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &x));
  CUDNN_CHECK(cudnnBackendSetAttribute(op_reduce_max,
                                       CUDNN_ATTR_OPERATION_REDUCTION_YDESC,
                                       CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &y));
  CUDNN_CHECK(cudnnBackendFinalize(op_reduce_max));
  // Operation graph containing the single reduction node.
  cudnnBackendDescriptor_t op_graph;
  cudnnBackendDescriptor_t ops[] = {op_reduce_max};
  CUDNN_CHECK(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      sizeof(ops) / sizeof(ops[0]), ops));
  CUDNN_CHECK(cudnnBackendSetAttribute(op_graph,
                                       CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                                       CUDNN_TYPE_HANDLE, 1, &handle));
  CUDNN_CHECK(cudnnBackendFinalize(op_graph));
  // Query heuristic mode A for engine configs that can run the graph.
  cudnnBackendDescriptor_t heur_desc;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
                                           &heur_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      heur_desc, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph));
  cudnnBackendHeurMode_t heur_mode = CUDNN_HEUR_MODE_A;
  CUDNN_CHECK(cudnnBackendSetAttribute(heur_desc, CUDNN_ATTR_ENGINEHEUR_MODE,
                                       CUDNN_TYPE_HEUR_MODE, 1, &heur_mode));
  CUDNN_CHECK(cudnnBackendFinalize(heur_desc));

  // First query the number of results, then fetch that many engine configs.
  int64_t count = 0;
  CUDNN_CHECK(cudnnBackendGetAttribute(
      heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      0, &count, NULL));
  std::cout << "engines: " << count << "\n";
  std::vector<cudnnBackendDescriptor_t> eng_cfgs(count);
  for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
    CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
                                             &cfg));
  }
  CUDNN_CHECK(cudnnBackendGetAttribute(
      heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      count, nullptr, eng_cfgs.data()));
  // Try to finalize an execution plan for each engine config.
  for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
    cudnnBackendDescriptor_t exec_plan;
    CUDNN_CHECK(cudnnBackendCreateDescriptor(
        CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &exec_plan));
    CUDNN_CHECK(cudnnBackendSetAttribute(
        exec_plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cfg));
    CUDNN_CHECK(cudnnBackendSetAttribute(exec_plan,
                                         CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
                                         CUDNN_TYPE_HANDLE, 1, &handle));
    // Retrieve the engine backing this config (for inspection only; it is
    // not used further below).
    cudnnBackendDescriptor_t engine;
    CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR,
                                             &engine));
    CUDNN_CHECK(cudnnBackendGetAttribute(cfg, CUDNN_ATTR_ENGINECFG_ENGINE,
                                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
                                         nullptr, &engine));
    cudnnStatus_t status = cudnnBackendFinalize(exec_plan);
    std::cout << cudnnGetErrorString(status) << "\n";
    if (status == CUDNN_STATUS_SUCCESS) {
      std::cout << "success\n";
    } else {
      char error_message[256];
      cudnnGetLastErrorString(error_message, sizeof(error_message));
      std::cout << "reason: " << error_message << std::endl;
    }
  }
  // To be filled in
  return 0;
}