I am working on implementing the Fused Attention fprop. As of now I am only combining the two matrix multiplications, meaning g3 and g4 are empty. I believe I have also matched all the requirements for this graph, but none of the engine configurations provided by the engine heuristic work when passed to an execution plan: finalizing the execution plan with any of the engine configurations returns the status CUDNN_STATUS_NOT_SUPPORTED.
I have pasted the implementation I am using below, as well as the shapes and strides of all tensors involved. Please let me know what I am doing incorrectly, or what other information would help diagnose this.
qshape: 4: 1 1 10 64
qstride: 4: 640 640 64 1
kshape: 4: 1 1 64 10
kstride: 4: 640 640 1 64
sshape: 4: 1 1 10 10
sstride: 4: 100 100 10 1
vshape: 4: 1 1 10 64
vstride: 4: 640 640 64 1
oshape: 4: 1 1 10 64
ostride: 4: 640 640 64 1
#include <cudnn.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
// Checks a cuDNN status expression; on failure, prints the file/line and the
// cuDNN error string to stderr and exits the process.
//
// The expression is captured into a local exactly once: the original macro
// expanded `status` twice, so on the failure path the cuDNN call being
// checked was evaluated (and re-issued) a second time inside
// cudnnGetErrorString. The do/while(0) wrapper also makes the macro behave
// as a single statement, so it is safe in un-braced if/else bodies.
#define CUDNN_CHECK(status)                                            \
  do {                                                                 \
    const cudnnStatus_t cudnn_check_status_ = (status);                \
    if (cudnn_check_status_ != CUDNN_STATUS_SUCCESS) {                 \
      fprintf(stderr, "cuDNN error: %s:%d:%s\n", __FILE__, __LINE__,   \
              cudnnGetErrorString(cudnn_check_status_));               \
      std::exit(EXIT_FAILURE);                                         \
    }                                                                  \
  } while (0)
// Prints a labelled vector as "<name>: <size>: v0 v1 ... " followed by a
// newline, matching the shape/stride dump format pasted above.
//
// `name` is now taken by const reference: the original signature copied the
// std::string on every call for no benefit.
void print_vector(const std::vector<int64_t> &v, const std::string &name) {
  std::cout << name << ": " << v.size() << ": ";
  for (const int64_t element : v) {
    std::cout << element << " ";
  }
  std::cout << std::endl;
}
// Returns the fully-packed, row-major strides for `shape`: the innermost
// dimension has stride 1 and each outer stride is the product of all inner
// dimension sizes.
//
// Generalized to any rank (the original hard-coded shape[1..3] and would
// read out of bounds for shorter shapes); results for rank-4 inputs are
// unchanged. The name is kept for source compatibility with callers.
std::vector<int64_t> standard_4d_strides(const std::vector<int64_t> &shape) {
  std::vector<int64_t> strides(shape.size());
  int64_t running = 1;
  for (std::size_t i = shape.size(); i-- > 0;) {
    strides[i] = running;
    running *= shape[i];
  }
  return strides;
}
// Creates and finalizes a cuDNN backend tensor descriptor.
//
// shape / strides: dimension sizes and element strides (same rank; callers
//     here always pass rank 4).
// id: unique tensor id within the operation graph (callers pass char
//     literals 'q', 'k', 'v', 's', 'o').
// data_type: per-element type (callers pass CUDNN_DATA_HALF).
// byte_alignment: advertised alignment of the device pointer, in bytes.
//     NOTE(review): callers pass 2 (sizeof half). cuDNN runtime-fusion
//     engines generally expect 16-byte-aligned pointers, and advertising a
//     smaller alignment can cause every engine config to finalize with
//     CUDNN_STATUS_NOT_SUPPORTED -- worth confirming against the Graph API
//     docs and trying 16 here.
// is_virtual: true for graph-internal intermediates that never materialize
//     in global memory (only the 's' scores tensor here).
// reordering_fp16x16: when true, sets CUDNN_TENSOR_REORDERING_F16x16 on the
//     tensor. NOTE(review): only the virtual 's' tensor opts in; verify this
//     reordering is actually required for the engine being targeted --
//     requesting it unnecessarily can also rule engines out.
cudnnBackendDescriptor_t
tensor_descriptor(const std::vector<int64_t> &shape,
                  const std::vector<int64_t> &strides, int64_t id,
                  cudnnDataType_t data_type, int64_t byte_alignment,
                  bool is_virtual, bool reordering_fp16x16 = false) {
  cudnnBackendDescriptor_t desc;
  CUDNN_CHECK(
      cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc));
  // All attributes must be set before cudnnBackendFinalize.
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                                       CUDNN_TYPE_INT64, 1, &id));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DATA_TYPE,
                                       CUDNN_TYPE_DATA_TYPE, 1, &data_type));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                                       CUDNN_TYPE_INT64, 1, &byte_alignment));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                                       CUDNN_TYPE_INT64,
                                       (int64_t)shape.size(), shape.data()));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      desc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64,
      (int64_t)strides.size(), strides.data()));
  CUDNN_CHECK(cudnnBackendSetAttribute(desc, CUDNN_ATTR_TENSOR_IS_VIRTUAL,
                                       CUDNN_TYPE_BOOLEAN, 1, &is_virtual));
  if (reordering_fp16x16) {
    cudnnBackendTensorReordering_t reorder = CUDNN_TENSOR_REORDERING_F16x16;
    CUDNN_CHECK(cudnnBackendSetAttribute(
        desc, CUDNN_ATTR_TENSOR_REORDERING_MODE,
        CUDNN_TYPE_TENSOR_REORDERING_MODE, 1, &reorder));
  }
  CUDNN_CHECK(cudnnBackendFinalize(desc));
  return desc;
}
// Builds a two-matmul (Q*K^T -> S, S*V -> O) operation graph via the cuDNN
// backend API, queries the engine heuristic, and tries to finalize an
// execution plan for each returned engine config.
//
// Changes from the original: every descriptor (tensors, matmul descs, ops,
// op graph, heuristic, engine configs, per-iteration execution plans) and
// the cuDNN handle are now destroyed before returning -- the original leaked
// all of them and left cleanup as "To be filled in".
int main() {
  // Q/K/V/O are [batch, heads, seq, head_dim] = 1x1x10x64, fully packed.
  std::vector<int64_t> shape_query = {1, 1, 10, 64};
  std::vector<int64_t> strides_query = standard_4d_strides(shape_query);
  std::vector<int64_t> shape_key = {1, 1, 10, 64};
  std::vector<int64_t> strides_key = standard_4d_strides(shape_key);
  // View K transposed (1x1x64x10 with swapped strides) so Q*K^T is a plain
  // matmul over the last two dims.
  std::swap(shape_key[2], shape_key[3]);
  std::swap(strides_key[2], strides_key[3]);
  std::vector<int64_t> shape_value = {1, 1, 10, 64};
  std::vector<int64_t> strides_value = standard_4d_strides(shape_value);
  // S = Q * K^T -> [1, 1, 10, 10]; the virtual intermediate of the fusion.
  std::vector<int64_t> shape_scores = {shape_query[0], shape_query[1],
                                       shape_query[2], shape_key[3]};
  std::vector<int64_t> strides_scores = standard_4d_strides(shape_scores);
  // O = S * V -> [1, 1, 10, 64].
  std::vector<int64_t> shape_output = {shape_query[0], shape_query[1],
                                       shape_query[2], shape_value[3]};
  std::vector<int64_t> strides_output = standard_4d_strides(shape_output);

  cudnnHandle_t handle;
  CUDNN_CHECK(cudnnCreate(&handle));

  cudnnDataType_t comp_type = CUDNN_DATA_FLOAT;
  cudnnDataType_t data_type = CUDNN_DATA_HALF;
  // NOTE(review): 2 is just sizeof(half). Runtime-fusion engines generally
  // expect 16-byte-aligned device pointers; advertising only 2 here may be
  // why every engine config finalizes with CUDNN_STATUS_NOT_SUPPORTED --
  // try 16.
  int64_t data_type_byte_alignment = 2;

  cudnnBackendDescriptor_t query_desc =
      tensor_descriptor(shape_query, strides_query, 'q', data_type,
                        data_type_byte_alignment, false);
  cudnnBackendDescriptor_t key_desc =
      tensor_descriptor(shape_key, strides_key, 'k', data_type,
                        data_type_byte_alignment, false);
  cudnnBackendDescriptor_t value_desc =
      tensor_descriptor(shape_value, strides_value, 'v', data_type,
                        data_type_byte_alignment, false);
  // 's' is virtual (never materialized) and opts into F16x16 reordering.
  cudnnBackendDescriptor_t scores_desc =
      tensor_descriptor(shape_scores, strides_scores, 's', data_type,
                        data_type_byte_alignment, true, true);
  cudnnBackendDescriptor_t output_desc =
      tensor_descriptor(shape_output, strides_output, 'o', data_type,
                        data_type_byte_alignment, false);

  print_vector(shape_query, "qshape");
  print_vector(strides_query, "qstride");
  print_vector(shape_key, "kshape");
  print_vector(strides_key, "kstride");
  print_vector(shape_scores, "sshape");
  print_vector(strides_scores, "sstride");
  print_vector(shape_value, "vshape");
  print_vector(strides_value, "vstride");
  print_vector(shape_output, "oshape");
  print_vector(strides_output, "ostride");

  // First matmul (Q * K^T = S), accumulating in fp32.
  cudnnBackendDescriptor_t matmul_desc;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR,
                                           &matmul_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(matmul_desc,
                                       CUDNN_ATTR_MATMUL_COMP_TYPE,
                                       CUDNN_TYPE_DATA_TYPE, 1, &comp_type));
  CUDNN_CHECK(cudnnBackendFinalize(matmul_desc));

  cudnnBackendDescriptor_t op_matmul;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &op_matmul));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul, CUDNN_ATTR_OPERATION_MATMUL_DESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &matmul_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul, CUDNN_ATTR_OPERATION_MATMUL_ADESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &query_desc));
  CUDNN_CHECK(
      cudnnBackendSetAttribute(op_matmul, CUDNN_ATTR_OPERATION_MATMUL_BDESC,
                               CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &key_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul, CUDNN_ATTR_OPERATION_MATMUL_CDESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &scores_desc));
  CUDNN_CHECK(cudnnBackendFinalize(op_matmul));

  // Second matmul (S * V = O), also accumulating in fp32.
  cudnnBackendDescriptor_t out_matmul_desc;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR,
                                           &out_matmul_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(out_matmul_desc,
                                       CUDNN_ATTR_MATMUL_COMP_TYPE,
                                       CUDNN_TYPE_DATA_TYPE, 1, &comp_type));
  CUDNN_CHECK(cudnnBackendFinalize(out_matmul_desc));

  cudnnBackendDescriptor_t op_matmul_output;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &op_matmul_output));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_DESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &out_matmul_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_ADESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &scores_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_BDESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &value_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_matmul_output, CUDNN_ATTR_OPERATION_MATMUL_CDESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &output_desc));
  CUDNN_CHECK(cudnnBackendFinalize(op_matmul_output));

  // Fuse both matmuls into one operation graph bound to the handle.
  cudnnBackendDescriptor_t op_graph;
  cudnnBackendDescriptor_t ops[] = {op_matmul, op_matmul_output};
  CUDNN_CHECK(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      sizeof(ops) / sizeof(ops[0]), ops));
  CUDNN_CHECK(cudnnBackendSetAttribute(op_graph,
                                       CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
                                       CUDNN_TYPE_HANDLE, 1, &handle));
  CUDNN_CHECK(cudnnBackendFinalize(op_graph));

  cudnnBackendDescriptor_t heur_desc;
  CUDNN_CHECK(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &heur_desc));
  CUDNN_CHECK(cudnnBackendSetAttribute(
      heur_desc, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph));
  // NOTE(review): FALLBACK only lists generic fallback engines, which often
  // do not support fused graphs; the samples query CUDNN_HEUR_MODE_A (or
  // _INSTANT) first -- try that mode before concluding the graph itself is
  // unsupported.
  cudnnBackendHeurMode_t heur_mode = CUDNN_HEUR_MODE_FALLBACK;
  CUDNN_CHECK(cudnnBackendSetAttribute(heur_desc, CUDNN_ATTR_ENGINEHEUR_MODE,
                                       CUDNN_TYPE_HEUR_MODE, 1, &heur_mode));
  CUDNN_CHECK(cudnnBackendFinalize(heur_desc));

  // Query the result count, then fetch that many engine configs.
  int64_t count = 0;
  CUDNN_CHECK(cudnnBackendGetAttribute(
      heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      0, &count, NULL));
  std::vector<cudnnBackendDescriptor_t> eng_cfgs(count);
  for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
    CUDNN_CHECK(cudnnBackendCreateDescriptor(
        CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &cfg));
  }
  CUDNN_CHECK(cudnnBackendGetAttribute(
      heur_desc, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      count, nullptr, eng_cfgs.data()));

  // Try to finalize an execution plan for each config; failure to finalize
  // is reported (not fatal), and the plan descriptor is always released.
  for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
    cudnnBackendDescriptor_t exec_plan;
    CUDNN_CHECK(cudnnBackendCreateDescriptor(
        CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &exec_plan));
    CUDNN_CHECK(cudnnBackendSetAttribute(
        exec_plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
        CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cfg));
    CUDNN_CHECK(cudnnBackendSetAttribute(exec_plan,
                                         CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
                                         CUDNN_TYPE_HANDLE, 1, &handle));
    cudnnStatus_t status = cudnnBackendFinalize(exec_plan);
    std::cout << cudnnGetErrorString(status) << "\n";
    if (status == CUDNN_STATUS_SUCCESS) {
      std::cout << "success\n";
    }
    CUDNN_CHECK(cudnnBackendDestroyDescriptor(exec_plan));
  }

  // Release everything in reverse creation order; the original leaked all
  // of these.
  for (cudnnBackendDescriptor_t &cfg : eng_cfgs) {
    CUDNN_CHECK(cudnnBackendDestroyDescriptor(cfg));
  }
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(heur_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(op_graph));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(op_matmul_output));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(out_matmul_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(op_matmul));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(matmul_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(output_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(scores_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(value_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(key_desc));
  CUDNN_CHECK(cudnnBackendDestroyDescriptor(query_desc));
  CUDNN_CHECK(cudnnDestroy(handle));
  return 0;
}