How to use the cuDNN graph API to do a convolution

Hi, I was searching for how to use the graph API in the deep-learning framework I am creating in CUDA, but I couldn't find how to use this new graph API. I already implemented code using the cuDNN legacy CNN library and everything works well and was easy to do, but I couldn't figure out how to use the new graph API. Can you provide an example of setting the parameters to do an optimized convolution with this API? As far as I know it is done using cudnnBackendExecute(), but I don't know how to set the descriptors correctly; I have already read the documentation and couldn't figure it out.

Here is an example of setting parameters for an optimized convolution using the cuDNN (legacy descriptor) API, followed by where cudnnBackendExecute() of the graph/backend API comes in:

  1. Define the Convolution Descriptor:

    cudnnConvolutionDescriptor_t convDesc;
    cudnnCreateConvolutionDescriptor(&convDesc);
    
  2. Set the Convolution Descriptor Properties:

    int pad_h = 1, pad_w = 1; // Padding
    int stride_h = 1, stride_w = 1; // Stride
    int dilation_h = 1, dilation_w = 1; // Dilation
    cudnnDataType_t dataType = CUDNN_DATA_FLOAT; // Data type
    
    cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, CUDNN_CONVOLUTION, dataType);
    
  3. Define Input and Output Tensor Descriptors:

    cudnnTensorDescriptor_t inputDesc, outputDesc;
    cudnnCreateTensorDescriptor(&inputDesc);
    cudnnCreateTensorDescriptor(&outputDesc);
    
  4. Set Properties for Input and Output Tensors:

    int batch_size = 32, channels = 3, height = 224, width = 224; // Example dimensions
    int num_filters = 64, output_height = 224, output_width = 224; // Output dimensions
    
    cudnnSetTensor4dDescriptor(inputDesc, CUDNN_TENSOR_NHWC, dataType, batch_size, channels, height, width);
    cudnnSetTensor4dDescriptor(outputDesc, CUDNN_TENSOR_NHWC, dataType, batch_size, num_filters, output_height, output_width);
    
  5. Define Filter Descriptor:

    cudnnFilterDescriptor_t filterDesc;
    cudnnCreateFilterDescriptor(&filterDesc);
    
  6. Set Properties for the Filter Descriptor:

    int filter_height = 3, filter_width = 3; // Example filter size
    
    cudnnSetFilter4dDescriptor(filterDesc, dataType, CUDNN_TENSOR_NCHW, num_filters, channels, filter_height, filter_width); // note: the format enum is CUDNN_TENSOR_NCHW/NHWC — "KCHW" is not a valid cudnnTensorFormat_t
    
  7. Perform the Convolution Operation:
    This is where you would typically set up your graph and execute the backend function, which requires additional context specific to your implementation:

    // Prepare for graph execution and related operations
    cudnnBackendExecute(cudnn_handle, executionPlan, variantPack); // exactly three arguments: handle, finalized execution plan, finalized variant pack
    

This outline provides the basic steps to set up descriptors for an optimized convolution, culminating in cudnnBackendExecute() from the cuDNN graph (backend) API. More specific and detailed implementations may be needed depending on your framework's specifics and the actual use case, which you can further refine with the official NVIDIA documentation and examples.

I want to know how to actually set the parameters of cudnnBackendExecute() so I can do the convolution. In the cuDNN graph API the way we set everything up is different; let me show what I have done (it is the best I could do, but it still isn't working):

// CODE
include <cuda_runtime.h>
include <cudnn.h>
include <cudnn_backend.h>
include

define CHECK_CUDA(call) { cudaError_t status = call; if (status != cudaSuccess) { printf(“CUDA error in %s:%d: %s\n”, FILE, LINE, cudaGetErrorString(status)); exit(1); } }
define CHECK_CUDNN(call) { cudnnStatus_t status = call; if (status != CUDNN_STATUS_SUCCESS) { printf(“cuDNN error in %s:%d: error is %s\n”, FILE, LINE, cudnnGetErrorString(status)); exit(1); } }
define CHECK_CUBLAS(call) { cublasStatus_t status = call; if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, “cuBLAS error in %s:%d: %d\n”, FILE, LINE, status); exit(1);} }

int main()
{
// HANDLE
cudnnHandle_t handle;
CHECK_CUDNN(cudnnCreate(&handle))

// PARAMETERS
int in_channels = 3, kernels = 1;
int k_h = 3, k_w = 3, stride_h = 1, stride_w = 1, pad_h = 0, pad_w = 0;
int input_width = 4, input_height = 4;
int out_h = 2, out_w = 2;
int batch_size = 1;

int64_t workspaceSize;
void *workspace;
// Cudnn fused ops
cudnnBackendDescriptor_t varpack;
cudnnBackendDescriptor_t plan;

cudnnBackendDescriptor_t engcfg;
cudnnBackendDescriptor_t engine;
cudnnBackendDescriptor_t op_graph;

cudnnBackendDescriptor_t convDesc;
cudnnBackendDescriptor_t fprop;

cudnnBackendDescriptor_t inputDesc;
cudnnBackendDescriptor_t kernelDesc;
cudnnBackendDescriptor_t outputDesc;

int64_t nbDims = 2;
cudnnDataType_t compType = CUDNN_DATA_FLOAT;
cudnnConvolutionMode_t mode = CUDNN_CONVOLUTION;

int64_t inputDim[4] = {batch_size, in_channels, input_height, input_width};;
int64_t kernelDim[4] = {kernels, in_channels, k_h, k_w};
int64_t outputDim[4] = {batch_size, kernels, out_h, out_w};

int64_t inputStr[4] = {in_channels * input_height * input_width, input_height * input_width, input_width, 1};
int64_t kernelStr[4] = {in_channels * k_h * k_w, k_h * k_w, k_w, 1};
int64_t outputStr[4] = {kernels * out_h * out_w, out_h * out_w, out_w, 1};

int64_t inUi = 1;
int64_t kUi = 2;
int64_t outUi = 3;

int64_t pad[2] = {pad_h, pad_w};
int64_t dilation[2] = {1, 1};
int64_t filterStr[2] = {stride_h, stride_w};

float alpha = 1.0;
float beta = 0.5;

int64_t alignment = 4;
int64_t gidx = 0;

int64_t uids[3] = {static_cast<int64_t>('x'), static_cast<int64_t>('w'), static_cast<int64_t>('y')};
float *input, *output, *kernel;

CHECK_CUDA(cudaMalloc(&input, batch_size * in_channels * input_height * input_width * sizeof(float)));
CHECK_CUDA(cudaMalloc(&output, batch_size * kernels * out_h * out_w * sizeof(float)));
CHECK_CUDA(cudaMalloc(&kernel, kernels * k_h * k_w * sizeof(float)));

void *dev_ptrs[3] = {input, kernel, output};
cudnnDataType_t dtype = CUDNN_DATA_FLOAT;

// PREPARE VALUES

float input_values[] = {0.0f, 0.0f, 0.0f, 0.0f,
                        0.0f, 1.0f, 1.0f, 0.0f,
                        0.0f, 1.0f, 1.0f, 0.0f,
                        0.0f, 0.0f, 0.0f, 0.0f,

                        0.0f, 0.0f, 0.0f, 0.0f,
                        0.0f, 1.0f, 1.0f, 0.0f,
                        0.0f, 1.0f, 1.0f, 0.0f,
                        0.0f, 0.0f, 0.0f, 0.0f,

                        0.0f, 0.0f, 0.0f, 0.0f,
                        0.0f, 1.0f, 1.0f, 0.0f,
                        0.0f, 1.0f, 1.0f, 0.0f,
                        0.0f, 0.0f, 0.0f, 0.0f};

float kernel_values[] = {1.0f, 0.0f, 1.0f,
                         1.0f, 0.0f, 1.0f,
                         1.0f, 0.0f, 1.0f,

                         1.0f, 0.0f, 1.0f,
                         1.0f, 0.0f, 1.0f,
                         1.0f, 0.0f, 1.0f,

                         1.0f, 0.0f, 1.0f,
                         1.0f, 0.0f, 1.0f,
                         1.0f, 0.0f, 1.0f};

// COPY VALUES

CHECK_CUDA(cudaMemcpy(input, input_values, batch_size * in_channels * input_height * input_width * sizeof(float), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(kernel, kernel_values, kernels * k_h * k_w * sizeof(float), cudaMemcpyHostToDevice));



// Create Descriptors
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &inputDesc));
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &kernelDesc));
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &outputDesc));

// Set Attributes

// input
CHECK_CUDNN(cudnnBackendSetAttribute(inputDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype));
CHECK_CUDNN(cudnnBackendSetAttribute(inputDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, inputDim));
CHECK_CUDNN(cudnnBackendSetAttribute(inputDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, inputStr));
CHECK_CUDNN(cudnnBackendSetAttribute(inputDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &inUi));
CHECK_CUDNN(cudnnBackendSetAttribute(inputDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
CHECK_CUDNN(cudnnBackendFinalize(inputDesc));

// kernel
CHECK_CUDNN(cudnnBackendSetAttribute(kernelDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype));
CHECK_CUDNN(cudnnBackendSetAttribute(kernelDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, kernelDim));
CHECK_CUDNN(cudnnBackendSetAttribute(kernelDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, kernelStr));
CHECK_CUDNN(cudnnBackendSetAttribute(kernelDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &kUi));
CHECK_CUDNN(cudnnBackendSetAttribute(kernelDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
CHECK_CUDNN(cudnnBackendFinalize(kernelDesc));

// output
CHECK_CUDNN(cudnnBackendSetAttribute(outputDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype));
CHECK_CUDNN(cudnnBackendSetAttribute(outputDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, outputDim));
CHECK_CUDNN(cudnnBackendSetAttribute(outputDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, outputStr));
CHECK_CUDNN(cudnnBackendSetAttribute(outputDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &outUi));
CHECK_CUDNN(cudnnBackendSetAttribute(outputDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
CHECK_CUDNN(cudnnBackendFinalize(outputDesc));

// convolution
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &convDesc));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, CUDNN_TYPE_INT64, 1, &nbDims));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_COMP_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &compType));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_DILATIONS, CUDNN_TYPE_INT64, nbDims, dilation));
CHECK_CUDNN(cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_TYPE_INT64, nbDims, filterStr));
CHECK_CUDNN(cudnnBackendFinalize(convDesc));

// forward operation
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &fprop));
CHECK_CUDNN(cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &inputDesc));
CHECK_CUDNN(cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &kernelDesc));
CHECK_CUDNN(cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &outputDesc));
CHECK_CUDNN(cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &convDesc));
CHECK_CUDNN(cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA, CUDNN_TYPE_FLOAT, 1, &alpha));
CHECK_CUDNN(cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA, CUDNN_TYPE_FLOAT, 1, &beta));
CHECK_CUDNN(cudnnBackendFinalize(fprop));

// graph
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
CHECK_CUDNN(cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &fprop));
CHECK_CUDNN(cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle));
CHECK_CUDNN(cudnnBackendFinalize(op_graph));

// engine
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine));
CHECK_CUDNN(cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph));
CHECK_CUDNN(cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &gidx));
CHECK_CUDNN(cudnnBackendFinalize(engine));

// engine configurayion
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg));
CHECK_CUDNN(cudnnBackendSetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine));
CHECK_CUDNN(cudnnBackendFinalize(engcfg));

// execution plan
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan));
CHECK_CUDNN(cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle));
CHECK_CUDNN(cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg));
CHECK_CUDNN(cudnnBackendFinalize(plan));

// workspace size
CHECK_CUDNN(cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize));
CHECK_CUDA(cudaMalloc(&workspace, workspaceSize));

// var pack
CHECK_CUDNN(cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack));
CHECK_CUDNN(cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, CUDNN_TYPE_VOID_PTR, 3, dev_ptrs));
CHECK_CUDNN(cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, CUDNN_TYPE_INT64, 3, uids));
CHECK_CUDNN(cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE, CUDNN_TYPE_VOID_PTR, 1, &workspace));
CHECK_CUDNN(cudnnBackendFinalize(varpack));

// EXECUTION
cudnnBackendExecute(handle, plan, varpack);


float* h_output = (float*)malloc(batch_size * kernels * out_h * out_w * sizeof(float));
cudaMemcpy(h_output, output, batch_size * kernels * out_h * out_w * sizeof(float), cudaMemcpyDeviceToHost);

for(int i = 0; i < batch_size * out_h * out_w * kernels; i++)
{
    std::cout << h_output[i] << " ";
}
std::cout << "\n";

}