Cudnn convolution is significantly slow

I wrote a simple program that loads two .npy files, convolves them and checks whether the result is the same as a third .npy file provided by me. The setup seemed straightforward, but the execution of the program takes around 5 seconds to complete, which is significantly slower than other frameworks (e.g. Caffe takes 1 second for the same operation).
I'm running the code on a Jetson TX2 and my fear is that the setup is wrong. This is the code:

#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <vector>
#include <cuda.h>
#include <cudnn.h>
#include <stdio.h>
#include <ctime>

#include "utils.h"


// Checks a CUDA runtime call and aborts with a readable message on failure.
// Wrapped in do/while(0) so the macro behaves like a single statement and is
// safe inside unbraced if/else; prints cudaGetErrorString(err) instead of the
// bare numeric code so failures are self-describing.
#define CUDA_CALL(f) do { \
  cudaError_t err = (f); \
  if (err != cudaSuccess) { \
    std::cout \
        << "    Error occurred: " << cudaGetErrorString(err) \
        << " (" << err << ") at line " << __LINE__ << std::endl; \
    std::exit(1); \
  } \
} while (0)

// Checks a cuDNN call and aborts with a readable message on failure.
// Same do/while(0) hygiene as CUDA_CALL; cudnnGetErrorString turns the status
// enum into a human-readable diagnostic.
#define CUDNN_CALL(f) do { \
  cudnnStatus_t err = (f); \
  if (err != CUDNN_STATUS_SUCCESS) { \
    std::cout \
        << "    Error occurred: " << cudnnGetErrorString(err) \
        << " (" << err << ") at line " << __LINE__ << std::endl; \
    std::exit(1); \
  } \
} while (0)

// Returns true when a and b differ by strictly less than epsilon.
template<typename T> bool almost_equal(const T a, const T b, const T epsilon)
{
    const T diff = std::abs(a - b);
    return diff < epsilon;
}
    
void load_tensor(
    const std::string &path,
    std::shared_ptr<float> &data,
    std::vector<int> &shape)
{
    cnpy::NpyArray input_tensor = cnpy::npy_load(path);


    float *data_copy = new float[input_tensor.num_vals];
    std::memcpy(data_copy, input_tensor.data<float>(), input_tensor.num_bytes());
    data = std::shared_ptr<float>(data_copy, std::default_delete<float[]>());

    shape = std::vector<int>();
    for(auto dimension : input_tensor.shape)
    {
        shape.push_back(static_cast<int>(dimension));
    }
}

// Element-wise comparison of two flat tensors holding prod(shape) floats.
// Returns true when every pair of elements differs by less than a fixed
// absolute tolerance (1e-3).
bool compare(
    const float* actual, 
    const float* expected, 
    const std::vector<int> &shape)
{
    const float epsilon = 1e-3f;

    int total = 1;
    for (int dim : shape)
        total *= dim;

    for (int idx = 0; idx < total; ++idx)
    {
        if (!(std::abs(actual[idx] - expected[idx]) < epsilon))
            return false;
    }
    return true;
}

// Pretty-prints an NCHW-ordered tensor to stdout: one h-by-w grid per
// (n, c) pair, each value right-aligned in a 4-character field.
void print(const float *data, int n, int c, int h, int w) {
  int idx = 0;
  for (int batch = 0; batch < n; ++batch) {
    for (int chan = 0; chan < c; ++chan) {
      std::cout << "n=" << batch << ", c=" << chan << ":" << std::endl;
      for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
          std::cout << std::setw(4) << std::right << data[idx];
          ++idx;
        }
        std::cout << std::endl;
      }
    }
  }
  std::cout << std::endl;
}

// Entry point: loads an input tensor, a filter tensor, and an expected output
// tensor from .npy files, runs a single-precision NCHW forward convolution
// through cuDNN, and prints "MASKED" when the device result matches the
// expected output (within tolerance) or "SDC" otherwise.
int main(int argc, char **argv) {

  if(argc != 4) {
        std::cerr << "Insufficient number of arguments provided!" << std::endl;
        std::cerr << "Usage conv <folder> <BFM> <IGID>" << std::endl;
        return 1;  // argument error: signal failure instead of returning 0
  }

  // NOTE(review): these arguments are stored but never used below; the tensor
  // file names are hard-coded. Presumably <folder> was meant to prefix the
  // paths — confirm against how the program is invoked.
  const std::string experiment_folder = std::string(argv[1]);
  const std::string BFM = std::string(argv[2]);
  const std::string IGID = std::string(argv[3]);

  // Host-side tensors. The shared_ptrs own new[]-allocated buffers installed
  // by load_tensor with array deleters — they must never be passed to free().
  std::shared_ptr<float> input_data;
  std::vector<int> input_shape;

  std::shared_ptr<float> filters_data;
  std::vector<int> filters_shape;

  std::shared_ptr<float> output_data;
  std::vector<int> output_shape;

  load_tensor("input_1.npy", input_data, input_shape);
  load_tensor("input_2.npy", filters_data, filters_shape);
  load_tensor("output_1.npy", output_data, output_shape);

  // cudnnCreate initializes the GPU context and cuDNN kernels; on Jetson-class
  // hardware this one-time cost can dominate a short program's wall time.
  // Time the cudnnConvolutionForward call alone before comparing against
  // other frameworks (whose timings usually exclude initialization).
  cudnnHandle_t cudnn;
  CUDNN_CALL(cudnnCreate(&cudnn));

  // Input tensor descriptor (assumes the .npy file is 4-D NCHW).
  const int in_n = input_shape[0];
  const int in_c = input_shape[1];
  const int in_h = input_shape[2];
  const int in_w = input_shape[3];

  cudnnTensorDescriptor_t in_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
        in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        in_n, in_c, in_h, in_w));

  float *in_data;
  float *h_in_data = input_data.get();

  CUDA_CALL(cudaMalloc(
        &in_data, in_n * in_c * in_h * in_w * sizeof(float)));
  CUDA_CALL(cudaMemcpy(
        in_data, h_in_data, in_n * in_c * in_h * in_w * sizeof(float), cudaMemcpyHostToDevice));

  // Filter descriptor (KCHW: output channels, input channels, height, width).
  const int filt_k = filters_shape[0];
  const int filt_c = filters_shape[1];
  const int filt_h = filters_shape[2];
  const int filt_w = filters_shape[3];

  cudnnFilterDescriptor_t filt_desc;
  CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
  CUDNN_CALL(cudnnSetFilter4dDescriptor(
        filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
        filt_k, filt_c, filt_h, filt_w));

  float *filt_data;
  float *h_filt_data = filters_data.get();

  CUDA_CALL(cudaMalloc(
      &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));
  CUDA_CALL(cudaMemcpy(
        filt_data, h_filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float), 
        cudaMemcpyHostToDevice));

  // Convolution: no padding, unit stride, no dilation.
  // NOTE(review): CUDNN_CONVOLUTION flips the kernel; most DL frameworks use
  // CUDNN_CROSS_CORRELATION. Kept as-is since the reference output presumably
  // matches — confirm if comparing against framework results.
  const int pad_h = 0;
  const int pad_w = 0;
  const int str_h = 1;
  const int str_w = 1;
  const int dil_h = 1;
  const int dil_w = 1;

  cudnnConvolutionDescriptor_t conv_desc;
  CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
  CUDNN_CALL(cudnnSetConvolution2dDescriptor(
        conv_desc,
        pad_h, pad_w, str_h, str_w, dil_h, dil_w,
        CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

  // Let cuDNN compute the output dimensions from the descriptors.
  int out_n;
  int out_c;
  int out_h;
  int out_w;

  CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, in_desc, filt_desc,
        &out_n, &out_c, &out_h, &out_w));

  cudnnTensorDescriptor_t out_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
        out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        out_n, out_c, out_h, out_w));

  float *out_data;
  float *h_out_data;

  // Host result buffer: allocated with malloc, released with free below.
  h_out_data = (float *) malloc(out_n * out_c * out_h * out_w * sizeof(float));
  if(!h_out_data){
    std::cout << "Error on h_out_data malloc" << std::endl;
    return 1;
  }

  CUDA_CALL(cudaMalloc(
        &out_data, out_n * out_c * out_h * out_w * sizeof(float)));

  // PERF NOTE: the algorithm is hard-coded to GEMM. For many shapes a faster
  // choice exists; query cudnnGetConvolutionForwardAlgorithm_v7 (heuristic)
  // or cudnnFindConvolutionForwardAlgorithm (exhaustive benchmark) instead of
  // pinning one algorithm. Kept as-is to preserve current behavior.
  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;

  size_t ws_size;
  CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));

  float *ws_data;
  CUDA_CALL(cudaMalloc(&ws_data, ws_size));

  // out = alpha * conv(in, filt) + beta * out
  float alpha = 1.f;
  float beta = 0.f;

  CUDNN_CALL(cudnnConvolutionForward(
      cudnn,
      &alpha, in_desc, in_data, filt_desc, filt_data,
      conv_desc, algo, ws_data, ws_size,
      &beta, out_desc, out_data));

  // Blocking copy: implicitly synchronizes with the convolution above.
  CUDA_CALL(cudaMemcpy(
        h_out_data, out_data, out_n * out_c * out_h * out_w * sizeof(float), 
        cudaMemcpyDeviceToHost));

  if (compare(h_out_data, output_data.get(), output_shape)) {
    std::cout << "MASKED" << std::endl;
  } else {
    std::cout << "SDC" << std::endl;
  }

  // Finalizing: release device memory and descriptors in reverse order.
  CUDA_CALL(cudaFree(ws_data));
  CUDA_CALL(cudaFree(out_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
  CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
  CUDA_CALL(cudaFree(filt_data));
  CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
  CUDA_CALL(cudaFree(in_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
  CUDNN_CALL(cudnnDestroy(cudnn));

  // BUGFIX: the original called free(h_in_data) and free(h_filt_data), but
  // those pointers alias new[]-allocated memory owned by the shared_ptrs —
  // undefined behavior plus a double release when the shared_ptrs destruct.
  // Only the malloc'd host output buffer is freed manually.
  free(h_out_data);

  return 0;
}

Could anyone tell me if there is a problem in the code or if the convolution is simply slower than other frameworks’ convolutions?
Thanks

Hi,

Could you please try it with the latest cuDNN, v8.4.0?

Thank you.

Hi, thanks for the suggestion. Do I need to change any part of the code to be compliant with cuDNN v8.4.0, or is it enough to update my version of cuDNN? Currently I am running v8.2.1.

I believe updating the version is fine. If you face any running issues, please refer to changes in the new version here and update the code.

Thank you.