Cudnn convolution is significantly slow

I wrote a simple program that loads two .npy files, convolves them and checks whether the result is the same as a third .npy file provided by me. The setup seemed straightforward, but the execution of the program takes around 5 seconds to complete, which is significantly slower than other frameworks (e.g. Caffe takes 1 second for the same operation).
I'm running the code on a Jetson TX2 and my fear is that the setup is wrong. This is the code:

#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <vector>
#include <cuda.h>
#include <cudnn.h>
#include <stdio.h>
#include <ctime>

#include "utils.h"


// Checks a CUDA runtime call and aborts with a readable message on failure.
// Wrapped in do/while(0) so the macro behaves like a single statement and is
// safe inside unbraced if/else; prints cudaGetErrorString(err) instead of the
// bare numeric code so failures are self-describing.
#define CUDA_CALL(f) do { \
  cudaError_t err = (f); \
  if (err != cudaSuccess) { \
    std::cout \
        << "    Error occurred: " << cudaGetErrorString(err) \
        << " (" << err << ") at line " << __LINE__ << std::endl; \
    std::exit(1); \
  } \
} while (0)

// Checks a cuDNN call and aborts with a readable message on failure.
// Same do/while(0) hygiene as CUDA_CALL; cudnnGetErrorString turns the status
// enum into a human-readable diagnostic.
#define CUDNN_CALL(f) do { \
  cudnnStatus_t err = (f); \
  if (err != CUDNN_STATUS_SUCCESS) { \
    std::cout \
        << "    Error occurred: " << cudnnGetErrorString(err) \
        << " (" << err << ") at line " << __LINE__ << std::endl; \
    std::exit(1); \
  } \
} while (0)

// Returns true when a and b differ by strictly less than epsilon.
template<typename T> bool almost_equal(const T a, const T b, const T epsilon)
{
    const T diff = std::abs(a - b);
    return diff < epsilon;
}
    
void load_tensor(
    const std::string &path,
    std::shared_ptr<float> &data,
    std::vector<int> &shape)
{
    cnpy::NpyArray input_tensor = cnpy::npy_load(path);


    float *data_copy = new float[input_tensor.num_vals];
    std::memcpy(data_copy, input_tensor.data<float>(), input_tensor.num_bytes());
    data = std::shared_ptr<float>(data_copy, std::default_delete<float[]>());

    shape = std::vector<int>();
    for(auto dimension : input_tensor.shape)
    {
        shape.push_back(static_cast<int>(dimension));
    }
}

// Element-wise comparison of two flat tensors holding prod(shape) floats.
// Returns true when every pair of elements differs by less than a fixed
// absolute tolerance (1e-3).
bool compare(
    const float* actual, 
    const float* expected, 
    const std::vector<int> &shape)
{
    const float epsilon = 1e-3f;

    int total = 1;
    for (int dim : shape)
        total *= dim;

    for (int idx = 0; idx < total; ++idx)
    {
        if (!(std::abs(actual[idx] - expected[idx]) < epsilon))
            return false;
    }
    return true;
}

// Pretty-prints an NCHW-ordered tensor to stdout: one h-by-w grid per
// (n, c) pair, each value right-aligned in a 4-character field.
void print(const float *data, int n, int c, int h, int w) {
  int idx = 0;
  for (int batch = 0; batch < n; ++batch) {
    for (int chan = 0; chan < c; ++chan) {
      std::cout << "n=" << batch << ", c=" << chan << ":" << std::endl;
      for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
          std::cout << std::setw(4) << std::right << data[idx];
          ++idx;
        }
        std::cout << std::endl;
      }
    }
  }
  std::cout << std::endl;
}

// Entry point: loads an input tensor, a filter tensor, and an expected output
// tensor from .npy files, runs a single-precision NCHW forward convolution
// through cuDNN, and prints "MASKED" when the device result matches the
// expected output (within tolerance) or "SDC" otherwise.
int main(int argc, char **argv) {

  if(argc != 4) {
        std::cerr << "Insufficient number of arguments provided!" << std::endl;
        std::cerr << "Usage conv <folder> <BFM> <IGID>" << std::endl;
        return 1;  // argument error: signal failure instead of returning 0
  }

  // NOTE(review): these arguments are stored but never used below; the tensor
  // file names are hard-coded. Presumably <folder> was meant to prefix the
  // paths — confirm against how the program is invoked.
  const std::string experiment_folder = std::string(argv[1]);
  const std::string BFM = std::string(argv[2]);
  const std::string IGID = std::string(argv[3]);

  // Host-side tensors. The shared_ptrs own new[]-allocated buffers installed
  // by load_tensor with array deleters — they must never be passed to free().
  std::shared_ptr<float> input_data;
  std::vector<int> input_shape;

  std::shared_ptr<float> filters_data;
  std::vector<int> filters_shape;

  std::shared_ptr<float> output_data;
  std::vector<int> output_shape;

  load_tensor("input_1.npy", input_data, input_shape);
  load_tensor("input_2.npy", filters_data, filters_shape);
  load_tensor("output_1.npy", output_data, output_shape);

  // cudnnCreate initializes the GPU context and cuDNN kernels; on Jetson-class
  // hardware this one-time cost can dominate a short program's wall time.
  // Time the cudnnConvolutionForward call alone before comparing against
  // other frameworks (whose timings usually exclude initialization).
  cudnnHandle_t cudnn;
  CUDNN_CALL(cudnnCreate(&cudnn));

  // Input tensor descriptor (assumes the .npy file is 4-D NCHW).
  const int in_n = input_shape[0];
  const int in_c = input_shape[1];
  const int in_h = input_shape[2];
  const int in_w = input_shape[3];

  cudnnTensorDescriptor_t in_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
        in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        in_n, in_c, in_h, in_w));

  float *in_data;
  float *h_in_data = input_data.get();

  CUDA_CALL(cudaMalloc(
        &in_data, in_n * in_c * in_h * in_w * sizeof(float)));
  CUDA_CALL(cudaMemcpy(
        in_data, h_in_data, in_n * in_c * in_h * in_w * sizeof(float), cudaMemcpyHostToDevice));

  // Filter descriptor (KCHW: output channels, input channels, height, width).
  const int filt_k = filters_shape[0];
  const int filt_c = filters_shape[1];
  const int filt_h = filters_shape[2];
  const int filt_w = filters_shape[3];

  cudnnFilterDescriptor_t filt_desc;
  CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
  CUDNN_CALL(cudnnSetFilter4dDescriptor(
        filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
        filt_k, filt_c, filt_h, filt_w));

  float *filt_data;
  float *h_filt_data = filters_data.get();

  CUDA_CALL(cudaMalloc(
      &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));
  CUDA_CALL(cudaMemcpy(
        filt_data, h_filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float), 
        cudaMemcpyHostToDevice));

  // Convolution: no padding, unit stride, no dilation.
  // NOTE(review): CUDNN_CONVOLUTION flips the kernel; most DL frameworks use
  // CUDNN_CROSS_CORRELATION. Kept as-is since the reference output presumably
  // matches — confirm if comparing against framework results.
  const int pad_h = 0;
  const int pad_w = 0;
  const int str_h = 1;
  const int str_w = 1;
  const int dil_h = 1;
  const int dil_w = 1;

  cudnnConvolutionDescriptor_t conv_desc;
  CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
  CUDNN_CALL(cudnnSetConvolution2dDescriptor(
        conv_desc,
        pad_h, pad_w, str_h, str_w, dil_h, dil_w,
        CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

  // Let cuDNN compute the output dimensions from the descriptors.
  int out_n;
  int out_c;
  int out_h;
  int out_w;

  CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, in_desc, filt_desc,
        &out_n, &out_c, &out_h, &out_w));

  cudnnTensorDescriptor_t out_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
        out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
        out_n, out_c, out_h, out_w));

  float *out_data;
  float *h_out_data;

  // Host result buffer: allocated with malloc, released with free below.
  h_out_data = (float *) malloc(out_n * out_c * out_h * out_w * sizeof(float));
  if(!h_out_data){
    std::cout << "Error on h_out_data malloc" << std::endl;
    return 1;
  }

  CUDA_CALL(cudaMalloc(
        &out_data, out_n * out_c * out_h * out_w * sizeof(float)));

  // PERF NOTE: the algorithm is hard-coded to GEMM. For many shapes a faster
  // choice exists; query cudnnGetConvolutionForwardAlgorithm_v7 (heuristic)
  // or cudnnFindConvolutionForwardAlgorithm (exhaustive benchmark) instead of
  // pinning one algorithm. Kept as-is to preserve current behavior.
  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;

  size_t ws_size;
  CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));

  float *ws_data;
  CUDA_CALL(cudaMalloc(&ws_data, ws_size));

  // out = alpha * conv(in, filt) + beta * out
  float alpha = 1.f;
  float beta = 0.f;

  CUDNN_CALL(cudnnConvolutionForward(
      cudnn,
      &alpha, in_desc, in_data, filt_desc, filt_data,
      conv_desc, algo, ws_data, ws_size,
      &beta, out_desc, out_data));

  // Blocking copy: implicitly synchronizes with the convolution above.
  CUDA_CALL(cudaMemcpy(
        h_out_data, out_data, out_n * out_c * out_h * out_w * sizeof(float), 
        cudaMemcpyDeviceToHost));

  if (compare(h_out_data, output_data.get(), output_shape)) {
    std::cout << "MASKED" << std::endl;
  } else {
    std::cout << "SDC" << std::endl;
  }

  // Finalizing: release device memory and descriptors in reverse order.
  CUDA_CALL(cudaFree(ws_data));
  CUDA_CALL(cudaFree(out_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
  CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
  CUDA_CALL(cudaFree(filt_data));
  CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
  CUDA_CALL(cudaFree(in_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
  CUDNN_CALL(cudnnDestroy(cudnn));

  // BUGFIX: the original called free(h_in_data) and free(h_filt_data), but
  // those pointers alias new[]-allocated memory owned by the shared_ptrs —
  // undefined behavior plus a double release when the shared_ptrs destruct.
  // Only the malloc'd host output buffer is freed manually.
  free(h_out_data);

  return 0;
}

Could anyone tell me if there is a problem in the code or if the convolution is simply slower than other frameworks’ convolutions?
Thanks

Hi,

Could you please try it with the latest cuDNN, v8.4.0?

Thank you.

Hi, thanks for the suggestion. Do I need to change any part of the code to be compliant with cuDNN v8.4.0, or is it enough to update my version of cuDNN? Currently I am running v8.2.1.

I believe updating the version is fine. If you face any running issues, please refer to changes in the new version here and update the code.

Thank you.