I wrote a simple program that loads two .npy files, convolves them, and checks whether the result matches a third .npy file that I provide. The setup seemed straightforward, but the program takes around 5 seconds to run, which is significantly slower than other frameworks (e.g. Caffe takes about 1 second for the same operation).
I’m running the code on a Jetson TX2 and my fear is that the setup is wrong. This is the code:
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <cstring>     // std::memcpy
#include <cmath>       // std::abs
#include <memory>      // std::shared_ptr
#include <numeric>     // std::accumulate
#include <functional>  // std::multiplies
#include <vector>
#include <cuda.h>
#include <cudnn.h>
#include <stdio.h>
#include <ctime>
#include "cnpy.h"      // cnpy::npy_load (if not already pulled in by utils.h)
#include "utils.h"
#define CUDA_CALL(f) { \
  cudaError_t err = (f); \
  if (err != cudaSuccess) { \
    std::cout \
        << " Error occurred: " << err << " " << __LINE__ << std::endl; \
    std::exit(1); \
  } \
}

#define CUDNN_CALL(f) { \
  cudnnStatus_t err = (f); \
  if (err != CUDNN_STATUS_SUCCESS) { \
    std::cout \
        << " Error occurred: " << err << " " << __LINE__ << std::endl; \
    std::exit(1); \
  } \
}

template<typename T> bool almost_equal(const T a, const T b, const T epsilon)
{
  return std::abs(a - b) < epsilon;
}
void load_tensor(
    const std::string &path,
    std::shared_ptr<float> &data,
    std::vector<int> &shape)
{
  cnpy::NpyArray input_tensor = cnpy::npy_load(path);
  float *data_copy = new float[input_tensor.num_vals];
  std::memcpy(data_copy, input_tensor.data<float>(), input_tensor.num_bytes());
  data = std::shared_ptr<float>(data_copy, std::default_delete<float[]>());

  shape = std::vector<int>();
  for (auto dimension : input_tensor.shape)
  {
    shape.push_back(static_cast<int>(dimension));
  }
}

bool compare(
    const float *actual,
    const float *expected,
    const std::vector<int> &shape)
{
  const float epsilon = static_cast<float>(1E-3);
  const int N = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
  for (int i = 0; i < N; i++)
  {
    const bool comparison_result = almost_equal(actual[i], expected[i], epsilon);
    if (!comparison_result) return false;
  }
  return true;
}
void print(const float *data, int n, int c, int h, int w) {
  int a = 0;
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < c; ++j) {
      std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
      for (int k = 0; k < h; ++k) {
        for (int l = 0; l < w; ++l) {
          std::cout << std::setw(4) << std::right << data[a];
          ++a;
        }
        std::cout << std::endl;
      }
    }
  }
  std::cout << std::endl;
}
int main(int argc, char **argv) {
  if (argc != 4) {
    std::cerr << "Insufficient number of arguments provided!" << std::endl;
    std::cerr << "Usage: conv <folder> <BFM> <IGID>" << std::endl;
    return 1;
  }
  const std::string experiment_folder = std::string(argv[1]);
  const std::string BFM = std::string(argv[2]);
  const std::string IGID = std::string(argv[3]);
  std::shared_ptr<float> input_data;
  std::vector<int> input_shape;
  std::shared_ptr<float> filters_data;
  std::vector<int> filters_shape;
  std::shared_ptr<float> output_data;
  std::vector<int> output_shape;
  load_tensor("input_1.npy", input_data, input_shape);
  load_tensor("input_2.npy", filters_data, filters_shape);
  load_tensor("output_1.npy", output_data, output_shape);

  cudnnHandle_t cudnn;
  CUDNN_CALL(cudnnCreate(&cudnn));

  // input
  const int in_n = input_shape[0];
  const int in_c = input_shape[1];
  const int in_h = input_shape[2];
  const int in_w = input_shape[3];

  cudnnTensorDescriptor_t in_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
      in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
      in_n, in_c, in_h, in_w));

  float *in_data;
  float *h_in_data = input_data.get();
  CUDA_CALL(cudaMalloc(
      &in_data, in_n * in_c * in_h * in_w * sizeof(float)));
  CUDA_CALL(cudaMemcpy(
      in_data, h_in_data, in_n * in_c * in_h * in_w * sizeof(float), cudaMemcpyHostToDevice));

  // filters
  const int filt_k = filters_shape[0];
  const int filt_c = filters_shape[1];
  const int filt_h = filters_shape[2];
  const int filt_w = filters_shape[3];

  cudnnFilterDescriptor_t filt_desc;
  CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc));
  CUDNN_CALL(cudnnSetFilter4dDescriptor(
      filt_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
      filt_k, filt_c, filt_h, filt_w));

  float *filt_data;
  float *h_filt_data = filters_data.get();
  CUDA_CALL(cudaMalloc(
      &filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float)));
  CUDA_CALL(cudaMemcpy(
      filt_data, h_filt_data, filt_k * filt_c * filt_h * filt_w * sizeof(float),
      cudaMemcpyHostToDevice));
  // convolution
  const int pad_h = 0;
  const int pad_w = 0;
  const int str_h = 1;
  const int str_w = 1;
  const int dil_h = 1;
  const int dil_w = 1;

  cudnnConvolutionDescriptor_t conv_desc;
  CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc));
  CUDNN_CALL(cudnnSetConvolution2dDescriptor(
      conv_desc,
      pad_h, pad_w, str_h, str_w, dil_h, dil_w,
      CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

  // output
  int out_n;
  int out_c;
  int out_h;
  int out_w;
  CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
      conv_desc, in_desc, filt_desc,
      &out_n, &out_c, &out_h, &out_w));

  cudnnTensorDescriptor_t out_desc;
  CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc));
  CUDNN_CALL(cudnnSetTensor4dDescriptor(
      out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
      out_n, out_c, out_h, out_w));

  float *out_data;
  float *h_out_data;
  h_out_data = (float *) malloc(out_n * out_c * out_h * out_w * sizeof(float));
  if (!h_out_data) {
    std::cout << "Error on h_out_data malloc" << std::endl;
    return 1;
  }
  CUDA_CALL(cudaMalloc(
      &out_data, out_n * out_c * out_h * out_w * sizeof(float)));
  cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;

  size_t ws_size;
  CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
      cudnn, in_desc, filt_desc, conv_desc, out_desc, algo, &ws_size));

  float *ws_data;
  CUDA_CALL(cudaMalloc(&ws_data, ws_size));

  float alpha = 1.f;
  float beta = 0.f;
  CUDNN_CALL(cudnnConvolutionForward(
      cudnn,
      &alpha, in_desc, in_data, filt_desc, filt_data,
      conv_desc, algo, ws_data, ws_size,
      &beta, out_desc, out_data));

  CUDA_CALL(cudaMemcpy(
      h_out_data, out_data, out_n * out_c * out_h * out_w * sizeof(float),
      cudaMemcpyDeviceToHost));

  if (compare(h_out_data, output_data.get(), output_shape)) {
    std::cout << "MASKED" << std::endl;
  } else {
    std::cout << "SDC" << std::endl;
  }
  // finalizing
  CUDA_CALL(cudaFree(ws_data));
  CUDA_CALL(cudaFree(out_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc));
  CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc));
  CUDA_CALL(cudaFree(filt_data));
  CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc));
  CUDA_CALL(cudaFree(in_data));
  CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc));
  CUDNN_CALL(cudnnDestroy(cudnn));
  // h_in_data and h_filt_data point into the shared_ptr-owned buffers filled by
  // load_tensor, so they must not be passed to free(); only h_out_data was
  // allocated with malloc here.
  free(h_out_data);
  return 0;
}
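To narrow down where the ~5 seconds actually go, my plan is to time only the cudnnConvolutionForward call with CUDA events, so that context creation, the allocations and the host/device copies are excluded from the measurement. This is just a sketch on top of the code above (the event calls are my assumption of the right way to measure a single kernel launch):

  // Sketch: time only the convolution itself, not cudnnCreate/cudaMalloc/memcpy.
  cudaEvent_t start, stop;
  CUDA_CALL(cudaEventCreate(&start));
  CUDA_CALL(cudaEventCreate(&stop));
  CUDA_CALL(cudaEventRecord(start));
  CUDNN_CALL(cudnnConvolutionForward(
      cudnn,
      &alpha, in_desc, in_data, filt_desc, filt_data,
      conv_desc, algo, ws_data, ws_size,
      &beta, out_desc, out_data));
  CUDA_CALL(cudaEventRecord(stop));
  CUDA_CALL(cudaEventSynchronize(stop));
  float ms = 0.f;
  CUDA_CALL(cudaEventElapsedTime(&ms, start, stop));
  std::cout << "convolution took " << ms << " ms" << std::endl;
  CUDA_CALL(cudaEventDestroy(start));
  CUDA_CALL(cudaEventDestroy(stop));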
Could anyone tell me if there is a problem in the code or if the convolution is simply slower than other frameworks’ convolutions?
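One specific thing I’m unsure about is that I hard-code the algorithm to CUDNN_CONVOLUTION_FWD_ALGO_GEMM. Would letting cuDNN pick the algorithm, roughly like the untested sketch below, be the right approach? This assumes the cuDNN 7 API (cudnnGetConvolutionForwardAlgorithm) and reuses the descriptors created above.

  // Sketch (untested): ask cuDNN for a forward algorithm instead of
  // hard-coding CUDNN_CONVOLUTION_FWD_ALGO_GEMM.
  cudnnConvolutionFwdAlgo_t algo;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
      cudnn, in_desc, filt_desc, conv_desc, out_desc,
      CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
      0 /* memoryLimitInBytes; only used with SPECIFY_WORKSPACE_LIMIT */,
      &algo));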
Thanks