Efficiency comparison between TFServing and TRTIS

I use TFServing and TRTIS to serve yolov2 and densenet, and I found that TRTIS is not as efficient as TFServing: TFServing infers yolov2 in 34 ms and densenet in 9 ms, while TRTIS takes 37 ms and 14 ms respectively. The models deployed on TRTIS are TensorFlow GraphDef models. My TRTIS inference code is as follows.
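
For context, the latencies above are wall-clock times measured around each predictor's doInference() call, roughly like the sketch below; the warm-up call and iteration count are illustrative, not my exact harness, and timeInference is a hypothetical name.

#include <chrono>
#include <iostream>
#include <vector>

// Illustrative timing wrapper around the yolov2 predictor defined below.
void timeInference(YOLOPredictor& predictor, const std::vector<cv::Mat>& images)
{
    std::vector<BoxInfo> boxes;
    predictor.doInference(images, boxes);  // warm-up call, not timed

    const int iters = 100;
    auto start = std::chrono::steady_clock::now();
    for(int i = 0; i < iters; ++i)
    {
        boxes.clear();
        predictor.doInference(images, boxes);
    }
    auto end = std::chrono::steady_clock::now();
    double ms = std::chrono::duration<double, std::milli>(end - start).count() / iters;
    std::cout << "mean latency: " << ms << " ms" << std::endl;
}
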
(1) yolov2

#include "yolo_predictor.h"
// Preprocess every image, run a single batched inference request, then decode boxes.
void YOLOPredictor::doInference(const std::vector<cv::Mat>& image_array, std::vector<BoxInfo>& boxInfos)
{
    // h and w end up holding the size of the last image, so postprocess
    // assumes all images in the batch share the same dimensions.
    int h = 0;
    int w = 0;
    std::vector<std::vector<uint8_t> > inputs_data;
    for(size_t i = 0; i < image_array.size(); ++i)
    {
        const cv::Mat& im = image_array[i];
        h = im.size().height;
        w = im.size().width;
        inputs_data.emplace_back();
        preprocess(im, &(inputs_data.back()));
    }
    std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > > results;
    inference(inputs_data, &results);

    postprocess(results, boxInfos, h, w);
}

// Resize to the network input size, swap channel order, scale to [0, 1],
// and copy the float pixels into a raw byte buffer for the request.
void YOLOPredictor::preprocess(const cv::Mat& im, std::vector<uint8_t>* input_data)
{
    int input_height = TC_Common::strto<int>(meta.find("inp_height")->second);
    int input_width = TC_Common::strto<int>(meta.find("inp_width")->second);

    cv::Mat imsz;
    // The interpolation flag is the sixth argument of cv::resize; passing it
    // fourth would set the fx scale factor instead.
    cv::resize(im, imsz, cv::Size(input_width, input_height), 0, 0, cv::INTER_LINEAR);
    cv::cvtColor(imsz, imsz, cv::COLOR_RGB2BGR);
    imsz.convertTo(imsz, CV_32FC3, 1.0/255, 0);

    size_t img_byte_size = imsz.total() * imsz.elemSize();
    input_data->resize(img_byte_size);
    memcpy(&((*input_data)[0]), imsz.datastart, img_byte_size);
}

// Build one batched request: batch_size images go into a single Run() call.
void YOLOPredictor::inference(const std::vector<std::vector<uint8_t> >& inputs_data, std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > >* results)
{
    const size_t batch_size = inputs_data.size();
    nic::Error err(ni::RequestStatusCode::SUCCESS);

    const auto& input = ctx->Inputs()[0];

    std::unique_ptr<nic::InferContext::Options> options;
    err = nic::InferContext::Options::Create(&options);
    if(!err.IsOk())
    {
        std::cerr << "failed initializing infer options: " << err << std::endl;
    }

    options->SetBatchSize(batch_size);
    for(const auto& output : ctx->Outputs())
    {
        options->AddRawResult(output);
    }
    err = ctx->SetRunOptions(*options);
    if(!err.IsOk())
    {
        std::cerr << "failed setting run options: " << err << std::endl;
    }

    err = input->Reset();
    if(!err.IsOk())
    {
        std::cerr << "failed resetting input: " << err << std::endl;
    }

    // Each SetRaw() appends one batch element to the input tensor.
    for(size_t i = 0; i < batch_size; ++i)
    {
        err = input->SetRaw(inputs_data[i]);
        if(!err.IsOk())
        {
            std::cerr << "failed setting input: " << err << std::endl;
        }
    }

    // One synchronous Run() for the whole batch.
    results->emplace_back();
    err = ctx->Run(&(results->back()));
    if(!err.IsOk())
    {
        std::cerr << "failed sending infer request: " << err << std::endl;
    }
}

// Decode the raw network output into thresholded, labeled bounding boxes.
void YOLOPredictor::postprocess(const std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > >& results, std::vector<BoxInfo>& boxInfos, int h, int w)
{
    if(results.size() != 1)
    {
        std::cerr << "expected 1 result, got " << results.size() << std::endl;
    }

    const std::unique_ptr<nic::InferContext::Result>& result = results[0][0];
    std::vector<BoundBox> boxes;
    findboxes(meta, result, boxes);
    for(const BoundBox& box : boxes)
    {
        int left, right, top, bot;
        std::string label;
        float confidence;
        if(!process_box(box, h, w, threshold, left, right, top, bot, label, confidence))
        {
            continue;
        }
        BoxInfo boxInfo;
        boxInfo.label = label;
        boxInfo.confidence = confidence;
        Point topleft;
        topleft.x = left;
        topleft.y = top;
        Point bottomright;
        bottomright.x = right;
        bottomright.y = bot;
        boxInfo.topleft = topleft;
        boxInfo.bottomright = bottomright;
        boxInfos.push_back(boxInfo);
    }
}

(2) densenet

#include "densenet_predictor.h"

// Preprocess every image, run per-image inference, and average the class probabilities.
void DensenetPredictor::doInference(const std::vector<cv::Mat>& image_array, std::vector<float>& proba_)
{
    proba_.assign(wn_num_class, 0.0f);
    std::vector<std::vector<uint8_t> > inputs_data;
    for(size_t i = 0; i < image_array.size(); ++i)
    {
        inputs_data.emplace_back();
        preprocess(image_array[i], &(inputs_data.back()));
    }
    std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > > results;
    inference(inputs_data, &results);
    
    postprocess(results, proba_);
}

// Swap channel order and copy the raw bytes. The conversion goes into a local
// copy so the caller's image is not modified through the const reference.
void DensenetPredictor::preprocess(const cv::Mat& im, std::vector<uint8_t>* input_data)
{
    cv::Mat bgr;
    cv::cvtColor(im, bgr, cv::COLOR_RGB2BGR);
    size_t img_byte_size = bgr.total() * bgr.elemSize();
    input_data->resize(img_byte_size);
    memcpy(&((*input_data)[0]), bgr.datastart, img_byte_size);
}

// Synchronous path: one Run() per image, each with batch size 1.
void DensenetPredictor::inference(const std::vector<std::vector<uint8_t> >& inputs_data, std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > >* results)
{
    const size_t batch_size = inputs_data.size();
    nic::Error err(ni::RequestStatusCode::SUCCESS);

    std::unique_ptr<nic::InferContext::Options> options;
    err = nic::InferContext::Options::Create(&options);
    if(!err.IsOk())
    {
        std::cerr << "failed initializing infer options: " << err << std::endl;
    }

    options->SetBatchSize(1);
    for(const auto& output : ctx->Outputs())
    {
        options->AddRawResult(output);
    }
    err = ctx->SetRunOptions(*options);
    if(!err.IsOk())
    {
        std::cerr << "failed setting run options: " << err << std::endl;
    }

    for(size_t idx = 0; idx < batch_size; ++idx)
    {
        const auto& input = ctx->Inputs()[0];
        err = input->Reset();
        if(!err.IsOk())
        {
            std::cerr << "failed resetting input: " << err << std::endl;
        }
        err = input->SetRaw(inputs_data[idx]);
        if(!err.IsOk())
        {
            std::cerr << "failed setting input: " << err << std::endl;
        }
        results->emplace_back();
        err = ctx->Run(&(results->back()));
        if(!err.IsOk())
        {
            std::cerr << "failed sending infer request: " << err << std::endl;
        }
    }
    
    /*
    // Asynchronous variant: issue all AsyncRun() requests first, then collect
    // the results. Each request still carries a single image (batch size 1).
    std::vector<std::shared_ptr<nic::InferContext::Request> > requests;

    for(size_t idx = 0; idx < batch_size; ++idx)
    {
        const auto& input = ctx->Inputs()[0];

        err = input->Reset();
        if(!err.IsOk())
        {
            std::cerr << "failed resetting input: " << err << std::endl;
        }
        err = input->SetRaw(inputs_data[idx]);
        if(!err.IsOk())
        {
            std::cerr << "failed setting input: " << err << std::endl;
        }
        std::shared_ptr<nic::InferContext::Request> req;
        err = ctx->AsyncRun(&req);
        if(!err.IsOk())
        {
            std::cerr << "failed sending infer request: " << err << std::endl;
        }
        requests.emplace_back(std::move(req));
    }
    std::cout << "requests size:" << requests.size() << std::endl;
    for(auto& request : requests)
    {
        results->emplace_back();
        err = ctx->GetAsyncRunResults(&(results->back()), request, true);
        if(!err.IsOk())
        {
            std::cerr << "failed receiving infer response: " << err << std::endl;
        }
    }
    */
}

// Average the per-class probabilities over all collected results.
void DensenetPredictor::postprocess(const std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > >& results, std::vector<float>& proba_)
{
    const size_t l = results.size();
    for(size_t idx = 0; idx < l; ++idx)
    {
        const std::unique_ptr<nic::InferContext::Result>& result = results[idx][0];
        // GetRawAtCursor advances a cursor through the flat output, reading
        // one class probability per call.
        for(int w = 0; w < wn_num_class; ++w)
        {
            float proba = 0.0f;
            result->GetRawAtCursor(0, &proba);
            proba_[w] += proba;
        }
    }
    if(l > 0)
    {
        for(int j = 0; j < wn_num_class; ++j)
        {
            proba_[j] /= l;
        }
    }
}
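
Regarding question (1) below, here is what I imagine a single batched request for densenet would look like, mirroring the yolov2 inference() above. This is a sketch only: it assumes the model's max_batch_size in the server configuration is at least the number of images, and inferenceBatched is a hypothetical name.

// Hypothetical batched variant: all images go into one Run() call.
void DensenetPredictor::inferenceBatched(const std::vector<std::vector<uint8_t> >& inputs_data, std::vector<std::vector<std::unique_ptr<nic::InferContext::Result> > >* results)
{
    const size_t batch_size = inputs_data.size();

    std::unique_ptr<nic::InferContext::Options> options;
    nic::Error err = nic::InferContext::Options::Create(&options);

    options->SetBatchSize(batch_size);  // the whole batch in one request
    for(const auto& output : ctx->Outputs())
    {
        options->AddRawResult(output);
    }
    err = ctx->SetRunOptions(*options);

    const auto& input = ctx->Inputs()[0];
    err = input->Reset();
    // Each SetRaw() appends one batch element to the input tensor.
    for(size_t idx = 0; idx < batch_size; ++idx)
    {
        err = input->SetRaw(inputs_data[idx]);
    }

    // One synchronous Run() for the whole batch; GetRawAtCursor(batch_idx, ...)
    // would then read the element for each image from the single result.
    results->emplace_back();
    err = ctx->Run(&(results->back()));
    if(!err.IsOk())
    {
        std::cerr << "failed sending infer request: " << err << std::endl;
    }
}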

(1) I wonder if my code is suboptimal; if it is, can you give me some advice? What I understand is that when two images need to be inferred through densenet, TRTIS offers two ways. One is synchronous invocation, as in the per-image Run() loop of the densenet inference() code; the other is asynchronous invocation, as in the commented-out AsyncRun() block. Either way, two images require two Run or AsyncRun calls. Can I instead submit two images in one request and get both inference results back at once, as TFServing allows? From a code standpoint, that would mean putting the data of both images into the input (for example via input->SetRaw()) and getting results of size 2 from a single Run() call, as in the batched sketch above.
(2)I found that asynchronous invocation is much slower than synchronous invocation.
(3) In addition, I found that the input dimensions of the PLAN model serialized by the TensorRT conversion differ from those of the original model. For example, my original model's input is [227, 227, 3] (HWC) and its output is [2]; after the TensorRT conversion, the PLAN model's input is [3, 227, 227] (CHW) and its output is [1, 1, 2]. So for a PLAN model, how should our TRTIS inference code set the input? A sketch of what I assume the preprocessing would look like follows this list. Would it be possible to provide a TensorRT PLAN model client demo?
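
For (3), here is what I assume the HWC-to-CHW repacking would look like on the client side; the 227x227 size is taken from my model, preprocessCHW is a hypothetical name, and I have not verified this against a PLAN model.

// Hypothetical preprocessing for a CHW [3, 227, 227] input: split the
// interleaved HWC image into channel planes and copy them back to back.
void preprocessCHW(const cv::Mat& im, std::vector<uint8_t>* input_data)
{
    cv::Mat resized;
    cv::resize(im, resized, cv::Size(227, 227), 0, 0, cv::INTER_LINEAR);
    resized.convertTo(resized, CV_32FC3, 1.0 / 255, 0);

    // One single-channel float plane per channel (C x H x W order).
    std::vector<cv::Mat> planes;
    cv::split(resized, planes);

    const size_t plane_bytes = planes[0].total() * planes[0].elemSize();
    input_data->resize(planes.size() * plane_bytes);
    for(size_t c = 0; c < planes.size(); ++c)
    {
        memcpy(input_data->data() + c * plane_bytes, planes[c].data, plane_bytes);
    }
}
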
These are the problems I ran into while evaluating TFServing and TRTIS for production. Thanks in advance for your answers.

Hello,

What is the performance of TF vs. TRT on yolov2 and densenet natively? Just trying to rule out some variables.

The inference time is measured from input data to output results. The preprocessing and postprocessing for TF and TRT are the same.

Interesting. So you are seeing the same inference performance for both TF and TRT? Let’s address that before involving the TRTIS layers.

Can you share a small repro that demonstrates the performance you are seeing?

Sorry, I can't provide the TFServing code. TFServing does take less time to get the same result. Right now I'm more concerned about whether the yolov2 and densenet TRTIS inference code is optimal. Can you give me some suggestions on my code?