Inference time increases rapidly with a high-resolution input image


I wrote inference code using TensorRT for a super-resolution task with the Real-ESRGAN model, which is a fully convolutional network.
My workflow pipeline is:
1. export an ONNX model from PyTorch
2. add post-processing to the ONNX model
3. build a TensorRT engine from the ONNX model
4. write the inference code with TensorRT
When I run the inference code with images of different resolutions one by one, I find that the inference time increases rapidly as the resolution grows, and in my experience it is too slow. For example, I measured about 60 ms for a 100×100 input image and about 360 ms for a 400×400 input image.


TensorRT Version: --fix-missing
GPU Type: 2070 Super
Nvidia Driver Version: 495.29.05
CUDA Version: 11.5.2
CUDNN Version:
Operating System + Version: ubuntu 20.04
Python Version (if applicable): 3.9
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 2.0.0
Baremetal or Container (if container which image + tag):

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)

Steps To Reproduce

1. export an ONNX model from PyTorch
2. add post-processing to the ONNX model
3. build a TensorRT engine from the ONNX model (a rough sketch of this step is shown below)
4. write the inference code with TensorRT
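
For step 3, a minimal sketch of how the engine could be built in C++ with dynamic input height/width. The tensor name "input", the file paths, and the 100×100 to 400×400 profile range are assumptions for illustration, not taken from my actual script:

#include <cstdio>
#include <fstream>
#include <memory>
#include <NvInfer.h>
#include <NvOnnxParser.h>

class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING) printf("%s\n", msg);
    }
};

bool build_engine(const char* onnx_path, const char* engine_path) {
    Logger logger;
    auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
    auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(
        1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger));
    if (!parser->parseFromFile(onnx_path, static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)))
        return false;

    auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    // One optimization profile covering the resolutions actually used at runtime.
    // kOPT should match the most common input size; shapes far from kOPT may run
    // with kernels tuned for a different resolution.
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 100, 100});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{1, 3, 400, 400});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{1, 3, 400, 400});
    config->addOptimizationProfile(profile);

    auto serialized = std::unique_ptr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
    if (!serialized) return false;
    std::ofstream out(engine_path, std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    return true;
}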
A fragment of the TensorRT inference code (step 4) is below:

void inference(Job& job){
            auto ppre_start_time = std::chrono::high_resolution_clock::now();
            cv::Mat image = cv::imdecode(job.input_image, cv::IMREAD_COLOR);
            int input_channel = image.channels();   
            int input_height = image.rows;
            int input_width = image.cols;
            int input_numel = input_batch * input_channel * input_height * input_width;
            float* input_data_host = nullptr;
            float* input_data_device = nullptr;
            checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float)));
            checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));
            auto ppre_end_time = std::chrono::high_resolution_clock::now();
            auto ppre_duration = std::chrono::duration_cast<std::chrono::milliseconds>(ppre_end_time - ppre_start_time);
            printf("前前处理执行时间: %ld ms\n", ppre_duration.count());

            auto pre_start_time = std::chrono::high_resolution_clock::now();
            int image_area = image.cols * image.rows;
            unsigned char* pimage = image.data;
            float* phost_b = input_data_host + image_area * 0;
            float* phost_g = input_data_host + image_area * 1;
            float* phost_r = input_data_host + image_area * 2;
            for(int i = 0; i < image_area; ++i, pimage += 3){
                // note: the R and B channels are swapped here (OpenCV BGR -> RGB planes)
                *phost_r++ = pimage[0] / 255.0f;
                *phost_g++ = pimage[1] / 255.0f;
                *phost_b++ = pimage[2] / 255.0f;
            }
            checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));

            int output_batch = input_batch;
            int output_channel = input_channel;
            int output_height = input_height * scale_factor;
            int output_width = input_width * scale_factor;
            int output_numel = output_batch *  output_channel * output_height * output_width;
            float* output_data_host = nullptr;
            float* output_data_device = nullptr;  
            checkRuntime(cudaMallocHost(&output_data_host, output_numel * sizeof(float)));    
            checkRuntime(cudaMalloc(&output_data_device, output_numel * sizeof(float)));

            // determine the input dimensions used for this inference
            auto input_dims = execution_context->getBindingDimensions(0);
            input_dims.d[2] = input_height;
            input_dims.d[3] = input_width;
            // for(int i = 0; i < 4; ++i){
            //     printf("dim %d: %d\n", i, input_dims.d[i]);
            // }
            // set the input size for this inference
            execution_context->setBindingDimensions(0, input_dims);
            float* bindings[] = {input_data_device, output_data_device};

            bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);    // TODO: check whether the inference succeeded

            checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, output_numel * sizeof(float), cudaMemcpyDeviceToHost, stream));
            // wait for the enqueued inference and the device-to-host copy to finish
            // before stopping the timer and reading the host buffer
            checkRuntime(cudaStreamSynchronize(stream));
            auto pre_end_time = std::chrono::high_resolution_clock::now();
            auto pre_duration = std::chrono::duration_cast<std::chrono::milliseconds>(pre_end_time - pre_start_time);
            printf("Inference time: %ld ms\n", pre_duration.count());
            // printf("Inference succeeded!\n");
            auto post_start_time = std::chrono::high_resolution_clock::now();
            unsigned char* output_data_uchar = new unsigned char[output_numel];
            for (int i = 0; i < output_numel; ++i){
                output_data_uchar[i] = static_cast<uint8_t>(output_data_host[i]);
            }
            cv::Mat output_image(output_height, output_width, CV_8UC3, output_data_uchar);
            std::vector<uint8_t> output_image_bytes;
            cv::imencode(image_type, output_image, output_image_bytes);
            // free the buffer only after imencode, since cv::Mat does not copy the data
            delete [] output_data_uchar;
            auto post_end_time = std::chrono::high_resolution_clock::now();
            auto post_duration = std::chrono::duration_cast<std::chrono::milliseconds>(post_end_time - post_start_time);
            printf("Post-processing time: %ld ms\n", post_duration.count());

Please include:

  • Exact steps/commands to build your repro
  • Exact steps/commands to run your repro
  • Full traceback of errors encountered


Please share the model, script, profiler, and performance output, if not shared already, so that we can help you better.

Alternatively, you can try running your model with the trtexec command.
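
For example, a hypothetical invocation (the file names, input tensor name, and shape ranges are placeholders):

trtexec --onnx=realesrgan.onnx --fp16 \
        --minShapes=input:1x3x100x100 \
        --optShapes=input:1x3x400x400 \
        --maxShapes=input:1x3x400x400 \
        --saveEngine=realesrgan.plan

trtexec reports the GPU compute time separately from host-side overhead, which makes it easier to see how the network itself scales with resolution.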

While measuring the model performance, make sure you consider the latency and throughput of the network inference, excluding the data pre- and post-processing overhead.
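
For example, a minimal sketch (assuming execution_context, bindings, and stream are set up as in your fragment) that times only the enqueued network execution with CUDA events, after a few warm-up runs:

// Sketch: measure only the network execution, excluding pre- and post-processing.
for (int i = 0; i < 10; ++i)        // warm-up iterations
    execution_context->enqueueV2((void**)bindings, stream, nullptr);
checkRuntime(cudaStreamSynchronize(stream));

cudaEvent_t start, stop;
checkRuntime(cudaEventCreate(&start));
checkRuntime(cudaEventCreate(&stop));
checkRuntime(cudaEventRecord(start, stream));
for (int i = 0; i < 100; ++i)       // timed iterations
    execution_context->enqueueV2((void**)bindings, stream, nullptr);
checkRuntime(cudaEventRecord(stop, stream));
checkRuntime(cudaEventSynchronize(stop));
float ms = 0.0f;
checkRuntime(cudaEventElapsedTime(&ms, start, stop));
printf("mean GPU latency: %.3f ms\n", ms / 100.0f);
checkRuntime(cudaEventDestroy(start));
checkRuntime(cudaEventDestroy(stop));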
Please refer to the below links for more details: