TensorRT gives different results in Python and C++

Description

TensorRT gives different results in Python and C++ with the same engine and the same input.

Environment

TensorRT Version: 8.4.1.5
GPU Type: A10
Nvidia Driver Version: 495.29.05
CUDA Version: 11.2
CUDNN Version: 8.1.1
Operating System + Version: Ubuntu 16.04
Python Version (if applicable): 3.7.6
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.7.1
Baremetal or Container (if container which image + tag):

Relevant Files

Steps To Reproduce

Python code

import torch
import tensorrt as trt

class MyLogger(trt.ILogger):
    def __init__(self):
        trt.ILogger.__init__(self)

    def log(self, severity, msg):
        print(msg)

def run_trt(image, feature_map, neighbor_feature_map):
    logger = MyLogger()
    # Deserialize the prebuilt engine.
    with open("test.engine", "rb") as f:
        serialized_engine = f.read()
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()

    # Set the dynamic input shapes on the execution context.
    context.set_binding_shape(0, list(image.shape))
    context.set_binding_shape(1, list(feature_map.shape))
    context.set_binding_shape(2, list(neighbor_feature_map.shape))

    # Allocate the output with the shape TensorRT infers for binding 3.
    out_cls = torch.zeros(list(context.get_binding_shape(3))).float().cuda().contiguous()

    # Device pointers, placed by binding name rather than by a fixed order.
    buffers = [None] * 4
    buffers[engine.get_binding_index("image")] = image.data_ptr()
    buffers[engine.get_binding_index("feature_map")] = feature_map.data_ptr()
    buffers[engine.get_binding_index("neighbor_feature_map")] = neighbor_feature_map.data_ptr()
    buffers[engine.get_binding_index("out_cls")] = out_cls.data_ptr()
    context.execute_v2(buffers)
    return out_cls

if __name__ == "__main__":
    img = list(torch.jit.load("image_tensor.pt").parameters())[0].cuda().contiguous() # 1x3x700x700
    a = torch.zeros(1, 64, 350, 350).float().cuda()
    b = torch.zeros(1, 64, 700, 700).float().cuda()
    cls = run_trt(img.clone(), a.clone(), b.clone())
    print(cls.flatten()[0:20])

C++ code

#include <vector>
#include <string>
#include <assert.h>
#include <chrono>
#include <fstream>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/torch.h>
#include <torch/script.h>

#include <NvInferPlugin.h>

class Logger : public nvinfer1::ILogger {
public:
    void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override {
        std::cerr << msg << std::endl;
    }
};

int main() {
    torch::Device device(torch::kCUDA, 0);
    auto m = torch::jit::load("image_tensor.pt");
    auto img =  (*(m.parameters().begin())).to(device);
    auto a = torch::zeros({1, 64, 350, 350}).to(device).to(torch::kFloat);
    auto b = torch::zeros({1, 64, 700, 700}).to(device).to(torch::kFloat);
    auto logger = Logger{};
    using namespace nvinfer1;
    initLibNvInferPlugins(&logger, "");
    std::vector<char> trt_engine_stream;
    size_t size = 0;
    std::ifstream file("test.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trt_engine_stream.resize(size);
        file.read(trt_engine_stream.data(), size);
        file.close();
    }
   
    auto runtime = nvinfer1::createInferRuntime(logger);
    auto engine = runtime->deserializeCudaEngine(trt_engine_stream.data(), size);
    auto context = engine->createExecutionContext();
    context->setBindingDimensions(0, nvinfer1::Dims4{1, 3, 700, 700});
    context->setBindingDimensions(1, nvinfer1::Dims4{1, 64, 350, 350});
    context->setBindingDimensions(2, nvinfer1::Dims4{1, 64, 700, 700});
    auto cls_shape = context->getBindingDimensions(3);
    auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device);
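    // Note: the bindings below hard-code the order image, feature_map,
    // neighbor_feature_map, out_cls; the Python code looks indices up by name.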
    void* bindings[4] = {img.data_ptr<float>(),
                             a.data_ptr<float>(),
                             b.data_ptr<float>(),
                             cls_tensor.data_ptr<float>()};
    if (!context->executeV2(bindings)) {
        std::cerr << "executeV2 failed" << std::endl;
        return 1;
    }
    cls_tensor = cls_tensor.to(torch::kCPU);
    std::cout << std::endl;
    for (int j = 0; j < 20; ++j) {
        std::cout << (cls_tensor.data_ptr<float>())[j] << " ";
    }
    std::cout << std::endl;
    return 0;
}

Hi,
Please refer to the links below related to custom plugin implementation and samples:

While the IPluginV2 and IPluginV2Ext interfaces are still supported for backward compatibility with TensorRT 5.1 and 6.0.x respectively, we recommend that you write new plugins or refactor existing ones to target the IPluginV2DynamicExt or IPluginV2IOExt interfaces instead.

Thanks!

There is no custom op in the engine.

Could you please share with us the ONNX model and the script/command used to generate the TensorRT engine?

Thank you.

The ONNX model:

Generate script:

import argparse
import pickle
import sys
import tempfile
import torch
import tensorrt as trt

from inference import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_onnx(args, onnx_file):
    print("converting to onnx")
    ModelClass = getattr(sys.modules[__name__], args.name)

    with open(args.weights, "rb") as f:
        weights = pickle.load(f)
    model = ModelClass(weights=weights).to(device)
    model.eval()

    dummy_inputs = (torch.randn(1, 3, 512, 512, device=device),
                    torch.randn(1, 64, 256, 256, device=device),
                    torch.randn(1, 64, 512, 512, device=device))

    input_names = ["image", "feature_map", "neighbor_feature_map"]
    output_names = ["out_cls"]

    torch.onnx.export(model, dummy_inputs, onnx_file, opset_version=11,
            verbose=False, input_names=input_names, output_names=output_names,
            enable_onnx_checker=args.check,
            dynamic_axes={"image": {0: "b", 2: "h", 3: "w"},
                          "feature_map": {0: "b", 2: "h2", 3: "w2"},
                          "neighbor_feature_map": {0: "b", 2: "h", 3: "w"},
                          "out_cls": {0: "b", 2: "h", 3: "w"}})
    print("converted to onnx: {}".format(onnx_file))
    return onnx_file


EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def get_trt(args, onnx_file):
    print("converting to TensorRT engine")
    min_shape = tuple(int(i) for i in args.min.split(","))
    opt_shape = tuple(int(i) for i in args.opt.split(","))
    max_shape = tuple(int(i) for i in args.max.split(","))

    min_feature = (min_shape[0], 64, min_shape[2] // 2, min_shape[3] // 2)
    opt_feature = (opt_shape[0], 64, opt_shape[2] // 2, opt_shape[3] // 2)
    max_feature = (max_shape[0], 64, max_shape[2] // 2, max_shape[3] // 2)


    min_neighbor = (min_shape[0], 64, min_shape[2], min_shape[3])
    opt_neighbor = (opt_shape[0], 64, opt_shape[2], opt_shape[3])
    max_neighbor = (max_shape[0], 64, max_shape[2], max_shape[3])

    print("dynamic shape: ", min_shape, opt_shape, max_shape)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network.get_input(0).dtype = trt.DataType.FLOAT
    network.get_input(1).dtype = trt.DataType.FLOAT
    network.get_input(2).dtype = trt.DataType.FLOAT
    network.get_output(0).dtype = trt.DataType.FLOAT
    profile = builder.create_optimization_profile()
    profile.set_shape("image", min_shape, opt_shape, max_shape)
    profile.set_shape("feature_map", min_feature, opt_feature, max_feature)
    profile.set_shape("neighbor_feature_map", min_neighbor,
                      opt_neighbor, max_neighbor)
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(5))
    config.add_optimization_profile(profile)
    engine = builder.build_engine(network, config)
    with open(args.output, "wb") as f:
        f.write(engine.serialize())

    print("converted to TensorRT engine: {}".format(args.output))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="model name")
    parser.add_argument("output", help="output tensorrt model")
    parser.add_argument("--weights", help="input weights")
    parser.add_argument("--check", action="store_true", help="enable onnx check")
    parser.add_argument("--min", required=True, help="shape, e.g. (1, 3, 200, 200)")
    parser.add_argument("--opt", required=True, help="shape, e.g. (2, 3, 250, 250)")
    parser.add_argument("--max", required=True, help="shape, e.g. (3, 3, 300, 300)")

    args = parser.parse_args()

    f = tempfile.NamedTemporaryFile()
    onnx_file = f.name
    onnx_model = get_onnx(args, onnx_file)
    torch.cuda.empty_cache()
    get_trt(args, onnx_file)

The ModelClass is a simple model; do I need to provide its code?

Hi,

We are facing some issues running the script:

    from inference import *
ModuleNotFoundError: No module named 'inference'

Could you please share with us the complete scripts and the steps to run them? Also, could you please confirm which script's results differ from the PyTorch model output?

Thank you.

The C++ results are different from PyTorch.
I removed the code that generates the ONNX model; you can use the ONNX model to generate the TensorRT engine.
But the problem is that the TensorRT C++ results differ from the Python results, so perhaps how the TensorRT engine is generated doesn't matter?

import argparse
import pickle
import sys
import tempfile
import torch
import tensorrt as trt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def get_trt(args, onnx_file):
    print("converting to TensorRT engine")
    min_shape = tuple(int(i) for i in args.min.split(","))
    opt_shape = tuple(int(i) for i in args.opt.split(","))
    max_shape = tuple(int(i) for i in args.max.split(","))

    min_feature = (min_shape[0], 64, min_shape[2] // 2, min_shape[3] // 2)
    opt_feature = (opt_shape[0], 64, opt_shape[2] // 2, opt_shape[3] // 2)
    max_feature = (max_shape[0], 64, max_shape[2] // 2, max_shape[3] // 2)


    min_neighbor = (min_shape[0], 64, min_shape[2], min_shape[3])
    opt_neighbor = (opt_shape[0], 64, opt_shape[2], opt_shape[3])
    max_neighbor = (max_shape[0], 64, max_shape[2], max_shape[3])

    print("dynamic shape: ", min_shape, opt_shape, max_shape)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network.get_input(0).dtype = trt.DataType.FLOAT
    network.get_input(1).dtype = trt.DataType.FLOAT
    network.get_input(2).dtype = trt.DataType.FLOAT
    network.get_output(0).dtype = trt.DataType.FLOAT
    profile = builder.create_optimization_profile()
    profile.set_shape("image", min_shape, opt_shape, max_shape)
    profile.set_shape("feature_map", min_feature, opt_feature, max_feature)
    profile.set_shape("neighbor_feature_map", min_neighbor,
                      opt_neighbor, max_neighbor)
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(5))
    config.add_optimization_profile(profile)
    engine = builder.build_engine(network, config)
    with open(args.output, "wb") as f:
        f.write(engine.serialize())

    print("converted to TensorRT engine: {}".format(args.output))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="model name")
    parser.add_argument("output", help="output tensorrt model")
    parser.add_argument("--check", action="store_true", help="enable onnx check")
    parser.add_argument("--min", required=True, help="shape, e.g. (1, 3, 200, 200)")
    parser.add_argument("--opt", required=True, help="shape, e.g. (2, 3, 250, 250)")
    parser.add_argument("--max", required=True, help="shape, e.g. (3, 3, 300, 300)")

    args = parser.parse_args()

    onnx_file = "test.onnx"
    get_trt(args, onnx_file)
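
For reference, a quick way to double-check the binding order that the C++ code hard-codes (a minimal sketch; the Python code above looks bindings up by name instead):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("test.engine", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

# Print each binding's index, name, dtype, and whether it is an input.
for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i), engine.get_binding_dtype(i),
          engine.binding_is_input(i))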

Hi,

We are tracking this issue internally.
However, it would be useful to know how different they are.
For example, is the output totally corrupted, or is the difference very small?

Thank you.

The output is totally corrupted.
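
To quantify it, here is a minimal comparison sketch, assuming each program dumps out_cls to a .npy file (the file names below are hypothetical; neither snippet above writes them as-is):

import numpy as np

# Hypothetical dumps of out_cls from the Python and C++ runs.
a = np.load("out_cls_py.npy")
b = np.load("out_cls_cpp.npy")
print("max abs diff:", np.abs(a - b).max())
print("allclose:", np.allclose(a, b, atol=1e-4))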

Hi @user49985,

Could you please replace this line:

auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device);

with the following:

auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device).to(torch::kFloat);

and see if that fixes the issue?
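
A related sanity check (a sketch, assuming out_cls is binding index 3): confirm the engine actually expects FP32 on the output binding, since a dtype mismatch between the engine and the torch tensor would corrupt the output:

// Hypothetical check, not part of the suggested fix above.
assert(engine->getBindingDataType(3) == nvinfer1::DataType::kFLOAT);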

Thank you.

Could you reproduce the different results between C++ and Python?

Hi,

Our internal engineering team has yet to try it. At first glance, we found the above observation.
Could you please try the above and share the result with us?

Thank you.

Hi,

Our engineering team is trying to reproduce the issue, but we are unable to successfully build the TRT engine and run the scripts.

python3 repro.py --min "1,3,700,700" --opt "1,64,350,350" --max "1,64,700,700" test.onnx out_cls
converting to TensorRT engine
dynamic shape:  (1, 3, 700, 700) (1, 64, 350, 350) (1, 64, 700, 700)
[07/25/2022-11:37:58] [TRT] [E] 4: Specified optimization profiles must satisfy MIN<=OPT<=MAX.
[07/25/2022-11:37:58] [TRT] [E] 3: [optimizationProfile.cpp::setDimensions::144] Error Code 3: API Usage Error (Parameter check failed at: runtime/common/optimizationProfile.cpp::setDimensions::144, condition: validate(mThreadResources.getErrorRecorder(), newEntry, true)
)
Traceback (most recent call last):
  File "repro.py", line 75, in <module>
    get_trt(args, onnx_file)
  File "repro.py", line 49, in get_trt
    profile.set_shape("image", min_shape, opt_shape, max_shape)
RuntimeError: Shape provided for opt is inconsistent with other shapes.


python3 repro.py --min "1,3,700,700" --opt "1,64,700,700" --max "1,64,700,700" test.onnx out_cls
converting to TensorRT engine
dynamic shape:  (1, 3, 700, 700) (1, 64, 700, 700) (1, 64, 700, 700)
repro.py:56: DeprecationWarning: Use build_serialized_network instead.
  engine = builder.build_engine(network, config)
[07/25/2022-11:39:11] [TRT] [E] 4: [network.cpp::validate::3064] Error Code 4: Internal Error (image: for dimension number 1 in profile 0 does not match network definition (got min=3, opt=64, max=64), expected min=opt=max=3).)
Traceback (most recent call last):
  File "repro.py", line 75, in <module>
    get_trt(args, onnx_file)
  File "repro.py", line 58, in get_trt
    f.write(engine.serialize())
AttributeError: 'NoneType' object has no attribute 'serialize'

Could you please share the exact steps to run the scripts, along with verbose logs/commands? That would help us quickly debug and fix this issue.

Thank you.

I have provided the TRT engine; you can use the Python and C++ code above to reproduce the outputs.

import argparse
import pickle
import sys
import tempfile
import torch
import tensorrt as trt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def get_trt(args, onnx_file):
    print("converting to TensorRT engine")
    min_shape = tuple(int(i) for i in args.min.split(","))
    opt_shape = tuple(int(i) for i in args.opt.split(","))
    max_shape = tuple(int(i) for i in args.max.split(","))

    min_feature = (min_shape[0], 64, min_shape[2] // 2, min_shape[3] // 2)
    opt_feature = (opt_shape[0], 64, opt_shape[2] // 2, opt_shape[3] // 2)
    max_feature = (max_shape[0], 64, max_shape[2] // 2, max_shape[3] // 2)


    min_neighbor = (min_shape[0], 64, min_shape[2], min_shape[3])
    opt_neighbor = (opt_shape[0], 64, opt_shape[2], opt_shape[3])
    max_neighbor = (max_shape[0], 64, max_shape[2], max_shape[3])

    print("dynamic shape: ", min_shape, opt_shape, max_shape)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network.get_input(0).dtype = trt.DataType.FLOAT
    network.get_input(1).dtype = trt.DataType.FLOAT
    network.get_input(2).dtype = trt.DataType.FLOAT
    network.get_output(0).dtype = trt.DataType.FLOAT
    profile = builder.create_optimization_profile()
    profile.set_shape("image", min_shape, opt_shape, max_shape)
    profile.set_shape("feature_map", min_feature, opt_feature, max_feature)
    profile.set_shape("neighbor_feature_map", min_neighbor,
                      opt_neighbor, max_neighbor)
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(5))
    config.add_optimization_profile(profile)
    engine = builder.build_engine(network, config)
    with open(args.output, "wb") as f:
        f.write(engine.serialize())

    print("converted to TensorRT engine: {}".format(args.output))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("output", help="output tensorrt model")
    parser.add_argument("--min", required=True, help="shape, e.g. (1, 3, 200, 200)")
    parser.add_argument("--opt", required=True, help="shape, e.g. (2, 3, 250, 250)")
    parser.add_argument("--max", required=True, help="shape, e.g. (3, 3, 300, 300)")

    args = parser.parse_args()

    onnx_file = "test.onnx"
    get_trt(args, onnx_file)

python test.py test.engine --min 1,3,300,300 --opt 1,3,600,600 --max 1,3,700,700

This command and script will generate the TRT engine from the ONNX model.

Thank you. Let us try this one.

A TensorRT engine is not portable across platforms; it depends on the TensorRT version, OS, GPU, etc.
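
For example, rather than reusing a shipped engine file, it can be rebuilt locally from the ONNX model with trtexec (a sketch using the same dynamic shapes as the command above; adjust paths as needed):

trtexec --onnx=test.onnx \
        --minShapes=image:1x3x300x300,feature_map:1x64x150x150,neighbor_feature_map:1x64x300x300 \
        --optShapes=image:1x3x600x600,feature_map:1x64x300x300,neighbor_feature_map:1x64x600x600 \
        --maxShapes=image:1x3x700x700,feature_map:1x64x350x350,neighbor_feature_map:1x64x700x700 \
        --saveEngine=test.engine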

Hi,

Sorry, we are facing an issue compiling the C++ code as well.
It would be hard for us to work on a fix without successfully running the repro.
Could you please share the exact commands/steps to build the C++ code as well?
We are trying with:

g++ -I${TRT_ROOT}/include cpp_repro.cpp -o cpp_repro -I/usr/local/cuda/include -Ilibtorch/include/torch/csrc/api/include/ -Ilibtorch/include -x c++ -std=c++14 -Wno-deprecated-declarations

Thank you.

You can use CMake:

cmake_minimum_required (VERSION 3.4)
project (trt-test)

find_package(CUDA REQUIRED)
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

add_compile_options(-O0)
add_compile_options(-g)


include_directories (${TORCH_INCLUDE_DIRS})
include_directories (/usr/local/cuda/include)
include_directories (/Develop/download/TensorRT-8.4.1.5/include)
link_directories (/Develop/download/TensorRT-8.4.1.5/lib)
link_directories (/usr/local/cuda/lib64)
link_libraries (${TORCH_LIBRARIES})
link_libraries (cuda)
link_libraries (cudart)
link_libraries (nvinfer)
link_libraries (nvinfer_plugin)


add_executable (trt_test trt_test.cpp)

Replace the TRT paths with yours, then run cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
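
The full build sequence would look something like this (assuming the C++ file is saved as trt_test.cpp next to the CMakeLists.txt):

mkdir build && cd build
cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
make
./trt_test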

Hi @user49985 ,

Could you please share a working C++ script? This code does not compile because it is incomplete.

Thank you.