TensorRT gives different results in Python and C++

Description

TensorRT gives different results in Python and C++ with the same engine and the same input.

Environment

TensorRT Version: 8.4.1.5
GPU Type: A10
Nvidia Driver Version: 495.29.05
CUDA Version: 11.2
CUDNN Version: 8.1.1
Operating System + Version: Ubuntu 16.04
Python Version (if applicable): 3.7.6
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.7.1
Baremetal or Container (if container which image + tag):

Relevant Files

Steps To Reproduce

Python code

import torch
import tensorrt as trt

class MyLogger(trt.ILogger):
    def __init__(self):
        trt.ILogger.__init__(self)

    def log(self, severity, msg):
        print(msg)

def run_trt(image, feature_map, neighbor_feature_map):
    logger = MyLogger()
    # Deserialize the prebuilt engine.
    with open("test.engine", "rb") as f:
        serialized_engine = f.read()
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()

    # Set the dynamic input shapes on the execution context.
    context.set_binding_shape(0, list(image.shape))
    context.set_binding_shape(1, list(feature_map.shape))
    context.set_binding_shape(2, list(neighbor_feature_map.shape))

    # Allocate the output with the shape TensorRT infers for binding 3.
    out_cls = torch.zeros(list(context.get_binding_shape(3))).float().cuda().contiguous()

    # Device pointers, placed by binding name rather than by a fixed order.
    buffers = [None] * 4
    buffers[engine.get_binding_index("image")] = image.data_ptr()
    buffers[engine.get_binding_index("feature_map")] = feature_map.data_ptr()
    buffers[engine.get_binding_index("neighbor_feature_map")] = neighbor_feature_map.data_ptr()
    buffers[engine.get_binding_index("out_cls")] = out_cls.data_ptr()
    context.execute_v2(buffers)
    return out_cls

if __name__ == "__main__":
    img = list(torch.jit.load("image_tensor.pt").parameters())[0].cuda().contiguous() # 1x3x700x700
    a = torch.zeros(1, 64, 350, 350).float().cuda()
    b = torch.zeros(1, 64, 700, 700).float().cuda()
    cls = run_trt(img.clone(), a.clone(), b.clone())
    print(cls.flatten()[0:20])

C++ code

#include <vector>
#include <string>
#include <assert.h>
#include <chrono>
#include <fstream>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <torch/torch.h>
#include <torch/script.h>

#include <NvInferPlugin.h>

class Logger : public nvinfer1::ILogger {
public:
    void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override {
        std::cerr << msg << std::endl;
    }
};

int main() {
    torch::Device device(torch::kCUDA, 0);
    auto m = torch::jit::load("image_tensor.pt");
    auto img =  (*(m.parameters().begin())).to(device);
    auto a = torch::zeros({1, 64, 350, 350}).to(device).to(torch::kFloat);
    auto b = torch::zeros({1, 64, 700, 700}).to(device).to(torch::kFloat);
    auto logger = Logger{};
    using namespace nvinfer1;
    initLibNvInferPlugins(&logger, "");
    std::vector<char> trt_engine_stream;
    size_t size = 0;
    std::ifstream file("test.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trt_engine_stream.resize(size);
        file.read(trt_engine_stream.data(), size);
        file.close();
    }
   
    auto runtime = nvinfer1::createInferRuntime(logger);
    auto engine = runtime->deserializeCudaEngine(trt_engine_stream.data(), size);
    auto context = engine->createExecutionContext();
    context->setBindingDimensions(0, nvinfer1::Dims4{1, 3, 700, 700});
    context->setBindingDimensions(1, nvinfer1::Dims4{1, 64, 350, 350});
    context->setBindingDimensions(2, nvinfer1::Dims4{1, 64, 700, 700});
    auto cls_shape = context->getBindingDimensions(3);
    auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device);
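    // Note: the bindings below hard-code the order image, feature_map,
    // neighbor_feature_map, out_cls; the Python code looks indices up by name.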
    void* bindings[4] = {img.data_ptr<float>(),
                             a.data_ptr<float>(),
                             b.data_ptr<float>(),
                             cls_tensor.data_ptr<float>()};
    if (!context->executeV2(bindings)) {
        std::cerr << "executeV2 failed" << std::endl;
        return 1;
    }
    cls_tensor = cls_tensor.to(torch::kCPU);
    std::cout << std::endl;
    for (int j = 0; j < 20; ++j) {
        std::cout << (cls_tensor.data_ptr<float>())[j] << " ";
    }
    std::cout << std::endl;
    return 0;
}

Hi,
Please refer to the links below related to custom plugin implementation and samples:

While the IPluginV2 and IPluginV2Ext interfaces are still supported for backward compatibility with TensorRT 5.1 and 6.0.x respectively, we recommend that you write new plugins or refactor existing ones to target the IPluginV2DynamicExt or IPluginV2IOExt interfaces instead.

Thanks!

There is no custom op in the engine.

Could you please share with us the ONNX model and the script/command used to generate the TensorRT engine?

Thank you.

The ONNX model:

Generate script:

import argparse
import pickle
import sys
import tempfile
import torch
import tensorrt as trt

from inference import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_onnx(args, onnx_file):
    print("converting to onnx")
    ModelClass = getattr(sys.modules[__name__], args.name)

    with open(args.weights, "rb") as f:
        weights = pickle.load(f)
    model = ModelClass(weights=weights).to(device)
    model.eval()

    dummy_inputs = (torch.randn(1, 3, 512, 512, device=device),
                    torch.randn(1, 64, 256, 256, device=device),
                    torch.randn(1, 64, 512, 512, device=device))

    input_names = ["image", "feature_map", "neighbor_feature_map"]
    output_names = ["out_cls"]

    torch.onnx.export(model, dummy_inputs, onnx_file, opset_version=11,
            verbose=False, input_names=input_names, output_names=output_names,
            enable_onnx_checker=args.check,
            dynamic_axes={"image": {0: "b", 2: "h", 3: "w"},
                          "feature_map": {0: "b", 2: "h2", 3: "w2"},
                          "neighbor_feature_map": {0: "b", 2: "h", 3: "w"},
                          "out_cls": {0: "b", 2: "h", 3: "w"}})
    print("converted to onnx: {}".format(onnx_file))
    return onnx_file


EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def get_trt(args, onnx_file):
    print("converting to TensorRT engine")
    min_shape = tuple(int(i) for i in args.min.split(","))
    opt_shape = tuple(int(i) for i in args.opt.split(","))
    max_shape = tuple(int(i) for i in args.max.split(","))

    min_feature = (min_shape[0], 64, min_shape[2] // 2, min_shape[3] // 2)
    opt_feature = (opt_shape[0], 64, opt_shape[2] // 2, opt_shape[3] // 2)
    max_feature = (max_shape[0], 64, max_shape[2] // 2, max_shape[3] // 2)


    min_neighbor = (min_shape[0], 64, min_shape[2], min_shape[3])
    opt_neighbor = (opt_shape[0], 64, opt_shape[2], opt_shape[3])
    max_neighbor = (max_shape[0], 64, max_shape[2], max_shape[3])

    print("dynamic shape: ", min_shape, opt_shape, max_shape)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network.get_input(0).dtype = trt.DataType.FLOAT
    network.get_input(1).dtype = trt.DataType.FLOAT
    network.get_input(2).dtype = trt.DataType.FLOAT
    network.get_output(0).dtype = trt.DataType.FLOAT
    profile = builder.create_optimization_profile()
    profile.set_shape("image", min_shape, opt_shape, max_shape)
    profile.set_shape("feature_map", min_feature, opt_feature, max_feature)
    profile.set_shape("neighbor_feature_map", min_neighbor,
                      opt_neighbor, max_neighbor)
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(5))
    config.add_optimization_profile(profile)
    engine = builder.build_engine(network, config)
    with open(args.output, "wb") as f:
        f.write(engine.serialize())

    print("converted to TensorRT engine: {}".format(args.output))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="model name")
    parser.add_argument("output", help="output tensorrt model")
    parser.add_argument("--weights", help="input weights")
    parser.add_argument("--check", action="store_true", help="enable onnx check")
    parser.add_argument("--min", required=True, help="shape, e.g. (1, 3, 200, 200)")
    parser.add_argument("--opt", required=True, help="shape, e.g. (2, 3, 250, 250)")
    parser.add_argument("--max", required=True, help="shape, e.g. (3, 3, 300, 300)")

    args = parser.parse_args()

    f = tempfile.NamedTemporaryFile()
    onnx_file = f.name
    onnx_model = get_onnx(args, onnx_file)
    torch.cuda.empty_cache()
    get_trt(args, onnx_file)

The ModelClass is a simple model; do I need to provide its code?

Hi,

We are facing some issues running the script:

    from inference import *
ModuleNotFoundError: No module named 'inference'

Could you please share with us the complete scripts and the steps to run them? Also, could you please confirm which script's results differ from the PyTorch model output?

Thank you.

The C++ results are different from PyTorch.
I removed the code that generates the ONNX model; you can use the ONNX model to generate the TensorRT engine.
But the problem is that the TensorRT C++ results differ from the Python results, so perhaps how the TensorRT engine is generated doesn't matter?

import argparse
import pickle
import sys
import tempfile
import torch
import tensorrt as trt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def get_trt(args, onnx_file):
    print("converting to TensorRT engine")
    min_shape = tuple(int(i) for i in args.min.split(","))
    opt_shape = tuple(int(i) for i in args.opt.split(","))
    max_shape = tuple(int(i) for i in args.max.split(","))

    min_feature = (min_shape[0], 64, min_shape[2] // 2, min_shape[3] // 2)
    opt_feature = (opt_shape[0], 64, opt_shape[2] // 2, opt_shape[3] // 2)
    max_feature = (max_shape[0], 64, max_shape[2] // 2, max_shape[3] // 2)


    min_neighbor = (min_shape[0], 64, min_shape[2], min_shape[3])
    opt_neighbor = (opt_shape[0], 64, opt_shape[2], opt_shape[3])
    max_neighbor = (max_shape[0], 64, max_shape[2], max_shape[3])

    print("dynamic shape: ", min_shape, opt_shape, max_shape)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network.get_input(0).dtype = trt.DataType.FLOAT
    network.get_input(1).dtype = trt.DataType.FLOAT
    network.get_input(2).dtype = trt.DataType.FLOAT
    network.get_output(0).dtype = trt.DataType.FLOAT
    profile = builder.create_optimization_profile()
    profile.set_shape("image", min_shape, opt_shape, max_shape)
    profile.set_shape("feature_map", min_feature, opt_feature, max_feature)
    profile.set_shape("neighbor_feature_map", min_neighbor,
                      opt_neighbor, max_neighbor)
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(5))
    config.add_optimization_profile(profile)
    engine = builder.build_engine(network, config)
    with open(args.output, "wb") as f:
        f.write(engine.serialize())

    print("converted to TensorRT engine: {}".format(args.output))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="model name")
    parser.add_argument("output", help="output tensorrt model")
    parser.add_argument("--check", action="store_true", help="enable onnx check")
    parser.add_argument("--min", required=True, help="shape, e.g. (1, 3, 200, 200)")
    parser.add_argument("--opt", required=True, help="shape, e.g. (2, 3, 250, 250)")
    parser.add_argument("--max", required=True, help="shape, e.g. (3, 3, 300, 300)")

    args = parser.parse_args()

    onnx_file = "test.onnx"
    get_trt(args, onnx_file)
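
For reference, a quick way to double-check the binding order that the C++ code hard-codes (a minimal sketch; the Python code above looks bindings up by name instead):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("test.engine", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

# Print each binding's index, name, dtype, and whether it is an input.
for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i), engine.get_binding_dtype(i),
          engine.binding_is_input(i))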

Hi,

We are tracking this issue internally.
However, it would be useful to know how different they are.
For example, is the output totally corrupted, or is the difference very small?

Thank you.

The output is totally corrupted.
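
To quantify it, here is a minimal comparison sketch, assuming each program dumps out_cls to a .npy file (the file names below are hypothetical; neither snippet above writes them as-is):

import numpy as np

# Hypothetical dumps of out_cls from the Python and C++ runs.
a = np.load("out_cls_py.npy")
b = np.load("out_cls_cpp.npy")
print("max abs diff:", np.abs(a - b).max())
print("allclose:", np.allclose(a, b, atol=1e-4))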

Hi @user49985,

Could you please replace this line:

auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device);

with the following:

auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device).to(torch::kFloat);

and see if that fixes the issue?
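
A related sanity check (a sketch, assuming out_cls is binding index 3): confirm the engine actually expects FP32 on the output binding, since a dtype mismatch between the engine and the torch tensor would corrupt the output:

// Hypothetical check, not part of the suggested fix above.
assert(engine->getBindingDataType(3) == nvinfer1::DataType::kFLOAT);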

Thank you.

Could you reproduce the different results between C++ and Python?

Hi,

Our internal engineering team has yet to try it. At first glance, we found the above observation.
Could you please try the above and share the result with us?

Thank you.

Hi,

Our engineering team is trying to reproduce the issue, but we are unable to successfully build the TRT engine and run the scripts.

python3 repro.py --min "1,3,700,700" --opt "1,64,350,350" --max "1,64,700,700" test.onnx out_cls
converting to TensorRT engine
dynamic shape:  (1, 3, 700, 700) (1, 64, 350, 350) (1, 64, 700, 700)
[07/25/2022-11:37:58] [TRT] [E] 4: Specified optimization profiles must satisfy MIN<=OPT<=MAX.
[07/25/2022-11:37:58] [TRT] [E] 3: [optimizationProfile.cpp::setDimensions::144] Error Code 3: API Usage Error (Parameter check failed at: runtime/common/optimizationProfile.cpp::setDimensions::144, condition: validate(mThreadResources.getErrorRecorder(), newEntry, true)
)
Traceback (most recent call last):
  File "repro.py", line 75, in <module>
    get_trt(args, onnx_file)
  File "repro.py", line 49, in get_trt
    profile.set_shape("image", min_shape, opt_shape, max_shape)
RuntimeError: Shape provided for opt is inconsistent with other shapes.


python3 repro.py --min "1,3,700,700" --opt "1,64,700,700" --max "1,64,700,700" test.onnx out_cls
converting to TensorRT engine
dynamic shape:  (1, 3, 700, 700) (1, 64, 700, 700) (1, 64, 700, 700)
repro.py:56: DeprecationWarning: Use build_serialized_network instead.
  engine = builder.build_engine(network, config)
[07/25/2022-11:39:11] [TRT] [E] 4: [network.cpp::validate::3064] Error Code 4: Internal Error (image: for dimension number 1 in profile 0 does not match network definition (got min=3, opt=64, max=64), expected min=opt=max=3).)
Traceback (most recent call last):
  File "repro.py", line 75, in <module>
    get_trt(args, onnx_file)
  File "repro.py", line 58, in get_trt
    f.write(engine.serialize())
AttributeError: 'NoneType' object has no attribute 'serialize'

Could you please share the exact steps to run the scripts, along with verbose logs/commands? That would help us quickly debug and fix this issue.

Thank you.

I have provided the TRT engine; you can use the Python and C++ code above to reproduce the outputs.

import argparse
import pickle
import sys
import tempfile
import torch
import tensorrt as trt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def GiB(val):
    return val * 1 << 30

def get_trt(args, onnx_file):
    print("converting to TensorRT engine")
    min_shape = tuple(int(i) for i in args.min.split(","))
    opt_shape = tuple(int(i) for i in args.opt.split(","))
    max_shape = tuple(int(i) for i in args.max.split(","))

    min_feature = (min_shape[0], 64, min_shape[2] // 2, min_shape[3] // 2)
    opt_feature = (opt_shape[0], 64, opt_shape[2] // 2, opt_shape[3] // 2)
    max_feature = (max_shape[0], 64, max_shape[2] // 2, max_shape[3] // 2)


    min_neighbor = (min_shape[0], 64, min_shape[2], min_shape[3])
    opt_neighbor = (opt_shape[0], 64, opt_shape[2], opt_shape[3])
    max_neighbor = (max_shape[0], 64, max_shape[2], max_shape[3])

    print("dynamic shape: ", min_shape, opt_shape, max_shape)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))

    network.get_input(0).dtype = trt.DataType.FLOAT
    network.get_input(1).dtype = trt.DataType.FLOAT
    network.get_input(2).dtype = trt.DataType.FLOAT
    network.get_output(0).dtype = trt.DataType.FLOAT
    profile = builder.create_optimization_profile()
    profile.set_shape("image", min_shape, opt_shape, max_shape)
    profile.set_shape("feature_map", min_feature, opt_feature, max_feature)
    profile.set_shape("neighbor_feature_map", min_neighbor,
                      opt_neighbor, max_neighbor)
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, GiB(5))
    config.add_optimization_profile(profile)
    engine = builder.build_engine(network, config)
    with open(args.output, "wb") as f:
        f.write(engine.serialize())

    print("converted to TensorRT engine: {}".format(args.output))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("output", help="output tensorrt model")
    parser.add_argument("--min", required=True, help="shape, e.g. (1, 3, 200, 200)")
    parser.add_argument("--opt", required=True, help="shape, e.g. (2, 3, 250, 250)")
    parser.add_argument("--max", required=True, help="shape, e.g. (3, 3, 300, 300)")

    args = parser.parse_args()

    onnx_file = "test.onnx"
    get_trt(args, onnx_file)

python test.py test.engine --min 1,3,300,300 --opt 1,3,600,600 --max 1,3,700,700

This command and script will generate the TRT engine from the ONNX model.

Thank you. Let us try this one.

A TensorRT engine is not portable across platforms; it depends on the TensorRT version, OS, GPU, etc.
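
For example, rather than reusing a shipped engine file, it can be rebuilt locally from the ONNX model with trtexec (a sketch using the same dynamic shapes as the command above; adjust paths as needed):

trtexec --onnx=test.onnx \
        --minShapes=image:1x3x300x300,feature_map:1x64x150x150,neighbor_feature_map:1x64x300x300 \
        --optShapes=image:1x3x600x600,feature_map:1x64x300x300,neighbor_feature_map:1x64x600x600 \
        --maxShapes=image:1x3x700x700,feature_map:1x64x350x350,neighbor_feature_map:1x64x700x700 \
        --saveEngine=test.engine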

Hi,

Sorry, we are facing an issue compiling the C++ code as well.
It would be hard for us to work on a fix without successfully running the repro.
Could you please share the exact commands/steps to build the C++ code as well?
We are trying with:

g++ -I${TRT_ROOT}/include cpp_repro.cpp -o cpp_repro -I/usr/local/cuda/include -Ilibtorch/include/torch/csrc/api/include/ -Ilibtorch/include -x c++ -std=c++14 -Wno-deprecated-declarations

Thank you.

You can use CMake:

cmake_minimum_required (VERSION 3.4)
project (trt-test)

find_package(CUDA REQUIRED)
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

add_compile_options(-O0)
add_compile_options(-g)


include_directories (${TORCH_INCLUDE_DIRS})
include_directories (/usr/local/cuda/include)
include_directories (/Develop/download/TensorRT-8.4.1.5/include)
link_directories (/Develop/download/TensorRT-8.4.1.5/lib)
link_directories (/usr/local/cuda/lib64)
link_libraries (${TORCH_LIBRARIES})
link_libraries (cuda)
link_libraries (cudart)
link_libraries (nvinfer)
link_libraries (nvinfer_plugin)


add_executable (trt_test trt_test.cpp)

Replace the TRT paths with yours, then run cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
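
The full build sequence would look something like this (assuming the C++ file is saved as trt_test.cpp next to the CMakeLists.txt):

mkdir build && cd build
cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
make
./trt_test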

Hi @user49985 ,

Could you please share a working C++ script? This code does not compile because it is incomplete.

Thank you.