Description
TensorRT gives different results in Python and C++ when running the same engine with the same input.
Environment
TensorRT Version: 8.4.1.5
GPU Type: A10
Nvidia Driver Version: 495.29.05
CUDA Version: 11.2
CUDNN Version: 8.1.1
Operating System + Version: Ubuntu 16.04
Python Version (if applicable): 3.7.6
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.7.1
Baremetal or Container (if container which image + tag):
Relevant Files
Steps To Reproduce
Python code
import cv2
import torch
import tensorrt as trt
class MyLogger(trt.ILogger):
    """Minimal TensorRT logger that forwards every message to stdout."""

    def __init__(self):
        # The base-class constructor must run so TensorRT can attach hooks.
        trt.ILogger.__init__(self)

    def log(self, severity, msg):
        # Emit every message regardless of severity.
        print(msg)


logger = MyLogger()
def run_trt(image, feature_map, neighbor_feature_map):
    """Deserialize ``test.engine``, bind the three inputs and one output, run it.

    All arguments are expected to be contiguous CUDA float tensors; the
    output tensor is allocated here from the engine-reported binding shape.
    Returns the ``out_cls`` CUDA tensor.
    """
    trt_logger = MyLogger()
    with open("test.engine", "rb") as engine_file:
        engine_bytes = engine_file.read()
    runtime = trt.Runtime(trt_logger)
    engine = runtime.deserialize_cuda_engine(engine_bytes)
    context = engine.create_execution_context()

    # Propagate the dynamic input shapes before querying the output shape.
    for slot, tensor in enumerate((image, feature_map, neighbor_feature_map)):
        context.set_binding_shape(slot, list(tensor.shape))
    out_cls = torch.zeros(list(context.get_binding_shape(3))).float().cuda().contiguous()

    # Place each device pointer at the binding slot the engine assigns to
    # its name rather than assuming an order.
    named_tensors = {
        "image": image,
        "feature_map": feature_map,
        "neighbor_feature_map": neighbor_feature_map,
        "out_cls": out_cls,
    }
    buffers = [None] * 4
    for name, tensor in named_tensors.items():
        buffers[engine.get_binding_index(name)] = tensor.data_ptr()

    context.execute_v2(buffers)
    return out_cls
if __name__ == "__main__":
img = list(torch.jit.load("image_tensor.pt").parameters())[0].cuda().contiguous() # 1x3x700x700
a = torch.zeros(1, 64, 350, 350).float().cuda()
b = torch.zeros(1, 64, 700, 700).float().cuda()
cls = run_trt(img.clone(), a.clone(), b.clone())
print(cls.flatten()[0:20])
C++ code
#include <assert.h>

#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <NvInferPlugin.h>
#include <torch/script.h>
#include <torch/torch.h>
// Forwards every TensorRT log message to stderr.
// BUG FIX: the original body called LOG(ERROR) (a glog macro), but no glog
// header is included anywhere in this file, so it could not compile as
// posted; std::cerr keeps the snippet self-contained.
class Logger : public nvinfer1::ILogger {
 public:
  void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override {
    std::cerr << msg << std::endl;
  }
};
int main() {
torch::Device device(torch::kCUDA, 0);
auto m = torch::jit::load("image_tensor.pt");
auto img = (*(m.parameters().begin())).to(device);
auto a = torch::zeros({1, 64, 350, 350}).to(device).to(torch::kFloat);
auto b = torch::zeros({1, 64, 700, 700}).to(device).to(torch::kFloat);
auto logger = Logger{};
using namespace nvinfer1;
initLibNvInferPlugins(&logger, "");
std::vector<char> trt_engine_stream;
size_t size = 0;
std::ifstream file("test.engine", std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trt_engine_stream.resize(size);
file.read(trt_engine_stream.data(), size);
file.close();
}
auto engine = infer->deserializeCudaEngine(trt_engine_stream.data(), size);
auto context = engine->createExecutionContext();
context->setBindingDimensions(0, nvinfer1::Dims4{1, 3, 700, 700});
context->setBindingDimensions(1, nvinfer1::Dims4{1, 64, 350, 350});
context->setBindingDimensions(2, nvinfer1::Dims4{1, 64, 700, 700});
auto cls_shape = context->getBindingDimensions(3);
auto cls_tensor = torch::zeros({cls_shape.d[0], cls_shape.d[1], cls_shape.d[2], cls_shape.d[3]}).to(device);
void* bindings[4] = {img.data_ptr<float>(),
a.data_ptr<float>(),
b.data_ptr<float>(),
cls_tensor.data_ptr<float>()};
CHECK(context->executeV2(bindings));
cls_tensor = cls_tensor.to(torch::kCPU);
std::cout << std::endl;
for (int j = 0; j < 20; ++j) {
std::cout << (cls_tensor.data_ptr<float>())[j] << " ";
}
std::cout << std::endl;
return 0;
}