Description
I convert a PyTorch model (.pth) to an ONNX model and then build a TensorRT engine from it, which I use for C++ inference on Windows 10. I find that the input size chosen when exporting the ONNX model greatly affects the quality of the TensorRT results. For example, the network was trained in PyTorch on image sequences of size 150x150x40. When I export the ONNX model with an input size of 200x200x40 or 100x100x300, the results are good. However, when the ONNX input size is 150x150x40 (the code is shown below), the results are much worse than before: the originally dark background becomes light. I also tested different input sequence sizes directly in PyTorch, and all of those results are good. The whole image sequence is 16-bit TIFF data (min: 26520, max: 49546, size: 512x512). Why could this happen, and what should I do? Looking forward to your reply. Thank you.
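The preprocessing that fills the host input buffer is not shown below; a minimal sketch, assuming simple min/max scaling of the 16-bit data to float32 (the actual scaling used in training may differ, so the values here are placeholders), would be:

#include <cstdint>
#include <vector>

// Hypothetical preprocessing (not part of the original code): map the 16-bit
// TIFF intensities into float32 before copying them into the TensorRT input
// buffer. The min/max values are the ones quoted above.
std::vector<float> normalizeStack(const std::vector<uint16_t>& raw,
                                  float minVal = 26520.0f,
                                  float maxVal = 49546.0f)
{
    std::vector<float> out(raw.size());
    const float scale = 1.0f / (maxVal - minVal);
    for (size_t i = 0; i < raw.size(); ++i)
        out[i] = (static_cast<float>(raw[i]) - minVal) * scale;  // map to [0, 1]
    return out;
}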
Environment
TensorRT Version: 7.2.1
GPU Type: GeForce RTX 3090
CUDA Version: 11.0
CUDNN Version: 8.0.5
Operating System + Version: Windows 10
Python Version (if applicable): 3.6
PyTorch Version (if applicable): 1.7.1
Relevant Files
convert pth to onnx (python):
if isinstance(denoise_generator, nn.DataParallel):
    denoise_generator.module.load_state_dict(torch.load(model_name))  # parallel
    denoise_generator.eval()
else:
    denoise_generator.load_state_dict(torch.load(model_name))  # not parallel
    denoise_generator.eval()
model = denoise_generator.cuda()
input_name = ['input']
output_name = ['output']
input = torch.randn(1, 1, 40, 150, 150).cuda()  # N, C, D, H, W
# model.module is the unwrapped network (DataParallel case)
torch.onnx.export(model.module, input, 'NP02_150_40_1.onnx', export_params=True,
                  opset_version=11, do_constant_folding=True,
                  input_names=input_name, output_names=output_name, verbose=True)
convert onnx to engine file (command):
TensorRT-7.2.1.6.Windows10.x86_64.cuda-11.0.cudnn8.0\TensorRT-7.2.1.6\bin\trtexec.exe --onnx=NP02_150_40_1.onnx --explicitBatch --saveEngine=NP02_150_40_1.engine --workspace=2000 --fp16
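For comparison, the same model can also be built without the --fp16 flag (FP32 is trtexec's default precision); the _fp32 engine name below is just a placeholder:
TensorRT-7.2.1.6.Windows10.x86_64.cuda-11.0.cudnn8.0\TensorRT-7.2.1.6\bin\trtexec.exe --onnx=NP02_150_40_1.onnx --explicitBatch --saveEngine=NP02_150_40_1_fp32.engine --workspace=2000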
Inference process (c++):
char *trtModelStream{ nullptr };
std::ifstream file(model_name, std::ios::binary);
file.seekg(0, file.end);
int length = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[length];
file.read(trtModelStream, length);
file.close();
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, length, nullptr);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
// use the engine pointer returned by deserializeCudaEngine above
assert(engine->getNbBindings() == 2);
void* buffers[2];
cudaSetDevice(0);
// binding names must match the input_names / output_names passed to torch.onnx.export
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
const int INPUT_S = 40, INPUT_H = 150, INPUT_W = 150, BATCH_SIZE = 1;
// create GPU buffers and a stream
cudaMalloc(&buffers[inputIndex], BATCH_SIZE * INPUT_S * INPUT_H * INPUT_W * sizeof(float));
// output is assumed to have the same shape as the input (denoised volume)
cudaMalloc(&buffers[outputIndex], BATCH_SIZE * INPUT_S * INPUT_H * INPUT_W * sizeof(float));
// Create CUDA stream for the execution of this inference.
cudaStream_t stream;
cudaStreamCreate(&stream);
// data: host-side float input buffer of BATCH_SIZE * INPUT_S * INPUT_H * INPUT_W elements
cudaMemcpyAsync(buffers[inputIndex], data, BATCH_SIZE * INPUT_S * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream);
context->enqueueV2(buffers, stream, nullptr);
// output: host-side float buffer of the same size
cudaMemcpyAsync(output, buffers[outputIndex], BATCH_SIZE * INPUT_S * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// Release stream
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
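The snippet above omits CUDA error checking and object teardown; a minimal sketch, assuming the TensorRT 7 API (objects released with destroy()) and the standard CUDA runtime error functions:

#include <cuda_runtime_api.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical helper (not in the original code) to surface errors from the
// cudaMalloc / cudaMemcpyAsync calls above.
#define CHECK_CUDA(call)                                                        \
    do {                                                                        \
        cudaError_t status_ = (call);                                           \
        if (status_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status_));   \
            abort();                                                            \
        }                                                                       \
    } while (0)

// Teardown omitted above; TensorRT 7 objects are released with destroy():
context->destroy();
engine->destroy();
runtime->destroy();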