Description
Hello, when I run a simple inference with TensorRT, the first execution always takes much longer than the subsequent ones. Is there any way (using the library) to alleviate this, other than warming the engine up with a request first?
Environment
TensorRT Version: 10.0.1
CUDA Version: 12.2
Operating System + Version: Ubuntu 22.04
Python Version (if applicable): 3.11
Code
import numpy as np
from cuda import cuda
import tensorrt as trt
import time

# Initialize TensorRT logger
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# Create a runtime object
runtime = trt.Runtime(TRT_LOGGER)

# Read the serialized engine from file
engine_file_path = "resnet.trt"
trt.init_libnvinfer_plugins(None, "")
try:
    with open(engine_file_path, "rb") as f:
        serialized_engine = f.read()
except FileNotFoundError:
    print(f"Error: File {engine_file_path} not found")
    exit(1)

# Deserialize the engine
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    print("Error: Failed to deserialize the engine")
    exit(1)

# Create an execution context
context = engine.create_execution_context()
if context is None:
    print("Error: Failed to create execution context")
    exit(1)

# Batch size
batch_number = 16

# Define the input and output shapes
input_shape = (batch_number,) + tuple(engine.get_tensor_shape(engine.get_tensor_name(0)))[1:]
output_shape = (batch_number,) + tuple(engine.get_tensor_shape(engine.get_tensor_name(1)))[1:]

# Initialize CUDA
cuda.cuInit(0)

# Calculate byte sizes for input and output
Input_Byte_Size = int(np.prod(input_shape) * np.float32().itemsize)
Output_Byte_Size = int(np.prod(output_shape) * np.float32().itemsize)

# Allocate device memory for inputs and outputs
err_di, d_input = cuda.cuMemAlloc(Input_Byte_Size)
err_do, d_output = cuda.cuMemAlloc(Output_Byte_Size)

# Create host buffers (numpy arrays) to hold inputs/outputs
h_input = np.random.random(input_shape).astype(np.float32)
h_output = np.empty(output_shape, dtype=np.float32)

# Create a CUDA stream
err_stream, stream = cuda.cuStreamCreate(0)

# Get tensor names and set input shape
input_tensor_name = engine.get_tensor_name(0)   # Input tensor
output_tensor_name = engine.get_tensor_name(1)  # Output tensor
context.set_input_shape(input_tensor_name, input_shape)
assert context.all_binding_shapes_specified

# Set tensor addresses for input and output
context.set_tensor_address(input_tensor_name, int(d_input))
context.set_tensor_address(output_tensor_name, int(d_output))

# Generate input data
input_data = np.random.rand(*input_shape).astype(np.float32)

start_timing = time.perf_counter_ns()

# Transfer input data to the device asynchronously
cuda.cuMemcpyHtoDAsync(d_input, input_data.ctypes.data, Input_Byte_Size, stream)

# Execute the inference asynchronously
context.execute_async_v3(stream_handle=stream)

# Transfer predictions back from the GPU asynchronously
cuda.cuMemcpyDtoHAsync(h_output.ctypes.data, d_output, Output_Byte_Size, stream)

# Synchronize the stream to ensure all operations are completed
cuda.cuStreamSynchronize(stream)

end_timing = time.perf_counter_ns()
response_latency = round(float((end_timing - start_timing) / 10**6), 1)  # Express latency in milliseconds

# Print the output
print("Inference output:", h_output)
print(f"Response latency: {response_latency} ms")