Description
I want to do segmentation using a bisenet model and I want to do it using tensorrt. When I searched for how I can do it, I wrote a code using certain sources. . but I get an error and I don’t know why.
Environment
TensorRT Version: 10.4
GPU Type: RTX A2000
Nvidia Driver Version: 535
CUDA Version: 12.2
CUDNN Version: 8.6.7
Operating System + Version: 22:04
Python Version (if applicable): 3.10.12
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 2.4.1
Baremetal or Container (if container which image + tag):
Relevant Files
This model : Bisenet
Steps To Reproduce
This is My Code
import os
import os.path as osp
import cv2
import numpy as np
import argparse
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
# Argümanların işlenmesi
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
compile_parser = subparsers.add_parser('compile')
compile_parser.add_argument('--onnx')
compile_parser.add_argument('--quant', default='fp32')
compile_parser.add_argument('--savepth', default='./model.trt')
run_parser = subparsers.add_parser('run')
run_parser.add_argument('--mdpth')
run_parser.add_argument('--impth')
run_parser.add_argument('--outpth', default='./res.png')
args = parser.parse_args()
np.random.seed(123)
in_datatype = trt.nptype(trt.float32)
out_datatype = trt.nptype(trt.int32)
palette = np.random.randint(0, 256, (256, 3)).astype(np.uint8)
ctx = pycuda.autoinit.context
trt.init_libnvinfer_plugins(None, "")
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Görüntü işlemi
def get_image(impth, size):
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)[:, None, None]
var = np.array([0.229, 0.224, 0.225], dtype=np.float32)[:, None, None]
iH, iW = size[0], size[1]
img = cv2.imread(impth)[:, :, ::-1]
orgH, orgW, _ = img.shape
img = cv2.resize(img, (iW, iH)).astype(np.float32)
img = img.transpose(2, 0, 1) / 255.
img = (img - mean) / var
return img, (orgH, orgW)
# Bellek tahsisi
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for i in range(engine.num_io_tensors):
tensor_name = engine.get_tensor_name(i)
size = trt.volume(engine.get_tensor_shape(tensor_name))
dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
# Try to reduce the pagelocked memory usage by checking the required size
try:
host_mem = cuda.pagelocked_empty(size, dtype) # Allocate host memory
device_mem = cuda.mem_alloc(host_mem.nbytes) # Allocate device memory
except cuda.Error as e:
print(f"CUDA memory allocation failed: {e}")
return None # Handle memory allocation failure
bindings.append(int(device_mem))
if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
inputs.append((host_mem, device_mem))
else:
outputs.append((host_mem, device_mem))
return inputs, outputs, bindings, stream
# ONNX modelinden motor oluşturma
def build_engine_from_onnx(onnx_file_path):
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
with open(onnx_file_path, 'rb') as model:
if not parser.parse(model.read()):
for error in range(parser.num_errors):
print(parser.get_error(error))
return None
builder.max_batch_size = 1 # Reduce batch size
config = builder.create_builder_config()
config.max_workspace_size = 1 << 28 # 256 MB
if args.quant == 'fp16':
config.set_flag(trt.BuilderFlag.FP16)
return builder.build_serialized_network(network, config)
# Motoru serileştirme
def serialize_engine_to_file(engine, savepth):
with open(savepth, "wb") as f:
f.write(engine)
# Motoru deserialleştirme
def deserialize_engine_from_file(savepth):
with open(savepth, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
# Ana işlev
def main():
if args.command == 'compile':
engine = build_engine_from_onnx(args.onnx)
if engine:
serialize_engine_to_file(engine, args.savepth)
elif args.command == 'run':
engine = deserialize_engine_from_file(args.mdpth)
if engine is None:
print("Error loading engine")
return
inputs, outputs, bindings, stream = allocate_buffers(engine)
if inputs is None or outputs is None or bindings is None or stream is None:
print("Memory allocation failed, exiting...")
return
context = engine.create_execution_context()
tensor_name = engine.get_tensor_name(0)
ishape = engine.get_tensor_shape(tensor_name)
img, (orgH, orgW) = get_image(args.impth, ishape[1:])
h_input = np.ascontiguousarray(img)
cuda.memcpy_htod_async(inputs[0][1], h_input, stream)
for i in range(engine.num_io_tensors):
context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
context.execute_async_v3(stream_handle=stream.handle)
for h_output, d_output in outputs:
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()
oshape = engine.get_tensor_shape(1)
pred = np.argmax(outputs[0][0].reshape(oshape), axis=1)
out = palette[pred]
out = out.reshape(*oshape[1:], 3)
out = cv2.resize(out, (orgW, orgH))
cv2.imwrite(args.outpth, out)
del inputs, outputs, bindings
stream.synchronize()
for h_mem, d_mem in outputs:
del h_mem
cuda.free(d_mem)
if __name__ == '__main__':
main()