Description
Hello,
I have trained a DETR model on a custom dataset and then converted it to a TensorRT engine to get faster inference. As a test, I wrote a script on the target system that runs inference on a single image.
The problem is that the predicted bounding boxes are not within the range [0, 1] (see the example output below).
Do you know what could be the reason for this?
For reference:
preprocess:
# Preprocessing steps, applied in order: Resize((800, 800)), ToTensor, then
# per-channel Normalize with the mean / std lists given below.
_preprocess_steps = [
    T.Resize((800, 800)),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
transform = T.Compose(_preprocess_steps)
model deserialize:
# Read the serialized engine from disk, deserialize it and create the
# execution context used for inference.
with open(engine_file, "rb") as f:
    buf = f.read()

engine = runtime.deserialize_cuda_engine(buf)
# deserialize_cuda_engine reports failure by returning None rather than
# raising; stop here with a clear message instead of hitting an
# AttributeError on the next line.
if engine is None:
    raise RuntimeError(
        "Failed to deserialize TensorRT engine from {!r}".format(engine_file)
    )
context = engine.create_execution_context()
memory allocation:
# Host-side lists kept from the original script; nothing in this snippet
# fills or reads them.
host_inputs = []
host_outputs_boxes = []
host_outputs_logits = []

# Host arrays whose shape and dtype define the size of each device buffer.
# np.empty leaves the contents uninitialized.
# NOTE(review): the image is uploaded as float32 (see the "image" snippet);
# presumably PRECISION is np.float32 as well, otherwise the device input
# buffer would not match the size of the uploaded data. Confirm.
input_dimension = np.empty(
    [batch_size, channel_size, image_size, image_size],
    dtype=PRECISION,
)
output_boxes_dimension = np.empty(
    [batch_size, n_predicitons, 4],
    dtype=PRECISION,
)
output_logits_dimension = np.empty(
    [batch_size, n_predicitons, n_CLASSES + 1],
    dtype=PRECISION,
)

# Tensor views of the arrays above (torch.from_numpy shares memory, no copy).
input_batch = torch.from_numpy(input_dimension)
output_boxes = torch.from_numpy(output_boxes_dimension)
output_logits = torch.from_numpy(output_logits_dimension)

# Device buffers, sized directly from the host arrays. The byte count is the
# same as the former tensor -> detach -> numpy round-trip produced.
cuda_inputs = cuda.mem_alloc(input_dimension.nbytes)
cuda_outputs_boxes = cuda.mem_alloc(output_boxes_dimension.nbytes)
cuda_outputs_logits = cuda.mem_alloc(output_logits_dimension.nbytes)

# NOTE(review): this list must follow the engine's own binding index order.
# The order used here (input, boxes, logits) is an assumption; DETR exports
# often emit the logits output before the boxes output. Confirm with
# engine.get_binding_name(i) / engine.get_binding_shape(i). If the two
# outputs are swapped, the "boxes" buffer receives logits (values outside
# [0, 1]) and the engine writes past the end of the smaller buffer.
bindings = [int(cuda_inputs), int(cuda_outputs_boxes), int(cuda_outputs_logits)]
image:
# Load the test image and turn it into the float32 host array that is
# uploaded to the engine.
cv_image = cv2.imread(image_file)
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded; report that directly instead of failing inside cvtColor.
if cv_image is None:
    raise OSError("Could not read image file: {}".format(image_file))
# Reorder channels from BGR to RGB before handing the data to PIL.
cv_image = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(cv_image)
# Apply the preprocessing pipeline and add a leading batch dimension.
t_image = transform(pil_image).unsqueeze(0)
np_image = np.asarray(t_image).astype('float32')
inference:
# Run one inference pass: upload the image, execute the engine, download
# both outputs, then wait for the stream to finish.
# cfx is presumably the PyCUDA context created elsewhere in the script.
cfx.push()
try:
    # The downloads below write straight into these pre-allocated host arrays.
    boxes = output_boxes_dimension
    logits = output_logits_dimension
    cuda.memcpy_htod_async(cuda_inputs, np_image, stream)
    # execute_async_v2 signals an enqueue failure through its boolean return
    # value; do not go on to read back stale or uninitialized buffers.
    if not context.execute_async_v2(bindings, stream.handle, None):
        raise RuntimeError("TensorRT execute_async_v2 failed to enqueue inference")
    # NOTE(review): `boxes` receives whatever the engine wrote to the buffer
    # passed as bindings[1]. If the values are not in [0, 1], verify that
    # binding index 1 really is the engine's box output and not the logits.
    cuda.memcpy_dtoh_async(boxes, cuda_outputs_boxes, stream)
    cuda.memcpy_dtoh_async(logits, cuda_outputs_logits, stream)
    stream.synchronize()
finally:
    # Always pop, so a failure above does not leave the context pushed.
    cfx.pop()
output:
# Print the shape of each output array: boxes first, then logits.
for output_array in (boxes, logits):
    print(output_array.shape)
# Bare expression: echoes the array only when run interactively
# (REPL / notebook cell).
boxes
(1, 100, 4)
(1, 100, 12)
array([[[-15.440976 , 1.9481723 , -2.0340009 , 5.3164234 ],
[ -0.8660751 , -3.8535829 , -2.3773873 , -3.1096535 ],
[ 2.653059 , -4.726863 , -1.538347 , -2.5069196 ],
...