Sometimes when I run the YOLOv4 engine I get this error:
File "/app/inference_server/engine_yolo.py", line 175, in _do_inference
stream.synchronize()
pycuda._driver.LogicError: cuStreamSynchronize failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFreeHost failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFreeHost failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFreeHost failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFreeHost failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFreeHost failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuStreamDestroy failed: an illegal memory access was encountered
Please provide the following information when requesting support.
• Hardware: T4
• Network Type: Yolo_v4
• TLT Version: nvidia/tlt-streamanalytics
  docker_registry: nvcr.io
  docker_tag: v3.0-py3
• yolo_engine: trt-yolov4-pruned-16.engine (Google Drive link)
Script:
from PIL import Image
import cv2
import numpy as np
import time
import pycuda.autoinit  # initializes CUDA and creates a context on import
import pycuda.driver as cuda
import tensorrt as trt
def draw_detection(detections, image_input):
image = image_input
for detection in detections:
x1, y1, x2, y2 = detection['bbox']
p1 = (int(x1), int(y1))
p2 = (int(x2), int(y2))
color = (255, 45, 56)
thickness = 1
image = cv2.rectangle(image, p1, p2, color, thickness)
font = cv2.FONT_HERSHEY_SIMPLEX
text = detection['class'] + " " + str(detection['score']) + '%'
image = cv2.putText(image, text, p1, font, 0.5, color, 1)
return image
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
class EngineYolo(object):
def __init__(self, trt_engine_path, model_w, model_h, num_classes, threshold, nms_threshold,
box_norm, stride=16):
self.model_w = model_w
self.model_h = model_h
self.stride = stride
self.box_norm = box_norm
self.min_confidence = threshold
self.NUM_CLASSES = num_classes
self.img_shape = (model_w, model_h)
self.grid_h = int(model_h / stride)
self.grid_w = int(model_w / stride)
self.grid_size = self.grid_h * self.grid_w
self.trt_engine = self._load_engine(trt_engine_path)
self.context = self.trt_engine.create_execution_context()
inputs, outputs, bindings, stream = self._allocate_buffers(self.trt_engine)
self.inputs = inputs
self.outputs = outputs
self.bindings = bindings
self.stream = stream
self.nms_threshold = nms_threshold
def _load_engine(self, engine_path):
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
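        # Register TensorRT's built-in plugins so the engine's BatchedNMS layer
        # can be deserialized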
trt.init_libnvinfer_plugins(None, '')
trt_runtime = trt.Runtime(TRT_LOGGER)
with open(engine_path, "rb") as f:
engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)
return engine
def _allocate_buffers(self, engine, batch_size=1):
"""Allocates host and device buffer for TRT engine inference.
        This function is similar to the one in common.py, but
        converts network outputs (which are np.float32) appropriately
        before writing them to the Python buffer. This is needed since
        TensorRT plugins don't support output type description, and
        in our particular case, we use the NMS plugin as network output.
Args:
engine (trt.ICudaEngine): TensorRT engine
Returns:
inputs [HostDeviceMem]: engine input memory
outputs [HostDeviceMem]: engine output memory
bindings [int]: buffer to device bindings
stream (cuda.Stream): cuda stream for engine inference synchronization
"""
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
        # Current NMS implementation in TRT only supports DataType.FLOAT but
        # it may change in the future, which could break this sample
        # [e.g. NMS output would not be np.float32 anymore, even though this
        # is assumed in binding_to_type]
binding_to_type = {
'Input': np.float32,
'BatchedNMS': np.int32,
'BatchedNMS_1': np.float32,
'BatchedNMS_2': np.float32,
'BatchedNMS_3': np.float32
}
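        # Iterate bindings in engine order so the bindings list lines up with
        # the binding indices that execute_async expects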
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * batch_size
dtype = binding_to_type[str(binding)]
# Allocate host and device buffers
            size = abs(size)  # guard against a negative volume from a dynamic (-1) dim
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
    def run(self, img_pil):
        """Runs inference on a single image resized to fit the model.
        Args:
            img_pil: image already resized to (model_w, model_h) that will be
                fed into the model
        """
        inference_start_time = time.time()
        img = self._preprocess(img_pil)
        end_preprocess_time = time.time()
inputs = self.inputs
outputs = self.outputs
bindings = self.bindings
stream = self.stream
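        # Copy the preprocessed image into the pagelocked input host buffer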
np.copyto(inputs[0].host, img)
        # When inferring on a single image, we measure inference
        # time to report it to the user
# Fetch output from the model
detection_out = self._do_inference(
self.context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
)
end_inference_time = time.time()
# Output inference time
nms_boxes, nms_categories, nscores = self._post_process(detection_out)
        detections = self._parse_detections(nms_boxes, nms_categories, nscores)
        # print(
        #     "TensorRT inference time: {} ms | Preprocess: {} ms | Inference: {} ms | Postprocess: {} ms".format(
        #         int(round((time.time() - inference_start_time) * 1000)),
        #         int(round((end_preprocess_time - inference_start_time) * 1000)),
        #         int(round((end_inference_time - end_preprocess_time) * 1000)),
        #         int(round((time.time() - end_inference_time) * 1000)),
        #     ))
return detections
    def _preprocess(self, pil_image_resized):
image = np.asarray(pil_image_resized)
img_np = image.astype(np.float32)
# HWC -> CHW
img_np = img_np.transpose((2, 0, 1))
# print(img_np)
# Normalize to [0.0, 1.0] interval (expected by model)
#img_np = (1.0 / 255.0) * img_np
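        # Flatten to 1-D to match the layout of the pagelocked input buffer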
img_np = img_np.ravel()
return img_np
def _do_inference(self, context, bindings, inputs, outputs, stream, batch_size=1):
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async(
batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
    def _post_process(self, outputs, wh_format=False):
        """
        Postprocesses the inference output.
        Args:
            outputs (list of np.ndarray): raw inference output
            wh_format (bool): if True, return boxes as (x, y, w, h)
                instead of (x1, y1, x2, y2)
        Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
        """
        # BatchedNMS plugin outputs: kept-detection count, then flat arrays of
        # boxes, scores and class ids
p_keep_count = outputs[0]
p_bboxes = outputs[1]
p_scores = outputs[2]
p_classes = outputs[3]
analysis_classes = list(range(self.NUM_CLASSES))
threshold = self.min_confidence
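        # Regroup the flat bbox array into (x1, y1, x2, y2) quadruples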
        p_bboxes = np.array_split(p_bboxes, len(p_bboxes) // 4)
bbs = []
class_ids = []
scores = []
for i in range(p_keep_count[0]):
assert (p_classes[i] < len(analysis_classes))
if p_scores[i] > threshold:
x1 = int(np.round(p_bboxes[i][0] * self.model_w))
y1 = int(np.round(p_bboxes[i][1] * self.model_h))
x2 = int(np.round(p_bboxes[i][2] * self.model_w))
y2 = int(np.round(p_bboxes[i][3] * self.model_h))
if wh_format:
bbs.append([x1, y1, x2 - x1, y2 - y1])
else:
bbs.append([x1, y1, x2, y2])
class_ids.append(p_classes[i])
scores.append(p_scores[i])
# print(class_ids)
return self._filter_detections(bbs, class_ids, scores)
def _filter_detections(self, bbs, class_ids, scores):
bbs = np.asarray(bbs)
class_ids = np.asarray(class_ids)
scores = np.asarray(scores)
nms_boxes, nms_categories, nscores = [], [], []
for category in set(class_ids):
idxs = np.where(class_ids == category)
box = bbs[idxs]
category = class_ids[idxs]
confidence = scores[idxs]
keep = self._nms_boxes(box, confidence)
nms_boxes.append(box[keep])
nms_categories.append(category[keep])
nscores.append(confidence[keep])
if len(nms_boxes) == 0:
return [], [], []
return nms_boxes, nms_categories, nscores
def _nms_boxes(self, boxes, box_confidences):
"""Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their
confidence scores and return an array with the indexes of the bounding boxes we want to
keep (and display later).
Keyword arguments:
        boxes -- a NumPy array containing N bounding-box coordinates that survived filtering,
            with shape (N,4); 4 for the x1, y1, x2, y2 corners of the boxes
        box_confidences -- a NumPy array containing the corresponding confidences with shape N
"""
        # Boxes arrive in (x1, y1, x2, y2) corner format (the wh_format=False path),
        # so areas must be computed from corner differences, not from columns 2 and 3
        x_coord = boxes[:, 0]
        y_coord = boxes[:, 1]
        x2_coord = boxes[:, 2]
        y2_coord = boxes[:, 3]
        areas = (x2_coord - x_coord + 1) * (y2_coord - y_coord + 1)
ordered = box_confidences.argsort()[::-1]
keep = list()
while ordered.size > 0:
# Index of the current element:
i = ordered[0]
keep.append(i)
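            # Intersection rectangle between the current box and all remaining boxes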
            xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]])
            yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]])
            xx2 = np.minimum(x2_coord[i], x2_coord[ordered[1:]])
            yy2 = np.minimum(y2_coord[i], y2_coord[ordered[1:]])
width1 = np.maximum(0.0, xx2 - xx1 + 1)
height1 = np.maximum(0.0, yy2 - yy1 + 1)
intersection = width1 * height1
union = (areas[i] + areas[ordered[1:]] - intersection)
# Compute the Intersection over Union (IoU) score:
iou = intersection / union
# The goal of the NMS algorithm is to reduce the number of adjacent bounding-box
# candidates to a minimum. In this step, we keep only those elements whose overlap
# with the current bounding box is lower than the threshold:
indexes = np.where(iou <= self.nms_threshold)[0]
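            # indexes refer to positions in ordered[1:], hence the +1 offset below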
ordered = ordered[indexes + 1]
keep = np.array(keep)
return keep
    def _parse_detections(self, bbs, class_ids, scores):
        classes = ['car', 'tow_wheels', 'person']
        detections = []
        # bbs, class_ids and scores hold one array per detected class,
        # so iterate every group rather than only the first
        for group in range(len(bbs)):
            for idx in range(len(bbs[group])):
                detection = dict()
                detection['bbox'] = np.array(bbs[group][idx]).tolist()
                detection['class_id'] = int(class_ids[group][idx])
                detection['class'] = classes[int(class_ids[group][idx])]
                detection['score'] = int(scores[group][idx] * 100)
                detections.append(detection)
        return detections
if __name__ == '__main__':
engine = EngineYolo(trt_engine_path='yolov4-trt-v4.engine', model_w=960, model_h=544,
num_classes=3, threshold=0.3, nms_threshold=0.5, box_norm=True, stride=16)
img = cv2.imread("person.png")
image_resized = cv2.resize(img, (960, 544))
img = cv2.cvtColor(image_resized, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
img = np.asarray(img)
detections = engine.run(img)
print(detections)
    cv2_image = draw_detection(detections, image_resized)
cv2.imwrite('output2.jpg', cv2_image)