I used the test.py that was posted on the forums previously, but it gives a PyCUDA memory error. What could be the problem?
eren@erennx:~$ /home/eren/env/bin/python /home/eren/FPEnet/test.py --input facepic.jpg
Traceback (most recent call last):
  File "/home/eren/FPEnet/test.py", line 148, in <module>
    fpenet_obj = FpeNet('/home/eren/FPEnet/model.trt')
  File "/home/eren/FPEnet/test.py", line 35, in __init__
    self._allocate_buffers()
  File "/home/eren/FPEnet/test.py", line 62, in _allocate_buffers
    host_mem = cuda.pagelocked_empty(size, dtype)
pycuda._driver.MemoryError: cuMemHostAlloc failed: out of memory
[06/23/2022-20:58:54] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::35] Error Code 1: Cuda Runtime (invalid argument)
Segmentation fault (core dumped)
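Since the failure is inside _allocate_buffers, I suspect the computed buffer size. A quick way to inspect what each binding requests (a minimal sketch using the same, now-deprecated binding API as the script; engine would be the deserialized ICudaEngine):

import tensorrt as trt

def inspect_bindings(engine, batch_size=1):
    # Print each binding's shape and the element count _allocate_buffers
    # would request. A -1 (dynamic) dimension or an unexpectedly large
    # shape here would explain the failed cuMemHostAlloc.
    for binding in engine:
        shape = engine.get_binding_shape(binding)
        size = trt.volume(shape) * batch_size
        kind = "input" if engine.binding_is_input(binding) else "output"
        print(binding, tuple(shape), "->", size, "elements,", kind)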
The code is below:
import cv2
import numpy as np
import pycuda
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import time
from PIL import Image
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
class FpeNet(object):
    def __init__(self, trt_path, input_size=(80, 80), batch_size=1):
        self.trt_path = trt_path
        self.input_size = input_size
        self.batch_size = batch_size

        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        trt_runtime = trt.Runtime(TRT_LOGGER)
        self.trt_engine = self._load_engine(trt_runtime, self.trt_path)
        self.inputs, self.outputs, self.bindings, self.stream = \
            self._allocate_buffers()
        self.context = self.trt_engine.create_execution_context()
        self.list_output = None
    def _load_engine(self, trt_runtime, engine_path):
        with open(engine_path, "rb") as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine
    def _allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        binding_to_type = {
            "input_face_images:0": np.float32,
            "softargmax/strided_slice:0": np.float32,
            "softargmax/strided_slice_1:0": np.float32
        }
        for binding in self.trt_engine:
            size = trt.volume(self.trt_engine.get_binding_shape(binding)) \
                * self.batch_size
            dtype = binding_to_type[str(binding)]
            host_mem = cuda.pagelocked_empty(size, dtype)  # this is line 62 from the traceback
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.trt_engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
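    # NOTE: the traceback points at the cuda.pagelocked_empty() call above.
    # Printing `binding`, the binding shape, and `size` just before that call
    # should show whether an unexpectedly large (or negative) size is being
    # requested for this engine.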
    def _do_inference(self, context, bindings, inputs,
                      outputs, stream):
        # copy input data to the device
        [cuda.memcpy_htod_async(inp.device, inp.host, stream)
         for inp in inputs]
        context.execute_async(
            batch_size=self.batch_size, bindings=bindings,
            stream_handle=stream.handle)
        # copy predictions back to the host
        [cuda.memcpy_dtoh_async(out.host, out.device, stream)
         for out in outputs]
        stream.synchronize()
        return [out.host for out in outputs]
    def _process_image(self, image):
        image = cv2.imread(image)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        w = self.input_size[0]
        h = self.input_size[1]
        self.image_height = image.shape[0]
        self.image_width = image.shape[1]
        image_resized = Image.fromarray(np.uint8(image))
        image_resized = image_resized.resize(size=(w, h), resample=Image.BILINEAR)
        img_np = np.array(image_resized)
        img_np = img_np.astype(np.float32)  # no / 255 normalization; this was corrected in a forum post
        img_np = np.expand_dims(img_np, axis=0)  # shape becomes 1x80x80
        return img_np, image
    def predict(self, img_path):
        img_processed, image = self._process_image(img_path)
        np.copyto(self.inputs[0].host, img_processed.ravel())
        t_time = 0
        landmarks = None
        for i in range(1):
            t1 = time.perf_counter()
            landmarks, probs = self._do_inference(
                self.context, bindings=self.bindings, inputs=self.inputs,
                outputs=self.outputs, stream=self.stream)
            t2 = time.perf_counter()
            t_time += (t2 - t1)
        print('inference time:', t_time)
        # turn the flat (160,) output into (x, y) pairs
        landmarks = landmarks.reshape(-1, 2)
        visualized = self._visualize(image, landmarks)
        return visualized
    @staticmethod
    def _postprocess(landmarks):
        landmarks = landmarks.reshape(-1, 2)
        return landmarks
    def _visualize(self, frame, landmarks):
        visualized = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
        for x, y in landmarks:
            # scale from the 80x80 network input back to the original image size
            x = int(x * self.image_width / self.input_size[0])
            y = int(y * self.image_height / self.input_size[1])
            cv2.circle(visualized, (x, y), 1, (0, 255, 0), 1)
        return visualized
if __name__ == '__main__':
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input', '-i', type=str, required=True)
    args = arg_parser.parse_args()
    img_path = args.input

    fpenet_obj = FpeNet('/home/eren/FPEnet/model.trt')
    output = fpenet_obj.predict(img_path)
    cv2.imwrite('landmarks.jpg', output)
    print('image has been written to landmarks.jpg')
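To rule out a general pinned-memory problem, a standalone page-locked allocation of the expected 1x80x80 float32 input could be tested outside TensorRT (a minimal sketch, independent of the script above):

import numpy as np
import pycuda.autoinit  # creates the CUDA context
import pycuda.driver as cuda

# If this succeeds, the host is not out of pinned memory for a buffer of
# the expected input size, and the error more likely comes from a wrong
# `size` value computed in _allocate_buffers.
host_mem = cuda.pagelocked_empty(1 * 80 * 80, np.float32)
print("allocated", host_mem.nbytes, "bytes of pinned memory")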