Description
Hi all,
now I use tensorRT as a server and serval clients send request to do inference. The result is correct when I only use one client, but when I use 3 or more client to do inference, the result of input1 will get random error and return result of input2.
The reason of above is memory copy in host and cuda ,so I add thread lock in inference, result becomes correct but cost more time in model inference
The code with random error
def infer(self, data):
"""
Real inference process.
:param model: Model objects
:param data: Preprocessed data
:return:
output
"""
stream = self.stream
context = self.context
engine = self.engine
host_inputs = self.host_inputs
cuda_inputs = self.cuda_inputs
host_outputs = self.host_outputs
cuda_outputs = self.cuda_outputs
bindings = self.bindings
batch_input_image = np.empty(shape=[self.batch_size, 3, 224, 224])
for i, img in enumerate(data):
input_ndarray = self.preprocess(img)
np.copyto(batch_input_image[i], input_ndarray)
batch_input_image = np.ascontiguousarray(batch_input_image)
# with lock:
# Push to device
self.ctx.push()
# Copy data to input memory buffer
np.copyto(host_inputs[0], batch_input_image.ravel())
# Transfer input data to the GPU.
# cuda.memcpy_htod_async(self._input.device, self._input.host, self._stream)
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
# Run inference.
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
# Synchronize the stream
stream.synchronize()
# Pop the device
self.ctx.pop()
output = host_outputs[0].reshape((1,256))
return output
and code with correct result but run slow
lock = threading.Lock()
def infer(self, data):
"""
Real inference process.
:param model: Model objects
:param data: Preprocessed data
:return:
output
"""
stream = self.stream
context = self.context
engine = self.engine
host_inputs = self.host_inputs
cuda_inputs = self.cuda_inputs
host_outputs = self.host_outputs
cuda_outputs = self.cuda_outputs
bindings = self.bindings
batch_input_image = np.empty(shape=[self.batch_size, 3, 224, 224])
for i, img in enumerate(data):
input_ndarray = self.preprocess(img)
np.copyto(batch_input_image[i], input_ndarray)
batch_input_image = np.ascontiguousarray(batch_input_image)
with lock:
# Push to device
self.ctx.push()
# Copy data to input memory buffer
np.copyto(host_inputs[0], batch_input_image.ravel())
# Transfer input data to the GPU.
# cuda.memcpy_htod_async(self._input.device, self._input.host, self._stream)
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
# Run inference.
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
# Synchronize the stream
stream.synchronize()
# Pop the device
self.ctx.pop()
output = host_outputs[0].reshape((1,256))
return output
Environment
TensorRT Version: 7.0.0.11
GPU Type: P4
Nvidia Driver Version:
CUDA Version: 9.0.176
CUDNN Version: 7.5.1
Operating System + Version:
Python Version (if applicable): 3.5.2
PyTorch Version (if applicable): 1.1.0
Relevant Files
Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)
Steps To Reproduce
Please include:
- Exact steps/commands to build your repro
- Exact steps/commands to run your repro
- Full traceback of errors encountered