INT8 calibration with Python API hangs when used with threading. Very low to none CPU & GPU usage. I suspect deadlocked global interpreter lock (GIL) but i wouldn’t know enough on the topic to be sure.
Reproduction:
TensorRT version: 5.1.5
CUDA version: 10.0
cudnn version: 7.5.0.56
GPU: 1080 Ti
Python version: 3.6.7
PyCuda version: 2019.1
Modified /usr/src/tensorrt/samples/python/int8_caffe_mnist/sample.py so that it builds an engine inside a new thread.
#omitted copyright notice
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import random
import threading
# For our custom calibrator
import calibrator
# For ../common.py
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], os.path.pardir))
import common
TRT_LOGGER = trt.Logger()
class ModelData(object):
DEPLOY_PATH = "deploy.prototxt"
MODEL_PATH = "mnist_lenet.caffemodel"
OUTPUT_NAME = "prob"
# The original model is a float32 one.
DTYPE = trt.float32
class BuildInt8EngineThread(threading.Thread):
def __init__(self, deploy_file, model_file, calib):
super().__init__()
self.deploy_file = deploy_file
self.model_file = model_file
self.calib = calib
self.out_engine = None
def run(self):
with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.CaffeParser() as parser:
# We set the builder batch size to be the same as the calibrator's, as we use the same batches
# during inference. Note that this is not required in general, and inference batch size is
# independent of calibration batch size.
builder.max_batch_size = self.calib.get_batch_size()
builder.max_workspace_size = common.GiB(1)
builder.int8_mode = True
builder.int8_calibrator = self.calib
# Parse Caffe model
model_tensors = parser.parse(deploy=self.deploy_file, model=self.model_file, network=network, dtype=ModelData.DTYPE)
network.mark_output(model_tensors.find(ModelData.OUTPUT_NAME))
# Build engine and do int8 calibration.
self.out_engine = builder.build_cuda_engine(network)
# Loads a random batch from the supplied calibrator.
def load_random_batch(calib):
# Load a random batch.
batch = random.choice(calib.batch_files)
_, data, labels = calib.read_batch_file(batch)
data = np.fromstring(data, dtype=np.float32)
labels = np.fromstring(labels, dtype=np.float32)
return data, labels
# Note that we don't expect the accuracy to be 100%, but it should
# be close to the fp32 model (which is in the 98-99% range).
def validate_output(output, labels):
preds = np.argmax(output, axis=1)
print("Expected Predictons:\n" + str(labels))
print("Actual Predictons:\n" + str(preds))
check = np.equal(preds, labels)
accuracy = np.sum(check) / float(check.size) * 100
print("Accuracy: " + str(accuracy) + "%")
# If a prediction was incorrect print out an array of booleans indicating accuracy.
if accuracy != 100:
print("One or more predictions was incorrect:\n" + str(check))
def main():
data_path, data_files = common.find_sample_data(description="Runs a Caffe MNIST network in Int8 mode", subfolder="mnist", find_files=["batches", ModelData.DEPLOY_PATH, ModelData.MODEL_PATH])
[batch_data_dir, deploy_file, model_file] = data_files
# Now we create a calibrator and give it the location of our calibration data.
# We also allow it to cache calibration data for faster engine building.
calibration_cache = "mnist_calibration.cache"
calib = calibrator.MNISTEntropyCalibrator(batch_data_dir, cache_file=calibration_cache)
# We will use the calibrator batch size across the board.
# This is not a requirement, but in this case it is convenient.
batch_size = calib.get_batch_size()
build_engine_thread = BuildInt8EngineThread(deploy_file, model_file, calib)
build_engine_thread.start()
build_engine_thread.join()
with build_engine_thread.out_engine as engine, engine.create_execution_context() as context:
# Allocate engine buffers.
inputs, outputs, bindings, stream = common.allocate_buffers(engine)
# Do inference for the whole batch. We have to specify batch size here, as the common.do_inference uses a default
inputs[0].host, labels = load_random_batch(calib)
[output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size)
# Next we need to reshape the output to Nx10 (10 probabilities, one per digit), where N is batch size.
output = output.reshape(batch_size, 10)
validate_output(output, labels)
if __name__ == '__main__':
main()
I’m aware using threads in this case isn’t particularly useful, but it is useful where we are using TensorRT in our actual project, due to architectural constraints.
Thanks in advance for your time & answers.