Hi!
I’m trying to run trt_pose on one of the DLA cores of the Xavier NX. However, building the engine fails with:
[TensorRT] ERROR: ../builder/cudnnBuilder2.cpp (1757) - Assertion Error in operator(): 0 (et.region->getType() == RegionType::kNVM)
This is my code for building the engine (adapted from torch2trt to target the DLA), specifically using the densenet121_baseline_att model:
import torch
import trt_pose.models
from torch2trt import ConversionContext
import tensorrt as trt
def default_input_names(num_inputs):
    return ["input_%d" % i for i in range(num_inputs)]

def default_output_names(num_outputs):
    return ["output_%d" % i for i in range(num_outputs)]
human_pose = {"supercategory": "person", "id": 1, "name": "person", "keypoints": ["nose", "left_eye", "right_eye", "left_ear", "right_ear", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", "left_wrist", "right_wrist", "left_hip", "right_hip", "left_knee", "right_knee", "left_ankle", "right_ankle", "neck"], "skeleton": [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7], [18, 1], [18, 6], [18, 7], [18, 12], [18, 13]]}
if __name__ == "__main__":
    inputs = (torch.zeros((1, 3, 256, 256)).cuda(),)

    num_parts = len(human_pose['keypoints'])
    num_links = len(human_pose['skeleton'])
    module = trt_pose.models.densenet121_baseline_att(num_parts, 2 * num_links).cuda().eval()
    module.load_state_dict(torch.load("./model/densenet121_baseline_att_256x256_B_epoch_160.pth"))

    logger = trt.Logger(trt.Logger.VERBOSE)
    builder = trt.Builder(logger)

    # run once to get num outputs
    outputs = module(*inputs)
    if not isinstance(outputs, (tuple, list)):
        outputs = (outputs,)

    input_names = default_input_names(len(inputs))
    output_names = default_output_names(len(outputs))

    # populate the TensorRT network by running the module through torch2trt's converters
    network = builder.create_network()
    with ConversionContext(network) as ctx:
        ctx.add_inputs(inputs, input_names)
        outputs = module(*inputs)
        if not isinstance(outputs, (tuple, list)):
            outputs = (outputs,)
        ctx.mark_outputs(outputs, output_names)

    builder.max_batch_size = 1
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30
    # DLA needs FP16 (or INT8); let unsupported layers fall back to the GPU
    config.set_flag(trt.BuilderFlag.FP16)
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
    config.default_device_type = trt.DeviceType.DLA
    config.DLA_core = 0

    # profile = builder.create_optimization_profile()
    # profile.set_shape(
    #     'input_0',          # input tensor name
    #     (1, 3, 256, 256),   # min shape
    #     (1, 3, 256, 256),   # opt shape
    #     (1, 3, 256, 256))   # max shape
    # config.add_optimization_profile(profile)

    engine = builder.build_engine(network, config)
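One thing I was planning to try for debugging: asking the builder which layers it considers DLA-capable before calling build_engine. Rough sketch (the helper name is mine; it relies on IBuilderConfig.can_run_on_DLA from the TensorRT Python API and the network/config objects created above):

def report_non_dla_layers(network, config):
    # Print every layer the builder reports as not runnable on DLA,
    # i.e. the candidates that would need GPU_FALLBACK.
    for i in range(network.num_layers):
        layer = network.get_layer(i)
        if not config.can_run_on_DLA(layer):
            print("Not DLA-capable: %s (%s)" % (layer.name, layer.type))

I would call report_non_dla_layers(network, config) right after setting the DLA options, just before build_engine.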
Edit to add: I’m on JetPack 4.5.
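In case it helps, a quick sanity check of the TensorRT version and the number of DLA cores the builder sees would be something like this (just a sketch):

import tensorrt as trt

builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
print("TensorRT version:", trt.__version__)  # JetPack 4.5 ships TensorRT 7.1.x
print("DLA cores:", builder.num_DLA_cores)   # Xavier NX should report 2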