I am trying to profile the performance of concurrent DLA use with the nsys tool, using the following command:
sudo nsys profile --trace=cuda,nvtx,cublas,cudla,cusparse,cudnn,nvmedia --output=ssd_mobnet2.nsys-rep /media/ssd/kunal/.venv/bin/python DLA_throughput_beta.py --onnx onnx/ssd_mobilenet_v1_coco_2018_01_28_prepared.onnx --log_prefix ssd_mob --iterations 10 --dtype fp16
where DLA_throughput_beta.py is as follows:
import torch
import tensorrt as trt
import pycuda.driver as cuda
import time
import os
import csv
import warnings
import argparse
from multiprocessing import Process, Barrier

warnings.filterwarnings('ignore')
os.environ['TENSORRT_LOGGER_SEVERITY'] = 'VERBOSE'

def build_engine(onnx_file_path, batch_size, dla_core=0, dtype='fp16'):
    logger = trt.Logger(trt.Logger.VERBOSE)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            print(f'ERROR: Failed to parse the ONNX file on DLA core {dla_core}.', flush=True)
            for error in range(parser.num_errors):
                print(parser.get_error(error), flush=True)
            return None
    profile = builder.create_optimization_profile()
    input_name = network.get_input(0).name
    profile.set_shape(
        input_name,
        (batch_size, 3, 300, 300),  # min shape
        (batch_size, 3, 300, 300),  # optimal shape
        (batch_size, 3, 300, 300)   # max shape
    )
    trt_dtype = {'fp16': trt.BuilderFlag.FP16, 'int8': trt.BuilderFlag.INT8}
    config = builder.create_builder_config()
    config.add_optimization_profile(profile)
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
    config.default_device_type = trt.DeviceType.DLA
    config.set_flag(trt_dtype[dtype])
    config.DLA_core = dla_core
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print(f'Failed to build serialized engine on DLA core {dla_core}', flush=True)
        return None
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    return engine
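One thing worth checking before profiling: with GPU_FALLBACK set, layers the DLA cannot run are silently placed on the GPU, which changes what appears in the nsys timeline. A minimal sketch of how placement could be inspected after deserialization, assuming TensorRT >= 8.4 (for detailed per-layer output the build config would also need config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED):

# Hedged sketch: dump per-layer engine information so GPU-fallback
# layers can be spotted; `engine` is the object returned by build_engine().
inspector = engine.create_engine_inspector()
print(inspector.get_engine_information(trt.LayerInformationFormat.JSON))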
def allocate_buffers(engine):
    inputs, outputs = [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_tensor_shape(binding))
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        tensor_info = {'name': binding, 'host': host_mem, 'device': device_mem}
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(tensor_info)
        elif engine.get_tensor_mode(binding) == trt.TensorIOMode.OUTPUT:
            outputs.append(tensor_info)
    return inputs, outputs, stream
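A portability note on the loop above: for binding in engine comes from the pre-8.5 bindings API, while get_tensor_shape/get_tensor_mode belong to the newer tensor API. If a future TensorRT release drops engine iteration, the I/O tensor names can be listed explicitly; a one-line sketch:

# Hedged alternative to `for binding in engine`, using only the 8.5+ tensor API:
names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]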
def infer(context, inputs, outputs, stream):
    for inp in inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
        context.set_tensor_address(inp['name'], inp['device'])
    for out in outputs:
        context.set_tensor_address(out['name'], out['device'])
    context.execute_async_v3(stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
    stream.synchronize()
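Since the nsys invocation above already traces nvtx, explicit NVTX ranges around each inference make the iterations show up as named spans on the timeline. A minimal sketch using the standalone nvtx Python package (pip install nvtx); the wrapper name and range label are illustrative, not part of the original script:

import nvtx  # pip install nvtx

def infer_annotated(context, inputs, outputs, stream, tag='dla_infer'):
    # Emits an NVTX range around one inference so it appears
    # as a named span in the nsys timeline.
    with nvtx.annotate(tag, color='green'):
        infer(context, inputs, outputs, stream)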
def run_instance(onnx_file_path, log_file_name, dla_core, num_iterations, dtype, batch_sizes, barrier):
    # Initialize CUDA context in the child process
    cuda.init()
    device = cuda.Device(0)  # Jetson typically has one GPU/DLA device
    cuda_ctx = device.make_context()
    try:
        file = open(log_file_name, 'w', newline='')
        writer = csv.writer(file)
        writer.writerow(['Batch Size', 'Throughput (images/s)', 'Start time', 'End time', 'Latency (ms)'])
        for batch_size in batch_sizes:
            engine = build_engine(onnx_file_path, batch_size, dla_core, dtype)
            if engine is None:
                print(f'Failed to build engine on DLA core {dla_core}', flush=True)
                file.close()
                return
            inputs, outputs, stream = allocate_buffers(engine)
            context = engine.create_execution_context()
            infer(context, inputs, outputs, stream)  # Warm-up
            barrier.wait()
            total_time = 0
            for i in range(num_iterations):
                start = time.time()
                infer(context, inputs, outputs, stream)
                end = time.time()
                latency = end - start
                total_time += latency
                if i > 0:
                    writer.writerow([batch_size, batch_size / latency, start, end, latency * 1000])
                    file.flush()
            # barrier.wait()
            avg_time = total_time / num_iterations
            throughput = batch_size / avg_time
            print(f'DLA Core {dla_core}, Batch Size {batch_size}: Average latency: {avg_time:.6f}s, Throughput: {throughput:.2f} images/s', flush=True)
            # Wait for the other process to finish this batch size
            barrier.wait()
        file.close()
    finally:
        # Clean up CUDA context
        cuda_ctx.pop()
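The two-process layout assumes the platform exposes two DLA cores. If that needs verifying, the builder can report the count; a sketch of the check (my addition, not in the original script), best run inside each child process, e.g. at the top of run_instance's try block, since creating a builder initializes CUDA:

# Hedged sketch: confirm two DLA cores exist before building engines.
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
if builder.num_DLA_cores < 2:
    raise RuntimeError(f'Expected 2 DLA cores, found {builder.num_DLA_cores}')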
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--onnx', required=True, help='Path to .onnx file')
    parser.add_argument('--log_prefix', required=True, help='Prefix for log file names')
    parser.add_argument('--iterations', required=False, default=100, type=int)
    parser.add_argument('--dtype', required=False, default='fp16', choices=['fp16', 'int8'])
    args = parser.parse_args()
    onnx_file_path = args.onnx
    log_prefix = args.log_prefix
    num_iterations = args.iterations
    dtype = args.dtype
    # batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
    batch_sizes = [64]
    # Create a barrier for two processes
    barrier = Barrier(2, action=lambda: print('Both the cores profiled!', flush=True))
    # Define processes for each DLA core
    processes = [
        Process(
            target=run_instance,
            args=(onnx_file_path, f'/media/ssd/kunal/logs/{log_prefix}_dla0.csv', 0, num_iterations, dtype, batch_sizes, barrier)
        ),
        Process(
            target=run_instance,
            args=(onnx_file_path, f'/media/ssd/kunal/logs/{log_prefix}_dla1.csv', 1, num_iterations, dtype, batch_sizes, barrier)
        )
    ]
    # Start both processes
    for p in processes:
        p.start()
    # Wait for both processes to complete
    for p in processes:
        p.join()

if __name__ == '__main__':
    # Avoid CUDA initialization in the parent process
    main()
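A defensive option around process startup: on Linux, multiprocessing defaults to fork, and a CUDA context initialized in the parent does not survive forking. Forcing the spawn start method avoids that class of problem; a sketch, not a required fix (note that under spawn the Barrier's lambda action must become a picklable module-level function, as below):

import multiprocessing

def report_done():
    # Replaces the lambda action in main(); lambdas don't pickle under spawn.
    print('Both the cores profiled!', flush=True)

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    main()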
But upon opening the resulting .nsys-rep file: