I am trying to profile the performance of concurrent DLA use with the nsys tool, using the following command:
sudo nsys profile --trace=cuda,nvtx,cublas,cudla,cusparse,cudnn,nvmedia --output=ssd_mobnet2.nsys-rep /media/ssd/kunal/.venv/bin/python DLA_throughput_beta.py --onnx onnx/ssd_mobilenet_v1_coco_2018_01_28_prepared.onnx --log_prefix ssd_mob --iterations 10 --dtype fp16
where DLA_throughput_beta.py is as follows:
import torch
import tensorrt as trt
import pycuda.driver as cuda
import time
import os
import csv
import warnings
import argparse
from multiprocessing import Process, Barrier

warnings.filterwarnings('ignore')
os.environ['TENSORRT_LOGGER_SEVERITY'] = 'VERBOSE'

def build_engine(onnx_file_path, batch_size, dla_core=0, dtype='fp16'):
    logger = trt.Logger(trt.Logger.VERBOSE)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            print(f'ERROR: Failed to parse the ONNX file on DLA core {dla_core}.', flush=True)
            for error in range(parser.num_errors):
                print(parser.get_error(error), flush=True)
            return None
    profile = builder.create_optimization_profile()
    input_name = network.get_input(0).name
    profile.set_shape(
        input_name,
        (batch_size, 3, 300, 300),  # min shape
        (batch_size, 3, 300, 300),  # optimal shape
        (batch_size, 3, 300, 300)   # max shape
    )
    trt_dtype = {'fp16': trt.BuilderFlag.FP16, 'int8': trt.BuilderFlag.INT8}
    config = builder.create_builder_config()
    config.add_optimization_profile(profile)
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
    config.default_device_type = trt.DeviceType.DLA
    config.set_flag(trt_dtype[dtype])
    config.DLA_core = dla_core
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print(f'Failed to build serialized engine on DLA core {dla_core}', flush=True)
        return None
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    return engine
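One thing worth checking before profiling: with GPU_FALLBACK set, layers the DLA cannot run are silently placed on the GPU, which changes what appears in the nsys timeline. A minimal sketch of how placement could be inspected after deserialization, assuming TensorRT >= 8.4 (for detailed per-layer output the build config would also need config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED):

# Hedged sketch: dump per-layer engine information so GPU-fallback
# layers can be spotted; `engine` is the object returned by build_engine().
inspector = engine.create_engine_inspector()
print(inspector.get_engine_information(trt.LayerInformationFormat.JSON))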
def allocate_buffers(engine):
    inputs, outputs = [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_tensor_shape(binding))
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        tensor_info = {'name': binding, 'host': host_mem, 'device': device_mem}
        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append(tensor_info)
        elif engine.get_tensor_mode(binding) == trt.TensorIOMode.OUTPUT:
            outputs.append(tensor_info)
    return inputs, outputs, stream
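A portability note on the loop above: for binding in engine comes from the pre-8.5 bindings API, while get_tensor_shape/get_tensor_mode belong to the newer tensor API. If a future TensorRT release drops engine iteration, the I/O tensor names can be listed explicitly; a one-line sketch:

# Hedged alternative to `for binding in engine`, using only the 8.5+ tensor API:
names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]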
def infer(context, inputs, outputs, stream):
    for inp in inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], stream)
        context.set_tensor_address(inp['name'], inp['device'])
    for out in outputs:
        context.set_tensor_address(out['name'], out['device'])
    context.execute_async_v3(stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], stream)
    stream.synchronize()
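Since the nsys invocation above already traces nvtx, explicit NVTX ranges around each inference make the iterations show up as named spans on the timeline. A minimal sketch using the standalone nvtx Python package (pip install nvtx); the wrapper name and range label are illustrative, not part of the original script:

import nvtx  # pip install nvtx

def infer_annotated(context, inputs, outputs, stream, tag='dla_infer'):
    # Emits an NVTX range around one inference so it appears
    # as a named span in the nsys timeline.
    with nvtx.annotate(tag, color='green'):
        infer(context, inputs, outputs, stream)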
def run_instance(onnx_file_path, log_file_name, dla_core, num_iterations, dtype, batch_sizes, barrier):
    # Initialize CUDA context in the child process
    cuda.init()
    device = cuda.Device(0)  # Jetson typically has one GPU/DLA device
    cuda_ctx = device.make_context()
    try:
        file = open(log_file_name, 'w', newline='')
        writer = csv.writer(file)
        writer.writerow(['Batch Size', 'Throughput (images/s)', 'Start time', 'End time', 'Latency (ms)'])
        for batch_size in batch_sizes:
            engine = build_engine(onnx_file_path, batch_size, dla_core, dtype)
            if engine is None:
                print(f'Failed to build engine on DLA core {dla_core}', flush=True)
                file.close()
                return
            inputs, outputs, stream = allocate_buffers(engine)
            context = engine.create_execution_context()
            infer(context, inputs, outputs, stream)  # Warm-up
            barrier.wait()
            total_time = 0
            for i in range(num_iterations):
                start = time.time()
                infer(context, inputs, outputs, stream)
                end = time.time()
                latency = end - start
                total_time += latency
                if i > 0:
                    writer.writerow([batch_size, batch_size / latency, start, end, latency * 1000])
                    file.flush()
            # barrier.wait()
            avg_time = total_time / num_iterations
            throughput = batch_size / avg_time
            print(f'DLA Core {dla_core}, Batch Size {batch_size}: Average latency: {avg_time:.6f}s, Throughput: {throughput:.2f} images/s', flush=True)
            # Wait for the other process to finish this batch size
            barrier.wait()
        file.close()
    finally:
        # Clean up CUDA context
        cuda_ctx.pop()
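The two-process layout assumes the platform exposes two DLA cores. If that needs verifying, the builder can report the count; a sketch of the check (my addition, not in the original script), best run inside each child process, e.g. at the top of run_instance's try block, since creating a builder initializes CUDA:

# Hedged sketch: confirm two DLA cores exist before building engines.
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
if builder.num_DLA_cores < 2:
    raise RuntimeError(f'Expected 2 DLA cores, found {builder.num_DLA_cores}')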
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--onnx', required=True, help='Path to .onnx file')
    parser.add_argument('--log_prefix', required=True, help='Prefix for log file names')
    parser.add_argument('--iterations', required=False, default=100, type=int)
    parser.add_argument('--dtype', required=False, default='fp16', choices=['fp16', 'int8'])
    args = parser.parse_args()
    onnx_file_path = args.onnx
    log_prefix = args.log_prefix
    num_iterations = args.iterations
    dtype = args.dtype
    # batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
    batch_sizes = [64]
    # Create a barrier for two processes
    barrier = Barrier(2, action=lambda: print('Both the cores profiled!', flush=True))
    # Define processes for each DLA core
    processes = [
        Process(
            target=run_instance,
            args=(onnx_file_path, f'/media/ssd/kunal/logs/{log_prefix}_dla0.csv', 0, num_iterations, dtype, batch_sizes, barrier)
        ),
        Process(
            target=run_instance,
            args=(onnx_file_path, f'/media/ssd/kunal/logs/{log_prefix}_dla1.csv', 1, num_iterations, dtype, batch_sizes, barrier)
        )
    ]
    # Start both processes
    for p in processes:
        p.start()
    # Wait for both processes to complete
    for p in processes:
        p.join()

if __name__ == '__main__':
    # Avoid CUDA initialization in the parent process
    main()
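A defensive option around process startup: on Linux, multiprocessing defaults to fork, and a CUDA context initialized in the parent does not survive forking. Forcing the spawn start method avoids that class of problem; a sketch, not a required fix (note that under spawn the Barrier's lambda action must become a picklable module-level function, as below):

import multiprocessing

def report_done():
    # Replaces the lambda action in main(); lambdas don't pickle under spawn.
    print('Both the cores profiled!', flush=True)

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    main()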
But upon opening the resulting .nsys-rep file: