NSYS not reading DLA metrics

I am trying to profile the performance of concurrent DLA use with the nsys tool, using the following command:

sudo nsys profile --trace=cuda,nvtx,cublas,cudla,cusparse,cudnn,nvmedia --output=ssd_mobnet2.nsys-rep /media/ssd/kunal/.venv/bin/python DLA_throughput_beta.py --onnx onnx/ssd_mobilenet_v1_coco_2018_01_28_prepared.onnx --log_prefix ssd_mob --iterations 10 --dtype fp16

where DLA_throughput_beta.py is as follows:

import torch
import tensorrt as trt
import pycuda.driver as cuda
import time
import os
import csv
import warnings
import argparse
from multiprocessing import Process, Barrier

# NOTE(review): presumably meant to raise TensorRT's log verbosity; confirm
# that anything actually reads this variable. The loggers in build_engine are
# already constructed with trt.Logger.VERBOSE explicitly.
os.environ['TENSORRT_LOGGER_SEVERITY'] = 'VERBOSE'

# Silence every Python warning for the remainder of the run.
warnings.filterwarnings(action='ignore')

def build_engine(onnx_file_path, batch_size, dla_core=0, dtype='fp16'):
    """Parse an ONNX model and build a TensorRT engine assigned to one DLA core.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        batch_size: Leading dimension used for the min, optimal and max shapes
            of the optimization profile; the remaining dimensions are fixed
            to (3, 300, 300).
        dla_core: DLA core index written to the builder config.
        dtype: Precision flag to enable, either 'fp16' or 'int8'.

    Returns:
        The deserialized engine, or None when ONNX parsing or the engine
        build fails (a message is printed in both cases).

    Raises:
        KeyError: If dtype is neither 'fp16' nor 'int8'.
    """
    trt_logger = trt.Logger(trt.Logger.VERBOSE)
    builder = trt.Builder(trt_logger)
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)
    parser = trt.OnnxParser(network, trt_logger)

    with open(onnx_file_path, 'rb') as onnx_file:
        parsed_ok = parser.parse(onnx_file.read())
    if not parsed_ok:
        print(f'ERROR: Failed to parse the ONNX file on DLA core {dla_core}.', flush=True)
        for error_index in range(parser.num_errors):
            print(parser.get_error(error_index), flush=True)
        return None

    # The profile is pinned to a single shape: min == optimal == max.
    fixed_shape = (batch_size, 3, 300, 300)
    profile = builder.create_optimization_profile()
    profile.set_shape(network.get_input(0).name, fixed_shape, fixed_shape, fixed_shape)

    precision_flags = {'fp16': trt.BuilderFlag.FP16, 'int8': trt.BuilderFlag.INT8}

    config = builder.create_builder_config()
    config.add_optimization_profile(profile)
    # Allow GPU fallback, make DLA the default device, then select precision
    # and the target core.
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
    config.default_device_type = trt.DeviceType.DLA
    config.set_flag(precision_flags[dtype])
    config.DLA_core = dla_core

    plan = builder.build_serialized_network(network, config)
    if plan is None:
        print(f'Failed to build serialized engine on DLA core {dla_core}', flush=True)
        return None

    return trt.Runtime(trt_logger).deserialize_cuda_engine(plan)

def allocate_buffers(engine):
    """Allocate a page-locked host buffer and a device buffer for each I/O tensor.

    Args:
        engine: Engine whose I/O tensors are enumerated.

    Returns:
        A tuple (inputs, outputs, stream). inputs and outputs are lists of
        dicts with the keys 'name', 'host' and 'device'; stream is a newly
        created CUDA stream.
    """
    inputs, outputs = [], []
    stream = cuda.Stream()

    # Enumerate tensors through the name-based API (num_io_tensors /
    # get_tensor_name), matching the get_tensor_* calls below, instead of
    # relying on the engine object itself being iterable.
    for index in range(engine.num_io_tensors):
        name = engine.get_tensor_name(index)
        size = trt.volume(engine.get_tensor_shape(name))
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        tensor_info = {'name': name, 'host': host_mem, 'device': device_mem}

        mode = engine.get_tensor_mode(name)
        if mode == trt.TensorIOMode.INPUT:
            inputs.append(tensor_info)
        elif mode == trt.TensorIOMode.OUTPUT:
            outputs.append(tensor_info)

    return inputs, outputs, stream

def infer(context, inputs, outputs, stream):
    """Run one inference on `stream` and block until it has finished.

    Args:
        context: Execution context that receives the tensor addresses and
            runs the inference.
        inputs: Dicts with 'name', 'host' and 'device' entries for inputs.
        outputs: Dicts with 'name', 'host' and 'device' entries for outputs.
        stream: CUDA stream used for the copies and the execution.
    """
    # Register the device address of every input and output tensor.
    for tensor in (*inputs, *outputs):
        context.set_tensor_address(tensor['name'], tensor['device'])

    # Queue host-to-device copies, the inference, then device-to-host copies.
    for tensor in inputs:
        cuda.memcpy_htod_async(tensor['device'], tensor['host'], stream)

    context.execute_async_v3(stream.handle)

    for tensor in outputs:
        cuda.memcpy_dtoh_async(tensor['host'], tensor['device'], stream)

    # Everything above was queued on the stream; wait for it to drain.
    stream.synchronize()

def run_instance(onnx_file_path, log_file_name, dla_core, num_iterations, dtype, batch_sizes, barrier):
    """Benchmark one DLA core in a worker process and log each iteration to CSV.

    For every batch size an engine is built, one warm-up inference is run,
    and then `num_iterations` timed inferences are executed. The barrier is
    used to line this worker up with its peer before the timed loop and again
    after each batch size.

    Args:
        onnx_file_path: Path of the ONNX model handed to build_engine.
        log_file_name: CSV file to (over)write with per-iteration results.
        dla_core: DLA core index this worker benchmarks.
        num_iterations: Number of timed inferences per batch size.
        dtype: Precision passed to build_engine ('fp16' or 'int8').
        batch_sizes: Iterable of batch sizes to benchmark.
        barrier: Barrier shared with the peer worker.

    Raises:
        Exception: Any failure other than a barrier broken by the peer is
            re-raised after the barrier has been aborted.
    """
    # Local import: only needed to recognise a barrier broken by the peer.
    from threading import BrokenBarrierError

    # Initialize CUDA context in the child process
    cuda.init()
    device = cuda.Device(0)  # Jetson typically has one GPU/DLA device
    cuda_ctx = device.make_context()

    try:
        # The context manager closes the log on every exit path.
        with open(log_file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Batch Size', 'Average Throughput (images/s)', 'Start time', 'End time', 'Average Latency (ms)'])

            for batch_size in batch_sizes:
                engine = build_engine(onnx_file_path, batch_size, dla_core, dtype)
                if engine is None:
                    print(f'Failed to build engine on DLA core {dla_core}', flush=True)
                    # Break the barrier so the peer does not wait forever for
                    # a worker that is about to exit.
                    barrier.abort()
                    return

                inputs, outputs, stream = allocate_buffers(engine)
                context = engine.create_execution_context()

                infer(context, inputs, outputs, stream)  # Warm-up
                # Start the timed loop together with the peer worker.
                barrier.wait()

                total_time = 0
                for i in range(num_iterations):
                    start = time.time()
                    infer(context, inputs, outputs, stream)
                    end = time.time()
                    latency = end - start
                    total_time += latency

                    # The first timed iteration is not logged, but it still
                    # counts towards the printed average below.
                    if i > 0:
                        writer.writerow([batch_size, batch_size / latency, start, end, latency * 1000])
                        file.flush()

                avg_time = total_time / num_iterations
                throughput = batch_size / avg_time

                print(f'DLA Core {dla_core}, Batch Size {batch_size}: Average latency: {avg_time:.6f}s, Throughput: {throughput:.2f} images/s', flush=True)

                # Wait for the other process to finish this batch size
                barrier.wait()
    except BrokenBarrierError:
        # The peer aborted the barrier because it failed; stop cleanly.
        print(f'DLA core {dla_core}: the other worker failed, stopping early', flush=True)
    except Exception:
        # Release a peer that may be blocked in barrier.wait() before
        # propagating the error.
        barrier.abort()
        raise
    finally:
        # Clean up CUDA context
        cuda_ctx.pop()

def main():
    """Parse the command line and benchmark both DLA cores in parallel.

    One worker process per DLA core (0 and 1) runs run_instance; the two
    share a two-party barrier. Returns once both workers have exited.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--onnx', required=True, help='Path to .onnx file')
    arg_parser.add_argument('--log_prefix', required=True, help='Prefix for log file names')
    arg_parser.add_argument('--iterations', default=100, type=int)
    arg_parser.add_argument('--dtype', default='fp16', choices=['fp16', 'int8'])
    args = arg_parser.parse_args()

    log_prefix = args.log_prefix

    # Only batch size 64 is profiled; a full sweep would be
    # [1, 2, 4, 8, 16, 32, 64, 128, 256].
    batch_sizes = [64]

    # Two-party barrier; the action runs each time both workers arrive.
    barrier = Barrier(2, action=lambda: print('Both the cores profiled!', flush=True))

    # DLA core index -> CSV log written by that core's worker.
    log_files = {
        0: f'/media/ssd/kunal/logs/{log_prefix}_dla0.csv',
        1: f'/media/ssd/kunal/logs/{log_prefix}_dla1.csv',
    }
    workers = [
        Process(
            target=run_instance,
            args=(args.onnx, log_file, core, args.iterations, args.dtype, batch_sizes, barrier),
        )
        for core, log_file in log_files.items()
    ]

    # Launch both workers, then wait for both to finish.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

if __name__ == '__main__':
    # Run only when executed as a script. main() makes no pycuda calls
    # itself: each worker calls cuda.init() and creates its own context
    # inside run_instance, which avoids CUDA initialization in the parent
    # process.
    main()

But upon opening the resulting nsys-rep file, no DLA metrics are shown.

Can you please confirm which version of Nsight Systems you are using?
If it is an older version, please upgrade to the latest release and try again.

Thanks

NVIDIA Nsight Systems version 2024.5.4.34-245434855735v0

Hardware: NVIDIA Jetson AGX Orin Developer Kit (64GB)
Kernel: 5.15.148-tegra