Issue
According to the Jetson Orin Nano datasheet, the module delivers 17 TFLOPS FP16 performance in Super mode (MAXN_SUPER).
We benchmarked pure GEMM workloads ([8192×8192] × [8192×8192]) using PyTorch 2.7 from the NGC container and observed significantly lower throughput than specified:
| Format | Achieved | % of Spec | vs. TensorRT* |
|---|---|---|---|
| FP16 | 7.9 TFLOPS | 46% | ~25% slower |
| BF16 | 9.1 TFLOPS | 54% | ~13% slower |
*TensorRT achieves ~10.5 TFLOPS on identical matrix sizes (see Related Issues).
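For context, the throughput figures above map directly to per-matmul kernel times via 2·N³ FLOPs; a few lines of arithmetic (derived only from the numbers in the table) make the gap concrete:

```python
# Per-matmul time implied by each throughput figure for N = 8192 (2*N^3 FLOPs per GEMM).
N = 8192
flops = 2 * N ** 3  # ~1.10e12 FLOPs per 8192x8192 matmul
for label, tflops in [("17 TFLOPS spec", 17.0), ("PyTorch FP16", 7.9),
                      ("PyTorch BF16", 9.1), ("TensorRT", 10.5)]:
    print(f"{label:15s}: {flops / (tflops * 1e12) * 1e3:6.1f} ms per matmul")
# -> roughly 65 ms at spec vs. ~139 ms measured with PyTorch FP16.
```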
Two specific concerns:
- PyTorch vs. TensorRT gap: The 13–25% performance deficit relative to TensorRT suggests PyTorch's ATen/cuBLAS operators lack Jetson-specific optimizations (e.g., SM87 tuning).
- FP16 underperforms BF16: Counter-intuitively, FP16 (7.9 TFLOPS) trails BF16 (9.1 TFLOPS) despite Orin's native FP16 Tensor Core support, pointing to a possible kernel-selection or Tensor Core utilization issue (see the profiling sketch below).
Could you provide guidance on obtaining PyTorch builds specifically optimized for Jetson Orin (e.g., with cuDNN/cuBLAS tuning for SM87), or recommend compilation flags to approach the 17 TFLOPS specification?
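To help triage the kernel-selection question, here is a minimal diagnostic sketch (not part of the reported numbers) that prints the relevant matmul backend flags and the CUDA kernels PyTorch actually dispatches for this GEMM, using torch.profiler:

```python
import torch
from torch.profiler import profile, ProfilerActivity

# Current matmul precision/backend flags (these influence which cuBLAS kernels are picked).
print("allow_tf32 (matmul):", torch.backends.cuda.matmul.allow_tf32)
print("allow_fp16_reduced_precision_reduction:",
      torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction)
print("allow_bf16_reduced_precision_reduction:",
      torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction)

N = 8192
A = torch.randn(N, N, dtype=torch.float16, device="cuda")
B = torch.randn(N, N, dtype=torch.float16, device="cuda")
C = torch.empty(N, N, dtype=torch.float16, device="cuda")

torch.matmul(A, B, out=C)  # warm-up so cuBLAS heuristics settle
torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CUDA]) as prof:
    for _ in range(10):
        torch.matmul(A, B, out=C)
    torch.cuda.synchronize()

# Kernel names reveal whether Tensor Core (cutlass/cublasLt HMMA) paths are being used.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```

Toggling allow_fp16_reduced_precision_reduction (and its BF16 counterpart) changes the accumulation precision cuBLAS is allowed to use and can therefore change which kernels are selected for these GEMMs.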
Related Issues
Performance discrepancy: TensorRT achieves ~10 TFLOPS vs. 17 TFLOPS spec on Orin Nano (Super mode)
Hardware
Jetson Orin Nano Developer Kit (official)
Software
JetPack: 6.2 (L4T 36.4.3)
Docker image: nvcr.io/nvidia/pytorch:25.06-py3-igpu
PyTorch (inside the container): 2.7
CUDA (inside the container): 12.8
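For completeness, these versions (and the SM 8.7 compute capability of the Orin GPU) can be confirmed from inside the container with a short snippet:

```python
import torch

print("PyTorch:", torch.__version__)
print("CUDA (torch build):", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
print("Device:", torch.cuda.get_device_name(0))
print("Compute capability:", torch.cuda.get_device_capability(0))  # expected (8, 7) on Orin
print("Arch list:", torch.cuda.get_arch_list())  # a Jetson-targeted build should include sm_87
```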
Steps to Reproduce
1. Save the benchmark script as bench_gemm.py:

```python
#!/usr/bin/env python3
"""
MatMul benchmark — preallocate A, B, C and always use torch.matmul(..., out=C).
Assumes torch.matmul(..., out=...) is supported (true for the PyTorch 2.7 build used here).
Usage examples:
    python bench_gemm.py --dtype bfloat16 --device cuda --N 8192 --repeats 1000
    python bench_gemm.py --dtype float16 --device cuda --N 4096 --repeats 100
Notes:
- FLOPs counted as 2 * N**3 per matmul.
- This script intentionally does NOT test fallback paths; it always uses out=.
"""
import argparse
import sys
import time
import gc

import torch


def bytes_per_element(dtype: torch.dtype):
    # reliable element size
    return torch.tensor([], dtype=dtype).element_size()


def humanize_bytes(n: int) -> str:
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if abs(n) < 1024.0:
            return f"{n:3.2f}{unit}"
        n /= 1024.0
    return f"{n:.2f}PB"


def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("--N", type=int, default=8192, help="Matrix size (N x N)")
    p.add_argument("--repeats", type=int, default=1000, help="How many matmuls to run")
    p.add_argument("--device", choices=["cuda", "cpu", "auto"], default="cuda")
    p.add_argument("--dtype", choices=["float32", "float16", "bfloat16", "float64"], default="bfloat16")
    p.add_argument("--warmups", type=int, default=10)
    p.add_argument("--no_sync_each_iter", action="store_true",
                   help="Do NOT synchronize each iteration (faster but less strict).")
    return p.parse_args()


def dtype_from_str(s: str):
    return {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float64": torch.float64,
    }[s]


def main():
    args = parse_args()
    N = args.N
    repeats = args.repeats
    warmups = args.warmups
    sync_each = not args.no_sync_each_iter

    # device selection
    if args.device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = args.device
        if device == "cuda" and not torch.cuda.is_available():
            print("CUDA not available; falling back to CPU.", file=sys.stderr)
            device = "cpu"
    device_t = torch.device(device)

    dtype = dtype_from_str(args.dtype)
    bpe = bytes_per_element(dtype)
    mat_bytes = N * N * bpe
    total_needed = mat_bytes * 3  # A, B, C

    print(f"Benchmark settings: N={N}, repeats={repeats}, dtype={dtype}, device={device}")
    print(f"Estimated per-matrix: {humanize_bytes(mat_bytes)}; A+B+C ~= {humanize_bytes(total_needed)}")

    # If CUDA, check GPU memory to avoid OOM
    if device == "cuda":
        dev = torch.cuda.current_device()
        props = torch.cuda.get_device_properties(dev)
        total_gpu_mem = props.total_memory
        print(f"CUDA device: {torch.cuda.get_device_name(dev)}, total memory: {humanize_bytes(total_gpu_mem)}")
        if total_needed > total_gpu_mem:
            print("ERROR: Estimated required GPU memory for A,B,C exceeds total GPU memory. Aborting.", file=sys.stderr)
            print("Try smaller N or use a lower precision (e.g., float16) or use CPU.", file=sys.stderr)
            sys.exit(1)
    else:
        try:
            import psutil
            avail = psutil.virtual_memory().available
            print(f"System available memory: {humanize_bytes(avail)}")
            if total_needed > avail * 0.9:
                print("WARNING: estimated memory footprint is large relative to available system memory.", file=sys.stderr)
        except Exception:
            pass

    # Preallocate A, B, C on device
    print("Preallocating A, B, C on device...")
    A = torch.empty((N, N), dtype=dtype, device=device_t)
    B = torch.empty((N, N), dtype=dtype, device=device_t)
    C = torch.empty((N, N), dtype=dtype, device=device_t)  # output (will be used via out=)

    # Initialize A and B:
    # Some dtypes (e.g., bfloat16) may not support in-place normal_ on device reliably,
    # so create a float32 temp on device then cast/copy into A/B.
    torch.manual_seed(12345)
    temp = torch.empty((N, N), dtype=torch.float32, device=device_t)
    temp.normal_()
    if dtype == torch.float32:
        A.copy_(temp)
        B.copy_(temp)  # using same distribution; you can randomize differently if desired
    else:
        A.copy_(temp.to(dtype))
        B.copy_(temp.to(dtype))
    del temp

    # ensure allocations and initializations are finished
    if device == "cuda":
        torch.cuda.synchronize()

    # Warm-up iterations
    print(f"Warm-up {warmups} iterations (not counted)...")
    for i in range(warmups):
        torch.matmul(A, B, out=C)
        if sync_each and device == "cuda":
            torch.cuda.synchronize()

    # Timed runs using CUDA events (kernel time) or perf_counter for CPU
    flops_per = 2 * (N ** 3)
    total_flops = flops_per * repeats
    print("Starting timed runs...")
    if device == "cuda":
        starter = torch.cuda.Event(enable_timing=True)
        ender = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()
        starter.record()
        for i in range(repeats):
            torch.matmul(A, B, out=C)
            if sync_each:
                torch.cuda.synchronize()
        ender.record()
        torch.cuda.synchronize()
        elapsed_ms = starter.elapsed_time(ender)
        total_seconds = elapsed_ms / 1000.0
    else:
        t0 = time.perf_counter()
        for i in range(repeats):
            torch.matmul(A, B, out=C)
            # attempt to keep memory steady
            gc.collect()
        total_seconds = time.perf_counter() - t0

    secs_per = total_seconds / repeats
    gflops = total_flops / total_seconds / 1e9

    print("\n=== RESULTS ===")
    print(f"Matrix size: {N} x {N}")
    print(f"Dtype: {dtype}, device: {device}")
    print(f"Repeats: {repeats}, warmups: {warmups}")
    print(f"FLOPs per matmul (2*N^3): {flops_per:,}")
    print(f"Total FLOPs: {total_flops:,}")
    print(f"Total time: {total_seconds:.6f} s")
    print(f"Time per matmul: {secs_per:.9f} s")
    print(f"Achieved: {gflops:,.3f} GFLOPS ({gflops/1000:.6f} TFLOPS)")


if __name__ == "__main__":
    main()
```
2. Set up the Docker environment. The latest NGC image that Orin + JetPack 6.2 can run is pytorch:25.06-py3-igpu:

```
sudo docker pull nvcr.io/nvidia/pytorch:25.06-py3-igpu
sudo jetson_clocks
sudo docker run -it --runtime nvidia -v $PWD:$PWD -w $PWD --ipc host --net host nvcr.io/nvidia/pytorch:25.06-py3-igpu
```
3. Run the benchmark inside the container:

```
python3 bench_gemm.py --dtype float16
python3 bench_gemm.py --dtype bfloat16
```
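As an additional experiment (not reflected in the table above), the same GEMM can also be routed through torch.compile with autotuning to see whether Triton-generated kernels behave differently from the default cuBLAS path on SM87. A rough sketch, untested on this exact image:

```python
import torch

N = 8192
A = torch.randn(N, N, dtype=torch.float16, device="cuda")
B = torch.randn(N, N, dtype=torch.float16, device="cuda")

@torch.compile(mode="max-autotune")  # lets Inductor autotune Triton matmul templates against cuBLAS
def mm(x, y):
    return x @ y

for _ in range(3):  # warm-up / autotuning passes (the first calls are slow)
    mm(A, B)
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(100):
    mm(A, B)
end.record()
torch.cuda.synchronize()

secs = start.elapsed_time(end) / 1e3
print(f"torch.compile matmul: {100 * 2 * N ** 3 / secs / 1e12:.2f} TFLOPS")
```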
Additional Findings
During the benchmarks above, the VDD_IN input current hits its limit (maximum draw, roughly 24.4 W on the VDD_IN rail).
However, the combined power of VDD_CPU_GPU_CV (11.7 W) and VDD_SOC (5.8 W) totals only 17.5 W, leaving a 6.9 W gap (24.4 W - 17.5 W) relative to the VDD_IN power.
Is this power distribution expected behavior? Could this discrepancy be related to the suboptimal performance observed?
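The rail names above match what tegrastats reports on Orin. For reference, a rough sketch of sampling them programmatically on the host while bench_gemm.py runs in the container (assuming tegrastats is on the host PATH; it may require sudo):

```python
import subprocess
import time

# Sample tegrastats for ~10 s and print the lines containing the rails discussed above
# (VDD_IN, VDD_CPU_GPU_CV, VDD_SOC).
proc = subprocess.Popen(["tegrastats", "--interval", "1000"],
                        stdout=subprocess.PIPE, text=True)
try:
    t_end = time.time() + 10
    while time.time() < t_end:
        line = proc.stdout.readline()
        if "VDD_IN" in line:
            print(line.strip())
finally:
    proc.terminate()
```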
