Effects of: '.tune1_low = 0x012207FF' from file tegra/clk-tegra124-dfll-fcpu.c (4.9.201-tegra, L4T 32.5.2)

beyondTime · May 23, 2025, 8:35pm

Hello, what is the meaning of the line ‘.tune1_low = 0x012207FF,’ and what is its effect on the system?
Where can I find information about ´@tune1: DFLL tuning register 1´?
( from clk-dfll.h,
Tegra_X1_TRM_DP07225001_v1.3p.pdf, page 205, 6.1.5 CL_DVFS_TUNE1_0

6.1.5 CL_DVFS_TUNE1_0
Bit Reset Description
31:23 0x0 DFLL_TUNE1_DLY_FINE: Input bits to tune the two phases of the clock. 8 bits to tune, and 1 bit to
choose high vs. low. Drives I_DLY_FINE<8:0>.
22:12 0x0 DFLL_TUNE1_DLY_SRAM: input bits to both coarse (3) and fine (5) tune the delay of sram path like
chain. Drives I_DLY_SRAM<10:0> of DVCO macro
11 0x0 DFLL_TUNE1_DLY_SPARE1: Drives I_DLY_SPARE<16> of DVCO macro
10:0 0x0 DFLL_TUNE1_DLY_WIRE: Input bits to both coarse (2) and fine (9) tune the delay of wire dominated
path. Drives I_DLY_WIRE<10:0>.

0x012207FF
0b1001000100000011111111111
0b0000 0001 0.010 0010 0000.0.111 1111 1111 (MSB first; dots mark the field boundaries 31:23 | 22:12 | 11 | 10:0)
)

(thx)

from Kernel sources (4.9.201-tegra, L4T 32.5.2), drivers/clk/tegra/clk-tegra124-dfll-fcpu.c, ln ~620:

´#define CPUB01_CVB_TABLE
.speedo_scale = 100,
.voltage_scale = 1000,
.entries = {
/* f c0, c1, c2 */
{ 204000000UL, { 721589, -12695, 27 } },
{ 306000000UL, { 747134, -14195, 27 } },
{ 408000000UL, { 776324, -15705, 27 } },
{ 510000000UL, { 809160, -17205, 27 } },
{ 612000000UL, { 845641, -18715, 27 } },
{ 714000000UL, { 885768, -20215, 27 } },
{ 816000000UL, { 929540, -21725, 27 } },
{ 918000000UL, { 976958, -23225, 27 } },
{ 1020000000UL, { 1028021, -24725, 27 } },
{ 1122000000UL, { 1082730, -26235, 27 } },
{ 1224000000UL, { 1141084, -27735, 27 } },
{ 1326000000UL, { 1203084, -29245, 27 } },
{ 1428000000UL, { 1268729, -30745, 27 } },
{ 1581000000UL, { 1374032, -33005, 27 } },
[etc.]
{ 0, { } },
},
.vmin_coefficients = { 620000, 0, 0 },
.cpu_dfll_data = {
.tune0_low = 0x0000FFCF,
.tune1_low = 0x012207FF,
.tune1_high = 0x03FFF7FF,
.tune_high_min_millivolts = 850,
.tune_high_margin_millivolts = 38,
.dvco_calibration_max = ULONG_MAX,
},
.cvb_version = "FCPU Table - p4v3"
´

DaneLLL · May 26, 2025, 5:19am

Hi,
All details are in the TRM. If you have further questions, please share what issue you are experiencing. We can check and see what we can suggest next.

beyondTime · May 26, 2025, 6:32am

Hello,

The system is stable, with idle temperatures (on a relatively small heatsink) of about 29-35°C (84-95°F) and up to ~52/53°C (125/127°F) under load (glmark2 --off-screen ~2750, or all 4 CPUs running at almost the highest frequency level), e.g. when natively compiling the ‘4.9.201-tegra’ kernel, whose build time was reduced from ~55min to ~30-35min (tegra_defconfig, without kernel debug options).

To understand the long-term effects: there is a kernel patch that changes the value of .tune1_low from
‘0x012207FF, 0b1,0 010,0010,0000, 0 111,1111,1111’ to
‘0x016607FF, 0b1,0 110,0110,0000, 0 111,1111,1111’.
What is the meaning or effect of these different values for the Tegra X1 SRAM?

Is this a suitable tool for benchmarking the SRAM (bandwidth, latency)?

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include <unistd.h>

#define CACHE_LINE_SIZE 64
#define L1D_SIZE (32 * 1024)
#define L1I_SIZE (48 * 1024)
#define ITERATIONS (1000000)

/*
 * Read the AArch64 virtual counter (CNTVCT_EL0).
 *
 * Despite the name, the value is in generic-timer ticks, not CPU cycles;
 * divide a tick delta by the CNTFRQ_EL0 frequency to obtain seconds.
 *
 * The ISB keeps the counter from being sampled speculatively / ahead of
 * earlier instructions, and the "memory" clobber stops the compiler from
 * moving loads and stores of the timed region across the timestamp.
 */
static inline uint64_t read_cycles() {
    uint64_t val;
    asm volatile("isb\n\tmrs %0, cntvct_el0" : "=r"(val) : : "memory");
    return val;
}

/*
 * Return the contents of CNTFRQ_EL0, the register that reports the tick
 * rate of the counter sampled by the timestamp helper (ticks per second).
 */
static inline uint64_t read_freq() {
    uint64_t ticks_per_second;

    asm volatile("mrs %0, cntfrq_el0" : "=r"(ticks_per_second));

    return ticks_per_second;
}

// --- L1D bandwidth test (streaming read from hot data) ---
void benchmark_l1d_bandwidth() {
    char *buffer = aligned_alloc(CACHE_LINE_SIZE, L1D_SIZE);
    if (!buffer) { perror("alloc"); return; }

    memset(buffer, 1, L1D_SIZE);

    volatile uint64_t sum = 0;
    uint64_t start = read_cycles();
    for (int rep = 0; rep < 1000; ++rep) {
        for (size_t i = 0; i < L1D_SIZE; i += CACHE_LINE_SIZE) {
            sum += buffer[i];
        }
    }
    uint64_t end = read_cycles();

    double freq = (double)read_freq();
    double elapsed_s = (end - start) / freq;
    double total_bytes = L1D_SIZE * 1000;
    double bandwidth_MBps = (total_bytes / (1024.0 * 1024.0)) / elapsed_s;

    printf("[L1D] Bandwidth: %.2f MB/s\n", bandwidth_MBps);
    free((void*)buffer);
}

// --- L1D latency test (pointer chasing to prevent prefetching) ---
void benchmark_l1d_latency() {
    size_t count = L1D_SIZE / sizeof(void*);
    void **buffer = aligned_alloc(CACHE_LINE_SIZE, count * sizeof(void*));
    if (!buffer) { perror("alloc"); return; }

    for (size_t i = 0; i < count; ++i)
        buffer[i] = &buffer[(i + 1) % count]; // loop around

    volatile void *p = buffer[0];

    uint64_t start = read_cycles();
    for (size_t i = 0; i < ITERATIONS; ++i) {
        p = *(void**)p;
    }
    uint64_t end = read_cycles();

    double freq = (double)read_freq();
    double total_time_ns = ((end - start) / freq) * 1e9;
    double latency_ns = total_time_ns / ITERATIONS;

    printf("[L1D] Latency: %.2f ns\n", latency_ns);
    free(buffer);
}

// --- L1I latency test (execute from tight loop) ---
/*
 * Small non-inlined loop used as the timed body of the instruction-side
 * benchmark: performs n read-modify-write updates of a volatile local.
 *
 * The accumulator is unsigned so that it wraps on overflow; with a signed
 * int the sum of 0..n-1 exceeds INT_MAX for n = 100000, which is undefined
 * behaviour. The value is never read back, only the work matters.
 */
__attribute__((noinline))
void hot_loop(int n) {
    volatile unsigned int sum = 0;
    for (int i = 0; i < n; ++i) {
        sum += (unsigned int)i;
    }
}

/*
 * Times one call of hot_loop() and prints the average time per loop
 * iteration in nanoseconds, converting counter ticks to time with the
 * frequency reported by read_freq().
 */
void benchmark_l1i_latency() {
    const int reps = 100000;

    const uint64_t ticks_before = read_cycles();
    hot_loop(reps);
    const uint64_t ticks_after = read_cycles();

    const double ticks_per_second = (double)read_freq();
    const double elapsed_ns = ((ticks_after - ticks_before) / ticks_per_second) * 1e9;

    printf("[L1I] Instruction loop latency: %.2f ns per iteration\n",
           elapsed_ns / reps);
}

int main() {
    printf("Cortex-A57 Cache Benchmark (L1D + L1I)\n");
    printf("--------------------------------------\n");

    benchmark_l1d_bandwidth();
    benchmark_l1d_latency();
    benchmark_l1i_latency();

    return 0;
}

gcc   -O2   -o a57_cache_benchmark   a57_cache_benchmark.c

Another useful tool is mixbench.

(thx)

Topic		Replies	Views
CL DVFS : LUT Interpretation Jetson TK1	5	1638	October 12, 2015
New TK1 trying to restore and not having much luck Jetson TK1	10	1282	October 18, 2021
Tegra TK1 chip_personality Jetson TK1	9	1519	October 18, 2021
Stability of Jetson Nano together with DeepStream DeepStream SDK	10	1113	October 12, 2021
CPU cores always run at highest frequency... why? Jetson Nano kernel	26	1442	October 18, 2021
Jetson TK1 output issue Jetson TK1	23	1032	January 3, 2024
Kernel panic when I built-in the audio driver Jetson Xavier NX audio	9	282	June 19, 2024
Jetson Nano booting time Jetson Nano	5	2103	February 6, 2020
EMC clock stuck at 204 MHz Jetson Nano kernel	11	2159	October 15, 2021
Keep `uart1_tx` low at boot Jetson AGX Xavier uart	5	644	March 23, 2023

Effects of: '.tune1_low = 0x012207FF' from file tegra/clk-tegra124-dfll-fcpu.c (4.9.201-tegra, L4T 32.5.2)

Related topics