Effects of: '.tune1_low = 0x012207FF' from file tegra/clk-tegra124-dfll-fcpu.c (4.9.201-tegra, L4T 32.5.2)

Hello, what is the meaning of the line ‘.tune1_low = 0x012207FF,’ and what effect does it have on the system?
Where can I find information about ‘@tune1: DFLL tuning register 1’?
( from clk-dfll.h,
Tegra_X1_TRM_DP07225001_v1.3p.pdf, page 205, 6.1.5 CL_DVFS_TUNE1_0

6.1.5 CL_DVFS_TUNE1_0
Bit Reset Description
31:23 0x0 DFLL_TUNE1_DLY_FINE: Input bits to tune the two phases of the clock. 8 bits to tune, and 1 bit to
choose high vs. low. Drives I_DLY_FINE<8:0>.
22:12 0x0 DFLL_TUNE1_DLY_SRAM: Input bits to both coarse (3) and fine (5) tune the delay of the SRAM-path-like
chain. Drives I_DLY_SRAM<10:0> of DVCO macro
11 0x0 DFLL_TUNE1_DLY_SPARE1: Drives I_DLY_SPARE<16> of DVCO macro
10:0 0x0 DFLL_TUNE1_DLY_WIRE: Input bits to both coarse (2) and fine (9) tune the delay of wire dominated
path. Drives I_DLY_WIRE<10:0>.

0x012207FF
0b1001000100000011111111111
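0b0000 0001 0.010 0010 0000 .0.111 1111 1111 (MSB first; the dots mark the field boundaries 31:23 | 22:12 | 11 | 10:0, i.e. DLY_FINE = 0x002, DLY_SRAM = 0x220, DLY_SPARE1 = 0, DLY_WIRE = 0x7FF)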
)

(thx)

from Kernel sources (4.9.201-tegra, L4T 32.5.2), drivers/clk/tegra/clk-tegra124-dfll-fcpu.c, ln ~620:

´#define CPUB01_CVB_TABLE
.speedo_scale = 100,
.voltage_scale = 1000,
.entries = {
/* f c0, c1, c2 */
{ 204000000UL, { 721589, -12695, 27 } },
{ 306000000UL, { 747134, -14195, 27 } },
{ 408000000UL, { 776324, -15705, 27 } },
{ 510000000UL, { 809160, -17205, 27 } },
{ 612000000UL, { 845641, -18715, 27 } },
{ 714000000UL, { 885768, -20215, 27 } },
{ 816000000UL, { 929540, -21725, 27 } },
{ 918000000UL, { 976958, -23225, 27 } },
{ 1020000000UL, { 1028021, -24725, 27 } },
{ 1122000000UL, { 1082730, -26235, 27 } },
{ 1224000000UL, { 1141084, -27735, 27 } },
{ 1326000000UL, { 1203084, -29245, 27 } },
{ 1428000000UL, { 1268729, -30745, 27 } },
{ 1581000000UL, { 1374032, -33005, 27 } },
[etc.]
{ 0, { } },
},
.vmin_coefficients = { 620000, 0, 0 },
.cpu_dfll_data = {
.tune0_low = 0x0000FFCF,
.tune1_low = 0x012207FF,
.tune1_high = 0x03FFF7FF,
.tune_high_min_millivolts = 850,
.tune_high_margin_millivolts = 38,
.dvco_calibration_max = ULONG_MAX,
},
.cvb_version = “FCPU Table - p4v3”
´

Hi,
All the details are in the TRM. If you have further questions, please share what issue you are experiencing, and we can check whether we can suggest next steps.

Hello,

the system is stable, with idle temperatures (on a relatively small heatsink) of about 29-35°C (84-95°F) and up to ~52/53°C (125/127°F) with (glmark2 --off-screen ~2750 or) all 4 CPUs at almost the highest frequency level, e.g. when natively compiling the ‘4.9.201-tegra’ kernel, which was reduced from ~55 min to ~30-35 min (tegra_defconfig, without kernel debug options).

To understand the long-term effects: the kernel patch uses a different value for .tune1_low, changing it from
‘0x012207FF, 0b1,0 010,0010,0000, 0 111,1111,1111’ to
‘0x016607FF, 0b1,0 110,0110,0000, 0 111,1111,1111’.
Only the DFLL_TUNE1_DLY_SRAM field (bits 22:12) differs: 0x220 versus 0x660. What is the meaning or effect of these different values for the Tegra X1 SRAM?

Is this a suitable tool for benchmarking the SRAM (bandwidth, latency)?

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include <unistd.h>

#define CACHE_LINE_SIZE 64
#define L1D_SIZE (32 * 1024)
#define L1I_SIZE (48 * 1024)
#define ITERATIONS (1000000)

static inline uint64_t read_cycles() {
    uint64_t val;
    asm volatile("mrs %0, cntvct_el0" : "=r"(val));
    return val;
}

static inline uint64_t read_freq() {
    uint64_t freq;
    asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
    return freq;
}

// --- L1D bandwidth test (streaming read from hot data) ---
/*
 * Repeatedly stream-read an L1D_SIZE-byte buffer and print the achieved read
 * rate in MB/s (MiB, i.e. 1024*1024 bytes, per second).
 *
 * Every byte that is counted is actually loaded: the buffer is read as
 * consecutive 64-bit words. (Touching only one byte per cache line while
 * counting the whole line overstates the rate by up to 64x.)
 *
 * The accumulator is a plain local so the inner loop is not dominated by
 * volatile stack traffic; an empty asm barrier after each pass keeps the
 * compiler from collapsing the repeated passes into one.
 */
void benchmark_l1d_bandwidth(void) {
    enum { REPS = 1000 };
    const size_t words = L1D_SIZE / sizeof(uint64_t);

    uint64_t *buffer = aligned_alloc(CACHE_LINE_SIZE, L1D_SIZE);
    if (!buffer) { perror("alloc"); return; }

    memset(buffer, 1, L1D_SIZE);

    uint64_t sum = 0;
    uint64_t start = read_cycles();
    for (int rep = 0; rep < REPS; ++rep) {
        for (size_t i = 0; i < words; ++i) {
            sum += buffer[i];
        }
        /* Compiler-only barrier: makes `sum` observable and treats the
         * buffer as possibly modified, so each pass really reloads it. */
        __asm__ volatile("" : "+r"(sum) : "r"(buffer) : "memory");
    }
    uint64_t end = read_cycles();

    double freq = (double)read_freq();
    double elapsed_s = (double)(end - start) / freq;
    if (!(elapsed_s > 0.0)) {
        /* Timer too coarse (or frequency unreadable): avoid dividing by zero. */
        fprintf(stderr, "[L1D] Bandwidth: elapsed time not measurable\n");
        free(buffer);
        return;
    }

    double total_bytes = (double)L1D_SIZE * REPS;
    double bandwidth_MBps = (total_bytes / (1024.0 * 1024.0)) / elapsed_s;

    printf("[L1D] Bandwidth: %.2f MB/s\n", bandwidth_MBps);
    free(buffer);
}

// --- L1D latency test (dependent-load pointer chase) ---
/*
 * Measure the average time of one dependent load by chasing a circular chain
 * of pointers stored in an L1D_SIZE-byte buffer, and print it in ns.
 *
 * Each load's address comes from the previous load, so the loads cannot
 * overlap. Note that the chain is sequential (element i points to i + 1);
 * it serialises the loads but does not by itself defeat a hardware
 * prefetcher.
 *
 * The chase pointer is a plain local that is passed through an empty asm
 * barrier on every step. Without that, nothing consumes the final pointer
 * and the compiler is free to delete the whole loop.
 */
void benchmark_l1d_latency(void) {
    size_t count = L1D_SIZE / sizeof(void*);
    void **buffer = aligned_alloc(CACHE_LINE_SIZE, count * sizeof *buffer);
    if (!buffer) { perror("alloc"); return; }

    for (size_t i = 0; i < count; ++i)
        buffer[i] = &buffer[(i + 1) % count]; // loop around

    void **p = buffer[0];

    uint64_t start = read_cycles();
    for (size_t i = 0; i < ITERATIONS; ++i) {
        p = *p;
        /* Compiler-only barrier: forces the load above to be performed and
         * its result to be kept; emits no instructions. */
        __asm__ volatile("" : "+r"(p) : : "memory");
    }
    uint64_t end = read_cycles();

    double freq = (double)read_freq();
    double total_time_ns = ((double)(end - start) / freq) * 1e9;
    double latency_ns = total_time_ns / ITERATIONS;

    printf("[L1D] Latency: %.2f ns\n", latency_ns);
    free(buffer);
}

// --- L1I latency test (execute from tight loop) ---
/*
 * Small busy loop: adds 0 .. n-1 into a volatile accumulator so the loop is
 * not optimised away. Does nothing when n <= 0.
 *
 * The accumulator is unsigned on purpose: for n = 100000 the running total
 * reaches 4,999,950,000, which overflows a signed int (undefined behaviour),
 * whereas unsigned arithmetic simply wraps.
 */
__attribute__((noinline))
void hot_loop(int n) {
    volatile unsigned int sum = 0;
    for (int i = 0; i < n; ++i) {
        sum += (unsigned int)i;
    }
}

/*
 * Time one call of hot_loop() and print the average time per loop iteration
 * in nanoseconds. The tick delta from read_cycles() is converted to time
 * using the value returned by read_freq().
 */
void benchmark_l1i_latency() {
    enum { LOOP_COUNT = 100000 };

    const uint64_t t0 = read_cycles();
    hot_loop(LOOP_COUNT);
    const uint64_t t1 = read_cycles();

    const double ticks_per_s = (double)read_freq();
    const double elapsed_ns = ((t1 - t0) / ticks_per_s) * 1e9;
    const double per_iter_ns = elapsed_ns / LOOP_COUNT;

    printf("[L1I] Instruction loop latency: %.2f ns per iteration\n", per_iter_ns);
}

int main() {
    printf("Cortex-A57 Cache Benchmark (L1D + L1I)\n");
    printf("--------------------------------------\n");

    benchmark_l1d_bandwidth();
    benchmark_l1d_latency();
    benchmark_l1i_latency();

    return 0;
}
gcc   -O2   -o a57_cache_benchmark   a57_cache_benchmark.c

Another useful tool is mixbench.

(thx)