What is the best way to load global memory?

Hi,
I think load1 is the best way to load global memory to achieve the highest bandwidth, but I see code like load4 somewhere and it’s 2~3% faster than load1, why?
Tested on A100.

#include <cstdio>

#include <ctime>

constexpr int n = 1 << 30;

constexpr int BLOCK = 1024;

constexpr int B = 4;

__global__ void load1(const float *__restrict__ src, float *__restrict__ dst)

{

    const int X = threadIdx.x;

    const int st = blockIdx.x * (n / BLOCK);

    float sum = 0;

#pragma runroll

    for (int i = 0; i < n / BLOCK; i += 32)

    {

        sum += src[st + i + X];

    }

    dst[blockIdx.x * 32 + X] = sum;

}

__global__ void load4(const float *__restrict__ src, float *__restrict__ dst)

{

    const int X = threadIdx.x;

    const int st = blockIdx.x * (n / BLOCK);

    float sum = 0;

#pragma unroll

    for (int i = 0; i < n / BLOCK; i += B * 32)

    {

#pragma unroll

        for (int j = 0; j < B; j++)

        {

            sum += src[st + i + X * B + j];

        }

    }

    dst[blockIdx.x * 32 + X] = sum;

}

int main()

{

    float *a, *b;

    cudaMalloc(&a, n * sizeof(float));

    cudaMalloc(&b, BLOCK * 32);

    for (int i = 0; i < 10; i++)

    {

        clock_t st, en;

        cudaMemset(a, 0, n * sizeof(float));

        cudaDeviceSynchronize();

        st = clock();

        load1<<<BLOCK, 32>>>(a, b);

        cudaDeviceSynchronize();

        en = clock();

        clock_t t1 = en - st;

        cudaMemset(a, 0, n * sizeof(float));

        cudaDeviceSynchronize();

        st = clock();

        load4<<<BLOCK, 32>>>(a, b);

        cudaDeviceSynchronize();

        en = clock();

        clock_t t4 = en - st;

        printf("%ld %ld %lf\n", t1, t4, t1 * 1.0 / t4);

    }

    cudaFree(a);

    cudaFree(b);

}

please don’t post code as pictures

Sry, changed.