Hi,
I thought per-thread 32-bit loads (like load1) were the best way to read global memory and reach peak bandwidth, but I have seen code written like load4 elsewhere, and it is consistently 2–3% faster than load1. Why is that?
Tested on A100.
#include <cstdio>
#include <ctime>
// Total number of floats streamed from global memory: 2^30 elements (4 GiB).
constexpr int n = 1 << 30;
// Number of thread blocks launched (NOTE: despite the name, this is the grid
// size; each kernel is launched with 32 threads per block).
constexpr int BLOCK = 1024;
// Consecutive elements read per thread per inner iteration in load4.
constexpr int B = 4;
// Streams this block's contiguous slice of `src` with one 32-bit load per
// thread per iteration and writes one partial sum per thread to `dst`.
// Expected launch: <<<BLOCK, 32>>>; requires (n / BLOCK) % 32 == 0 and
// dst to hold at least BLOCK * 32 floats.
__global__ void load1(const float *__restrict__ src, float *__restrict__ dst)
{
    const int X = threadIdx.x;               // lane id within the 32-thread block
    const int st = blockIdx.x * (n / BLOCK); // start of this block's slice
    float sum = 0;
    // BUGFIX: the original read "#pragma runroll" (typo). nvcc silently
    // ignores unknown pragmas, so the loop was never unrolled as intended —
    // which also biases the load1-vs-load4 comparison this benchmark makes.
#pragma unroll
    for (int i = 0; i < n / BLOCK; i += 32)
    {
        // Adjacent threads read adjacent floats -> fully coalesced warp
        // transactions (contiguous 128-byte segments).
        sum += src[st + i + X];
    }
    dst[blockIdx.x * 32 + X] = sum;
}
// Streams this block's contiguous slice of `src`, each thread reading B
// consecutive floats per outer iteration, and writes one partial sum per
// thread to `dst`. Expected launch: <<<BLOCK, 32>>>; requires
// (n / BLOCK) % (B * 32) == 0 and dst to hold at least BLOCK * 32 floats.
// The B back-to-back 32-bit loads per thread may let the compiler emit a
// single wide (128-bit) load instruction — presumably why this variant
// benchmarks slightly faster than load1.
__global__ void load4(const float *__restrict__ src, float *__restrict__ dst)
{
    const int lane = threadIdx.x;
    const int base = blockIdx.x * (n / BLOCK);
    float acc = 0;
#pragma unroll
    for (int chunk = 0; chunk < n / BLOCK; chunk += B * 32)
    {
        // Each thread owns a run of B consecutive elements inside the chunk.
        const int off = base + chunk + lane * B;
#pragma unroll
        for (int k = 0; k < B; ++k)
            acc += src[off + k];
    }
    dst[blockIdx.x * 32 + lane] = acc;
}
// Benchmarks load1 vs load4 over 10 trials: zero-fills a 4 GiB input, times
// each kernel with wall-clock around a blocking sync, and prints
// "t1 t4 t1/t4" per trial.
int main()
{
    float *a, *b;
    if (cudaMalloc(&a, n * sizeof(float)) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc(a) failed\n");
        return 1;
    }
    // BUGFIX: was cudaMalloc(&b, BLOCK * 32) — bytes, not floats, i.e. 4x too
    // small. Both kernels store BLOCK * 32 floats (dst[blockIdx.x * 32 + X]),
    // so three quarters of the writes landed out of bounds.
    if (cudaMalloc(&b, BLOCK * 32 * sizeof(float)) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc(b) failed\n");
        cudaFree(a);
        return 1;
    }
    for (int i = 0; i < 10; i++)
    {
        clock_t st, en;
        cudaMemset(a, 0, n * sizeof(float));
        cudaDeviceSynchronize(); // make sure the memset is off the clock
        st = clock();
        load1<<<BLOCK, 32>>>(a, b);
        cudaDeviceSynchronize(); // launches are async; block before stopping the timer
        en = clock();
        clock_t t1 = en - st;
        cudaMemset(a, 0, n * sizeof(float));
        cudaDeviceSynchronize();
        st = clock();
        load4<<<BLOCK, 32>>>(a, b);
        cudaDeviceSynchronize();
        en = clock();
        clock_t t4 = en - st;
        // Surface launch/execution failures instead of printing bogus timings.
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
            break;
        }
        // clock_t's underlying type is implementation-defined; cast for %ld.
        printf("%ld %ld %lf\n", (long)t1, (long)t4, t1 * 1.0 / t4);
    }
    cudaFree(a);
    cudaFree(b);
}