Hello, I’m trying to understand why the following optimization techniques from [1], applied to this simple saxpy kernel, do not seem to work.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define CUDA_CHECK_ERROR(call) do { cudaError_t error_ = (call); if (error_ != cudaSuccess) printf("CUDA Error: %s\nMessage: %s\nLine: %i\n", cudaGetErrorName(error_), cudaGetErrorString(error_), __LINE__); } while (0)
#define BLOCK_DIM 128
#define ILP_COUNT 4
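// Baseline: one element per thread, bounds-checked.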
__global__ void saxpy(float a, const float* x, float* y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}
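// Grid-stride loop variant: launched with one "wave" of blocks; each thread strides over the whole array.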
__global__ void saxpy_gsl(float a, const float* x, float* y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for (; i < n; i += gridDim.x * blockDim.x) {
        y[i] += a * x[i];
    }
}
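// Atomic variant: the read-modify-write of y[i] is performed by atomicAdd instead of an explicit load and store.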
__global__ void saxpy_atomic(float a, const float* x, float* y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        atomicAdd(y + i, a * x[i]);
    }
}
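// ILP variant: each of the first nILP blocks processes ILP_COUNT elements per thread at compile-time
// offsets of BLOCK_DIM; any remaining elements are covered by the bounds-checked access at the end.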
__global__ void saxpy_ilp(float a, const float* x, float* y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nILP = n / (ILP_COUNT * blockDim.x);
    if (blockIdx.x < nILP) {
        int j = (ILP_COUNT - 1) * blockIdx.x * blockDim.x + i;
        #pragma unroll
        for (int k = 0; k < ILP_COUNT; ++k) {
            y[j] += a * x[j];
            j += BLOCK_DIM;
        }
    }
    i += ILP_COUNT * blockDim.x * nILP;
    if (i < n) {
        y[i] += a * x[i];
    }
}
static float* temp;
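// Fills x and y with 0..n-1, launches the given kernel, and counts mismatches against the expected a * i + i.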
static void test(const char* name, const void* func, int gridDim, int blockDim, float a, float* x, float* y, int n) {
    void* args[] = { &a, &x, &y, &n };
    for (int i = 0; i < n; ++i) {
        temp[i] = (float)i;
    }
    CUDA_CHECK_ERROR(cudaMemcpy(x, temp, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK_ERROR(cudaMemcpy(y, temp, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK_ERROR(cudaLaunchKernel(func, gridDim, blockDim, args));
    CUDA_CHECK_ERROR(cudaMemcpy(temp, y, n * sizeof(float), cudaMemcpyDeviceToHost));
    int errorCount = 0;
    for (int i = 0; i < n; ++i) {
        if (temp[i] != a * i + i) {
            ++errorCount;
        }
    }
    printf("%s Error Count: %i\n", name, errorCount);
}
int main() {
    CUDA_CHECK_ERROR(cudaSetDevice(0));
    int smCount;
    int smThreadCount;
    CUDA_CHECK_ERROR(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0));
    CUDA_CHECK_ERROR(cudaDeviceGetAttribute(&smThreadCount, cudaDevAttrMaxThreadsPerMultiProcessor, 0));
    int wave = smCount * smThreadCount;
    int n = 64 * wave;
    assert(n >= ILP_COUNT * BLOCK_DIM);
    int gridDim = (n + BLOCK_DIM - 1) / BLOCK_DIM;
    int gridDimGSL = wave / BLOCK_DIM;
    int gridDimILP = (n + ILP_COUNT * BLOCK_DIM - 1) / (ILP_COUNT * BLOCK_DIM);
    temp = (float*)malloc(n * sizeof(float));
    assert(temp != nullptr);
    float a = 2.0f;
    float* x;
    float* y;
    CUDA_CHECK_ERROR(cudaMalloc((void**)&x, 2 * n * sizeof(float)));
    CUDA_CHECK_ERROR(cudaMalloc((void**)&y, 2 * n * sizeof(float)));
test("Baseline", &saxpy, gridDim, BLOCK_DIM, a, x, y, n);
test("Grid-Stride Loop", &saxpy_gsl, gridDimGSL, BLOCK_DIM, a, x, y, n);
test("Atomic", &saxpy_atomic, gridDim, BLOCK_DIM, a, x, y, n);
test("Instruction-Level Parallelism", &saxpy_ilp, gridDimILP, BLOCK_DIM, a, x, y, n);
    CUDA_CHECK_ERROR(cudaFree(x));
    CUDA_CHECK_ERROR(cudaFree(y));
    free(temp);
}
I compiled the code in release mode on an RTX 3060 12 GB and profiled it with Nsight Compute and Nsight Systems. Additionally, I tried this on an RTX 3070 and a GTX 1060, with the same relative results. I launched an exact multiple of the resident thread count of these GPUs (one "wave") to isolate the measurements from the tail effect [2]. All kernels perform roughly the same for all n > wave. Confusingly, for n <= wave, all optimizations seem to outperform the baseline.
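The relative timings above come from the profilers; for a standalone comparison, each launch could also be wrapped in CUDA events, roughly like this (just a sketch; timeKernel is a hypothetical helper that is not part of the code above and reuses CUDA_CHECK_ERROR):
// Hypothetical helper: returns the elapsed time of one launch in milliseconds.
static float timeKernel(const void* func, int gridDim, int blockDim, void** args) {
    cudaEvent_t start, stop;
    CUDA_CHECK_ERROR(cudaEventCreate(&start));
    CUDA_CHECK_ERROR(cudaEventCreate(&stop));
    CUDA_CHECK_ERROR(cudaEventRecord(start));
    CUDA_CHECK_ERROR(cudaLaunchKernel(func, gridDim, blockDim, args));
    CUDA_CHECK_ERROR(cudaEventRecord(stop));
    CUDA_CHECK_ERROR(cudaEventSynchronize(stop));
    float ms = 0.0f;
    CUDA_CHECK_ERROR(cudaEventElapsedTime(&ms, start, stop));
    CUDA_CHECK_ERROR(cudaEventDestroy(start));
    CUDA_CHECK_ERROR(cudaEventDestroy(stop));
    return ms;
}
Here is what I expected from each variant: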
- Grid-Stride Loop: From my understanding, the grid-stride loop should reduce the runtime by getting rid of the block-scheduling overhead. I didn’t expect the kernel to improve drastically, but it seems to get slightly worse instead (for large n).
- AtomicAdd: I expected a pretty good performance increase because this kernel is clearly memory bound, and the atomic halves the amount of data that needs to be loaded into registers: only x needs to be loaded, and the addition of a * x[i] to y[i] is done in L2 via the atomic operation instead. Nsight Compute confirms this, but somehow the kernel performs equally well.
- Instruction-Level Parallelism: This kernel should improve the performance quite a bit. The idea is to let one thread compute multiple elements, like the grid-stride loop, but by using a compile-time offset the compiler can recognize that these elements are independent and interleave them, so it keeps issuing instructions while other elements are still waiting on their dependencies (see the sketch after this list for what I mean). I also confirmed this by looking at the generated code in Nsight Compute’s source view. Still, this kernel does not perform better than the simple one.
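To make the last point concrete, the interleaving I am hoping the compiler derives from saxpy_ilp is essentially what this hand-written variant spells out (just a sketch using the same ILP_COUNT/BLOCK_DIM definitions; saxpy_ilp_explicit is not part of the benchmark above, assumes n is a multiple of ILP_COUNT * BLOCK_DIM, and omits the tail handling):
__global__ void saxpy_ilp_explicit(float a, const float* x, float* y, int n) {
    int base = ILP_COUNT * blockIdx.x * blockDim.x + threadIdx.x;
    if (base + (ILP_COUNT - 1) * BLOCK_DIM >= n) return; // tail handling omitted in this sketch
    float xr[ILP_COUNT];
    float yr[ILP_COUNT];
    #pragma unroll
    for (int k = 0; k < ILP_COUNT; ++k) {
        // independent loads: all ILP_COUNT of them can be in flight at the same time
        xr[k] = x[base + k * BLOCK_DIM];
        yr[k] = y[base + k * BLOCK_DIM];
    }
    #pragma unroll
    for (int k = 0; k < ILP_COUNT; ++k) {
        // FMAs and stores issue once the corresponding loads have returned
        y[base + k * BLOCK_DIM] = yr[k] + a * xr[k];
    }
}
The point is that the ILP_COUNT loads per thread have no dependencies between them, so they can all be outstanding before the first FMA executes.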
If anyone has any ideas I would really appreciate it, thanks!
[1] https://developer.nvidia.com/blog/boosting-cuda-efficiency-with-essential-techniques-for-new-developers/
[2] https://developer.nvidia.com/blog/cuda-pro-tip-minimize-the-tail-effect/