Can someone try this bench?
This generates a basically uninterrupted IMAD stream in SASS.
I get about 22 TOPs on my 4090.
Note, I count one IMAD as one op, whereas NVIDIA’s whitepaper counts one IMAD as two ops (1x mul, 1x add). So this matches their whitepaper nicely.
My 4090 is liquid cooled and slightly overclocked, so it makes sense I’m a little higher.
I don’t yet have a 50-series card to test yet :(
#include <cstdint>
#include <chrono>
#include <iostream>
#define BLOCK_SIZE 1024
#define N_REGISTERS 32
#define ITER 10000000
__device__ __forceinline__ void imad(uint32_t &acc, uint32_t a, uint32_t b, uint32_t c) {
asm volatile ("mad.lo.u32 %0, %1, %2, %3;\n"
: "=r"(acc)
: "r"(a), "r"(b), "r"(c));
}
__global__ void k_imad_bench(uint32_t *res) {
uint32_t r[N_REGISTERS] = {0};
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = 0; i < N_REGISTERS; i++) {
r[i] = i + tid;
}
for (int i = 0; i < ITER; i++) {
#pragma unroll
for (int j = 0; j < N_REGISTERS; j++) {
imad(r[j], r[j], r[j], r[j]);
}
}
for (int i = 0; i < N_REGISTERS; i++) {
res[tid] ^= r[i];
}
}
int main() {
cudaDeviceProp dev_prop;
cudaGetDeviceProperties(&dev_prop, 0);
uint32_t *res;
uint64_t total_threads = dev_prop.multiProcessorCount * BLOCK_SIZE;
cudaMallocManaged(&res, sizeof(uint32_t) * total_threads);
auto start = std::chrono::high_resolution_clock::now();
k_imad_bench<<<dev_prop.multiProcessorCount, BLOCK_SIZE>>>(res);
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> elapsed = end - start;
double ms = elapsed.count();
uint64_t total_instructions = total_threads * (uint64_t) ITER * N_REGISTERS;
double seconds = ms / 1000.0;
double throughput = total_instructions / seconds;
printf("Throughput: %e IMAD/sec\n", throughput);
return 0;
}