It’s not a trivial matter to write such a test.
You could try something like this. On my L4 GPU it gets me into the 6-7 TOps/s range, and studying the SASS (for cc8.9) shows an inner loop that is mostly integer adds. I haven’t really tried to make the calculation of TOps/s correct so don’t use this as an actual calibrated measurement, but if you run this on a GPU that has twice the integer throughput, the reported number should be twice as large. You’ll want to set num_SM
to the number of multiprocessors in the GPU you are running on (can be gotten from deviceQuery
) and compile for cc8.9 if running on Ada GPU (such as 4080) or cc12.0 for your 5080. Note that your machine is not configured correctly to run this test on a cc12.0 GPU, at least not to my liking (it should work, but it will be hard to analyze the SASS if needed):
That means you have proper driver support (12.8) for your 5080 but not proper CUDA toolkit support (12.3) for your 5080. If you compile with PTX (e.g. cc8.9 PTX), then the driver should JIT the code for your 5080, and things should “work”, but you wouldn’t be able to easily inspect the SASS. It would be better to actually load CUDA toolkit 12.8 on that machine, and compile explicitly for cc12.0
And you’ll have to modify the timing code for your windows machine.
Here is my test case, use at your own risk, does not give calibrated results:
# cat t335.cu
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
const int AL = 4;
template <typename T>
__global__ void addKernel(T * __restrict__ c, const T * __restrict__ a, const int l)
{
T b[AL];
int index = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < AL; i++) b[i] = a[index + i * gridDim.x*blockDim.x];
T val = 0;
#pragma unroll AL
for (int i = 0; i < l; i++)
{
val += b[i];
}
if (val == 0) c[index] = val;
}
using mt = int;
const int num_SM = 58;
const int blk_per_SM = 3;
const int nBLK = blk_per_SM*num_SM;
const int nTPB = 512;
const int mnr = 8192;
int main(int argc, char *argv[]){
int nr = mnr;
if (argc > 1) nr = atoi(argv[1]);
mt *c, *a;
cudaMalloc(&a, AL*nBLK*nTPB*sizeof(a[0]));
cudaMemset(a,1, AL*nBLK*nTPB*sizeof(a[0]));
cudaMalloc(&c, nBLK*nTPB*sizeof(c[0]));
addKernel<<<nBLK, nTPB>>>(c, a, nr); // warm-up
cudaDeviceSynchronize();
unsigned long long dt = dtime_usec(0);
addKernel<<<nBLK, nTPB>>>(c, a, nr);
cudaDeviceSynchronize();
dt = dtime_usec(dt);
unsigned long long iops = nBLK*nTPB*(unsigned long long)nr;
unsigned long long iopps = iops/dt;
std::cout << "usec: " << dt << std::endl;
std::cout << "iops: " << iops << std::endl;
std::cout << "iops/s: " << iopps/1e6 << "TOps/s" << std::endl;
}
# nvcc -arch=sm_89 t335.cu -o t335 -lineinfo -Xptxas=-v
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z9addKernelIiEvPT_PKS0_i' for 'sm_89'
ptxas info : Function properties for _Z9addKernelIiEvPT_PKS0_i
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 16 registers, 372 bytes cmem[0]
# ./t335 100000
usec: 1238
iops: 8908800000
iops/s: 7.19612TOps/s
#
CUDA 12.2 SASS:
code for sm_89
Function : _Z9addKernelIiEvPT_PKS0_i
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */
/* 0x000fc400078e00ff */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0x0000000000007919 */
/* 0x000e220000002500 */
/*0020*/ IMAD.MOV.U32 R11, RZ, RZ, c[0x0][0x170] ; /* 0x00005c00ff0b7624 */
/* 0x000fe200078e00ff */
/*0030*/ ULDC.64 UR6, c[0x0][0x118] ; /* 0x0000460000067ab9 */
/* 0x000fe20000000a00 */
/*0040*/ IMAD.MOV.U32 R5, RZ, RZ, c[0x0][0x0] ; /* 0x00000000ff057624 */
/* 0x000fe200078e00ff */
/*0050*/ S2R R3, SR_TID.X ; /* 0x0000000000037919 */
/* 0x000e220000002100 */
/*0060*/ IMAD.MOV.U32 R13, RZ, RZ, 0x4 ; /* 0x00000004ff0d7424 */
/* 0x000fe200078e00ff */
/*0070*/ ISETP.GE.AND P0, PT, R11, 0x1, PT ; /* 0x000000010b00780c */
/* 0x000fe20003f06270 */
/*0080*/ IMAD R0, R0, c[0x0][0x0], R3 ; /* 0x0000000000007a24 */
/* 0x001fc800078e0203 */
/*0090*/ IMAD R4, R5, c[0x0][0xc], R0 ; /* 0x0000030005047a24 */
/* 0x000fc800078e0200 */
/*00a0*/ IMAD R6, R5, c[0x0][0xc], R4 ; /* 0x0000030005067a24 */
/* 0x000fc800078e0204 */
/*00b0*/ IMAD R8, R5, c[0x0][0xc], R6 ; /* 0x0000030005087a24 */
/* 0x000fe200078e0206 */
/*00c0*/ @!P0 BRA 0x350 ; /* 0x0000028000008947 */
/* 0x000fea0003800000 */
/*00d0*/ IMAD.WIDE.U32 R4, R4, R13, c[0x0][0x168] ; /* 0x00005a0004047625 */
/* 0x000fc800078e000d */
/*00e0*/ IMAD.WIDE.U32 R6, R6, R13.reuse, c[0x0][0x168] ; /* 0x00005a0006067625 */
/* 0x080fe400078e000d */
/*00f0*/ LDG.E.CONSTANT R5, [R4.64] ; /* 0x0000000604057981 */
/* 0x000164000c1e9900 */
/*0100*/ IMAD.WIDE.U32 R8, R8, R13.reuse, c[0x0][0x168] ; /* 0x00005a0008087625 */
/* 0x080fe400078e000d */
/*0110*/ LDG.E.CONSTANT R6, [R6.64] ; /* 0x0000000606067981 */
/* 0x000164000c1e9900 */
/*0120*/ IMAD.WIDE.U32 R2, R0, R13, c[0x0][0x168] ; /* 0x00005a0000027625 */
/* 0x000fe400078e000d */
/*0130*/ LDG.E.CONSTANT R9, [R8.64] ; /* 0x0000000608097981 */
/* 0x000168000c1e9900 */
/*0140*/ LDG.E.CONSTANT R2, [R2.64] ; /* 0x0000000602027981 */
/* 0x000162000c1e9900 */
/*0150*/ IADD3 R10, R11, -0x1, RZ ; /* 0xffffffff0b0a7810 */
/* 0x000fc80007ffe0ff */
/*0160*/ ISETP.GE.U32.AND P0, PT, R10, 0x3, PT ; /* 0x000000030a00780c */
/* 0x000fe20003f06070 */
/*0170*/ UMOV UR4, URZ ; /* 0x0000003f00047c82 */
/* 0x000fe20008000000 */
/*0180*/ LOP3.LUT R11, R11, 0x3, RZ, 0xc0, !PT ; /* 0x000000030b0b7812 */
/* 0x000fe200078ec0ff */
/*0190*/ IMAD.MOV.U32 R10, RZ, RZ, RZ ; /* 0x000000ffff0a7224 */
/* 0x000fc600078e00ff */
/*01a0*/ ISETP.NE.AND P1, PT, R11, RZ, PT ; /* 0x000000ff0b00720c */
/* 0x000fce0003f25270 */
/*01b0*/ @!P0 BRA 0x230 ; /* 0x0000007000008947 */
/* 0x000fec0003800000 */
/*01c0*/ IADD3 R3, -R11, c[0x0][0x170], RZ ; /* 0x00005c000b037a10 */
/* 0x001fc80007ffe1ff */
/*01d0*/ IADD3 R3, R3, -0x4, RZ ; /* 0xfffffffc03037810 */
/* 0x000fe20007ffe0ff */
/*01e0*/ UIADD3 UR4, UR4, 0x4, URZ ; /* 0x0000000404047890 */
/* 0x000fe2000fffe03f */
/*01f0*/ IADD3 R10, R5, R2, R10 ; /* 0x00000002050a7210 */
/* 0x020fe40007ffe00a */
/*0200*/ ISETP.NE.AND P0, PT, R3, RZ, PT ; /* 0x000000ff0300720c */
/* 0x000fe40003f05270 */
/*0210*/ IADD3 R10, R9, R6, R10 ; /* 0x00000006090a7210 */
/* 0x000fd60007ffe00a */
/*0220*/ @P0 BRA 0x1d0 ; /* 0xffffffa000000947 */
/* 0x000fea000383ffff */
/*0230*/ @!P1 BRA 0x330 ; /* 0x000000f000009947 */
/* 0x001fea0003800000 */
/*0240*/ USHF.L.U32 UR4, UR4, 0x2, URZ ; /* 0x0000000204047899 */
/* 0x000fcc000800063f */
/*0250*/ MOV R4, UR4 ; /* 0x0000000400047c02 */
/* 0x000fc80008000f00 */
/*0260*/ ISETP.EQ.AND P1, PT, R4.reuse, RZ, PT ; /* 0x000000ff0400720c */
/* 0x040fe40003f22270 */
/*0270*/ ISETP.EQ.AND P2, PT, R4.reuse, 0x4, PT ; /* 0x000000040400780c */
/* 0x040fe40003f42270 */
/*0280*/ ISETP.EQ.AND P3, PT, R4.reuse, 0x8, PT ; /* 0x000000080400780c */
/* 0x040fe40003f62270 */
/*0290*/ IADD3 R11, R11, -0x1, RZ ; /* 0xffffffff0b0b7810 */
/* 0x000fe40007ffe0ff */
/*02a0*/ ISETP.EQ.AND P0, PT, R4.reuse, 0xc, PT ; /* 0x0000000c0400780c */
/* 0x040fe40003f02270 */
/*02b0*/ IADD3 R4, R4, 0x4, RZ ; /* 0x0000000404047810 */
/* 0x000fc60007ffe0ff */
/*02c0*/ @P1 IMAD.MOV.U32 R3, RZ, RZ, R2 ; /* 0x000000ffff031224 */
/* 0x020fe200078e0002 */
/*02d0*/ ISETP.NE.AND P1, PT, R11, RZ, PT ; /* 0x000000ff0b00720c */
/* 0x000fe20003f25270 */
/*02e0*/ @P2 IMAD.MOV.U32 R3, RZ, RZ, R5 ; /* 0x000000ffff032224 */
/* 0x000fe400078e0005 */
/*02f0*/ @P3 IMAD.MOV.U32 R3, RZ, RZ, R6 ; /* 0x000000ffff033224 */
/* 0x000fc800078e0006 */
/*0300*/ @P0 MOV R3, R9 ; /* 0x0000000900030202 */
/* 0x000fca0000000f00 */
/*0310*/ IMAD.IADD R10, R10, 0x1, R3 ; /* 0x000000010a0a7824 */
/* 0x000fe200078e0203 */
/*0320*/ @P1 BRA 0x260 ; /* 0xffffff3000001947 */
/* 0x000fea000383ffff */
/*0330*/ ISETP.NE.AND P0, PT, R10, RZ, PT ; /* 0x000000ff0a00720c */
/* 0x000fda0003f05270 */
/*0340*/ @P0 EXIT ; /* 0x000000000000094d */
/* 0x000fea0003800000 */
/*0350*/ IMAD.WIDE R2, R0, R13, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x020fca00078e020d */
/*0360*/ STG.E [R2.64], RZ ; /* 0x000000ff02007986 */
/* 0x000fe2000c101906 */
/*0370*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0380*/ BRA 0x380; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
(the inner loop starts at 01b0, to 0220, and the first 2 IADD3 instructions are for loop control, not the arithmetic of interest. I haven’t done a proper job of counting actual integer ops here.)
I threw this together “hastily”. It may have defects. Someone else may have something better.
Later: Don’t use this code, it has a bug, discussed later.