Here is a quick benchmark of INT32 throughput based on IMAD (where one IMAD counts as two INT32 ops). On my Quadro RTX 4000, the output is:
testing INT32 op throughput with IMAD (one IMAD = two int ops)
running on device 0 (Quadro RTX 4000)
using 256 threads per block, 524160 blocks, 1.073480 GB used
iops= 4.949010e+12 elapsed=0.58310 sec throughput=8.48737 Tiops (via IMAD)
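As a quick plausibility check of that number (a back-of-the-envelope estimate I am adding here; it assumes the 64 IMAD / cycle / SM figure from the guide's table applies, and reads the nominal boost clock from the device properties):

/* Rough peak estimate for IMAD-based integer throughput. Assumption: 64
   IMAD / cycle / SM, the sm_75 figure from the Programming Guide's table.
   clockRate reports the nominal boost clock in kHz; the clock actually
   sustained under GPU Boost can be higher. */
#include <cstdio>
#include <cuda_runtime.h>

int main (void)
{
    cudaDeviceProp props;
    cudaGetDeviceProperties (&props, 0);
    double imad_per_cycle_per_sm = 64.0;  /* sm_75, per throughput table */
    double peak_tiops = props.multiProcessorCount * imad_per_cycle_per_sm
                        * 2.0              /* int ops per IMAD */
                        * props.clockRate * 1.0e3  /* kHz -> Hz */
                        * 1.0e-12;
    printf ("nominal peak: %.2f Tiops\n", peak_tiops);
    return 0;
}

With the Quadro RTX 4000's 36 SMs at the nominal 1545 MHz boost clock this works out to about 7.1 Tiops, so the measured 8.49 Tiops would imply the card was actually boosting to roughly 1.84 GHz during the run, which seems plausible for Turing.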
According to the throughput table in the CUDA 12.8 Programming Guide, on sm_75 both FFMA and IMAD have the same throughput of 64 instructions / cycle / SM, so this makes sense.
Why this does not jibe with my earlier integer throughput test, I do not know at the moment. According to the throughput table, all simple integer instructions should also have a throughput of 64 instructions / cycle / SM on sm_75, but my earlier test demonstrated only half that rate on real-life code (byte-wise addition with signed saturation). I don't think I dropped a factor of two somewhere, but who knows …
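To make concrete what I mean by byte-wise addition with signed saturation: the earlier test exercised an inner loop conceptually like the sketch below, built on CUDA's __vaddss4() device intrinsic (four per-byte signed saturating adds per call). This is a reconstruction for illustration, not the original test code, and the unroll factor is arbitrary:

/* Sketch of a chained byte-wise signed-saturating add loop, using the
   __vaddss4() intrinsic. Illustrative reconstruction, not the original
   test; unroll depth chosen arbitrarily. */
__global__ void vaddss4_kernel (const uint32_t * __restrict__ src,
                                uint32_t * __restrict__ dst,
                                uint32_t a, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        uint32_t p = src[i];
#pragma unroll 128
        for (int j = 0; j < 128; j++) {
            p = __vaddss4 (p, a);  /* four saturating byte adds per call */
        }
        dst[i] = p;
    }
}

Here is the IMAD benchmark itself: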
#include <cstdlib>
#include <cstdio>
#include <cstdint>
#define DEVICE_ORDINAL (0)
#define THREADS_PER_BLK (256)
#define LEN (65520 * 1024 * 2)
#define STAGES (192)
#define REPS (16)
#define ITER (10)
const int DEPTH = STAGES;
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
/* three dependent 32-bit multiply-adds; each maps to one IMAD instruction */
__device__ uint32_t imad_mix (uint32_t a, uint32_t b, uint32_t c)
{
    c = a * b + c;
    a = b * c + a;
    b = c * a + b;
    return b;
}
__global__ void kernel (const uint32_t * __restrict__ src,
                        uint32_t * __restrict__ dst,
                        uint32_t a, uint32_t b, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        /* scramble the operands a bit; this float math is one-time setup
           per element and is not counted towards the integer-op total */
        uint32_t aa = __sinf(a) * a;
        uint32_t bb = __cosf(b) * b;
        uint32_t p = src[i] * aa + bb;
        uint32_t q = src[i] * bb + aa;
        for (int k = 0; k < REPS; k++) {
#pragma unroll DEPTH
            for (int j = 0; j < DEPTH; j++) {
                /* two independent dependency chains of three IMADs each */
                p = imad_mix (p, bb, aa);
                q = imad_mix (q, bb, aa);
            }
        }
        dst[i] = p * q;
    }
}
int main (void)
{
    double start, stop, elapsed, mintime = 1e308, nbr_of_imad;
    uint32_t *d_a, *d_b;
    struct cudaDeviceProp props;

    printf ("testing INT32 op throughput with IMAD (one IMAD = two int ops)\n");
    CUDA_SAFE_CALL (cudaGetDeviceProperties (&props, DEVICE_ORDINAL));
    printf ("running on device %d (%s)\n", DEVICE_ORDINAL, props.name);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * LEN));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * LEN));

    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * LEN)); // zero

    /* Compute execution configuration */
    dim3 dimBlock(THREADS_PER_BLK);
    int threadBlocks = (LEN + (dimBlock.x - 1)) / dimBlock.x;
    dim3 dimGrid(threadBlocks);

    printf ("using %d threads per block, %d blocks, %f GB used\n",
            dimBlock.x, dimGrid.x, 2*1e-9*LEN*sizeof(d_a[0]));

    /* Time ITER kernel launches and keep the fastest one */
    for (int k = 0; k < ITER; k++) {
        cudaDeviceSynchronize();
        start = second();
        kernel<<<dimGrid,dimBlock>>>(d_a, d_b, 0x5da07326, 0x5102d832, LEN);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) { mintime = elapsed; }
    }

    /* per element: 2*DEPTH*REPS calls to imad_mix() at three IMADs each,
       plus a handful of setup and wrap-up operations */
    nbr_of_imad = (2.0 * DEPTH * REPS + 3.0) * LEN * 3;
    printf ("iops=%13.6e elapsed=%.5f sec throughput=%.5f Tiops (via IMAD)\n",
            nbr_of_imad * 2, mintime, nbr_of_imad * 2 * 1e-12 / mintime);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    return EXIT_SUCCESS;
}
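For anyone who wants to reproduce this: I build with something along the lines of nvcc -arch=sm_75 -o imad_perf imad_perf.cu (adjust the architecture flag for your GPU; the source file name is whatever you saved the listing as).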