As a first step, I confirmed that the time to access constant memory grows pretty much linearly with the amount of address divergence across the warp. The kernel uses a 32-bit mask, where each bit controls whether the address for that warp lane is taken from the loop counter (shared by all threads in the warp) or a simple PRNG (initialized with a different seed for each thread). To make the memory access dominant and minimize overhead, 16 sequential loads are performed starting at the array index thus determined. When looking at the data below, keep in mind that when there are, for example, two lanes using a random index, that is a total of three different addresses across the warp. I verified that various random lane combinations specifying the same number of random lanes also have identical performance, as we would expect (e.g. mask=0x55555555, mask=0xaaaaaaaa, mask=0xcccccccc). Elapsed time is reported in seconds. I ran with CUDA 11.1 on a Quadro RTX 4000 (sm_75).
constant memory LUT (SHARED_TAB=0):
mask=00000000 rnd_lanes=0 elapsed = 0.0020027
mask=80000000 rnd_lanes=1 elapsed = 0.0038806
mask=80008000 rnd_lanes=2 elapsed = 0.0057145
mask=80808080 rnd_lanes=4 elapsed = 0.009453
mask=88888888 rnd_lanes=8 elapsed = 0.0167938
mask=aaaaaaaa rnd_lanes=16 elapsed = 0.0310192
mask=FFFFFFFF rnd_lanes=32 elapsed = 0.0568734
shared memory LUT (SHARED_TAB=1):
mask=00000000 rnd_lanes=0 elapsed = 0.0019839
mask=80000000 rnd_lanes=1 elapsed = 0.0020394
mask=80008000 rnd_lanes=2 elapsed = 0.0021156
mask=80808080 rnd_lanes=4 elapsed = 0.0023051
mask=88888888 rnd_lanes=8 elapsed = 0.0026492
mask=aaaaaaaa rnd_lanes=16 elapsed = 0.0032692
mask=FFFFFFFF rnd_lanes=32 elapsed = 0.0034731
As expected, for code that is nearly completely bound by LUT accesses, a LUT in shared memory provides more robust performance characteristics compared to a LUT in constant memory in the presence of intra-warp address divergence.
The real-life kernels I looked at in the past featured a bunch of other work in addition to LUT accesses, and I found with those that between two-way and four-way address divergence for constant memory LUTs had a negligible performance impact. I no longer have access to those codes; the question I was researching at the time was whether these LUTs could profitably be exchanged for some simple computation, and the result was that this did not have a positive performance impact.
I think the conclusion here is that careful timing and/or profiling with realistic data is indispensable when deciding whether to use LUTs and if so, what memory to place them in.
My little test program is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define NUM_BLK (1024)
#define NUM_THRD (256)
#define NUM_REPS (256)
#define SHARED_TAB (1)
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                            \
do {                                                                    \
    cudaError_t safe_call_err_ = (call);                                \
    if (safe_call_err_ != cudaSuccess) {                                \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                 __FILE__, __LINE__,                                    \
                 cudaGetErrorString(safe_call_err_));                   \
        exit(EXIT_FAILURE);                                             \
    }                                                                   \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                            \
do {                                                                    \
    /* Check synchronous errors, i.e. pre-launch */                     \
    cudaError_t err = cudaGetLastError();                               \
    if (cudaSuccess != err) {                                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                 __FILE__, __LINE__, cudaGetErrorString(err) );         \
        exit(EXIT_FAILURE);                                             \
    }                                                                   \
    /* Check asynchronous errors, i.e. kernel failed (ULF).             \
       BUG FIX: cudaThreadSynchronize() is deprecated (and removed in   \
       CUDA 12); cudaDeviceSynchronize() is the supported equivalent. */\
    err = cudaDeviceSynchronize();                                      \
    if (cudaSuccess != err) {                                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                 __FILE__, __LINE__, cudaGetErrorString(err) );         \
        exit(EXIT_FAILURE);                                             \
    }                                                                   \
} while (0)
// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
/* Wall-clock time in seconds. Uses the high-resolution performance
   counter when available; falls back to GetTickCount() (millisecond
   granularity) otherwise. One-time initialization is cached in
   function-local statics. */
double second (void)
{
    static int probed = 0;             /* have we queried for a HR timer? */
    static BOOL haveHighResTimer;
    static double secsPerTick;         /* reciprocal of counter frequency */
    LARGE_INTEGER counter;

    if (!probed) {
        haveHighResTimer = QueryPerformanceFrequency (&counter);
        secsPerTick = 1.0 / (double)counter.QuadPart;
        probed = 1;
    }
    if (haveHighResTimer) {
        QueryPerformanceCounter (&counter);
        return secsPerTick * (double)counter.QuadPart;
    }
    return (double)GetTickCount() * 1.0e-3;
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
/* Wall-clock time in seconds, with microsecond granularity. */
double second (void)
{
    struct timeval now;
    gettimeofday (&now, NULL);
    return (double)now.tv_sec + 1.0e-6 * (double)now.tv_usec;
}
#else
#error unsupported platform
#endif
/* Population count: returns the number of set bits in 'a'.
   Uses Kernighan's trick — each iteration clears the lowest set bit,
   so the loop runs once per set bit. */
int32_t popc (uint32_t a)
{
    int32_t count = 0;
    while (a) {
        a &= a - 1;   /* clear lowest set bit */
        count++;
    }
    return count;
}
#if !SHARED_TAB
// 272-entry LUT in constant memory: tab[i] == i & 0xff for the first 256
// entries, plus a 16-entry tail repeating 0x00..0x0f. The tail exists so the
// kernel's run of 16 sequential loads starting at any idx in [0,255] stays
// within bounds. The shared-memory variant in the kernel initializes the
// same 272-entry layout.
__constant__ uint32_t tab [272] =
{
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
};
#endif // SHARED_TAB
// Benchmark kernel: measures LUT read cost under controlled intra-warp
// address divergence. Each lane whose bit is set in 'mask' indexes the LUT
// with a per-thread xorshift32 PRNG value; all other lanes use the loop
// counter (uniform across the warp). Per iteration, 16 sequential LUT loads
// starting at the chosen index are accumulated; the sum is written to r[tid]
// to keep the loads live. Expects a launch of NUM_THRD (256) threads per
// block, which the shared-memory initialization below relies on.
__global__ void kernel (uint32_t mask, uint32_t *r)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    uint32_t prng = tid;   // xorshift32 state; distinct seed per thread
    uint32_t lane_select, sum, idx;
#if SHARED_TAB
    // Shared-memory LUT with the same 272-entry layout as the constant
    // version: entries 0..255 hold their own index, entries 256..271 repeat
    // 0..15. Requires blockDim.x == NUM_THRD.
    __shared__ uint32_t tab [272];
    tab [threadIdx.x] = threadIdx.x;
    if (threadIdx.x < 16) tab [threadIdx.x + NUM_THRD] = threadIdx.x;
    // BUG FIX: barrier was missing. Threads read LUT entries written by
    // other threads, so all writes must be visible before any read.
    __syncthreads();
#endif
    lane_select = (1 << (tid & 0x1f)) & mask; // one-hot: nonzero -> random idx
    sum = 0;
    for (int i = 0; i < NUM_REPS; i++) {
        // xorshift32 step (Marsaglia)
        prng ^= prng << 13;
        prng ^= prng >> 17;
        prng ^= prng << 5;
        idx = lane_select ? (prng & 0xff) : i;
        // 16 sequential loads starting at idx.
        // BUG FIX: the hand-unrolled sequence read idx+0..9 and idx+11..16,
        // skipping idx+10 — not the 16 *sequential* loads intended.
        #pragma unroll
        for (int j = 0; j < 16; j++) {
            sum += tab [idx + j];
        }
    }
    r[tid] = sum;
}
// Driver: launches the benchmark kernel twice (warmup + measured pass) and
// prints the divergence mask, the number of random lanes, and the elapsed
// wall-clock time of the measured pass in seconds.
// Optional first command-line argument: the 32-bit divergence mask in hex
// (e.g. "80008000"); defaults to ffffffff (all lanes random) when absent,
// matching the original hard-coded behavior.
int main (int argc, char *argv[])
{
    uint32_t *res_d = 0;
    uint32_t mask = 0xffffffff;   // default: every lane uses a random index
    double start, stop, elapsed = 0.0;

    if (argc > 1) {
        mask = (uint32_t)strtoul (argv[1], NULL, 16);
    }

    CUDA_SAFE_CALL (cudaMalloc ((void **)&res_d, sizeof (res_d[0]) * NUM_BLK * NUM_THRD));
    // First iteration warms up the GPU (context/JIT/clock ramp); only the
    // second iteration's timing is reported.
    for (int k = 0; k < 2; k++) {
        CUDA_SAFE_CALL (cudaDeviceSynchronize()); // quiesce before timing
        start = second();
        kernel <<<NUM_BLK,NUM_THRD>>>(mask, res_d);
        CHECK_LAUNCH_ERROR(); // synchronizes, so 'stop' covers the kernel
        stop = second();
        elapsed = stop - start;
    }
    printf ("mask=%08x rnd_lanes=%d elapsed = %.6g\n",
            mask, popc(mask), elapsed);
    CUDA_SAFE_CALL (cudaFree (res_d)); // BUG FIX: device allocation was leaked
    return EXIT_SUCCESS;
}