You will need to devise different tests for different purposes. Block copies and block stores can tell us something about the throughput of caches and main memory.
For measuring latency, I have used pointer chasing in the past, on CPUs. Using an LFSR, such a test can visit $2^n-1$ locations in a memory block of size $2^n$ elements in “random” order. GPUs are designed as throughput machines, so I have had no need to measure latency on them. Other approaches for measuring latency likely exist; check the literature and open source software.
Don’t know, check the documentation.
Below is an example program I used to measure latencies on a Xeon E3-1270 (Ivy Bridge) CPU. It prints:
n=8 count=255 size=2040 bytes
elapsed = 2.93250196e-007 per pointer: 4.25500284e+000 cycles
n=9 count=511 size=4088 bytes
elapsed = 5.86500391e-007 per pointer: 4.24667602e+000 cycles
n=10 count=1023 size=8184 bytes
elapsed = 8.79634172e-007 per pointer: 3.18147257e+000 cycles
n=11 count=2047 size=16376 bytes
elapsed = 2.34576873e-006 per pointer: 4.24003142e+000 cycles
n=12 count=4095 size=32760 bytes
elapsed = 4.39840369e-006 per pointer: 3.97413764e+000 cycles <<<<
n=13 count=8191 size=65528 bytes
elapsed = 1.87661499e-005 per pointer: 8.47695697e+000 cycles
n=14 count=16383 size=131064 bytes
elapsed = 4.54494730e-005 per pointer: 1.02644845e+001 cycles
n=15 count=32767 size=262136 bytes
elapsed = 1.00282137e-004 per pointer: 1.13237070e+001 cycles <<<<
n=16 count=65535 size=524280 bytes
elapsed = 4.49803192e-004 per pointer: 2.53951600e+001 cycles
n=17 count=131071 size=1048568 bytes
elapsed = 1.08961470e-003 per pointer: 3.07587063e+001 cycles
n=18 count=262143 size=2097144 bytes
elapsed = 2.32290826e-003 per pointer: 3.27865347e+001 cycles <<<<
From this we see that L1 cache is 32KB in size with access latency of 4 cycles; the L2 cache is 256 KB in size with access latency of 11-12 cycles; L3 cache has an access latency of 33 cycles. Note that the timing methodology used in the program only has a time resolution of slightly under one microsecond. Also, I ran this program on a partially busy machine and did not pin it to one CPU core. Therefore the results are probably noisier than necessary.
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
// Assumed core clock in Hz; used to convert elapsed seconds into CPU cycles.
// Must be adjusted to match the machine the test runs on.
#define PROC_FREQ (3700000000LL) // 3.7 GHz
// Largest LFSR width tested; also sizes ptr_array (1 << MAX_POW elements).
#define MAX_POW (18) // use up to 2**18 pointers
// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
/*
 * Return a timestamp in seconds, suitable for computing elapsed time as the
 * difference of two calls.
 *
 * On the first call the performance-counter frequency is queried once and its
 * reciprocal cached. If the query fails (or reports a non-positive frequency)
 * the function falls back to GetTickCount(), scaled from milliseconds.
 */
double second (void)
{
    static double oofreq;               /* seconds per counter tick (1/frequency) */
    static int checkedForHighResTimer;  /* non-zero once the frequency was queried */
    static BOOL hasHighResTimer;        /* TRUE if the performance counter is usable */
    LARGE_INTEGER t;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t) && (t.QuadPart > 0);
        /* Only derive the reciprocal when the query filled in a valid
           frequency; otherwise t is indeterminate and must not be read. */
        if (hasHighResTimer) {
            oofreq = 1.0 / (double)t.QuadPart;
        }
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    }
    /* Low-resolution fallback: tick count is in milliseconds. */
    return (double)GetTickCount() * 1.0e-3;
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
/*
 * Return the current time of day in seconds as a double, combining the
 * whole-second and microsecond fields reported by gettimeofday().
 * Intended for computing elapsed time as the difference of two calls.
 */
double second (void)
{
    struct timeval now;
    double whole_seconds;
    double fractional_seconds;

    gettimeofday (&now, NULL);
    whole_seconds = (double)now.tv_sec;
    fractional_seconds = (double)now.tv_usec * 1.0e-6;  /* microseconds -> seconds */
    return whole_seconds + fractional_seconds;
}
#else
#error unsupported platform
#endif
// Storage for the pointer chain: main() writes into each visited element the
// address of the next element to visit. Declared volatile so every load in
// the chase loop is actually performed rather than optimized away.
volatile uintptr_t ptr_array [1 << MAX_POW];
/*
 * Measure average dependent-load latency by pointer chasing.
 *
 * For each width n from 8 to MAX_POW, an LFSR sequence starting at state 1 is
 * used to link elements of ptr_array into a single cycle visited in
 * pseudo-random order. The cycle is then traversed three times; the first
 * passes serve as warm-up and only the last pass's timing is reported, both
 * as elapsed seconds and as cycles per pointer (based on PROC_FREQ).
 *
 * Returns EXIT_SUCCESS.
 */
int main (void)
{
    /* Feedback masks for a right-shifting LFSR, indexed by width n.
       Entries 0 and 1 are unused placeholders. */
    static const int lfsr_mask [MAX_POW+1] = { 0, 0, 0x3, 0x6, 0xc, 0x14, 0x30,
                                               0x60, 0xb8, 0x110, 0x240, 0x500,
                                               0xe08, 0x1c80, 0x3802, 0x6000,
                                               0xd008, 0x12000, 0x20400};

    for (int n = 8; n <= MAX_POW; n++) {
        /* use LFSR to initialize array: each visited slot stores the address
           of the slot for the next LFSR state, until the sequence returns to 1 */
        const int mask = lfsr_mask [n];
        int count = 0;
        int state = 1;
        do {
            int new_state = (state & 1) ? ((state >> 1) ^ mask) : (state >> 1);
            ptr_array [state] = (uintptr_t)(&ptr_array [new_state]);
            state = new_state;
            count++;
        } while (state != 1);
        printf ("n=%d count=%d size=%d bytes\n",
                n, count, count * (int)(sizeof(*ptr_array)));

        /* chase the pointers; earlier passes warm the caches, the last one
           provides the reported time */
        double elapsed = 0.0;
        for (int j = 0; j < 3; j++) {
            volatile uintptr_t *addr = &ptr_array[1];
            double start = second();
            /* Perform exactly `count` dependent loads (one full trip around
               the cycle) so the per-pointer division below uses the same
               number of loads that were actually timed. */
            for (int i = 0; i < count; i++) {
                addr = (volatile uintptr_t *)(*addr);
            }
            double stop = second();
            elapsed = stop - start;
        }
        printf ("elapsed = %15.8e per pointer: %15.8e cycles\n",
                elapsed, (elapsed / count) / (1.0 / PROC_FREQ));
    }
    return EXIT_SUCCESS;
}