With CUDA, there is no substitute for experimentation. Of course, any benchmark is only good at testing how fast the benchmark runs. Latency hiding from adding real computations and/or application specific memory access patterns WILL change results significantly. But I post this anyways.
I made a simple benchmark that only does memory access through a small 256-element uint array, using each value read as a distance to hop to the next read. See the code below. I tested it in two modes: one where all threads in the warp are always accessing the same elements and one where accesses are completely random. Timing is done with the CUDA profiler (CUDA_PROFILE=1).
Here are the results, including an “effective” memory bandwidth counting all reads from the 256 element array:
Random mode:
Const, Tex, Shmem
gputime (us) 95951.84, 123481.02, 42771.76
Greads/s 13.34, 10.37, 29.93
GBytes/s 49.7, 38.62, 111.5
Warp coherent mode
Const, Tex, Shmem
gputime (us) 17542.92, 70186, 20436
Greads/s 72.97, 18.24, 62.64
GBytes/s 271.84, 67.95, 233.36
I’m a bit disappointed by the texture cache. Doesn’t the guide say that values in the cache have no access latency? Why then are random accesses from an array that entirely fits in cache so slow, even compared to the global memory bandwidth of 86.4 GB/s?? I guess this is just another example that the texture cache is really only useful when a single warp is accessing a very local region of the texture (i.e. much smaller than a 1K spread). Temporal locality from one read to the next seems to have absolutely no bearing on the texture cache performance.
Anyways, I digress. It seems that the way to go for small random access is definitely the shared memory, being more than twice as fast as constant memory. If bank conflicts can be avoided even with the random accesses, that speed should increase.
Here is the code:
#include <stdio.h>
#include <stdlib.h>
// 256-entry lookup table in constant memory; filled from the host via
// cudaMemcpyToSymbol. Constant memory broadcasts when all lanes of a warp
// read the same address, which is what the "warp coherent" mode exercises.
__constant__ unsigned int c_vals[256];
// Legacy texture reference bound (via cudaBindTexture) to a global-memory
// copy of the same table. NOTE(review): texture references are deprecated
// and removed in CUDA 12; texture objects are the modern replacement.
texture<unsigned int, 1, cudaReadModeElementType> tex;
// Pointer-chase benchmark through the 256-entry constant-memory table.
// Each thread performs 10,000 dependent reads of c_vals, using every value
// read (masked to 0..255) as the index of the next read; the final value is
// written to d_outvals so the compiler cannot discard the chain.
// Launch layout: 1D grid of 1D blocks; assumes blockDim.x <= 256 so the
// seed read c_vals[threadIdx.x] stays in bounds.
__global__ void const_speedtest(unsigned int *d_outvals)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Seed the chase with this thread's own table slot.
    unsigned int hop = c_vals[threadIdx.x];
    for (int iter = 0; iter < 1000; iter++)
    {
        // Ten dependent lookups per iteration to amortize loop overhead.
        #pragma unroll
        for (int k = 0; k < 10; k++)
            hop = c_vals[hop] & 0xff;
    }
    d_outvals[gid] = hop;
}
// Pointer-chase benchmark through the texture-bound copy of the table.
// Same access pattern as const_speedtest: 10,000 dependent tex1Dfetch
// reads per thread, each value (masked to 0..255) indexing the next read;
// the final value is stored so the chain is not dead-code-eliminated.
// NOTE(review): uses the legacy texture-reference API (removed in CUDA 12);
// porting to texture objects would add a cudaTextureObject_t parameter.
__global__ void tex_speedtest(unsigned int *d_outvals)
{
// Flat 1D global thread index; one output slot per thread.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Seed the chase with this thread's own table slot (assumes blockDim.x <= 256).
unsigned int cur = tex1Dfetch(tex, threadIdx.x);
for (int i = 0; i < 1000; i++)
{
// Manually unrolled x10: each read depends on the previous one, so this
// measures dependent-access latency/throughput of the texture path.
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
cur = tex1Dfetch(tex, cur) & 0xff;
}
// Publish the result so the compiler cannot eliminate the loop.
d_outvals[idx] = cur;
}
// Pointer-chase benchmark through a per-block shared-memory copy of the
// 256-entry table. Each thread stages one element from d_invals, then does
// 10,000 dependent reads, each value (masked to 0..255) indexing the next.
// Requires blockDim.x == 256 so the whole table gets staged.
// d_invals:  device pointer to the 256-entry hop table.
// d_outvals: one slot per thread for the final chase value (defeats DCE).
__global__ void shmem_speedtest(unsigned int *d_outvals, unsigned int *d_invals)
{
__shared__ unsigned int shared[256];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
shared[threadIdx.x] = d_invals[threadIdx.x];
// BUG FIX: barrier between the staged writes and the cross-thread reads.
// Without it, a thread can chase into slots that other warps have not yet
// written and read uninitialized shared memory (a data race).
__syncthreads();
unsigned int cur = shared[threadIdx.x];
for (int i = 0; i < 1000; i++)
{
// Manually unrolled x10; each read depends on the previous one.
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
cur = shared[cur] & 0xff;
}
d_outvals[idx] = cur;
}
// Abort with file/line context on any CUDA runtime error. Without this,
// a failed malloc/copy/launch silently corrupts every later measurement.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Driver: runs each kernel 10x in "random" mode (independent hop targets)
// and 10x in "warp coherent" mode (all 32 lanes of a warp share a target).
// Timing itself is collected externally by the CUDA profiler.
int main(void)
{
    const int n_blocks = 500;
    const int block_size = 256; // must be 256: kernels stage/index a 256-entry table
    const size_t table_bytes = sizeof(unsigned int) * 256;

    // Output buffer: one slot per thread (defeats dead-code elimination).
    unsigned int *d_outvals;
    CUDA_CHECK(cudaMalloc((void**)&d_outvals,
                          sizeof(unsigned int) * n_blocks * block_size));

    unsigned int vals[256];

    // --- RANDOM MODE: every entry is an independent random hop target ---
    for (int i = 0; i < 256; i++)
        vals[i] = rand() & 0xff;

    // Global-memory copy of the table, used by the texture and shmem paths.
    unsigned int *d_vals;
    CUDA_CHECK(cudaMalloc((void**)&d_vals, table_bytes));
    CUDA_CHECK(cudaMemcpy(d_vals, vals, table_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaBindTexture(0, tex, d_vals, table_bytes));
    CUDA_CHECK(cudaMemcpyToSymbol(c_vals, vals, table_bytes));

    // Run each kernel 10x; cudaGetLastError catches bad launch configs,
    // the sync surfaces asynchronous execution errors before the next batch.
    for (int i = 0; i < 10; i++)
        const_speedtest<<< n_blocks, block_size >>>(d_outvals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        tex_speedtest<<< n_blocks, block_size >>>(d_outvals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        shmem_speedtest<<< n_blocks, block_size >>>(d_outvals, d_vals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // --- WARP-COHERENT MODE: each group of 32 entries shares one target ---
    for (int i = 0; i < 256; i += 32)
    {
        vals[i] = rand() & 0xff;
        for (int j = i + 1; j < i + 32; j++)
            vals[j] = vals[i];
    }
    // Refresh both device copies (texture stays bound to d_vals).
    CUDA_CHECK(cudaMemcpy(d_vals, vals, table_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyToSymbol(c_vals, vals, table_bytes));

    for (int i = 0; i < 10; i++)
        const_speedtest<<< n_blocks, block_size >>>(d_outvals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        tex_speedtest<<< n_blocks, block_size >>>(d_outvals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        shmem_speedtest<<< n_blocks, block_size >>>(d_outvals, d_vals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Release device resources (the original leaked both allocations).
    CUDA_CHECK(cudaUnbindTexture(tex));
    CUDA_CHECK(cudaFree(d_vals));
    CUDA_CHECK(cudaFree(d_outvals));
    return 0;
}