Hi all, I have been doing some tests with interleaving loads through different cache levels, and I’ve been getting some results I don’t entirely understand. I am using CUDA C++ 8.0 in VC++2015.
I’m running this code on a 980ti, which from what I’ve researched should let me explicitly use or bypass the L1 cache.
I am using two functions wrapping around PTX instructions for explicit cache usage.
// Loads a 32-bit int from global memory with the .ca cache policy
// (cache at all levels), i.e. explicitly requesting L1 caching.
// NOTE(review): on Maxwell "L1" is the unified texture/L1 cache, and the
// hardware may legally treat .ca as .cg — verify for sm_52 before relying
// on this hint.
__device__ int loadThroughL1Cache(int* p)
{
int out;
// "=r" binds a 32-bit register for the result; "l" binds the 64-bit pointer.
asm("ld.global.ca.s32 %0, [%1];" : "=r"(out) : "l"(p));
return out;
}
// Loads a 32-bit int from global memory with the .cg cache policy
// (cache global): cache in L2 and below, bypassing L1.
__device__ int loadThroughL2Cache(int* p)
{
int out;
// "=r" binds a 32-bit register for the result; "l" binds the 64-bit pointer.
asm("ld.global.cg.s32 %0, [%1];" : "=r"(out) : "l"(p));
return out;
}
From the PTX ISA documentation, the .cg flag should explicitly disable usage of the L1 cache, and the .ca flag should explicitly use L1 cache.
I’ve been trying to use these to stop one low hit rate read from thrashing the L1 cache, so that another high hit rate read gets better L1 hit rates.
Here’s my full test code:
#include <cstdio>
#include <ctime>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// 128MB
#define count 1024*1024*128
#define WARP_SIZE 32
#define BLOCK_SIZE 32
// Host-side pointers to device allocations, filled in by cudaMalloc in main().
// These were previously declared __device__, but they are only ever read and
// written from host code (cudaMalloc / cudaMemcpy / kernel-launch argument);
// referencing a __device__ variable from host code is not allowed by CUDA,
// so the qualifier is dropped.
int* ptr;
int* cudaOut;
// Host staging buffers.
int* workspace;
int* output;
// Prints the CUDA error string for `err` on stderr; no-op on cudaSuccess.
// Non-fatal by design: callers continue after a failure is reported.
void checkError(cudaError_t err) {
    if (err == cudaSuccess)
        return;
    const char* error = cudaGetErrorString(err);
    // was printf(error): passing a non-literal string as the format argument
    // is a format-string bug (any '%' in the message would be interpreted);
    // also adds the previously-missing newline.
    fprintf(stderr, "CUDA error: %s\n", error);
}
// Loads a 32-bit int from global memory with the .ca cache policy
// (cache at all levels), i.e. explicitly requesting L1 caching.
// NOTE(review): on Maxwell "L1" is the unified texture/L1 cache, and the
// hardware may legally treat .ca as .cg — verify for sm_52 before relying
// on this hint.
__device__ int loadThroughL1Cache(int* p)
{
int out;
// "=r" binds a 32-bit register for the result; "l" binds the 64-bit pointer.
asm("ld.global.ca.s32 %0, [%1];" : "=r"(out) : "l"(p));
return out;
}
// Loads a 32-bit int from global memory with the .cg cache policy
// (cache global): cache in L2 and below, bypassing L1.
__device__ int loadThroughL2Cache(int* p)
{
int out;
// "=r" binds a 32-bit register for the result; "l" binds the 64-bit pointer.
asm("ld.global.cg.s32 %0, [%1];" : "=r"(out) : "l"(p));
return out;
}
// Interleaves a cache-thrashing read stream (issued through L2 via .cg) with
// a perfectly cacheable read stream (issued through L1 via .ca) to measure
// how much the thrashing stream disturbs the L1 hit rate of the other.
// Launch layout: 1-D grid of 1-D blocks; each thread owns a 16384-int slice
// of `area` and reads the first 1024 ints (4 KB) of that slice.
// Preconditions: `size` is the element count of `area`, size >= 4096.
__global__ void interleavedCacheLoadsTest1(int* area, size_t size, int* out)
{
    int px = (blockIdx.x * blockDim.x) + threadIdx.x;
    // Guard against threads whose slice falls outside the buffer.
    // (The previous guard `px >= count` could never fire — only ~9K threads
    // are launched — yet the top threads' slices ran past the end of `area`,
    // an out-of-bounds read. Use the `size` parameter, previously unused.)
    if ((size_t)px * 16384 + 1024 > size)
        return;
    int offset = px * 16384;
    // Small pseudo-random per-thread starting point for the cacheable stream.
    int offset2 = (px % 17) + (px % 3) + (px % 47) + (px % 13);
    int val = 0;  // was uninitialized: out[px] accumulated garbage (UB)
    for (int r = 0; r < 1; r++)
    {
        for (int i = 0; i < 16384; i++)
        {
            // Thrashing stream: each thread walks its own 1024-int window.
            // When routed through L1 instead, this gets ~0% L1 hit rate.
            val += loadThroughL2Cache(&area[offset + (i % 1024)]);
            // Cacheable stream: confined to the first 4096 ints of `area`.
            // When run alone through L1, this gets ~100% L1 hit rate.
            val += loadThroughL1Cache(&area[(offset2 + (i * offset2)) % 4096]);
        }
    }
    out[px] = val;
}
int main()
{
    // Force CUDA context creation up front so it doesn't pollute the timing.
    cudaDeviceSynchronize();

    workspace = (int*)malloc(count * sizeof(int));
    if (workspace == NULL) {          // malloc was previously unchecked
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < count; i++) {
        // low 8 bits set (the old comment said "16 bits", but 0xFF is 8)
        workspace[i] = 0x000000FF;
    }

    checkError(cudaMalloc(&ptr, count * sizeof(int)));
    // was count*sizeof(int)*sizeof(int): a 4x over-allocation typo — the
    // kernel writes at most one int per thread into this buffer.
    checkError(cudaMalloc(&cudaOut, count * sizeof(int)));
    checkError(cudaMemcpy(ptr, workspace, count * sizeof(int), cudaMemcpyHostToDevice));

    // One thread per 16384-int slice: grid of ceil(count / (1024*16384))
    // blocks of 1024 threads each.
    dim3 grid = dim3(count / ((WARP_SIZE*WARP_SIZE) * 16384) + 1, 1, 1);
    dim3 block = dim3(WARP_SIZE*WARP_SIZE, 1, 1);

    clock_t elapsed;  // clock() returns clock_t, not time_t
    cudaDeviceSynchronize();
    checkError(cudaGetLastError());

    elapsed = -clock();
    interleavedCacheLoadsTest1 << < grid, block >> >(ptr, count, cudaOut);
    checkError(cudaGetLastError());      // catch launch-configuration errors
    checkError(cudaDeviceSynchronize()); // catch async execution errors
    elapsed += clock();
    printf("Time: %f\n", ((float)elapsed / CLOCKS_PER_SEC));

    // Release resources (previously leaked).
    cudaFree(cudaOut);
    cudaFree(ptr);
    free(workspace);

    __debugbreak();  // MSVC-only: holds the console under the VS debugger
    return 0;
}
I have designed the loadThroughL2Cache call in the inner loop to thrash the L1 cache: if you comment out the loadThroughL1Cache call and switch the thrashing load over to the L1 read function, it gets a 0.1% hit rate. I have designed the loadThroughL1Cache call to cache perfectly: if you comment out the loadThroughL2Cache call, that read gets a 100% L1 hit rate.
HOWEVER, when the two reads are interleaved as in the unmodified code, I only get a 15.8% L1 hit rate. If the .cg instruction really is bypassing the L1 cache, shouldn't the .ca load still get 100% L1 hits? It seems as though the .cg load is still going through the L1 cache…
Equally strange, I found that if you comment out the loadThroughL1Cache call, the thrashing read actually runs substantially faster when loaded through the L1 cache, despite getting virtually no L1 hits — which seems to imply there is more overhead in bypassing the cache than in checking it?
I’m wondering if this has something to do with Maxwell having a shared texture/L1 cache? I don’t know if that would affect any of my assumptions on what does/doesn’t count as an L1 cache load?