I am using ncu to profile my code snippet (provided below) to understand the hardware caching behavior on local memory.
__host__ __device__ int hash(int key) {
int x = key;
x = (x + 0x7ed55d16) + (x << 12);
x = (x ^ 0xc761c23c) ^ (x >> 19);
x = (x + 0x165667b1) + (x << 5);
x = (x + 0xd3a2646c) ^ (x << 9);
x = (x + 0xfd7046c5) + (x << 3);
x = (x ^ 0xb55a4f09) ^ (x >> 16);
return x;
}
__global__ void simulated(int *global_arr) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
int local1[16];
for (int i = 0; i < 16; ++i) {
local1[i] = 0;
}
for (int k = 0; k < 10; ++k) {
for (int i = 0; i < 10; ++i) {
int pos = hash(i) & (16 - 1);
local1[pos] += k;
}
}
for (int i = 0; i < 16; ++i) {
global_arr[tid] += local1[i];
}
}
However, in ncu memory stats, there is no local memory requests at all, which is unexpected.
Anyone has some insights? Thanks!