Local memory access

I am using ncu to profile my code snippet (provided below) to understand the hardware caching behavior on local memory.

__host__ __device__ int hash(int key) {
  int x = key;
  x = (x + 0x7ed55d16) + (x << 12);
  x = (x ^ 0xc761c23c) ^ (x >> 19);
  x = (x + 0x165667b1) + (x << 5);
  x = (x + 0xd3a2646c) ^ (x << 9);
  x = (x + 0xfd7046c5) + (x << 3);
  x = (x ^ 0xb55a4f09) ^ (x >> 16);
  return x;
}

__global__ void simulated(int *global_arr) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;

  int local1[16];
  
  for (int i = 0; i < 16; ++i) {
      local1[i] = 0;
  }

  for (int k = 0; k < 10; ++k) {
    for (int i = 0; i < 10; ++i) {
      int pos = hash(i) & (16 - 1);
      local1[pos] += k;
    }
  }

  for (int i = 0; i < 16; ++i) {
    global_arr[tid] += local1[i];
  } 
}

However, in ncu memory stats, there is no local memory requests at all, which is unexpected.

Anyone has some insights? Thanks!

Take a look at the ptx code. The compiler is able to fully unroll the loops and precompute all values. All the kernel does is add 450 to each element in global memory.

.visible .entry _Z9simulatedPi(
        .param .u64 _Z9simulatedPi_param_0
)
{

        ld.param.u64    %rd1, [_Z9simulatedPi_param_0];
        mov.u32         %r1, %ntid.x;
        mov.u32         %r2, %ctaid.x;
        mov.u32         %r3, %tid.x;
        mad.lo.s32      %r4, %r1, %r2, %r3;
        cvta.to.global.u64      %rd2, %rd1;
        mul.wide.s32    %rd3, %r4, 4;
        add.s64         %rd4, %rd2, %rd3;
        ld.global.u32   %r5, [%rd4];
        add.s32         %r6, %r5, 450;
        st.global.u32   [%rd4], %r6;
        ret;

}
2 Likes