Local memory access

Take a look at the PTX code. The compiler is able to fully unroll the loops and precompute all values, so the listing below contains no local memory accesses at all. All the kernel does is add 450 to each element in global memory.

// Kernel entry point. The mangled name _Z9simulatedPi demangles to
// simulated(int*). Each thread loads one 32-bit element from the array
// passed as the sole parameter, adds 450 to it, and stores it back.
// The code is straight-line: it contains no loop, no branch, and no
// .local load or store.
.visible .entry _Z9simulatedPi(
        .param .u64 _Z9simulatedPi_param_0      // 64-bit pointer argument
)
{

        // NOTE(review): no .reg declarations appear in this excerpt.

        // Fetch the pointer argument from parameter space.
        ld.param.u64    %rd1, [_Z9simulatedPi_param_0];

        // Flat thread index: %r4 = %ntid.x * %ctaid.x + %tid.x
        // (threads per block in x, block index in x, thread index in block).
        mov.u32         %r1, %ntid.x;
        mov.u32         %r2, %ctaid.x;
        mov.u32         %r3, %tid.x;
        mad.lo.s32      %r4, %r1, %r2, %r3;     // keeps the low 32 bits of the product

        // Convert the generic pointer to a global state-space address.
        cvta.to.global.u64      %rd2, %rd1;

        // Element address: base + index * 4 bytes. mul.wide.s32 widens the
        // signed 32-bit index to a 64-bit byte offset.
        mul.wide.s32    %rd3, %r4, 4;
        add.s64         %rd4, %rd2, %rd3;

        // Read-modify-write of one element. There is no index guard here:
        // every thread that runs the kernel performs the load and the store.
        ld.global.u32   %r5, [%rd4];
        add.s32         %r6, %r5, 450;          // 450 is the only arithmetic left in the kernel
        st.global.u32   [%rd4], %r6;
        ret;

}
2 Likes