What causes the PTX compiler to demote a shared variable?

My kernel code is as follows; it uses shared memory to cache the inputs flowx and flowy.

extern "C" __global__ void k16_16_32_32_kernel0( float* __restrict__ flowx,  float* __restrict__ flowy,  float* __restrict__ out,  float* __restrict__ feature) {
  __shared__ float flowx_shared[256];
  __shared__ float flowy_shared[256];
  flowx_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))] = flowx[((((((((int)blockIdx.x) % 72) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))];
  flowy_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))] = flowy[((((((((int)blockIdx.x) % 72) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))];
  out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] = 0.000000e+00f;
  for (int i_outer = 0; i_outer < 6; ++i_outer) {
    for (int j_outer = 0; j_outer < 3; ++j_outer) {
      for (int i_inner = 0; i_inner < 32; ++i_inner) {
        for (int j_inner = 0; j_inner < 32; ++j_inner) {
          out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] = (out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] + (((1.000000e+00f - max(fabsf(((((float)((((((int)blockIdx.x) % 72) / 6) * 16) + ((int)threadIdx.x))) + flowx_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))]) - ((float)((i_outer * 32) + i_inner)))), 1.000000e+00f)) * (1.000000e+00f - max(fabsf(((((float)(((((int)blockIdx.x) % 6) * 16) + ((int)threadIdx.y))) + flowy_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))]) - ((float)((j_outer * 32) + j_inner)))), 1.000000e+00f))) * feature[((((((((int)blockIdx.x) / 72) * 18432) + (i_outer * 3072)) + (i_inner * 96)) + (j_outer * 32)) + j_inner)]));
        }
      }
    }
  }
}

The compiled PTX code is as follows:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-26218862
// Cuda compilation tools, release 10.1, V10.1.168
// Based on LLVM 3.4svn
//

.version 6.4
.target sm_60
.address_size 64

	// .globl	k16_16_32_32_kernel0
// _ZZ20k16_16_32_32_kernel0E12flowx_shared has been demoted
// _ZZ20k16_16_32_32_kernel0E12flowy_shared has been demoted

.visible .entry k16_16_32_32_kernel0(
	.param .u64 k16_16_32_32_kernel0_param_0,
	.param .u64 k16_16_32_32_kernel0_param_1,
	.param .u64 k16_16_32_32_kernel0_param_2,
	.param .u64 k16_16_32_32_kernel0_param_3
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<87>;
	.reg .b32 	%r<69>;
	.reg .b64 	%rd<19>;
	// demoted variable
	.shared .align 4 .b8 _ZZ20k16_16_32_32_kernel0E12flowx_shared[1024];
	// demoted variable
	.shared .align 4 .b8 _ZZ20k16_16_32_32_kernel0E12flowy_shared[1024];

	ld.param.u64 	%rd6, [k16_16_32_32_kernel0_param_0];
	ld.param.u64 	%rd7, [k16_16_32_32_kernel0_param_1];
	ld.param.u64 	%rd8, [k16_16_32_32_kernel0_param_2];
	ld.param.u64 	%rd9, [k16_16_32_32_kernel0_param_3];
	cvta.to.global.u64 	%rd1, %rd9;
	mov.u32 	%r16, %ctaid.x;
	mul.hi.s32 	%r17, %r16, 954437177;
	shr.u32 	%r18, %r17, 31;
	shr.s32 	%r19, %r17, 4;
	add.s32 	%r20, %r19, %r18;
	mul.lo.s32 	%r21, %r20, 72;
	sub.s32 	%r22, %r16, %r21;
	mul.hi.s32 	%r23, %r22, 715827883;
	shr.u32 	%r24, %r23, 31;
	add.s32 	%r25, %r23, %r24;
	mov.u32 	%r26, %tid.x;
	mul.lo.s32 	%r27, %r26, 96;
	mad.lo.s32 	%r28, %r25, 1536, %r27;
	mul.hi.s32 	%r29, %r16, 715827883;
	shr.u32 	%r30, %r29, 31;
	add.s32 	%r31, %r29, %r30;
	mul.lo.s32 	%r32, %r31, 6;
	sub.s32 	%r33, %r16, %r32;
	shl.b32 	%r34, %r33, 4;
	add.s32 	%r35, %r28, %r34;
	mov.u32 	%r36, %tid.y;
	add.s32 	%r37, %r35, %r36;
	cvta.to.global.u64 	%rd10, %rd6;
	mul.wide.s32 	%rd11, %r37, 4;
	add.s64 	%rd12, %rd10, %rd11;
	ld.global.nc.f32 	%f10, [%rd12];
	shl.b32 	%r38, %r26, 4;
	add.s32 	%r39, %r36, %r38;
	shl.b32 	%r40, %r39, 2;
	mov.u32 	%r41, _ZZ20k16_16_32_32_kernel0E12flowx_shared;
	add.s32 	%r42, %r41, %r40;
	st.shared.f32 	[%r42], %f10;
	cvta.to.global.u64 	%rd13, %rd7;
	add.s64 	%rd14, %rd13, %rd11;
	ld.global.nc.f32 	%f11, [%rd14];
	mov.u32 	%r43, _ZZ20k16_16_32_32_kernel0E12flowy_shared;
	add.s32 	%r44, %r43, %r40;
	st.shared.f32 	[%r44], %f11;
	add.s32 	%r45, %r34, %r27;
	add.s32 	%r46, %r45, %r36;
	mad.lo.s32 	%r47, %r31, 1536, %r46;
	cvta.to.global.u64 	%rd15, %rd8;
	mul.wide.s32 	%rd16, %r47, 4;
	add.s64 	%rd2, %rd15, %rd16;
	mov.u32 	%r15, 0;
	st.global.u32 	[%rd2], %r15;
	shl.b32 	%r48, %r25, 4;
	add.s32 	%r49, %r48, %r26;
	cvt.rn.f32.s32	%f12, %r49;
	add.f32 	%f1, %f12, %f10;
	add.s32 	%r50, %r34, %r36;
	cvt.rn.f32.s32	%f13, %r50;
	add.f32 	%f2, %f13, %f11;
	mul.lo.s32 	%r1, %r20, 18432;
	mov.f32 	%f86, 0f00000000;
	mov.u32 	%r64, %r15;

BB0_1:
	shl.b32 	%r3, %r64, 5;
	mov.u32 	%r65, %r15;

BB0_2:
	mad.lo.s32 	%r53, %r64, 3072, %r1;
	shl.b32 	%r5, %r65, 5;
	add.s32 	%r6, %r53, %r5;
	mov.u32 	%r66, %r15;

BB0_3:
	add.s32 	%r55, %r66, %r3;
	cvt.rn.f32.s32	%f14, %r55;
	sub.f32 	%f15, %f1, %f14;
	abs.f32 	%f16, %f15;
	mov.f32 	%f17, 0f3F800000;
	max.f32 	%f18, %f16, %f17;
	sub.f32 	%f6, %f17, %f18;
	mad.lo.s32 	%r56, %r66, 96, %r6;
	mul.wide.s32 	%rd17, %r56, 4;
	add.s64 	%rd18, %rd1, %rd17;
	mov.u32 	%r68, -32;
	mov.u32 	%r67, %r5;

BB0_4:
	cvt.rn.f32.s32	%f19, %r67;
	sub.f32 	%f20, %f2, %f19;
	abs.f32 	%f21, %f20;
	max.f32 	%f23, %f21, %f17;
	sub.f32 	%f24, %f17, %f23;
	mul.f32 	%f25, %f6, %f24;
	ld.global.nc.f32 	%f26, [%rd18];
	fma.rn.f32 	%f27, %f25, %f26, %f86;
	add.s32 	%r57, %r67, 1;
	cvt.rn.f32.s32	%f28, %r57;
	sub.f32 	%f29, %f2, %f28;
	abs.f32 	%f30, %f29;
	max.f32 	%f31, %f30, %f17;
	sub.f32 	%f32, %f17, %f31;
	mul.f32 	%f33, %f6, %f32;
	ld.global.nc.f32 	%f34, [%rd18+4];
	fma.rn.f32 	%f35, %f33, %f34, %f27;
	add.s32 	%r58, %r67, 2;
	cvt.rn.f32.s32	%f36, %r58;
	sub.f32 	%f37, %f2, %f36;
	abs.f32 	%f38, %f37;
	max.f32 	%f39, %f38, %f17;
	sub.f32 	%f40, %f17, %f39;
	mul.f32 	%f41, %f6, %f40;
	ld.global.nc.f32 	%f42, [%rd18+8];
	fma.rn.f32 	%f43, %f41, %f42, %f35;
	add.s32 	%r59, %r67, 3;
	cvt.rn.f32.s32	%f44, %r59;
	sub.f32 	%f45, %f2, %f44;
	abs.f32 	%f46, %f45;
	max.f32 	%f47, %f46, %f17;
	sub.f32 	%f48, %f17, %f47;
	mul.f32 	%f49, %f6, %f48;
	ld.global.nc.f32 	%f50, [%rd18+12];
	fma.rn.f32 	%f51, %f49, %f50, %f43;
	add.s32 	%r60, %r67, 4;
	cvt.rn.f32.s32	%f52, %r60;
	sub.f32 	%f53, %f2, %f52;
	abs.f32 	%f54, %f53;
	max.f32 	%f55, %f54, %f17;
	sub.f32 	%f56, %f17, %f55;
	mul.f32 	%f57, %f6, %f56;
	ld.global.nc.f32 	%f58, [%rd18+16];
	fma.rn.f32 	%f59, %f57, %f58, %f51;
	add.s32 	%r61, %r67, 5;
	cvt.rn.f32.s32	%f60, %r61;
	sub.f32 	%f61, %f2, %f60;
	abs.f32 	%f62, %f61;
	max.f32 	%f63, %f62, %f17;
	sub.f32 	%f64, %f17, %f63;
	mul.f32 	%f65, %f6, %f64;
	ld.global.nc.f32 	%f66, [%rd18+20];
	fma.rn.f32 	%f67, %f65, %f66, %f59;
	add.s32 	%r62, %r67, 6;
	cvt.rn.f32.s32	%f68, %r62;
	sub.f32 	%f69, %f2, %f68;
	abs.f32 	%f70, %f69;
	max.f32 	%f71, %f70, %f17;
	sub.f32 	%f72, %f17, %f71;
	mul.f32 	%f73, %f6, %f72;
	ld.global.nc.f32 	%f74, [%rd18+24];
	fma.rn.f32 	%f75, %f73, %f74, %f67;
	add.s32 	%r63, %r67, 7;
	cvt.rn.f32.s32	%f76, %r63;
	sub.f32 	%f77, %f2, %f76;
	abs.f32 	%f78, %f77;
	max.f32 	%f79, %f78, %f17;
	sub.f32 	%f80, %f17, %f79;
	mul.f32 	%f81, %f6, %f80;
	ld.global.nc.f32 	%f82, [%rd18+28];
	fma.rn.f32 	%f86, %f81, %f82, %f75;
	add.s64 	%rd18, %rd18, 32;
	add.s32 	%r67, %r67, 8;
	add.s32 	%r68, %r68, 8;
	setp.ne.s32	%p1, %r68, 0;
	@%p1 bra 	BB0_4;

	add.s32 	%r66, %r66, 1;
	setp.lt.s32	%p2, %r66, 32;
	@%p2 bra 	BB0_3;

	add.s32 	%r65, %r65, 1;
	setp.lt.s32	%p3, %r65, 3;
	@%p3 bra 	BB0_2;

	add.s32 	%r64, %r64, 1;
	setp.lt.s32	%p4, %r64, 6;
	@%p4 bra 	BB0_1;

	st.global.f32 	[%rd2], %f86;
	ret;
}

What confuses me is that there are only shared memory store instructions and no shared memory load instructions in the PTX code above. I suspect that is due to the demotion mentioned in the comments. But why is it demoted?

From the LLVM sources (the NVPTX backend's NVPTXAsmPrinter, which emits the "demoted variable" comment): a module-scope variable is demoted to function scope when it lives in the shared address space, has internal linkage, and is referenced by only one function. Both flowx_shared and flowy_shared meet all three conditions, so they are demoted into the kernel body.

I am not sure that your observation about the lack of shared loads is related. Your source code copies global data to shared memory, then uses that shared data to compute the global output. Apparently the compiler figures that performance is the same or better without the intermediate storage, i.e. it uses the loaded global data directly (presumably keeping it in registers) to produce the output global data. But to remain faithful to the source code as written, it still needs to copy the source data to the shared variables.
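
A minimal sketch of that pattern (hypothetical kernel and variable names, not the original code): each thread reads back only the element it just wrote, with no __syncthreads() and no cross-thread reuse, so the compiler is free to forward the just-loaded value through a register and never needs a shared load, while the shared store is kept to stay faithful to the source.

// Hypothetical reduced example (assumes a launch with blockDim.x <= 256).
// Each thread stores s[tid] and then reads only s[tid] back; the compiler
// may keep the value in the register v, so the PTX likely contains the
// shared store but no shared load.
extern "C" __global__ void store_forward_demo(const float* __restrict__ in,
                                              float* __restrict__ out) {
  __shared__ float s[256];
  int tid = threadIdx.x;
  int gid = blockIdx.x * blockDim.x + tid;
  float v = in[gid];          // one global load per thread
  s[tid] = v;                 // st.shared.f32 survives in the PTX
  out[gid] = s[tid] * 2.0f;   // read likely served from v, not from shared memory
}

If the kernel instead read an element written by another thread (say s[tid ^ 1]) after a __syncthreads(), the value could no longer be forwarded from a register and a shared load would have to appear.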

If you look at the machine code (SASS) that is actually executed on the machine (e.g. with cuobjdump --dump-sass), I would expect the shared stores to disappear, as the machine-specific backend tries to remove all operations that do not contribute to the modification of global state.

I thought the CUDA front end promoted all shared variables to global scope first (with some naming convention to avoid conflicts); you can check the “.cudafe1.gpu” file for the preprocessed source. The compiler then determines which kernels use which variables, and demotes those used in only one kernel back to the corresponding local scope.
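
A small illustration of that pipeline as I understand it (hypothetical kernel names; the mangled symbols are my expectation, not verified output): each function-scope __shared__ becomes a module-level symbol with a mangled name, and because each one is referenced by exactly one kernel, the backend moves the declaration back into that kernel's body and marks it with the "// demoted variable" comment.

// Hypothetical example (assumes launches with 128 threads per block).
// Each buf is expected to become a module-level symbol such as
// _ZZ7kernelAE3buf (i.e. kernelA::buf) / _ZZ7kernelBE3buf, and since each
// is used by only one kernel, it is demoted back into that kernel in the PTX.
extern "C" __global__ void kernelA(const float* __restrict__ in, float* __restrict__ out) {
  __shared__ float buf[128];
  buf[threadIdx.x] = in[threadIdx.x];
  __syncthreads();                            // another thread's element is read below,
  out[threadIdx.x] = buf[127 - threadIdx.x];  // so a real ld.shared is needed here
}

extern "C" __global__ void kernelB(const float* __restrict__ in, float* __restrict__ out) {
  __shared__ float buf[128];                  // a distinct symbol from kernelA's buf
  buf[threadIdx.x] = 2.0f * in[threadIdx.x];
  __syncthreads();
  out[threadIdx.x] = buf[127 - threadIdx.x];
}

Presumably a shared symbol referenced from more than one function could not be demoted and would have to stay at module scope.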

Well, maybe they want to use the same strategy for both local and global shared variables? Not quite sure…