What causes the PTX compiler to demote a shared variable?

My kernel code is as follows; it uses shared memory to cache the inputs flowx and flowy.

extern "C" __global__ void k16_16_32_32_kernel0( float* __restrict__ flowx,  float* __restrict__ flowy,  float* __restrict__ out,  float* __restrict__ feature) {
  __shared__ float flowx_shared[256];
  __shared__ float flowy_shared[256];
  flowx_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))] = flowx[((((((((int)blockIdx.x) % 72) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))];
  flowy_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))] = flowy[((((((((int)blockIdx.x) % 72) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))];
  out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] = 0.000000e+00f;
  for (int i_outer = 0; i_outer < 6; ++i_outer) {
    for (int j_outer = 0; j_outer < 3; ++j_outer) {
      for (int i_inner = 0; i_inner < 32; ++i_inner) {
        for (int j_inner = 0; j_inner < 32; ++j_inner) {
          out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] = (out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] + (((1.000000e+00f - max(fabsf(((((float)((((((int)blockIdx.x) % 72) / 6) * 16) + ((int)threadIdx.x))) + flowx_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))]) - ((float)((i_outer * 32) + i_inner)))), 1.000000e+00f)) * (1.000000e+00f - max(fabsf(((((float)(((((int)blockIdx.x) % 6) * 16) + ((int)threadIdx.y))) + flowy_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))]) - ((float)((j_outer * 32) + j_inner)))), 1.000000e+00f))) * feature[((((((((int)blockIdx.x) / 72) * 18432) + (i_outer * 3072)) + (i_inner * 96)) + (j_outer * 32)) + j_inner)]));
        }
      }
    }
  }
}

The compiled PTX code is as follows:

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-26218862
// Cuda compilation tools, release 10.1, V10.1.168
// Based on LLVM 3.4svn
//

.version 6.4
.target sm_60
.address_size 64

	// .globl	k16_16_32_32_kernel0
// _ZZ20k16_16_32_32_kernel0E12flowx_shared has been demoted
// _ZZ20k16_16_32_32_kernel0E12flowy_shared has been demoted

.visible .entry k16_16_32_32_kernel0(
	.param .u64 k16_16_32_32_kernel0_param_0,
	.param .u64 k16_16_32_32_kernel0_param_1,
	.param .u64 k16_16_32_32_kernel0_param_2,
	.param .u64 k16_16_32_32_kernel0_param_3
)
{
	.reg .pred 	%p<5>;
	.reg .f32 	%f<87>;
	.reg .b32 	%r<69>;
	.reg .b64 	%rd<19>;
	// demoted variable
	.shared .align 4 .b8 _ZZ20k16_16_32_32_kernel0E12flowx_shared[1024];
	// demoted variable
	.shared .align 4 .b8 _ZZ20k16_16_32_32_kernel0E12flowy_shared[1024];

	ld.param.u64 	%rd6, [k16_16_32_32_kernel0_param_0];
	ld.param.u64 	%rd7, [k16_16_32_32_kernel0_param_1];
	ld.param.u64 	%rd8, [k16_16_32_32_kernel0_param_2];
	ld.param.u64 	%rd9, [k16_16_32_32_kernel0_param_3];
	cvta.to.global.u64 	%rd1, %rd9;
	mov.u32 	%r16, %ctaid.x;
	mul.hi.s32 	%r17, %r16, 954437177;
	shr.u32 	%r18, %r17, 31;
	shr.s32 	%r19, %r17, 4;
	add.s32 	%r20, %r19, %r18;
	mul.lo.s32 	%r21, %r20, 72;
	sub.s32 	%r22, %r16, %r21;
	mul.hi.s32 	%r23, %r22, 715827883;
	shr.u32 	%r24, %r23, 31;
	add.s32 	%r25, %r23, %r24;
	mov.u32 	%r26, %tid.x;
	mul.lo.s32 	%r27, %r26, 96;
	mad.lo.s32 	%r28, %r25, 1536, %r27;
	mul.hi.s32 	%r29, %r16, 715827883;
	shr.u32 	%r30, %r29, 31;
	add.s32 	%r31, %r29, %r30;
	mul.lo.s32 	%r32, %r31, 6;
	sub.s32 	%r33, %r16, %r32;
	shl.b32 	%r34, %r33, 4;
	add.s32 	%r35, %r28, %r34;
	mov.u32 	%r36, %tid.y;
	add.s32 	%r37, %r35, %r36;
	cvta.to.global.u64 	%rd10, %rd6;
	mul.wide.s32 	%rd11, %r37, 4;
	add.s64 	%rd12, %rd10, %rd11;
	ld.global.nc.f32 	%f10, [%rd12];
	shl.b32 	%r38, %r26, 4;
	add.s32 	%r39, %r36, %r38;
	shl.b32 	%r40, %r39, 2;
	mov.u32 	%r41, _ZZ20k16_16_32_32_kernel0E12flowx_shared;
	add.s32 	%r42, %r41, %r40;
	st.shared.f32 	[%r42], %f10;
	cvta.to.global.u64 	%rd13, %rd7;
	add.s64 	%rd14, %rd13, %rd11;
	ld.global.nc.f32 	%f11, [%rd14];
	mov.u32 	%r43, _ZZ20k16_16_32_32_kernel0E12flowy_shared;
	add.s32 	%r44, %r43, %r40;
	st.shared.f32 	[%r44], %f11;
	add.s32 	%r45, %r34, %r27;
	add.s32 	%r46, %r45, %r36;
	mad.lo.s32 	%r47, %r31, 1536, %r46;
	cvta.to.global.u64 	%rd15, %rd8;
	mul.wide.s32 	%rd16, %r47, 4;
	add.s64 	%rd2, %rd15, %rd16;
	mov.u32 	%r15, 0;
	st.global.u32 	[%rd2], %r15;
	shl.b32 	%r48, %r25, 4;
	add.s32 	%r49, %r48, %r26;
	cvt.rn.f32.s32	%f12, %r49;
	add.f32 	%f1, %f12, %f10;
	add.s32 	%r50, %r34, %r36;
	cvt.rn.f32.s32	%f13, %r50;
	add.f32 	%f2, %f13, %f11;
	mul.lo.s32 	%r1, %r20, 18432;
	mov.f32 	%f86, 0f00000000;
	mov.u32 	%r64, %r15;

BB0_1:
	shl.b32 	%r3, %r64, 5;
	mov.u32 	%r65, %r15;

BB0_2:
	mad.lo.s32 	%r53, %r64, 3072, %r1;
	shl.b32 	%r5, %r65, 5;
	add.s32 	%r6, %r53, %r5;
	mov.u32 	%r66, %r15;

BB0_3:
	add.s32 	%r55, %r66, %r3;
	cvt.rn.f32.s32	%f14, %r55;
	sub.f32 	%f15, %f1, %f14;
	abs.f32 	%f16, %f15;
	mov.f32 	%f17, 0f3F800000;
	max.f32 	%f18, %f16, %f17;
	sub.f32 	%f6, %f17, %f18;
	mad.lo.s32 	%r56, %r66, 96, %r6;
	mul.wide.s32 	%rd17, %r56, 4;
	add.s64 	%rd18, %rd1, %rd17;
	mov.u32 	%r68, -32;
	mov.u32 	%r67, %r5;

BB0_4:
	cvt.rn.f32.s32	%f19, %r67;
	sub.f32 	%f20, %f2, %f19;
	abs.f32 	%f21, %f20;
	max.f32 	%f23, %f21, %f17;
	sub.f32 	%f24, %f17, %f23;
	mul.f32 	%f25, %f6, %f24;
	ld.global.nc.f32 	%f26, [%rd18];
	fma.rn.f32 	%f27, %f25, %f26, %f86;
	add.s32 	%r57, %r67, 1;
	cvt.rn.f32.s32	%f28, %r57;
	sub.f32 	%f29, %f2, %f28;
	abs.f32 	%f30, %f29;
	max.f32 	%f31, %f30, %f17;
	sub.f32 	%f32, %f17, %f31;
	mul.f32 	%f33, %f6, %f32;
	ld.global.nc.f32 	%f34, [%rd18+4];
	fma.rn.f32 	%f35, %f33, %f34, %f27;
	add.s32 	%r58, %r67, 2;
	cvt.rn.f32.s32	%f36, %r58;
	sub.f32 	%f37, %f2, %f36;
	abs.f32 	%f38, %f37;
	max.f32 	%f39, %f38, %f17;
	sub.f32 	%f40, %f17, %f39;
	mul.f32 	%f41, %f6, %f40;
	ld.global.nc.f32 	%f42, [%rd18+8];
	fma.rn.f32 	%f43, %f41, %f42, %f35;
	add.s32 	%r59, %r67, 3;
	cvt.rn.f32.s32	%f44, %r59;
	sub.f32 	%f45, %f2, %f44;
	abs.f32 	%f46, %f45;
	max.f32 	%f47, %f46, %f17;
	sub.f32 	%f48, %f17, %f47;
	mul.f32 	%f49, %f6, %f48;
	ld.global.nc.f32 	%f50, [%rd18+12];
	fma.rn.f32 	%f51, %f49, %f50, %f43;
	add.s32 	%r60, %r67, 4;
	cvt.rn.f32.s32	%f52, %r60;
	sub.f32 	%f53, %f2, %f52;
	abs.f32 	%f54, %f53;
	max.f32 	%f55, %f54, %f17;
	sub.f32 	%f56, %f17, %f55;
	mul.f32 	%f57, %f6, %f56;
	ld.global.nc.f32 	%f58, [%rd18+16];
	fma.rn.f32 	%f59, %f57, %f58, %f51;
	add.s32 	%r61, %r67, 5;
	cvt.rn.f32.s32	%f60, %r61;
	sub.f32 	%f61, %f2, %f60;
	abs.f32 	%f62, %f61;
	max.f32 	%f63, %f62, %f17;
	sub.f32 	%f64, %f17, %f63;
	mul.f32 	%f65, %f6, %f64;
	ld.global.nc.f32 	%f66, [%rd18+20];
	fma.rn.f32 	%f67, %f65, %f66, %f59;
	add.s32 	%r62, %r67, 6;
	cvt.rn.f32.s32	%f68, %r62;
	sub.f32 	%f69, %f2, %f68;
	abs.f32 	%f70, %f69;
	max.f32 	%f71, %f70, %f17;
	sub.f32 	%f72, %f17, %f71;
	mul.f32 	%f73, %f6, %f72;
	ld.global.nc.f32 	%f74, [%rd18+24];
	fma.rn.f32 	%f75, %f73, %f74, %f67;
	add.s32 	%r63, %r67, 7;
	cvt.rn.f32.s32	%f76, %r63;
	sub.f32 	%f77, %f2, %f76;
	abs.f32 	%f78, %f77;
	max.f32 	%f79, %f78, %f17;
	sub.f32 	%f80, %f17, %f79;
	mul.f32 	%f81, %f6, %f80;
	ld.global.nc.f32 	%f82, [%rd18+28];
	fma.rn.f32 	%f86, %f81, %f82, %f75;
	add.s64 	%rd18, %rd18, 32;
	add.s32 	%r67, %r67, 8;
	add.s32 	%r68, %r68, 8;
	setp.ne.s32	%p1, %r68, 0;
	@%p1 bra 	BB0_4;

	add.s32 	%r66, %r66, 1;
	setp.lt.s32	%p2, %r66, 32;
	@%p2 bra 	BB0_3;

	add.s32 	%r65, %r65, 1;
	setp.lt.s32	%p3, %r65, 3;
	@%p3 bra 	BB0_2;

	add.s32 	%r64, %r64, 1;
	setp.lt.s32	%p4, %r64, 6;
	@%p4 bra 	BB0_1;

	st.global.f32 	[%rd2], %f86;
	ret;
}

What confuses me is that there are only shared memory store instructions and no shared memory load instructions in the PTX code above. I suspect that is due to the demotion mentioned in the comments. But why is it demoted?

From the LLVM sources (the NVPTX backend's NVPTXAsmPrinter, which emits the "demoted variable" comment): a module-scope variable is demoted to function scope when it lives in the shared address space, has internal linkage, and is referenced by only one function. Both flowx_shared and flowy_shared meet all three conditions, so they are demoted into the kernel body.

I am not sure that your observation about the lack of shared loads is related. Your source code copies global data to shared memory, then uses that shared data to compute the global output. Apparently the compiler figures that performance is the same or better without the intermediate storage, i.e. it uses the loaded global data directly (presumably keeping it in registers) to produce the output global data. But to remain faithful to the source code as written, it still needs to copy the source data to the shared variables.
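
A minimal sketch of that pattern (hypothetical kernel and variable names, not the original code): each thread reads back only the element it just wrote, with no __syncthreads() and no cross-thread reuse, so the compiler is free to forward the just-loaded value through a register and never needs a shared load, while the shared store is kept to stay faithful to the source.

// Hypothetical reduced example (assumes a launch with blockDim.x <= 256).
// Each thread stores s[tid] and then reads only s[tid] back; the compiler
// may keep the value in the register v, so the PTX likely contains the
// shared store but no shared load.
extern "C" __global__ void store_forward_demo(const float* __restrict__ in,
                                              float* __restrict__ out) {
  __shared__ float s[256];
  int tid = threadIdx.x;
  int gid = blockIdx.x * blockDim.x + tid;
  float v = in[gid];          // one global load per thread
  s[tid] = v;                 // st.shared.f32 survives in the PTX
  out[gid] = s[tid] * 2.0f;   // read likely served from v, not from shared memory
}

If the kernel instead read an element written by another thread (say s[tid ^ 1]) after a __syncthreads(), the value could no longer be forwarded from a register and a shared load would have to appear.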

If you look at the machine code (SASS) that is actually executed on the machine (e.g. with cuobjdump --dump-sass), I would expect the shared stores to disappear, as the machine-specific backend tries to remove all operations that do not contribute to the modification of global state.

I thought the CUDA front end promoted all shared variables to global scope first (with some naming convention to avoid conflicts); you can check the “.cudafe1.gpu” file for the preprocessed source. The compiler then determines which kernels use which variables, and demotes those used in only one kernel back to the corresponding local scope.
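
A small illustration of that pipeline as I understand it (hypothetical kernel names; the mangled symbols are my expectation, not verified output): each function-scope __shared__ becomes a module-level symbol with a mangled name, and because each one is referenced by exactly one kernel, the backend moves the declaration back into that kernel's body and marks it with the "// demoted variable" comment.

// Hypothetical example (assumes launches with 128 threads per block).
// Each buf is expected to become a module-level symbol such as
// _ZZ7kernelAE3buf (i.e. kernelA::buf) / _ZZ7kernelBE3buf, and since each
// is used by only one kernel, it is demoted back into that kernel in the PTX.
extern "C" __global__ void kernelA(const float* __restrict__ in, float* __restrict__ out) {
  __shared__ float buf[128];
  buf[threadIdx.x] = in[threadIdx.x];
  __syncthreads();                            // another thread's element is read below,
  out[threadIdx.x] = buf[127 - threadIdx.x];  // so a real ld.shared is needed here
}

extern "C" __global__ void kernelB(const float* __restrict__ in, float* __restrict__ out) {
  __shared__ float buf[128];                  // a distinct symbol from kernelA's buf
  buf[threadIdx.x] = 2.0f * in[threadIdx.x];
  __syncthreads();
  out[threadIdx.x] = buf[127 - threadIdx.x];
}

Presumably a shared symbol referenced from more than one function could not be demoted and would have to stay at module scope.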

Well, maybe they want to use the same strategy for both local and global shared variables? Not quite sure…