My kernel code is as followed, which uses shared memory to cache input flowx and flowy.
extern "C" __global__ void k16_16_32_32_kernel0( float* __restrict__ flowx, float* __restrict__ flowy, float* __restrict__ out, float* __restrict__ feature) {
__shared__ float flowx_shared[256];
__shared__ float flowy_shared[256];
flowx_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))] = flowx[((((((((int)blockIdx.x) % 72) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))];
flowy_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))] = flowy[((((((((int)blockIdx.x) % 72) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))];
out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] = 0.000000e+00f;
for (int i_outer = 0; i_outer < 6; ++i_outer) {
for (int j_outer = 0; j_outer < 3; ++j_outer) {
for (int i_inner = 0; i_inner < 32; ++i_inner) {
for (int j_inner = 0; j_inner < 32; ++j_inner) {
out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] = (out[(((((((int)blockIdx.x) / 6) * 1536) + (((int)threadIdx.x) * 96)) + ((((int)blockIdx.x) % 6) * 16)) + ((int)threadIdx.y))] + (((1.000000e+00f - max(fabsf(((((float)((((((int)blockIdx.x) % 72) / 6) * 16) + ((int)threadIdx.x))) + flowx_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))]) - ((float)((i_outer * 32) + i_inner)))), 1.000000e+00f)) * (1.000000e+00f - max(fabsf(((((float)(((((int)blockIdx.x) % 6) * 16) + ((int)threadIdx.y))) + flowy_shared[((((int)threadIdx.x) * 16) + ((int)threadIdx.y))]) - ((float)((j_outer * 32) + j_inner)))), 1.000000e+00f))) * feature[((((((((int)blockIdx.x) / 72) * 18432) + (i_outer * 3072)) + (i_inner * 96)) + (j_outer * 32)) + j_inner)]));
}
}
}
}
}
The compiled ptx code is as followed:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-26218862
// Cuda compilation tools, release 10.1, V10.1.168
// Based on LLVM 3.4svn
//
.version 6.4
.target sm_60
.address_size 64
// .globl k16_16_32_32_kernel0
// _ZZ20k16_16_32_32_kernel0E12flowx_shared has been demoted
// _ZZ20k16_16_32_32_kernel0E12flowy_shared has been demoted
.visible .entry k16_16_32_32_kernel0(
.param .u64 k16_16_32_32_kernel0_param_0,
.param .u64 k16_16_32_32_kernel0_param_1,
.param .u64 k16_16_32_32_kernel0_param_2,
.param .u64 k16_16_32_32_kernel0_param_3
)
{
.reg .pred %p<5>;
.reg .f32 %f<87>;
.reg .b32 %r<69>;
.reg .b64 %rd<19>;
// demoted variable
.shared .align 4 .b8 _ZZ20k16_16_32_32_kernel0E12flowx_shared[1024];
// demoted variable
.shared .align 4 .b8 _ZZ20k16_16_32_32_kernel0E12flowy_shared[1024];
ld.param.u64 %rd6, [k16_16_32_32_kernel0_param_0];
ld.param.u64 %rd7, [k16_16_32_32_kernel0_param_1];
ld.param.u64 %rd8, [k16_16_32_32_kernel0_param_2];
ld.param.u64 %rd9, [k16_16_32_32_kernel0_param_3];
cvta.to.global.u64 %rd1, %rd9;
mov.u32 %r16, %ctaid.x;
mul.hi.s32 %r17, %r16, 954437177;
shr.u32 %r18, %r17, 31;
shr.s32 %r19, %r17, 4;
add.s32 %r20, %r19, %r18;
mul.lo.s32 %r21, %r20, 72;
sub.s32 %r22, %r16, %r21;
mul.hi.s32 %r23, %r22, 715827883;
shr.u32 %r24, %r23, 31;
add.s32 %r25, %r23, %r24;
mov.u32 %r26, %tid.x;
mul.lo.s32 %r27, %r26, 96;
mad.lo.s32 %r28, %r25, 1536, %r27;
mul.hi.s32 %r29, %r16, 715827883;
shr.u32 %r30, %r29, 31;
add.s32 %r31, %r29, %r30;
mul.lo.s32 %r32, %r31, 6;
sub.s32 %r33, %r16, %r32;
shl.b32 %r34, %r33, 4;
add.s32 %r35, %r28, %r34;
mov.u32 %r36, %tid.y;
add.s32 %r37, %r35, %r36;
cvta.to.global.u64 %rd10, %rd6;
mul.wide.s32 %rd11, %r37, 4;
add.s64 %rd12, %rd10, %rd11;
ld.global.nc.f32 %f10, [%rd12];
shl.b32 %r38, %r26, 4;
add.s32 %r39, %r36, %r38;
shl.b32 %r40, %r39, 2;
mov.u32 %r41, _ZZ20k16_16_32_32_kernel0E12flowx_shared;
add.s32 %r42, %r41, %r40;
st.shared.f32 [%r42], %f10;
cvta.to.global.u64 %rd13, %rd7;
add.s64 %rd14, %rd13, %rd11;
ld.global.nc.f32 %f11, [%rd14];
mov.u32 %r43, _ZZ20k16_16_32_32_kernel0E12flowy_shared;
add.s32 %r44, %r43, %r40;
st.shared.f32 [%r44], %f11;
add.s32 %r45, %r34, %r27;
add.s32 %r46, %r45, %r36;
mad.lo.s32 %r47, %r31, 1536, %r46;
cvta.to.global.u64 %rd15, %rd8;
mul.wide.s32 %rd16, %r47, 4;
add.s64 %rd2, %rd15, %rd16;
mov.u32 %r15, 0;
st.global.u32 [%rd2], %r15;
shl.b32 %r48, %r25, 4;
add.s32 %r49, %r48, %r26;
cvt.rn.f32.s32 %f12, %r49;
add.f32 %f1, %f12, %f10;
add.s32 %r50, %r34, %r36;
cvt.rn.f32.s32 %f13, %r50;
add.f32 %f2, %f13, %f11;
mul.lo.s32 %r1, %r20, 18432;
mov.f32 %f86, 0f00000000;
mov.u32 %r64, %r15;
BB0_1:
shl.b32 %r3, %r64, 5;
mov.u32 %r65, %r15;
BB0_2:
mad.lo.s32 %r53, %r64, 3072, %r1;
shl.b32 %r5, %r65, 5;
add.s32 %r6, %r53, %r5;
mov.u32 %r66, %r15;
BB0_3:
add.s32 %r55, %r66, %r3;
cvt.rn.f32.s32 %f14, %r55;
sub.f32 %f15, %f1, %f14;
abs.f32 %f16, %f15;
mov.f32 %f17, 0f3F800000;
max.f32 %f18, %f16, %f17;
sub.f32 %f6, %f17, %f18;
mad.lo.s32 %r56, %r66, 96, %r6;
mul.wide.s32 %rd17, %r56, 4;
add.s64 %rd18, %rd1, %rd17;
mov.u32 %r68, -32;
mov.u32 %r67, %r5;
BB0_4:
cvt.rn.f32.s32 %f19, %r67;
sub.f32 %f20, %f2, %f19;
abs.f32 %f21, %f20;
max.f32 %f23, %f21, %f17;
sub.f32 %f24, %f17, %f23;
mul.f32 %f25, %f6, %f24;
ld.global.nc.f32 %f26, [%rd18];
fma.rn.f32 %f27, %f25, %f26, %f86;
add.s32 %r57, %r67, 1;
cvt.rn.f32.s32 %f28, %r57;
sub.f32 %f29, %f2, %f28;
abs.f32 %f30, %f29;
max.f32 %f31, %f30, %f17;
sub.f32 %f32, %f17, %f31;
mul.f32 %f33, %f6, %f32;
ld.global.nc.f32 %f34, [%rd18+4];
fma.rn.f32 %f35, %f33, %f34, %f27;
add.s32 %r58, %r67, 2;
cvt.rn.f32.s32 %f36, %r58;
sub.f32 %f37, %f2, %f36;
abs.f32 %f38, %f37;
max.f32 %f39, %f38, %f17;
sub.f32 %f40, %f17, %f39;
mul.f32 %f41, %f6, %f40;
ld.global.nc.f32 %f42, [%rd18+8];
fma.rn.f32 %f43, %f41, %f42, %f35;
add.s32 %r59, %r67, 3;
cvt.rn.f32.s32 %f44, %r59;
sub.f32 %f45, %f2, %f44;
abs.f32 %f46, %f45;
max.f32 %f47, %f46, %f17;
sub.f32 %f48, %f17, %f47;
mul.f32 %f49, %f6, %f48;
ld.global.nc.f32 %f50, [%rd18+12];
fma.rn.f32 %f51, %f49, %f50, %f43;
add.s32 %r60, %r67, 4;
cvt.rn.f32.s32 %f52, %r60;
sub.f32 %f53, %f2, %f52;
abs.f32 %f54, %f53;
max.f32 %f55, %f54, %f17;
sub.f32 %f56, %f17, %f55;
mul.f32 %f57, %f6, %f56;
ld.global.nc.f32 %f58, [%rd18+16];
fma.rn.f32 %f59, %f57, %f58, %f51;
add.s32 %r61, %r67, 5;
cvt.rn.f32.s32 %f60, %r61;
sub.f32 %f61, %f2, %f60;
abs.f32 %f62, %f61;
max.f32 %f63, %f62, %f17;
sub.f32 %f64, %f17, %f63;
mul.f32 %f65, %f6, %f64;
ld.global.nc.f32 %f66, [%rd18+20];
fma.rn.f32 %f67, %f65, %f66, %f59;
add.s32 %r62, %r67, 6;
cvt.rn.f32.s32 %f68, %r62;
sub.f32 %f69, %f2, %f68;
abs.f32 %f70, %f69;
max.f32 %f71, %f70, %f17;
sub.f32 %f72, %f17, %f71;
mul.f32 %f73, %f6, %f72;
ld.global.nc.f32 %f74, [%rd18+24];
fma.rn.f32 %f75, %f73, %f74, %f67;
add.s32 %r63, %r67, 7;
cvt.rn.f32.s32 %f76, %r63;
sub.f32 %f77, %f2, %f76;
abs.f32 %f78, %f77;
max.f32 %f79, %f78, %f17;
sub.f32 %f80, %f17, %f79;
mul.f32 %f81, %f6, %f80;
ld.global.nc.f32 %f82, [%rd18+28];
fma.rn.f32 %f86, %f81, %f82, %f75;
add.s64 %rd18, %rd18, 32;
add.s32 %r67, %r67, 8;
add.s32 %r68, %r68, 8;
setp.ne.s32 %p1, %r68, 0;
@%p1 bra BB0_4;
add.s32 %r66, %r66, 1;
setp.lt.s32 %p2, %r66, 32;
@%p2 bra BB0_3;
add.s32 %r65, %r65, 1;
setp.lt.s32 %p3, %r65, 3;
@%p3 bra BB0_2;
add.s32 %r64, %r64, 1;
setp.lt.s32 %p4, %r64, 6;
@%p4 bra BB0_1;
st.global.f32 [%rd2], %f86;
ret;
}
what confuses me is there are only shared memory store instructions but no shared memory load instructions in above ptx code. I suspect that is due to demotion as comments said. But why it is demoted?