Can NVCC safely remove the first store to a[0]
in this code? There are no loads from a
after this store, only a store at the end that overwrites the previous value.
__global__ void kernel(int* a, int N) {
a[0] = 0;
int temp = 0;
for (int j = 0; j < N; j ++) {
temp += 3;
}
a[0] = temp;
}
Compiled with NVCC 12.0 using nvcc -O3 -arch=sm_80
, I get this PTX and SASS (can also be seen on godbolt):
.version 8.0
.target sm_80
.address_size 64
// .globl _Z6kernelPii
.visible .entry _Z6kernelPii(
.param .u64 _Z6kernelPii_param_0,
.param .u32 _Z6kernelPii_param_1
)
{
.reg .pred %p<6>;
.reg .b32 %r<27>;
.reg .b64 %rd<3>;
ld.param.u64 %rd2, [_Z6kernelPii_param_0];
ld.param.u32 %r14, [_Z6kernelPii_param_1];
cvta.to.global.u64 %rd1, %rd2;
mov.u32 %r26, 0;
st.global.u32 [%rd1], %r26;
setp.lt.s32 %p1, %r14, 1;
@%p1 bra $L__BB0_7;
add.s32 %r18, %r14, -1;
and.b32 %r25, %r14, 3;
setp.lt.u32 %p2, %r18, 3;
mov.u32 %r26, 0;
@%p2 bra $L__BB0_4;
sub.s32 %r21, %r14, %r25;
$L__BB0_3:
add.s32 %r26, %r26, 12;
add.s32 %r21, %r21, -4;
setp.ne.s32 %p3, %r21, 0;
@%p3 bra $L__BB0_3;
$L__BB0_4:
setp.eq.s32 %p4, %r25, 0;
@%p4 bra $L__BB0_7;
$L__BB0_6:
.pragma "nounroll";
add.s32 %r26, %r26, 3;
add.s32 %r25, %r25, -1;
setp.ne.s32 %p5, %r25, 0;
@%p5 bra $L__BB0_6;
$L__BB0_7:
st.global.u32 [%rd1], %r26;
ret;
}
kernel(int*, int):
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28]
IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x168]
ULDC.64 UR4, c[0x0][0x118]
IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x160]
IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164]
ISETP.GE.AND P0, PT, R0, 0x1, PT
IMAD.MOV.U32 R5, RZ, RZ, RZ
STG.E [R2.64], RZ
@!P0 BRA `(.L_x_0)
IADD3 R4, R0.reuse, -0x1, RZ
IMAD.MOV.U32 R5, RZ, RZ, RZ
LOP3.LUT R0, R0, 0x3, RZ, 0xc0, !PT
ISETP.GE.U32.AND P0, PT, R4, 0x3, PT
@!P0 BRA `(.L_x_1)
IADD3 R4, -R0, c[0x0][0x168], RZ
ISETP.GT.AND P0, PT, R4, RZ, PT
@!P0 BRA `(.L_x_2)
ISETP.GT.AND P1, PT, R4, 0xc, PT
PLOP3.LUT P0, PT, PT, PT, PT, 0x80, 0x0
@!P1 BRA `(.L_x_3)
PLOP3.LUT P0, PT, PT, PT, PT, 0x8, 0x0
.L_x_4:
IADD3 R4, R4, -0x10, RZ
IADD3 R5, R5, 0x30, RZ
ISETP.GT.AND P1, PT, R4, 0xc, PT
@P1 BRA `(.L_x_4)
.L_x_3:
ISETP.GT.AND P1, PT, R4, 0x4, PT
@P1 PLOP3.LUT P0, PT, PT, PT, PT, 0x8, 0x0
@P1 IADD3 R4, R4, -0x8, RZ
@P1 IADD3 R5, R5, 0x18, RZ
ISETP.NE.OR P0, PT, R4, RZ, P0
@!P0 BRA `(.L_x_1)
.L_x_2:
IADD3 R4, R4, -0x4, RZ
IADD3 R5, R5, 0xc, RZ
ISETP.NE.AND P0, PT, R4, RZ, PT
@P0 BRA `(.L_x_2)
.L_x_1:
ISETP.NE.AND P0, PT, R0, RZ, PT
@!P0 BRA `(.L_x_0)
.L_x_5:
IADD3 R0, R0, -0x1, RZ
IADD3 R5, R5, 0x3, RZ
ISETP.NE.AND P0, PT, R0, RZ, PT
@P0 BRA `(.L_x_5)
.L_x_0:
STG.E [R2.64], R5
EXIT
.L_x_6:
BRA `(.L_x_6)
There are two store instructions and no loads, so could the first store be removed? If I replace the loop range N
with a constant, then NVCC optimizes the loop away and only stores the final value of temp
to a
.