Hi there, I used Decuda to disassemble a cubin file (see code below).
// Disassembling _Z4testPi
.entry _Z4testPi
{
.lmem 0
.smem 24
.reg 2
.bar 0
mov.b32 $r0, s[0x0010]
mov.u32 $r0, g[$r0]
mul24.lo.u32.u16.u16 $r1, $r0.lo, $r0.hi
mad24.lo.u32.u16.u16.u32 $r1, $r0.hi, $r0.lo, $r1
shl.u32 $r1, $r1, 0x00000010
set.ne.s32 $p0|$o127, $r0, c1[0x0000]
mad24.lo.u32.u16.u16.u32 $r1, $r0.lo, $r0.lo, $r1
@$p0.ne return
add.half.b32 $r1, $r0, $r1
mov.half.b32 $r0, s[0x0010]
add.b32 $r1, $r1, 0x00000001
mov.end.u32 g[$r0], $r1
#.constseg 1:0x0000 const
#{
#d.u32 0x00000001 // 0000
#}
For most parts of the code I'm OK; however, I am confused by some of the 'identifiers' used to describe memory or pointers (e.g. s[0x0010], c1[0x0000], $p0|$o127) and by lines like these:
shl.u32 $r1, $r1, 0x00000010
…
set.ne.s32 $p0|$o127, $r0, c1[0x0000]
…
@$p0.ne return
…
Thanks.
PS: the original kernel in CUDA is:
__global__ void test(int *par) {
    int y;
    int x = *par;
    int z = *par * *par;
    if (x == 1) {
        y = x + z + 1;
        *par = y;
    }
}
I was just trying to see the optimizations the compiler does, so the kernel may look a bit inefficient.
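In case it helps to show which parts I do follow: my reading of the mul24/mad24/shl sequence is that it builds the 32-bit product x*x out of 16-bit halves (as far as I understand, compute 1.x hardware only multiplies 24-bit operands natively). Here is a plain-C sketch of that reading; square_from_halves is just a name I made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Plain-C sketch of the mul24/mad24/shl sequence from the dump above. */
static uint32_t square_from_halves(uint32_t x)
{
    uint32_t lo = x & 0xFFFFu;   /* $r0.lo */
    uint32_t hi = x >> 16;       /* $r0.hi */
    uint32_t r1 = lo * hi;       /* mul24.lo.u32.u16.u16 $r1, $r0.lo, $r0.hi */
    r1 = hi * lo + r1;           /* mad24.lo.u32.u16.u16.u32 $r1, $r0.hi, $r0.lo, $r1 */
    r1 <<= 16;                   /* shl.u32 $r1, $r1, 0x00000010 */
    r1 = lo * lo + r1;           /* mad24.lo.u32.u16.u16.u32 $r1, $r0.lo, $r0.lo, $r1 */
    return r1;                   /* equals x * x modulo 2^32 */
}

int main(void)
{
    uint32_t x = 123456789u;
    printf("%u\n%u\n", square_from_halves(x), x * x);  /* prints the same value twice */
    return 0;
}

So the arithmetic I can follow; it is mainly the memory/register notation mentioned above that I am unsure about.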