I am trying to understand how division is implemented by reading the sass code for the following simple kernel

```
__global__ void SI(double* arrIn,
double* arrOut,
int N)
{
unsigned int pid = threadIdx.x + blockIdx.x * blockDim.x;
double x0 = arrIn[pid];
double y0 = arrIn[pid + N];
double P = (21.18 + 4.37e-4 * y0) * y0;
double result = 1.8847719e-3 * P / x0;
arrOut[pid] = result;
}
```

I compiled this kernel using nvcc 10.2 on sm_35 and its sass code is as follows

```
label0:
MOV R1, c[0x0][0x44];
S2R R0, SR_TID.X;
MOV32I R4, 0x7ae147ae;
S2R R3, SR_CTAID.X;
MOV32I R5, 0x40352e14;
IMAD R0, R3, c[0x0][0x28], R0;
MOV32I R3, 0x8;
ISCADD R6.CC, R0, c[0x0][0x140], 0x3;
MOV32I R8, 0x1;
IADD R2, R0, c[0x0][0x150];
SSY label3;
IMAD.U32.U32.HI.X R7, R0, R3, c[0x0][0x144];
ISCADD R12.CC, R2, c[0x0][0x140], 0x3;
LD.E.64 R6, [R6];
IMAD.U32.U32.HI.X R13, R2, R3, c[0x0][0x144];
LD.E.64 R2, [R12];
MUFU.RCP64H R9, R7;
DFMA R10, -R6, R8, c[0x2][0x10];
DFMA R4, R2, c[0x2][0x0], R4;
DFMA R10, R10, R10, R10;
DMUL R2, R2, R4;
DFMA R8, R8, R10, R8;
DMUL R2, R2, c[0x2][0x8];
DMUL R4, R2, R8;
DFMA R10, -R6, R4, R2;
DFMA R8, R8, R10, R4;
FSET.GTU.AND RZ.CC, |R3|, c[0x2][0x18], PT;
FFMA R4, RZ, R7, R9;
FSETP.GT.AND P0, PT, |R4|, c[0x2][0x1c], PT;
@P0 BRA CC.NEU, label2;
label1:
CAL label4;
label2:
ISCADD.S R2.CC, R0, c[0x0][0x148], 0x3;
label3:
MOV32I R3, 0x8;
IMAD.U32.U32.HI.X R3, R0, R3, c[0x0][0x14c];
ST.E.64 [R2], R8;
EXIT;
label4:
LOP32I.AND R4, R7, 0x40000000;
LOP32I.AND R9, R3, 0x7f800000;
MOV32I R8, 0x5ff00000;
MOV32I R14, 0x1;
MOV R12, RZ;
PBK label10;
ISETP.GE.U32.AND P0, PT, R4, c[0x2][0x20], PT;
MOV R4, RZ;
SEL R5, R8, c[0x2][0x24], !P0;
ISETP.GE.U32.AND P0, PT, R9, c[0x2][0x28], PT;
DMUL R10, R6, R4;
MUFU.RCP64H R15, R11;
SEL R13, R8, c[0x2][0x24], !P0;
DFMA R16, -R10, R14, c[0x2][0x10];
DMUL R12, R2, R12;
DFMA R16, R16, R16, R16;
DFMA R16, R14, R16, R14;
DMUL R14, R12, R16;
DFMA R10, -R10, R14, R12;
DFMA R10, R16, R10, R14;
DSETP.GT.AND P0, PT, |R10|, RZ, PT;
@!P0 BRA label7;
label5:
ISETP.GT.U32.AND P0, PT, R9, c[0x2][0x2c], PT;
DMUL R12, R4, R10;
SEL R9, R8, c[0x2][0x24], P0;
MOV R8, RZ;
DMUL R10, R10, R8;
DMUL R8, R8, R12;
DMUL R4, R4, R10;
DFMA R10, R6, R8, -R2;
DFMA R12, R6, R4, -R2;
DSETP.GT.AND P0, PT, |R10|, |R12|, PT;
SEL R5, R5, R9, P0;
SEL R8, R4, R8, P0;
FSETP.GTU.AND P0, PT, |R5|, 1.469367938527859385e-39, PT;
MOV R9, R5;
@P0 BRK;
label6:
FSETP.GEU.AND P0, PT, |R3|, 1.5046327690525280102e-36, PT;
LOP32I.AND R4, R8, 0xfffffffe;
MOV32I R12, 0x58500000;
MOV R9, R5;
LOP32I.OR R8, R8, 0x1;
SEL R13, R12, c[0x2][0x14], !P0;
MOV R12, RZ;
DMUL R6, R6, R12;
DMUL R2, R2, R12;
DFMA R10, R4, R6, -R2;
DFMA R12, R8, R6, -R2;
DSETP.GT.AND P0, PT, |R10|, |R12|, PT;
SEL R13, R8, R4, P0;
IADD32I R8.CC, R13, 0x1;
LOP32I.AND R4, R13, 0x1;
ISETP.NE.U32.AND P0, PT, R4, 0x1, PT;
IADD.X R9, RZ, R5;
IADD32I R12.CC, R13, -0x1;
SEL R8, R13, R8, P0;
IADD32I.X R4, R5, -0x1;
SEL R9, R5, R9, P0;
SEL R12, R12, R13, P0;
SEL R13, R4, R5, P0;
DFMA R4, R6, R8, -R2;
DFMA R2, R6, R12, -R2;
DSETP.GT.AND P0, PT, |R4|, |R2|, PT;
SEL R8, R12, R8, P0;
SEL R9, R13, R9, P0;
BRK;
label7:
DSETP.NEU.AND P0, PT, R10, RZ, PT;
@!P0 BRA label9;
label8:
MUFU.RCP64H R9, R7;
MOV R8, RZ;
DSETP.GT.AND P1, PT, |R8|, RZ, PT;
@!P1 DSETP.NEU.AND P0, PT, |R6|, +INF , PT;
@!P1 SEL R4, R6, R8, P0;
@!P1 SEL R7, R7, R9, P0;
@!P1 MOV R8, R4;
@!P1 MOV R9, R7;
DMUL R8, R2, R8;
BRK;
label9:
DMUL R8, R2, R6;
BRK;
label10:
RET;
label11:
BRA label11;
label12:
NOP;
```

I can understand everything in label0 except the last four lines

```
FSET.GTU.AND RZ.CC, |R3|, c[0x2][0x18], PT;
FFMA R4, RZ, R7, R9;
FSETP.GT.AND P0, PT, |R4|, c[0x2][0x1c], PT;
@P0 BRA CC.NEU, label2;
```

If this predicate is taken, the control flow falls from label0 to label2 and then to label3, which goes to EXIT.

However, when this predicate is not taken, the control flow will go to label4. Does anyone know what the code following label4 is doing?

Another question is about MUFU.RCP64H R9, R7. It seems that this is the starting point of approximating a division using Newton-Raphson’s method and this instruction is supposed to compute an approximation of 1/x, where x is saved in R7:R6. However, I do not understand how this is done by performing reciprocal on the upper half of the 64-bit operand. It is highly appreciated if anyone can shed some light on this.