I wrote a simple kernel to test IMAD.HI instruction as follows:
//Used to generate IMAD.HI SASS instruction.
__device__ uint32_t IMAD_HI(uint32_t a, uint32_t b, uint32_t c){
return ((uint64_t)a * (uint64_t)b + (uint64_t)c) >> 32;
}
constexpr int ILP = 2;
__constant__ uint32_t v[ILP];
__global__
void kernel_IMAD_HI(uint32_t *pdata) {
uint32_t a[ILP], b[ILP],c[ILP];
//Initialize a[ILP], b[ILP],c[ILP] with value pointed by pdata.
for (int i = 0; i < 200000; i++) {
for (int j = 0; j < ILP; j++) {
a[j] = IMAD_HI(a[j], v[j%ILP], c[(j+1)%ILP]);
}
for (int j = 0; j < ILP; j++) {
b[j] = IMAD_HI(b[j], v[j%ILP], a[(j+1)%ILP]);
}
for (int j = 0; j < ILP; j++) {
c[j] = IMAD_HI(c[j], v[j%ILP], b[(j+1)%ILP]);
}
}
//Write to global memory
}
The main part of the for loop is compiled into the following SASS code:
/*0290*/ IMAD.HI.U32 R8, R8, c[0x3][0x4], R14 ; /* 0x00c0010008087a27 */
/* 0x000fc800078e000e */
/*02a0*/ IMAD.HI.U32 R6, R6, c[0x3][0x8], R16 ; /* 0x00c0020006067a27 */
/* 0x000fe200078e0010 */
/*02b0*/ IADD3 R8, R8, UR4, RZ ; /* 0x0000000408087c10 */
/* 0x000fc8000fffe0ff */
/*02c0*/ IADD3 R6, R6, UR5, RZ ; /* 0x0000000506067c10 */
/* 0x000fe2000fffe0ff */
/*02d0*/ IMAD.HI.U32 R10, R10, c[0x3][0x8], R8 ; /* 0x00c002000a0a7a27 */
/* 0x000fc800078e0008 */
/*02e0*/ IMAD.HI.U32 R12, R12, c[0x3][0x4], R6 ; /* 0x00c001000c0c7a27 */
/* 0x000fe200078e0006 */
/*02f0*/ IADD3 R10, R10, UR5, RZ ; /* 0x000000050a0a7c10 */
/* 0x000fc8000fffe0ff */
/*0300*/ IADD3 R12, R12, UR4, RZ ; /* 0x000000040c0c7c10 */
/* 0x000fe2000fffe0ff */
/*0310*/ IMAD.HI.U32 R16, R16, c[0x3][0x4], R10 ; /* 0x00c0010010107a27 */
/* 0x000fc800078e000a */
/*0320*/ IMAD.HI.U32 R14, R14, c[0x3][0x8], R12 ; /* 0x00c002000e0e7a27 */
/* 0x000fe200078e000c */
/*0330*/ IADD3 R16, R16, UR4, RZ ; /* 0x0000000410107c10 */
/* 0x000fc8000fffe0ff */
/*0340*/ IADD3 R14, R14, UR5, RZ ; /* 0x000000050e0e7c10 */
As the UR4 and UR5 uniform register is set to zero(by instruction UMOV UR4, URZ and UMOV UR5, URZ),
the instructions like IADD3 R16, R16, UR4, RZ ; seems meaningless.
What’s the effect of these IADD3 instructions?
Why are these instructions needed here?