Yes, here’s the SASS comparison between the two kernels. In addition to verifying the 128-bit loads, it sheds light on what other factors might influence performance (e.g. whether there is extra integer-instruction pressure arising from other code…)
$ cuobjdump -sass t900.o
Fatbin elf code:
================
arch = sm_20
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_20
Function : _Z9vectorAddPK7double4S1_PS_i
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x38], PT; /* 0x1b0e4000e001dc23 */
/*0028*/ @P0 BRA.U 0xb8; /* 0x40000002200081e7 */
/*0030*/ @!P0 MOV32I R3, 0x20; /* 0x180000008000e1e2 */
/*0038*/ @!P0 IMAD R22.CC, R0, R3, c[0x0][0x20]; /* 0x200780008005a0a3 */
/*0040*/ @!P0 IMAD.HI.X R23, R0, R3, c[0x0][0x24]; /* 0x208680009005e0e3 */
/*0048*/ @!P0 IMAD R20.CC, R0, R3, c[0x0][0x28]; /* 0x20078000a00520a3 */
/*0050*/ @!P0 LD.E.128 R16, [R22]; /* 0x84000000016420c5 */
/*0058*/ @!P0 IMAD.HI.X R21, R0, R3, c[0x0][0x2c]; /* 0x20868000b00560e3 */
/*0060*/ @!P0 LD.E.128 R8, [R22+0x10]; /* 0x84000000416220c5 */
/*0068*/ @!P0 IMAD R2.CC, R0, R3, c[0x0][0x30]; /* 0x20078000c000a0a3 */
/*0070*/ @!P0 LD.E.128 R12, [R20]; /* 0x84000000014320c5 */
/*0078*/ @!P0 LD.E.128 R4, [R20+0x10]; /* 0x84000000414120c5 */
/*0080*/ @!P0 IMAD.HI.X R3, R0, R3, c[0x0][0x34]; /* 0x20868000d000e0e3 */
/*0088*/ @!P0 DADD R14, R18, R14; /* 0x480000003923a001 */
/*0090*/ @!P0 DADD R12, R16, R12; /* 0x4800000031032001 */
/*0098*/ @!P0 DADD R6, R10, R6; /* 0x4800000018a1a001 */
/*00a0*/ @!P0 DADD R4, R8, R4; /* 0x4800000010812001 */
/*00a8*/ @!P0 ST.E.128 [R2], R12; /* 0x94000000002320c5 */
/*00b0*/ @!P0 ST.E.128 [R2+0x10], R4; /* 0x94000000402120c5 */
/*00b8*/ EXIT; /* 0x8000000000001de7 */
..............................................
Function : _Z9vectorAddPKdS0_S0_S0_S0_S0_S0_S0_PdS1_S1_S1_i
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x80], PT; /* 0x1b0e40020001dc23 */
/*0028*/ @P0 BRA.U 0x178; /* 0x40000005200081e7 */
/*0030*/ @!P0 MOV32I R12, 0x8; /* 0x18000000200321e2 */
/*0038*/ @!P0 IMAD R4.CC, R0, R12, c[0x0][0x20]; /* 0x20198000800120a3 */
/*0040*/ @!P0 IMAD.HI.X R5, R0, R12, c[0x0][0x24]; /* 0x20988000900160e3 */
/*0048*/ @!P0 IMAD R14.CC, R0, R12, c[0x0][0x40]; /* 0x201980010003a0a3 */
/*0050*/ @!P0 LD.E.64 R10, [R4]; /* 0x840000000042a0a5 */
/*0058*/ @!P0 IMAD.HI.X R15, R0, R12, c[0x0][0x44]; /* 0x209880011003e0e3 */
/*0060*/ @!P0 IMAD R6.CC, R0, R12, c[0x0][0x60]; /* 0x201980018001a0a3 */
/*0068*/ @!P0 LD.E.64 R8, [R14]; /* 0x8400000000e220a5 */
/*0070*/ @!P0 IMAD.HI.X R7, R0, R12, c[0x0][0x64]; /* 0x209880019001e0e3 */
/*0078*/ @!P0 IMAD R2.CC, R0, R12, c[0x0][0x28]; /* 0x20198000a000a0a3 */
/*0080*/ @!P0 IMAD.HI.X R3, R0, R12, c[0x0][0x2c]; /* 0x20988000b000e0e3 */
/*0088*/ @!P0 IMAD R4.CC, R0, R12, c[0x0][0x48]; /* 0x20198001200120a3 */
/*0090*/ @!P0 IMAD.HI.X R5, R0, R12, c[0x0][0x4c]; /* 0x20988001300160e3 */
/*0098*/ @!P0 IMAD R16.CC, R0, R12, c[0x0][0x68]; /* 0x20198001a00420a3 */
/*00a0*/ @!P0 IMAD.HI.X R17, R0, R12, c[0x0][0x6c]; /* 0x20988001b00460e3 */
/*00a8*/ @!P0 IMAD R14.CC, R0, R12, c[0x0][0x30]; /* 0x20198000c003a0a3 */
/*00b0*/ @!P0 IMAD.HI.X R15, R0, R12, c[0x0][0x34]; /* 0x20988000d003e0e3 */
/*00b8*/ @!P0 DADD R18, R10, R8; /* 0x4800000020a4a001 */
/*00c0*/ @!P0 ST.E.64 [R6], R18; /* 0x940000000064a0a5 */
/*00c8*/ @!P0 LD.E.64 R8, [R2]; /* 0x84000000002220a5 */
/*00d0*/ @!P0 LD.E.64 R10, [R4]; /* 0x840000000042a0a5 */
/*00d8*/ @!P0 IMAD R2.CC, R0, R12, c[0x0][0x50]; /* 0x201980014000a0a3 */
/*00e0*/ @!P0 IMAD.HI.X R3, R0, R12, c[0x0][0x54]; /* 0x209880015000e0e3 */
/*00e8*/ @!P0 DADD R18, R8, R10; /* 0x480000002884a001 */
/*00f0*/ @!P0 IMAD R10.CC, R0, R12, c[0x0][0x70]; /* 0x20198001c002a0a3 */
/*00f8*/ @!P0 ST.E.64 [R16], R18; /* 0x940000000104a0a5 */
/*0100*/ @!P0 LD.E.64 R4, [R14]; /* 0x8400000000e120a5 */
/*0108*/ @!P0 LD.E.64 R6, [R2]; /* 0x840000000021a0a5 */
/*0110*/ @!P0 IMAD.HI.X R11, R0, R12, c[0x0][0x74]; /* 0x20988001d002e0e3 */
/*0118*/ @!P0 IMAD R8.CC, R0, R12, c[0x0][0x38]; /* 0x20198000e00220a3 */
/*0120*/ @!P0 IMAD.HI.X R9, R0, R12, c[0x0][0x3c]; /* 0x20988000f00260e3 */
/*0128*/ @!P0 IMAD R2.CC, R0, R12, c[0x0][0x58]; /* 0x201980016000a0a3 */
/*0130*/ @!P0 IMAD.HI.X R3, R0, R12, c[0x0][0x5c]; /* 0x209880017000e0e3 */
/*0138*/ @!P0 IMAD R16.CC, R0, R12, c[0x0][0x78]; /* 0x20198001e00420a3 */
/*0140*/ @!P0 IMAD.HI.X R17, R0, R12, c[0x0][0x7c]; /* 0x20988001f00460e3 */
/*0148*/ @!P0 DADD R14, R4, R6; /* 0x480000001843a001 */
/*0150*/ @!P0 ST.E.64 [R10], R14; /* 0x9400000000a3a0a5 */
/*0158*/ @!P0 LD.E.64 R4, [R8]; /* 0x84000000008120a5 */
/*0160*/ @!P0 LD.E.64 R6, [R2]; /* 0x840000000021a0a5 */
/*0168*/ @!P0 DADD R2, R4, R6; /* 0x480000001840a001 */
/*0170*/ @!P0 ST.E.64 [R16], R2; /* 0x940000000100a0a5 */
/*0178*/ EXIT; /* 0x8000000000001de7 */
.................................................................
Fatbin ptx code:
================
arch = sm_20
code version = [4,2]
producer = cuda
host = linux
compile_size = 64bit
compressed
$