What SASS does your code produce on your card? For me (sm_52, compute_52, CUDA 9.1), there’s no difference (the code is 100% identical, down to variable names) between your 64x64 code and this much simpler code:
// Schoolbook multiply of two 2-limb operands into a 4-limb result, written as
// inline PTX using the carry-flag (CC.CF) extended-precision instructions.
// n[0] is treated as the least-significant 32-bit limb.
//
// Operand map (from the constraint lists below):
//   %0..%3 = c.n[0]..c.n[3]   (outputs)
//   %4, %5 = a.n[0], a.n[1]   (inputs)
//   %6, %7 = b.n[0], b.n[1]   (inputs)
//
// Instruction suffixes: ".cc" writes the carry-out to CC.CF; "madc"/"addc"
// add the incoming CC.CF. An instruction without ".cc" leaves no carry for
// the next one, which is how each chain is terminated.
//
// NOTE(review): the outputs use plain "=r" yet %1..%3 are written before the
// last reads of the inputs %4..%7 — confirm the toolchain never assigns an
// output the same register as an input.
asm volatile (
// Row 0, low halves: c0 = lo(b0*a0); c1 = lo(b0*a1) + CF; c2 = CF.
// The addend is 0 in both multiplies, so this chain only initialises c0..c2.
"mad.lo.cc.u32 %0, %6, %4, 0;\n\t"
"madc.lo.cc.u32 %1, %6, %5, 0;\n\t"
"addc.u32 %2, 0, 0;\n\t"
// Row 0, high halves: c1 += hi(b0*a0) (sets CF); c2 += hi(b0*a1) + CF.
"mad.hi.cc.u32 %1, %6, %4, %1;\n\t"
"madc.hi.u32 %2, %6, %5, %2;\n\t"
// Row 1, low halves: c1 += lo(b1*a0) (sets CF); c2 += lo(b1*a1) + CF
// (sets CF); c3 = CF, i.e. c3 starts as the carry out of c2.
"mad.lo.cc.u32 %1, %7, %4, %1;\n\t"
"madc.lo.cc.u32 %2, %7, %5, %2;\n\t"
"addc.u32 %3, 0, 0;\n\t"
// Row 1, high halves: c2 += hi(b1*a0) (sets CF); c3 += hi(b1*a1) + CF.
// The final madc has no ".cc": nothing consumes a carry out of c3.
"mad.hi.cc.u32 %2, %7, %4, %2;\n\t"
"madc.hi.u32 %3, %7, %5, %3;\n\t"
:
// Outputs: the four result limbs, least-significant first.
"=r"(c.n[0]), "=r"(c.n[1]), "=r"(c.n[2]), "=r"(c.n[3])
:
// Inputs: the two low limbs of each operand ("r" = 32-bit register).
"r"(a.n[0]), "r"(a.n[1]),
"r"(b.n[0]), "r"(b.n[1]));
This is the SASS generated on my system:
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = windows
compile_size = 64bit
code for sm_52
Function : _Z10testKernelP9uint256_tS0_
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001fd800fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV R6, c[0x0][0x140]; /* 0x4c98078005070006 */
/*0018*/ IADD32I R6.CC, R6, 0x4; /* 0x1c10000000470606 */
/* 0x0001c4000e2007f2 */
/*0028*/ IADD.X R7, RZ, c[0x0][0x144]; /* 0x4c1008000517ff07 */
/*0030*/ LDG.E R2, [R6+-0x4]; /* 0xeed42fffffc70602 */
/*0038*/ LDG.E R9, [R6+0x1c]; /* 0xeed4200001c70609 */
/* 0x011f840016400091 */
/*0048*/ LDG.E R0, [R6]; /* 0xeed4200000070600 */
/*0050*/ LDG.E R5, [R6+0x20]; /* 0xeed4200002070605 */
/*0058*/ XMAD R3, R9, R2, RZ; /* 0x5b007f8000270903 */
/* 0x081fd0d0fe2607f1 */
/*0068*/ XMAD.MRG R4, R9.reuse, R2.H1.reuse, RZ; /* 0x5b007fa800270904 */
/*0070*/ XMAD R10, R9.reuse, R0.reuse, RZ; /* 0x5b007f800007090a */
/*0078*/ XMAD.MRG R8, R9.reuse, R0.H1, RZ; /* 0x5b007fa800070908 */
/* 0x001f8440fe2a07f1 */
/*0088*/ XMAD.PSL.CBCC R4, R9.H1.reuse, R4.H1, R3.reuse; /* 0x5b30019800470904 */
/*0090*/ XMAD.CHI R13, R9.H1.reuse, R2, R3; /* 0x5b2801800027090d */
/*0098*/ XMAD.PSL.CBCC R8, R9.H1, R8.H1, R10; /* 0x5b30051800870908 */
/* 0x001fc440fe4607f1 */
/*00a8*/ XMAD R11, R9.reuse, R2.H1.reuse, RZ; /* 0x5b007f880027090b */
/*00b0*/ XMAD R12, R9.H1.reuse, R2.H1, RZ; /* 0x5b207f880027090c */
/*00b8*/ IADD R3.CC, RZ, R4; /* 0x5c1080000047ff03 */
/* 0x001f84c0fe260ff1 */
/*00c8*/ XMAD R6, R9.reuse, R0.H1.reuse, RZ; /* 0x5b007f8800070906 */
/*00d0*/ XMAD.CHI R7, R9.H1.reuse, R0.reuse, R10; /* 0x5b28050000070907 */
/*00d8*/ XMAD R9, R9.H1, R0.H1, RZ; /* 0x5b207f8800070909 */
/* 0x1c1fc400fda007f2 */
/*00e8*/ IADD3.RS R11, R13, R11, R12; /* 0x5cc0062000b70d0b */
/*00f0*/ IADD.X R4.CC, RZ, R8; /* 0x5c1088000087ff04 */
/*00f8*/ XMAD R8, R5.reuse, R2.reuse, RZ; /* 0x5b007f8000270508 */
/* 0x001fc400fe2007f1 */
/*0108*/ XMAD.MRG R10, R5, R2.H1, RZ; /* 0x5b007fa80027050a */
/*0110*/ IADD3.RS R7, R7, R6, R9; /* 0x5cc004a000670707 */
/*0118*/ IADD.X R6, RZ, RZ; /* 0x5c1008000ff7ff06 */
/* 0x181fc540fe2007e3 */
/*0128*/ IADD R4.CC, R4, R11; /* 0x5c10800000b70404 */
/*0130*/ XMAD.PSL.CBCC R10, R5.H1.reuse, R10.H1, R8.reuse; /* 0x5b30041800a7050a */
/*0138*/ XMAD R11, R5.reuse, R0.reuse, RZ; /* 0x5b007f800007050b */
/* 0x001fc800fe2207f1 */
/*0148*/ XMAD.MRG R12, R5.reuse, R0.H1, RZ; /* 0x5b007fa80007050c */
/*0150*/ XMAD.CHI R9, R5.H1, R2, R8; /* 0x5b28040000270509 */
/*0158*/ IADD.X R6, R6, R7; /* 0x5c10080000770606 */
/* 0x281fc4c0fe2007e1 */
/*0168*/ IADD R7.CC, R4, R10; /* 0x5c10800000a70407 */
/*0170*/ XMAD R4, R5.reuse, R2.H1.reuse, RZ; /* 0x5b007f8800270504 */
/*0178*/ XMAD.PSL.CBCC R12, R5.H1.reuse, R12.H1, R11.reuse; /* 0x5b30059800c7050c */
/* 0x181fc4c0fe2207f1 */
/*0188*/ XMAD R2, R5.H1.reuse, R2.H1, RZ; /* 0x5b207f8800270502 */
/*0190*/ XMAD R8, R5.reuse, R0.H1.reuse, RZ; /* 0x5b007f8800070508 */
/*0198*/ XMAD R10, R5.H1.reuse, R0.H1.reuse, RZ; /* 0x5b207f880007050a */
/* 0x001fcc00fe2007e2 */
/*01a8*/ XMAD.CHI R5, R5.H1, R0, R11; /* 0x5b28058000070505 */
/*01b0*/ IADD.X R6.CC, R6, R12; /* 0x5c10880000c70606 */
/*01b8*/ IADD3.RS R4, R9, R4, R2; /* 0x5cc0012000470904 */
/* 0x001fc400fe2007f2 */
/*01c8*/ IADD3.RS R2, R5, R8, R10; /* 0x5cc0052000870502 */
/*01d0*/ IADD.X R0, RZ, RZ; /* 0x5c1008000ff7ff00 */
/*01d8*/ IADD R6.CC, R6, R4; /* 0x5c10800000470606 */
/* 0x0003c800fe4007e1 */
/*01e8*/ MOV R4, c[0x0][0x148]; /* 0x4c98078005270004 */
/*01f0*/ MOV R5, c[0x0][0x14c]; /* 0x4c98078005370005 */
/*01f8*/ STG.E [R4], R3; /* 0xeedc200000070403 */
/* 0x0003c4001e2007f0 */
/*0208*/ { IADD.X R2, R0, R2; /* 0x5c10080000270002 */
/*0210*/ STG.E [R4+0x4], R7; } /* 0xeedc200000470407 */
/*0218*/ STG.E [R4+0x8], R6; /* 0xeedc200000870406 */
/* 0x001ffc00ffe000f5 */
/*0228*/ STG.E [R4+0xc], R2; /* 0xeedc200000c70402 */
/*0230*/ EXIT; /* 0xe30000000007000f */
/*0238*/ BRA 0x238; /* 0xe2400fffff87000f */
.......................................