The code is actually longer because it also includes the warp based minimum search
/* 0x2280428272028007 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x2800400110005de4 */
/*0010*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0018*/ IMUL R2, R0, c[0x0][0x28]; /* 0x50004000a0009ca3 */
/*0020*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0028*/ SHR.U32 R3, R2, 0x5; /* 0x5800c0001420dc03 */
/*0030*/ IMAD.U32.U32.HI R9, R0, c[0x2][0x0], R3; /* 0x2006480000025c43 */
/*0038*/ LOP.AND R10, R0, 0x1f; /* 0x6800c0007c029c03 */
/* 0x2232c0428042e2c7 */
/*0048*/ ISETP.GE.U32.AND P0, PT, R9, c[0x0][0x178], PT; /* 0x1b0e4005e091dc03 */
/*0050*/ @P0 EXIT; /* 0x80000000000001e7 */
/*0058*/ MOV32I R7, 0x4; /* 0x180000001001dde2 */
/*0060*/ ISETP.LT.U32.AND P0, PT, R10, c[0x0][0x17c], PT; /* 0x188e4005f0a1dc03 */
/*0068*/ IMAD.U32.U32 R2.CC, R9, R7, c[0x0][0x148]; /* 0x200f800520909c03 */
/*0070*/ SSY 0x320; /* 0x6000000aa0000007 */
/*0078*/ IMAD.U32.U32.HI.X R3, R9, R7, c[0x0][0x14c]; /* 0x208e80053090dc43 */
/* 0x2200400202c232c7 */
/*0088*/ IMAD.U32.U32 R4.CC, R9, R7, c[0x0][0x158]; /* 0x200f800560911c03 */
/*0090*/ IMAD.U32.U32.HI.X R5, R9, R7, c[0x0][0x15c]; /* 0x208e800570915c43 */
/*0098*/ IMAD.U32.U32 R6.CC, R9, R7, c[0x0][0x160]; /* 0x200f800580919c03 */
/*00a0*/ IMAD.U32.U32.HI.X R7, R9, R7, c[0x0][0x164]; /* 0x208e80059091dc43 */
/*00a8*/ @P0 BRA 0xd0; /* 0x40000000800001e7 */
/*00b0*/ MOV32I R8, 0xffffffff; /* 0x1bfffffffc021de2 */
/*00b8*/ BRA 0x318; /* 0x4000000960001de7 */
/* 0x2002322042004007 */
/*00c8*/ NOP; /* 0x4000000000001de4 */
/*00d0*/ LD.E R15, [R2]; /* 0x840000000023dc85 */
/*00d8*/ MOV32I R8, 0xffffffff; /* 0x1bfffffffc021de2 */
/*00e0*/ MOV32I R17, 0x4; /* 0x1800000010045de2 */
/*00e8*/ MOV32I R19, 0x4; /* 0x180000001004dde2 */
/*00f0*/ LD.E R14, [R4]; /* 0x8400000000439c85 */
/*00f8*/ LD.E R11, [R6]; /* 0x840000000062dc85 */
/* 0x22c043704282c047 */
/*0108*/ IMAD.U32.U32 R4.CC, R10, R17, c[0x0][0x150]; /* 0x2023800540a11c03 */
/*0110*/ SSY 0x200; /* 0x60000003a0000007 */
/*0118*/ IMAD.U32.U32.HI.X R5, R10, R17, c[0x0][0x154]; /* 0x20a2800550a15c43 */
/*0120*/ LD.E R2, [R4]; /* 0x8400000000409c85 */
/*0128*/ MOV R16, c[0x3][0x4]; /* 0x28004c0010041de4 */
/*0130*/ ISETP.GT.U32.AND P0, PT, R15, R2, PT; /* 0x1a0e000008f1dc03 */
/*0138*/ ISUB R3, R2, R15; /* 0x480000003c20dd03 */
/* 0x228282828002c287 */
/*0148*/ @P0 ISUB R3, R15, R2; /* 0x4800000008f0c103 */
/*0150*/ ISETP.GT.U32.AND P0, PT, R3, c[0x3][0x0], PT; /* 0x1a0e4c000031dc03 */
/*0158*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*0160*/ I2F.F32.U32.RP R2, c[0x3] [0x0]; /* 0x18044c0001209c04 */
/*0168*/ MUFU.RCP R2, R2; /* 0xc800000010209c00 */
/*0170*/ IADD32I R2, R2, 0xffffffe; /* 0x083ffffff8209c02 */
/*0178*/ F2I.FTZ.U32.F32.TRUNC R2, R2; /* 0x1486000009209c04 */
/* 0x22c2828282720287 */
/*0188*/ IMUL.U32.U32 R4, R2, c[0x3][0x0]; /* 0x50004c0000211c03 */
/*0190*/ I2I.S32.S32 R4, -R4; /* 0x1c00000011211f84 */
/*0198*/ IMUL R5, R3, c[0x3][0x4]; /* 0x50004c0010315ca3 */
/*01a0*/ IMAD.U32.U32.HI R2, R2, R4, R2; /* 0x2004000010209c43 */
/*01a8*/ IMUL.U32.U32.HI R3, R2, R5; /* 0x500000001420dc43 */
/*01b0*/ IMAD.U32.U32 R2, -R3, c[0x3][0x0], R5; /* 0x200a4c0000309e03 */
/*01b8*/ ISETP.GE.U32.AND P0, PT, R2, c[0x3][0x0], PT; /* 0x1b0e4c000021dc03 */
/* 0x2002c042c0428047 */
/*01c8*/ @P0 ISUB R2, R2, c[0x3][0x0]; /* 0x48004c0000208103 */
/*01d0*/ @P0 IADD R3, R3, 0x1; /* 0x4800c0000430c003 */
/*01d8*/ ISETP.GE.U32.AND P1, PT, R2, c[0x3][0x0], PT; /* 0x1b0e4c000023dc03 */
/*01e0*/ LOP.PASS_B R2, RZ, ~c[0x3][0x0]; /* 0x68004c0003f09dc3 */
/*01e8*/ ISETP.NE.U32.AND P0, PT, RZ, c[0x3][0x0], PT; /* 0x1a8e4c0003f1dc03 */
/*01f0*/ @P1 IADD R3, R3, 0x1; /* 0x4800c0000430c403 */
/*01f8*/ SEL.S R16, R2, R3, !P0; /* 0x201000000c241c14 */
/* 0x22628272423042c7 */
/*0208*/ IMAD.U32.U32 R2.CC, R10, R19, c[0x0][0x168]; /* 0x20278005a0a09c03 */
/*0210*/ IMAD.U32.U32.HI.X R3, R10, R19, c[0x0][0x16c]; /* 0x20a68005b0a0dc43 */
/*0218*/ SSY 0x2e8; /* 0x6000000320000007 */
/*0220*/ IMAD.U32.U32 R4.CC, R10, R19, c[0x0][0x170]; /* 0x20278005c0a11c03 */
/*0228*/ LD.E R2, [R2]; /* 0x8400000000209c85 */
/*0230*/ IMAD.U32.U32.HI.X R5, R10, R19, c[0x0][0x174]; /* 0x20a68005d0a15c43 */
/*0238*/ LD.E R4, [R4]; /* 0x8400000000411c85 */
/* 0x228272404232c047 */
/*0248*/ ISETP.GT.U32.AND P1, PT, R14, R2, PT; /* 0x1a0e000008e3dc03 */
/*0250*/ ISUB R3, R2, R14; /* 0x480000003820dd03 */
/*0258*/ @P1 ISUB R3, R14, R2; /* 0x4800000008e0c503 */
/*0260*/ ISETP.GT.U32.AND P0, PT, R11, R4, PT; /* 0x1a0e000010b1dc03 */
/*0268*/ ISUB R2, R4, R11; /* 0x480000002c409d03 */
/*0270*/ IMUL R3, R3, R3; /* 0x500000000c30dca3 */
/*0278*/ @P0 ISUB R2, R11, R4; /* 0x4800000010b08103 */
/* 0x22004002c2c28047 */
/*0288*/ MOV R4, c[0x3][0x10]; /* 0x28004c0040011de4 */
/*0290*/ IMAD R2, R2, R2, R3; /* 0x2006000008209ca3 */
/*0298*/ ISUB RZ.CC, R2, c[0x3][0x8]; /* 0x48014c00202fdd03 */
/*02a0*/ ISETP.GT.X.AND P0, PT, RZ, c[0x3][0xc], PT; /* 0x1a0e4c0033f1dc63 */
/*02a8*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*02b0*/ IMUL R4, R2, c[0x3][0x10]; /* 0x50004c0040211ca3 */
/*02b8*/ MOV R5, RZ; /* 0x28000000fc015de4 */
/* 0x22c2720002e00047 */
/*02c8*/ MOV R7, c[0x3][0xc]; /* 0x28004c003001dde4 */
/*02d0*/ MOV R6, c[0x3][0x8]; /* 0x28004c0020019de4 */
/*02d8*/ CAL 0x3a8; /* 0x5000000320010007 */
/*02e0*/ NOP.S; /* 0x4000000000001df4 */
/*02e8*/ IADD R10, R10, 0x20; /* 0x4800c00080a29c03 */
/*02f0*/ IADD R3, R4, R16; /* 0x480000004040dc03 */
/*02f8*/ ISETP.LT.U32.AND P0, PT, R10, c[0x0][0x17c], PT; /* 0x188e4005f0a1dc03 */
/* 0x228042804002e047 */
/*0308*/ IMNMX.U32 R8, R8, R3, PT; /* 0x080e00000c821c03 */
/*0310*/ @P0 BRA 0x100; /* 0x4003fff7a00001e7 */
/*0318*/ MOV32I.S R2, 0x20; /* 0x1800000080009df2 */
/*0320*/ IMAD.U32.U32.HI R4, R2, 0x2, R2; /* 0x2004c00008211c43 */
/*0328*/ MOV R3, R2; /* 0x280000000800dde4 */
/*0330*/ SHR R2, R4, 0x1; /* 0x5800c00004409c23 */
/*0338*/ ISETP.GT.AND P0, PT, R3, 0x3, PT; /* 0x1a0ec0000c31dc23 */
/* 0x228002c282e04377 */
/*0348*/ SHFL.DOWN PT, R4, R8, R2, 0x1f; /* 0x8d007c0008811f45 */
/*0350*/ IMNMX.U32 R8, R8, R4, PT; /* 0x080e000010821c03 */
/*0358*/ @P0 BRA 0x320; /* 0x4003ffff000001e7 */
/*0360*/ LOP.AND R0, R0, 0x1f; /* 0x6800c0007c001c03 */
/*0368*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0370*/ @P0 BRA.U 0x3a0; /* 0x40000000a00081e7 */
/*0378*/ @!P0 MOV32I R0, 0x4; /* 0x18000000100021e2 */
/* 0x228042c2e00282c7 */
/*0388*/ @!P0 IMAD.U32.U32 R2.CC, R9, R0, c[0x0][0x140]; /* 0x200180050090a003 */
/*0390*/ @!P0 IMAD.U32.U32.HI.X R3, R9, R0, c[0x0][0x144]; /* 0x208080051090e043 */
/*0398*/ @!P0 ST.E [R2], R8; /* 0x9400000000222085 */
/*03a0*/ EXIT; /* 0x8000000000001de7 */
/*03a8*/ IADD R2.CC, -R6, RZ; /* 0x48010000fc609e03 */
/*03b0*/ IADD.X R3, -R7, RZ; /* 0x48000000fc70de43 */
/*03b8*/ ICMP.LT R12, R2, R6, R7; /* 0x308e000018231c23 */
/* 0x2342828280434287 */
/*03c8*/ ICMP.LT R13, R3, R7, R7; /* 0x308e00001c335c23 */
/*03d0*/ I2F.F32.U64.RP R2, R12; /* 0x1804000031a09c04 */
/*03d8*/ MUFU.RCP R2, R2; /* 0xc800000010209c00 */
/*03e0*/ ISETP.LT.AND P0, PT, R5, RZ, PT; /* 0x188e0000fc51dc23 */
/*03e8*/ IADD32I R2, R2, 0x1ffffffe; /* 0x087ffffff8209c02 */
/*03f0*/ F2F.FTZ.F32.F32 R2, R2; /* 0x1080000009209c04 */
/*03f8*/ F2I.U64.F32.TRUNC R2, R2; /* 0x1406000009309c04 */
/* 0x22c2824232423287 */
/*0408*/ IMUL.U32.U32.HI R20, R2, R12; /* 0x5000000030251c43 */
/*0410*/ IMAD.U32.U32 R18, R2, R13, R20; /* 0x2028000034249c03 */
/*0418*/ IMAD.U32.U32 R21.CC, -R2, R12, RZ; /* 0x207f000030255e03 */
/*0420*/ IMAD.U32.U32 R18, R3, R12, R18; /* 0x2024000030349c03 */
/*0428*/ IMUL.U32.U32.HI R20, R2, R21; /* 0x5000000054251c43 */
/*0430*/ IADD.X R23, -R18, RZ; /* 0x48000000fd25de43 */
/*0438*/ IMAD.U32.U32 R18.CC, R2, R23, R20; /* 0x202900005c249c03 */
/* 0x22328042c2c232c7 */
/*0448*/ IMAD.U32.U32.HI.X R2.CC, R2, R23, R2; /* 0x208500005c209c43 */
/*0450*/ IMAD.U32.U32.HI.X R20, R3, R23, R3; /* 0x208600005c351c43 */
/*0458*/ IMAD.U32.U32 RZ.CC, R3, R21, R18; /* 0x20250000543fdc03 */
/*0460*/ IMAD.U32.U32.HI.X R2.CC, R3, R21, R2; /* 0x2085000054309c43 */
/*0468*/ IADD.X R18, R20, RZ; /* 0x48000000fd449c43 */
/*0470*/ IMAD.U32.U32 R2.CC, R3, R23, R2; /* 0x200500005c309c03 */
/*0478*/ IMUL.U32.U32.HI R20, R2, R12; /* 0x5000000030251c43 */
/* 0x22c2824232423247 */
/*0488*/ IADD.X R3, R18, RZ; /* 0x48000000fd20dc43 */
/*0490*/ IMAD.U32.U32 R18, R2, R13, R20; /* 0x2028000034249c03 */
/*0498*/ IMAD.U32.U32 R23.CC, -R2, R12, RZ; /* 0x207f00003025de03 */
/*04a0*/ IMAD.U32.U32 R18, R3, R12, R18; /* 0x2024000030349c03 */
/*04a8*/ IMUL.U32.U32.HI R20, R2, R23; /* 0x500000005c251c43 */
/*04b0*/ IADD.X R29, -R18, RZ; /* 0x48000000fd275e43 */
/*04b8*/ IMAD.U32.U32 R18.CC, R2, R29, R20; /* 0x2029000074249c03 */
/* 0x22c2c202b20042c7 */
/*04c8*/ IMAD.U32.U32.HI.X R20.CC, R2, R29, R2; /* 0x2085000074251c43 */
/*04d0*/ IMAD.U32.U32.HI.X R21, R3, R29, R3; /* 0x2086000074355c43 */
/*04d8*/ @P0 IADD R4.CC, -R4, RZ; /* 0x48010000fc410203 */
/*04e0*/ MOV R2, R5; /* 0x2800000014009de4 */
/*04e8*/ @P0 IADD.X R2, -R5, RZ; /* 0x48000000fc508243 */
/*04f0*/ IMAD.U32.U32 RZ.CC, R3, R23, R18; /* 0x202500005c3fdc03 */
/*04f8*/ IMAD.U32.U32.HI.X R18.CC, R3, R23, R20; /* 0x20a900005c349c43 */
/* 0x22c232c242328207 */
/*0508*/ IADD.X R20, R21, RZ; /* 0x48000000fd551c43 */
/*0510*/ IMAD.U32.U32 R3.CC, R3, R29, R18; /* 0x202500007430dc03 */
/*0518*/ IMUL.U32.U32.HI R22, R3, R4; /* 0x5000000010359c43 */
/*0520*/ IADD.X R21, R20, RZ; /* 0x48000000fd455c43 */
/*0528*/ IMAD.U32.U32 R18.CC, R3, R2, R22; /* 0x202d000008349c03 */
/*0530*/ IMAD.U32.U32.HI.X R3, R3, R2, RZ; /* 0x20fe00000830dc43 */
/*0538*/ IMAD.U32.U32 RZ.CC, R21, R4, R18; /* 0x20250000115fdc03 */
/* 0x22824042328232c7 */
/*0548*/ IMAD.U32.U32.HI.X R3.CC, R21, R4, R3; /* 0x208700001150dc43 */
/*0550*/ IMAD.U32.U32.HI.X R18, R21, R2, RZ; /* 0x20fe000009549c43 */
/*0558*/ IMAD.U32.U32 R24.CC, R21, R2, R3; /* 0x2007000009561c03 */
/*0560*/ IMUL.U32.U32.HI R3, R24, R12; /* 0x500000003180dc43 */
/*0568*/ IADD.X R21, R18, RZ; /* 0x48000000fd255c43 */
/*0570*/ IMAD.U32.U32 R18.CC, -R24, R12, R4; /* 0x2009000031849e03 */
/*0578*/ IMAD.U32.U32 R3, R24, R13, R3; /* 0x200600003580dc03 */
/* 0x22c202c202c20287 */
/*0588*/ IMAD.U32.U32 R3, R21, R12, R3; /* 0x200600003150dc03 */
/*0590*/ ISUB.X R2, R2, R3; /* 0x480000000c209d43 */
/*0598*/ ISUB RZ.CC, R18, R12; /* 0x48010000312fdd03 */
/*05a0*/ ISETP.GE.U32.X.AND P0, PT, R2, R13, PT; /* 0x1b0e00003421dc43 */
/*05a8*/ ISUB RZ.CC, R6, RZ; /* 0x48010000fc6fdd03 */
/*05b0*/ ISETP.EQ.X.AND P2, PT, R7, RZ, PT; /* 0x190e0000fc75dc63 */
/*05b8*/ @P0 ISUB R18.CC, R18, R12; /* 0x4801000031248103 */
/* 0x220042c20042c207 */
/*05c8*/ @P0 ISUB.X R2, R2, R13; /* 0x4800000034208143 */
/*05d0*/ ISUB RZ.CC, R18, R12; /* 0x48010000312fdd03 */
/*05d8*/ ISETP.GE.U32.X.AND P1, PT, R2, R13, PT; /* 0x1b0e00003423dc43 */
/*05e0*/ LOP.XOR R2, R7, R5; /* 0x6800000014709c83 */
/*05e8*/ @P0 IADD R24.CC, R24, 0x1; /* 0x4801c00005860003 */
/*05f0*/ @P0 IADD.X R21, R21, RZ; /* 0x48000000fd554043 */
/*05f8*/ IADD R3.CC, R24, 0x1; /* 0x4801c0000580dc03 */
/* 0x22828042c28042b7 */
/*0608*/ ISETP.LT.AND P0, PT, R2, RZ, PT; /* 0x188e0000fc21dc23 */
/*0610*/ SEL R2, R3, R24, P1; /* 0x2002000060309c04 */
/*0618*/ @P1 IADD.X R21, R21, RZ; /* 0x48000000fd554443 */
/*0620*/ @P0 IADD R2.CC, -R2, RZ; /* 0x48010000fc208203 */
/*0628*/ @P0 IADD.X R21, -R21, RZ; /* 0x48000000fd554243 */
/*0630*/ SEL R4, R2, -0x1, !P2; /* 0x2014fffffc211c04 */
/*0638*/ SEL R5, R21, -0x1, !P2; /* 0x2014fffffd515c04 */
/* 0x20000000000002e7 */
/*0648*/ RET; /* 0x9000000000001de7 */
/*0650*/ BRA 0x650; /* 0x4003ffffe0001de7 */
/*0658*/ NOP; /* 0x4000000000001de4 */
/*0660*/ NOP; /* 0x4000000000001de4 */
/*0668*/ NOP; /* 0x4000000000001de4 */
/*0670*/ NOP; /* 0x4000000000001de4 */
/*0678*/ NOP; /* 0x4000000000001de4 */
However, the profiler says that the ALU utilization is high, right?
How do I check the efficiency of the memory access?
Yes, I bought a GT 740 to learn CUDA programming.
I’m sorry for my naive questions, thanks for your help