If I change this:
zisisnotzis:
#pragma unroll
for (int i = I; i > 1; i--) {
sort2(a[x - i], a[x]);
sort2(a[x - 1], a[x + i - 1]);
#pragma unroll
for (int j = 1; j < i - 1; j++) {
sort2(a[x - 1 - j], a[x - 1]);
sort2(a[x], a[x + j]);
}
}
to this:
#pragma unroll
for (int i = N; i > 1; i--) {
if (i <= I){
sort2(a[x - i], a[x]);
sort2(a[x - 1], a[x + i - 1]);
#pragma unroll
for (int j = 1; j < i - 1; j++) {
sort2(a[x - 1 - j], a[x - 1]);
sort2(a[x], a[x + j]);
}
}
}
which I do not believe should affect the program logic, then I get fairly rapid compilation, and the resulting code has no LDL/STL instructions in it.
Here is my test case on CUDA 12.2:
# cat t156.cu
#include <cuda_fp16.h>
template <class T> __device__ __forceinline__ void cp(T* __restrict__ O, const T* __restrict__ A, int I) {
#pragma unroll
for (int i = 0; i < I; i += 16 / sizeof(T))
* (int4*)(O + i) = *(int4*)(A + i);
}
__device__ __forceinline__ void sort2(half& __restrict__ a, half& __restrict__ b) {
half t = __hmin(a, b);
b = __hmax(a, b);
a = t;
};
template <int N = 32> __global__ void sort2_ker(half* __restrict__ A) {
half a[N];
cp(a, A, N);
#pragma unroll
for (int I = 1; I < N; I *= 2) {
#pragma unroll
for (int x = I; x < N; x += I * 2) {
#pragma unroll
for (int i = N; i > 1; i--) {
if (i <= I){
sort2(a[x - i], a[x]);
sort2(a[x - 1], a[x + i - 1]);
#pragma unroll
for (int j = 1; j < i - 1; j++) {
sort2(a[x - 1 - j], a[x - 1]);
sort2(a[x], a[x + j]);
}
}
}
sort2(a[x - 1], a[x]);
}
}
cp(A, a, N);
}
int main(){
half *A=NULL;
sort2_ker<32><<<1,32>>>(A);
cudaDeviceSynchronize();
}
# nvcc -o t156 t156.cu -arch=sm_89
# cuobjdump -sass ./t156 |grep LD
/*0020*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/*0040*/ LDG.E.128 R8, [R2.64] ; /* 0x0000000402087981 */
/*0050*/ LDG.E.128 R16, [R2.64+0x10] ; /* 0x0000100402107981 */
/*0060*/ LDG.E.128 R20, [R2.64+0x20] ; /* 0x0000200402147981 */
/*0070*/ LDG.E.128 R4, [R2.64+0x30] ; /* 0x0000300402047981 */
#
The SASS code is too long to post, but you can inspect it in godbolt , I suppose. Here is a typical sequence that I see:
/*34b0*/ HMNMX2 R4, R18.reuse, R19.reuse, PT ; /* 0x0000001312047240 */
/* 0x0c0fe40003800000 */
/*34c0*/ HMNMX2 R19, R18, R19, !PT ; /* 0x0000001312137240 */
/* 0x000fc60007800000 */
/*34d0*/ PRMT R17, R30, 0x5410, R4 ; /* 0x000054101e117816 */
/* 0x000fca0000000004 */
/*34e0*/ HMNMX2 R21, R17, R20, !PT ; /* 0x0000001411157240 */
/* 0x000fe40007800000 */
/*34f0*/ PRMT R17, R4, 0x7610, R19 ; /* 0x0000761004117816 */
/* 0x000fe40000000013 */
/*3500*/ PRMT R4, R36, 0x7610, R4 ; /* 0x0000761024047816 */
/* 0x000fe40000000004 */
/*3510*/ PRMT R16, R16, 0x7610, R21 ; /* 0x0000761010107816 */
/* 0x000fca0000000015 */
/*3520*/ HMNMX2 R16, R17, R16, PT ; /* 0x0000001011107240 */
/* 0x000fca0003800000 */
/*3530*/ PRMT R15, R15, 0x5432, R16 ; /* 0x000054320f0f7816 */
/* 0x000fca0000000010 */
/*3540*/ HMNMX2 R20, R4, R15, !PT ; /* 0x0000000f04147240 */
/* 0x000fe40007800000 */
/*3550*/ PRMT R15, R16, 0x7632, R14 ; /* 0x00007632100f7816 */
/* 0x000fc6000000000e */
/*3560*/ PRMT R14, R7, 0x7610, R20 ; /* 0x00007610070e7816 */
/* 0x000fca0000000014 */
/*3570*/ HMNMX2 R7, R14.reuse, R15.reuse, PT ; /* 0x0000000f0e077240 */
/* 0x0c0fe40003800000 */
/*3580*/ HMNMX2 R14, R14, R15, !PT ; /* 0x0000000f0e0e7240 */
/* 0x000fe40007800000 */
/*3590*/ PRMT R15, R6, 0x7632, R5 ; /* 0x00007632060f7816 */
/* 0x000fe40000000005 */
/*35a0*/ PRMT R13, R13, 0x7610, R7 ; /* 0x000076100d0d7816 */
/* 0x000fe40000000007 */
/*35b0*/ PRMT R18, R14, 0x7610, R11 ; /* 0x000076100e127816 */
/* 0x000fe4000000000b */
/*35c0*/ PRMT R11, R11, 0x5410, R12 ; /* 0x000054100b0b7816 */
/* 0x000fc6000000000c */
/*35d0*/ HMNMX2 R34, R13.reuse, R18.reuse, PT ; /* 0x000000120d227240 */
/* 0x0c0fe40003800000 */
/*35e0*/ HMNMX2 R35, R13, R18, !PT ; /* 0x000000120d237240 */
/* 0x000fe40007800000 */
/*35f0*/ HMNMX2 R5, R34.H1_H1, R12.H1_H1, PT ; /* 0x3000000c22057240 */
/* 0x000fe40003800c00 */
/*3600*/ HMNMX2 R15, R15, R11, PT ; /* 0x0000000b0f0f7240 */
/* 0x000fe40003800000 */
/*3610*/ PRMT R11, R0, 0x5410, R16 ; /* 0x00005410000b7816 */
/* 0x000fe40000000010 */
/*3620*/ PRMT R13, R35, 0x5410, R0 ; /* 0x00005410230d7816 */
/* 0x000fc40000000000 */
/*3630*/ PRMT R4, R5, 0x7610, R4 ; /* 0x0000761005047816 */
/* 0x000fe40000000004 */
/*3640*/ PRMT R0, R15, 0x5410, R5 ; /* 0x000054100f007816 */
/* 0x000fc60000000005 */
/*3650*/ HMNMX2 R11, R4, R11, PT ; /* 0x0000000b040b7240 */
/* 0x000fe40003800000 */
/*3660*/ HMNMX2 R0, R0, R13, !PT ; /* 0x0000000d00007240 */
/* 0x000fc60007800000 */
/*3670*/ PRMT R28, R15, 0x5432, R11 ; /* 0x000054320f1c7816 */
/* 0x000fe4000000000b */
/*3680*/ PRMT R5, R0, 0x5410, R3 ; /* 0x0000541000057816 */
/* 0x000fca0000000003 */
/*3690*/ HMNMX2 R3, R28.reuse, R5.reuse, !PT ; /* 0x000000051c037240 */
/* 0x0c0fe40007800000 */
/*36a0*/ HMNMX2 R28, R28, R5, PT ; /* 0x000000051c1c7240 */
/* 0x000fe40003800000 */
/*36b0*/ HMNMX2 R13, R31.H1_H1, R3.H0_H0, !PT ; /* 0x200000031f0d7240 */
/* 0x000fe40007800c00 */
/*36c0*/ HMNMX2 R5, R28.H1_H1, R20.reuse.H0_H0, PT ; /* 0x200000141c057240 */
/* 0x080fe40003800c00 */
/*36d0*/ HMNMX2 R4, R36.H1_H1, R13.H0_H0, !PT ; /* 0x2000000d24047240 */
/* 0x000fe40007800c00 */
/*36e0*/ HMNMX2 R20, R28.H1_H1, R20.H0_H0, !PT ; /* 0x200000141c147240 */
/* 0x000fc40007800c00 */
/*36f0*/ HMNMX2 R6, R30.H1_H1, R4.H0_H0, !PT ; /* 0x200000041e067240 */
/* 0x000fe40007800c00 */
/*3700*/ PRMT R2, R5, 0x7610, R2 ; /* 0x0000761005027816 */
/* 0x000fe40000000002 */
/*3710*/ PRMT R30, R36, 0x7632, R30 ; /* 0x00007632241e7816 */
/* 0x000fe4000000001e */
/*3720*/ PRMT R17, R21, 0x5410, R6 ; /* 0x0000541015117816 */
/* 0x000fe40000000006 */
/*3730*/ PRMT R13, R13, 0x5410, R4 ; /* 0x000054100d0d7816 */
/* 0x000fe40000000004 */
/*3740*/ HMNMX2 R21, R19.H1_H1, R21.H1_H1, !PT ; /* 0x3000001513157240 */
/* 0x000fc40007800c00 */
/*3750*/ HMNMX2 R18, R2.reuse, R17.reuse, PT ; /* 0x0000001102127240 */
/* 0x0c0fe40003800000 */
/*3760*/ HMNMX2 R17, R2, R17, !PT ; /* 0x0000001102117240 */
/* 0x000fe40007800000 */
/*3770*/ HMNMX2 R22, R18.reuse.H0_H0, R19.reuse.H0_H0, !PT ; /* 0x2000001312167240 */
/* 0x0c0fe40007800800 */
/*3780*/ HMNMX2 R37, R18.reuse.H0_H0, R19.H0_H0, PT ; /* 0x2000001312257240 */
/* 0x040fe40003800800 */
/*3790*/ HMNMX2 R5, R17.H1_H1, R22.H0_H0, PT ; /* 0x2000001611057240 */
/* 0x000fe40003800c00 */
/*37a0*/ HMNMX2 R6, R18.H1_H1, R37.H0_H0, !PT ; /* 0x2000002512067240 */
/* 0x000fc40007800c00 */
/*37b0*/ HMNMX2 R32, R7.H0_H0, R5.H0_H0, !PT ; /* 0x2000000507207240 */
/* 0x000fe40007800800 */
/*37c0*/ HMNMX2 R2, R6.H0_H0, R14.H1_H1, PT ; /* 0x3000000e06027240 */
/* 0x000fe40003800800 */
/*37d0*/ PRMT R5, R5, 0x7610, R0 ; /* 0x0000761005057816 */
/* 0x000fe40000000000 */
/*37e0*/ PRMT R32, R32, 0x7610, R12 ; /* 0x0000761020207816 */
/* 0x000fe4000000000c */
/*37f0*/ PRMT R12, R15, 0x5410, R2 ; /* 0x000054100f0c7816 */
/* 0x000fe40000000002 */
/*3800*/ HMNMX2 R30, R30, R13, PT ; /* 0x0000000d1e1e7240 */
/* 0x000fc40003800000 */
/*3810*/ HMNMX2 R15, R34, R32, !PT ; /* 0x00000020220f7240 */
/* 0x000fe40007800000 */
/*3820*/ HMNMX2 R33, R12, R35.reuse, PT ; /* 0x000000230c217240 */
/* 0x080fe40003800000 */
/*3830*/ HMNMX2 R2, R2.H0_H0, R35.H1_H1, !PT ; /* 0x3000002302027240 */
/* 0x000fe40007800800 */
/*3840*/ HMNMX2 R12, R33.H1_H1, R15.H1_H1, PT ; /* 0x3000000f210c7240 */
/* 0x000fe40003800c00 */
/*3850*/ PRMT R34, R34, 0x5410, R33 ; /* 0x0000541022227816 */
/* 0x000fe40000000021 */
/*3860*/ PRMT R32, R32, 0x5410, R15 ; /* 0x0000541020207816 */
/* 0x000fc4000000000f */
/*3870*/ PRMT R7, R7, 0x5410, R12 ; /* 0x0000541007077816 */
/* 0x000fe4000000000c */
/*3880*/ HMNMX2 R12, R12.H0_H0, R0.H1_H1, !PT ; /* 0x300000000c0c7240 */
/* 0x000fe40007800800 */
/*3890*/ HMNMX2 R37, R18.H1_H1, R37.H0_H0, PT ; /* 0x2000002512257240 */
/* 0x000fe40003800c00 */
/*38a0*/ HMNMX2 R16, R7, R5, PT ; /* 0x0000000507107240 */
/* 0x000fe40003800000 */
/*38b0*/ HMNMX2 R7, R33, R15, !PT ; /* 0x0000000f21077240 */
/* 0x000fe40007800000 */
/*38c0*/ HMNMX2 R15, R34, R32, PT ; /* 0x00000020220f7240 */
/* 0x000fc40003800000 */
/*38d0*/ PRMT R40, R31, 0x7632, R16 ; /* 0x000076321f287816 */
/* 0x000fe40000000010 */
/*38e0*/ HMNMX2 R5, R28.reuse.H0_H0, R7.reuse.H0_H0, !PT ; /* 0x200000071c057240 */
/* 0x0c0fe40007800800 */
/*38f0*/ HMNMX2 R28, R28.H0_H0, R7.H0_H0, PT ; /* 0x200000071c1c7240 */
/* 0x000fe40003800800 */
/*3900*/ HMNMX2 R31, R40, R3, PT ; /* 0x00000003281f7240 */
/* 0x000fe40003800000 */
/*3910*/ PRMT R14, R14, 0x5432, R5 ; /* 0x000054320e0e7816 */
/* 0x000fe40000000005 */
/*3920*/ HMNMX2 R5, R31.H0_H0, R5.H0_H0, PT ; /* 0x200000051f057240 */
/* 0x000fc40003800800 */
/*3930*/ PRMT R6, R6, 0x5410, R31 ; /* 0x0000541006067816 */
/* 0x000fe4000000001f */
/*3940*/ HMNMX2 R3, R16.H1_H1, R3.H1_H1, !PT ; /* 0x3000000310037240 */
/* 0x000fe40007800c00 */
/*3950*/ HMNMX2 R22, R17.H1_H1, R22.H0_H0, !PT ; /* 0x2000001611167240 */
/* 0x000fe40007800c00 */
/*3960*/ HMNMX2 R45, R6, R14, !PT ; /* 0x0000000e062d7240 */
/* 0x000fe40007800000 */
/*3970*/ HMNMX2 R6, R31.H1_H1, R20.H0_H0, PT ; /* 0x200000141f067240 */
/* 0x000fe40003800c00 */
/*3980*/ HMNMX2 R4, R30.H0_H0, R45.H1_H1, !PT ; /* 0x3000002d1e047240 */
/* 0x000fc40007800800 */
/*3990*/ HMNMX2 R14, R6.H0_H0, R17.reuse.H0_H0, PT ; /* 0x20000011060e7240 */
/* 0x080fe40003800800 */
/*39a0*/ HMNMX2 R13, R30.H1_H1, R4.reuse.H0_H0, PT ; /* 0x200000041e0d7240 */
/* 0x080fe40003800c00 */
/*39b0*/ HMNMX2 R6, R6.H0_H0, R17.H0_H0, !PT ; /* 0x2000001106067240 */
/* 0x000fe40007800800 */
/*39c0*/ PRMT R20, R20, 0x5410, R14 ; /* 0x0000541014147816 */
/* 0x000fe4000000000e */
/*39d0*/ PRMT R33, R31, 0x5432, R13 ; /* 0x000054321f217816 */
/* 0x000fe4000000000d */
/*39e0*/ HMNMX2 R4, R30.H1_H1, R4.H0_H0, !PT ; /* 0x200000041e047240 */
/* 0x000fc40007800c00 */
/*39f0*/ HMNMX2 R30, R30.H0_H0, R45.H1_H1, PT ; /* 0x3000002d1e1e7240 */
/* 0x000fe40003800800 */
/*3a00*/ HMNMX2 R35, R4.H0_H0, R6.H0_H0, PT ; /* 0x2000000604237240 */
/* 0x000fe40003800800 */
/*3a10*/ HMNMX2 R20, R33, R20, !PT ; /* 0x0000001421147240 */
/* 0x000fe40007800000 */
/*3a20*/ HMNMX2 R33, R16.H0_H0, R35.H0_H0, !PT ; /* 0x2000002310217240 */
/* 0x000fe40007800800 */
/*3a30*/ HMNMX2 R32, R20.reuse.H1_H1, R45.reuse.H0_H0, PT ; /* 0x2000002d14207240 */
/* 0x0c0fe40003800c00 */
/*3a40*/ HMNMX2 R45, R20.H1_H1, R45.H0_H0, !PT ; /* 0x2000002d142d7240 */
/* 0x000fc40007800c00 */
/*3a50*/ PRMT R33, R33, 0x5410, R2 ; /* 0x0000541021217816 */
/* 0x000fe40000000002 */
/*3a60*/ PRMT R32, R15, 0x5410, R32 ; /* 0x000054100f207816 */
/* 0x000fe40000000020 */
/*3a70*/ HMNMX2 R35, R16.H0_H0, R35.H0_H0, PT ; /* 0x2000002310237240 */
/* 0x000fe40003800800 */
/*3a80*/ PRMT R11, R11, 0x5432, R37 ; /* 0x000054320b0b7816 */
/* 0x000fe40000000025 */
/*3a90*/ HMNMX2 R43, R32.reuse, R33.reuse, PT ; /* 0x00000021202b7240 */
/* 0x0c0fe40003800000 */
/*3aa0*/ HMNMX2 R36, R32, R33, !PT ; /* 0x0000002120247240 */
/* 0x000fc40007800000 */
/*3ab0*/ HMNMX2 R0, R43.H1_H1, R7.H1_H1, PT ; /* 0x300000072b007240 */
/* 0x000fe40003800c00 */
/*3ac0*/ HMNMX2 R2, R15.H1_H1, R36.H0_H0, !PT ; /* 0x200000240f027240 */
/* 0x000fe40007800c00 */
/*3ad0*/ HMNMX2 R13, R13.H0_H0, R14.H0_H0, PT ; /* 0x2000000e0d0d7240 */
/* 0x000fe40003800800 */
/*3ae0*/ PRMT R28, R28, 0x5410, R0 ; /* 0x000054101c1c7816 */
/* 0x000fe40000000000 */
/*3af0*/ PRMT R2, R2, 0x5410, R12 ; /* 0x0000541002027816 */
/* 0x000fca000000000c */
/*3b00*/ HMNMX2 R31, R28.reuse, R2.reuse, !PT ; /* 0x000000021c1f7240 */
/* 0x0c0fe40007800000 */
/*3b10*/ HMNMX2 R32, R28, R2, PT ; /* 0x000000021c207240 */
/* 0x000fe40003800000 */
/*3b20*/ HMNMX2 R28, R5.H0_H0, R31.H0_H0, !PT ; /* 0x2000001f051c7240 */
/* 0x000fe40007800800 */
/*3b30*/ HMNMX2 R2, R32.H1_H1, R3.H0_H0, PT ; /* 0x2000000320027240 */
/* 0x000fe40003800c00 */
/*3b40*/ HMNMX2 R12, R30.H0_H0, R28.H0_H0, PT ; /* 0x2000001c1e0c7240 */
/* 0x000fe40003800800 */
/*3b50*/ HMNMX2 R0, R2.H0_H0, R20.H0_H0, PT ; /* 0x2000001402007240 */
/* 0x000fc40003800800 */
/*3b60*/ HMNMX2 R2, R2.H0_H0, R20.H0_H0, !PT ; /* 0x2000001402027240 */
/* 0x000fe40007800800 */
/*3b70*/ PRMT R30, R30, 0x5410, R12 ; /* 0x000054101e1e7816 */
/* 0x000fe4000000000c */
/*3b80*/ PRMT R28, R28, 0x5410, R0 ; /* 0x000054101c1c7816 */
/* 0x000fe40000000000 */
/*3b90*/ HMNMX2 R20, R43.H1_H1, R7.H1_H1, !PT ; /* 0x300000072b147240 */
/* 0x000fe40007800c00 */
/*3ba0*/ HMNMX2 R0, R12.H0_H0, R0.H0_H0, PT ; /* 0x200000000c007240 */
/* 0x000fe40003800800 */
/*3bb0*/ HMNMX2 R44, R30, R28, !PT ; /* 0x0000001c1e2c7240 */
/* 0x000fc80007800000 */
/*3bc0*/ HMNMX2 R33, R44.reuse.H1_H1, R45.H0_H0, PT ; /* 0x2000002d2c217240 */
/* 0x040fe40003800c00 */
/*3bd0*/ HMNMX2 R30, R44.reuse.H0_H0, R2.H0_H0, PT ; /* 0x200000022c1e7240 */
/* 0x040fe40003800800 */
/*3be0*/ PRMT R2, R2, 0x5410, R45 ; /* 0x0000541002027816 */
/* 0x000fe4000000002d */
/*3bf0*/ PRMT R15, R15, 0x5432, R33 ; /* 0x000054320f0f7816 */
/* 0x000fe40000000021 */
/*3c00*/ HMNMX2 R34, R35.H0_H0, R30.H0_H0, !PT ; /* 0x2000001e23227240 */
/* 0x000fe40007800800 */
/*3c10*/ HMNMX2 R2, R44, R2, !PT ; /* 0x000000022c027240 */
/* 0x000fc40007800000 */
/*3c20*/ HMNMX2 R28, R15, R36, PT ; /* 0x000000240f1c7240 */
/* 0x000fe40003800000 */
/*3c30*/ HMNMX2 R40, R43.H0_H0, R34.H0_H0, !PT ; /* 0x200000222b287240 */
/* 0x000fe40007800800 */
/*3c40*/ HMNMX2 R15, R28.reuse.H1_H1, R20.H0_H0, PT ; /* 0x200000141c0f7240 */
/* 0x040fe40003800c00 */
/*3c50*/ HMNMX2 R16, R28.H0_H0, R40.H0_H0, !PT ; /* 0x200000281c107240 */
/* 0x000fe40007800800 */
/*3c60*/ HMNMX2 R33, R33.H0_H0, R36.H1_H1, !PT ; /* 0x3000002421217240 */
/* 0x000fe40007800800 */
/*3c70*/ PRMT R46, R5, 0x5410, R15 ; /* 0x00005410052e7816 */
/* 0x000fc4000000000f */
/*3c80*/ PRMT R16, R16, 0x5410, R3 ; /* 0x0000541010107816 */
/* 0x000fe40000000003 */
/*3c90*/ HMNMX2 R43, R43.H0_H0, R34.H0_H0, PT ; /* 0x200000222b2b7240 */
/* 0x000fe40003800800 */
/*3ca0*/ HMNMX2 R46, R46, R31, PT ; /* 0x0000001f2e2e7240 */
/* 0x000fe40003800000 */
/*3cb0*/ HMNMX2 R5, R32, R16, !PT ; /* 0x0000001020057240 */
/* 0x000fe40007800000 */
/*3cc0*/ HMNMX2 R40, R28.H0_H0, R40.H0_H0, PT ; /* 0x200000281c287240 */
/* 0x000fe40003800800 */
/*3cd0*/ HMNMX2 R7, R46.H0_H0, R5.H0_H0, !PT ; /* 0x200000052e077240 */
/* 0x000fc40007800800 */
/*3ce0*/ HMNMX2 R3, R46.reuse.H1_H1, R5.reuse.H1_H1, !PT ; /* 0x300000052e037240 */
/* 0x0c0fe40007800c00 */
/*3cf0*/ HMNMX2 R5, R46, R5, PT ; /* 0x000000052e057240 */
/* 0x000fe40003800000 */
/*3d00*/ PRMT R44, R35, 0x5410, R7 ; /* 0x00005410232c7816 */
/* 0x000fe40000000007 */
/*3d10*/ PRMT R30, R30, 0x5410, R3 ; /* 0x000054101e1e7816 */
/* 0x000fe40000000003 */
/*3d20*/ HMNMX2 R35, R5.H0_H0, R5.H1_H1, !PT ; /* 0x3000000505237240 */
/* 0x000fe40007800800 */
/*3d30*/ HMNMX2 R20, R28.H1_H1, R20.H0_H0, !PT ; /* 0x200000141c147240 */
/* 0x000fc40007800c00 */
/*3d40*/ HMNMX2 R30, R44, R30, PT ; /* 0x0000001e2c1e7240 */
/* 0x000fe40003800000 */
/*3d50*/ HMNMX2 R34, R35.H0_H0, R2.H1_H1, PT ; /* 0x3000000223227240 */
/* 0x000fe40003800800 */
/*3d60*/ HMNMX2 R36, R30.H0_H0, R30.H1_H1, !PT ; /* 0x3000001e1e247240 */
/* 0x000fe40007800800 */
/*3d70*/ HMNMX2 R16, R32.H0_H0, R16.H0_H0, PT ; /* 0x2000001020107240 */
/* 0x000fe40003800800 */
/*3d80*/ PRMT R43, R43, 0x5410, R34 ; /* 0x000054102b2b7816 */
/* 0x000fe40000000022 */
/*3d90*/ PRMT R36, R36, 0x5410, R33 ; /* 0x0000541024247816 */
/* 0x000fc40000000021 */
/*3da0*/ HMNMX2 R32, R15.H0_H0, R31.H1_H1, !PT ; /* 0x3000001f0f207240 */
/* 0x000fe40007800800 */
/*3db0*/ HMNMX2 R35, R35.H0_H0, R2.H1_H1, !PT ; /* 0x3000000223237240 */
/* 0x000fe40007800800 */
/*3dc0*/ HMNMX2 R34, R43.reuse, R36.reuse, PT ; /* 0x000000242b227240 */
/* 0x0c0fe40003800000 */
/*3dd0*/ HMNMX2 R28, R43, R36, !PT ; /* 0x000000242b1c7240 */
/* 0x000fe40007800000 */
/*3de0*/ HMNMX2 R15, R34.H1_H1, R20.H0_H0, PT ; /* 0x20000014220f7240 */
/* 0x000fe40003800c00 */
/*3df0*/ HMNMX2 R31, R40.H0_H0, R28.H0_H0, !PT ; /* 0x2000001c281f7240 */
/* 0x000fc40007800800 */
/*3e00*/ HMNMX2 R30, R30.H0_H0, R30.H1_H1, PT ; /* 0x3000001e1e1e7240 */
/* 0x000fe40003800800 */
/*3e10*/ PRMT R16, R16, 0x5410, R15 ; /* 0x0000541010107816 */
/* 0x000fe4000000000f */
/*3e20*/ PRMT R31, R31, 0x5410, R32 ; /* 0x000054101f1f7816 */
/* 0x000fe40000000020 */
/*3e30*/ HMNMX2 R3, R7.H0_H0, R3.H0_H0, !PT ; /* 0x2000000307037240 */
/* 0x000fe40007800800 */
/*3e40*/ HMNMX2 R5, R5.H0_H0, R5.H1_H1, PT ; /* 0x3000000505057240 */
/* 0x000fe40003800800 */
/*3e50*/ HMNMX2 R15, R16, R31, PT ; /* 0x0000001f100f7240 */
/* 0x000fc40003800000 */
/*3e60*/ HMNMX2 R16, R16, R31, !PT ; /* 0x0000001f10107240 */
/* 0x000fe40007800000 */
/*3e70*/ HMNMX2 R36, R15.reuse.H0_H0, R15.H1_H1, !PT ; /* 0x3000000f0f247240 */
/* 0x040fe40007800800 */
/*3e80*/ HMNMX2 R33, R16.H0_H0, R16.H1_H1, PT ; /* 0x3000001010217240 */
/* 0x000fe40003800800 */
/*3e90*/ HMNMX2 R32, R36.H0_H0, R35.H0_H0, PT ; /* 0x2000002324207240 */
/* 0x000fe40003800800 */
/*3ea0*/ HMNMX2 R31, R30.H0_H0, R33.H0_H0, !PT ; /* 0x200000211e1f7240 */
/* 0x000fe40007800800 */
/*3eb0*/ HMNMX2 R15, R15.H0_H0, R15.H1_H1, PT ; /* 0x3000000f0f0f7240 */
/* 0x000fc40003800800 */
/*3ec0*/ PRMT R40, R40, 0x5410, R32 ; /* 0x0000541028287816 */
/* 0x000fe40000000020 */
/*3ed0*/ PRMT R31, R31, 0x5410, R20 ; /* 0x000054101f1f7816 */
/* 0x000fe40000000014 */
/*3ee0*/ HMNMX2 R32, R32.H0_H0, R28.reuse.H1_H1, !PT ; /* 0x3000001c20207240 */
/* 0x080fe40007800800 */
/*3ef0*/ HMNMX2 R40, R40, R28, PT ; /* 0x0000001c28287240 */
/* 0x000fe40003800000 */
/*3f00*/ HMNMX2 R20, R34.reuse, R31.reuse, !PT ; /* 0x0000001f22147240 */
/* 0x0c0fe40007800000 */
/*3f10*/ HMNMX2 R31, R34.H0_H0, R31.H0_H0, PT ; /* 0x2000001f221f7240 */
/* 0x000fc40003800800 */
/*3f20*/ HMNMX2 R17, R40.reuse.H0_H0, R20.reuse.H0_H0, PT ; /* 0x2000001428117240 */
/* 0x0c0fe40003800800 */
/*3f30*/ HMNMX2 R18, R40.reuse.H0_H0, R20.reuse.H0_H0, !PT ; /* 0x2000001428127240 */
/* 0x0c0fe40007800800 */
/*3f40*/ HMNMX2 R19, R40.reuse.H1_H1, R20.reuse.H1_H1, PT ; /* 0x3000001428137240 */
/* 0x0c0fe40003800c00 */
/*3f50*/ HMNMX2 R20, R40.H1_H1, R20.H1_H1, !PT ; /* 0x3000001428147240 */
/* 0x000fe20007800c00 */
/*3f60*/ IMAD.MOV.U32 R40, RZ, RZ, c[0x0][0x160] ; /* 0x00005800ff287624 */
/* 0x000fe200078e00ff */
/*3f70*/ PRMT R36, R36, 0x5410, R17 ; /* 0x0000541024247816 */
/* 0x000fe40000000011 */
/*3f80*/ PRMT R35, R35, 0x5410, R19 ; /* 0x0000541023237816 */
/* 0x000fc40000000013 */
/*3f90*/ STG.E.128 [R40.64], R8 ; /* 0x0000000828007986 */