Can't get ptx to stop using local

I wrote a simple kernel that does merge sort purely in register, like this

template<class T>__device__ __forceinline__ void cp(T* __restrict__ O, const T* __restrict__ A, int I) {
  #pragma unroll
  for (int i = 0; i < I; i += 16 / sizeof(T))
    * (int4*)(O + i) = *(int4*)(A + i);
}
__device__ __forceinline__ void sort2(half& __restrict__ a, half& __restrict__ b) {
  half t = __hmin(a, b);
  b = __hmax(a, b);
  a = t;
};
template<int N = 32>__global__ void __maxnreg__(255) sort2_ker(half* __restrict__ A) {
  half a[N];
  cp(a, A, N);
  #pragma unroll
  for (int I = 1; I < N; I *= 2) {
    #pragma unroll
    for (int x = I; x < N; x += I * 2) {
      #pragma unroll
      for (int i = I; i > 1; i--) {
        sort2(a[x - i], a[x]);
        sort2(a[x - 1], a[x + i - 1]);
        #pragma unroll
        for (int j = 1; j < i - 1; j++) {
          sort2(a[x - 1 - j], a[x - 1]);
          sort2(a[x], a[x + j]);
        }
      }
      sort2(a[x - 1], a[x]);
    }
  }
  cp(A, a, N);
}

Since this program can be fully unrolled, all register ids should be deterministic. But no matter what I do, as long as N>=32 (which uses 16 registers, way less than 256), there’re be a lot of local memory access in PTX and SASS.

Since it already starts to use local in PTX, and register usage 51 is far less from 256, I think it’s using local because it’s trying to address registers. But since all indices are fixed, register ids should also be fixed right?

I suspect the lack of register optimization may be related to the loops whose extent is dependent on an outer loop. The compiler may or may not be able to perform heroic-level optimizations. You have something like 3 nested loops, whose extents depend on the previous loop-nest.

When I convert those to all fixed/obviously discoverable extents, the compile time “blows up”, which suggests to me the compiler is then working hard to unroll everything and optimize. I’m not providing the code since it is obviously meaningless/broken when I do that. But after 5 minutes the compiler is still chugging…

[later]: after an hour or so I got:

Killed

which I think is the kernel OOM killer. This machine has 32G system ram.

If I change this:

to this:

  #pragma unroll
  for (int i = N; i > 1; i--) {
    if (i <= I){
      sort2(a[x - i], a[x]);
      sort2(a[x - 1], a[x + i - 1]);
      #pragma unroll
      for (int j = 1; j < i - 1; j++) {
        sort2(a[x - 1 - j], a[x - 1]);
        sort2(a[x], a[x + j]);
      }
    }
  }

which I do not believe should affect the program logic, then I get fairly rapid compilation, and the resulting code has no LDL/STL instructions in it.

Here is my test case on CUDA 12.2:

# cat t156.cu
#include <cuda_fp16.h>

template <class T> __device__ __forceinline__ void cp(T* __restrict__ O, const T* __restrict__ A, int I) {
  #pragma unroll
  for (int i = 0; i < I; i += 16 / sizeof(T))
    * (int4*)(O + i) = *(int4*)(A + i);
}
__device__ __forceinline__ void sort2(half& __restrict__ a, half& __restrict__ b) {
  half t = __hmin(a, b);
  b = __hmax(a, b);
  a = t;
};
template <int N = 32> __global__ void sort2_ker(half* __restrict__ A) {
  half a[N];
  cp(a, A, N);
  #pragma unroll
  for (int I = 1; I < N; I *= 2) {
    #pragma unroll
    for (int x = I; x < N; x += I * 2) {
      #pragma unroll
      for (int i = N; i > 1; i--) {
        if (i <= I){
          sort2(a[x - i], a[x]);
          sort2(a[x - 1], a[x + i - 1]);
          #pragma unroll
          for (int j = 1; j < i - 1; j++) {
            sort2(a[x - 1 - j], a[x - 1]);
            sort2(a[x], a[x + j]);
          }
        }
      }
      sort2(a[x - 1], a[x]);
    }
  }
  cp(A, a, N);
}

int main(){

  half *A=NULL;
  sort2_ker<32><<<1,32>>>(A);
  cudaDeviceSynchronize();
}
# nvcc -o t156 t156.cu -arch=sm_89
# cuobjdump -sass ./t156 |grep LD
        /*0020*/                   ULDC.64 UR4, c[0x0][0x118] ;                         /* 0x0000460000047ab9 */
        /*0040*/                   LDG.E.128 R8, [R2.64] ;                              /* 0x0000000402087981 */
        /*0050*/                   LDG.E.128 R16, [R2.64+0x10] ;                        /* 0x0000100402107981 */
        /*0060*/                   LDG.E.128 R20, [R2.64+0x20] ;                        /* 0x0000200402147981 */
        /*0070*/                   LDG.E.128 R4, [R2.64+0x30] ;                         /* 0x0000300402047981 */
#

The SASS code is too long to post, but you can inspect it in godbolt, I suppose. Here is a typical sequence that I see:

        /*34b0*/                   HMNMX2 R4, R18.reuse, R19.reuse, PT ;                /* 0x0000001312047240 */
                                                                                        /* 0x0c0fe40003800000 */
        /*34c0*/                   HMNMX2 R19, R18, R19, !PT ;                          /* 0x0000001312137240 */
                                                                                        /* 0x000fc60007800000 */
        /*34d0*/                   PRMT R17, R30, 0x5410, R4 ;                          /* 0x000054101e117816 */
                                                                                        /* 0x000fca0000000004 */
        /*34e0*/                   HMNMX2 R21, R17, R20, !PT ;                          /* 0x0000001411157240 */
                                                                                        /* 0x000fe40007800000 */
        /*34f0*/                   PRMT R17, R4, 0x7610, R19 ;                          /* 0x0000761004117816 */
                                                                                        /* 0x000fe40000000013 */
        /*3500*/                   PRMT R4, R36, 0x7610, R4 ;                           /* 0x0000761024047816 */
                                                                                        /* 0x000fe40000000004 */
        /*3510*/                   PRMT R16, R16, 0x7610, R21 ;                         /* 0x0000761010107816 */
                                                                                        /* 0x000fca0000000015 */
        /*3520*/                   HMNMX2 R16, R17, R16, PT ;                           /* 0x0000001011107240 */
                                                                                        /* 0x000fca0003800000 */
        /*3530*/                   PRMT R15, R15, 0x5432, R16 ;                         /* 0x000054320f0f7816 */
                                                                                        /* 0x000fca0000000010 */
        /*3540*/                   HMNMX2 R20, R4, R15, !PT ;                           /* 0x0000000f04147240 */
                                                                                        /* 0x000fe40007800000 */
        /*3550*/                   PRMT R15, R16, 0x7632, R14 ;                         /* 0x00007632100f7816 */
                                                                                        /* 0x000fc6000000000e */
        /*3560*/                   PRMT R14, R7, 0x7610, R20 ;                          /* 0x00007610070e7816 */
                                                                                        /* 0x000fca0000000014 */
        /*3570*/                   HMNMX2 R7, R14.reuse, R15.reuse, PT ;                /* 0x0000000f0e077240 */
                                                                                        /* 0x0c0fe40003800000 */
        /*3580*/                   HMNMX2 R14, R14, R15, !PT ;                          /* 0x0000000f0e0e7240 */
                                                                                        /* 0x000fe40007800000 */
        /*3590*/                   PRMT R15, R6, 0x7632, R5 ;                           /* 0x00007632060f7816 */
                                                                                        /* 0x000fe40000000005 */
        /*35a0*/                   PRMT R13, R13, 0x7610, R7 ;                          /* 0x000076100d0d7816 */
                                                                                        /* 0x000fe40000000007 */
        /*35b0*/                   PRMT R18, R14, 0x7610, R11 ;                         /* 0x000076100e127816 */
                                                                                        /* 0x000fe4000000000b */
        /*35c0*/                   PRMT R11, R11, 0x5410, R12 ;                         /* 0x000054100b0b7816 */
                                                                                        /* 0x000fc6000000000c */
        /*35d0*/                   HMNMX2 R34, R13.reuse, R18.reuse, PT ;               /* 0x000000120d227240 */
                                                                                        /* 0x0c0fe40003800000 */
        /*35e0*/                   HMNMX2 R35, R13, R18, !PT ;                          /* 0x000000120d237240 */
                                                                                        /* 0x000fe40007800000 */
        /*35f0*/                   HMNMX2 R5, R34.H1_H1, R12.H1_H1, PT ;                /* 0x3000000c22057240 */
                                                                                        /* 0x000fe40003800c00 */
        /*3600*/                   HMNMX2 R15, R15, R11, PT ;                           /* 0x0000000b0f0f7240 */
                                                                                        /* 0x000fe40003800000 */
        /*3610*/                   PRMT R11, R0, 0x5410, R16 ;                          /* 0x00005410000b7816 */
                                                                                        /* 0x000fe40000000010 */
        /*3620*/                   PRMT R13, R35, 0x5410, R0 ;                          /* 0x00005410230d7816 */
                                                                                        /* 0x000fc40000000000 */
        /*3630*/                   PRMT R4, R5, 0x7610, R4 ;                            /* 0x0000761005047816 */
                                                                                        /* 0x000fe40000000004 */
        /*3640*/                   PRMT R0, R15, 0x5410, R5 ;                           /* 0x000054100f007816 */
                                                                                        /* 0x000fc60000000005 */
        /*3650*/                   HMNMX2 R11, R4, R11, PT ;                            /* 0x0000000b040b7240 */
                                                                                        /* 0x000fe40003800000 */
        /*3660*/                   HMNMX2 R0, R0, R13, !PT ;                            /* 0x0000000d00007240 */
                                                                                        /* 0x000fc60007800000 */
        /*3670*/                   PRMT R28, R15, 0x5432, R11 ;                         /* 0x000054320f1c7816 */
                                                                                        /* 0x000fe4000000000b */
        /*3680*/                   PRMT R5, R0, 0x5410, R3 ;                            /* 0x0000541000057816 */
                                                                                        /* 0x000fca0000000003 */
        /*3690*/                   HMNMX2 R3, R28.reuse, R5.reuse, !PT ;                /* 0x000000051c037240 */
                                                                                        /* 0x0c0fe40007800000 */
        /*36a0*/                   HMNMX2 R28, R28, R5, PT ;                            /* 0x000000051c1c7240 */
                                                                                        /* 0x000fe40003800000 */
        /*36b0*/                   HMNMX2 R13, R31.H1_H1, R3.H0_H0, !PT ;               /* 0x200000031f0d7240 */
                                                                                        /* 0x000fe40007800c00 */
        /*36c0*/                   HMNMX2 R5, R28.H1_H1, R20.reuse.H0_H0, PT ;          /* 0x200000141c057240 */
                                                                                        /* 0x080fe40003800c00 */
        /*36d0*/                   HMNMX2 R4, R36.H1_H1, R13.H0_H0, !PT ;               /* 0x2000000d24047240 */
                                                                                        /* 0x000fe40007800c00 */
        /*36e0*/                   HMNMX2 R20, R28.H1_H1, R20.H0_H0, !PT ;              /* 0x200000141c147240 */
                                                                                        /* 0x000fc40007800c00 */
        /*36f0*/                   HMNMX2 R6, R30.H1_H1, R4.H0_H0, !PT ;                /* 0x200000041e067240 */
                                                                                        /* 0x000fe40007800c00 */
        /*3700*/                   PRMT R2, R5, 0x7610, R2 ;                            /* 0x0000761005027816 */
                                                                                        /* 0x000fe40000000002 */
        /*3710*/                   PRMT R30, R36, 0x7632, R30 ;                         /* 0x00007632241e7816 */
                                                                                        /* 0x000fe4000000001e */
        /*3720*/                   PRMT R17, R21, 0x5410, R6 ;                          /* 0x0000541015117816 */
                                                                                        /* 0x000fe40000000006 */
        /*3730*/                   PRMT R13, R13, 0x5410, R4 ;                          /* 0x000054100d0d7816 */
                                                                                        /* 0x000fe40000000004 */
        /*3740*/                   HMNMX2 R21, R19.H1_H1, R21.H1_H1, !PT ;              /* 0x3000001513157240 */
                                                                                        /* 0x000fc40007800c00 */
        /*3750*/                   HMNMX2 R18, R2.reuse, R17.reuse, PT ;                /* 0x0000001102127240 */
                                                                                        /* 0x0c0fe40003800000 */
        /*3760*/                   HMNMX2 R17, R2, R17, !PT ;                           /* 0x0000001102117240 */
                                                                                        /* 0x000fe40007800000 */
        /*3770*/                   HMNMX2 R22, R18.reuse.H0_H0, R19.reuse.H0_H0, !PT ;  /* 0x2000001312167240 */
                                                                                        /* 0x0c0fe40007800800 */
        /*3780*/                   HMNMX2 R37, R18.reuse.H0_H0, R19.H0_H0, PT ;         /* 0x2000001312257240 */
                                                                                        /* 0x040fe40003800800 */
        /*3790*/                   HMNMX2 R5, R17.H1_H1, R22.H0_H0, PT ;                /* 0x2000001611057240 */
                                                                                        /* 0x000fe40003800c00 */
        /*37a0*/                   HMNMX2 R6, R18.H1_H1, R37.H0_H0, !PT ;               /* 0x2000002512067240 */
                                                                                        /* 0x000fc40007800c00 */
        /*37b0*/                   HMNMX2 R32, R7.H0_H0, R5.H0_H0, !PT ;                /* 0x2000000507207240 */
                                                                                        /* 0x000fe40007800800 */
        /*37c0*/                   HMNMX2 R2, R6.H0_H0, R14.H1_H1, PT ;                 /* 0x3000000e06027240 */
                                                                                        /* 0x000fe40003800800 */
        /*37d0*/                   PRMT R5, R5, 0x7610, R0 ;                            /* 0x0000761005057816 */
                                                                                        /* 0x000fe40000000000 */
        /*37e0*/                   PRMT R32, R32, 0x7610, R12 ;                         /* 0x0000761020207816 */
                                                                                        /* 0x000fe4000000000c */
        /*37f0*/                   PRMT R12, R15, 0x5410, R2 ;                          /* 0x000054100f0c7816 */
                                                                                        /* 0x000fe40000000002 */
        /*3800*/                   HMNMX2 R30, R30, R13, PT ;                           /* 0x0000000d1e1e7240 */
                                                                                        /* 0x000fc40003800000 */
        /*3810*/                   HMNMX2 R15, R34, R32, !PT ;                          /* 0x00000020220f7240 */
                                                                                        /* 0x000fe40007800000 */
        /*3820*/                   HMNMX2 R33, R12, R35.reuse, PT ;                     /* 0x000000230c217240 */
                                                                                        /* 0x080fe40003800000 */
        /*3830*/                   HMNMX2 R2, R2.H0_H0, R35.H1_H1, !PT ;                /* 0x3000002302027240 */
                                                                                        /* 0x000fe40007800800 */
        /*3840*/                   HMNMX2 R12, R33.H1_H1, R15.H1_H1, PT ;               /* 0x3000000f210c7240 */
                                                                                        /* 0x000fe40003800c00 */
        /*3850*/                   PRMT R34, R34, 0x5410, R33 ;                         /* 0x0000541022227816 */
                                                                                        /* 0x000fe40000000021 */
        /*3860*/                   PRMT R32, R32, 0x5410, R15 ;                         /* 0x0000541020207816 */
                                                                                        /* 0x000fc4000000000f */
        /*3870*/                   PRMT R7, R7, 0x5410, R12 ;                           /* 0x0000541007077816 */
                                                                                        /* 0x000fe4000000000c */
        /*3880*/                   HMNMX2 R12, R12.H0_H0, R0.H1_H1, !PT ;               /* 0x300000000c0c7240 */
                                                                                        /* 0x000fe40007800800 */
        /*3890*/                   HMNMX2 R37, R18.H1_H1, R37.H0_H0, PT ;               /* 0x2000002512257240 */
                                                                                        /* 0x000fe40003800c00 */
        /*38a0*/                   HMNMX2 R16, R7, R5, PT ;                             /* 0x0000000507107240 */
                                                                                        /* 0x000fe40003800000 */
        /*38b0*/                   HMNMX2 R7, R33, R15, !PT ;                           /* 0x0000000f21077240 */
                                                                                        /* 0x000fe40007800000 */
        /*38c0*/                   HMNMX2 R15, R34, R32, PT ;                           /* 0x00000020220f7240 */
                                                                                        /* 0x000fc40003800000 */
        /*38d0*/                   PRMT R40, R31, 0x7632, R16 ;                         /* 0x000076321f287816 */
                                                                                        /* 0x000fe40000000010 */
        /*38e0*/                   HMNMX2 R5, R28.reuse.H0_H0, R7.reuse.H0_H0, !PT ;    /* 0x200000071c057240 */
                                                                                        /* 0x0c0fe40007800800 */
        /*38f0*/                   HMNMX2 R28, R28.H0_H0, R7.H0_H0, PT ;                /* 0x200000071c1c7240 */
                                                                                        /* 0x000fe40003800800 */
        /*3900*/                   HMNMX2 R31, R40, R3, PT ;                            /* 0x00000003281f7240 */
                                                                                        /* 0x000fe40003800000 */
        /*3910*/                   PRMT R14, R14, 0x5432, R5 ;                          /* 0x000054320e0e7816 */
                                                                                        /* 0x000fe40000000005 */
        /*3920*/                   HMNMX2 R5, R31.H0_H0, R5.H0_H0, PT ;                 /* 0x200000051f057240 */
                                                                                        /* 0x000fc40003800800 */
        /*3930*/                   PRMT R6, R6, 0x5410, R31 ;                           /* 0x0000541006067816 */
                                                                                        /* 0x000fe4000000001f */
        /*3940*/                   HMNMX2 R3, R16.H1_H1, R3.H1_H1, !PT ;                /* 0x3000000310037240 */
                                                                                        /* 0x000fe40007800c00 */
        /*3950*/                   HMNMX2 R22, R17.H1_H1, R22.H0_H0, !PT ;              /* 0x2000001611167240 */
                                                                                        /* 0x000fe40007800c00 */
        /*3960*/                   HMNMX2 R45, R6, R14, !PT ;                           /* 0x0000000e062d7240 */
                                                                                        /* 0x000fe40007800000 */
        /*3970*/                   HMNMX2 R6, R31.H1_H1, R20.H0_H0, PT ;                /* 0x200000141f067240 */
                                                                                        /* 0x000fe40003800c00 */
        /*3980*/                   HMNMX2 R4, R30.H0_H0, R45.H1_H1, !PT ;               /* 0x3000002d1e047240 */
                                                                                        /* 0x000fc40007800800 */
        /*3990*/                   HMNMX2 R14, R6.H0_H0, R17.reuse.H0_H0, PT ;          /* 0x20000011060e7240 */
                                                                                        /* 0x080fe40003800800 */
        /*39a0*/                   HMNMX2 R13, R30.H1_H1, R4.reuse.H0_H0, PT ;          /* 0x200000041e0d7240 */
                                                                                        /* 0x080fe40003800c00 */
        /*39b0*/                   HMNMX2 R6, R6.H0_H0, R17.H0_H0, !PT ;                /* 0x2000001106067240 */
                                                                                        /* 0x000fe40007800800 */
        /*39c0*/                   PRMT R20, R20, 0x5410, R14 ;                         /* 0x0000541014147816 */
                                                                                        /* 0x000fe4000000000e */
        /*39d0*/                   PRMT R33, R31, 0x5432, R13 ;                         /* 0x000054321f217816 */
                                                                                        /* 0x000fe4000000000d */
        /*39e0*/                   HMNMX2 R4, R30.H1_H1, R4.H0_H0, !PT ;                /* 0x200000041e047240 */
                                                                                        /* 0x000fc40007800c00 */
        /*39f0*/                   HMNMX2 R30, R30.H0_H0, R45.H1_H1, PT ;               /* 0x3000002d1e1e7240 */
                                                                                        /* 0x000fe40003800800 */
        /*3a00*/                   HMNMX2 R35, R4.H0_H0, R6.H0_H0, PT ;                 /* 0x2000000604237240 */
                                                                                        /* 0x000fe40003800800 */
        /*3a10*/                   HMNMX2 R20, R33, R20, !PT ;                          /* 0x0000001421147240 */
                                                                                        /* 0x000fe40007800000 */
        /*3a20*/                   HMNMX2 R33, R16.H0_H0, R35.H0_H0, !PT ;              /* 0x2000002310217240 */
                                                                                        /* 0x000fe40007800800 */
        /*3a30*/                   HMNMX2 R32, R20.reuse.H1_H1, R45.reuse.H0_H0, PT ;   /* 0x2000002d14207240 */
                                                                                        /* 0x0c0fe40003800c00 */
        /*3a40*/                   HMNMX2 R45, R20.H1_H1, R45.H0_H0, !PT ;              /* 0x2000002d142d7240 */
                                                                                        /* 0x000fc40007800c00 */
        /*3a50*/                   PRMT R33, R33, 0x5410, R2 ;                          /* 0x0000541021217816 */
                                                                                        /* 0x000fe40000000002 */
        /*3a60*/                   PRMT R32, R15, 0x5410, R32 ;                         /* 0x000054100f207816 */
                                                                                        /* 0x000fe40000000020 */
        /*3a70*/                   HMNMX2 R35, R16.H0_H0, R35.H0_H0, PT ;               /* 0x2000002310237240 */
                                                                                        /* 0x000fe40003800800 */
        /*3a80*/                   PRMT R11, R11, 0x5432, R37 ;                         /* 0x000054320b0b7816 */
                                                                                        /* 0x000fe40000000025 */
        /*3a90*/                   HMNMX2 R43, R32.reuse, R33.reuse, PT ;               /* 0x00000021202b7240 */
                                                                                        /* 0x0c0fe40003800000 */
        /*3aa0*/                   HMNMX2 R36, R32, R33, !PT ;                          /* 0x0000002120247240 */
                                                                                        /* 0x000fc40007800000 */
        /*3ab0*/                   HMNMX2 R0, R43.H1_H1, R7.H1_H1, PT ;                 /* 0x300000072b007240 */
                                                                                        /* 0x000fe40003800c00 */
        /*3ac0*/                   HMNMX2 R2, R15.H1_H1, R36.H0_H0, !PT ;               /* 0x200000240f027240 */
                                                                                        /* 0x000fe40007800c00 */
        /*3ad0*/                   HMNMX2 R13, R13.H0_H0, R14.H0_H0, PT ;               /* 0x2000000e0d0d7240 */
                                                                                        /* 0x000fe40003800800 */
        /*3ae0*/                   PRMT R28, R28, 0x5410, R0 ;                          /* 0x000054101c1c7816 */
                                                                                        /* 0x000fe40000000000 */
        /*3af0*/                   PRMT R2, R2, 0x5410, R12 ;                           /* 0x0000541002027816 */
                                                                                        /* 0x000fca000000000c */
        /*3b00*/                   HMNMX2 R31, R28.reuse, R2.reuse, !PT ;               /* 0x000000021c1f7240 */
                                                                                        /* 0x0c0fe40007800000 */
        /*3b10*/                   HMNMX2 R32, R28, R2, PT ;                            /* 0x000000021c207240 */
                                                                                        /* 0x000fe40003800000 */
        /*3b20*/                   HMNMX2 R28, R5.H0_H0, R31.H0_H0, !PT ;               /* 0x2000001f051c7240 */
                                                                                        /* 0x000fe40007800800 */
        /*3b30*/                   HMNMX2 R2, R32.H1_H1, R3.H0_H0, PT ;                 /* 0x2000000320027240 */
                                                                                        /* 0x000fe40003800c00 */
        /*3b40*/                   HMNMX2 R12, R30.H0_H0, R28.H0_H0, PT ;               /* 0x2000001c1e0c7240 */
                                                                                        /* 0x000fe40003800800 */
        /*3b50*/                   HMNMX2 R0, R2.H0_H0, R20.H0_H0, PT ;                 /* 0x2000001402007240 */
                                                                                        /* 0x000fc40003800800 */
        /*3b60*/                   HMNMX2 R2, R2.H0_H0, R20.H0_H0, !PT ;                /* 0x2000001402027240 */
                                                                                        /* 0x000fe40007800800 */
        /*3b70*/                   PRMT R30, R30, 0x5410, R12 ;                         /* 0x000054101e1e7816 */
                                                                                        /* 0x000fe4000000000c */
        /*3b80*/                   PRMT R28, R28, 0x5410, R0 ;                          /* 0x000054101c1c7816 */
                                                                                        /* 0x000fe40000000000 */
        /*3b90*/                   HMNMX2 R20, R43.H1_H1, R7.H1_H1, !PT ;               /* 0x300000072b147240 */
                                                                                        /* 0x000fe40007800c00 */
        /*3ba0*/                   HMNMX2 R0, R12.H0_H0, R0.H0_H0, PT ;                 /* 0x200000000c007240 */
                                                                                        /* 0x000fe40003800800 */
        /*3bb0*/                   HMNMX2 R44, R30, R28, !PT ;                          /* 0x0000001c1e2c7240 */
                                                                                        /* 0x000fc80007800000 */
        /*3bc0*/                   HMNMX2 R33, R44.reuse.H1_H1, R45.H0_H0, PT ;         /* 0x2000002d2c217240 */
                                                                                        /* 0x040fe40003800c00 */
        /*3bd0*/                   HMNMX2 R30, R44.reuse.H0_H0, R2.H0_H0, PT ;          /* 0x200000022c1e7240 */
                                                                                        /* 0x040fe40003800800 */
        /*3be0*/                   PRMT R2, R2, 0x5410, R45 ;                           /* 0x0000541002027816 */
                                                                                        /* 0x000fe4000000002d */
        /*3bf0*/                   PRMT R15, R15, 0x5432, R33 ;                         /* 0x000054320f0f7816 */
                                                                                        /* 0x000fe40000000021 */
        /*3c00*/                   HMNMX2 R34, R35.H0_H0, R30.H0_H0, !PT ;              /* 0x2000001e23227240 */
                                                                                        /* 0x000fe40007800800 */
        /*3c10*/                   HMNMX2 R2, R44, R2, !PT ;                            /* 0x000000022c027240 */
                                                                                        /* 0x000fc40007800000 */
        /*3c20*/                   HMNMX2 R28, R15, R36, PT ;                           /* 0x000000240f1c7240 */
                                                                                        /* 0x000fe40003800000 */
        /*3c30*/                   HMNMX2 R40, R43.H0_H0, R34.H0_H0, !PT ;              /* 0x200000222b287240 */
                                                                                        /* 0x000fe40007800800 */
        /*3c40*/                   HMNMX2 R15, R28.reuse.H1_H1, R20.H0_H0, PT ;         /* 0x200000141c0f7240 */
                                                                                        /* 0x040fe40003800c00 */
        /*3c50*/                   HMNMX2 R16, R28.H0_H0, R40.H0_H0, !PT ;              /* 0x200000281c107240 */
                                                                                        /* 0x000fe40007800800 */
        /*3c60*/                   HMNMX2 R33, R33.H0_H0, R36.H1_H1, !PT ;              /* 0x3000002421217240 */
                                                                                        /* 0x000fe40007800800 */
        /*3c70*/                   PRMT R46, R5, 0x5410, R15 ;                          /* 0x00005410052e7816 */
                                                                                        /* 0x000fc4000000000f */
        /*3c80*/                   PRMT R16, R16, 0x5410, R3 ;                          /* 0x0000541010107816 */
                                                                                        /* 0x000fe40000000003 */
        /*3c90*/                   HMNMX2 R43, R43.H0_H0, R34.H0_H0, PT ;               /* 0x200000222b2b7240 */
                                                                                        /* 0x000fe40003800800 */
        /*3ca0*/                   HMNMX2 R46, R46, R31, PT ;                           /* 0x0000001f2e2e7240 */
                                                                                        /* 0x000fe40003800000 */
        /*3cb0*/                   HMNMX2 R5, R32, R16, !PT ;                           /* 0x0000001020057240 */
                                                                                        /* 0x000fe40007800000 */
        /*3cc0*/                   HMNMX2 R40, R28.H0_H0, R40.H0_H0, PT ;               /* 0x200000281c287240 */
                                                                                        /* 0x000fe40003800800 */
        /*3cd0*/                   HMNMX2 R7, R46.H0_H0, R5.H0_H0, !PT ;                /* 0x200000052e077240 */
                                                                                        /* 0x000fc40007800800 */
        /*3ce0*/                   HMNMX2 R3, R46.reuse.H1_H1, R5.reuse.H1_H1, !PT ;    /* 0x300000052e037240 */
                                                                                        /* 0x0c0fe40007800c00 */
        /*3cf0*/                   HMNMX2 R5, R46, R5, PT ;                             /* 0x000000052e057240 */
                                                                                        /* 0x000fe40003800000 */
        /*3d00*/                   PRMT R44, R35, 0x5410, R7 ;                          /* 0x00005410232c7816 */
                                                                                        /* 0x000fe40000000007 */
        /*3d10*/                   PRMT R30, R30, 0x5410, R3 ;                          /* 0x000054101e1e7816 */
                                                                                        /* 0x000fe40000000003 */
        /*3d20*/                   HMNMX2 R35, R5.H0_H0, R5.H1_H1, !PT ;                /* 0x3000000505237240 */
                                                                                        /* 0x000fe40007800800 */
        /*3d30*/                   HMNMX2 R20, R28.H1_H1, R20.H0_H0, !PT ;              /* 0x200000141c147240 */
                                                                                        /* 0x000fc40007800c00 */
        /*3d40*/                   HMNMX2 R30, R44, R30, PT ;                           /* 0x0000001e2c1e7240 */
                                                                                        /* 0x000fe40003800000 */
        /*3d50*/                   HMNMX2 R34, R35.H0_H0, R2.H1_H1, PT ;                /* 0x3000000223227240 */
                                                                                        /* 0x000fe40003800800 */
        /*3d60*/                   HMNMX2 R36, R30.H0_H0, R30.H1_H1, !PT ;              /* 0x3000001e1e247240 */
                                                                                        /* 0x000fe40007800800 */
        /*3d70*/                   HMNMX2 R16, R32.H0_H0, R16.H0_H0, PT ;               /* 0x2000001020107240 */
                                                                                        /* 0x000fe40003800800 */
        /*3d80*/                   PRMT R43, R43, 0x5410, R34 ;                         /* 0x000054102b2b7816 */
                                                                                        /* 0x000fe40000000022 */
        /*3d90*/                   PRMT R36, R36, 0x5410, R33 ;                         /* 0x0000541024247816 */
                                                                                        /* 0x000fc40000000021 */
        /*3da0*/                   HMNMX2 R32, R15.H0_H0, R31.H1_H1, !PT ;              /* 0x3000001f0f207240 */
                                                                                        /* 0x000fe40007800800 */
        /*3db0*/                   HMNMX2 R35, R35.H0_H0, R2.H1_H1, !PT ;               /* 0x3000000223237240 */
                                                                                        /* 0x000fe40007800800 */
        /*3dc0*/                   HMNMX2 R34, R43.reuse, R36.reuse, PT ;               /* 0x000000242b227240 */
                                                                                        /* 0x0c0fe40003800000 */
        /*3dd0*/                   HMNMX2 R28, R43, R36, !PT ;                          /* 0x000000242b1c7240 */
                                                                                        /* 0x000fe40007800000 */
        /*3de0*/                   HMNMX2 R15, R34.H1_H1, R20.H0_H0, PT ;               /* 0x20000014220f7240 */
                                                                                        /* 0x000fe40003800c00 */
        /*3df0*/                   HMNMX2 R31, R40.H0_H0, R28.H0_H0, !PT ;              /* 0x2000001c281f7240 */
                                                                                        /* 0x000fc40007800800 */
        /*3e00*/                   HMNMX2 R30, R30.H0_H0, R30.H1_H1, PT ;               /* 0x3000001e1e1e7240 */
                                                                                        /* 0x000fe40003800800 */
        /*3e10*/                   PRMT R16, R16, 0x5410, R15 ;                         /* 0x0000541010107816 */
                                                                                        /* 0x000fe4000000000f */
        /*3e20*/                   PRMT R31, R31, 0x5410, R32 ;                         /* 0x000054101f1f7816 */
                                                                                        /* 0x000fe40000000020 */
        /*3e30*/                   HMNMX2 R3, R7.H0_H0, R3.H0_H0, !PT ;                 /* 0x2000000307037240 */
                                                                                        /* 0x000fe40007800800 */
        /*3e40*/                   HMNMX2 R5, R5.H0_H0, R5.H1_H1, PT ;                  /* 0x3000000505057240 */
                                                                                        /* 0x000fe40003800800 */
        /*3e50*/                   HMNMX2 R15, R16, R31, PT ;                           /* 0x0000001f100f7240 */
                                                                                        /* 0x000fc40003800000 */
        /*3e60*/                   HMNMX2 R16, R16, R31, !PT ;                          /* 0x0000001f10107240 */
                                                                                        /* 0x000fe40007800000 */
        /*3e70*/                   HMNMX2 R36, R15.reuse.H0_H0, R15.H1_H1, !PT ;        /* 0x3000000f0f247240 */
                                                                                        /* 0x040fe40007800800 */
        /*3e80*/                   HMNMX2 R33, R16.H0_H0, R16.H1_H1, PT ;               /* 0x3000001010217240 */
                                                                                        /* 0x000fe40003800800 */
        /*3e90*/                   HMNMX2 R32, R36.H0_H0, R35.H0_H0, PT ;               /* 0x2000002324207240 */
                                                                                        /* 0x000fe40003800800 */
        /*3ea0*/                   HMNMX2 R31, R30.H0_H0, R33.H0_H0, !PT ;              /* 0x200000211e1f7240 */
                                                                                        /* 0x000fe40007800800 */
        /*3eb0*/                   HMNMX2 R15, R15.H0_H0, R15.H1_H1, PT ;               /* 0x3000000f0f0f7240 */
                                                                                        /* 0x000fc40003800800 */
        /*3ec0*/                   PRMT R40, R40, 0x5410, R32 ;                         /* 0x0000541028287816 */
                                                                                        /* 0x000fe40000000020 */
        /*3ed0*/                   PRMT R31, R31, 0x5410, R20 ;                         /* 0x000054101f1f7816 */
                                                                                        /* 0x000fe40000000014 */
        /*3ee0*/                   HMNMX2 R32, R32.H0_H0, R28.reuse.H1_H1, !PT ;        /* 0x3000001c20207240 */
                                                                                        /* 0x080fe40007800800 */
        /*3ef0*/                   HMNMX2 R40, R40, R28, PT ;                           /* 0x0000001c28287240 */
                                                                                        /* 0x000fe40003800000 */
        /*3f00*/                   HMNMX2 R20, R34.reuse, R31.reuse, !PT ;              /* 0x0000001f22147240 */
                                                                                        /* 0x0c0fe40007800000 */
        /*3f10*/                   HMNMX2 R31, R34.H0_H0, R31.H0_H0, PT ;               /* 0x2000001f221f7240 */
                                                                                        /* 0x000fc40003800800 */
        /*3f20*/                   HMNMX2 R17, R40.reuse.H0_H0, R20.reuse.H0_H0, PT ;   /* 0x2000001428117240 */
                                                                                        /* 0x0c0fe40003800800 */
        /*3f30*/                   HMNMX2 R18, R40.reuse.H0_H0, R20.reuse.H0_H0, !PT ;  /* 0x2000001428127240 */
                                                                                        /* 0x0c0fe40007800800 */
        /*3f40*/                   HMNMX2 R19, R40.reuse.H1_H1, R20.reuse.H1_H1, PT ;   /* 0x3000001428137240 */
                                                                                        /* 0x0c0fe40003800c00 */
        /*3f50*/                   HMNMX2 R20, R40.H1_H1, R20.H1_H1, !PT ;              /* 0x3000001428147240 */
                                                                                        /* 0x000fe20007800c00 */
        /*3f60*/                   IMAD.MOV.U32 R40, RZ, RZ, c[0x0][0x160] ;            /* 0x00005800ff287624 */
                                                                                        /* 0x000fe200078e00ff */
        /*3f70*/                   PRMT R36, R36, 0x5410, R17 ;                         /* 0x0000541024247816 */
                                                                                        /* 0x000fe40000000011 */
        /*3f80*/                   PRMT R35, R35, 0x5410, R19 ;                         /* 0x0000541023237816 */
                                                                                        /* 0x000fc40000000013 */
        /*3f90*/                   STG.E.128 [R40.64], R8 ;                             /* 0x0000000828007986 */
1 Like

It seems to work, LoL:

# cat t156.cu
#include <cuda_fp16.h>
#include <iostream>

template <class T> __device__ __forceinline__ void cp(T* __restrict__ O, const T* __restrict__ A, int I) {
  #pragma unroll
  for (int i = 0; i < I; i += 16 / sizeof(T))
    * (int4*)(O + i) = *(int4*)(A + i);
}
__device__ __forceinline__ void sort2(half& __restrict__ a, half& __restrict__ b) {
  half t = __hmin(a, b);
  b = __hmax(a, b);
  a = t;
};
template <int N = 32> __global__ void sort2_ker(half* __restrict__ A) {
  half a[N];
  cp(a, A, N);
  #pragma unroll
  for (int I = 1; I < N; I *= 2) {
    #pragma unroll
    for (int x = I; x < N; x += I * 2) {
      #pragma unroll
      for (int i = N; i > 1; i--) {
        if (i <= I){
          sort2(a[x - i], a[x]);
          sort2(a[x - 1], a[x + i - 1]);
          #pragma unroll
          for (int j = 1; j < i - 1; j++) {
            sort2(a[x - 1 - j], a[x - 1]);
            sort2(a[x], a[x + j]);
          }
        }
      }
      sort2(a[x - 1], a[x]);
    }
  }
  cp(A, a, N);
}

int main(){

  half Ah[32] = {1., 8., 4., 6., 3., 12., 11., 10., 9., 2., 5., 7., 13., 14., 15.,16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 0., 28., 29., 30., 31.};
  half *A;
  cudaMallocManaged(&A, 32*sizeof(half));
  memcpy(A, Ah, 32*sizeof(half));
  sort2_ker<32><<<1,1>>>(A);
  cudaDeviceSynchronize();
  for (int i = 0; i < 32; i++) std::cout << __half2float(A[i]) << " ";
  std::cout << std::endl;
}
# nvcc -o t156 t156.cu
# compute-sanitizer ./t156
========= COMPUTE-SANITIZER
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
========= ERROR SUMMARY: 0 errors
#

If this is of concern to you, I suppose the information in this thread may be enough to file a bug, asking the compiler team to see if the “optimized” result may be obtained without the modification I suggested. I don’t know if it would be practical/sensible or not. I suspect there may be some kind of nesting short-circuit that prevents the compiler from going arbitrarily deep into certain holes, as a practical matter.

1 Like

Thanks a lot! Converting dynamic loop to “a fixed loop + if” is a great idea!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.