It seems to work, LoL:
# cat t156.cu
#include <cuda_fp16.h>

#include <cstdlib>   // std::exit
#include <cstring>   // memcpy
#include <iostream>
// Vectorized copy of I elements of type T from A to O using 16-byte (int4)
// loads/stores. Preconditions: O and A must be 16-byte aligned, and I must
// be a multiple of 16/sizeof(T) (e.g. 8 for half) — otherwise the int4
// accesses are misaligned or read/write past the end. The C-style cast on
// the source intentionally drops const for the int4 reinterpretation.
template <class T> __device__ __forceinline__ void cp(T* __restrict__ O, const T* __restrict__ A, int I) {
#pragma unroll
for (int i = 0; i < I; i += 16 / sizeof(T))
* (int4*)(O + i) = *(int4*)(A + i);
}
// Branch-free compare-and-swap: leaves the smaller of the two values in a
// and the larger in b, using the half-precision min/max intrinsics.
__device__ __forceinline__ void sort2(half& __restrict__ a, half& __restrict__ b) {
    const half lo = __hmin(a, b);
    const half hi = __hmax(a, b);
    a = lo;
    b = hi;
}
// In-register sorting network: each thread loads N half values from A,
// sorts them ascending, and writes them back to the same location.
// The kernel never reads threadIdx/blockIdx, so every thread operates on
// the same N elements — it is launched <<<1,1>>> by main below; a wider
// launch would race. Preconditions inherited from cp(): A 16-byte aligned
// and N a multiple of 8. The merge structure (I doubling, x stepping by
// 2*I) assumes N is a power of two — TODO confirm for other N.
// All loops are fully unrolled so a[] can live in registers (the point of
// the surrounding discussion).
template <int N = 32> __global__ void sort2_ker(half* __restrict__ A) {
half a[N];
cp(a, A, N); // vectorized load into the local array
#pragma unroll
for (int I = 1; I < N; I *= 2) { // merge adjacent sorted runs of length I
#pragma unroll
for (int x = I; x < N; x += I * 2) { // x = boundary between the two runs
#pragma unroll
for (int i = N; i > 1; i--) {
if (i <= I){
// compare-exchange elements i apart across the run boundary
sort2(a[x - i], a[x]);
sort2(a[x - 1], a[x + i - 1]);
#pragma unroll
for (int j = 1; j < i - 1; j++) {
sort2(a[x - 1 - j], a[x - 1]);
sort2(a[x], a[x + j]);
}
}
}
sort2(a[x - 1], a[x]); // final exchange at the boundary itself
}
}
cp(A, a, N); // vectorized store back to global memory
}
// Host driver: sorts 32 half values in managed memory with a single-thread
// launch of sort2_ker and prints the result. Expected output: 0..31.
int main(){
    // Check a cudaError_t; print the failing call and exit non-zero.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(1);
        }
    };
    // 32 distinct values in [0, 31], deliberately out of order.
    half Ah[32] = {1., 8., 4., 6., 3., 12., 11., 10., 9., 2., 5., 7., 13., 14., 15.,16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 0., 28., 29., 30., 31.};
    half *A;
    check(cudaMallocManaged(&A, 32*sizeof(half)), "cudaMallocManaged");
    memcpy(A, Ah, 32*sizeof(half)); // managed memory is host-accessible
    sort2_ker<32><<<1,1>>>(A);
    check(cudaGetLastError(), "kernel launch");         // catches bad launch config
    check(cudaDeviceSynchronize(), "kernel execution"); // catches in-kernel faults
    for (int i = 0; i < 32; i++) std::cout << __half2float(A[i]) << " ";
    std::cout << std::endl;
    check(cudaFree(A), "cudaFree"); // release the managed allocation
    return 0;
}
# nvcc -o t156 t156.cu
# compute-sanitizer ./t156
========= COMPUTE-SANITIZER
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
========= ERROR SUMMARY: 0 errors
#
If this is a concern for you, the information in this thread is probably enough to file a bug asking the compiler team whether the “optimized” result can be obtained without the modification I suggested. I don’t know whether that would be practical or sensible. As a practical matter, I suspect there is some kind of nesting limit that stops the compiler from descending arbitrarily deep into certain analyses.