It seems to work, LoL:
# cat t156.cu
#include <cuda_fp16.h>

#include <cstdlib>   // std::exit
#include <cstring>   // memcpy
#include <iostream>
// Vectorized copy of I elements of type T from A to O using 16-byte (int4)
// loads/stores. Preconditions: O and A must be 16-byte aligned, and I must
// be a multiple of 16/sizeof(T) (e.g. 8 for half) — otherwise the int4
// accesses are misaligned or read/write past the end. The C-style cast on
// the source intentionally drops const for the int4 reinterpretation.
template <class T> __device__ __forceinline__ void cp(T* __restrict__ O, const T* __restrict__ A, int I) {
#pragma unroll
for (int i = 0; i < I; i += 16 / sizeof(T))
* (int4*)(O + i) = *(int4*)(A + i);
}
// Branch-free compare-and-swap: leaves the smaller of the two values in a
// and the larger in b, using the half-precision min/max intrinsics.
__device__ __forceinline__ void sort2(half& __restrict__ a, half& __restrict__ b) {
    const half lo = __hmin(a, b);
    const half hi = __hmax(a, b);
    a = lo;
    b = hi;
}
// In-register sorting network: each thread loads N half values from A,
// sorts them ascending, and writes them back to the same location.
// The kernel never reads threadIdx/blockIdx, so every thread operates on
// the same N elements — it is launched <<<1,1>>> by main below; a wider
// launch would race. Preconditions inherited from cp(): A 16-byte aligned
// and N a multiple of 8. The merge structure (I doubling, x stepping by
// 2*I) assumes N is a power of two — TODO confirm for other N.
// All loops are fully unrolled so a[] can live in registers (the point of
// the surrounding discussion).
template <int N = 32> __global__ void sort2_ker(half* __restrict__ A) {
half a[N];
cp(a, A, N); // vectorized load into the local array
#pragma unroll
for (int I = 1; I < N; I *= 2) { // merge adjacent sorted runs of length I
#pragma unroll
for (int x = I; x < N; x += I * 2) { // x = boundary between the two runs
#pragma unroll
for (int i = N; i > 1; i--) {
if (i <= I){
// compare-exchange elements i apart across the run boundary
sort2(a[x - i], a[x]);
sort2(a[x - 1], a[x + i - 1]);
#pragma unroll
for (int j = 1; j < i - 1; j++) {
sort2(a[x - 1 - j], a[x - 1]);
sort2(a[x], a[x + j]);
}
}
}
sort2(a[x - 1], a[x]); // final exchange at the boundary itself
}
}
cp(A, a, N); // vectorized store back to global memory
}
// Host driver: sorts 32 half values in managed memory with a single-thread
// launch of sort2_ker and prints the result. Expected output: 0..31.
int main(){
    // Check a cudaError_t; print the failing call and exit non-zero.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(1);
        }
    };
    // 32 distinct values in [0, 31], deliberately out of order.
    half Ah[32] = {1., 8., 4., 6., 3., 12., 11., 10., 9., 2., 5., 7., 13., 14., 15.,16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 0., 28., 29., 30., 31.};
    half *A;
    check(cudaMallocManaged(&A, 32*sizeof(half)), "cudaMallocManaged");
    memcpy(A, Ah, 32*sizeof(half)); // managed memory is host-accessible
    sort2_ker<32><<<1,1>>>(A);
    check(cudaGetLastError(), "kernel launch");         // catches bad launch config
    check(cudaDeviceSynchronize(), "kernel execution"); // catches in-kernel faults
    for (int i = 0; i < 32; i++) std::cout << __half2float(A[i]) << " ";
    std::cout << std::endl;
    check(cudaFree(A), "cudaFree"); // release the managed allocation
    return 0;
}
# nvcc -o t156 t156.cu
# compute-sanitizer ./t156
========= COMPUTE-SANITIZER
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
========= ERROR SUMMARY: 0 errors
#
If this is a concern for you, the information in this thread is probably enough to file a bug asking the compiler team whether the “optimized” result can be obtained without the modification I suggested. I don’t know whether that would be practical or sensible. As a practical matter, I suspect there is some kind of nesting limit that stops the compiler from descending arbitrarily deep into certain analyses.