CUDA 10.2 - ptxas bitwise OR miscompilation

I’m seeing unexpected runtime behavior when building with CUDA 10.2 and executing on a Jetson Xavier AGX running JetPack R32.6. The problem also exists when cross-compiling with clang-{15,16,18,19} using ptxas from CUDA 10.2, but goes away when cross-compiling with CUDA 12.0’s ptxas.

cudafail.cu

#include <cuda_runtime.h>

#include <cstdio>

namespace device {

__device__ unsigned int GetLabel(const unsigned char a, const unsigned char b, const unsigned int t) {
  const int diff = int(b) - int(a);

  unsigned int result = 0;
  if (diff > int(t)) {
    result |= 2;
  }
  if (diff < -int(t)) {
    result |= 1;
  }
  return result;
}

__global__ void KernelFunction(const unsigned int width, const unsigned int threshold, unsigned char* input,
                               float* output) {
  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

  const unsigned char a = input[tid];
  const unsigned int b = input[tid < width - 1 ? tid + 1 : tid];

  const bool pass = GetLabel(a, b, threshold);
  output[tid] = float(pass);
}

}  // namespace device

void CallKernelFunction(const unsigned int width, const unsigned char* input, float* output) {
  dim3 block_dims = {32};
  dim3 grid_dims = {(width + block_dims.x - 1) / block_dims.x};
  const unsigned int threshold = 10;

  void* args[] = {(void*)&width, (void*)&threshold, (void*)&input, &output};

  const cudaError_t res = cudaLaunchKernel((void*)device::KernelFunction, grid_dims, block_dims, args);
  if (res != cudaSuccess) {
    printf("Error launching kernel: %s\n", cudaGetErrorString(res));
  }
}

int main() {
  constexpr unsigned int width = 128;

  unsigned char* input = nullptr;
  cudaMallocManaged(&input, width * sizeof(unsigned char));

  // Set pixels to 0 or 128 in blocks of 16
  for (unsigned int x = 0; x < width; ++x) {
    input[x] = (x / 16) % 2 == 0 ? 0 : 128;
  }

  float* output = nullptr;
  cudaMallocManaged(&output, width * sizeof(float));

  CallKernelFunction(width, input, output);
  cudaDeviceSynchronize();

  // Sum the output values
  float sum = 0.f;
  for (unsigned int i = 0; i < width; ++i) {
    sum += output[i];
  }
  printf("Sum: %0.f\n", sum);

  cudaFree(input);
  cudaFree(output);
  return 0;
}

Creating this file on a Jetson Xavier AGX running JetPack R32.6, compiling it with nvcc cudafail.cu -o cudafail-nvcc -arch=sm_72 -O2, and running it produces the unexpected result:

Sum: 0

But cross-compiling using clang and CUDA 12.0 with:

clang++ \
  --cuda-gpu-arch=sm_72 \
  --cuda-path=/usr/local/cuda-12.0 \
  --target=aarch64-linux-gnu \
  --sysroot=./sysroot/jetson-32.6 \
  -I./sysroot/jetson-32.6/usr/local/cuda/include \
  -L./sysroot/jetson-32.6/usr/local/cuda/lib64 \
  -lcudart \
  -O2 \
  -Wall -Wextra \
  -o cudafail-cuda12 \
  cudafail.cu

Produces the expected result:

Sum: 7
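
For reference, a minimal host-only sketch of the same comparison logic (separate from the repro above; the file layout and names here are just for illustration) confirms that 7 is the expected sum: only the seven tids sitting on a 0/128 block boundary produce a |diff| above the threshold.

#include <cstdio>

int main() {
  constexpr unsigned int width = 128;
  constexpr unsigned int threshold = 10;

  unsigned char input[width];
  for (unsigned int x = 0; x < width; ++x) {
    input[x] = (x / 16) % 2 == 0 ? 0 : 128;  // same 0/128 blocks of 16 as the repro
  }

  float sum = 0.f;
  for (unsigned int tid = 0; tid < width; ++tid) {
    const unsigned char a = input[tid];
    const unsigned char b = input[tid < width - 1 ? tid + 1 : tid];
    const int diff = int(b) - int(a);
    unsigned int result = 0;
    if (diff > int(threshold)) result |= 2;   // rising edge (0 -> 128)
    if (diff < -int(threshold)) result |= 1;  // falling edge (128 -> 0)
    sum += float(bool(result));
  }
  printf("Reference sum: %0.f\n", sum);  // prints 7: edges at tid 15, 31, 47, 63, 79, 95, 111
  return 0;
}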

The problem also goes away when changing the GetLabel() function to this simplified but logically equivalent version and compiling with CUDA 10.2 or 12.0:

__device__ unsigned int GetLabel(const unsigned char a, const unsigned char b, const unsigned int t) {
  const int diff = int(b) - int(a);

  if (diff > int(t)) {
    return 2;
  }
  if (diff < -int(t)) {
    return 1;
  }
  return 0;
}

After trying various iterations of the code, the common block of PTX among the failing versions is this section of inlined GetLabel() code that appears in KernelFunction():

sub.s32 %r12, %r11, %r7;                    // diff = b - a
...
setp.gt.s32 %p2, %r12, %r2;                 // if (diff > threshold)
selp.b32 %r13, 2, 0, %p2;                   //    result = 2 else 0
neg.s32 %r14, %r2;                          // -threshold
setp.lt.s32 %p3, %r12, %r14;                // if (diff < -threshold)
selp.u32 %r15, 1, 0, %p3;                   //    OR in 1
or.b32 %r16, %r13, %r15;                    // result |= (2 or 1)
setp.ne.s32 %p4, %r16, 0;                   // if (result != 0)
selp.f32 %f1, 0f3F800000, 0f00000000, %p4;  //    pass = 1.0f else 0.0f
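
To double-check that the difference comes from ptxas rather than from the compiler front end, one approach (sketched here; the output file names and the CUDA install paths under /usr/local are assumptions based on a default setup) is to generate the PTX once and assemble it with each toolkit’s ptxas, then compare the SASS with cuobjdump:

nvcc -arch=sm_72 -O2 -ptx cudafail.cu -o cudafail.ptx
/usr/local/cuda-10.2/bin/ptxas -arch=sm_72 -O2 cudafail.ptx -o cudafail-10.2.cubin
/usr/local/cuda-12.0/bin/ptxas -arch=sm_72 -O2 cudafail.ptx -o cudafail-12.0.cubin
cuobjdump -sass cudafail-10.2.cubin > sass-10.2.txt
cuobjdump -sass cudafail-12.0.cubin > sass-12.0.txt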

Since the issue goes away somewhere between CUDA 10.2 and 12.0, I’m guessing this is a known problem that has already been fixed. Can anyone confirm this or provide more information? As we’re supporting existing devices on CUDA 10.2, I’d like to know whether there’s a workaround or which code patterns to avoid to prevent this issue.