I’m seeing unexpected runtime behavior when building with CUDA 10.2 and executing on Jetson r32.6 (Jetson Xavier AGX). The problem also exists when cross-compiling with clang-{15,16,18,19} using ptxas from CUDA 10.2, but goes away when cross-compiling with CUDA 12.0’s ptxas.
cudafail.cpp
#include <cuda_runtime.h>
#include <cstdio>
namespace device {
__device__ unsigned int GetLabel(const unsigned char a, const unsigned char b, const unsigned int t) {
const int diff = int(b) - int(a);
unsigned int result = 0;
if (diff > int(t)) {
result |= 2;
}
if (diff < -int(t)) {
result |= 1;
}
return result;
}
__global__ void KernelFunction(const unsigned int width, const unsigned int threshold, unsigned char* input,
float* output) {
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned char a = input[tid];
const unsigned int b = input[tid < width - 1 ? tid + 1 : tid];
const bool pass = GetLabel(a, b, threshold);
output[tid] = float(pass);
}
} // namespace device
void CallKernelFunction(const unsigned int width, const unsigned char* input, float* output) {
dim3 block_dims = {32};
dim3 grid_dims = {(width + block_dims.x - 1) / block_dims.x};
const unsigned int threshold = 10;
void* args[] = {(void*)&width, (void*)&threshold, (void*)&input, &output};
const cudaError_t res = cudaLaunchKernel((void*)device::KernelFunction, grid_dims, block_dims, args);
if (res != cudaSuccess) {
printf("Error launching kernel: %s\n", cudaGetErrorString(res));
}
}
int main() {
constexpr unsigned int width = 128;
unsigned char* input = nullptr;
cudaMallocManaged(&input, width * sizeof(unsigned char));
// Set pixels to 0 or 128 in blocks of 16
for (unsigned int x = 0; x < width; ++x) {
input[x] = (x / 16) % 2 == 0 ? 0 : 128;
}
float* output = nullptr;
cudaMallocManaged(&output, width * sizeof(float));
CallKernelFunction(width, input, output);
cudaDeviceSynchronize();
// Sum the output values
float sum = 0.f;
for (unsigned int i = 0; i < width; ++i) {
sum += output[i];
}
printf("Sum: %0.f\n", sum);
cudaFree(input);
cudaFree(output);
return 0;
}
Creating this file on a Jetson Xavier AGX running JetPack R32.6 and compiling with: nvcc cudafail.cu -o cudafail-nvcc -arch=sm_72 -O2
, then running produces the unexpected result of:
Sum: 0
But cross-compiling using clang and CUDA 12.0 with:
clang++ \
--cuda-gpu-arch=sm_72 \
--cuda-path=/usr/local/cuda-12.0 \
--target=aarch64-linux-gnu \
--sysroot=./sysroot/jetson-32.6 \
-I./sysroot/jetson-32.6/usr/local/cuda/include \
-L./sysroot/jetson-32.6/usr/local/cuda/lib64 \
-lcudart \
-O2 \
-Wall -Wextra \
-o cudafail-cuda12 \
cudaail.cu
Produces the expected result:
Sum: 7
The problem also goes away when changing the GetLabel()
function to this simplified but logically equivalent version and compiling with CUDA 10.2 or 12.0:
__device__ unsigned int GetLabel(const unsigned char a, const unsigned char b, const unsigned int t) {
const int diff = int(b) - int(a);
if (diff > int(t)) {
return 2;
}
if (diff < -int(t)) {
return 1;
}
return 0;
}
After trying various iterations of the code, the common block of PTXAS among the failing versions is this section of inlined GetLabel()
code that appears in KernelFunction()
:
sub.s32 %r12, %r11, %r7; // diff = b - a
...
setp.gt.s32 %p2, %r12, %r2; // if (diff > threshold)
selp.b32 %r13, 2, 0, %p2; // result = 2 else 0
neg.s32 %r14, %r2; // -threshold
setp.lt.s32 %p3, %r12, %r14; // if (diff < -threshold)
selp.u32 %r15, 1, 0, %p3; // OR in 1
or.b32 %r16, %r13, %r15; // result |= (2 or 1)
setp.ne.s32 %p4, %r16, 0; // if (result != 0)
selp.f32 %f1, 0f3F800000, 0f00000000, %p4; // pass = 1.0f else 0.0f
Since the issue has been fixed somewhere between CUDA 10.2 and 12.0, I’m guessing this is a known issue that was already fixed. Can anyone confirm this or provide more information? As we’re supporting existing devices with CUDA 10.2, I’d like to know if there’s a workaround or what patterns to avoid to prevent this issue.