Steps to reproduce:
- Launch an A100 instance on Lambda.
- Install the latest CUDA toolkit, then save the code below as repro.cu, compile it, and run it:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
/usr/local/cuda-12.8/bin/nvcc -std=c++20 --expt-relaxed-constexpr -arch=sm_80 -o repro repro.cu && ./repro
// Compile with:
// nvcc -std=c++20 --expt-relaxed-constexpr -arch=sm_80 -o repro repro.cu && ./repro
// If you remove `-std=c++20`, you get the correct error message:
// error: identifier "std::swap< ::MyStruct> " is undefined in device code
// If you remove `--expt-relaxed-constexpr`, you get a misleading error message:
// error: calling a constexpr __host__ function("swap") from a __global__ function("reproKernel") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this.
#include <stdio.h>
#include <cuda/std/type_traits>
// Member type used to trigger the behavior under investigation. It combines a
// user-provided, non-constexpr copy constructor with a constexpr default
// constructor; the inline notes record how removing either one changes the
// outcome. Do not "clean up" this struct: its exact shape is the repro.
struct Foo {
int32_t foo;
// Copy constructor callable from host and device; copies the `foo` field.
__host__ __device__ Foo(const Foo& other) : foo(other.foo) {}
// ^ comment out this line, and the swap succeeds (?!?!?!?!)
// Default constructor (constexpr, host and device); sets `foo` to 1337.
__host__ __device__ constexpr Foo() : foo(1337){}
// remove this constexpr ^ to get the correct error message:
// error: identifier "std::swap< ::MyStruct> " is undefined in device code
};
// Aggregate that reproKernel swaps: a Foo member plus a plain int32_t payload
// (`bar`) whose value the kernel prints before and after the swap.
struct MyStruct {
Foo foo; // <-- comment out this line, and the swap succeeds (?!?!?!?!)
int32_t bar;
};
// Reproduction kernel. main() launches it with a single block of a single
// thread. It builds two MyStruct values with different `bar` fields, prints
// them, calls std::swap on them from device code, and prints them again so the
// output can be compared against the expected values embedded in the messages.
__global__ void reproKernel() {
// Designated initializers set only `bar`; `foo` is not named here and is
// initialized implicitly.
MyStruct A{.bar = 123};
MyStruct B{.bar = 456};
printf("Before swap %d %d (expect: 123 456)\n", A.bar, B.bar);
// The call under investigation: host-library std::swap used inside a kernel.
// See the compile-flag notes at the top of the file for how the diagnostics
// change with -std=c++20 and --expt-relaxed-constexpr.
std::swap(A, B);
printf("After swap %d %d (expect: 456 123)\n", A.bar, B.bar);
// Disabled experiment: also calling the libcu++ swap. Per the message below,
// enabling this section changed the result of the first swap as well.
#if 0
::cuda::std::swap(A, B);
printf("After second swap %d %d (expect 123 456) (note that enabling this #if **made the first swap also succeed** (?!?!?!))\n", A.bar, B.bar);
#endif
}
// Host entry point: launches reproKernel with one block of one thread and
// waits for it to finish so the device-side printf output is flushed.
//
// Returns 0 on success and 1 if the launch or the kernel execution reports a
// CUDA error. Without these checks a failed launch (e.g. no usable device or a
// binary built for the wrong architecture) would just print nothing and exit
// with status 0, which is easy to mistake for the behavior being reproduced.
int main() {
    reproKernel<<<1, 1>>>();

    // A kernel launch returns no status itself; launch-time failures are
    // reported through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "reproKernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Blocks until the kernel completes; faults raised while the kernel was
    // running are reported here.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "reproKernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}