Compilation problems with CUDA 12.9

Here is a minimal code sample that fails to compile with nothing more than the following command:

"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" --keep test.cu -c -m64 -gencode arch=compute_120,code=compute_120 --std=c++17 -DNVCC -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\include"

(tested with Windows 10 and Windows 11, in a command prompt inheriting the latest VS 2019 or latest VS 2022 cl.exe)

Is there something broken in CUDA 12.9?
The same code compiles fine with CUDA 12.8.

(For info, these problems prevent OpenCV from being compiled with CUDA 12.9: https://github.com/opencv/opencv_contrib/issues/3965 )


#include <thrust/device_ptr.h>
#include <thrust/transform.h>

struct some_operator : thrust::unary_function<int, int>
{
    __host__ __device__ int operator()(int a) const {return a;}
};

void f(void)
{
    int sizes[] = {10};
    thrust::device_ptr< int>  sizesPtr(sizes);
    thrust::transform(sizesPtr, sizesPtr+1, sizesPtr, some_operator());
}
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\thrust/system/detail/generic/for_each.h(49): error: static assertion failed with "unimplemented for this system"
    static_assert((thrust::detail::depend_on_instantiation<InputIterator, false>::value), "unimplemented for this system");
    ^
          detected during:
            instantiation of "InputIterator thrust::THRUST_200802_SM_1200_NS::system::detail::generic::for_each(thrust::THRUST_200802_SM_1200_NS::execution_policy<DerivedPolicy> &, InputIterator, InputIterator, UnaryFunction) [with DerivedPolicy=thrust::THRUST_200802_SM_1200_NS::cuda_cub::tag, InputIterator=thrust::THRUST_200802_SM_1200_NS::zip_iterator<cuda::std::__4::tuple<thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, thrust::THRUST_200802_SM_1200_NS::device_ptr<int>>>, UnaryFunction=thrust::THRUST_200802_SM_1200_NS::detail::unary_transform_functor<some_operator>]" at line 46 of C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\thrust/detail/for_each.inl
            instantiation of "InputIterator thrust::THRUST_200802_SM_1200_NS::for_each(const thrust::THRUST_200802_SM_1200_NS::detail::execution_policy_base<DerivedPolicy> &, InputIterator, InputIterator, UnaryFunction) [with DerivedPolicy=thrust::THRUST_200802_SM_1200_NS::cuda_cub::tag, InputIterator=thrust::THRUST_200802_SM_1200_NS::zip_iterator<cuda::std::__4::tuple<thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, thrust::THRUST_200802_SM_1200_NS::device_ptr<int>>>, UnaryFunction=thrust::THRUST_200802_SM_1200_NS::detail::unary_transform_functor<some_operator>]" at line 62 of C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\thrust/system/detail/generic/transform.inl
            instantiation of "OutputIterator thrust::THRUST_200802_SM_1200_NS::system::detail::generic::transform(thrust::THRUST_200802_SM_1200_NS::execution_policy<DerivedPolicy> &, InputIterator, InputIterator, OutputIterator, UnaryFunction) [with DerivedPolicy=thrust::THRUST_200802_SM_1200_NS::cuda_cub::tag, InputIterator=thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, OutputIterator=thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, UnaryFunction=some_operator]" at line 47 of C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\thrust/detail/transform.inl
            instantiation of "OutputIterator thrust::THRUST_200802_SM_1200_NS::transform(const thrust::THRUST_200802_SM_1200_NS::detail::execution_policy_base<DerivedPolicy> &, InputIterator, InputIterator, OutputIterator, UnaryFunction) [with DerivedPolicy=thrust::THRUST_200802_SM_1200_NS::cuda_cub::tag, InputIterator=thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, OutputIterator=thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, UnaryFunction=some_operator]" at line 148 of C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\thrust/detail/transform.inl
            instantiation of "OutputIterator thrust::THRUST_200802_SM_1200_NS::transform(InputIterator, InputIterator, OutputIterator, UnaryFunction) [with InputIterator=thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, OutputIterator=thrust::THRUST_200802_SM_1200_NS::device_ptr<int>, UnaryFunction=some_operator]" at line 13 of test.cu

1 error detected in the compilation of "test.cu".

And here is another minimal sample that is completely broken with CUDA 12.9 (it works with CUDA 12.8):


#include <cub/cub.cuh>

void f(void)
{
}
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(62): error: expected a ")"
    asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
                                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(63): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(73): error: expected a ")"
    asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
                                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(74): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(87): error: expected a ")"
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
                                  ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(88): warning #549-D: variable "bits" is used before its value is set
    return bits;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(134): error: expected a ")"
    asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
                                      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(143): error: expected a ")"
    asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
                                               ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(178): error: expected a ")"
    asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
                                   ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(179): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(190): error: expected a "("
    asm volatile("bar.sync 1, %0;" : : "r"(count));
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(190): error: expected a ")"
    asm volatile("bar.sync 1, %0;" : : "r"(count));
                                   ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(199): error: calling a __device__ function("__syncthreads") from a __host__ function("CTA_SYNC") is not allowed
    __syncthreads();
    ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(208): error: calling a __device__ function("__syncthreads_and") from a __host__ function("CTA_SYNC_AND") is not allowed
    return __syncthreads_and(p);
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(217): error: calling a __device__ function("__syncthreads_or") from a __host__ function("CTA_SYNC_OR") is not allowed
    return __syncthreads_or(p);
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(226): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("WARP_SYNC") is not allowed
    __syncwarp(member_mask);
    ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(235): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__any_syncE1?1?") from a __host__ function("WARP_ANY") is not allowed
    return __any_sync(member_mask, predicate);
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(244): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__all_syncE1?1?") from a __host__ function("WARP_ALL") is not allowed
    return __all_sync(member_mask, predicate);
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(253): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522013__ballot_syncE1?1?") from a __host__ function("WARP_BALLOT") is not allowed
    return __ballot_sync(member_mask, predicate);
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(262): error: expected a "("
    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(263): error: expected a ")"
                 : "=r"(word)
                 ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(274): error: expected a "("
    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(275): error: expected a ")"
                 : "=r"(word)
                 ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(287): error: expected a "("
    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(288): error: expected a ")"
                 : "=r"(word)
                 ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(299): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522011__shfl_syncE1?1?1?1?") from a __host__ function("SHFL_IDX_SYNC") is not allowed
    return __shfl_sync(member_mask, word, src_lane);
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(309): error: expected a ")"
    asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
                                 ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(310): warning #549-D: variable "d" is used before its value is set
    return d;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(320): error: expected a ")"
    asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(321): warning #549-D: variable "d" is used before its value is set
    return d;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(331): error: expected a "("
    asm volatile("exit;");
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(340): error: expected a "("
    asm volatile("trap;");
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(359): error: expected a ")"
    asm("mov.u32 %0, %%laneid;" : "=r"(ret));
                                ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(360): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(371): error: expected a ")"
    asm("mov.u32 %0, %%warpid;" : "=r"(ret));
                                ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(372): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(411): error: expected a ")"
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret));
                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(412): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(422): error: expected a ")"
    asm("mov.u32 %0, %%lanemask_le;" : "=r"(ret));
                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(423): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(433): error: expected a ")"
    asm("mov.u32 %0, %%lanemask_gt;" : "=r"(ret));
                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(434): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(444): error: expected a ")"
    asm("mov.u32 %0, %%lanemask_ge;" : "=r"(ret));
                                     ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(445): warning #549-D: variable "ret" is used before its value is set
    return ret;
           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(670): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522011__shfl_syncE1?1?1?1?") from a __host__ function("ShuffleIndex") is not allowed
    shuffle_word    = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS);
                      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(675): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522011__shfl_syncE1?1?1?1?") from a __host__ function("ShuffleIndex") is not allowed
      shuffle_word       = __shfl_sync(member_mask, (unsigned int) input_alias[WORD], src_lane, LOGICAL_WARP_THREADS);
                           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(731): error: expected a ")"
            : "=r"(mask)
            ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(735): warning #549-D: variable "mask" is used before its value is set
        retval = (BIT == 0) ? mask : retval & mask;
                                              ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(749): error: expected a ")"
    asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
                              ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(760): error: expected a ")"
    asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
                              ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(142): error: a static "__shared__" variable declaration is not allowed inside a host function body
      __declspec(__shared__) _TempStorage private_storage;
                                          ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(312): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeft") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(411): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeft") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(502): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeftPartialTile") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(625): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeftPartialTile") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(739): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractRight") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(840): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractRight") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(929): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractRightPartialTile") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(147): error: a static "__shared__" variable declaration is not allowed inside a host function body
      __declspec(__shared__) _TempStorage private_storage;
                                          ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(295): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeads") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(340): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeads") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(589): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagTails") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(689): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagTails") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(793): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(923): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(1055): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(1192): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_shfl.cuh(279): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
        : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
                                              ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_shfl.cuh(279): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
        : lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
                                                                                ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_shfl.cuh(280): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
        , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS))
                                                   ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_smem.cuh(93): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
        , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
                                              ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_smem.cuh(93): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
        , lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
                                                                                ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_smem.cuh(94): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
        , warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS))
                                                   ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(184): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
    unsigned int lane_id     = ::cuda::ptx::get_sreg_laneid();
                                            ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(191): error: a static "__shared__" variable declaration is not allowed inside a host function body
      __declspec(__shared__) _TempStorage private_storage;
                                          ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(220): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToStriped") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(256): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToStriped") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(272): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToStriped") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(329): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("BlockedToWarpStriped") is not allowed
      __syncwarp(0xffffffff);
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(370): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("BlockedToWarpStriped") is not allowed
        __syncwarp(0xffffffff);
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(387): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToWarpStriped") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(402): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("BlockedToWarpStriped") is not allowed
          __syncwarp(0xffffffff);
          ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(443): error: calling a __device__ function("__syncthreads") from a __host__ function("StripedToBlocked") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(481): error: calling a __device__ function("__syncthreads") from a __host__ function("StripedToBlocked") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(504): error: calling a __device__ function("__syncthreads") from a __host__ function("StripedToBlocked") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(554): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("WarpStripedToBlocked") is not allowed
      __syncwarp(0xffffffff);
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(585): error: calling a __device__ function("__syncthreads") from a __host__ function("WarpStripedToBlocked") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(600): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("WarpStripedToBlocked") is not allowed
          __syncwarp(0xffffffff);
          ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(644): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToBlocked") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(680): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToBlocked") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(698): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToBlocked") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(751): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStriped") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(790): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStriped") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(806): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStriped") is not allowed
        __syncthreads();
        ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(1155): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStripedGuarded") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(1214): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStripedFlagged") is not allowed
      __syncthreads();
      ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/thread/thread_load.cuh(291): error: expected a "("
  template <>  __forceinline uint4 ThreadLoad<LOAD_CA, uint4 const*>(uint4 const* ptr) { uint4 retval; asm volatile("ld." "ca" ".v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : "l"(ptr)); return retval; } template <>  __forceinline ulonglong2 ThreadLoad<LOAD_CA, ulonglong2 const*>(ulonglong2 const* ptr) { ulonglong2 retval; asm volatile("ld." "ca" ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); return retval; } template <>  __forceinline ushort4 ThreadLoad<LOAD_CA, ushort4 const*>(ushort4 const* ptr) { ushort4 retval; asm volatile("ld." "ca" ".v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : "l"(ptr)); return retval; } template <>  __forceinline uint2 ThreadLoad<LOAD_CA, uint2 const*>(uint2 const* ptr) { uint2 retval; asm volatile("ld." "ca" ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); return retval; } template <>  __forceinline unsigned long long ThreadLoad<LOAD_CA, unsigned long long const*>( unsigned long long const* ptr) { unsigned long long retval; asm volatile("ld." "ca" ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned int ThreadLoad<LOAD_CA, unsigned int const*>(unsigned int const* ptr) { unsigned int retval; asm volatile("ld." "ca" ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned short ThreadLoad<LOAD_CA, unsigned short const*>( unsigned short const* ptr) { unsigned short retval; asm volatile("ld." "ca" ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned char ThreadLoad<LOAD_CA, unsigned char const*>( unsigned char const* ptr) { unsigned short retval; asm volatile( "{" "   .reg .u8 datum;" "    ld." "ca" ".u8 datum, [%1];" "    cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : "l"(ptr)); return (unsigned char) retval; }
                                                                                                           ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/thread/thread_load.cuh(291): error: expected a ")"
  template <>  __forceinline uint4 ThreadLoad<LOAD_CA, uint4 const*>(uint4 const* ptr) { uint4 retval; asm volatile("ld." "ca" ".v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : "l"(ptr)); return retval; } template <>  __forceinline ulonglong2 ThreadLoad<LOAD_CA, ulonglong2 const*>(ulonglong2 const* ptr) { ulonglong2 retval; asm volatile("ld." "ca" ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); return retval; } template <>  __forceinline ushort4 ThreadLoad<LOAD_CA, ushort4 const*>(ushort4 const* ptr) { ushort4 retval; asm volatile("ld." "ca" ".v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : "l"(ptr)); return retval; } template <>  __forceinline uint2 ThreadLoad<LOAD_CA, uint2 const*>(uint2 const* ptr) { uint2 retval; asm volatile("ld." "ca" ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); return retval; } template <>  __forceinline unsigned long long ThreadLoad<LOAD_CA, unsigned long long const*>( unsigned long long const* ptr) { unsigned long long retval; asm volatile("ld." "ca" ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned int ThreadLoad<LOAD_CA, unsigned int const*>(unsigned int const* ptr) { unsigned int retval; asm volatile("ld." "ca" ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned short ThreadLoad<LOAD_CA, unsigned short const*>( unsigned short const* ptr) { unsigned short retval; asm volatile("ld." "ca" ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned char ThreadLoad<LOAD_CA, unsigned char const*>( unsigned char const* ptr) { unsigned short retval; asm volatile( "{" "   .reg .u8 datum;" "    ld." "ca" ".u8 datum, [%1];" "    cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : "l"(ptr)); return (unsigned char) retval; }
                                                                                                                                                                 ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/thread/thread_load.cuh(291): warning #549-D: variable "retval" is used before its value is set
  template <>  __forceinline uint4 ThreadLoad<LOAD_CA, uint4 const*>(uint4 const* ptr) { uint4 retval; asm volatile("ld." "ca" ".v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : "l"(ptr)); return retval; } template <>  __forceinline ulonglong2 ThreadLoad<LOAD_CA, ulonglong2 const*>(ulonglong2 const* ptr) { ulonglong2 retval; asm volatile("ld." "ca" ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); return retval; } template <>  __forceinline ushort4 ThreadLoad<LOAD_CA, ushort4 const*>(ushort4 const* ptr) { ushort4 retval; asm volatile("ld." "ca" ".v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : "l"(ptr)); return retval; } template <>  __forceinline uint2 ThreadLoad<LOAD_CA, uint2 const*>(uint2 const* ptr) { uint2 retval; asm volatile("ld." "ca" ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); return retval; } template <>  __forceinline unsigned long long ThreadLoad<LOAD_CA, unsigned long long const*>( unsigned long long const* ptr) { unsigned long long retval; asm volatile("ld." "ca" ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned int ThreadLoad<LOAD_CA, unsigned int const*>(unsigned int const* ptr) { unsigned int retval; asm volatile("ld." "ca" ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned short ThreadLoad<LOAD_CA, unsigned short const*>( unsigned short const* ptr) { unsigned short retval; asm volatile("ld." "ca" ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); return retval; } template <>  __forceinline unsigned char ThreadLoad<LOAD_CA, unsigned char const*>( unsigned char const* ptr) { unsigned short retval; asm volatile( "{" "   .reg .u8 datum;" "    ld." "ca" ".u8 datum, [%1];" "    cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : "l"(ptr)); return (unsigned char) retval; }
                                                                                                                                                                                                                                                      ^

...
Error limit reached.
100 errors detected in the compilation of "test2.cu".
Compilation terminated.

Try updating your Thrust/CUB to the very latest CCCL version available on GitHub. I believe some things were fixed recently in this regard.

Nothing different with CCCL 2.8.5; I also tried the latest from trunk.

I can get a different error:


C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cuda/std/__cccl/compiler.h(126): fatal error C1012: parenthèses non équilibrées : ')' manquant(e)

It’s probably best to file a cccl issue, then.

For tracking:

https://github.com/NVIDIA/cccl/issues/5165
https://github.com/NVIDIA/cccl/issues/5166

A workaround is provided in "[BUG]: compilation error · Issue #5166 · NVIDIA/cccl · GitHub": add the '/Zc:preprocessor' MSVC compiler flag. Ticket 5379264 is tracking this internally.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.