And another one totally broken with cuda 12.9 (works with cuda 12.8)
#include <cub/cub.cuh>
void f(void)
{
}
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(62): error: expected a ")"
asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(63): warning #549-D: variable "ret" is used before its value is set
return ret;
^
Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(73): error: expected a ")"
asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(74): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(87): error: expected a ")"
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(88): warning #549-D: variable "bits" is used before its value is set
return bits;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(134): error: expected a ")"
asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(143): error: expected a ")"
asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(178): error: expected a ")"
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(179): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(190): error: expected a "("
asm volatile("bar.sync 1, %0;" : : "r"(count));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(190): error: expected a ")"
asm volatile("bar.sync 1, %0;" : : "r"(count));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(199): error: calling a __device__ function("__syncthreads") from a __host__ function("CTA_SYNC") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(208): error: calling a __device__ function("__syncthreads_and") from a __host__ function("CTA_SYNC_AND") is not allowed
return __syncthreads_and(p);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(217): error: calling a __device__ function("__syncthreads_or") from a __host__ function("CTA_SYNC_OR") is not allowed
return __syncthreads_or(p);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(226): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("WARP_SYNC") is not allowed
__syncwarp(member_mask);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(235): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__any_syncE1?1?") from a __host__ function("WARP_ANY") is not allowed
return __any_sync(member_mask, predicate);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(244): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__all_syncE1?1?") from a __host__ function("WARP_ALL") is not allowed
return __all_sync(member_mask, predicate);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(253): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522013__ballot_syncE1?1?") from a __host__ function("WARP_BALLOT") is not allowed
return __ballot_sync(member_mask, predicate);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(262): error: expected a "("
asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(263): error: expected a ")"
: "=r"(word)
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(274): error: expected a "("
asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(275): error: expected a ")"
: "=r"(word)
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(287): error: expected a "("
asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(288): error: expected a ")"
: "=r"(word)
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(299): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522011__shfl_syncE1?1?1?1?") from a __host__ function("SHFL_IDX_SYNC") is not allowed
return __shfl_sync(member_mask, word, src_lane);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(309): error: expected a ")"
asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(310): warning #549-D: variable "d" is used before its value is set
return d;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(320): error: expected a ")"
asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(321): warning #549-D: variable "d" is used before its value is set
return d;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(331): error: expected a "("
asm volatile("exit;");
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(340): error: expected a "("
asm volatile("trap;");
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(359): error: expected a ")"
asm("mov.u32 %0, %%laneid;" : "=r"(ret));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(360): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(371): error: expected a ")"
asm("mov.u32 %0, %%warpid;" : "=r"(ret));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(372): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(411): error: expected a ")"
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(412): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(422): error: expected a ")"
asm("mov.u32 %0, %%lanemask_le;" : "=r"(ret));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(423): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(433): error: expected a ")"
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(ret));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(434): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(444): error: expected a ")"
asm("mov.u32 %0, %%lanemask_ge;" : "=r"(ret));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(445): warning #549-D: variable "ret" is used before its value is set
return ret;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(670): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522011__shfl_syncE1?1?1?1?") from a __host__ function("ShuffleIndex") is not allowed
shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[0], src_lane, LOGICAL_WARP_THREADS);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(675): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522011__shfl_syncE1?1?1?1?") from a __host__ function("ShuffleIndex") is not allowed
shuffle_word = __shfl_sync(member_mask, (unsigned int) input_alias[WORD], src_lane, LOGICAL_WARP_THREADS);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(731): error: expected a ")"
: "=r"(mask)
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(735): warning #549-D: variable "mask" is used before its value is set
retval = (BIT == 0) ? mask : retval & mask;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(749): error: expected a ")"
asm("shl.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/util_ptx.cuh(760): error: expected a ")"
asm("shr.b32 %0, %1, %2;" : "=r"(ret) : "r"(val), "r"(num_bits));
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(142): error: a static "__shared__" variable declaration is not allowed inside a host function body
__declspec(__shared__) _TempStorage private_storage;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(312): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeft") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(411): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeft") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(502): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeftPartialTile") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(625): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractLeftPartialTile") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(739): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractRight") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(840): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractRight") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_adjacent_difference.cuh(929): error: calling a __device__ function("__syncthreads") from a __host__ function("SubtractRightPartialTile") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(147): error: a static "__shared__" variable declaration is not allowed inside a host function body
__declspec(__shared__) _TempStorage private_storage;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(295): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeads") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(340): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeads") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(589): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagTails") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(689): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagTails") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(793): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(923): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(1055): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_discontinuity.cuh(1192): error: calling a __device__ function("__syncthreads") from a __host__ function("FlagHeadsAndTails") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_shfl.cuh(279): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
: lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_shfl.cuh(279): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
: lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_shfl.cuh(280): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
, warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS))
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_smem.cuh(93): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
, lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_smem.cuh(93): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
, lane_id(IS_ARCH_WARP ? ::cuda::ptx::get_sreg_laneid() : (::cuda::ptx::get_sreg_laneid() % LOGICAL_WARP_THREADS))
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/warp/specializations/warp_exchange_smem.cuh(94): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
, warp_id(IS_ARCH_WARP ? 0 : (::cuda::ptx::get_sreg_laneid() / LOGICAL_WARP_THREADS))
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(184): error: namespace "cuda::ptx" has no member "get_sreg_laneid"
unsigned int lane_id = ::cuda::ptx::get_sreg_laneid();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(191): error: a static "__shared__" variable declaration is not allowed inside a host function body
__declspec(__shared__) _TempStorage private_storage;
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(220): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(256): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(272): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(329): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("BlockedToWarpStriped") is not allowed
__syncwarp(0xffffffff);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(370): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("BlockedToWarpStriped") is not allowed
__syncwarp(0xffffffff);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(387): error: calling a __device__ function("__syncthreads") from a __host__ function("BlockedToWarpStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(402): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("BlockedToWarpStriped") is not allowed
__syncwarp(0xffffffff);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(443): error: calling a __device__ function("__syncthreads") from a __host__ function("StripedToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(481): error: calling a __device__ function("__syncthreads") from a __host__ function("StripedToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(504): error: calling a __device__ function("__syncthreads") from a __host__ function("StripedToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(554): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("WarpStripedToBlocked") is not allowed
__syncwarp(0xffffffff);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(585): error: calling a __device__ function("__syncthreads") from a __host__ function("WarpStripedToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(600): error: calling a __device__ function("_ZN44_INTERNAL_667f4cef_8_test2_cu_0afe7ecc_1522010__syncwarpE1?") from a __host__ function("WarpStripedToBlocked") is not allowed
__syncwarp(0xffffffff);
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(644): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(680): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(698): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToBlocked") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(751): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(790): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(806): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStriped") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(1155): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStripedGuarded") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/block/block_exchange.cuh(1214): error: calling a __device__ function("__syncthreads") from a __host__ function("ScatterToStripedFlagged") is not allowed
__syncthreads();
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/thread/thread_load.cuh(291): error: expected a "("
template <> __forceinline uint4 ThreadLoad<LOAD_CA, uint4 const*>(uint4 const* ptr) { uint4 retval; asm volatile("ld." "ca" ".v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : "l"(ptr)); return retval; } template <> __forceinline ulonglong2 ThreadLoad<LOAD_CA, ulonglong2 const*>(ulonglong2 const* ptr) { ulonglong2 retval; asm volatile("ld." "ca" ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); return retval; } template <> __forceinline ushort4 ThreadLoad<LOAD_CA, ushort4 const*>(ushort4 const* ptr) { ushort4 retval; asm volatile("ld." "ca" ".v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : "l"(ptr)); return retval; } template <> __forceinline uint2 ThreadLoad<LOAD_CA, uint2 const*>(uint2 const* ptr) { uint2 retval; asm volatile("ld." "ca" ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); return retval; } template <> __forceinline unsigned long long ThreadLoad<LOAD_CA, unsigned long long const*>( unsigned long long const* ptr) { unsigned long long retval; asm volatile("ld." "ca" ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned int ThreadLoad<LOAD_CA, unsigned int const*>(unsigned int const* ptr) { unsigned int retval; asm volatile("ld." "ca" ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned short ThreadLoad<LOAD_CA, unsigned short const*>( unsigned short const* ptr) { unsigned short retval; asm volatile("ld." "ca" ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned char ThreadLoad<LOAD_CA, unsigned char const*>( unsigned char const* ptr) { unsigned short retval; asm volatile( "{" " .reg .u8 datum;" " ld." "ca" ".u8 datum, [%1];" " cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : "l"(ptr)); return (unsigned char) retval; }
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/thread/thread_load.cuh(291): error: expected a ")"
template <> __forceinline uint4 ThreadLoad<LOAD_CA, uint4 const*>(uint4 const* ptr) { uint4 retval; asm volatile("ld." "ca" ".v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : "l"(ptr)); return retval; } template <> __forceinline ulonglong2 ThreadLoad<LOAD_CA, ulonglong2 const*>(ulonglong2 const* ptr) { ulonglong2 retval; asm volatile("ld." "ca" ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); return retval; } template <> __forceinline ushort4 ThreadLoad<LOAD_CA, ushort4 const*>(ushort4 const* ptr) { ushort4 retval; asm volatile("ld." "ca" ".v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : "l"(ptr)); return retval; } template <> __forceinline uint2 ThreadLoad<LOAD_CA, uint2 const*>(uint2 const* ptr) { uint2 retval; asm volatile("ld." "ca" ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); return retval; } template <> __forceinline unsigned long long ThreadLoad<LOAD_CA, unsigned long long const*>( unsigned long long const* ptr) { unsigned long long retval; asm volatile("ld." "ca" ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned int ThreadLoad<LOAD_CA, unsigned int const*>(unsigned int const* ptr) { unsigned int retval; asm volatile("ld." "ca" ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned short ThreadLoad<LOAD_CA, unsigned short const*>( unsigned short const* ptr) { unsigned short retval; asm volatile("ld." "ca" ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned char ThreadLoad<LOAD_CA, unsigned char const*>( unsigned char const* ptr) { unsigned short retval; asm volatile( "{" " .reg .u8 datum;" " ld." "ca" ".u8 datum, [%1];" " cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : "l"(ptr)); return (unsigned char) retval; }
^
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/include\cub/thread/thread_load.cuh(291): warning #549-D: variable "retval" is used before its value is set
template <> __forceinline uint4 ThreadLoad<LOAD_CA, uint4 const*>(uint4 const* ptr) { uint4 retval; asm volatile("ld." "ca" ".v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w) : "l"(ptr)); return retval; } template <> __forceinline ulonglong2 ThreadLoad<LOAD_CA, ulonglong2 const*>(ulonglong2 const* ptr) { ulonglong2 retval; asm volatile("ld." "ca" ".v2.u64 {%0, %1}, [%2];" : "=l"(retval.x), "=l"(retval.y) : "l"(ptr)); return retval; } template <> __forceinline ushort4 ThreadLoad<LOAD_CA, ushort4 const*>(ushort4 const* ptr) { ushort4 retval; asm volatile("ld." "ca" ".v4.u16 {%0, %1, %2, %3}, [%4];" : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w) : "l"(ptr)); return retval; } template <> __forceinline uint2 ThreadLoad<LOAD_CA, uint2 const*>(uint2 const* ptr) { uint2 retval; asm volatile("ld." "ca" ".v2.u32 {%0, %1}, [%2];" : "=r"(retval.x), "=r"(retval.y) : "l"(ptr)); return retval; } template <> __forceinline unsigned long long ThreadLoad<LOAD_CA, unsigned long long const*>( unsigned long long const* ptr) { unsigned long long retval; asm volatile("ld." "ca" ".u64 %0, [%1];" : "=l"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned int ThreadLoad<LOAD_CA, unsigned int const*>(unsigned int const* ptr) { unsigned int retval; asm volatile("ld." "ca" ".u32 %0, [%1];" : "=r"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned short ThreadLoad<LOAD_CA, unsigned short const*>( unsigned short const* ptr) { unsigned short retval; asm volatile("ld." "ca" ".u16 %0, [%1];" : "=h"(retval) : "l"(ptr)); return retval; } template <> __forceinline unsigned char ThreadLoad<LOAD_CA, unsigned char const*>( unsigned char const* ptr) { unsigned short retval; asm volatile( "{" " .reg .u8 datum;" " ld." "ca" ".u8 datum, [%1];" " cvt.u16.u8 %0, datum;" "}" : "=h"(retval) : "l"(ptr)); return (unsigned char) retval; }
^
...
Error limit reached.
100 errors detected in the compilation of "test2.cu".
Compilation terminated.