When upgrade from CUDA12.4 to 12.5 the compilation became broken
The errors described below.
In file included from /usr/local/cuda/include/cub/util_device.cuh:52,
from /usr/local/cuda/include/thrust/system/cuda/detail/util.h:48,
from /usr/local/cuda/include/thrust/system/cuda/detail/par.h:39,
from /home/Yehonatans/work/utils/cuda/SingletonCudaMemoryPool.h:7,
from /home/Yehonatans/work/utils/cuda/unit_testing/TestCubMemoryPool.cpp:8:
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘void cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC()’:
/usr/local/cuda/include/cub/util_ptx.cuh:271:5: error: ‘__syncthreads’ was not declared in this scope
271 | __syncthreads();
| ^~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC_AND(int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:280:12: error: ‘__syncthreads_and’ was not declared in this scope
280 | return __syncthreads_and(p);
| ^~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC_OR(int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:289:12: error: ‘__syncthreads_or’ was not declared in this scope
289 | return __syncthreads_or(p);
| ^~~~~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘void cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_SYNC(unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:298:5: error: ‘__syncwarp’ was not declared in this scope
298 | __syncwarp(member_mask);
| ^~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_ANY(int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:307:12: error: ‘__any_sync’ was not declared in this scope
307 | return __any_sync(member_mask, predicate);
| ^~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_ALL(int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:316:12: error: ‘__all_sync’ was not declared in this scope
316 | return __all_sync(member_mask, predicate);
| ^~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_BALLOT(int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:325:12: error: ‘__ballot_sync’ was not declared in this scope
325 | return __ballot_sync(member_mask, predicate);
| ^~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘unsigned int cub::CUB_200400___CUDA_ARCH_LIST___NS::SHFL_IDX_SYNC(unsigned int, int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:368:12: error: ‘__shfl_sync’ was not declared in this scope
368 | return __shfl_sync(member_mask, word, src_lane);
| ^~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::RowMajorTid(int, int, int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:415:39: error: ‘threadIdx’ was not declared in this scope
415 | return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
| ^~~~~~~~~
In file included from /usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/barrier_cluster.h:30,
from /usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx.h:74,
from /usr/local/cuda/include/cuda/ptx:19,
from /usr/local/cuda/include/cuda/discard_memory:25,
from /usr/local/cuda/include/cub/util_device.cuh:57,
from /usr/local/cuda/include/thrust/system/cuda/detail/util.h:48,
from /usr/local/cuda/include/thrust/system/cuda/detail/par.h:39,
from /home/Yehonatans/work/utils/cuda/SingletonCudaMemoryPool.h:7,
from /home/Yehonatans/work/utils/cuda/unit_testing/TestCubMemoryPool.cpp:8:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘uint32_t cuda::ptx::__4::__as_ptr_smem(const void*)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:40:44: error: ‘__cvta_generic_to_shared’ was not declared in this scope
40 | return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
| ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘uint64_t cuda::ptx::__4::__as_ptr_gmem(const void*)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:60:44: error: ‘__cvta_generic_to_global’ was not declared in this scope
60 | return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
| ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘_Tp* cuda::ptx::__4::__from_ptr_smem(size_t)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:73:33: error: there are no arguments to ‘__cvta_shared_to_generic’ that depend on a template parameter, so a declaration of ‘__cvta_shared_to_generic’ must be available [-fpermissive]
73 | return reinterpret_cast<_Tp*>(__cvta_shared_to_generic(__ptr));
| ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:73:33: note: (if you use ‘-fpermissive’, G++ will accept your code, but allowing the use of an undeclared name is deprecated)
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘_Tp* cuda::ptx::__4::__from_ptr_gmem(size_t)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:94:33: error: there are no arguments to ‘__cvta_global_to_generic’ that depend on a template parameter, so a declaration of ‘__cvta_global_to_generic’ must be available [-fpermissive]
94 | return reinterpret_cast<_Tp*>(__cvta_global_to_generic(__ptr));
| ^~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/local/cuda/include/thrust/system/cuda/detail/util.h:48,
from /usr/local/cuda/include/thrust/system/cuda/detail/par.h:39,
from /home/Yehonatans/work/utils/cuda/SingletonCudaMemoryPool.h:7,
from /home/Yehonatans/work/utils/cuda/unit_testing/TestCubMemoryPool.cpp:8:
/usr/local/cuda/include/cub/util_device.cuh: In static member function ‘static typename AgentT::TempStorage& cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_helper_impl<AgentT>::get_temp_storage(cub::CUB_200400___CUDA_ARCH_LIST___NS::NullType&, cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_t&)’:
/usr/local/cuda/include/cub/util_device.cuh:160:63: error: ‘blockIdx’ was not declared in this scope
160 | static_cast<char*>(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x));
| ^~~~~~~~
/usr/local/cuda/include/cub/util_device.cuh: In static member function ‘static bool cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_helper_impl<AgentT>::discard_temp_storage(typename AgentT::TempStorage&)’:
/usr/local/cuda/include/cub/util_device.cuh:201:38: error: ‘threadIdx’ was not declared in this scope
201 | const std::size_t linear_tid = threadIdx.x;
| ^~~~~~~~~
/usr/local/cuda/include/cub/util_device.cuh:202:50: error: ‘blockDim’ was not declared in this scope
202 | const std::size_t block_stride = line_size * blockDim.x;
I had a similar problem with the thrust library after updating to CUDA 12.5. The problem was that thrust headers were never intended to work when included from a host-only translation unit (i.e. a cpp file). They did work in older CUDA versions, but that was just coincidence. Have you tried renaming your TU from .cpp to .cu?
I’ve just come across thrust compile errors since updating to 12.6 that could be related. And noted your comment:
“The problem was that thrust headers were never intended to work when included from a host-only translation unit (i.e. a cpp file). They did work in older CUDA versions, but that was just coincidence.”
Wow this is news to me! I have been including the same header files in .cpp and .cu files for years that only define data types. I think it’s valid to want to do this though for basic type definitions without causing lots of cpp files to be renamed to cu. That is going to cause a lot of reorganising to fix in our case :-(
There seems to be another way to fix this problem by defining the THRUST_DEVICE_SYSTEM symbol. I did not try this. In our project only a few files were affected, and reorganizing the includes was easy.