When upgrade from CUDA12.4 to 12.5 the compilation became broken

When upgrade from CUDA12.4 to 12.5 the compilation became broken
The errors described below.



In file included from /usr/local/cuda/include/cub/util_device.cuh:52,
                 from /usr/local/cuda/include/thrust/system/cuda/detail/util.h:48,
                 from /usr/local/cuda/include/thrust/system/cuda/detail/par.h:39,
                 from /home/Yehonatans/work/utils/cuda/SingletonCudaMemoryPool.h:7,
                 from /home/Yehonatans/work/utils/cuda/unit_testing/TestCubMemoryPool.cpp:8:
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘void cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC()’:
/usr/local/cuda/include/cub/util_ptx.cuh:271:5: error: ‘__syncthreads’ was not declared in this scope
  271 |     __syncthreads();
      |     ^~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC_AND(int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:280:12: error: ‘__syncthreads_and’ was not declared in this scope
  280 |     return __syncthreads_and(p);
      |            ^~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC_OR(int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:289:12: error: ‘__syncthreads_or’ was not declared in this scope
  289 |     return __syncthreads_or(p);
      |            ^~~~~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘void cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_SYNC(unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:298:5: error: ‘__syncwarp’ was not declared in this scope
  298 |     __syncwarp(member_mask);
      |     ^~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_ANY(int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:307:12: error: ‘__any_sync’ was not declared in this scope
  307 |     return __any_sync(member_mask, predicate);
      |            ^~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_ALL(int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:316:12: error: ‘__all_sync’ was not declared in this scope
  316 |     return __all_sync(member_mask, predicate);
      |            ^~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_BALLOT(int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:325:12: error: ‘__ballot_sync’ was not declared in this scope
  325 |     return __ballot_sync(member_mask, predicate);
      |            ^~~~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘unsigned int cub::CUB_200400___CUDA_ARCH_LIST___NS::SHFL_IDX_SYNC(unsigned int, int, unsigned int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:368:12: error: ‘__shfl_sync’ was not declared in this scope
  368 |     return __shfl_sync(member_mask, word, src_lane);
      |            ^~~~~~~~~~~
/usr/local/cuda/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::RowMajorTid(int, int, int)’:
/usr/local/cuda/include/cub/util_ptx.cuh:415:39: error: ‘threadIdx’ was not declared in this scope
  415 |     return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
      |                                       ^~~~~~~~~
In file included from /usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/barrier_cluster.h:30,
                 from /usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx.h:74,
                 from /usr/local/cuda/include/cuda/ptx:19,
                 from /usr/local/cuda/include/cuda/discard_memory:25,
                 from /usr/local/cuda/include/cub/util_device.cuh:57,
                 from /usr/local/cuda/include/thrust/system/cuda/detail/util.h:48,
                 from /usr/local/cuda/include/thrust/system/cuda/detail/par.h:39,
                 from /home/Yehonatans/work/utils/cuda/SingletonCudaMemoryPool.h:7,
                 from /home/Yehonatans/work/utils/cuda/unit_testing/TestCubMemoryPool.cpp:8:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘uint32_t cuda::ptx::__4::__as_ptr_smem(const void*)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:40:44: error: ‘__cvta_generic_to_shared’ was not declared in this scope
   40 |   return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
      |                                            ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘uint64_t cuda::ptx::__4::__as_ptr_gmem(const void*)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:60:44: error: ‘__cvta_generic_to_global’ was not declared in this scope
   60 |   return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
      |                                            ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘_Tp* cuda::ptx::__4::__from_ptr_smem(size_t)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:73:33: error: there are no arguments to ‘__cvta_shared_to_generic’ that depend on a template parameter, so a declaration of ‘__cvta_shared_to_generic’ must be available [-fpermissive]
   73 |   return reinterpret_cast<_Tp*>(__cvta_shared_to_generic(__ptr));
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:73:33: note: (if you use ‘-fpermissive’, G++ will accept your code, but allowing the use of an undeclared name is deprecated)
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h: In function ‘_Tp* cuda::ptx::__4::__from_ptr_gmem(size_t)’:
/usr/local/cuda/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/../ptx_helper_functions.h:94:33: error: there are no arguments to ‘__cvta_global_to_generic’ that depend on a template parameter, so a declaration of ‘__cvta_global_to_generic’ must be available [-fpermissive]
   94 |   return reinterpret_cast<_Tp*>(__cvta_global_to_generic(__ptr));
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/local/cuda/include/thrust/system/cuda/detail/util.h:48,
                 from /usr/local/cuda/include/thrust/system/cuda/detail/par.h:39,
                 from /home/Yehonatans/work/utils/cuda/SingletonCudaMemoryPool.h:7,
                 from /home/Yehonatans/work/utils/cuda/unit_testing/TestCubMemoryPool.cpp:8:
/usr/local/cuda/include/cub/util_device.cuh: In static member function ‘static typename AgentT::TempStorage& cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_helper_impl<AgentT>::get_temp_storage(cub::CUB_200400___CUDA_ARCH_LIST___NS::NullType&, cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_t&)’:
/usr/local/cuda/include/cub/util_device.cuh:160:63: error: ‘blockIdx’ was not declared in this scope
  160 |       static_cast<char*>(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x));
      |                                                               ^~~~~~~~
/usr/local/cuda/include/cub/util_device.cuh: In static member function ‘static bool cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_helper_impl<AgentT>::discard_temp_storage(typename AgentT::TempStorage&)’:
/usr/local/cuda/include/cub/util_device.cuh:201:38: error: ‘threadIdx’ was not declared in this scope
  201 |     const std::size_t linear_tid   = threadIdx.x;
      |                                      ^~~~~~~~~
/usr/local/cuda/include/cub/util_device.cuh:202:50: error: ‘blockDim’ was not declared in this scope
  202 |     const std::size_t block_stride = line_size * blockDim.x;

1 Like

Hi there are any update?

Any help?

I had a similar problem with the thrust library after updating to CUDA 12.5. The problem was that thrust headers were never intended to work when included from a host-only translation unit (i.e. a cpp file). They did work in older CUDA versions, but that was just coincidence. Have you tried renaming your TU from .cpp to .cu?

I’ve just come across thrust compile errors since updating to 12.6 that could be related. And noted your comment:

“The problem was that thrust headers were never intended to work when included from a host-only translation unit (i.e. a cpp file). They did work in older CUDA versions, but that was just coincidence.”

Wow this is news to me! I have been including the same header files in .cpp and .cu files for years that only define data types. I think it’s valid to want to do this though for basic type definitions without causing lots of cpp files to be renamed to cu. That is going to cause a lot of reorganising to fix in our case :-(

I got that information from this discussion:

There seems to be another way to fix this problem by defining the THRUST_DEVICE_SYSTEM symbol. I did not try this. In our project only a few files were affected, and reorganizing the includes was easy.