Compilation Errors with GCC Versions 11-14 and CUDA Toolkit 12.5/12.6 Due to Undefined `__builtin_ia32_ldtilecfg` and `__builtin_ia32_sttilecfg`, etc

Hello CUDA Community,

I’ve encountered multiple compilation issues while trying to compile my CUDA code (GpuProcessing.cu, just a simple example) with different GCC compiler versions (g+±11, g+±12, g+±13, g+±14) using CUDA Toolkit 12.5 and 12.6.

Summary of Issues:

1. GCC-11:

nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
/usr/lib/gcc/x86_64-linux-gnu/11/include/amxtileintrin.h(42): error: identifier "__builtin_ia32_ldtilecfg" is undefined
    __builtin_ia32_ldtilecfg (__config);
    ^

/usr/lib/gcc/x86_64-linux-gnu/11/include/amxtileintrin.h(49): error: identifier "__builtin_ia32_sttilecfg" is undefined
    __builtin_ia32_sttilecfg (__config);
    ^

2 errors detected in the compilation of "/home/vladislavsemykin/Documents/Work/Start/src/GpuProcessing.cu".
make[2]: *** [CMakeFiles/nia_start_core.dir/build.make:91: CMakeFiles/nia_start_core.dir/src/GpuProcessing.cu.o] Error 2
make[2]: *** Waiting for unfinished jobs....

2. GCC-12:

nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
/usr/lib/gcc/x86_64-linux-gnu/12/include/avx512bf16vlintrin.h(53): error: identifier "__builtin_ia32_cvtne2ps2bf16_v16hi" is undefined
    return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
                     ^

/usr/lib/gcc/x86_64-linux-gnu/12/include/avx512bf16vlintrin.h(60): error: identifier "__builtin_ia32_cvtne2ps2bf16_v16hi_mask" is undefined
    return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
                     ^

/usr/lib/gcc/x86_64-linux-gnu/12/include/avx512bf16vlintrin.h(67): error: identifier "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz" is undefined
    return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
                     ^

/usr/lib/gcc/x86_64-linux-gnu/12/include/avx512bf16vlintrin.h(74): error: identifier "__builtin_ia32_cvtne2ps2bf16_v8hi" is undefined
    return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);

...


/usr/lib/gcc/x86_64-linux-gnu/12/include/avx512bf16intrin.h(67): error: identifier "__builtin_ia32_cvtne2ps2bf16_v32hi_mask" is undefined
    return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
                     ^

/usr/lib/gcc/x86_64-linux-gnu/12/include/avx512bf16intrin.h(74): error: identifier "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz" is undefined
    return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
                     ^

/usr/lib/gcc/x86_64-linux-gnu/12/include/amxtileintrin.h(42): error: identifier "__builtin_ia32_ldtilecfg" is undefined
    __builtin_ia32_ldtilecfg (__config);
    ^

/usr/lib/gcc/x86_64-linux-gnu/12/include/amxtileintrin.h(49): error: identifier "__builtin_ia32_sttilecfg" is undefined
    __builtin_ia32_sttilecfg (__config);

3. GCC-13:

[ 94%] Building CXX object CMakeFiles/nia_start_core.dir/src/ModelingMainDriver.cpp.o
/usr/lib/gcc/x86_64-linux-gnu/13/include/amxtileintrin.h(42): error: identifier "__builtin_ia32_ldtilecfg" is undefined
    __builtin_ia32_ldtilecfg (__config);
    ^

/usr/lib/gcc/x86_64-linux-gnu/13/include/amxtileintrin.h(49): error: identifier "__builtin_ia32_sttilecfg" is undefined
    __builtin_ia32_sttilecfg (__config);
    ^

2 errors detected in the compilation of "/home/vladislavsemykin/Documents/Work/Start/src/GpuProcessing.cu".
make[2]: *** [CMakeFiles/nia_start_core.dir/build.make:91: CMakeFiles/nia_start_core.dir/src/GpuProcessing.cu.o] Error 2
make[2]: *** Waiting for unfinished jobs....
^Cmake[2]: *** [CMakeFiles/nia_start_core.dir/build.make:76: CMakeFiles/nia_start_core.dir/src/main.cpp.o] Interrupt
make[2]: *** [CMakeFiles/nia_start_core.dir/build.make:105: CMakeFiles/nia_start_core.dir/src/ModelingMainDriver.cpp.o] Interrupt
make[1]: *** [CMakeFiles/Makefile2:285: CMakeFiles/nia_start_core.dir/all] Interrupt
make: *** [Makefile:91: all] Interrupt

4. GCC-14 (unsopported yet as I understand, but I used flag -allow-unsupported-compiler just for test):

[ 94%] Building CXX object CMakeFiles/nia_start_core.dir/src/ModelingMainDriver.cpp.o
nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
/usr/include/x86_64-linux-gnu/c++/14/bits/c++config.h(827): error: user-defined literal operator not found
    typedef __decltype(0.0bf16) __bfloat16_t;
                       ^

/usr/include/c++/14/type_traits(529): error: type name is not allowed
      : public __bool_constant<__is_array(_Tp)>
                                          ^

/usr/include/c++/14/type_traits(529): error: identifier "__is_array" is undefined
      : public __bool_constant<__is_array(_Tp)>
                               ^

/usr/include/c++/14/type_traits(581): error: type name is not allowed
      : public __bool_constant<__is_member_object_pointer(_Tp)>
                                                          ^

/usr/include/c++/14/type_traits(581): error: identifier "__is_member_object_pointer" is undefined
      : public __bool_constant<__is_member_object_pointer(_Tp)>
                               ^

/usr/include/c++/14/type_traits(603): error: type name is not allowed
      : public __bool_constant<__is_member_function_pointer(_Tp)>
                                                            ^

/usr/include/c++/14/type_traits(603): error: identifier "__is_member_function_pointer" is undefined
      : public __bool_constant<__is_member_function_pointer(_Tp)>

...

/usr/lib/gcc/x86_64-linux-gnu/14/include/avx512vlbwintrin.h(5136): error: identifier "__builtin_shufflevector" is undefined
    __v16qi __T1 = (__v16qi)__W; __v16qi __T2 = __builtin_shufflevector (__T1, __T1, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T3 = __T1 & __T2; __v16qi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T5 = __T3 & __T4; __v16qi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T7 = __T5 & __T6; __v16qi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T9 = __T7 & __T8; return __T9[0];
                                                ^

/usr/lib/gcc/x86_64-linux-gnu/14/include/avx512vlbwintrin.h(5144): error: identifier "__builtin_shufflevector" is undefined
    __v16qi __T1 = (__v16qi)__W; __v16qi __T2 = __builtin_shufflevector (__T1, __T1, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T3 = __T1 | __T2; __v16qi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T5 = __T3 | __T4; __v16qi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T7 = __T5 | __T6; __v16qi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T9 = __T7 | __T8; return __T9[0];
                                                ^

Error limit reached.
100 errors detected in the compilation of "/home/vladislavsemykin/Documents/Work/Start/src/GpuProcessing.cu".
Compilation terminated.
make[2]: *** [CMakeFiles/nia_start_core.dir/build.make:91: CMakeFiles/nia_start_core.dir/src/GpuProcessing.cu.o] Error 4
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [CMakeFiles/Makefile2:285: CMakeFiles/nia_start_core.dir/all] Error 2
make: *** [Makefile:91: all] Error 2

Details:

  • CUDA Toolkit Version: 12.5/12.6
  • nvcc: nvcc warning: incompatible redefinition for option 'compiler-bindir'
  • Driver: NVIDIA Driver 560.35.03
  • GNU GCC/G++ compilers versions: 11, 12, 13, 14
  • OS: Debian-based Linux (Linux loveit 6.10.9-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.9-1 (2024-09-08) x86_64 GNU/Linux)

CMakeLists.txt

Here are only those parts which are related to CUDA:

set(ENV{CC} /usr/bin/gcc-13)
set(ENV{CXX} /usr/bin/g++-13)
set(ENV{CUDAHOSTCXX} /usr/bin/g++-13)

set(CMAKE_C_COMPILER gcc-13)
set(CMAKE_CXX_COMPILER g++-13)
set(CMAKE_CUDA_COMPILER nvcc)

project(...
              LANGUAGES CXX CUDA)

...

set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.6)
set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin g++-13")

message(STATUS "C Compiler: ${CMAKE_C_COMPILER}")
message(STATUS "C++ Compiler: ${CMAKE_CXX_COMPILER}")
message(STATUS "CUDA Host Compiler: ${CUDAHOSTCXX}")
message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")

target_link_libraries(...
                                     ${CUDA_LIBRARIES}
                                     ...)

Basic GPU info

nvidia-smi

Result:

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4060 ...    Off |   00000000:01:00.0 Off |                  N/A |
| N/A   36C    P4             14W /   30W |      15MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A     29559      G   /usr/lib/xorg/Xorg                              4MiB |
+-----------------------------------------------------------------------------------------+

Problem

The compilation process halts due to errors in various header files for intrinsic functions (such as __builtin_ia32_ldtilecfg, __builtin_ia32_cvtne2ps2bf16, etc.). These errors occur with multiple versions of GCC while compiling CUDA code, specifically when using advanced instruction sets like AVX-512 and AMX.

Some observations:

  1. Incompatible GCC versions: Versions 11 through 14 seem to introduce errors related to undefined intrinsics when compiling CUDA code.
  2. nvcc warning: There’s a warning about an “incompatible redefinition for option ‘compiler-bindir’” that might indicate a conflict between nvcc and CMake configurations.
  3. Built-in functions: Many intrinsic functions (related to AMX and AVX-512 instructions) are not being recognized by nvcc during the build process.

Questions

  • Is there a known incompatibility between CUDA Toolkit 12.5 and GCC versions 11-14, specifically regarding AVX/AMX intrinsics?
  • Are there specific compiler flags or packages I should install to make these built-in functions recognized by nvcc?
  • Should I downgrade to an earlier version of GCC or CUDA to resolve these issues?
  • Is there a known fix or workaround to get nvcc to work with newer GCC versions and avoid these errors?

Any guidance or suggestions to resolve this would be greatly appreciated!

Thank you in advance for your help!

Best regards,
Vladislav Semykin

Bug Fix Report: AMX Instruction Compatibility with GCC

Issue Summary

When compiling a C++ project using GCC 13 with CUDA enabled, the project encountered errors related to AMX Tile Matrix instructions (ldtilecfg and sttilecfg). These errors stemmed from conflicts between the __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg intrinsics and GCC’s internal handling of these instructions on specific hardware configurations.

Root Cause

The root cause of the issue was the use of GCC’s built-in functions for AMX instructions, which conflicted with the underlying hardware’s support for these intrinsics. Specifically, the __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg functions failed due to improper handling of the constexpr keyword, which GCC enforces on its built-in cmath functions. As a result, this caused redefinition and incompatibility errors during compilation.

Solution Implemented

To resolve the issue, we replaced the GCC built-in intrinsics with equivalent inline assembly instructions. This bypasses GCC’s handling and directly invokes the AMX instructions:

Modifications Made

  • File Modified: /usr/lib/gcc/x86_64-linux-gnu/13/include/amxtileintrin.h
  • Original Code:
  extern __inline void
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _tile_loadconfig (const void *__config)
  {
      __builtin_ia32_ldtilecfg (__config);
  }

  extern __inline void
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _tile_storeconfig (void *__config)
  {
      __builtin_ia32_sttilecfg (__config);
  }

Modified Code

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_loadconfig (const void *__config)
{
    __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_storeconfig (void *__config)
{
    __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
}

Explanation of the Changes

The updated code uses inline assembly to invoke the AMX instructions directly, allowing more precise control over the instruction handling without relying on GCC’s intrinsics. This change enables the code to compile without triggering constexpr conflicts and ensures compatibility with the AMX Tile Matrix instructions.

Impact Analysis

  • System Compatibility: No changes to system-wide configuration. These changes apply only to the specific project that includes this header.
  • Future Maintenance: If GCC is updated, the modified header may be overwritten. To avoid this, it’s recommended to encapsulate these changes in a custom project-specific header.
  • Testing: The modified code was tested on an Intel processor supporting AMX, and the instructions were verified to execute correctly without further errors.

Recommendation

It is advised to encapsulate these changes within the project or maintain a patch file to reapply modifications as necessary after compiler updates. Avoid modifying system headers directly to maintain compatibility with future GCC versions and ensure that AMX functionality is only utilized where supported.

P.S. found the solution here from @slaren from 13 Sep 2024

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.