I’m getting an internal compiler error (ICE) from nvc++ with the following OpenMP target-offload program:
nvhpc_target_ice.cpp
#include <vector>
#include <memory>
#include <cstdlib>
#include <cstdio>
// Half-open range [begin, end) over values of type N, exposed through
// begin()/end() so it can be walked by a range-for or an explicit iterator loop.
// N is the counted type; this file instantiates it as ranged<int>.
template<typename N>
class ranged {
public:
// Iterator that wraps a single value of type N. Dereferencing yields that
// value itself (by value), not a reference into any storage.
// NOTE(review): std::random_access_iterator_tag is declared in <iterator>,
// which this file does not include directly -- presumably it arrives
// transitively through <vector>/<memory>; confirm.
// NOTE(review): only the operations listed here are provided; operator-=,
// `iterator - n`, and a default constructor are absent.
class iterator {
// ranged needs access to the protected constructor below.
friend class ranged;
public:
// Member types describing this iterator; note that `reference` is N, not N&.
using difference_type = N;
using value_type = N;
using pointer = const N *;
using reference = N;
using iterator_category = std::random_access_iterator_tag;
// Returns the wrapped value.
reference operator*() const { return i_; }
// Pre-/post-increment and -decrement of the wrapped value; the postfix forms
// return a copy taken before the change.
iterator &operator++() { ++i_; return *this; }
iterator operator++(int) { iterator copy(*this); ++i_; return copy; }
iterator &operator--() { --i_; return *this; }
iterator operator--(int) { iterator copy(*this); --i_; return copy; }
// Advances the wrapped value by `by` in place.
iterator &operator+=(N by) { i_ += by; return *this; }
// Value `i` positions past the current one; the iterator itself is unchanged.
value_type operator[](const difference_type &i) const { return i_ + i; }
// Difference of the two wrapped values (this minus `it`).
difference_type operator-(const iterator &it) const { return i_ - it.i_; }
// New iterator whose wrapped value is advanced by `v`.
iterator operator+(const value_type v) const { return iterator(i_ + v); }
// Equality and ordering compare the wrapped values directly.
bool operator==(const iterator &other) const { return i_ == other.i_; }
bool operator!=(const iterator &other) const { return i_ != other.i_; }
bool operator<(const iterator &other) const { return i_ < other.i_; }
bool operator<=(const iterator &other) const { return i_ <= other.i_; }
bool operator>(const iterator &other) const { return i_ > other.i_; }
bool operator>=(const iterator &other) const { return i_ >= other.i_; }
// Construction from a raw value is protected and explicit; ranged reaches it
// through the friend declaration above.
protected: explicit iterator(N start) : i_(start) {}
// The wrapped value.
private: N i_;
};
// Return copies of the stored endpoint iterators.
[[nodiscard]] iterator begin() const { return begin_; }
[[nodiscard]] iterator end() const { return end_; }
// Builds both endpoint iterators from raw values; no check that begin <= end.
ranged(N begin, N end) : begin_(begin), end_(end) {}
// Endpoints of the range, stored as iterators.
private: iterator begin_, end_;
};
// Reproducer entry point: fills a 4-element float buffer inside an OpenMP
// target region using a ranged<int> iterator loop, then prints the buffer.
int main(){
// Buffer of 4 floats.
// NOTE(review): the malloc result is neither checked for null nor released
// with std::free before main returns.
auto data = (float*) std::malloc(sizeof(float) * 4);
// Index range 0..3, matching the 4 elements of data.
ranged<int> R(0, 4);
// Data region that maps data[0:4] with map-type tofrom for the enclosed block.
#pragma omp target data map( tofrom:data[0:4])
{
// Variant 1 (disabled): trip count hoisted into N before the loop.
// // OK on NVHPC, Clang, and GCC
// auto N = std::distance(R.begin(), R.end());
// #pragma omp target parallel for
// for (size_t i = 0; i < N; i++) {
// data[i] = i*2.f;
// }
// Variant 2 (disabled): std::distance evaluated inside the loop condition.
// ICE on NVHPC, OK on Clang and GCC
// #pragma omp target parallel for
// for (size_t i = 0; i < std::distance(R.begin(), R.end()); i++) {
// data[i] = i*2.f;
// }
// Variant 3 (active): the loop variable is a ranged<int>::iterator, tested
// with operator< and advanced with postfix ++. Each iteration stores twice
// the current index into data at that index.
// ICE on NVHPC, OK on Clang and GCC
#pragma omp target teams distribute parallel for
for (auto _p = R.begin(); _p < R.end(); _p++) {
data[*_p] = *_p*2.f;
}
}
// check results
// Prints one "[index] = value" line per element of R; given the store above,
// each value should be 2 * index.
for(auto i : R) {
printf("[%d] = %f\n", i, data[i]);
}
return EXIT_SUCCESS;
}
> /opt/nvidia/hpc_sdk/Linux_x86_64/22.11/compilers/bin/nvc++ -O0 -DNDEBUG -gpu=cc80 -mp=gpu --c++17 nvhpc_target_ice.cpp
NVC++-S-0000-Internal compiler error. BAD sptr in var_refsym 0 (nvhpc_target_ice.cpp: 49)
NVC++-S-0039-Use of undeclared variable (nvhpc_target_ice.cpp: 49)
NVC++-S-0094-Illegal type conversion required (nvhpc_target_ice.cpp: 49)
NVC++-S-0000-Internal compiler error. BAD sptr in var_refsym 0 (nvhpc_target_ice.cpp: 49)
NVC++-S-0000-Internal compiler error. BAD sptr in var_refsym 0 (nvhpc_target_ice.cpp: 49)
NVC++/x86-64 Linux 22.11-0: compilation completed with severe errors
I believe the loop adheres to the canonical loop form as defined in OpenMP 4.5, Section 2.6 (Canonical Loop Form, see https://www.openmp.org/wp-content/uploads/openmp-4.5.pdf#page=62): the `ranged` class provides an iterator that implements what the spec requires of a random-access iterator type, and that iterator is then used in the loop in the way that section specifies.
I’ve also looked through https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#openmp-subset but was unable to find anything that disallows this particular usage pattern.
Even if this is not supported, the compiler shouldn’t crash with an ICE here.
When I compile the same program using Clang 14 and GCC 11, the program compiles without any issues and runs correctly on platforms as old as SM35 (a GT710 in a test system):
> export OMP_TARGET_OFFLOAD=MANDATORY
> clang++-14 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_35 -O0 -DNDEBUG --cuda-path=/opt/nvidia/hpc_sdk/Linux_x86_64/22.11/cuda/11.8/ -g nvhpc_target_ice.cpp
clang: warning: CUDA version is newer than the latest supported version 11.5 [-Wunknown-cuda-version]
> ./a.out
[0] = 0.000000
[1] = 2.000000
[2] = 4.000000
[3] = 6.000000
> g++-11 -fopenmp -foffload=nvptx-none -fcf-protection=none -fno-stack-protector -O0 -DNDEBUG nvhpc_target_ice.cpp
> ./a.out
[0] = 0.000000
[1] = 2.000000
[2] = 4.000000
[3] = 6.000000
> clang++-14 -v
Ubuntu clang version 14.0.6-++20221018092219+f28c006a5895-1~exp1~20221018212255.160
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/10
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/11
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/7.5.0
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/8
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/9
Selected GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/11
Candidate multilib: .;@m64
Selected multilib: .;@m64
Found HIP installation: /opt/rocm, version 5.1.20532
> g++-11 -v
Using built-in specs.
COLLECT_GCC=g++-11
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/11/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 11.1.0-1ubuntu1~20.04' --with-bugurl=file:///usr/share/doc/gcc-11/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-11 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --disable-cet --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-11-2V7zgg/gcc-11-11.1.0/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-11-2V7zgg/gcc-11-11.1.0/debian/tmp-gcn/usr --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-serialization=2
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 11.1.0 (Ubuntu 11.1.0-1ubuntu1~20.04)