Nvc++ error: "Redefinition of STDEXEC_ASSERT" when compiling stdexec sample code

Hi, I am trying to write some test cases based on the Maxwell example code available on GitHub. However, I am encountering difficulties during compilation. I would like to compile directly using nvc++, but even after specifying the path in the compile command, I am still unable to compile successfully.
The test case contains three files (common.cuh, snr.cuh and stencil.cc) similar to the examples. The header files marked with double quotes are from a local path cloned from GitHub.
The first one is common.cuh:

#pragma once

#include "../stdexec/include/stdexec/__detail/__config.hpp"
#include <map>
#include <chrono>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <charconv>
#include <string_view>
#include <memory>
#include <vector>
#include <string.h>

#include <math.h>

#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
#define STDEXEC_STDERR
#include "../stdexec/include/nvexec/detail/throw_on_cuda_error.cuh"
#endif

// Deleter for std::unique_ptr that releases storage with the routine that
// matches where it was allocated: cudaFree for managed GPU memory, free for
// the host heap.
struct deleter_t{
    bool on_gpu{};  // true when the pointer came from cudaMallocManaged

    template <class T>
    void operator()(T *ptr){
#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
        if(on_gpu){
            STDEXEC_DBG_ERR(cudaFree(ptr));
            return;
        }
#endif
        free(ptr);
    }
};

// Allocates `elements` objects' worth of storage for T and wraps it in a
// unique_ptr whose deleter releases it with the matching routine.
//
// gpu == true (CUDA builds only): CUDA managed memory via cudaMallocManaged.
// otherwise: host heap via malloc.
//
// NOTE(review): the storage is raw/uninitialized — callers use it only for
// trivial types (float). Confirm before using with non-trivial T.
template <class T>
STDEXEC_ATTRIBUTE((host, device))
inline std::unique_ptr<T, deleter_t> allocate_on(bool gpu, std::size_t elements = 1){
    T *ptr{};
#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
    if(gpu){
        STDEXEC_DBG_ERR(cudaMallocManaged(&ptr, elements * sizeof(T)));
    }else
#endif
    {
        // static_cast is the correct (and sufficient) cast for void* -> T*;
        // reinterpret_cast is not needed here.
        ptr = static_cast<T *>(malloc(elements * sizeof(T)));
    }
    return std::unique_ptr<T, deleter_t>(ptr, deleter_t{gpu});
}

// Identifies one scalar field inside the grid's contiguous storage; the
// enumerator value doubles as an index into fields_accessor's slot table.
enum class field_id : int {
    f,            // current field values
    fn,           // next-step field values
    fields_count  // sentinel: number of fields (used to size the allocation)
};

// Non-owning view over the grid's field storage. Instances are copied by
// value into device kernels, so every member must be trivially copyable and
// usable from device code.
struct fields_accessor{
    std::size_t nx;   // grid extent in x
    std::size_t ny;   // grid extent in y
    std::size_t n;    // total cell count (nx * ny)

    float *base_ptr;  // start of the [fields_count][n] contiguous storage
    // Slot index per field id; swapped to ping-pong f/fn without copying data.
    // Fixed-size array instead of std::vector: std::vector's storage and
    // operator[] are host-only, so a vector member breaks the (host, device)
    // accessors below when the struct is copied into a kernel.
    std::size_t size[2] = {0, 1};

    // Returns the base pointer of the slice holding field `id`.
    STDEXEC_ATTRIBUTE((nodiscard, host, device)) float *get(field_id id) const{
        return base_ptr + size[static_cast<int>(id)]*n;
    }

    // Exchanges the f/fn slots (manual swap: std::swap is not device-callable).
    STDEXEC_ATTRIBUTE((host, device)) void swap_f_fn() {
        std::size_t temp = size[0];
        size[0] = size[1];
        size[1] = temp;
    }
};

// Owner of the simulation grid: one contiguous buffer holding every field
// (fields_count slices of n floats), allocated on the GPU or the host.
// Deliberately neither copyable nor movable — share it via accessor() views.
struct grid_t{
    std::size_t nx{};
    std::size_t ny{};
    std::size_t n{};

    std::unique_ptr<float, deleter_t> fields_{};

    grid_t(grid_t &&) = delete;
    grid_t(const grid_t &) = delete;

    // `gpu` selects managed (GPU) vs host allocation; see allocate_on().
    grid_t(std::size_t nx, std::size_t ny, bool gpu)
        : nx(nx)
        , ny(ny)
        , n(nx*ny)
        , fields_(allocate_on<float>(
              gpu, n * static_cast<std::size_t>(field_id::fields_count))){
    }

    // Builds a non-owning view over the allocated field storage.
    [[nodiscard]] fields_accessor accessor() const {
        return {nx, ny, n, fields_.get()};
    }
};

// Per-cell initialization functor used with ex::bulk: sets f = 1 and fn = 0
// for the cell it is invoked on.
struct grid_initializer_t{
    fields_accessor accessor;

    STDEXEC_ATTRIBUTE((host, device)) void operator()(std::size_t cell_id) const {
        accessor.get(field_id::f)[cell_id] = 1.0;
        accessor.get(field_id::fn)[cell_id] = 0.0;
    }
};

// Convenience factory for grid_initializer_t.
inline grid_initializer_t grid_initializer(fields_accessor accessor){
    return grid_initializer_t{accessor};
}

snr.cuh:

#pragma once

#include "common.cuh"
#include "../stdexec/include/stdexec/execution.hpp"
#include "../stdexec/include/exec/on.hpp"


#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
#include "../stdexec/include/nvexec/detail/throw_on_cuda_error.cuh"
#include <nvexec/stream_context.cuh>
#include <nvexec/multi_gpu_context.cuh>
#else
// Host-only fallback: when not compiling as CUDA, supply the minimal subset
// of the nvexec namespace that the code below refers to, so the same sources
// also build for CPU-only schedulers.
namespace nvexec {
  // Stand-in for nvexec's stream receiver base; only provides the concept tag.
  struct stream_receiver_base {
    using receiver_concept = stdexec::receiver_t;
  };

  // Stand-in for nvexec's stream sender base; only provides the concept tag.
  struct stream_sender_base {
    using sender_concept = stdexec::sender_t;
  };

  namespace detail {
    // Empty base mirroring the stream operation-state base in real nvexec.
    struct stream_op_state_base { };
  }

  // Always false in the host-only build.
  inline bool is_on_gpu() {
    return false;
  }
}
#endif

#include <optional>
#include <exec/inline_scheduler.hpp>
#include <exec/static_thread_pool.hpp>

namespace ex = stdexec;

#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
namespace nvexec::STDEXEC_STREAM_DETAIL_NS { //

  namespace repeat_n {
    // Stream-path receiver for iterations 2..n of repeat_n: every set_value
    // from the inner sender either completes the whole operation (after n
    // iterations) or reconnects the closure and starts the next iteration.
    template <class OpT>
    class receiver_2_t : public stream_receiver_base {
      using Sender = typename OpT::PredSender;
      using Receiver = typename OpT::Receiver;

      // Non-owning back-reference to the operation state that owns us.
      OpT& op_state_;

     public:
      // Errors and stop requests are forwarded to the outer receiver untouched.
      template <stdexec::__one_of<ex::set_error_t, ex::set_stopped_t> _Tag, class... _Args>
      friend void tag_invoke(_Tag __tag, receiver_2_t&& __self, _Args&&... __args) noexcept {
        OpT& op_state = __self.op_state_;
        op_state.propagate_completion_signal(_Tag{}, (_Args&&) __args...);
      }

      friend void tag_invoke(ex::set_value_t, receiver_2_t&& __self) noexcept {
        using inner_op_state_t = typename OpT::inner_op_state_t;

        OpT& op_state = __self.op_state_;
        op_state.i_++;  // one more iteration finished

        // All n iterations done: complete downstream.
        if (op_state.i_ == op_state.n_) {
          op_state.propagate_completion_signal(stdexec::set_value);
          return;
        }

        // Rebuild the next iteration: schedule on the receiver's scheduler,
        // apply the closure, connect back to this receiver type. emplace
        // replaces the previous inner op-state; __conv constructs the new one
        // in place inside the optional.
        auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_));
        inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace(
          stdexec::__conv{[&]() noexcept {
            return ex::connect(ex::schedule(sch) | op_state.closure_, receiver_2_t<OpT>{op_state});
          }});

        ex::start(inner_op_state);
      }

      // Expose the stream operation state's environment.
      friend typename OpT::env_t tag_invoke(ex::get_env_t, const receiver_2_t& self) noexcept {
        return self.op_state_.make_env();
      }

      explicit receiver_2_t(OpT& op_state)
        : op_state_(op_state) {
      }
    };

    // Stream-path receiver connected to the predecessor sender: its set_value
    // launches the first iteration of the closure, or completes immediately
    // when n == 0.
    template <class OpT>
    class receiver_1_t : public stream_receiver_base {
      using Receiver = typename OpT::Receiver;

      // Non-owning back-reference to the owning operation state.
      OpT& op_state_;

     public:
      // Errors and stop requests are forwarded to the outer receiver untouched.
      template <stdexec::__one_of<ex::set_error_t, ex::set_stopped_t> _Tag, class... _Args>
      friend void tag_invoke(_Tag __tag, receiver_1_t&& __self, _Args&&... __args) noexcept {
        OpT& op_state = __self.op_state_;
        op_state.propagate_completion_signal(_Tag{}, (_Args&&) __args...);
      }

      friend void tag_invoke(ex::set_value_t, receiver_1_t&& __self) noexcept {
        using inner_op_state_t = typename OpT::inner_op_state_t;

        OpT& op_state = __self.op_state_;

        if (op_state.n_) {
          // Start iteration 1: schedule(sch) | closure connected to
          // receiver_2, which then drives the remaining iterations; __conv
          // constructs the op-state in place inside the optional.
          auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_));
          inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace(
            stdexec::__conv{[&]() noexcept {
              return ex::connect(
                ex::schedule(sch) | op_state.closure_, receiver_2_t<OpT>{op_state});
            }});

          ex::start(inner_op_state);
        } else {
          // Zero iterations requested: complete right away.
          op_state.propagate_completion_signal(stdexec::set_value);
        }
      }

      // Expose the stream operation state's environment.
      friend typename OpT::env_t tag_invoke(ex::get_env_t, const receiver_1_t& self) noexcept {
        return self.op_state_.make_env();
      }

      explicit receiver_1_t(OpT& op_state)
        : op_state_(op_state) {
      }
    };

    // Stream-path operation state for repeat_n. operation_state_base_t owns
    // the final receiver (rcvr_) and a stream provider whose status_ records
    // whether the per-operation stream resources could be set up.
    template <class PredecessorSenderId, class Closure, class ReceiverId>
    struct operation_state_t : operation_state_base_t<ReceiverId> {
      using PredSender = stdexec::__t<PredecessorSenderId>;
      using Receiver = stdexec::__t<ReceiverId>;
      // Scheduler is taken from the receiver's environment (required).
      using Scheduler =
        stdexec::tag_invoke_result_t<stdexec::get_scheduler_t, stdexec::env_of_t<Receiver>>;
      // Sender produced by applying the user closure to schedule(Scheduler).
      using InnerSender =
        std::invoke_result_t<Closure, stdexec::tag_invoke_result_t<stdexec::schedule_t, Scheduler>>;

      using predecessor_op_state_t =
        ex::connect_result_t<PredSender, receiver_1_t<operation_state_t>>;
      using inner_op_state_t = ex::connect_result_t<InnerSender, receiver_2_t<operation_state_t>>;

      PredSender pred_sender_;
      Closure closure_;
      std::optional<predecessor_op_state_t> pred_op_state_;  // connect(pred, receiver_1)
      std::optional<inner_op_state_t> inner_op_state_;       // current iteration
      std::size_t n_{};  // requested iteration count
      std::size_t i_{};  // iterations completed so far

      friend void tag_invoke(stdexec::start_t, operation_state_t& op) noexcept {
        if (op.stream_provider_.status_ != cudaSuccess) {
          // Couldn't allocate memory for operation state, complete with error
          op.propagate_completion_signal(
            stdexec::set_error, std::move(op.stream_provider_.status_));
        } else {
          if (op.n_) {
            stdexec::start(*op.pred_op_state_);
          } else {
            // n == 0: nothing to repeat, complete immediately.
            op.propagate_completion_signal(stdexec::set_value);
          }
        }
      }

      // Connects the predecessor in place (via __conv). The predecessor's
      // value-completion scheduler supplies the stream context state passed
      // to the base class.
      operation_state_t(PredSender&& pred_sender, Closure closure, Receiver&& rcvr, std::size_t n)
        : operation_state_base_t<ReceiverId>(
          (Receiver&&) rcvr,
          stdexec::get_completion_scheduler<stdexec::set_value_t>(stdexec::get_env(pred_sender))
            .context_state_)
        , pred_sender_{(PredSender&&) pred_sender}
        , closure_(closure)
        , n_(n) {
        pred_op_state_.emplace(stdexec::__conv{[&]() noexcept {
          return ex::connect((PredSender&&) pred_sender_, receiver_1_t{*this});
        }});
      }
    };
}}
#endif

namespace repeat_n_detail {

  // CPU-path receiver for iterations 2..n of repeat_n: every set_value from
  // the inner sender either completes the final receiver (after n iterations)
  // or reconnects the closure and starts the next iteration.
  template <class OpT>
  class receiver_2_t {
    using Sender = typename OpT::PredSender;
    using Receiver = typename OpT::Receiver;

    // Non-owning back-reference to the owning operation state.
    OpT& op_state_;

   public:
    using receiver_concept = stdexec::receiver_t;

    // Errors and stop requests pass straight through to the final receiver.
    template <stdexec::__one_of<ex::set_error_t, ex::set_stopped_t> _Tag, class... _Args>
    friend void tag_invoke(_Tag __tag, receiver_2_t&& __self, _Args&&... __args) noexcept {
      OpT& op_state = __self.op_state_;
      __tag(std::move(op_state.rcvr_), (_Args&&) __args...);
    }

    friend void tag_invoke(ex::set_value_t, receiver_2_t&& __self) noexcept {
      using inner_op_state_t = typename OpT::inner_op_state_t;

      OpT& op_state = __self.op_state_;
      op_state.i_++;  // one more iteration finished

      if (op_state.i_ == op_state.n_) {
        // All n iterations done: complete the chain.
        stdexec::set_value(std::move(op_state.rcvr_));
        return;
      }

      // Rebuild the next iteration: schedule on the receiver's scheduler,
      // apply the closure, connect back to this receiver type. emplace
      // replaces the previous inner op-state; __conv constructs the new one
      // in place inside the optional.
      auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_));
      inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace(
        stdexec::__conv{[&]() noexcept {
          return ex::connect(ex::schedule(sch) | op_state.closure_, receiver_2_t<OpT>{op_state});
        }});

      ex::start(inner_op_state);
    }

    // Environment is forwarded unchanged from the final receiver.
    friend auto tag_invoke(ex::get_env_t, const receiver_2_t& self) noexcept
      -> stdexec::env_of_t<Receiver> {
      return stdexec::get_env(self.op_state_.rcvr_);
    }

    explicit receiver_2_t(OpT& op_state)
      : op_state_(op_state) {
    }
  };

  // CPU-path receiver connected to the predecessor sender: its set_value
  // launches the first iteration of the closure, or completes immediately
  // when n == 0.
  template <class OpT>
  class receiver_1_t {
    using Receiver = typename OpT::Receiver;

    // Non-owning back-reference to the owning operation state.
    OpT& op_state_;

   public:
    using receiver_concept = stdexec::receiver_t;

    // Errors and stop requests pass straight through to the final receiver.
    template <stdexec::__one_of<ex::set_error_t, ex::set_stopped_t> _Tag, class... _Args>
    friend void tag_invoke(_Tag __tag, receiver_1_t&& __self, _Args&&... __args) noexcept {
      OpT& op_state = __self.op_state_;
      __tag(std::move(op_state.rcvr_), (_Args&&) __args...);
    }

    friend void tag_invoke(ex::set_value_t, receiver_1_t&& __self) noexcept {
      using inner_op_state_t = typename OpT::inner_op_state_t;

      OpT& op_state = __self.op_state_;

      if (op_state.n_) {
        // Start iteration 1: schedule on the receiver's scheduler, apply the
        // closure, and hand completion to receiver_2 (which loops until n).
        auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_));
        inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace(
          stdexec::__conv{[&]() noexcept {
            return ex::connect(ex::schedule(sch) | op_state.closure_, receiver_2_t<OpT>{op_state});
          }});

        ex::start(inner_op_state);
      } else {
        // Zero iterations requested: complete right away.
        stdexec::set_value(std::move(op_state.rcvr_));
      }
    }

    // Environment is forwarded unchanged from the final receiver.
    friend auto tag_invoke(ex::get_env_t, const receiver_1_t& self) noexcept
      -> stdexec::env_of_t<Receiver> {
      return stdexec::get_env(self.op_state_.rcvr_);
    }

    explicit receiver_1_t(OpT& op_state)
      : op_state_(op_state) {
    }
  };

  // CPU-path operation state for repeat_n. Owns the predecessor sender, the
  // closure, the final receiver, and the (re)connected per-iteration op-state.
  template <class PredecessorSenderId, class Closure, class ReceiverId>
  struct operation_state_t {
    using PredSender = stdexec::__t<PredecessorSenderId>;
    using Receiver = stdexec::__t<ReceiverId>;
    // Scheduler is taken from the receiver's environment (required).
    using Scheduler =
      stdexec::tag_invoke_result_t<stdexec::get_scheduler_t, stdexec::env_of_t<Receiver>>;
    // Sender produced by applying the user closure to schedule(Scheduler).
    using InnerSender =
      std::invoke_result_t<Closure, stdexec::tag_invoke_result_t<stdexec::schedule_t, Scheduler>>;

    using predecessor_op_state_t =
      ex::connect_result_t<PredSender, receiver_1_t<operation_state_t>>;
    using inner_op_state_t = ex::connect_result_t<InnerSender, receiver_2_t<operation_state_t>>;

    PredSender pred_sender_;
    Closure closure_;
    Receiver rcvr_;
    std::optional<predecessor_op_state_t> pred_op_state_;  // connect(pred, receiver_1)
    std::optional<inner_op_state_t> inner_op_state_;       // current iteration
    std::size_t n_{};  // requested iteration count
    std::size_t i_{};  // iterations completed so far

    // n == 0 short-circuits: complete immediately without running the closure.
    friend void tag_invoke(stdexec::start_t, operation_state_t& op) noexcept {
      if (op.n_) {
        stdexec::start(*op.pred_op_state_);
      } else {
        stdexec::set_value(std::move(op.rcvr_));
      }
    }

    // Connects the predecessor eagerly (in place, via __conv); nothing runs
    // until start() is invoked.
    // NOTE(review): rcvr_(rcvr) copies the receiver even though it was taken
    // by rvalue reference — presumably a move was intended; confirm.
    operation_state_t(PredSender&& pred_sender, Closure closure, Receiver&& rcvr, std::size_t n)
      : pred_sender_{(PredSender&&) pred_sender}
      , closure_(closure)
      , rcvr_(rcvr)
      , n_(n) {
      pred_op_state_.emplace(stdexec::__conv{[&]() noexcept {
        return ex::connect((PredSender&&) pred_sender_, receiver_1_t{*this});
      }});
    }
  };

  // Sender returned by repeat_n. Which operation state connect() produces
  // depends on the receiver: in CUDA builds a receiver carrying a stream
  // environment gets the nvexec stream op-state, any other receiver (and all
  // non-CUDA builds) gets the CPU op-state from repeat_n_detail.
  template <class SenderId, class Closure>
  struct repeat_n_sender_t {
    using __t = repeat_n_sender_t;
    using __id = repeat_n_sender_t;
    using Sender = stdexec::__t<SenderId>;
    using sender_concept = stdexec::sender_t;

    // Completes with no value, stopped, or an exception_ptr error; CUDA
    // builds can additionally report a cudaError_t from the stream path.
    using completion_signatures = //
      stdexec::completion_signatures<
        stdexec::set_value_t(),
        stdexec::set_stopped_t(),
        stdexec::set_error_t(std::exception_ptr)
#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
          ,
        stdexec::set_error_t(cudaError_t)
#endif
        >;

    Sender sender_;    // predecessor sender
    Closure closure_;  // pipeable work to repeat
    std::size_t n_{};  // number of repetitions

#if defined(_NVHPC_CUDA) || defined(__CUDACC__)
    // CUDA build, receiver without a stream environment: CPU op-state.
    template <stdexec::__decays_to<repeat_n_sender_t> Self, stdexec::receiver Receiver>
      requires(stdexec::sender_to<Sender, Receiver>)
           && (!nvexec::STDEXEC_STREAM_DETAIL_NS::receiver_with_stream_env<Receiver>)
    friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r)
      -> repeat_n_detail::operation_state_t<SenderId, Closure, stdexec::__id<Receiver>> {
      return repeat_n_detail::operation_state_t<SenderId, Closure, stdexec::__id<Receiver>>(
        (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_);
    }

    // CUDA build, receiver with a stream environment: stream op-state.
    template <stdexec::__decays_to<repeat_n_sender_t> Self, stdexec::receiver Receiver>
      requires(stdexec::sender_to<Sender, Receiver>)
           && (nvexec::STDEXEC_STREAM_DETAIL_NS::receiver_with_stream_env<Receiver>)
    friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r)
      -> nvexec::STDEXEC_STREAM_DETAIL_NS::repeat_n::
        operation_state_t<SenderId, Closure, stdexec::__id<Receiver>> {
      return nvexec::STDEXEC_STREAM_DETAIL_NS::repeat_n::
        operation_state_t<SenderId, Closure, stdexec::__id<Receiver>>(
          (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_);
    }
#else
    // Host-only build: always the CPU op-state.
    template <stdexec::__decays_to<repeat_n_sender_t> Self, stdexec::receiver Receiver>
      requires stdexec::sender_to<Sender, Receiver>
    friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r)
      -> repeat_n_detail::operation_state_t<SenderId, Closure, stdexec::__id<Receiver>> {
      return repeat_n_detail::operation_state_t<SenderId, Closure, stdexec::__id<Receiver>>(
        (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_);
    }
#endif

    // The sender's environment is forwarded from the wrapped sender.
    friend auto tag_invoke(stdexec::get_env_t, const repeat_n_sender_t& s) //
      noexcept(stdexec::__nothrow_callable<stdexec::get_env_t, const Sender&>)
        -> stdexec::env_of_t<const Sender&> {
      return stdexec::get_env(s.sender_);
    }
  };
}

// repeat_n: sender adaptor that, once the upstream sender completes, runs the
// given closure `n` times on the scheduler found in the receiver's
// environment, then completes downstream.
struct repeat_n_t {
  // Direct form: repeat_n(sender, n, closure).
  template <stdexec::sender Sender, stdexec::__sender_adaptor_closure Closure>
  auto operator()(Sender&& __sndr, std::size_t n, Closure closure) const noexcept
    -> repeat_n_detail::repeat_n_sender_t<stdexec::__id<Sender>, Closure> {
    using sender_t = repeat_n_detail::repeat_n_sender_t<stdexec::__id<Sender>, Closure>;
    return sender_t{std::forward<Sender>(__sndr), closure, n};
  }

  // Pipeable form: sender | repeat_n(n, closure).
  template <stdexec::__sender_adaptor_closure Closure>
  auto operator()(std::size_t n, Closure closure) const
    -> stdexec::__binder_back<repeat_n_t, std::size_t, Closure> {
    return {{}, {}, {n, std::move(closure)}};
  }
};

// The repeat_n adaptor object.
inline constexpr repeat_n_t repeat_n{};

// Runs a tiny probe task on `scheduler` and reports whether it executed on a
// GPU (always false with the host-only nvexec fallback).
template <class SchedulerT>
[[nodiscard]] bool is_gpu_scheduler(SchedulerT&& scheduler) {
  auto probe = ex::just()
             | exec::on(scheduler, ex::then([] { return nvexec::is_on_gpu(); }));
  const auto [on_gpu] = stdexec::sync_wait(std::move(probe)).value();
  return on_gpu;
}
/*
auto diffusion_eqs_snr(
  std::size_t n_iterations,
  fields_accessor accessor,
  stdexec::scheduler auto&& computer) {
  return ex::just()
       | exec::on(
           computer,
           repeat_n(
             n_iterations,
             ex::bulk(accessor.n, update_f(accessor))))
       | ex::then();
}
*/
void run_snr(
  std::size_t n_iterations,
  grid_t& grid,
  stdexec::scheduler auto&& computer) {
  fields_accessor accessor = grid.accessor();

  auto init = ex::just() | exec::on(computer, ex::bulk(grid.n, grid_initializer(accessor)));
  stdexec::sync_wait(init);

  //auto snd = diffusion_eqs_snr(n_iterations, accessor, computer);
  //stdexec::sync_wait(snd);
}

stencil.cc

#include "snr.cuh"

// Builds a 3x3 grid on the GPU stream scheduler and runs the (currently
// initialization-only) SNR pipeline for one iteration.
int main(int argc, char *argv[]){
    const std::size_t n_iterations = 1;
    const std::size_t nx = 3;
    const std::size_t ny = 3;

    auto run_snr_on = [&](stdexec::scheduler auto &&scheduler){
        // Allocate the grid where the scheduler runs (GPU vs host memory).
        grid_t grid{nx, ny, is_gpu_scheduler(scheduler)};

        // (removed: an unused local copy of grid.accessor() — run_snr builds
        // its own accessor from the grid.)
        run_snr(n_iterations, grid, std::forward<decltype(scheduler)>(scheduler));
    };

    nvexec::stream_context stream_ctx{};
    run_snr_on(stream_ctx.get_scheduler());
}

Compile command:

nvc++ -std=c++20 -cuda --gcc-toolchain=/work/opt/local/x86_64/cores/gcc/12.2.0 --experimental-stdpar -stdpar=gpu -o run stencil.cc

Error message:

"/work/opt/local/x86_64/cores/nvidia/23.11/Linux_x86_64/23.11/compilers/include-stdexec/experimental/nvexec/../stdexec/__detail/__config.hpp", line 178: catastrophic error: #error directive: "Redefinition of STDEXEC_ASSERT is not permitted. Define STDEXEC_ASSERT_FN instead."
  #error "Redefinition of STDEXEC_ASSERT is not permitted. Define STDEXEC_ASSERT_FN instead."
   ^

1 catastrophic error detected in the compilation of "stencil.cc".
Compilation terminated.

Besides, I would like to ask about how to implement iterations in the sender-receiver model. Currently, I have only seen the implementation of repeat_n() in maxwell sample codes. Will this part be added to the library in the future? If not, are there any simpler methods of implementation?

It looks like you’re mixing the experimental header files that we ship with the ones from the github repo. I believe Eric is actively updating the repo so the version we ship can be older.

When I switched to using just the repo’s header files, it gets past this error but encounters the same static assert you see in your other posting. I didn’t try it, but if you use an earlier compiler version, like 23.7 or 23.3, then you’ll likely be able to get the code to compile.

Hi Mat.
Thank you very much for your reply. As you mentioned, version 23.3 can compile normally. I noticed this a few days ago, but I still appreciate your explanation.