Inheriting base class causes wrong values of members after construction

Following code example:
test.cu

#include "lib.h"

__global__ void test(vec0<float, 3> abc){
	printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}

__global__ void test(vec1<float, 3> abc){
	printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}

void test_host(vec0<float, 3> abc){
	printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}

void test_host(vec1<float, 3> abc){
	printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}

int main(){
	const vec0<float, 3> abc0{1.0f, 2.0f, 3.0f};
	const vec1<float, 3> abc1{1.0f, 2.0f, 3.0f};
	printf("%.9f %.9f %.9f\n", abc0[0], abc0[1], abc0[2]);
	printf("%.9f %.9f %.9f\n", abc1[0], abc1[1], abc1[2]);
	printf("DEVICE:\n");
	test<<<1, 1>>>(abc0);
	test<<<1, 1>>>(abc1);
	cudaDeviceSynchronize();
	printf("HOST:\n");
	test_host(abc0);
	test_host(abc1);
}

lib.h

#include <functional>
#include <type_traits>
#include <utility>
#include <array>
#include <iostream>

/// use some designs from https://github.com/taocpp/sequences/
namespace meta_impl {
	template<typename T, T... Ns>
	constexpr T integral_seq_mul_impl() noexcept {
		T res {1};
		(void) std::initializer_list<int> {(res *= Ns, 0)...};
		return res;
	}

	template<typename T, T... Ns>
	constexpr T integral_seq_sum_impl() noexcept {
		T res {0};
		(void) std::initializer_list<int> {(res += Ns, 0)...};
		return res;
	}
}// namespace meta_impl

template<typename T>
struct value_seq_traits_impl;
template<typename T, T... Ns>
struct value_seq_traits_impl<std::integer_sequence<T, Ns...>> {
	static constexpr T sum	= meta_impl::integral_seq_sum_impl<T, Ns...>();
	static constexpr T prod = meta_impl::integral_seq_mul_impl<T, Ns...>();
};
template<typename T, T... Ns>
using value_seq_traits = value_seq_traits_impl<std::integer_sequence<T, Ns...>>;

/// seq reduction
template<typename T, T... Ns>
using integral_seq_sum = std::integral_constant<T, value_seq_traits<T, ((Ns > 0) ? Ns : 0)...>::sum - value_seq_traits<T, ((Ns < 0) ? -Ns : 0)...>::sum>;
template<typename T, T... Ns>
using integral_seq_mul = std::integral_constant<T, value_seq_traits<T, Ns...>::prod>;

template<bool... Bs>
using enable_if_all = typename std::enable_if<std::conjunction<std::integral_constant<bool, Bs>...>::value, char>::type;

struct base0 {
	
	base0(){
		printf("BASE0 - Construtor 0\n");
	}
	
	base0(base0& other){
		printf("BASE0 - Construtor 1\n");
	}
	
	base0(const base0& other){
		printf("BASE0 - Construtor 2\n");
	}
	
	base0(base0&& other){
		printf("BASE0 - Construtor 3\n");
	}
	
};

struct base1 {
	base1(){
		printf("BASE1 - Construtor 0\n");
	}
	
	base1(base1& other){
		printf("BASE1 - Construtor 1\n");
	}
	
	base1(const base1& other){
		printf("BASE1 - Construtor 2\n");
	}
	
	base1(base1&& other){
		printf("BASE1 - Construtor 3\n");
	}
};

/// declarations

template<typename T, typename Extents>
struct vec_impl0;

template<typename T, typename Extents>
struct vec_impl1;

template<typename T, int... Ns>
using vec0 = vec_impl0<T, std::integer_sequence<int, Ns...>>;

template<typename T, int... Ns>
using vec1 = vec_impl1<T, std::integer_sequence<int, Ns...>>;


/// vec
template<typename T, typename Tn, Tn... Ns>
struct vec_impl0<T, std::integer_sequence<Tn, Ns...>>
	: base0
	, base1 {
	static_assert(std::is_trivial<T>::value, "Vec element type is not trivial!\n");

	static constexpr auto dim	 = sizeof...(Ns);
	static constexpr auto extent = integral_seq_mul<Tn, Ns...>::value;

	using value_type = T;

   private:
	std::array<T, extent> m_data;

   public:
	/// construct
	constexpr vec_impl0() = default;
	
	vec_impl0(vec_impl0& other)
	: m_data(other.m_data)
	{
		printf("VEC0 - Construtor 1\n");
	}
	
	vec_impl0(const vec_impl0& other)
	: m_data(other.m_data)
	{
		printf("VEC0 - Construtor 2\n");
		for(T tmp: other.m_data){
			printf("VEC0 - Construtor 2 - other.m_data %.9f\n", tmp);
		}
		for(T tmp: m_data){
			printf("VEC0 - Construtor 2 - this->m_data %.9f\n", tmp);
		}
	}
	
	vec_impl0(vec_impl0&& other)
	: m_data(std::move(other.m_data)){
		printf("VEC0 - Construtor 3\n");
	}

	template<typename... Vals, enable_if_all<(sizeof...(Vals)) == extent, std::is_convertible<Vals, value_type>::value...> = 0>
	explicit constexpr vec_impl0(Vals&&... vals) noexcept
		: m_data {std::forward<Vals>(vals)...} 
	{
		printf("VEC0 - Construtor 4\n");
		for(T tmp: m_data){
			printf("VEC0 - Construtor 4 - this->m_data %.9f\n", tmp);
		}
	}

	// []
	template<typename Index, Tn D = dim, std::enable_if_t<D == 1, char> = 0>
	constexpr const T& operator[](Index index) const noexcept {
		printf("VEC0 - operator[] %d\n", index);
		return m_data[std::forward<Index>(index)];
	}
};

template<typename T, typename Tn, Tn... Ns>
struct vec_impl1<T, std::integer_sequence<Tn, Ns...>>
	{
	static_assert(std::is_trivial<T>::value, "Vec element type is not trivial!\n");

	static constexpr auto dim	 = sizeof...(Ns);
	static constexpr auto extent = integral_seq_mul<Tn, Ns...>::value;

	using value_type = T;

   private:
	std::array<T, extent> m_data;

   public:
	/// construct
	constexpr vec_impl1() = default;
	
	vec_impl1(vec_impl1& other)
	: m_data(other.m_data)
	{
		printf("VEC1 - Construtor 1\n");
	}
	
	vec_impl1(const vec_impl1& other)
	: m_data(other.m_data)
	{
		printf("VEC1 - Construtor 2\n");
		for(T tmp: other.m_data){
			printf("VEC1 - Construtor 2 - other.m_data %.9f\n", tmp);
		}
		for(T tmp: m_data){
			printf("VEC1 - Construtor 2 - this->m_data %.9f\n", tmp);
		}
	}
	
	vec_impl1(vec_impl1&& other)
	: m_data(std::move(other.m_data)){
		printf("VEC1 - Construtor 3\n");
	}

	template<typename... Vals, enable_if_all<(sizeof...(Vals)) == extent, std::is_convertible<Vals, value_type>::value...> = 0>
	explicit constexpr vec_impl1(Vals&&... vals) noexcept
		: m_data {std::forward<Vals>(vals)...} 
	{
		printf("VEC1 - Construtor 4\n");
		for(T tmp: m_data){
			printf("VEC1 - Construtor 4 - this->m_data %.9f\n", tmp);
		}
	}

	// []
	template<typename Index, Tn D = dim, std::enable_if_t<D == 1, char> = 0>
	constexpr const T& operator[](Index index) const noexcept {
		printf("VEC1 - operator[] %d\n", index);
		return m_data[std::forward<Index>(index)];
	}
};

lib.cu:

#include "lib.h"

CMakerLists.txt

cmake_minimum_required(VERSION 3.15)    # support relative path

project(Test 
    LANGUAGES   CXX CUDA)
	
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
  message("-- cuda-compiler " ${CMAKE_CUDA_COMPILER})
else()
  message(STATUS "No CUDA support")
endif()
set(CUDA_FOUND ${CMAKE_CUDA_COMPILER})

set(CMAKE_CUDA_ARCHITECTURES native)

if (CUDA_FOUND)

 add_library(lib_test)

target_compile_options(lib_test
  PUBLIC        $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-g> --expt-extended-lambda --expt-relaxed-constexpr --default-stream=per-thread --use_fast_math -lineinfo --ptxas-options=-allow-expensive-optimizations=true>
)

target_compile_features(lib_test PRIVATE cuda_std_17)
set_target_properties(lib_test
  PROPERTIES  CUDA_EXTENSIONS ON
			  CUDA_SEPARABLE_COMPILATION OFF
			  CUDA_RESOLVE_DEVICE_SYMBOLS OFF
			  POSITION_INDEPENDENT_CODE ON
			  #LINKER_LANGUAGE CUDA
)
target_compile_definitions(lib_test 
  PUBLIC        CMAKE_GENERATOR_PLATFORM=x64
)

target_sources(lib_test
    PRIVATE     lib.cu
)
target_precompile_headers(lib_test 
    INTERFACE   lib.h
)

add_executable(test)

target_compile_options(test
  PRIVATE     $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-g> --expt-extended-lambda --expt-relaxed-constexpr --default-stream=per-thread --use_fast_math -lineinfo --ptxas-options=-allow-expensive-optimizations=true>
)

target_compile_features(test PRIVATE cuda_std_17)

target_link_libraries(test
	PRIVATE lib_test
)

set_target_properties(test
  PROPERTIES  CUDA_EXTENSIONS ON
			  CUDA_SEPARABLE_COMPILATION OFF
			  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
install(TARGETS
	test
)

target_sources(test
    PRIVATE     test.cu
)

endif()

Compile Script:

@echo off
setlocal
cd /d %~dp0
reg Query "HKLM\Hardware\Description\System\CentralProcessor\0" | find /i "x86" > NUL && set arch_command_length=32 || set arch_command_length=64
echo "%PROCESSOR_ARCHITECTURE%" | find /i "arm" > NUL && goto ARM || goto AMD

:ARM
IF %arch_command_length% == 32 (
	set arch_string=x86_arm
	set arch_command_length_string=x86
) ELSE (
	set arch_string=amd64_arm64
	set arch_command_length_string=x64
)
goto VC_VARC_INIT
:AMD
IF %arch_command_length% == 32 (
	set arch_string=x86
	set arch_command_length_string=x86
) ELSE (
	set arch_string=amd64
	set arch_command_length_string=x64
)
goto VC_VARC_INIT
:VC_VARC_INIT
call vcvarsall.bat %arch_string% -vcvars_ver=
set compiler_path=%VCToolsInstallDir%\bin\Host%arch_command_length_string%\%arch_command_length_string%\cl.exe
set linker_path=%VCToolsInstallDir%\bin\Host%arch_command_length_string%\%arch_command_length_string%\link.exe
set archiver_path=%VCToolsInstallDir%\bin\Host%arch_command_length_string%\%arch_command_length_string%\lib.exe
call "%DevEnvDir%\COMMONEXTENSIONS\MICROSOFT\CMAKE\CMake\bin\cmake.exe" -G "Ninja Multi-Config" -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_AR:FILEPATH="%archiver_path%" -DCMAKE_LINKER:FILEPATH="%linker_path%" -DCMAKE_INSTALL_PREFIX:PATH="%CD%\out\install" -DCMAKE_CXX_COMPILER:FILEPATH="%compiler_path%" -DCMAKE_C_COMPILER:FILEPATH="%compiler_path%" -DCMAKE_MAKE_PROGRAM="%DevEnvDir%\COMMONEXTENSIONS\MICROSOFT\CMAKE\Ninja\ninja.exe" "%CD%\.."
call "%DevEnvDir%\COMMONEXTENSIONS\MICROSOFT\CMAKE\CMake\bin\cmake.exe" --build . --target install --config Release

Produces following output on my machine:

BASE0 - Construtor 0
BASE1 - Construtor 0
VEC0 - Construtor 4
VEC0 - Construtor 4 - this->m_data 1.000000000
VEC0 - Construtor 4 - this->m_data 2.000000000
VEC0 - Construtor 4 - this->m_data 3.000000000
VEC1 - Construtor 4
VEC1 - Construtor 4 - this->m_data 1.000000000
VEC1 - Construtor 4 - this->m_data 2.000000000
VEC1 - Construtor 4 - this->m_data 3.000000000
VEC0 - operator[] 2
VEC0 - operator[] 1
VEC0 - operator[] 0
1.000000000 2.000000000 3.000000000
VEC1 - operator[] 2
VEC1 - operator[] 1
VEC1 - operator[] 0
1.000000000 2.000000000 3.000000000
DEVICE:
BASE0 - Construtor 0
BASE1 - Construtor 0
VEC0 - Construtor 2
VEC0 - Construtor 2 - other.m_data 1.000000000
VEC0 - Construtor 2 - other.m_data 2.000000000
VEC0 - Construtor 2 - other.m_data 3.000000000
VEC0 - Construtor 2 - this->m_data 1.000000000
VEC0 - Construtor 2 - this->m_data 2.000000000
VEC0 - Construtor 2 - this->m_data 3.000000000
VEC1 - Construtor 2
VEC1 - Construtor 2 - other.m_data 1.000000000
VEC1 - Construtor 2 - other.m_data 2.000000000
VEC1 - Construtor 2 - other.m_data 3.000000000
VEC1 - Construtor 2 - this->m_data 1.000000000
VEC1 - Construtor 2 - this->m_data 2.000000000
VEC1 - Construtor 2 - this->m_data 3.000000000
VEC0 - operator[] 0
VEC0 - operator[] 1
VEC0 - operator[] 2
0.000000000 1.000000000 2.000000000
VEC1 - operator[] 0
VEC1 - operator[] 1
VEC1 - operator[] 2
1.000000000 2.000000000 3.000000000
HOST:
BASE0 - Construtor 0
BASE1 - Construtor 0
VEC0 - Construtor 2
VEC0 - Construtor 2 - other.m_data 1.000000000
VEC0 - Construtor 2 - other.m_data 2.000000000
VEC0 - Construtor 2 - other.m_data 3.000000000
VEC0 - Construtor 2 - this->m_data 1.000000000
VEC0 - Construtor 2 - this->m_data 2.000000000
VEC0 - Construtor 2 - this->m_data 3.000000000
VEC0 - operator[] 2
VEC0 - operator[] 1
VEC0 - operator[] 0
1.000000000 2.000000000 3.000000000
VEC1 - Construtor 2
VEC1 - Construtor 2 - other.m_data 1.000000000
VEC1 - Construtor 2 - other.m_data 2.000000000
VEC1 - Construtor 2 - other.m_data 3.000000000
VEC1 - Construtor 2 - this->m_data 1.000000000
VEC1 - Construtor 2 - this->m_data 2.000000000
VEC1 - Construtor 2 - this->m_data 3.000000000
VEC1 - operator[] 2
VEC1 - operator[] 1
VEC1 - operator[] 0
1.000000000 2.000000000 3.000000000

Note the first device call. The values printed vary and do not match the values passed.
No problem occures for a type not inheriting from base classes (and iirc also not when only inheriting from one base class).

System parameters:
Windows 10.
CMake output:

**********************************************************************
** Visual Studio 2022 Developer Command Prompt v17.4.4
** Copyright (c) 2022 Microsoft Corporation
**********************************************************************
[vcvarsall.bat] Environment initialized for: 'x64'
-- The CXX compiler identification is MSVC 19.34.31937.0
-- The CUDA compiler identification is NVIDIA 12.1.66
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.34.31933//bin/Hostx64/x64/cl.exe - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- cuda-compiler C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe
-- Configuring done
-- Generating done

Seems like several other errors occure with the original struct, but I could not create a minimal example for this.
1.) The compiler does not report when calling the structs function form device code, although they are not marked with device.
2.) Passing the struct form device code as argument to another device function seems to corrupt memory in a similiar way, regardless of how many base classes the struct has.