Following code example:
test.cu
#include "lib.h"
__global__ void test(vec0<float, 3> abc){
printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}
__global__ void test(vec1<float, 3> abc){
printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}
void test_host(vec0<float, 3> abc){
printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}
void test_host(vec1<float, 3> abc){
printf("%.9f %.9f %.9f\n", abc[0], abc[1], abc[2]);
}
int main(){
const vec0<float, 3> abc0{1.0f, 2.0f, 3.0f};
const vec1<float, 3> abc1{1.0f, 2.0f, 3.0f};
printf("%.9f %.9f %.9f\n", abc0[0], abc0[1], abc0[2]);
printf("%.9f %.9f %.9f\n", abc1[0], abc1[1], abc1[2]);
printf("DEVICE:\n");
test<<<1, 1>>>(abc0);
test<<<1, 1>>>(abc1);
cudaDeviceSynchronize();
printf("HOST:\n");
test_host(abc0);
test_host(abc1);
}
lib.h
#include <functional>
#include <type_traits>
#include <utility>
#include <array>
#include <iostream>
/// use some designs from https://github.com/taocpp/sequences/
namespace meta_impl {
template<typename T, T... Ns>
constexpr T integral_seq_mul_impl() noexcept {
T res {1};
(void) std::initializer_list<int> {(res *= Ns, 0)...};
return res;
}
template<typename T, T... Ns>
constexpr T integral_seq_sum_impl() noexcept {
T res {0};
(void) std::initializer_list<int> {(res += Ns, 0)...};
return res;
}
}// namespace meta_impl
template<typename T>
struct value_seq_traits_impl;
template<typename T, T... Ns>
struct value_seq_traits_impl<std::integer_sequence<T, Ns...>> {
static constexpr T sum = meta_impl::integral_seq_sum_impl<T, Ns...>();
static constexpr T prod = meta_impl::integral_seq_mul_impl<T, Ns...>();
};
template<typename T, T... Ns>
using value_seq_traits = value_seq_traits_impl<std::integer_sequence<T, Ns...>>;
/// seq reduction
template<typename T, T... Ns>
using integral_seq_sum = std::integral_constant<T, value_seq_traits<T, ((Ns > 0) ? Ns : 0)...>::sum - value_seq_traits<T, ((Ns < 0) ? -Ns : 0)...>::sum>;
template<typename T, T... Ns>
using integral_seq_mul = std::integral_constant<T, value_seq_traits<T, Ns...>::prod>;
template<bool... Bs>
using enable_if_all = typename std::enable_if<std::conjunction<std::integral_constant<bool, Bs>...>::value, char>::type;
struct base0 {
base0(){
printf("BASE0 - Construtor 0\n");
}
base0(base0& other){
printf("BASE0 - Construtor 1\n");
}
base0(const base0& other){
printf("BASE0 - Construtor 2\n");
}
base0(base0&& other){
printf("BASE0 - Construtor 3\n");
}
};
struct base1 {
base1(){
printf("BASE1 - Construtor 0\n");
}
base1(base1& other){
printf("BASE1 - Construtor 1\n");
}
base1(const base1& other){
printf("BASE1 - Construtor 2\n");
}
base1(base1&& other){
printf("BASE1 - Construtor 3\n");
}
};
/// declarations
template<typename T, typename Extents>
struct vec_impl0;
template<typename T, typename Extents>
struct vec_impl1;
template<typename T, int... Ns>
using vec0 = vec_impl0<T, std::integer_sequence<int, Ns...>>;
template<typename T, int... Ns>
using vec1 = vec_impl1<T, std::integer_sequence<int, Ns...>>;
/// vec
template<typename T, typename Tn, Tn... Ns>
struct vec_impl0<T, std::integer_sequence<Tn, Ns...>>
: base0
, base1 {
static_assert(std::is_trivial<T>::value, "Vec element type is not trivial!\n");
static constexpr auto dim = sizeof...(Ns);
static constexpr auto extent = integral_seq_mul<Tn, Ns...>::value;
using value_type = T;
private:
std::array<T, extent> m_data;
public:
/// construct
constexpr vec_impl0() = default;
vec_impl0(vec_impl0& other)
: m_data(other.m_data)
{
printf("VEC0 - Construtor 1\n");
}
vec_impl0(const vec_impl0& other)
: m_data(other.m_data)
{
printf("VEC0 - Construtor 2\n");
for(T tmp: other.m_data){
printf("VEC0 - Construtor 2 - other.m_data %.9f\n", tmp);
}
for(T tmp: m_data){
printf("VEC0 - Construtor 2 - this->m_data %.9f\n", tmp);
}
}
vec_impl0(vec_impl0&& other)
: m_data(std::move(other.m_data)){
printf("VEC0 - Construtor 3\n");
}
template<typename... Vals, enable_if_all<(sizeof...(Vals)) == extent, std::is_convertible<Vals, value_type>::value...> = 0>
explicit constexpr vec_impl0(Vals&&... vals) noexcept
: m_data {std::forward<Vals>(vals)...}
{
printf("VEC0 - Construtor 4\n");
for(T tmp: m_data){
printf("VEC0 - Construtor 4 - this->m_data %.9f\n", tmp);
}
}
// []
template<typename Index, Tn D = dim, std::enable_if_t<D == 1, char> = 0>
constexpr const T& operator[](Index index) const noexcept {
printf("VEC0 - operator[] %d\n", index);
return m_data[std::forward<Index>(index)];
}
};
template<typename T, typename Tn, Tn... Ns>
struct vec_impl1<T, std::integer_sequence<Tn, Ns...>>
{
static_assert(std::is_trivial<T>::value, "Vec element type is not trivial!\n");
static constexpr auto dim = sizeof...(Ns);
static constexpr auto extent = integral_seq_mul<Tn, Ns...>::value;
using value_type = T;
private:
std::array<T, extent> m_data;
public:
/// construct
constexpr vec_impl1() = default;
vec_impl1(vec_impl1& other)
: m_data(other.m_data)
{
printf("VEC1 - Construtor 1\n");
}
vec_impl1(const vec_impl1& other)
: m_data(other.m_data)
{
printf("VEC1 - Construtor 2\n");
for(T tmp: other.m_data){
printf("VEC1 - Construtor 2 - other.m_data %.9f\n", tmp);
}
for(T tmp: m_data){
printf("VEC1 - Construtor 2 - this->m_data %.9f\n", tmp);
}
}
vec_impl1(vec_impl1&& other)
: m_data(std::move(other.m_data)){
printf("VEC1 - Construtor 3\n");
}
template<typename... Vals, enable_if_all<(sizeof...(Vals)) == extent, std::is_convertible<Vals, value_type>::value...> = 0>
explicit constexpr vec_impl1(Vals&&... vals) noexcept
: m_data {std::forward<Vals>(vals)...}
{
printf("VEC1 - Construtor 4\n");
for(T tmp: m_data){
printf("VEC1 - Construtor 4 - this->m_data %.9f\n", tmp);
}
}
// []
template<typename Index, Tn D = dim, std::enable_if_t<D == 1, char> = 0>
constexpr const T& operator[](Index index) const noexcept {
printf("VEC1 - operator[] %d\n", index);
return m_data[std::forward<Index>(index)];
}
};
lib.cu:
#include "lib.h"
CMakerLists.txt
cmake_minimum_required(VERSION 3.15) # support relative path
project(Test
LANGUAGES CXX CUDA)
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
message("-- cuda-compiler " ${CMAKE_CUDA_COMPILER})
else()
message(STATUS "No CUDA support")
endif()
set(CUDA_FOUND ${CMAKE_CUDA_COMPILER})
set(CMAKE_CUDA_ARCHITECTURES native)
if (CUDA_FOUND)
add_library(lib_test)
target_compile_options(lib_test
PUBLIC $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-g> --expt-extended-lambda --expt-relaxed-constexpr --default-stream=per-thread --use_fast_math -lineinfo --ptxas-options=-allow-expensive-optimizations=true>
)
target_compile_features(lib_test PRIVATE cuda_std_17)
set_target_properties(lib_test
PROPERTIES CUDA_EXTENSIONS ON
CUDA_SEPARABLE_COMPILATION OFF
CUDA_RESOLVE_DEVICE_SYMBOLS OFF
POSITION_INDEPENDENT_CODE ON
#LINKER_LANGUAGE CUDA
)
target_compile_definitions(lib_test
PUBLIC CMAKE_GENERATOR_PLATFORM=x64
)
target_sources(lib_test
PRIVATE lib.cu
)
target_precompile_headers(lib_test
INTERFACE lib.h
)
add_executable(test)
target_compile_options(test
PRIVATE $<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-g> --expt-extended-lambda --expt-relaxed-constexpr --default-stream=per-thread --use_fast_math -lineinfo --ptxas-options=-allow-expensive-optimizations=true>
)
target_compile_features(test PRIVATE cuda_std_17)
target_link_libraries(test
PRIVATE lib_test
)
set_target_properties(test
PROPERTIES CUDA_EXTENSIONS ON
CUDA_SEPARABLE_COMPILATION OFF
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
install(TARGETS
test
)
target_sources(test
PRIVATE test.cu
)
endif()
Compile Script:
@echo off
setlocal
cd /d %~dp0
reg Query "HKLM\Hardware\Description\System\CentralProcessor\0" | find /i "x86" > NUL && set arch_command_length=32 || set arch_command_length=64
echo "%PROCESSOR_ARCHITECTURE%" | find /i "arm" > NUL && goto ARM || goto AMD
:ARM
IF %arch_command_length% == 32 (
set arch_string=x86_arm
set arch_command_length_string=x86
) ELSE (
set arch_string=amd64_arm64
set arch_command_length_string=x64
)
goto VC_VARC_INIT
:AMD
IF %arch_command_length% == 32 (
set arch_string=x86
set arch_command_length_string=x86
) ELSE (
set arch_string=amd64
set arch_command_length_string=x64
)
goto VC_VARC_INIT
:VC_VARC_INIT
call vcvarsall.bat %arch_string% -vcvars_ver=
set compiler_path=%VCToolsInstallDir%\bin\Host%arch_command_length_string%\%arch_command_length_string%\cl.exe
set linker_path=%VCToolsInstallDir%\bin\Host%arch_command_length_string%\%arch_command_length_string%\link.exe
set archiver_path=%VCToolsInstallDir%\bin\Host%arch_command_length_string%\%arch_command_length_string%\lib.exe
call "%DevEnvDir%\COMMONEXTENSIONS\MICROSOFT\CMAKE\CMake\bin\cmake.exe" -G "Ninja Multi-Config" -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_AR:FILEPATH="%archiver_path%" -DCMAKE_LINKER:FILEPATH="%linker_path%" -DCMAKE_INSTALL_PREFIX:PATH="%CD%\out\install" -DCMAKE_CXX_COMPILER:FILEPATH="%compiler_path%" -DCMAKE_C_COMPILER:FILEPATH="%compiler_path%" -DCMAKE_MAKE_PROGRAM="%DevEnvDir%\COMMONEXTENSIONS\MICROSOFT\CMAKE\Ninja\ninja.exe" "%CD%\.."
call "%DevEnvDir%\COMMONEXTENSIONS\MICROSOFT\CMAKE\CMake\bin\cmake.exe" --build . --target install --config Release
Produces following output on my machine:
BASE0 - Construtor 0
BASE1 - Construtor 0
VEC0 - Construtor 4
VEC0 - Construtor 4 - this->m_data 1.000000000
VEC0 - Construtor 4 - this->m_data 2.000000000
VEC0 - Construtor 4 - this->m_data 3.000000000
VEC1 - Construtor 4
VEC1 - Construtor 4 - this->m_data 1.000000000
VEC1 - Construtor 4 - this->m_data 2.000000000
VEC1 - Construtor 4 - this->m_data 3.000000000
VEC0 - operator[] 2
VEC0 - operator[] 1
VEC0 - operator[] 0
1.000000000 2.000000000 3.000000000
VEC1 - operator[] 2
VEC1 - operator[] 1
VEC1 - operator[] 0
1.000000000 2.000000000 3.000000000
DEVICE:
BASE0 - Construtor 0
BASE1 - Construtor 0
VEC0 - Construtor 2
VEC0 - Construtor 2 - other.m_data 1.000000000
VEC0 - Construtor 2 - other.m_data 2.000000000
VEC0 - Construtor 2 - other.m_data 3.000000000
VEC0 - Construtor 2 - this->m_data 1.000000000
VEC0 - Construtor 2 - this->m_data 2.000000000
VEC0 - Construtor 2 - this->m_data 3.000000000
VEC1 - Construtor 2
VEC1 - Construtor 2 - other.m_data 1.000000000
VEC1 - Construtor 2 - other.m_data 2.000000000
VEC1 - Construtor 2 - other.m_data 3.000000000
VEC1 - Construtor 2 - this->m_data 1.000000000
VEC1 - Construtor 2 - this->m_data 2.000000000
VEC1 - Construtor 2 - this->m_data 3.000000000
VEC0 - operator[] 0
VEC0 - operator[] 1
VEC0 - operator[] 2
0.000000000 1.000000000 2.000000000
VEC1 - operator[] 0
VEC1 - operator[] 1
VEC1 - operator[] 2
1.000000000 2.000000000 3.000000000
HOST:
BASE0 - Construtor 0
BASE1 - Construtor 0
VEC0 - Construtor 2
VEC0 - Construtor 2 - other.m_data 1.000000000
VEC0 - Construtor 2 - other.m_data 2.000000000
VEC0 - Construtor 2 - other.m_data 3.000000000
VEC0 - Construtor 2 - this->m_data 1.000000000
VEC0 - Construtor 2 - this->m_data 2.000000000
VEC0 - Construtor 2 - this->m_data 3.000000000
VEC0 - operator[] 2
VEC0 - operator[] 1
VEC0 - operator[] 0
1.000000000 2.000000000 3.000000000
VEC1 - Construtor 2
VEC1 - Construtor 2 - other.m_data 1.000000000
VEC1 - Construtor 2 - other.m_data 2.000000000
VEC1 - Construtor 2 - other.m_data 3.000000000
VEC1 - Construtor 2 - this->m_data 1.000000000
VEC1 - Construtor 2 - this->m_data 2.000000000
VEC1 - Construtor 2 - this->m_data 3.000000000
VEC1 - operator[] 2
VEC1 - operator[] 1
VEC1 - operator[] 0
1.000000000 2.000000000 3.000000000
Note the first device call. The values printed vary and do not match the values passed.
No problem occures for a type not inheriting from base classes (and iirc also not when only inheriting from one base class).
System parameters:
Windows 10.
CMake output:
**********************************************************************
** Visual Studio 2022 Developer Command Prompt v17.4.4
** Copyright (c) 2022 Microsoft Corporation
**********************************************************************
[vcvarsall.bat] Environment initialized for: 'x64'
-- The CXX compiler identification is MSVC 19.34.31937.0
-- The CUDA compiler identification is NVIDIA 12.1.66
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.34.31933//bin/Hostx64/x64/cl.exe - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- cuda-compiler C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.1/bin/nvcc.exe
-- Configuring done
-- Generating done