Hey Detlef,
Thanks for getting back to me so quickly. Here’s the missing system information:
- OS version: Windows 10 (64-bit)
- GPUs:
- NVIDIA GeForce RTX 3060, compute-capability=8.6, total-memory=11GB
- NVIDIA GeForce RTX 2060, compute-capability=7.5, total-memory=11GB
- Optix version: 8.0.0
- CUDA toolkit version: 12.1.105
- Host compiler: MSVC 19.29.30146.0
- Optix-IR: Yes
Here’s the CMake macro I am using to generate my code. I think it might come from the OptiX advanced samples, but I modified it to use a custom marker 0xfeedf00d
to be able to embed optix-ir code in my binary:
macro(cuda_compile_and_embed is_debug output_var cuda_file)
set(c_var_name ${output_var})
set(nvcc_options "--use_fast_math --keep --relocatable-device-code=true --optix-ir --expt-relaxed-constexpr --std=c++17 --machine=64 --gpu-architecture=compute_50 -Wno-deprecated-gpu-targets --resource-usage")
if (${is_debug})
message(WARNING "Building Optix-IR code with debug enabled, this hurts performance.")
# Use -G and --generate-line-info for debugging
cuda_compile_ptx(ptx_files ${cuda_file} OPTIONS -G --generate-line-info ${nvcc_options})
else ()
message(STATUS "Building optimized Optix-IR")
cuda_compile_ptx(ptx_files ${cuda_file} OPTIONS ${nvcc_options})
endif ()
list(GET ptx_files 0 ptx_file)
set(embedded_file ${ptx_file}_embedded.c)
message(STATUS "Adding rule to compile and embed ${cuda_file} to \"const char ${var_name}[];\". Debug is ${is_debug}")
add_custom_command(
OUTPUT ${embedded_file}
COMMAND ${BIN2C} -c --padd 0xfe,0xed,0xf0,0x0d --type char --name ${c_var_name} ${ptx_file} > ${embedded_file}
DEPENDS ${ptx_file}
COMMENT "compiling (and embedding optix-ir from) ${cuda_file}"
)
set(${output_var} ${embedded_file})
endmacro()
Here is what my macro definitions look like - they are similar to the ones you posted, only difference is that I don’t have an inline on the RS_CPU_GPU macro when it’s compiled as host code:
// #if defined(CUDA_GPU_ENABLED) && defined(__CUDACC__)
//
// // These macros become active when compiling CUDA code via nvcc. They should be used to embellish functions and methods
// // to make them available for use in the CUDA code.
// #define RS_CPU_GPU __forceinline__ __device__ __host__
// #define RS_GPU __forceinline__ __device__
//
// #else
//
// // These macros become active when compiling regular host code with a regular compiler, i.e. not nvcc.
// #define RS_CPU_GPU
// #define RS_GPU
//
// #endif
I put your suggestions in place:
- Remove the constructor.
- Prefix my functions with
RS_CPU_GPU
.
None of these worked, I am getting a similar error but not the same:
2024-01-24 10:33:20.388 ( 245.111s) [GpuWorker-1 ] OptixRenderDevice.cpp:18 ERR| COMPILE ERROR: Malformed input. See compile details for more information.
Error: Taking the address of functions is illegal because indirect function calls are illegal. Address of function is taken and used in function _ZN12kernelshared14FixedSizeStackIN6cgmath11RGBSpectrumELh32EEC1Ev (C:/Users/Thomas/dev/Rayscaper/src/cpp/kernelshared/textures/FixedSizeStack.h:12:7)
2024-01-24 10:33:20.389 ( 245.112s) [GpuWorker-1 ] OptixIntegrator.cpp:116 FATL| FATAL: C:\Users\Thomas\dev\Rayscaper\src\cpp\kernelgpu\OptixIntegrator.cpp:116 (optixModuleCreate(optix_device_context_, &module_compile_options, &pipeline_compile_options, (const char*)embedded_optix_code, optix_ir_code_len, log_buffer, &log_buffer_size, &optix_module_)) failed with result: Invalid input [OPTIX_ERROR_INVALID
_INPUT]
The direct callable chain like this (WARN: wall of code):
extern "C" __device__ cgmath::Spectrum __direct_callable__eval_complex_spectrum_texture(
const OptixSpectrumTexture& texture, const kernelshared::TextureEvalContext& ctx) {
return ComplexSpectrumTextureEval(texture, ctx);
}
RS_CPU_GPU cgmath::Spectrum ComplexSpectrumTextureEval(const OptixSpectrumTexture& texture,
const kernelshared::TextureEvalContext& ctx) {
return ComplexTexEval<cgmath::Spectrum>(texture, ctx, kOptixLaunchParams.spectrum_textures_);
}
// Implementation of evaluation of complex textures on the GPU. Our definition of a complex texture is any texture
// that can recursively contain other textures. For example the checkerboard texture.
//
// Recursive function calls aren't allowed on the GPU, hence we evaluate textures using fixed size stacks. First, we
// have an input stack, where we push all textures that still require evaluation. After evaluation, the texture writes
// its values to the output stack, textures can also consume the values from their children from the output stack.
//
// A limitation of this setup is that we can't mix texture types. For example, we cannot evaluate a float texture
// required to evaluate a Spectrum texture. This mainly cripples the usefulness of the mix texture node.
//
// We took a little inspiration from Cycles' SVM (Shader Virtual Machine) although our textures are a lot less complex
// to evaluate than some of Blender's textures.
template <typename TexelType>
RS_CPU_GPU inline TexelType ComplexTexEval(const OptixTexture<TexelType>& texture,
const kernelshared::TextureEvalContext& ctx,
rstd::span<const OptixTexture<TexelType>> textures) {
using namespace kernelshared;
struct StackItem {
const OptixTexture<TexelType>* texture_;
bool pushed_children_;
};
FixedSizeStack<StackItem, 32> input_stack;
FixedSizeStack<TexelType, 32> output_stack;
input_stack.Push({&texture, false});
while (!input_stack.IsEmpty()) {
const OptixTexture<TexelType>* current_texture = input_stack.Peek().texture_;
const bool pushed_children = input_stack.Peek().pushed_children_;
input_stack.Pop();
switch (current_texture->GetType()) {
case OptixTextureType::kCheckers2d: {
if (pushed_children) {
const TexelType odd = output_stack.Pop();
const TexelType even = output_stack.Pop();
const cgmath::Point2d st = current_texture->checkers_texture_.tex_mapping_.Map(ctx);
output_stack.Push(EvalCheckers(st, even, odd));
} else {
input_stack.Push({current_texture, true});
input_stack.Push({&textures[current_texture->checkers_texture_.odd_texture_index_], false});
input_stack.Push({&textures[current_texture->checkers_texture_.even_texture_index_], false});
}
break;
}
case OptixTextureType::kConstant: {
output_stack.Push(current_texture->const_texture_.value_);
break;
}
case OptixTextureType::kImage: {
auto image_coords = kernelshared::CalcImageCoordinate(current_texture->image_texture_.image_.GetImageSize(),
ctx,
current_texture->image_texture_.tex_mapping_,
current_texture->image_texture_.wrap_mode_);
if (!image_coords.has_value()) {
output_stack.Push(TexelType{});
} else {
output_stack.Push(kernelshared::TransformTexel<TexelType>(
current_texture->image_texture_.image_.template GetValue<TexelType>(image_coords->x, image_coords->y),
current_texture->image_texture_.inverse_gamma_correct_,
current_texture->image_texture_.invert_));
}
break;
}
case OptixTextureType::kMix: {
if (pushed_children) {
const TexelType input_a = output_stack.Pop();
const TexelType input_b = output_stack.Pop();
float blend_amount = current_texture->mix_texture_.blend_amount_;
output_stack.Push(cgmath::Lerp<TexelType>(blend_amount, input_a, input_b));
} else {
input_stack.Push({current_texture, true});
input_stack.Push({&textures[current_texture->mix_texture_.input_a_texture_index_], false});
input_stack.Push({&textures[current_texture->mix_texture_.input_b_texture_index_], false});
}
break;
}
case OptixTextureType::kNoise: {
if (pushed_children) {
const TexelType input_a = output_stack.Pop();
const TexelType input_b = output_stack.Pop();
const cgmath::Point3d uvw = current_texture->noise_texture_.texture_mapping_.Map(ctx);
output_stack.Push(EvalNoise<TexelType>(current_texture->noise_texture_.noise_type_,
uvw,
current_texture->noise_texture_.omega_,
current_texture->noise_texture_.num_octaves_,
input_a,
input_b));
} else {
input_stack.Push({current_texture, true});
input_stack.Push({&textures[current_texture->noise_texture_.input_b_texture_index_], false});
input_stack.Push({&textures[current_texture->noise_texture_.input_a_texture_index_], false});
}
break;
}
case OptixTextureType::kNull: {
output_stack.Push(GetFallbackValue<TexelType>());
break;
}
case OptixTextureType::kUvTexture: {
cgmath::Point2d st = current_texture->uv_texture_.tex_mapping_.Map(ctx);
output_stack.Push(EvalUvTexture<TexelType>(st));
break;
}
default: {
// Unknown texture type -> bail out.
return GetFallbackValue<TexelType>();
}
}
// Bail out on a stack overflow.
if (input_stack.IsFull() || output_stack.IsFull()) {
return GetFallbackValue<TexelType>();
}
}
return output_stack.Peek();
}
Also attached the Spectrum class implementation for completeness:
spectrum.txt (7.7 KB)
I will follow up to see if changing my driver or CUDA toolkit versions helps.
Regards,
Thomas