Thanks for the clarifications. And also thanks for reiterating that I should spend more time looking at your example code. The temptation is strong to ask for help first and look at code later.
I was able to rework the optixTriangle example to use two triangles, each in its own respective GAS, and then instancing each GAS with identity transform to an IAS. This is the only change I made, and I can now see both triangles on the screen!
//
// SPLIT TRIANGLES INTO TWO GAS, UNDER A SINGLE IAS
//
// Note that we also need to set:
// pipeline_compile_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
// and max traversal depth to 2:
// OPTIX_CHECK( optixPipelineSetStackSize( pipeline, direct_callable_stack_size_from_traversal,
// direct_callable_stack_size_from_state, continuation_stack_size,
// 2 // maxTraversableDepth
// ) );
// Build options shared by both GAS builds and the IAS build below.
// NOTE(review): OPTIX_BUILD_FLAG_ALLOW_COMPACTION / PREFER_FAST_TRACE are
// common production additions; OPTIX_BUILD_FLAG_NONE matches the sample.
OptixAccelBuildOptions accel_options = {};
accel_options.buildFlags = OPTIX_BUILD_FLAG_NONE;
accel_options.operation = OPTIX_BUILD_OPERATION_BUILD;

// Four vertices shared by both triangles; each GAS selects its own triangle
// via a separate index buffer.
const std::array<float3, 4> vertices =
{ {
{ -0.5f, -0.5f, 0.0f },
{ 0.5f, -0.5f, 0.0f },
{ 0.0f, 0.5f, 0.0f },
{ 1.0f, 0.5f, 0.0f }
} };

// FIX: these were declared std::array<uint3, 2> while holding a single
// triplet each, so a zero-filled {0,0,0} triplet was also allocated and
// copied to the device — unused, since numIndexTriplets is 1 in each build
// input. Size 1 keeps the device buffer consistent with the build input.
const std::array<uint3, 1> indices1 =
{ {
{ 0, 1, 2 },
} };
const std::array<uint3, 1> indices2 =
{ {
{ 1, 3, 2 }
} };

CUdeviceptr d_vertices;
CUdeviceptr d_indices1;
CUdeviceptr d_indices2;

// Upload the shared vertex buffer.
const size_t vertices_size = sizeof(float3) * vertices.size();
CUDA_CHECK( cudaMalloc( reinterpret_cast<void**>( &d_vertices ), vertices_size ) );
CUDA_CHECK( cudaMemcpy( reinterpret_cast<void*>( d_vertices ), vertices.data(), vertices_size, cudaMemcpyHostToDevice ) );

// Upload one index buffer per GAS.
const size_t indices_size1 = sizeof(uint3) * indices1.size();
CUDA_CHECK(cudaMalloc( reinterpret_cast<void**>( &d_indices1 ), indices_size1 ) );
CUDA_CHECK(cudaMemcpy( reinterpret_cast<void*>( d_indices1 ), indices1.data(), indices_size1, cudaMemcpyHostToDevice ) );
const size_t indices_size2 = sizeof(uint3) * indices2.size();
CUDA_CHECK(cudaMalloc( reinterpret_cast<void**>( &d_indices2 ), indices_size2 ) );
CUDA_CHECK(cudaMemcpy( reinterpret_cast<void*>( d_indices2 ), indices2.data(), indices_size2, cudaMemcpyHostToDevice ) );
const uint32_t triangle_input_flags[1] = { OPTIX_GEOMETRY_FLAG_NONE };
OptixTraversableHandle gas_handle1;
CUdeviceptr d_gas_handle1;
OptixTraversableHandle gas_handle2;
CUdeviceptr d_gas_handle2;
OptixBuildInput triangle_input1 = {};
triangle_input1.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
triangle_input1.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
triangle_input1.triangleArray.numVertices = static_cast<uint32_t>( vertices.size() );
triangle_input1.triangleArray.vertexBuffers = &d_vertices;
triangle_input1.triangleArray.flags = triangle_input_flags;
triangle_input1.triangleArray.numSbtRecords = 1;
triangle_input1.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
triangle_input1.triangleArray.numIndexTriplets = 1;
// triangle_input1.triangleArray.primitiveIndexOffset = 0;
triangle_input1.triangleArray.indexBuffer = d_indices1;
OptixBuildInput triangle_input2 = {};
triangle_input2.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
triangle_input2.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
triangle_input2.triangleArray.numVertices = static_cast<uint32_t>( vertices.size() );
triangle_input2.triangleArray.vertexBuffers = &d_vertices;
triangle_input2.triangleArray.flags = triangle_input_flags;
triangle_input2.triangleArray.numSbtRecords = 1;
triangle_input2.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
triangle_input2.triangleArray.numIndexTriplets = 1;
// triangle_input2.triangleArray.primitiveIndexOffset = 0;
triangle_input2.triangleArray.indexBuffer = d_indices2;
OptixAccelBufferSizes gas_buffer_sizes;
OPTIX_CHECK( optixAccelComputeMemoryUsage(
context,
&accel_options,
&triangle_input1,
1, // Number of build inputs
&gas_buffer_sizes
) );
CUdeviceptr d_temp_buffer_gas;
CUDA_CHECK( cudaMalloc( reinterpret_cast<void**>( &d_temp_buffer_gas ), gas_buffer_sizes.tempSizeInBytes ) );
CUDA_CHECK( cudaMalloc( reinterpret_cast<void**>( &d_gas_handle1 ), gas_buffer_sizes.outputSizeInBytes ) );
OPTIX_CHECK( optixAccelBuild(
context, 0, // CUDA stream
&accel_options,
&triangle_input1,
1, // num build inputs
d_temp_buffer_gas, gas_buffer_sizes.tempSizeInBytes,
d_gas_handle1, gas_buffer_sizes.outputSizeInBytes,
&gas_handle1,
nullptr, 0
) );
CUDA_CHECK( cudaFree( reinterpret_cast<void*>( d_temp_buffer_gas ) ) );
OPTIX_CHECK( optixAccelComputeMemoryUsage(
context,
&accel_options,
&triangle_input2,
1, // Number of build inputs
&gas_buffer_sizes
) );
CUDA_CHECK( cudaMalloc( reinterpret_cast<void**>( &d_temp_buffer_gas ), gas_buffer_sizes.tempSizeInBytes ) );
CUDA_CHECK( cudaMalloc( reinterpret_cast<void**>( &d_gas_handle2 ), gas_buffer_sizes.outputSizeInBytes ) );
OPTIX_CHECK( optixAccelBuild(
context, 0, // CUDA stream
&accel_options,
&triangle_input2,
1, // num build inputs
d_temp_buffer_gas, gas_buffer_sizes.tempSizeInBytes,
d_gas_handle2, gas_buffer_sizes.outputSizeInBytes,
&gas_handle2,
nullptr, 0
) );
CUDA_CHECK( cudaFree( reinterpret_cast<void*>( d_temp_buffer_gas ) ) );
const float transform[12] = // identity since all points are in world coordinates
{
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f
};
OptixInstance instances[2];
instances[0] = {};
memcpy(instances[0].transform, transform, sizeof(float) * 12);
instances[0].instanceId = 0;
instances[0].visibilityMask = OptixVisibilityMask(255);
instances[0].sbtOffset = 0;
instances[0].flags = OPTIX_INSTANCE_FLAG_NONE;
instances[0].traversableHandle = gas_handle1;
instances[1] = {};
memcpy(instances[1].transform, transform, sizeof(float) * 12);
instances[1].instanceId = 1;
instances[1].visibilityMask = OptixVisibilityMask(255);
instances[1].sbtOffset = 0;
instances[1].flags = OPTIX_INSTANCE_FLAG_NONE;
instances[1].traversableHandle = gas_handle2;
CUdeviceptr d_instances;
size_t instances_size = sizeof(OptixInstance) * 2;
CUDA_CHECK( cudaMalloc( (void**)&d_instances, instances_size ) );
CUDA_CHECK( cudaMemcpy( (void*)d_instances, instances, instances_size, cudaMemcpyHostToDevice ) );
OptixBuildInput instance_input = {};
instance_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
instance_input.instanceArray.instances = d_instances;
instance_input.instanceArray.numInstances = 2;
// Size, allocate, and build the IAS over the two instances.
OptixAccelBufferSizes ias_buffer_sizes = {};
OPTIX_CHECK( optixAccelComputeMemoryUsage(
    context,
    &accel_options,
    &instance_input,
    1,
    &ias_buffer_sizes
) );

CUdeviceptr d_ias;
CUdeviceptr d_tmp;
CUDA_CHECK( cudaMalloc( (void**)&d_ias, ias_buffer_sizes.outputSizeInBytes ) );
CUDA_CHECK( cudaMalloc( (void**)&d_tmp, ias_buffer_sizes.tempSizeInBytes ) );

OptixTraversableHandle ias_handle;
OPTIX_CHECK( optixAccelBuild(
    context, 0,
    &accel_options,
    &instance_input,
    1,
    d_tmp, ias_buffer_sizes.tempSizeInBytes,
    d_ias, ias_buffer_sizes.outputSizeInBytes,
    &ias_handle,
    nullptr, 0
) );

// Wait for the build to finish before freeing buffers it may still read;
// the instance array and scratch space are not needed after the build.
CUDA_CHECK( cudaStreamSynchronize( 0 ) );
CUDA_CHECK( cudaFree( (void*)d_tmp ) );
CUDA_CHECK( cudaFree( (void*)d_instances ) );
I am puzzled that I did not have to make any changes to the SBT. I expected that I would need to add a hit program entry for the second GAS.
Also, it was necessary to split the mesh indices into two device arrays, one for each triangle build input. I tried using a single device array containing both index triplets, pointing both build inputs at it and setting primitiveIndexOffset accordingly, but that did not seem to work.