Instance Acceleration Structure-OptiX 7.1

I am trying to figure out how to build a TLAS properly.
Using the samples that came with OptiX 7.1 and Ingo Wald’s Optix 7 samples, started with a triangle (just the BLAS that holds the geometry) and it works fine (moved the triangle sample of the SDK to Wald’s example framework).
Next I introduced a TLAS with one instance (that is the BLAS from before) and using that TLAS in the shader but I am not getting a single hit.
What am I not doing correctly?

OptixTraversableHandle SampleRenderer::buildAccelerator() {

	OptixTraversableHandle geometryAcceleratorHandle{ 0 };
	CUdeviceptr dAcceleratorBuffer;
	OptixAccelBuildOptions acceleratorOptions{};
	acceleratorOptions.buildFlags = OPTIX_BUILD_FLAG_NONE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
	acceleratorOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

	//Triangle build input: simple list of three vertices
	//const std::array<float3, 3> vertices{ {{ -0.5f, -0.5f, 0.0f },{  0.5f, -0.5f, 0.0f },{  0.0f,  0.5f, 0.0f }} };
	const std::array<float3, 3> vertices{ { { 0.33f, 0.33f, 0.0f },{  0.33f, -0.33f, 0.0f },{  0.66f,  0.33f, 0.0f }} };
	const size_t verticesSize = sizeof(float3) * vertices.size();
	CUdeviceptr dVertices{ 0ull };
	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dVertices), verticesSize));
	CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dVertices), vertices.data(), verticesSize, cudaMemcpyHostToDevice));

	//Build input is a simple list of non-indexed triangle vertices
	const uint32_t triangleInputFlags{ OPTIX_GEOMETRY_FLAG_NONE };
	OptixBuildInput triangleInput{};
	triangleInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
	triangleInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
	triangleInput.triangleArray.numVertices = static_cast<uint32_t>(vertices.size());
	triangleInput.triangleArray.vertexBuffers = &dVertices;
	triangleInput.triangleArray.flags = &triangleInputFlags;
	triangleInput.triangleArray.numSbtRecords = 1u;

	OptixAccelBufferSizes blasBufferSizes;
	OPTIX_CHECK(optixAccelComputeMemoryUsage(optixContext, &acceleratorOptions, &triangleInput, 1, &blasBufferSizes));
	CUdeviceptr dTempBuffer;
	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), blasBufferSizes.tempSizeInBytes));

	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAcceleratorBuffer), blasBufferSizes.outputSizeInBytes));

	OPTIX_CHECK(
		optixAccelBuild(
			optixContext,
			0,
			&acceleratorOptions,
			&triangleInput,
			1,
			dTempBuffer,
			blasBufferSizes.tempSizeInBytes,
			dAcceleratorBuffer,
			blasBufferSizes.outputSizeInBytes,
			&geometryAcceleratorHandle,
			nullptr,
			0)
		);

	CUDA_CHECK(Free((void*)dTempBuffer));
	CUDA_CHECK(Free((void*)dVertices));

	return geometryAcceleratorHandle;
}

Instead of using the return value of the function above, I feed it to the TLAS creation function below and use its output handle in the shaders:

OptixTraversableHandle SampleRenderer::buildInstanceAccelerator(const OptixTraversableHandle& geoHandle){
	OptixInstance optixInstance = { { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f } };
	optixInstance.flags = OPTIX_INSTANCE_FLAG_NONE;
	optixInstance.instanceId = 0u;
	optixInstance.sbtOffset = 0u;
	optixInstance.visibilityMask = 1u;
	optixInstance.traversableHandle = geoHandle;
	CUdeviceptr dOptixInstance;
	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dOptixInstance), sizeof(OptixInstance)));
	CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dOptixInstance), &optixInstance, sizeof(OptixInstance), cudaMemcpyHostToDevice));

	OptixAabb optixAabb[2]{
		{ -1.5f, -1.0f, -0.5f,
		  -0.5f,  0.5f,  0.5f  },
		{  0.5f,  0.0f, -0.01f,
		   1.5f,  1.5f,  0.01f } };
	CUdeviceptr  dAabb;
	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAabb), 2 * sizeof(OptixAabb)));
	CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dAabb), optixAabb, 2 * sizeof(OptixAabb), cudaMemcpyHostToDevice));
	OptixBuildInput instanceBuildInput{};
	instanceBuildInput.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
	instanceBuildInput.instanceArray.instances = dOptixInstance;
	instanceBuildInput.instanceArray.numInstances = 1u;
	instanceBuildInput.instanceArray.aabbs = dAabb;
	instanceBuildInput.instanceArray.numAabbs =1u;

	OptixAccelBuildOptions acceleratorBuildOptions{};
	acceleratorBuildOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
	acceleratorBuildOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

	OptixAccelBufferSizes acceleratorBufferSizes;
	OPTIX_CHECK(optixAccelComputeMemoryUsage(
		optixContext,
		&acceleratorBuildOptions,
		&instanceBuildInput,
		1u,
		&acceleratorBufferSizes));
	CUdeviceptr dTempBuffer;
	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), acceleratorBufferSizes.tempSizeInBytes));
	CUdeviceptr dInstanceAcceleratorBuffer;
	CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dInstanceAcceleratorBuffer), acceleratorBufferSizes.outputSizeInBytes));
	
	OptixTraversableHandle instanceAcceleratorHandle{ 0 };
	OPTIX_CHECK(optixAccelBuild(
		optixContext,
		0,
		&acceleratorBuildOptions,
		&instanceBuildInput,
		1,
		dTempBuffer,
		acceleratorBufferSizes.tempSizeInBytes,
		dInstanceAcceleratorBuffer,
		acceleratorBufferSizes.outputSizeInBytes,
		&instanceAcceleratorHandle,
		nullptr,
		0));
	return instanceAcceleratorHandle;
}

For reference the shader code (it works just fine when the OptixTraversableHandle comes from the first function above:

namespace osc {

extern "C" __constant__ LaunchParams optixLaunchParams;
//Single ray type
enum { SURFACE_RAY_TYPE = 0, RAY_TYPE_COUNT };

static __forceinline__ __device__ void* unpackPointer(uint32_t i0, uint32_t i1) {
	const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1;
	void* ptr = reinterpret_cast<void*>(uptr);
	return ptr;
}

static __forceinline__ __device__ void  packPointer(void* ptr, uint32_t& i0, uint32_t& i1) {
	const uint64_t uptr = reinterpret_cast<uint64_t>(ptr);
	i0 = uptr >> 32;
	i1 = uptr & 0x00000000ffffffff;
}

template<typename T>	static __forceinline__ __device__ T* getPRD() {
	const uint32_t u0 = optixGetPayload_0();
	const uint32_t u1 = optixGetPayload_1();
	return reinterpret_cast<T*>(unpackPointer(u0, u1));
}

static __forceinline__ __device__ void trace(
	OptixTraversableHandle handle,
	vec3f                 ray_origin,
	vec3f                 ray_direction,
	float                  tmin,
	float                  tmax,
	float3* prd) {
	unsigned int p0, p1, p2;
	p0 = float_as_int(prd->x);
	p1 = float_as_int(prd->y);
	p2 = float_as_int(prd->z);
	optixTrace(
		handle,
		ray_origin,
		ray_direction,
		tmin,
		tmax,
		0.0f,                // rayTime
		OptixVisibilityMask(1),
		OPTIX_RAY_FLAG_DISABLE_ANYHIT,//OPTIX_RAY_FLAG_NONE,
		0,                   // SBT offset
		0,                   // SBT stride
		0,                   // missSBTIndex
		p0, 
		p1, 
		p2);
	prd->x = int_as_float(p0);
	prd->y = int_as_float(p1);
	prd->z = int_as_float(p2);
}

static __forceinline__ __device__ void setPayload(float3 p)	{
	optixSetPayload_0(float_as_int(p.x));
	optixSetPayload_1(float_as_int(p.y));
	optixSetPayload_2(float_as_int(p.z));
}

static __forceinline__ __device__ float3 getPayload() {
	return make_float3(int_as_float(optixGetPayload_0()), int_as_float(optixGetPayload_1()), int_as_float(optixGetPayload_2()));
}

extern "C" __global__ void __closesthit__radiance() {
	//When built-in triangle intersection is used, a number of fundamental
	//attributes are provided by the OptiX API, including barycentric coordinates
	const float2 barycentricCoordinates = optixGetTriangleBarycentrics();
	setPayload(make_float3(barycentricCoordinates.x, barycentricCoordinates.y, 1.f - barycentricCoordinates.x - barycentricCoordinates.y));
}

extern "C" __global__ void __anyhit__radiance() {  }

extern "C" __global__ void __intersection__radiance() { }

extern "C" __global__ void __miss__radiance() {
	MissData* missData = reinterpret_cast<MissData*>(optixGetSbtDataPointer());
	float3 payload = getPayload();//Why???
	setPayload(missData->backgroundColor);
}

extern "C" __global__ void __raygen__renderFrame() {
	// compute a test pattern based on pixel ID
	const int ix = optixGetLaunchIndex().x;
	const int iy = optixGetLaunchIndex().y;

	const auto& camera = optixLaunchParams.camera;

	// our per-ray data for this example. what we initialize it to
	// won't matter, since this value will be overwritten by either
	// the miss or hit program, anyway
	float3 pixelColorPRD = { 0.5f, 0.5f, 0.5f };
	// normalized screen plane position, in [0,1]^2
	const vec2f screen(vec2f(ix + .5f, iy + .5f) / vec2f(optixLaunchParams.frame.size));

	// generate ray direction
	vec3f rayDir = normalize(camera.direction + (screen.x - 0.5f) * camera.horizontal + (screen.y - 0.5f) * camera.vertical);

	trace(optixLaunchParams.traversable,
		camera.position,
		rayDir,
		0.f,    // tmin
		1e16f,  // tmax
		&pixelColorPRD);

	const int r = int(255.99f * pixelColorPRD.x);
	const int g = int(255.99f * pixelColorPRD.y);
	const int b = int(255.99f * pixelColorPRD.z);

	const uint32_t rgba = 0xff000000 | (r << 0) | (g << 8) | (b << 16);

	const uint32_t fbIndex = ix + iy * optixLaunchParams.frame.size.x;
	optixLaunchParams.frame.colorBuffer[fbIndex] = rgba;
}

}

Please have a look at my OptiX 7 applications https://github.com/NVIDIA/OptiX_Apps which are all using IAS with each instance holding a single GAS with triangles.

You probably have not changed the OptixPipelineCompileOptions traversableGraphFlags to contain the flag OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING which should be present for IAS->GAS only scene hierarchies.
https://github.com/NVIDIA/OptiX_Apps/blob/master/apps/intro_runtime/src/Application.cpp#L1733

I’m assuming you set the instanceAcceleratorHandle as the root traversable handle inside the launch parameters.

There is also no reason for the OptixAabbs on the instances.

instanceBuildInput.instanceArray.aabbs = dAabb;
stanceBuildInput.instanceArray.numAabbs =1u;

Those fields were only used for motion blur and actually have been removed in OptiX 7.2 because OptiX can calculate these itself based on the children.
https://raytracing-docs.nvidia.com/optix7/api/html/struct_optix_build_input_instance_array.html
Means I would recommend updating to the OptiX SDK 7.2 and matching drivers.

There are logger and validation features in OptiX 7.2 which might report that mismatch:
https://raytracing-docs.nvidia.com/optix7/api/html/struct_optix_device_context_options.html

I would also always recommend to calculate the pipeline’s required stack space explicitly yourself.
The built-in stack size calculation does not handle all cases, esp. not with direct or continuation callables.
It needs to take the maximum traversable graph depth into account which is 2 with single level instancing:
https://github.com/NVIDIA/OptiX_Apps/blob/master/apps/intro_runtime/src/Application.cpp#L2018

1 Like

Thanks a lot.
OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING and setting
maxTraversableGraphDepth = 2 in optixPipelineSetStackSize() did the job.
I will definitely study your applications as well.