Calling a virtual function of an object instantiated on CUDA device results in CUDA_EXCEPTION_5, Warp Out-of-range Address

I am trying to get a simple ray tracer following Peter Shirley’s “Ray Tracing in One Weekend” series running on CUDA, but I am running into what I believe is undefined behaviour when using virtual functions in kernels.

I’ve made sure that the objects are instantiated and only ever used on the device side, which, according to the CUDA Programming Guide, should be enough for virtual functions to work.
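To be explicit about the pattern I am relying on, here is a minimal sketch (the class and kernel names are made up, not my actual code): the object is created with new inside one kernel, so its vtable is filled in with device addresses, and the virtual call then happens on the device in another kernel.

class base
{
	public:
		__device__ virtual int value() const = 0;
};

class derived: public base
{
	public:
		__device__ virtual int value() const { return 42; }
};

__global__ void make(base** p)
{
	// Device-side new, so the virtual table points at device code
	*p = new derived();
}

__global__ void use(base** p, int* out)
{
	// Virtual dispatch on the device, on an object constructed on the device
	*out = (*p)->value();
}

(Creating the object on the host and cudaMemcpy’ing it to the device would not be valid, since the copied vtable pointer would refer to host code, which is why I construct everything with device-side new.)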

What is really odd is that the virtual functions work just fine inside one kernel (I call the hit method inside the render kernel) but not the other, which leads me to believe it might not be a virtual-function problem as such, but rather that I am not allocating memory properly.

What the code boils down to is:

  • instantiating a bunch of sphere objects on the device, which all inherit from an abstract base class called hittable and have just two virtual functions: hit and bounding_box.
  • running through them and constructing a Bounding Volume Hierarchy (this is where I get the error)
  • the render kernel checks for hits against the Bounding Volume Hierarchy (I’ve removed that code, since the error occurs even without it, so it should be irrelevant)

I am running this on an RTX 2070 with CUDA release 10.1.105.

One thing to note is that if I compile with --relocatable-device-code=true (-rdc=true), the code runs with no errors, but since that can come with a performance overhead, I’d rather avoid it. If I must use -rdc=true in the end, that’s fine, but I’d like to understand why.

My question is: why am I getting the errors from cuda-memcheck and cuda-gdb posted at the bottom?

Here’s what I’ve got; the cuda-gdb and cuda-memcheck outputs are at the bottom.

hittable.h

class hittable
{
	public:
		__device__ virtual bool hit(
				const ray& r, float t_min,
				float t_max, hit_record& rec) const = 0;

		__device__ virtual bool bounding_box(float t0, float t1, aabb& box) const = 0;

		aabb box;
};
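For context, ray, hit_record, material and aabb are the usual types from the book. aabb is essentially just a pair of corner points; a rough sketch is below (the real one also has a __device__ hit() method, omitted here since it is not relevant to the crash).

class aabb
{
	public:
		__device__ aabb() {}
		__device__ aabb(const vec3& a, const vec3& b) : _min(a), _max(b) {}

		__device__ vec3 min() const { return _min; }
		__device__ vec3 max() const { return _max; }

		vec3 _min;
		vec3 _max;
};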

sphere.h

class sphere: public hittable
{
	public:
		__device__ sphere() {}
		__device__ sphere(vec3 cen, float r, material *m) :
			center(cen), radius(r), mat_ptr(m) {}

		__device__ virtual bool hit(const ray& r, float t_min,
				float t_max, hit_record& rec) const;
		__device__ virtual bool bounding_box(float t0, float t1, aabb& _box) const;

		vec3 center;
		float radius;
		material *mat_ptr;
		aabb box;
};

// Skipping hit implementation as it is irrelevant

__device__ bool sphere::bounding_box(float t0, float t1, aabb& _box) const
{
	_box = aabb(center - vec3(radius, radius, radius),
			   center + vec3(radius, radius, radius));

	return true;
}

Here are the construction kernels defined in main.cu

__global__ void create_world(hittable** d_list, hittable** d_world)
{
    // The kernel that initializes the spheres on the device
	if (threadIdx.x == 0 && blockIdx.x == 0)
	{
		*(d_list) = new sphere(vec3(-1,0,-1), .5, new dielectric(1.5));
		*(d_list+1) = new sphere(vec3(-1,0,-1), -.45, new dielectric(1.5));
		*(d_list+2) = new sphere(vec3(0,0,-1), .5, new lambertian(vec3(.1, .2, .5)));
		*(d_list+3) = new sphere(vec3(0,-100.5,-1), 100, new lambertian(vec3(.8, .8, .0)));
		*(d_list+4) = new sphere(vec3(1,0,-1), .5, new metal(vec3(.8, .6, .2), 0.3));
		*d_world = new hittable_list(d_list, 5);
	}
}

__global__ void generate_BVH_iterative(hittable** l, hittable** world, int n, float time0, float time1, bvh_node** root_bvh, bvh_node** bvh_heap)
{
	bvh_node* explore_list[1000];
	bvh_node** explore_ptr = explore_list;

	*explore_ptr++ = NULL;
	bvh_node* node = new bvh_node();

	int c = 0;
	while(node != NULL)
	{
		c++;

		printf("anything\n"); // Doing anything at all in here causes the error
        // This is where I do my BVH construction implementation
        // The reason I am including this loop is that without it, even,
        // with the bounding_box call at the bottom, the code runs

		node = *--explore_ptr;
	}

	aabb* box = &((*l)->box);
	// If I comment out the next line and use the cached box instead, it all works,
	// even with the full BVH construction in the while loop above
	(*l)->bounding_box(.0f, 1.0f, *box);

	return;
}
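free_world, which is called at the very end of main() below, just mirrors create_world: it deletes the materials, the spheres and the hittable_list. I’ve left it out of the post since it is not involved in the error, but it is roughly this sketch:

__global__ void free_world(hittable** d_list, hittable** d_world)
{
	if (threadIdx.x == 0 && blockIdx.x == 0)
	{
		for (int i = 0; i < 5; i++)
		{
			// Delete the material first, then the sphere that owns it
			delete ((sphere*)d_list[i])->mat_ptr;
			delete (sphere*)d_list[i];
		}
		delete *d_world;
	}
}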

And finally, here is the stripped-down int main():

	// Allocate world pointers
	hittable **d_list;
	hittable **d_world;

	cudaMalloc((void**)&d_list, 5 * sizeof(hittable*));
	cudaMalloc((void**)&d_world, sizeof(hittable*));

	// use world pointers to create world
	create_world<<<1, 1>>>(d_list, d_world);

	CudaCheckError();

	// Allocate bvh data
	bvh_node **bvh_heap;
	bvh_node **bvh_root;

	cudaMalloc((void**)&bvh_heap, 1000 * sizeof(bvh_node*));
	cudaMalloc((void**)&bvh_root, 1 * sizeof(bvh_node*));

	// Create bvh
	generate_BVH_iterative<<<1, 1>>>(d_list, d_world, 5, .0f, 1.0f, bvh_root, bvh_heap);

	CudaCheckError();  // This prints the error

	// I am including the call to render just to make sure I am not missing some
	// difference between the two kernels. 'd_pixels', 'd_rand_state' and 'cam' are
	// properly defined above and work fine when not using the BVH, but I've trimmed that code out.
	render<<<blocks, threads>>>(nx, ny, d_pixels, bvh_root, d_world, d_rand_state, cam);
	// Even though the render kernel also calls a virtual function (hit),
	// it works just fine.

	free_world<<<1,1>>>(d_list, d_world);
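For completeness, CudaCheckError is the usual error-checking helper: it checks the launch, synchronizes to surface asynchronous kernel errors, and prints the error string. Roughly something like this (a sketch, not my exact code):

#include <cstdio>
#include <cstdlib>

inline void cuda_check_error(const char* file, int line)
{
	cudaError_t err = cudaGetLastError();
	if (err == cudaSuccess)
		err = cudaDeviceSynchronize(); // surfaces errors from the kernel itself

	if (err != cudaSuccess)
	{
		fprintf(stderr, "CUDA error: %s at %s:%d\n",
				cudaGetErrorString(err), file, line);
		exit(1);
	}
}

#define CudaCheckError() cuda_check_error(__FILE__, __LINE__)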

Here is the cuda-memcheck output (it reports the error at material.h:81, a location that has nothing to do with the BVH construction, which leads me to believe it is simply accessing a wrong memory address):

========= CUDA-MEMCHECK
========= Invalid __local__ write of size 4
=========     at 0x000037f0 in /home/vshotarov/Projects/death-star-gpu/material.h:81:generate_BVH_iterative(hittable**, hittable**, int, float, float, bvh_node**, bvh_node**)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00ffdc2c is out of bounds
=========     Device Frame:/home/vshotarov/Projects/death-star-gpu/material.h:81:generate_BVH_iterative(hittable**, hittable**, int, float, float, bvh_node**, bvh_node**) ($_Z22generate_BVH_iterativePP8hittableS1_iffPP8bvh_nodeS4_$_ZNK6sphere12bounding_boxEffR4aabb : 0x20)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
=========     Host Frame:./a.out [0x11a19]
=========     Host Frame:./a.out [0x11aa7]
=========     Host Frame:./a.out [0x47df5]
=========     Host Frame:./a.out [0x49f5]
=========     Host Frame:./a.out [0x4661]
=========     Host Frame:./a.out [0x46be]
=========     Host Frame:./a.out [0x3fa5]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf3) [0x24413]
=========     Host Frame:./a.out [0x39ce]
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/lib64/libcuda.so.1 [0x3a0403]
=========     Host Frame:./a.out [0x2fdb6]
=========     Host Frame:./a.out [0x4b53]
=========     Host Frame:./a.out [0x3fb4]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf3) [0x24413]
=========     Host Frame:./a.out [0x39ce]
=========
========= ERROR SUMMARY: 2 errors

And lastly, here is the cuda-gdb output

[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffdffff700 (LWP 8972)]
[New Thread 0x7fffdf7fe700 (LWP 8973)]
[New Thread 0x7fffdeffd700 (LWP 8974)]

CUDA Exception: Warp Out-of-range Address
The exception was triggered at PC 0xba5ff0

Thread 1 "a.out" received signal CUDA_EXCEPTION_5, Warp Out-of-range Address.
[Switching focus to CUDA kernel 0, grid 3, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 3, lane 0]
0x0000000000ba6060 in generate_BVH_iterative(hittable**, hittable**, int, float, float, bvh_node**, bvh_node**)<<<(1,1,1),(1,1,1)>>> ()

I have also cross-posted this question on Stack Overflow: https://stackoverflow.com/questions/59918673/calling-a-virtual-function-of-an-object-instantiated-on-cuda-device-results-in-c

Thanks a lot!

You’ve resolved a stack overflow on stack overflow. Funny.

By the way, there is a simple CUDA ray tracer based on Peter Shirley’s work here:

https://devblogs.nvidia.com/accelerated-ray-tracing-cuda/

I’m sorry to bump a dead topic, but I’m having the same issue as above and the Stack Overflow link is dead. @Robert_Crovella, do you have more insight into this problem?

The text of my (now deleted) response on SO was as follows:

" You’re using recursion ( bounding_box() in hittable_list::bounding_box() ), which makes it difficult to know what the stack size needs to be for correct operation. When I insert this: cudaDeviceSetLimit(cudaLimitStackSize, 65536); into the beginning of your main routine, the problem goes away according to my testing. The stack is a form of local memory on the GPU. The local memory problem being reported by cuda_memcheck is ultimately a stack issue (stack overflow)."

Also note my response above: a fully worked example of CUDA-enabled ray tracing is given there.