Ray Tracing Over Weekend with CUDA using unified memory

Dear all ,
I have read a very interesting post Accelerated Ray Tracing in One Weekend in CUDA by By Roger Allen (https://devblogs.nvidia.com/accelerated-ray-tracing-cuda/). In this post virtual functions were employed to create ray-traced pictures of spheres. I modified the code provided in Chapter 5
(https://github.com/rogerallen/raytracinginoneweekendincuda/tree/ch05_normals_cuda)

a little bit in order to create_world on host instead of device. In other words I replaced device functions with host device functions in headers ray.h; hitable.h; hitable_list.h; sphere.h; and finally I added host function in main.cu file

void create_world_host(hitable **d_list, hitable **d_world) {

    *(d_list)   = new sphere(vec3(0,0,-1), 0.5);
    *(d_list+1) = new sphere(vec3(0,-100.5,-1), 100);
    *d_world    = new hitable_list(d_list,2);

}

and used unified memory

// make our world of hitables
//but using unified memory
hitable **d_list;
checkCudaErrors(cudaMallocManaged((void **)&d_list, 2*sizeof(hitable *)));
hitable **d_world;
checkCudaErrors(cudaMallocManaged((void **)&d_world, sizeof(hitable *)));

I found it is impossible to fill in the unified memory using my host function create_world_host instead of similar global void create_world . I have Quadro P4000 card with Pascal architecture.

Why does usage of the host function create_world_host instead of the kernel __global__ void create_world lead to
this error: Rendering a 1200x600 image in 8x8 blocks.
CUDA error = 700 at main.cu:114 'cudaDeviceSynchronize()' ???

My version of ray.h is

#ifndef RAYH
#define RAYH
#include "vec3.h"

// A ray p(t) = A + t*B with origin A and direction B.
// All members are __host__ __device__ so rays can be constructed and
// evaluated in both host and device code (the pasted version had lost
// the underscores, making it invalid C++).
class ray
{
public:
    __host__ __device__ ray() {}
    __host__ __device__ ray(const vec3& a, const vec3& b) { A = a; B = b; }
    __host__ __device__ vec3 origin() const    { return A; }
    __host__ __device__ vec3 direction() const { return B; }
    // Point along the ray at parameter t.
    __host__ __device__ vec3 point_at_parameter(float t) const { return A + t*B; }

    vec3 A;  // origin
    vec3 B;  // direction (not necessarily unit length)
};

#endif

My version of hitable.h is

#ifndef HITABLEH
#define HITABLEH

#include "ray.h"

// Data recorded at a ray/surface intersection.
struct hit_record
{
float t;      // ray parameter of the hit: p = origin + t*direction
vec3 p;       // hit point in world space
vec3 normal;  // surface normal at p (as computed by the hit object)
};

// Abstract base class for anything a ray can intersect.
// NOTE: the virtual function makes construction domain matter — an object
// constructed on the host has a host vtable and must not be hit() on the device.
class hitable {
public:
    // Returns true and fills rec if r hits this object with t in (t_min, t_max).
    __host__ __device__ virtual bool hit(const ray& r, float t_min, float t_max, hit_record& rec) const = 0;
};

#endif

My version of hitable_list.h is:

#ifndef HITABLELISTH
#define HITABLELISTH

#include "hitable.h"

// A list of hitables that is itself hitable; hit() returns the closest hit.
class hitable_list : public hitable {
public:
    __host__ __device__ hitable_list() {}
    __host__ __device__ hitable_list(hitable **l, int n) { list = l; list_size = n; }
    __host__ __device__ virtual bool hit(const ray& r, float t_min, float t_max, hit_record& rec) const;
    hitable **list;  // array of object pointers (not owned by this list)
    int list_size;   // number of entries in list
};

// Test the ray against every member; track the closest intersection so far
// by shrinking the upper bound, so rec ends up holding the nearest hit.
__host__ __device__ bool hitable_list::hit(const ray& r, float t_min, float t_max, hit_record& rec) const {
    hit_record temp_rec;
    bool hit_anything = false;
    float closest_so_far = t_max;
    for (int i = 0; i < list_size; i++) {
        if (list[i]->hit(r, t_min, closest_so_far, temp_rec)) {
            hit_anything = true;
            closest_so_far = temp_rec.t;
            rec = temp_rec;
        }
    }
    return hit_anything;
}

#endif

My version of sphere.h is:

#ifndef SPHEREH
#define SPHEREH

#include "hitable.h"

// Sphere defined by center and radius.
class sphere : public hitable {
public:
    __host__ __device__ sphere() {}
    __host__ __device__ sphere(vec3 cen, float r) : center(cen), radius(r) {}
    __host__ __device__ virtual bool hit(const ray& r, float t_min, float t_max, hit_record& rec) const;
    vec3 center;
    float radius;
};

// Ray/sphere intersection: solve |o + t*d - c|^2 = R^2, a quadratic in t.
// Here b is the half-coefficient (dot(oc, d)), so the discriminant is b*b - a*c
// and the roots are (-b ± sqrt(disc)) / a. The nearer root is tried first.
__host__ __device__ bool sphere::hit(const ray& r, float t_min, float t_max, hit_record& rec) const {
    vec3 oc = r.origin() - center;
    float a = dot(r.direction(), r.direction());
    float b = dot(oc, r.direction());
    float c = dot(oc, oc) - radius*radius;
    float discriminant = b*b - a*c;
    if (discriminant > 0) {
        // sqrtf keeps the computation in single precision on the device
        float temp = (-b - sqrtf(discriminant)) / a;
        if (temp < t_max && temp > t_min) {
            rec.t = temp;
            rec.p = r.point_at_parameter(rec.t);
            rec.normal = (rec.p - center) / radius;
            return true;
        }
        temp = (-b + sqrtf(discriminant)) / a;
        if (temp < t_max && temp > t_min) {
            rec.t = temp;
            rec.p = r.point_at_parameter(rec.t);
            rec.normal = (rec.p - center) / radius;
            return true;
        }
    }
    return false;
}

#endif

and finally my version of main.cu:

#include <float.h>
#include <time.h>
#include <iostream>
#include "vec3.h"
#include "ray.h"
#include "sphere.h"
#include "hitable_list.h"

// limited version of checkCudaErrors from helper_cuda.h in CUDA examples
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )

// Report a CUDA error with its call site and abort.
// func/file/line are injected by the checkCudaErrors macro via #val, __FILE__, __LINE__.
void check_cuda(cudaError_t result, char const *const func, const char *const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " <<
            file << ":" << line << " '" << func << "' \n";
        // Reset the device before exiting so profiling/tracing tools flush cleanly
        cudaDeviceReset();
        exit(99);
    }
}

// Shade a ray: map the surface normal to a color on a hit, otherwise
// return a white-to-blue vertical gradient background.
// world is a pointer-to-pointer because the hitable_list itself lives in
// device/managed memory and is reached through *world.
__device__ vec3 color(const ray& r, hitable **world) {
    hit_record rec;
    if ((*world)->hit(r, 0.0f, FLT_MAX, rec)) {
        // remap normal components from [-1,1] to [0,1]
        return 0.5f*vec3(rec.normal.x()+1.0f, rec.normal.y()+1.0f, rec.normal.z()+1.0f);
    }
    else {
        vec3 unit_direction = unit_vector(r.direction());
        float t = 0.5f*(unit_direction.y() + 1.0f);  // blend factor from ray height
        return (1.0f-t)*vec3(1.0, 1.0, 1.0) + t*vec3(0.5, 0.7, 1.0);
    }
}

// One thread per pixel: expects a 2D launch covering at least max_x by max_y
// threads; out-of-range threads exit via the bounds guard.
// fb is a framebuffer of max_x*max_y vec3 pixels in managed memory.
__global__ void render(vec3 *fb, int max_x, int max_y,
                       vec3 lower_left_corner, vec3 horizontal, vec3 vertical, vec3 origin,
                       hitable **world) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    if ((i >= max_x) || (j >= max_y)) return;  // grid may overshoot the image
    int pixel_index = j*max_x + i;
    float u = float(i) / float(max_x);
    float v = float(j) / float(max_y);
    ray r(origin, lower_left_corner + u*horizontal + v*vertical);
    fb[pixel_index] = color(r, world);
}

// Construct the scene ON THE DEVICE so the polymorphic objects get device
// vtables; launched as <<<1,1>>>, with a guard so only one thread builds it.
__global__ void create_world(hitable **d_list, hitable **d_world) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        *(d_list)   = new sphere(vec3(0,0,-1), 0.5);
        *(d_list+1) = new sphere(vec3(0,-100.5,-1), 100);
        *d_world    = new hitable_list(d_list, 2);
    }
}

// BROKEN BY DESIGN (this is the subject of the thread): constructing the
// scene on the host gives every object a vtable full of HOST function
// pointers, even though the pointer storage itself sits in managed memory.
// When render() later calls the virtual hit() on the device, those host
// addresses are invalid there — hence the error 700 (illegal address) at
// cudaDeviceSynchronize(). Use the create_world kernel instead.
void create_world_host(hitable **d_list, hitable **d_world) {

    *(d_list)   = new sphere(vec3(0,0,-1), 0.5);
    *(d_list+1) = new sphere(vec3(0,-100.5,-1), 100);
    *d_world    = new hitable_list(d_list,2);

}

// Destroy the scene on the device, mirroring create_world: objects were
// new'd in device code, so they must be delete'd in device code too.
__global__ void free_world(hitable **d_list, hitable **d_world) {
    delete *(d_list);
    delete *(d_list+1);
    delete *d_world;
}

// Renders an nx-by-ny image with tx-by-ty thread blocks and writes it to
// stdout as PPM (P3). All buffers are in managed (unified) memory so the
// host can read the framebuffer directly after synchronizing.
int main() {
int nx = 1200;
int ny = 600;
int tx = 8;   // threads per block in x
int ty = 8;   // threads per block in y

std::cerr << "Rendering a " << nx << "x" << ny << " image ";
std::cerr << "in " << tx << "x" << ty << " blocks.\n";

int num_pixels = nx*ny;
size_t fb_size = num_pixels*sizeof(vec3);

// allocate FB
vec3 *fb;
checkCudaErrors(cudaMallocManaged((void **)&fb, fb_size));

// make our world of hitables
//but using unified memory
hitable **d_list;
checkCudaErrors(cudaMallocManaged((void **)&d_list, 2*sizeof(hitable *)));
hitable **d_world;
checkCudaErrors(cudaMallocManaged((void **)&d_world, sizeof(hitable *)));

//***Creation of world on device using unified memory works***//
// Device-side construction gives the objects device vtables (see thread).
create_world<<<1,1>>>(d_list,d_world);

//Creation of world on host using unified memory fails//
// Host-side construction bakes host vtable pointers into the objects,
// which is UB when hit() is later invoked on the device (error 700).
// create_world_host(d_list,d_world);

checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());

clock_t start, stop;
start = clock();
// Render our buffer
// +1 block in each dimension so the grid covers images not divisible by tx/ty;
// render() bounds-checks the overshoot.
dim3 blocks(nx/tx+1,ny/ty+1);
dim3 threads(tx,ty);
render<<<blocks, threads>>>(fb, nx, ny,
                            vec3(-2.0, -1.0, -1.0),
                            vec3(4.0, 0.0, 0.0),
                            vec3(0.0, 2.0, 0.0),
                            vec3(0.0, 0.0, 0.0),
                            d_world);
checkCudaErrors(cudaGetLastError());
// Synchronize before reading fb on the host (and to surface kernel errors here).
checkCudaErrors(cudaDeviceSynchronize());
stop = clock();
double timer_seconds = ((double)(stop - start)) / CLOCKS_PER_SEC;
std::cerr << "took " << timer_seconds << " seconds.\n";

// Output FB as Image (PPM, rows top to bottom)
std::cout << "P3\n" << nx << " " << ny << "\n255\n";
for (int j = ny-1; j >= 0; j--) {
    for (int i = 0; i < nx; i++) {
        size_t pixel_index = j*nx + i;
        int ir = int(255.99*fb[pixel_index].r());
        int ig = int(255.99*fb[pixel_index].g());
        int ib = int(255.99*fb[pixel_index].b());
        std::cout << ir << " " << ig << " " << ib << "\n";
    }
}

// clean up
checkCudaErrors(cudaDeviceSynchronize());
free_world<<<1,1>>>(d_list,d_world);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaFree(d_list));
checkCudaErrors(cudaFree(d_world));
checkCudaErrors(cudaFree(fb));

// useful for cuda-memcheck --leak-check full
cudaDeviceReset();

}

This is due to the fact that virtual functions are being used.

virtual functions require a virtual function pointer table in the object. The virtual function pointer table is created/initialized at object construction time. If the object is constructed on the device, and used on the device, the table entries are valid (they point to valid locations in device memory, where the relevant functions are stored).

If the object is constructed on the host, and used on the host, the table is also valid.

The table is not valid if the object is constructed in one domain but used (i.e. virtual functions are invoked) in the other.

This requirement/limitation is covered in the programming guide:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#virtual-functions

"If an object is created in host code, invoking a virtual function for that object in device code has undefined behavior.

If an object is created in device code, invoking a virtual function for that object in host code has undefined behavior."

Do you mean that even when the unified memory is used the table created on the host can not be correctly read by the device ? Is there any way to pass correctly the table created by the host in the unified memory to the device?

The table can be read, whether you use unified memory or not. The problem is not in accessibility of the table. Remember, the table is a table of (function) pointers.

If the object is created on the host, the table will be full of entries that point to entry points in host memory. Unified memory doesn’t magically “fix” this. The object itself is allocated using managed memory. The storage for the pointer in the object itself is in unified memory. But pointers in the object don’t necessarily point to locations in unified memory.

If you attempt to invoke one of these functions on the device, from a table (i.e. object) initialized on the host, it could not possibly work correctly on the device. The location it points to is not in unified memory, therefore not available in device code, and even if it were in unified memory, it contains x86 host code. x86 host code cannot run on the device.

The limitations I highlighted from the programming guide are quite real, and there is no exception to these limitations for unified memory.

Someone reminded me of this:

https://devtalk.nvidia.com/default/topic/1026116/cuda-programming-and-performance/copying-objects-to-device-with-virtual-functions/

(all the things I’ve forgotten…)

If you like wild living, you’re welcome to give it a try. I have no idea if its valid or not. It probably isn’t.