CUDA pointer inside kernel becomes null

I’m trying to pass a pointer to triangle data to a kernel, but when debugging I find the pointer becomes null, d_list contains the triangles and both d_list and d_world are members of the main window class, also the error checking returns “no error”

d_list is of type hittable* and d_world is hittable_list*

__global__ void create_world(hittable* d_list, hittable_list* d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        d_world = new hittable_list(&d_list, num_triangles);
    }
}

checkCudaErrors(cudaMalloc((void**)&d_list, num_hittables * sizeof(triangle)));
checkCudaErrors(cudaMalloc((void**)&d_world, sizeof(hittable_list)));

cudaMemcpy(d_list, m_triangles.data(), num_hittables * sizeof(triangle), cudaMemcpyHostToDevice);

create_world << <1, 1 >> > (d_list, d_world, num_hittables);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());

I tried initializing the “world” in the host then cudaMemcpy’ing to the d_world, but it also fails

Minimal example:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>

struct make_list {
    __device__ make_list(float** list, int n) { contents = list; size = n; };
    float** contents;
    int size;
};

__global__ void render(make_list** world) {
    int size = (*world)->size; // set a breakpoint here, the size is 0
}

__global__ void create_world(float* d_list, make_list* d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        d_world = new make_list(&d_list, num_triangles);
    }
}

int main () {
    float* d_list;
    make_list* d_world;

    int size = 8;
    std::vector<float> m_triangles(size);

    cudaMalloc((void**)&d_list, size * sizeof(float));
    cudaMalloc((void**)&d_world, sizeof(make_list));

    cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);

    create_world << <1, 1 >> > (d_list, d_world, size);
    cudaDeviceSynchronize();

    render << <1, 1 >> > (&d_world);
    cudaDeviceSynchronize();

    return 0;
}

There are at least a few issues.

  1. In C++, when you pass a variable to a function via the function parameters, a copy of that variable is made for local use by the function. Any modifications made to that variable will not show up globally, i.e. in the calling environment, because the function is operating on a copy of the variable. Therefore this could never do what you want:

     d_world = new make_list(&d_list, num_triangles);
    

    There is nothing illegal about it, per se, but it will not have the desired effect. The global copy of d_world is unchanged by that assignment. This is a C++ concept, not unique or specific to CUDA, and it trips people up from time to time.

  2. This is almost never legal in CUDA:

    render << <1, 1 >> > (&d_world);
                          ^       
    

    In typical usage, it is not possible to pass the address of a host location to device code via a kernel call parameter. Any attempt to dereference that pointer &d_world will result in dereferencing the address of a host location. That is illegal in CUDA device code.

  3. While not necessarily a problem at this point, you should be aware of the fact that in-kernel new operates against the device heap which has a default limit of 8MB, and furthermore allocations created this way cannot take part in host-issued cudaMemcpy* calls. These topics are covered in the programming guide.

When I make changes to address those first 2 items, I get what appear to be sensible results:

$ cat t2190.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>

struct make_list {
    __device__ make_list(float* list, int n) { contents = list; size = n; };
    float* contents;
    int size;
};

__global__ void render(make_list** world) {
    int size = (*world)->size; // set a breakpoint here, the size is 0
    printf("size = %d\n", size);
}

__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        *d_world = new make_list(d_list, num_triangles);
    }
}

int main () {
    float* d_list;
    make_list** d_world;
    cudaMalloc(&d_world, sizeof(make_list*));
    int size = 8;
    std::vector<float> m_triangles(size);

    cudaMalloc((void**)&d_list, size * sizeof(float));

    cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);

    create_world << <1, 1 >> > (d_list, d_world, size);
    cudaDeviceSynchronize();

    render << <1, 1 >> > (d_world);
    cudaDeviceSynchronize();

    return 0;
}
$ nvcc -o t2190 t2190.cu
$ compute-sanitizer ./t2190
========= COMPUTE-SANITIZER
size = 8
========= ERROR SUMMARY: 0 errors
$

(Yes, I have changed your handling of d_list/contents as well, and indicated the reasons why on my answer on your cross posting.)

Thanks again Robert, I hope it’s fine posting an updated version in this thread, please see the folowing code, it’s causing crashes now

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>


class hittable {
public:
    __device__ virtual int hit() const = 0;
};

class triangle : public hittable {
public:
    __device__ virtual int hit() const {
        return -2;
    }
};

class make_list : public hittable {
public:
    __device__ make_list(triangle* list, int n) { contents = list; size = n; };
    __device__ virtual int hit() const {
        
        if (contents[0].hit())
            return -1;

        return size;
    }
    triangle* contents;
    int size;
};

__global__ void render(make_list** world) {
    int size = (*world)->size; // set a breakpoint here, the size is 0
    printf("size = %d\n", size);
    int new_size = (*world)->hit();
    printf("new size = %d\n", new_size);
}

__global__ void create_world(triangle* d_list, make_list** d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        *d_world = new make_list(d_list, num_triangles);
    }
}

int main() {
    triangle* d_list;
    make_list** d_world;
    cudaMalloc(&d_world, sizeof(make_list*));
    int size = 8;
    std::vector<triangle> m_triangles(size);

    cudaMalloc((void**)&d_list, size * sizeof(triangle));

    cudaMemcpy(d_list, m_triangles.data(), size * sizeof(triangle), cudaMemcpyHostToDevice);

    create_world << <1, 1 >> > (d_list, d_world, size);
    cudaDeviceSynchronize();

    render << <1, 1 >> > (d_world);
    cudaDeviceSynchronize();

    return 0;
}

You are now stepping on a limitation of classes with virtual functions. This is initializing objects of a class with virtual functions (on the host):

std::vector<triangle> m_triangles(size);

this is invoking a virtual function of an object of that type, in device code:

    if (contents[0].hit())

That is illegal. This topic is covered in numerous forums postings. The most direct solution is to initialize such objects in device code.

Here is a crude example of a possible workaround method using placement new to avoid this limitation:

$ cat t2193.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
#include <new>

class hittable {
public:
    __device__ virtual int hit() const = 0;
};

class triangle : public hittable {
public:
    __device__ virtual int hit() const {
        return -2;
    }
};

class make_list : public hittable {
public:
    __device__ make_list(triangle* list, int n) { contents = list; size = n;
      for (int i = 0; i < n; i++)
        new(list+i) triangle;
};
    __device__ virtual int hit() const {

        if (contents[0].hit())
            return -1;

        return size;
    }
    triangle* contents;
    int size;
};

__global__ void render(make_list** world) {
    int size = (*world)->size; // set a breakpoint here, the size is 0
    printf("size = %d\n", size);
    int new_size = (*world)->hit();
    printf("new size = %d\n", new_size);
}

__global__ void create_world(triangle* d_list, make_list** d_world, int num_triangles) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        // the class hittable_list contains a counter for the list size, which no matter the
        // scene size it always becomes zero
        *d_world = new make_list(d_list, num_triangles);
    }
}

int main() {
    triangle* d_list;
    make_list** d_world;
    cudaMalloc(&d_world, sizeof(make_list*));
    int size = 8;

    cudaMalloc((void**)&d_list, size * sizeof(triangle));

    create_world << <1, 1 >> > (d_list, d_world, size);
    cudaDeviceSynchronize();

    render << <1, 1 >> > (d_world);
    cudaDeviceSynchronize();

    return 0;
}
$ nvcc -o t2193 t2193.cu
ptxas warning : Stack size for entry function '_Z12create_worldP8trianglePP9make_listi' cannot be statically determined
ptxas warning : Stack size for entry function '_Z6renderPP9make_list' cannot be statically determined
$ compute-sanitizer ./t2193
========= COMPUTE-SANITIZER
size = 8
new size = -1
========= ERROR SUMMARY: 0 errors
$

Note the compiler warnings. They are not an issue for the code you have shown here but may be in the future. I’m not suggesting they can always be safely ignored. You can find various posts discussing the meaning of them.

wow thank you so much (for now lol), I wasn’t familiar with that syntax