Allocating device memory for an struc inside an std::vector<struct>

Hello,

I don’t know how exactly to allocate memory for a std::vector of a struct that contains pointers. The first two lines of allocation for the d_nodes and the copy of the static memory works well. However when I allocate memory for the pointers inside d_nodes (only for the nodes with triangles) I get segmentation fault.

Thank you in advance,

Rafael Scatena

Error check:

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s at %s:%d: %s\n", \
                    __FUNCTION__, __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(err); \
        } \
    } while (0)\

Host struct:

    struct OctreeNode {int64_t index; 	Point Center;	double HalfWidth[3];    int64_t ChildIndex[8]; 	int Level;  int TriangleCount;  double* p1x;    double* p1y;    double* p1z;        double* p2x;    double* p2y;    double* p2z;    double* p3x;    double* p3y;    double* p3z;    int* Material;  int* Body;      int64_t* triangle_id;
    
    + cpu constructor
    }

Device Struct:

  struct OctreeNodeGPU {int64_t index; 	Point Center;	double HalfWidth[3];    int64_t ChildIndex[8]; 	int Level;  int TriangleCount;  double* p1x;    double* p1y;    double* p1z;   double* p2x;    double* p2y;    double* p2z; double* p3x;    double* p3y;    double* p3z;    int* Material;  int* Body;      int64_t* triangle_id;}

Function to allocate memory on the gpu:

__host__   void AllocateAndLaunchOctree(OctreeNodeGPU* &d_nodes, std::vector<OctreeNode>& h_nodes) {
    // Allocate memory for the entire OctreeNode array on the device
    
    int numNodes = h_nodes.size();    
 
    CUDA_CHECK(cudaMalloc(&d_nodes, numNodes * sizeof(OctreeNodeGPU)));

    std::cerr << "Main Node Alloc Ok" << std::endl;     

    
    CUDA_CHECK(cudaMemcpy(d_nodes, h_nodes.data(), numNodes * sizeof(OctreeNodeGPU), cudaMemcpyHostToDevice));

   std::cerr << "Static Node Alloc Ok: "<< std::endl;
    
    for (int i = 0; i < numNodes; ++i) {
    if(h_nodes[i].TriangleCount>0){
    std::cerr << "Dynamic Node Alloc Node: "<<i<<" Triangle Count: "<<h_nodes[i].TriangleCount<< std::endl;        
    // Allocate memory on the GPU for each pointer member of the current node
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p1x), sizeof(double) * h_nodes[i].TriangleCount));  // p1x
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p1y), sizeof(double) * h_nodes[i].TriangleCount));  // p1y
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p1z), sizeof(double) * h_nodes[i].TriangleCount));  // p1z
    
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p2x), sizeof(double) * h_nodes[i].TriangleCount));  // p2x
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p2y), sizeof(double) * h_nodes[i].TriangleCount));  // p2y
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p2z), sizeof(double) * h_nodes[i].TriangleCount));  // p2z

    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p3x), sizeof(double) * h_nodes[i].TriangleCount));  // p3x
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p3y), sizeof(double) * h_nodes[i].TriangleCount));  // p3y
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].p3z), sizeof(double) * h_nodes[i].TriangleCount));  // p3z
    
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].Material), sizeof(int) * h_nodes[i].TriangleCount));  // Material
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].Body), sizeof(int) * h_nodes[i].TriangleCount));      // Body
    CUDA_CHECK(cudaMalloc(&(d_nodes[i].triangle_id), sizeof(int64_t) * h_nodes[i].TriangleCount));  // triangle_id
    std::cerr << "Dynamic Allocated Node: "<<i << std::endl;             
    }}

Error message:

Segmentation Fault (core image dumped)

Once you have done this:

You cannot do this:

The proximal reason for this is that cudaMalloc requires a location in host memory to store the allocated pointer. This:

&d_nodes

is an address in host memory. After doing thatcudaMalloc operation, this:

&(d_nodes[i].

is an address in device memory. When cudaMalloc tries to write the allocated pointer value to that location from host code, you get a seg fault: in CUDA, for basic usage such as what you have here, host code cannot access a location in device memory and vice versa.

This general category of problems requires a deep-copy. There are numerous questions like this that show how to do it, here is one that links to a variety of others. Like I said, there are many questions related to this general topic of deep-copies. Here is another one that focuses specifically on arrays of structs with embedded pointers.

As a simple approach, which certainly may have performance drawbacks, if you just switch to a single representation of your data, and change your usage of cudaMalloc to cudaMallocManaged(), and get rid of all cudaMemcpy operations, your code is likely to “just work”.