CopyDeviceToHost:Invalid Argument

Hello,

I coded two functions one to allocate data on the gpu and one to copy the data back to the cpu and print. However I get the error:

CUDA error in printOctreeNodeFromGPU at /home/3484681/Área de Trabalho/PRISMAMESH/PRISMAMESH/CudaOctreeAux.cu:174: invalid argument

When I try to copy from the device back to the host. The error is in this line:

CUDA_CHECK(cudaMemcpy(&h_Node_copy.index, &Node->index, sizeof(int64_t), cudaMemcpyDeviceToHost));

Thank you in addvance,

Rafael Scatena

The structs are:

struct OctreeNode {int64_t index; 	Point Center;	double HalfWidth[3];    int64_t ChildIndex[8]; 	int Level;  int TriangleCount;      double* p1x;    double* p1y;    double* p1z;        double* p2x;    double* p2y;    double* p2z;    double* p3x;    double* p3y;    double* p3z;    int* Material;  int* Body;      int64_t* triangle_id;
            + cpu constructor
  
  struct OctreeNodeGPU {int64_t index; 	Point Center;	double HalfWidth[3];    int64_t ChildIndex[8]; 	int Level;  int TriangleCount;  double* p1x;    double* p1y;    double* p1z;   double* p2x;    double* p2y;    double* p2z; double* p3x;    double* p3y;    double* p3z;    int* Material;  int* Body;      int64_t* triangle_id;


// Constructor: Initialize all pointers to nullptr
__host__ __device__ OctreeNodeGPU() {
     
    
    index = 0;        Center = {0,0,0};
    for (int i = 0; i < 3; ++i) {HalfWidth[i] = 0;}
    for (int i = 0; i < 8; ++i) {ChildIndex[i] = -1;
    TriangleCount = 0;    Level = 0;}
    
    p1x = nullptr;        p1y = nullptr;        p1z = nullptr;        
    p2x = nullptr;        p2y = nullptr;        p2z = nullptr;        
    p3x = nullptr;        p3y = nullptr;        p3z = nullptr;        
    Material = nullptr;   Body = nullptr;       triangle_id = nullptr;}
    

// Destructor: free allocated memory (on host side)
__host__ __device__ ~OctreeNodeGPU() {
    if (p1x) cudaFree(p1x);        if (p1y) cudaFree(p1y);        if (p1z) cudaFree(p1z);
    if (p2x) cudaFree(p2x);        if (p2y) cudaFree(p2y);        if (p2z) cudaFree(p2z);        
    if (p3x) cudaFree(p3x);        if (p3y) cudaFree(p3y);        if (p3z) cudaFree(p3z);
    if (Material) cudaFree(Material);        if (Body) cudaFree(Body);        if (triangle_id) cudaFree(triangle_id);
}

};

the main code:

    OctreeNodeGPU* Node; 
    copyNodeToGPU(h_Node, Node);
    printOctreeNodeFromGPU(Node);

the functions are:

__host__ void copyNodeToGPU(const OctreeNode& cpuNode,OctreeNodeGPU* Node) {
   
           
      
                CUDA_CHECK(cudaMalloc((void**)&Node, sizeof(OctreeNodeGPU)));
           
                if(Node)std::cerr << "Node Parsed for Copy" <<  std::endl;       
                else{}     
      
           
                std::cerr << "Static Allocated" <<  std::endl;                   

                CUDA_CHECK(cudaMemcpy(&Node->index, &cpuNode.index, sizeof(int64_t), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(&Node->Center, &cpuNode.Center, sizeof(Point), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->HalfWidth, cpuNode.HalfWidth, 3*sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->ChildIndex, cpuNode.ChildIndex, 8*sizeof(int64_t), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(&Node->Level, &cpuNode.Level, sizeof(int), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(&Node->TriangleCount, &cpuNode.TriangleCount, sizeof(int), cudaMemcpyHostToDevice));                        
                


            // Allocate memory on GPU and copy each dynamic array
            if (cpuNode.TriangleCount > 0) {
                CUDA_CHECK(cudaMalloc((void**)&Node->p1x, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p1y, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p1z, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p2x, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p2y, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p2z, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p3x, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p3y, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->p3z, cpuNode.TriangleCount * sizeof(double)));
                CUDA_CHECK(cudaMalloc((void**)&Node->Material, cpuNode.TriangleCount * sizeof(int)));
                CUDA_CHECK(cudaMalloc((void**)&Node->Body, cpuNode.TriangleCount * sizeof(int)));
                CUDA_CHECK(cudaMalloc((void**)&Node->triangle_id, cpuNode.TriangleCount * sizeof(int64_t)));

                // Copy data from CPU to GPU
                CUDA_CHECK(cudaMemcpy(Node->p1x, cpuNode.p1x, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p1y, cpuNode.p1y, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p1z, cpuNode.p1z, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p2x, cpuNode.p2x, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p2y, cpuNode.p2y, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p2z, cpuNode.p2z, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p3x, cpuNode.p3x, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p3y, cpuNode.p3y, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->p3z, cpuNode.p3z, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->Material, cpuNode.Material, cpuNode.TriangleCount * sizeof(int), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->Body, cpuNode.Body, cpuNode.TriangleCount * sizeof(int), cudaMemcpyHostToDevice));
                CUDA_CHECK(cudaMemcpy(Node->triangle_id, cpuNode.triangle_id, cpuNode.TriangleCount * sizeof(int64_t), cudaMemcpyHostToDevice));
            }


      }        

and the function to copy the information back to the cpu:

void printOctreeNodeFromGPU(OctreeNodeGPU* Node) {

                Point a = {0,0,0}; double b[3] = {0,0,0};
                OctreeNode h_Node_copy(a, b, 0,0, 0);
                //OctreeNode h_Node_copy;
                
                
                // Copy the node from the GPU to the host
                CUDA_CHECK(cudaMemcpy(&h_Node_copy.index, &Node->index, sizeof(int64_t), cudaMemcpyDeviceToHost));


                CUDA_CHECK(cudaMemcpy(&h_Node_copy.Center, &Node->Center, sizeof(Point), cudaMemcpyDeviceToHost));

                CUDA_CHECK(cudaMemcpy(&h_Node_copy.HalfWidth, &Node->HalfWidth, 3*sizeof(double), cudaMemcpyDeviceToHost));

                CUDA_CHECK(cudaMemcpy(&h_Node_copy.ChildIndex, &Node->ChildIndex, 8* sizeof(int64_t), cudaMemcpyDeviceToHost)); 

                CUDA_CHECK(cudaMemcpy(&h_Node_copy.Level, &Node->Level, sizeof(int), cudaMemcpyDeviceToHost));

                CUDA_CHECK(cudaMemcpy(&h_Node_copy.TriangleCount, &Node->TriangleCount, sizeof(int), cudaMemcpyDeviceToHost));




                // Print the non-dynamically allocated members
                std::cout << "index: " << h_Node_copy.index << std::endl;
                std::cout << "Center: (" << h_Node_copy.Center.x << ", "<< h_Node_copy.Center.y << ", "
                          << h_Node_copy.Center.z << ")" << std::endl;
                std::cout << "HalfWidth: [" << h_Node_copy.HalfWidth[0] << ", "
                          << h_Node_copy.HalfWidth[1] << ", "<< h_Node_copy.HalfWidth[2] << "]" << std::endl;
                std::cout << "ChildIndex: [";
                for (int i = 0; i < 8; i++) {
                    std::cout << h_Node_copy.ChildIndex[i];if (i < 7) std::cout << ", ";        }
                std::cout << "]" << std::endl;
                std::cout << "Level: " << h_Node_copy.Level << std::endl;
                std::cout << "TriangleCount: " << h_Node_copy.TriangleCount << std::endl;
                
                }