Hello,
I coded two functions one to allocate data on the gpu and one to copy the data back to the cpu and print. However I get the error:
CUDA error in printOctreeNodeFromGPU at /home/3484681/Área de Trabalho/PRISMAMESH/PRISMAMESH/CudaOctreeAux.cu:174: invalid argument
When I try to copy from the device back to the host. The error is in this line:
CUDA_CHECK(cudaMemcpy(&h_Node_copy.index, &Node->index, sizeof(int64_t), cudaMemcpyDeviceToHost));
Thank you in addvance,
Rafael Scatena
The structs are:
struct OctreeNode {int64_t index; Point Center; double HalfWidth[3]; int64_t ChildIndex[8]; int Level; int TriangleCount; double* p1x; double* p1y; double* p1z; double* p2x; double* p2y; double* p2z; double* p3x; double* p3y; double* p3z; int* Material; int* Body; int64_t* triangle_id;
+ cpu constructor
struct OctreeNodeGPU {int64_t index; Point Center; double HalfWidth[3]; int64_t ChildIndex[8]; int Level; int TriangleCount; double* p1x; double* p1y; double* p1z; double* p2x; double* p2y; double* p2z; double* p3x; double* p3y; double* p3z; int* Material; int* Body; int64_t* triangle_id;
// Constructor: Initialize all pointers to nullptr
__host__ __device__ OctreeNodeGPU() {
index = 0; Center = {0,0,0};
for (int i = 0; i < 3; ++i) {HalfWidth[i] = 0;}
for (int i = 0; i < 8; ++i) {ChildIndex[i] = -1;
TriangleCount = 0; Level = 0;}
p1x = nullptr; p1y = nullptr; p1z = nullptr;
p2x = nullptr; p2y = nullptr; p2z = nullptr;
p3x = nullptr; p3y = nullptr; p3z = nullptr;
Material = nullptr; Body = nullptr; triangle_id = nullptr;}
// Destructor: free allocated memory (on host side)
__host__ __device__ ~OctreeNodeGPU() {
if (p1x) cudaFree(p1x); if (p1y) cudaFree(p1y); if (p1z) cudaFree(p1z);
if (p2x) cudaFree(p2x); if (p2y) cudaFree(p2y); if (p2z) cudaFree(p2z);
if (p3x) cudaFree(p3x); if (p3y) cudaFree(p3y); if (p3z) cudaFree(p3z);
if (Material) cudaFree(Material); if (Body) cudaFree(Body); if (triangle_id) cudaFree(triangle_id);
}
};
the main code:
OctreeNodeGPU* Node;
copyNodeToGPU(h_Node, Node);
printOctreeNodeFromGPU(Node);
the functions are:
__host__ void copyNodeToGPU(const OctreeNode& cpuNode,OctreeNodeGPU* Node) {
CUDA_CHECK(cudaMalloc((void**)&Node, sizeof(OctreeNodeGPU)));
if(Node)std::cerr << "Node Parsed for Copy" << std::endl;
else{}
std::cerr << "Static Allocated" << std::endl;
CUDA_CHECK(cudaMemcpy(&Node->index, &cpuNode.index, sizeof(int64_t), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&Node->Center, &cpuNode.Center, sizeof(Point), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->HalfWidth, cpuNode.HalfWidth, 3*sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->ChildIndex, cpuNode.ChildIndex, 8*sizeof(int64_t), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&Node->Level, &cpuNode.Level, sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&Node->TriangleCount, &cpuNode.TriangleCount, sizeof(int), cudaMemcpyHostToDevice));
// Allocate memory on GPU and copy each dynamic array
if (cpuNode.TriangleCount > 0) {
CUDA_CHECK(cudaMalloc((void**)&Node->p1x, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p1y, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p1z, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p2x, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p2y, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p2z, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p3x, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p3y, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->p3z, cpuNode.TriangleCount * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&Node->Material, cpuNode.TriangleCount * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&Node->Body, cpuNode.TriangleCount * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&Node->triangle_id, cpuNode.TriangleCount * sizeof(int64_t)));
// Copy data from CPU to GPU
CUDA_CHECK(cudaMemcpy(Node->p1x, cpuNode.p1x, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p1y, cpuNode.p1y, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p1z, cpuNode.p1z, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p2x, cpuNode.p2x, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p2y, cpuNode.p2y, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p2z, cpuNode.p2z, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p3x, cpuNode.p3x, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p3y, cpuNode.p3y, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->p3z, cpuNode.p3z, cpuNode.TriangleCount * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->Material, cpuNode.Material, cpuNode.TriangleCount * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->Body, cpuNode.Body, cpuNode.TriangleCount * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(Node->triangle_id, cpuNode.triangle_id, cpuNode.TriangleCount * sizeof(int64_t), cudaMemcpyHostToDevice));
}
}
and the function to copy the information back to the cpu:
void printOctreeNodeFromGPU(OctreeNodeGPU* Node) {
Point a = {0,0,0}; double b[3] = {0,0,0};
OctreeNode h_Node_copy(a, b, 0,0, 0);
//OctreeNode h_Node_copy;
// Copy the node from the GPU to the host
CUDA_CHECK(cudaMemcpy(&h_Node_copy.index, &Node->index, sizeof(int64_t), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(&h_Node_copy.Center, &Node->Center, sizeof(Point), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(&h_Node_copy.HalfWidth, &Node->HalfWidth, 3*sizeof(double), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(&h_Node_copy.ChildIndex, &Node->ChildIndex, 8* sizeof(int64_t), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(&h_Node_copy.Level, &Node->Level, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(&h_Node_copy.TriangleCount, &Node->TriangleCount, sizeof(int), cudaMemcpyDeviceToHost));
// Print the non-dynamically allocated members
std::cout << "index: " << h_Node_copy.index << std::endl;
std::cout << "Center: (" << h_Node_copy.Center.x << ", "<< h_Node_copy.Center.y << ", "
<< h_Node_copy.Center.z << ")" << std::endl;
std::cout << "HalfWidth: [" << h_Node_copy.HalfWidth[0] << ", "
<< h_Node_copy.HalfWidth[1] << ", "<< h_Node_copy.HalfWidth[2] << "]" << std::endl;
std::cout << "ChildIndex: [";
for (int i = 0; i < 8; i++) {
std::cout << h_Node_copy.ChildIndex[i];if (i < 7) std::cout << ", "; }
std::cout << "]" << std::endl;
std::cout << "Level: " << h_Node_copy.Level << std::endl;
std::cout << "TriangleCount: " << h_Node_copy.TriangleCount << std::endl;
}