Function doesn't copy all the elements I want

I have this function:

// Copies the Edge array of each of the numNodes vertices from device memory to host memory.
//   d        - device array of numNodes Vertex structs (source)
//   h        - host array of numNodes Vertex structs (destination)
//   numNodes - number of Vertex entries in both arrays
// NOTE(review): only h[i].edges is ever assigned below. h[i].deg and h[i].nome are never
// copied out of tmp, so after the call they still hold whatever h contained beforehand.
// NOTE(review): the results of malloc and cudaMemcpy are not checked.
__host__ void copyArrayDeviceToHost(Vertex* d, Vertex* h, int numNodes){

    // Host staging copy of the device Vertex array; tmp[i].edges still holds device addresses.
    Vertex* tmp = (Vertex*)malloc(numNodes * sizeof(Vertex));
    cudaMemcpy(tmp, d, numNodes * sizeof(Vertex), cudaMemcpyDeviceToHost);

    for(int i = 0; i < numNodes; i++){
        // Allocate host storage for tmp[i].deg edges and fill it from the device edge array.
        h[i].edges = (Edge*)malloc(tmp[i].deg * sizeof(Edge));
        cudaMemcpy(h[i].edges, tmp[i].edges, tmp[i].deg * sizeof(Edge), cudaMemcpyDeviceToHost);
    }

    //for (int i = 0; i < numNodes; i++) {
    //    cudaFree(tmp[i].edges);
    //}
    // The staging copy is released here, taking the only host copy of deg/nome with it.
    free(tmp);
    cudaDeviceSynchronize();
}

I pass it my device variable `d`, which has all the right data in it, and call the function to copy all of that data into a host variable `h`. I have these structs:

// An edge, stored as a pair of integers.
typedef struct Edge {
    int start;  // presumably the index of the vertex the edge starts at — confirm
    int end;    // presumably the index of the vertex the edge ends at — confirm
} Edge;

// A vertex together with a pointer to its edge array.
typedef struct {
    int deg;      // number of Edge entries reachable through edges (used as the edge count when copying)
    int nome;     // integer label of the vertex ("nome" is presumably Italian for "name" — confirm)
    Edge *edges;  // array of deg edges; a device address in device copies, a host address in host copies
} Vertex;

So after the function runs, my host variable has all the information about the edges, but fields like `nome` are not updated and seem to contain random values (I am not sure whether they are actually random)…

Well, you copy your vertex struct data to `tmp` with `cudaMemcpy(tmp, d, numNodes * sizeof(Vertex), cudaMemcpyDeviceToHost);`

And you never put that data in `h`. Then you free `tmp` with `free(tmp);`

So I’m not surprised that your vertex struct data in h is messed up.

Do this instead:

// Deep-copies an array of Vertex structs from device memory to host memory.
//   d        - device array of numNodes Vertex structs (source)
//   h        - host array with room for numNodes Vertex structs (destination)
//   numNodes - number of Vertex entries to copy
// On return every h[i].edges is either NULL or a host buffer obtained from malloc
// (the caller releases it with free); it is never left holding a device address.
// Failure signalling (the function returns void):
//   - if the top-level copy fails, every h[i] is reset to deg = 0, edges = NULL;
//   - if the edge copy of one vertex fails, that vertex keeps its deg but gets
//     edges = NULL, so "deg > 0 && edges == NULL" marks a vertex that was not copied.
// The error code of a failed CUDA call can still be queried with cudaGetLastError().
__host__ void copyArrayDeviceToHost(Vertex* d, Vertex* h, int numNodes){

    // Nothing to do for missing buffers or an empty/negative count (a negative count
    // would otherwise be converted to a huge unsigned byte size).
    if (d == NULL || h == NULL || numNodes <= 0) return;

    // Shallow copy first: deg and nome become valid, edges still holds device addresses.
    if (cudaMemcpy(h, d, (size_t)numNodes * sizeof(Vertex), cudaMemcpyDeviceToHost) != cudaSuccess) {
        for (int i = 0; i < numNodes; i++) {
            h[i].deg = 0;
            h[i].edges = NULL;
        }
        return;
    }

    for(int i = 0; i < numNodes; i++){
        Edge* deviceEdges = h[i].edges;   // device address taken from the shallow copy
        h[i].edges = NULL;                // never hand a device address to host code

        if (h[i].deg <= 0 || deviceEdges == NULL) continue;

        size_t bytes = (size_t)h[i].deg * sizeof(Edge);
        Edge* tmp = (Edge*)malloc(bytes);
        if (tmp == NULL) continue;

        if (cudaMemcpy(tmp, deviceEdges, bytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
            free(tmp);
            continue;
        }
        h[i].edges = tmp;
    }

}

Here is a complete example using the code I previously provided from here:

# cat t149.cu
#include <cstdio>
const int num_e = 3; // number of edges stored per vertex in this example
const int num_v = 2; // number of vertices in this example

// An edge, stored as a pair of integers.
typedef struct Edge {
    int start;  // presumably the index of the vertex the edge starts at — confirm
    int end;    // presumably the index of the vertex the edge ends at — confirm
} Edge;

// A vertex together with a pointer to its edge array.
typedef struct {
    int deg;      // number of Edge entries reachable through edges (used as the edge count when copying)
    int nome;     // integer label of the vertex ("nome" is presumably Italian for "name" — confirm)
    Edge *edges;  // array of deg edges; a device address in device copies, a host address in host copies
} Vertex;
// Prints one line per (vertex, edge) pair: the vertex index, its deg and nome,
// the edge index and that edge's start/end values.
// Callable from both host and device code. The number of edges printed per vertex
// is num_edge, not the vertex's own deg field.
__host__ __device__ void print_data(Vertex *vert, int num_vertex, int num_edge){
  for (int vi = 0; vi < num_vertex; ++vi) {
    for (int ei = 0; ei < num_edge; ++ei) {
      const Vertex *cur = vert + vi;
      const Edge *edge = cur->edges + ei;
      printf("vertex: %d, deg: %d, nome: %d, edge: %d, start: %d, end: %d\n", vi, cur->deg, cur->nome, ei, edge->start, edge->end);
    }
  }
}
// Demo kernel: prints the vertex/edge data as seen from the device, then overwrites
// the end field of the second edge of the first vertex with 256.
// There is no thread indexing, so every launched thread performs the same work.
__global__ void k(Vertex *vert, int num_vertex, int num_edge){
  // dump the data before it is modified
  print_data(vert, num_vertex, num_edge);
  // change a single value so that a later copy back to the host is observable
  Edge *first_vertex_edges = vert->edges;
  (first_vertex_edges + 1)->end = 256;
}

// Deep-copies an array of Vertex structs from device memory to host memory.
//   d        - device array of numNodes Vertex structs (source)
//   h        - host array with room for numNodes Vertex structs (destination)
//   numNodes - number of Vertex entries to copy
// On return every h[i].edges is either NULL or a host buffer obtained from malloc
// (the caller releases it with free); it is never left holding a device address.
// Failures are reported on stderr and signalled through the data (the function returns void):
//   - if the top-level copy fails, every h[i] is reset to deg = 0, edges = NULL;
//   - if the edge copy of one vertex fails, that vertex keeps its deg but gets
//     edges = NULL, so "deg > 0 && edges == NULL" marks a vertex that was not copied.
__host__ void copyArrayDeviceToHost(Vertex* d, Vertex* h, int numNodes){

    // Nothing to do for missing buffers or an empty/negative count (a negative count
    // would otherwise be converted to a huge unsigned byte size).
    if (d == NULL || h == NULL || numNodes <= 0) return;

    // Shallow copy first: deg and nome become valid, edges still holds device addresses.
    cudaError_t err = cudaMemcpy(h, d, (size_t)numNodes * sizeof(Vertex), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "copyArrayDeviceToHost: vertex copy failed: %s\n", cudaGetErrorString(err));
        for (int i = 0; i < numNodes; i++) {
            h[i].deg = 0;
            h[i].edges = NULL;
        }
        return;
    }

    for(int i = 0; i < numNodes; i++){
        Edge* deviceEdges = h[i].edges;   // device address taken from the shallow copy
        h[i].edges = NULL;                // never hand a device address to host code

        if (h[i].deg <= 0 || deviceEdges == NULL) continue;

        size_t bytes = (size_t)h[i].deg * sizeof(Edge);
        Edge* tmp = (Edge*)malloc(bytes);
        if (tmp == NULL) {
            fprintf(stderr, "copyArrayDeviceToHost: out of host memory for vertex %d\n", i);
            continue;
        }

        err = cudaMemcpy(tmp, deviceEdges, bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "copyArrayDeviceToHost: edge copy failed for vertex %d: %s\n", i, cudaGetErrorString(err));
            free(tmp);
            continue;
        }
        h[i].edges = tmp;
    }
}

// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when err is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
  return false;
}

// Demo driver: builds num_v vertices with num_e edges each on the host, mirrors them
// on the device (including the nested edge arrays), runs kernel k, deep-copies the
// result back with copyArrayDeviceToHost and prints it.
// Returns 0 on success and 1 if any CUDA call or the copy back fails. On an early
// error return the allocations are left for process teardown to reclaim.
int main(){
  // set up host data
  Edge he[num_v][num_e] = {{{1,2}, {2,3}, {3,4}}, {{5,6}, {6,7}, {7,8}}};
  const int nomes[num_v] = {33, 35};
  Vertex hv[num_v];
  for (int i = 0; i < num_v; i++) {
    hv[i].deg = num_e;
    hv[i].nome = nomes[i];
    hv[i].edges = he[i];
  }
  // set up device data
  Vertex* d_initial_partition = NULL;
  // allocate top level pointer to vertex array
  if (!cuda_ok(cudaMalloc((void **)&d_initial_partition, num_v * sizeof(Vertex)), "allocating vertex array")) return 1;
  // copy top level array data (the edges fields still hold host addresses; they are patched below)
  if (!cuda_ok(cudaMemcpy(d_initial_partition, hv, num_v*sizeof(Vertex), cudaMemcpyHostToDevice), "copying vertex array to device")) return 1;
  Edge *d_e[num_v] = {}; // array of nested pointers for allocation on device
  for (int i = 0; i < num_v; i++) {
    // allocate nested pointer
    if (!cuda_ok(cudaMalloc(d_e+i, num_e*sizeof(Edge)), "allocating edge array")) return 1;
    // copy data to nested pointer
    if (!cuda_ok(cudaMemcpy(d_e[i], he[i], num_e*sizeof(Edge), cudaMemcpyHostToDevice), "copying edge array to device")) return 1;
    // copy nested pointer value to proper location in top level array
    // (only the address of the edges field is computed on the host; nothing is dereferenced)
    if (!cuda_ok(cudaMemcpy(&((d_initial_partition+i)->edges), d_e+i, sizeof(Edge *), cudaMemcpyHostToDevice), "patching edge pointer")) return 1;
  }
  k<<<1,1>>>(d_initial_partition, num_v, num_e);
  // a kernel launch returns no status itself: check the launch, then the execution
  if (!cuda_ok(cudaGetLastError(), "launching kernel k")) return 1;
  if (!cuda_ok(cudaDeviceSynchronize(), "running kernel k")) return 1;
  // copy data back
#if 0
  for (int i = 0; i < num_v; i++)
    cudaMemcpy(he[i], d_e[i], num_e*sizeof(Edge), cudaMemcpyDeviceToHost);
  cudaMemcpy(hv, d_initial_partition, num_v*sizeof(Vertex), cudaMemcpyDeviceToHost);
  // fix up pointers
  hv[0].edges = he[0];
  hv[1].edges = he[1];
#endif
  // value-initialised so that a failed copy leaves deg == 0 / edges == NULL rather than garbage
  Vertex *c = new Vertex[num_v]();
  copyArrayDeviceToHost(d_initial_partition, c, num_v);
  // print_data reads num_e edges per vertex, so make sure they are really there
  bool copied = true;
  for (int i = 0; i < num_v; i++)
    if (c[i].edges == NULL || c[i].deg < num_e) copied = false;
  if (copied)
    print_data(c, num_v, num_e);
  else
    fprintf(stderr, "copy back from device did not yield %d edges for every vertex\n", num_e);
  // release host copies (the edge arrays were allocated with malloc by copyArrayDeviceToHost)
  for (int i = 0; i < num_v; i++)
    free(c[i].edges);
  delete[] c;
  // release device memory
  for (int i = 0; i < num_v; i++)
    cudaFree(d_e[i]);
  cudaFree(d_initial_partition);
  return copied ? 0 : 1;
}


# nvcc -o t149 t149.cu
# compute-sanitizer ./t149
========= COMPUTE-SANITIZER
vertex: 0, deg: 3, nome: 33, edge: 0, start: 1, end: 2
vertex: 0, deg: 3, nome: 33, edge: 1, start: 2, end: 3
vertex: 0, deg: 3, nome: 33, edge: 2, start: 3, end: 4
vertex: 1, deg: 3, nome: 35, edge: 0, start: 5, end: 6
vertex: 1, deg: 3, nome: 35, edge: 1, start: 6, end: 7
vertex: 1, deg: 3, nome: 35, edge: 2, start: 7, end: 8
vertex: 0, deg: 3, nome: 33, edge: 0, start: 1, end: 2
vertex: 0, deg: 3, nome: 33, edge: 1, start: 2, end: 256
vertex: 0, deg: 3, nome: 33, edge: 2, start: 3, end: 4
vertex: 1, deg: 3, nome: 35, edge: 0, start: 5, end: 6
vertex: 1, deg: 3, nome: 35, edge: 1, start: 6, end: 7
vertex: 1, deg: 3, nome: 35, edge: 2, start: 7, end: 8
========= ERROR SUMMARY: 0 errors
#

Thank you so much, it is clear now.