How to allocation in cudaMalloc...? Please help me...

Hi, all.
I am wondering about how to allocation double pointer to cuda.
My code is below.

------------this is structure—
typedef struct vertex vertex;

struct vertex {
unsigned int vertex_id;
float pagerank;
float pagerank_next;
unsigned int n_successors;
vertex ** successors;
};

------------these are vertics----
vertex * vertices;
vertices = (vertex *)malloc(n_vertices * sizeof(vertex));
vertex *d_vertices;
cudaMalloc((void **)&d_vertices, n_vertices * sizeof(vertex));

---------these are vertics’ succesors. It means double pointers----
vertices[i].successors = (vertex **)malloc(vertices[i].n_successors*sizeof(vertex *));
cudaMalloc((void ***)&d_vertices[i].successors, vertices[i].n_successors * sizeof(vertex *)); // error part.

When I allocate cudaMalloc((void ***)&d_vertices[i].successors, vertices[i].n_successors * sizeof(vertex *));
Visual Studio is stopped… I guess pointer error. But I don’t know how to double pointer allocation in cuda.
Please, let me know what’s wrong in that code.
Thank you so much.

best regards

CUDA is a language in the C++ family, so this works exactly the same as it works in regular C++.

But you need to consider that you are operating in two separate memory spaces: malloc() returns a pointer to host memory, which you can dereference in host code (and only in host code). cudaMalloc()returns a pointer to device memory, which you can dereference in device code (and only in device code).

I am not going to attempt to debug based on the code fragments shown, but typical error cases are the following, the first one of which is generic to C++ and the second specific to CUDA:

(1) Programmer loses track of the correct level of indirection (pointer vs pointer to pointer).
(2) Programmer loses track of the memory space a pointer belongs to (host vs device)

Dear. njuffa

I attached full lines.
Thank you for your help.

#include <stdio.h>
#include <stdlib.h>
// For the CUDA runtime routines (prefixed with “cuda_”)
#include <cuda_runtime.h>

typedef struct vertex vertex;

struct vertex {
unsigned int vertex_id;
float pagerank;
float pagerank_next;
unsigned int n_successors;
vertex ** successors;
};

float abs_float(float in) {
if (in >= 0)
return in;
else
return -in;
}

global void setPagerankNextToZero(vertex * vertices) {
int i = threadIdx.x;

vertices[i].pagerank_next = 0;

}

global void initializePageranks(vertex * vertices, int n_vertices) {
int i = threadIdx.x;

vertices[i].pagerank = 1.0 / (float)n_vertices;

}

global void addToNextPagerank(vertex * vertices, float * dangling_value) {
int i = threadIdx.x;
int j;

if (vertices[i].n_successors > 0) {
	for (j = 0; j < vertices[i].n_successors; j++) {
		atomicAdd(&(vertices[i].successors[j]->pagerank_next), 0.85*(vertices[i].pagerank) / vertices[i].n_successors);
	}
}
else {
	atomicAdd(dangling_value, 0.85*vertices[i].pagerank);
}

}

global void finalPagerankForIteration(vertex * vertices, int n_vertices, float dangling_value) {
int i = threadIdx.x;

vertices[i].pagerank_next += (dangling_value + (1 - 0.85)) / ((float)n_vertices);

}

global void setPageranksFromNext(vertex * vertices) {
int i = threadIdx.x;

vertices[i].pagerank = vertices[i].pagerank_next;

}

int main(void) {
size_t free_device_mem = 0;
size_t total_device_mem = 0;
cudaMemGetInfo(&free_device_mem, &total_device_mem);
printf(“total memory: %zu bytes\n”, total_device_mem);
printf(“gpu memory: %zu bytes\n”, free_device_mem);
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

/*************************************************************************/
// build up the graph
int i, j;
unsigned int n_vertices = 0;
unsigned int n_edges = 0;
unsigned int vertex_from = 0, vertex_to = 0;

vertex * vertices;

FILE * fp;
if ((fp = fopen("Stanford.txt", "r")) == NULL) {
	fprintf(stderr, "ERROR: Could not open input file.\n");
	exit(-1);
}

// parse input file to count the number of vertices
// expected format: vertex_from vertex_to
while (fscanf(fp, "%u %u", &vertex_from, &vertex_to) != EOF) {
	if (vertex_from > n_vertices)
		n_vertices = vertex_from;
	else if (vertex_to > n_vertices)
		n_vertices = vertex_to;
}
n_vertices++;

// allocate memory for vertices
vertices = (vertex *)malloc(n_vertices * sizeof(vertex));
//err = cudaMallocManaged((void **)&vertices, n_vertices * sizeof(vertex));
vertex *d_vertices;
err = cudaMalloc((void **)&d_vertices, n_vertices * sizeof(vertex));

cudaMemGetInfo(&free_device_mem, &total_device_mem);
printf("1- gpu memory: %zu bytes\n", free_device_mem);

// SET Initial values  **********************************************************
unsigned int n_iterations = 25;
float alpha = 0.85;
float eps = 0.000001;


if (!vertices) {
	fprintf(stderr, "Malloc failed for vertices.\n");
	exit(-1);
}
memset((void *)vertices, 0, (size_t)(n_vertices * sizeof(vertex)));
err = cudaMemcpy(d_vertices, vertices, n_vertices * sizeof(vertex), cudaMemcpyHostToDevice);

// parse input file to count the number of successors of each vertex
fseek(fp, 0L, SEEK_SET);
while (fscanf(fp, "%u %u", &vertex_from, &vertex_to) != EOF) {
	vertices[vertex_from].n_successors++;
	n_edges++;
}

// allocate memory for successor pointers
for (i = 0; i<n_vertices; i++) {
	vertices[i].vertex_id = i;
	if (vertices[i].n_successors > 0) {
		 vertices[i].successors = (vertex **)malloc(vertices[i].n_successors*sizeof(vertex *));
		 
		 
		 err = cudaMalloc((void **)&d_vertices[i].successors, vertices[i].n_successors * sizeof(vertex *));

		 printf("gpu memory: %zu bytes\n", free_device_mem);
		 printf("allocation: %d\n", vertices[i].n_successors * sizeof(vertex*));
		//err = cudaMallocManaged((void***)&vertices[i].successors, vertices[i].n_successors * sizeof(vertex*));
		if (!vertices[i].successors) {
			fprintf(stderr, "Malloc failed for successors of vertex %d.\n", i);
			exit(-1);
		}
		memset((void *)vertices[i].successors, 0, (size_t)(vertices[i].n_successors * sizeof(vertex *)));
		//err = cudaMemcpy(d_vertices[i].successors, vertices[i].successors, vertices[i].n_successors * sizeof(vertex *), cudaMemcpyHostToDevice);
	}
	else {
		vertices[i].successors = NULL;
		//err = cudaMalloc((void **)&d_vertices[i].successors, NULL);
	}
}

cudaMemGetInfo(&free_device_mem, &total_device_mem);
printf("2- gpu memory: %zu bytes\n", free_device_mem);

// parse input file to set up the successor pointers
fseek(fp, 0L, SEEK_SET);
while (fscanf(fp, "%d %d", &vertex_from, &vertex_to) != EOF) {
	for (i = 0; i<vertices[vertex_from].n_successors; i++) {
		if (vertices[vertex_from].successors[i] == NULL) {
			vertices[vertex_from].successors[i] = &vertices[vertex_to];
			break;
		}
		else if (i == vertices[vertex_from].n_successors - 1) {
			printf("Setting up the successor pointers of virtex %u failed", vertex_from);
			return -1;
		}
	}
}

fclose(fp);

/*************************************************************************/
// compute the pagerank on the GPU

float dangling_value_h = 0;
float * dangling_value_d;
int threadsPerBlock = 1024;
int blocksPerGrid = (n_vertices + threadsPerBlock - 1) / threadsPerBlock;

err = cudaMalloc((void **)&dangling_value_d, sizeof(float));
err = cudaMemcpy(dangling_value_d, &dangling_value_h, sizeof(float), cudaMemcpyHostToDevice);
//err = cudaMallocManaged((void *)&dangling_value, sizeof(float));

initializePageranks << <blocksPerGrid, threadsPerBlock >> >(vertices, n_vertices);
cudaDeviceSynchronize();

for (i = 0; i < 23; i++) {
	// set the next pagerank values to 0
	setPagerankNextToZero << <blocksPerGrid, threadsPerBlock >> >(vertices);
	cudaDeviceSynchronize();

	// set the dangling value to 0 
	dangling_value_h = 0;
	err = cudaMemcpy(dangling_value_d, &dangling_value_h, sizeof(float), cudaMemcpyHostToDevice);

	// initial parallel pagerank_next computation
	addToNextPagerank << <blocksPerGrid, threadsPerBlock >> >(vertices, dangling_value_d);
	cudaDeviceSynchronize();

	// get the dangling value
	err = cudaMemcpy(&dangling_value_h, dangling_value_d, sizeof(float), cudaMemcpyDeviceToHost);
	//printf("the dangling_value is now: %.3f\n", dangling_value_h);

	// final parallel pagerank_next computation
	finalPagerankForIteration << <blocksPerGrid, threadsPerBlock >> >(vertices, n_vertices, dangling_value_h);
	cudaDeviceSynchronize();

	setPageranksFromNext << <blocksPerGrid, threadsPerBlock >> >(vertices);
	cudaDeviceSynchronize();
}


// print the pagerank values computed on the GPU
double sum2 = 0.0f;
// print the pagerank values computed on the GPU
for (i = 0; i<n_vertices; i++) {
	sum2 += vertices[i].pagerank;
}
printf("GPU total PR: %.8f\n", sum2);

/*****************************************************************************************/
// Compute pagerank on host using old method for comparison purposes
unsigned int i_iteration;

float value, diff;
float pr_dangling_factor = alpha / (float)n_vertices;   // pagerank to redistribute from dangling nodes
float pr_dangling;
float pr_random_factor = (1 - alpha) / (float)n_vertices; // random portion of the pagerank
float pr_random;
float pr_sum, pr_sum_inv, pr_sum_dangling;
float temp;

// initialization of values before pagerank loop
for (i = 0; i<n_vertices; i++) {
	vertices[i].pagerank = 1 / (float)n_vertices;
	vertices[i].pagerank_next = 0;
}

pr_sum = 0;
pr_sum_dangling = 0;
for (i = 0; i<n_vertices; i++) {
	pr_sum += vertices[i].pagerank;
	if (!vertices[i].n_successors)
		pr_sum_dangling += vertices[i].pagerank;
}

i_iteration = 0;
diff = eps + 1;
 
while (i_iteration<20) {

	for (i = 0; i<n_vertices; i++) {
		if (vertices[i].n_successors)
			value = (alpha / vertices[i].n_successors)*vertices[i].pagerank;  //value = vote value after splitting equally
		else
			value = 0;
		//printf("vertex %d: value = %.6f \n",i,value);
		for (j = 0; j<vertices[i].n_successors; j++) {               // pagerank_next = sum of votes linking to it
			vertices[i].successors[j]->pagerank_next += value;
		}
	}

	// for normalization
	pr_sum_inv = 1 / pr_sum;

	// alpha
	pr_dangling = pr_dangling_factor * pr_sum_dangling;
	pr_random = pr_random_factor * pr_sum;

	pr_sum = 0;
	pr_sum_dangling = 0;

	diff = 0;
	for (i = 0; i<n_vertices; i++) {
		// update pagerank
		temp = vertices[i].pagerank;
		vertices[i].pagerank = vertices[i].pagerank_next*pr_sum_inv + pr_dangling + pr_random;
		vertices[i].pagerank_next = 0;

		// for normalization in next cycle
		pr_sum += vertices[i].pagerank;
		if (!vertices[i].n_successors)
			pr_sum_dangling += vertices[i].pagerank;

		// convergence
		diff += abs_float(temp - vertices[i].pagerank);
	}
	//printf("Iteration %u:\t diff = %.12f\n", i_iteration, diff);

	i_iteration++;
}

printf("CPU total PR: %.8f\n", pr_sum);

err = cudaDeviceReset();

if (err != cudaSuccess)
{
	fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
	exit(EXIT_FAILURE);
}

cudaFree(dangling_value_d);
cudaFree(d_vertices);
free(vertices);

printf("Done\n");
return 0;

}

I guess I expressed myself in a misleading manner. I wasn’t trying to imply that I would debug your code if only you posted the entire thing. I find debugging my own code annoying enough.

oh… I am so sorry…
Thank you for your help.

Have an awesome day!

This question is asked quite frequently. In your cross posting, you were given a link for a question which is similar:

https://stackoverflow.com/questions/48258088/how-to-allocation-in-cudamalloc

There are many other examples available, such as here:

https://stackoverflow.com/questions/45643682/cuda-using-2d-and-3d-arrays/45644824#45644824

and here:

https://stackoverflow.com/questions/16024087/copy-an-object-to-device

and here:

https://stackoverflow.com/questions/15431365/cudamemcpy-segmentation-fault/15435592#15435592

Dear txbob

That’s my question in stackoverflow… Thank you…
I already read that all. but I can’t understand…

I think a little bit different case.
I referenced this code.

NLayer* nL;
NLayer h_nL;
cudaMalloc((void**)&nL, sizeof(NLayer));
cudaMalloc((void**)&h_nL.neurons, 6*sizeof(Neuron));

but there is double pointer in my code. (Like a adjacency list.)
that code is structure’s structure…

I don’t understand how to allocate from C malloc double pointer to CUDA malloc. (I know the how to allocate C single pointer.)

please, tell me the example…
Thank you for your help.
Thank you so much.

This tells you exactly how to allocate a double pointer:

https://stackoverflow.com/questions/6137218/how-can-i-add-up-two-2d-pitched-arrays-using-nested-for-loops

This tells you exactly how to allocate a pointer to an array of structures, where each structure contains an embedded pointer:

https://stackoverflow.com/questions/15431365/cudamemcpy-segmentation-fault/15435592#15435592