Dear. njuffa
I attached full lines.
Thank you for your help.
#include <stdio.h>
#include <stdlib.h>
// For the CUDA runtime routines (prefixed with “cuda_”)
#include <cuda_runtime.h>
typedef struct vertex vertex;
struct vertex {
unsigned int vertex_id;
float pagerank;
float pagerank_next;
unsigned int n_successors;
vertex ** successors;
};
float abs_float(float in) {
if (in >= 0)
return in;
else
return -in;
}
global void setPagerankNextToZero(vertex * vertices) {
int i = threadIdx.x;
vertices[i].pagerank_next = 0;
}
global void initializePageranks(vertex * vertices, int n_vertices) {
int i = threadIdx.x;
vertices[i].pagerank = 1.0 / (float)n_vertices;
}
global void addToNextPagerank(vertex * vertices, float * dangling_value) {
int i = threadIdx.x;
int j;
if (vertices[i].n_successors > 0) {
for (j = 0; j < vertices[i].n_successors; j++) {
atomicAdd(&(vertices[i].successors[j]->pagerank_next), 0.85*(vertices[i].pagerank) / vertices[i].n_successors);
}
}
else {
atomicAdd(dangling_value, 0.85*vertices[i].pagerank);
}
}
global void finalPagerankForIteration(vertex * vertices, int n_vertices, float dangling_value) {
int i = threadIdx.x;
vertices[i].pagerank_next += (dangling_value + (1 - 0.85)) / ((float)n_vertices);
}
global void setPageranksFromNext(vertex * vertices) {
int i = threadIdx.x;
vertices[i].pagerank = vertices[i].pagerank_next;
}
int main(void) {
size_t free_device_mem = 0;
size_t total_device_mem = 0;
cudaMemGetInfo(&free_device_mem, &total_device_mem);
printf(“total memory: %zu bytes\n”, total_device_mem);
printf(“gpu memory: %zu bytes\n”, free_device_mem);
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
/*************************************************************************/
// build up the graph
int i, j;
unsigned int n_vertices = 0;
unsigned int n_edges = 0;
unsigned int vertex_from = 0, vertex_to = 0;
vertex * vertices;
FILE * fp;
if ((fp = fopen("Stanford.txt", "r")) == NULL) {
fprintf(stderr, "ERROR: Could not open input file.\n");
exit(-1);
}
// parse input file to count the number of vertices
// expected format: vertex_from vertex_to
while (fscanf(fp, "%u %u", &vertex_from, &vertex_to) != EOF) {
if (vertex_from > n_vertices)
n_vertices = vertex_from;
else if (vertex_to > n_vertices)
n_vertices = vertex_to;
}
n_vertices++;
// allocate memory for vertices
vertices = (vertex *)malloc(n_vertices * sizeof(vertex));
//err = cudaMallocManaged((void **)&vertices, n_vertices * sizeof(vertex));
vertex *d_vertices;
err = cudaMalloc((void **)&d_vertices, n_vertices * sizeof(vertex));
cudaMemGetInfo(&free_device_mem, &total_device_mem);
printf("1- gpu memory: %zu bytes\n", free_device_mem);
// SET Initial values **********************************************************
unsigned int n_iterations = 25;
float alpha = 0.85;
float eps = 0.000001;
if (!vertices) {
fprintf(stderr, "Malloc failed for vertices.\n");
exit(-1);
}
memset((void *)vertices, 0, (size_t)(n_vertices * sizeof(vertex)));
err = cudaMemcpy(d_vertices, vertices, n_vertices * sizeof(vertex), cudaMemcpyHostToDevice);
// parse input file to count the number of successors of each vertex
fseek(fp, 0L, SEEK_SET);
while (fscanf(fp, "%u %u", &vertex_from, &vertex_to) != EOF) {
vertices[vertex_from].n_successors++;
n_edges++;
}
// allocate memory for successor pointers
for (i = 0; i<n_vertices; i++) {
vertices[i].vertex_id = i;
if (vertices[i].n_successors > 0) {
vertices[i].successors = (vertex **)malloc(vertices[i].n_successors*sizeof(vertex *));
err = cudaMalloc((void **)&d_vertices[i].successors, vertices[i].n_successors * sizeof(vertex *));
printf("gpu memory: %zu bytes\n", free_device_mem);
printf("allocation: %d\n", vertices[i].n_successors * sizeof(vertex*));
//err = cudaMallocManaged((void***)&vertices[i].successors, vertices[i].n_successors * sizeof(vertex*));
if (!vertices[i].successors) {
fprintf(stderr, "Malloc failed for successors of vertex %d.\n", i);
exit(-1);
}
memset((void *)vertices[i].successors, 0, (size_t)(vertices[i].n_successors * sizeof(vertex *)));
//err = cudaMemcpy(d_vertices[i].successors, vertices[i].successors, vertices[i].n_successors * sizeof(vertex *), cudaMemcpyHostToDevice);
}
else {
vertices[i].successors = NULL;
//err = cudaMalloc((void **)&d_vertices[i].successors, NULL);
}
}
cudaMemGetInfo(&free_device_mem, &total_device_mem);
printf("2- gpu memory: %zu bytes\n", free_device_mem);
// parse input file to set up the successor pointers
fseek(fp, 0L, SEEK_SET);
while (fscanf(fp, "%d %d", &vertex_from, &vertex_to) != EOF) {
for (i = 0; i<vertices[vertex_from].n_successors; i++) {
if (vertices[vertex_from].successors[i] == NULL) {
vertices[vertex_from].successors[i] = &vertices[vertex_to];
break;
}
else if (i == vertices[vertex_from].n_successors - 1) {
printf("Setting up the successor pointers of virtex %u failed", vertex_from);
return -1;
}
}
}
fclose(fp);
/*************************************************************************/
// compute the pagerank on the GPU
float dangling_value_h = 0;
float * dangling_value_d;
int threadsPerBlock = 1024;
int blocksPerGrid = (n_vertices + threadsPerBlock - 1) / threadsPerBlock;
err = cudaMalloc((void **)&dangling_value_d, sizeof(float));
err = cudaMemcpy(dangling_value_d, &dangling_value_h, sizeof(float), cudaMemcpyHostToDevice);
//err = cudaMallocManaged((void *)&dangling_value, sizeof(float));
initializePageranks << <blocksPerGrid, threadsPerBlock >> >(vertices, n_vertices);
cudaDeviceSynchronize();
for (i = 0; i < 23; i++) {
// set the next pagerank values to 0
setPagerankNextToZero << <blocksPerGrid, threadsPerBlock >> >(vertices);
cudaDeviceSynchronize();
// set the dangling value to 0
dangling_value_h = 0;
err = cudaMemcpy(dangling_value_d, &dangling_value_h, sizeof(float), cudaMemcpyHostToDevice);
// initial parallel pagerank_next computation
addToNextPagerank << <blocksPerGrid, threadsPerBlock >> >(vertices, dangling_value_d);
cudaDeviceSynchronize();
// get the dangling value
err = cudaMemcpy(&dangling_value_h, dangling_value_d, sizeof(float), cudaMemcpyDeviceToHost);
//printf("the dangling_value is now: %.3f\n", dangling_value_h);
// final parallel pagerank_next computation
finalPagerankForIteration << <blocksPerGrid, threadsPerBlock >> >(vertices, n_vertices, dangling_value_h);
cudaDeviceSynchronize();
setPageranksFromNext << <blocksPerGrid, threadsPerBlock >> >(vertices);
cudaDeviceSynchronize();
}
// print the pagerank values computed on the GPU
double sum2 = 0.0f;
// print the pagerank values computed on the GPU
for (i = 0; i<n_vertices; i++) {
sum2 += vertices[i].pagerank;
}
printf("GPU total PR: %.8f\n", sum2);
/*****************************************************************************************/
// Compute pagerank on host using old method for comparison purposes
unsigned int i_iteration;
float value, diff;
float pr_dangling_factor = alpha / (float)n_vertices; // pagerank to redistribute from dangling nodes
float pr_dangling;
float pr_random_factor = (1 - alpha) / (float)n_vertices; // random portion of the pagerank
float pr_random;
float pr_sum, pr_sum_inv, pr_sum_dangling;
float temp;
// initialization of values before pagerank loop
for (i = 0; i<n_vertices; i++) {
vertices[i].pagerank = 1 / (float)n_vertices;
vertices[i].pagerank_next = 0;
}
pr_sum = 0;
pr_sum_dangling = 0;
for (i = 0; i<n_vertices; i++) {
pr_sum += vertices[i].pagerank;
if (!vertices[i].n_successors)
pr_sum_dangling += vertices[i].pagerank;
}
i_iteration = 0;
diff = eps + 1;
while (i_iteration<20) {
for (i = 0; i<n_vertices; i++) {
if (vertices[i].n_successors)
value = (alpha / vertices[i].n_successors)*vertices[i].pagerank; //value = vote value after splitting equally
else
value = 0;
//printf("vertex %d: value = %.6f \n",i,value);
for (j = 0; j<vertices[i].n_successors; j++) { // pagerank_next = sum of votes linking to it
vertices[i].successors[j]->pagerank_next += value;
}
}
// for normalization
pr_sum_inv = 1 / pr_sum;
// alpha
pr_dangling = pr_dangling_factor * pr_sum_dangling;
pr_random = pr_random_factor * pr_sum;
pr_sum = 0;
pr_sum_dangling = 0;
diff = 0;
for (i = 0; i<n_vertices; i++) {
// update pagerank
temp = vertices[i].pagerank;
vertices[i].pagerank = vertices[i].pagerank_next*pr_sum_inv + pr_dangling + pr_random;
vertices[i].pagerank_next = 0;
// for normalization in next cycle
pr_sum += vertices[i].pagerank;
if (!vertices[i].n_successors)
pr_sum_dangling += vertices[i].pagerank;
// convergence
diff += abs_float(temp - vertices[i].pagerank);
}
//printf("Iteration %u:\t diff = %.12f\n", i_iteration, diff);
i_iteration++;
}
printf("CPU total PR: %.8f\n", pr_sum);
err = cudaDeviceReset();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaFree(dangling_value_d);
cudaFree(d_vertices);
free(vertices);
printf("Done\n");
return 0;
}