cuMemcpyDtoHAsync error when using OpenACC directives

I am trying to compile this benchmark:

https://github.com/pathscale/rodinia/blob/master/openacc/bfs/bfs.cpp

I list my slightly modified version (I have added some print statements) at the end of the post. When I try to run the benchmark, I get the following error:

call to cuMemcpyDtoHAsync returned error 1: Invalid value

I have tried to run this through gdb to get some better idea where does the problem occur, and this is what I get:


(gdb)
183 }
(gdb) n
call to cuMemcpyDtoHAsync returned error 1: Invalid value

So it seems that there is at least one successful iteration through the kernels.

How can I go about debugging this? What information would you need to help me?

EDIT:

I was able to compile the code with the Cray compiler that is also available on my machine (I had to remove the reduction on the stop value because of compilation errors). The generated code fails in a similar way:

CC: craylibs/libcrayacc/acc_hw_nvidia.c:560 CRAY_ACC_ERROR - cuMemcpyHtoD returned CUDA_ERROR_INVALID_VALUE from bfs.cpp:139

This leads me to believe that perhaps the benchmark uses OpenACC incorrectly. Can anybody confirm this?

Thanks,
Marcin


Here is the code:

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>

#define TRANSFER_GRAPH_NODE 1

int no_of_nodes;
int edge_list_size;
FILE *fp;

//Structure to hold a node information
struct Node
{
        int starting;
        int no_of_edges;
};

void BFSGraph(int argc, char** argv);

void Usage(int argc, char**argv){

fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);

}
////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv) 
{
        no_of_nodes=0;
        edge_list_size=0;
        BFSGraph( argc, argv);
}



////////////////////////////////////////////////////////////////////////////////
//Apply BFS on a Graph using CUDA
////////////////////////////////////////////////////////////////////////////////
void BFSGraph( int argc, char** argv) 
{
    char *input_f;

    int* h_cost;
    int* h_graph_edges;

        if(argc!=2){
        Usage(argc, argv);
        exit(0);
        }
    
        input_f = argv[1];

        printf("Reading File\n");
        //Read in Graph from a file
        fp = fopen(input_f,"r");
        if(!fp)
        {
                printf("Error Reading graph file\n");
                return;
        }

        int source = 0;

        fscanf(fp,"%d",&no_of_nodes);

        printf("Number of nodes: %d\n", no_of_nodes);
   
        // allocate host memory
        Node* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
        bool *h_graph_mask = (bool*) malloc(sizeof(bool)*no_of_nodes);
        bool *h_updating_graph_mask = (bool*) malloc(sizeof(bool)*no_of_nodes);
        bool *h_graph_visited = (bool*) malloc(sizeof(bool)*no_of_nodes);

        printf("Reading nodes: ");
        int start, edgeno;   
        // initalize the memory
        for( unsigned int i = 0; i < no_of_nodes; i++) 
        {
                fscanf(fp,"%d %d",&start,&edgeno);
                h_graph_nodes[i].starting = start;
                h_graph_nodes[i].no_of_edges = edgeno;
                if(i % 100 == 0) printf(".");
        }
        printf("\n");

        //read the source node from the file
        fscanf(fp,"%d",&source);
        source=0;
        printf("Source vertex: %d\n", source);

#pragma acc data create(h_updating_graph_mask[0:no_of_nodes]) \
        create(h_graph_mask[0:no_of_nodes],h_graph_visited[0:no_of_nodes]) \
        create(h_graph_nodes[0:no_of_nodes], h_graph_edges[0:edge_list_size]) \
        copyout(h_cost[0:no_of_nodes])
{
  //printf("In the acc scope.\n");

        #pragma acc update device(h_graph_nodes[0:no_of_nodes]) async(TRANSFER_GRAPH_NODE)

        #pragma acc parallel loop
        for( unsigned int i = 0; i < no_of_nodes; i++)
        {
          //printf("Processing node %d", i);
                h_updating_graph_mask[i]=false;
                h_graph_mask[i]=false;
                h_graph_visited[i]=false;
        }

        #pragma acc kernels present(h_graph_mask[0:no_of_nodes],h_graph_visited[0:no_of_nodes])
        {
            //set the source node as true in the mask
            h_graph_mask[source]=true;
                h_graph_visited[source]=true;
        }

        fscanf(fp,"%d",&edge_list_size);

        int id,cost;
        h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
        for(int i=0; i < edge_list_size ; i++)
        {
                fscanf(fp,"%d",&id);
                fscanf(fp,"%d",&cost);
                h_graph_edges[i] = id;
        }

        if(fp)
                fclose(fp);    


        // allocate mem for the result on host side
        h_cost = (int*) malloc( sizeof(int)*no_of_nodes);
        #pragma acc parallel loop
        for(int i=0;i<no_of_nodes;i++) {
                h_cost[i]=-1;
                if(i == source) h_cost[source]=0;
        }

        // finish transfer node and edge to target
        #pragma acc update device(h_graph_edges[0:edge_list_size])
        #pragma acc wait(TRANSFER_GRAPH_NODE)

        printf("Start traversing the tree\n");

        int k=0;
    
        bool stop;
        do
        {
                //if no thread changes this value then the loop stops
                stop=false;

                #pragma acc parallel loop
                for(int tid = 0; tid < no_of_nodes; tid++ )
                {
                        if (h_graph_mask[tid] == true){ 
                        h_graph_mask[tid]=false;
                        for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++)
                                {
                                int id = h_graph_edges[i];
                                if(!h_graph_visited[id])
                                        {
                                        h_cost[id]=h_cost[tid]+1;
                                        h_updating_graph_mask[id]=true;
                                        }
                                }
                        }
                }

                #pragma acc parallel loop vector reduction(||:stop)
                for(int tid=0; tid< no_of_nodes ; tid++ )
                {
                        if (h_updating_graph_mask[tid] == true){
                        h_graph_mask[tid]=true;
                        h_graph_visited[tid]=true;
                        stop=true;
                        h_updating_graph_mask[tid]=false;
                        }
                }
                k++;
        }
        while(stop);

} /* end acc data */

        //Store the result into a file
        FILE *fpo = fopen("result.txt","w");
        for(int i=0;i<no_of_nodes;i++)
                fprintf(fpo,"%d) cost:%d\n",i,h_cost[i]);
        fclose(fpo);
        printf("Result stored in result.txt\n");


        // cleanup memory
        free( h_graph_nodes);
        free( h_graph_edges);
        free( h_graph_mask);
        free( h_updating_graph_mask);
        free( h_graph_visited);
        free( h_cost);

}

[/b]

Hi Marcin,

The benchmark does have a few errors. This particular error is due the use of an array in a data clause before it’s allocated. Worse, not only is “h_graph_edges” not allocated, the size that’s used, “edge_list_size”, isn’t initialized.

A second issue is that the OpenACC standard does not list a “bool” type as supported reduction variable. Though, I put in a RFE since adding “bool” may be something we can add as an extension (TPR#20375).

Note that the version of Rodinia that was used in this port is old. If you’re looking to port these to OpenACC, you may wish to get the latest version directly from the University of Virginia:
https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php/Main_Page
Here’s my update which fixes these issues:


% cat bfs.cpp
#include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include <stdlib.h>

 #define TRANSFER_GRAPH_NODE 1
 int no_of_nodes;
 int edge_list_size;
 FILE *fp;
 //Structure to hold a node information
 struct Node
 {
         int starting;
         int no_of_edges;
 };

 void BFSGraph(int argc, char** argv);

 void Usage(int argc, char**argv){

 fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);

 }
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
 int main( int argc, char** argv)
 {
         no_of_nodes=0;
         edge_list_size=0;
         BFSGraph( argc, argv);
 }



 ////////////////////////////////////////////////////////////////////////////////
 //Apply BFS on a Graph using CUDA
 ////////////////////////////////////////////////////////////////////////////////
 void BFSGraph( int argc, char** argv)
 {
     char *input_f;

     int* h_cost;
     int* h_graph_edges;

         if(argc!=2){
         Usage(argc, argv);
         exit(0);
         }

         input_f = argv[1];

         printf("Reading File\n");
         //Read in Graph from a file
         fp = fopen(input_f,"r");
         if(!fp)
         {
                 printf("Error Reading graph file\n");
                 return;
         }

         int source = 0;

         fscanf(fp,"%d",&no_of_nodes);

         printf("Number of nodes: %d\n", no_of_nodes);

         // allocate host memory
         Node* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
         bool *h_graph_mask = (bool*) malloc(sizeof(bool)*no_of_nodes);
         bool *h_updating_graph_mask = (bool*) malloc(sizeof(bool)*no_of_nodes);
         bool *h_graph_visited = (bool*) malloc(sizeof(bool)*no_of_nodes);
         // allocate mem for the result on host side
         h_cost = (int*) malloc( sizeof(int)*no_of_nodes);

         printf("Reading nodes: ");
         int start, edgeno;
         // initalize the memory
         for( unsigned int i = 0; i < no_of_nodes; i++)
         {
                 fscanf(fp,"%d %d",&start,&edgeno);
                 h_graph_nodes[i].starting = start;
                 h_graph_nodes[i].no_of_edges = edgeno;
//                 if(i % 100 == 0) printf(".");
         }
         printf("\n");

         //read the source node from the file
         fscanf(fp,"%d",&source);
         source=0;
         printf("Source vertex: %d\n", source);

 #pragma acc data create(h_updating_graph_mask[0:no_of_nodes]) \
         create(h_graph_mask[0:no_of_nodes],h_graph_visited[0:no_of_nodes]) \
         create(h_graph_nodes[0:no_of_nodes]) \
         copyout(h_cost[0:no_of_nodes])
 {
   //printf("In the acc scope.\n");

         #pragma acc update device(h_graph_nodes[0:no_of_nodes]) async(TRANSFER_GRAPH_NODE)

         #pragma acc parallel loop
         for( unsigned int i = 0; i < no_of_nodes; i++)
         {
           //printf("Processing node %d", i);
                 h_updating_graph_mask[i]=false;
                 h_graph_mask[i]=false;
                 h_graph_visited[i]=false;
         }

         #pragma acc kernels present(h_graph_mask[0:no_of_nodes],h_graph_visited[0:no_of_nodes])
         {
             //set the source node as true in the mask
             h_graph_mask[source]=true;
                 h_graph_visited[source]=true;
         }

         fscanf(fp,"%d",&edge_list_size);
         h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);

         int id,cost;
         for(int i=0; i < edge_list_size ; i++)
         {
                 fscanf(fp,"%d",&id);
                 fscanf(fp,"%d",&cost);
                 h_graph_edges[i] = id;
         }

         if(fp)
                 fclose(fp);


         #pragma acc parallel loop
         for(int i=0;i<no_of_nodes;i++) {
                 h_cost[i]=-1;
                 if(i == source) h_cost[source]=0;
         }

         // finish transfer node and edge to target
         #pragma acc enter data copyin(h_graph_edges[0:edge_list_size])

         #pragma acc wait(TRANSFER_GRAPH_NODE)

         printf("Start traversing the tree\n");

         int k=0;

         int stop;
         do
         {
                 //if no thread changes this value then the loop stops
                 stop=false;

                 #pragma acc parallel loop present(h_graph_edges[0:edge_list_size])
                 for(int tid = 0; tid < no_of_nodes; tid++ )
                 {
                         if (h_graph_mask[tid] == true){
                         h_graph_mask[tid]=false;
                         for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++)
                                 {
                                 int id = h_graph_edges[i];
                                 if(!h_graph_visited[id])
                                         {
                                         h_cost[id]=h_cost[tid]+1;
                                         h_updating_graph_mask[id]=true;
                                         }
                                 }
                         }
                 }

                 #pragma acc kernels loop independent vector reduction(||:stop)
                 for(int tid=0; tid< no_of_nodes ; tid++ )
                 {
                         if (h_updating_graph_mask[tid] == true){
                         h_graph_mask[tid]=true;
                         h_graph_visited[tid]=true;
                         stop=true;
                         h_updating_graph_mask[tid]=false;
                         }
                 }
                 k++;
         }
         while(stop);
 } /* end acc data */
         #pragma acc exit data delete(h_graph_edges[0:edge_list_size])

         //Store the result into a file
         FILE *fpo = fopen("result.txt","w");
         for(int i=0;i<no_of_nodes;i++)
                 fprintf(fpo,"%d) cost:%d\n",i,h_cost[i]);
         fclose(fpo);
         printf("Result stored in result.txt\n");


         // cleanup memory
         free( h_graph_nodes);
         free( h_graph_edges);
         free( h_graph_mask);
         free( h_updating_graph_mask);
         free( h_graph_visited);
         free( h_cost);

 }

! generate the CPU results
% pgcpp -fast -Msafeptr -V14.3 bfs.cpp
% a.out graph1MW_6.txt
Reading File
Number of nodes: 1000000
Reading nodes:
Source vertex: 0
Start traversing the tree
Result stored in result.txt
% cp result.txt resultCPU.txt

! run the OpenACC version
% pgcpp -fast -Msafeptr -V14.3 bfs.cpp -acc
% a.out graph1MW_6.txt
Reading File
Number of nodes: 1000000
Reading nodes:
Source vertex: 0
Start traversing the tree
Result stored in result.txt
% diff result.txt resultCPU.txt
%

Thanks!

I did download the newest version of Rodinia, but it does not include OpenACC benchmarks. Fortunately, the bfs benchmarks have not changed much, but there were fixes such as changing bool to char in OpenCL. Perhaps it is related to the issue you found.

Do you know of any other OpenACC BFS implementations?

Do you know of any other OpenACC BFS implementations?

Sorry, I don’t.

TPR 20375 - OpenACC: using “bool” as a reduction variable get wrong results
is fixe in the current 14.6 release.

Thanks,
dave