How to create a simple event big counter (equal to long int counter) constructed from 2 int numbers properly working in parallel on GPU using OpenAcc?

In my full code there are many dozens billions of events which I need to count. Since the maximum value of int is less than 2.5e9 (less than 4.5e9 for unsigned int), I face the trouble of the overflow of a simple int counter variable.
As far as I know, atomic operations on long int types are not supported on GPU in OpenAcc yet (PGI C++ compiler pgc++ 19.10). Is it true?
So, I tried to implement a simple concept of a big number counter: there are 2 int numbers n and N; when n becomes equal to MAX_INT, it is set to zero and N is incremented. Finally, the total number of events is calculated as N * MAX_INT+n. The idea is simple.
I created a simple reproducing piece of code:

#include <iostream>
#include <fstream>
#include <limits>
#include <chrono>
#include <accelmath.h>
#include <openacc.h>

using namespace std;
const long int N=10000000000l;

class Counter {
public:
  Counter():n(0),N(0),X(0)
  {
    cout<<"INT_MAX="<<INT_MAX<<endl;
  }
#pragma acc routine seq
  void operator()()
  {
    if(n==INT_MAX)
    {
#pragma acc atomic update
      ++N;
#pragma acc atomic write
      n=1;
    }
    else
    {
 #pragma acc atomic update
      ++n;
    }
   //#pragma acc atomic update
   //++X; 
  }
  long int GetTotalCount()
  {
    return (static_cast<long int>(n)+static_cast<long int>(N)*static_cast<long int>(INT_MAX));
  }
  friend std::ostream & operator<<(std::ostream & o, Counter & counter)
  {
    o<<counter.GetTotalCount()<<" ("<<counter.N<<","<<counter.n<<") = "
     <<static_cast<long int>(counter.N)*static_cast<long int>(counter.INT_MAX)+static_cast<long int> 
    (counter.n)
     <<" X="<<counter.X<<endl;
     return o;
  }
private:
  const int INT_MAX=std::numeric_limits<int>::max();
  int n;
  int N;
  long int X;
};

int main(int argc, char **argv)
{
  const auto begin = std::chrono::steady_clock::now();
  Counter counter;
#pragma acc parallel loop gang vector copy(counter)
  for(long int i=0; i<N; ++i)
  {
    if(i%2==0) counter();
  }
  const auto end = std::chrono::steady_clock::now();
  cout<<"N="<<N<<std::endl;
  cout<<"Total count="<<counter<<endl;
  auto time = std::chrono::duration_cast<std::chrono::milliseconds>(end-begin).count();
  cout<<"t="<<time<<" ms"<<endl;
  return 0;
}

I use the compiler flags:

-mcmodel=medium -ta=tesla:cc30 -Minline

But the code works improperly (gives incorrect results). I realize that there is race condition in

 if(n==INT_MAX){...}

But I don’t know how to treat this race condition properly.
As I understood, I can implement some big number class that supports initialization from int and operator+ on itself or, If that doesn’t work, use declare reduction approach on CPU.

When i uncomment the 2 commented lines

 #pragma acc atomic update
 ++X;

the program is frozen (awfully hangs and does not give any result). This is because there is no support for long int type in atomic operations on GPU yet, isn’ it?

You’re probably running into some general issues we were having with C++ and atomics rather than anything specific to using longs. These issues were addressed in the NVHPC 20.7 release. Though NVIDIA discontinued support for CC30 devices, so the new NV HPC Compilers don’t support CC30 which may be an issue for you unless you can upgrade your device.

% cat test2.cpp
#include <iostream>
#include <fstream>
#include <limits>
#include <chrono>
#include <accelmath.h>
#include <openacc.h>

using namespace std;
const long int N=10000000000l;

class Counter {
public:
  Counter():n(0)
  {
    cout<<"INT_MAX="<<INT_MAX<<endl;
  }
#pragma acc routine seq
  void operator()()
  {
#pragma acc atomic update
      ++n;
  }
  long int GetTotalCount()
  {
    return n;
  }
  friend std::ostream & operator<<(std::ostream & o, Counter & counter)
  {
    o<<counter.GetTotalCount()
     <<endl;
     return o;
  }
private:
  const int INT_MAX=std::numeric_limits<int>::max();
  long n;
};

int main(int argc, char **argv)
{
  const auto begin = std::chrono::steady_clock::now();
  Counter counter;
#pragma acc parallel loop gang vector copy(counter)
  for(long int i=0; i<N; ++i)
  {
    if(i%2==0) counter();
  }
  const auto end = std::chrono::steady_clock::now();
  cout<<"N="<<N<<std::endl;
  cout<<"Total count="<<counter<<endl;
  auto time = std::chrono::duration_cast<std::chrono::milliseconds>(end-begin).count();
  cout<<"t="<<time<<" ms"<<endl;
  return 0;
}
   % nvc++ -acc test2.cpp -fast -Minfo=accel -V20.9 ; a.out
    main:
         41, Generating copy(counter) [if not already present]
             Generating Tesla code
             43, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    Counter::operator ()():
         19, Generating acc routine seq
             Generating Tesla code
    INT_MAX=2147483647
    N=10000000000
    Total count=5000000000

    t=4054 ms