I am summing i from 0 to n-1. I’m testing i and n being either int or long int, and the loop either is or is not inside an acc parallel loop. Three of the four combinations work; however, the combination of long int and parallel crashes.
#include <iostream>
using std::cout;
using std::endl;
// Reproducer: sums i over 0 <= i < n four ways (int vs. long int loop index,
// sequential loop vs. OpenACC `parallel loop` with a `+` reduction) and prints
// the accumulated value next to the closed-form reference for comparison.
//
// The closed form of 0 + 1 + ... + (n-1) is n*(n-1)/2. It is evaluated in
// double (note the `1.0`) so the product cannot overflow the integer type.
// Both sums exceed 2^53, so the double accumulator is only approximate; the
// default stream precision (6 significant digits) hides that difference.
//
// std::endl is used deliberately on the progress lines: it flushes, so the
// output already printed is not lost if a later case aborts the process.
int main() {
    std::cout << "[Starting]\n";

    // Case 1: int index, sequential loop.
    {
        const int n = 2'000'000'000;
        std::cout << "n: " << n << std::endl;
        double correct = n*(n-1.0)*0.5;  // sum of 0..n-1
        double computed=0;
        for (int i=0; i<n; i++) computed +=i;
        std::cout << "int n, sequential. correct: " << correct << ", computed: " << computed << std::endl;
    }

    // Case 2: int index, OpenACC parallel reduction.
    {
        const int n = 2'000'000'000;
        std::cout << "n: " << n << std::endl;
        double correct = n*(n-1.0)*0.5;  // sum of 0..n-1
        double computed=0;
        #pragma acc parallel loop reduction(+:computed)
        for (int i=0; i<n; i++) computed +=i;
        std::cout << "int n, parallel. correct: " << correct << ", computed: " << computed << std::endl;
    }

    // Case 3: long int index, sequential loop.
    // NOTE(review): 20'000'000'000 needs a 64-bit `long int`; that holds on
    // LP64 Linux but not on platforms where long is 32 bits.
    {
        const long int n = 20'000'000'000;
        std::cout << "n: " << n << std::endl;
        double correct = n*(n-1.0)*0.5;  // sum of 0..n-1
        double computed=0;
        for (long int i=0; i<n; i++) computed +=i;
        std::cout << "long int n, sequential. correct: " << correct << ", computed: " << computed << std::endl;
    }

    // Case 4: long int index, OpenACC parallel reduction.
    {
        const long int n = 20'000'000'000;
        std::cout << "n: " << n << std::endl;
        double correct = n*(n-1.0)*0.5;  // sum of 0..n-1
        double computed=0;
        #pragma acc parallel loop reduction(+:computed)
        for (long int i=0; i<n; i++) computed +=i;
        std::cout << "long int n, parallel. correct: " << correct << ", computed: " << computed << std::endl;
    }
}
I compiled it thus:
pgc++ -fast -Minfo=accel -mp -acc bad-acc.cc -lfmt -o bad-acc
This is the output:
n: 2000000000
int n, sequential. correct: 2e+18, computed: 2e+18
n: 2000000000
upload CUDA data file=/p73/wrf/git/parallel-research/openacc-1st/bad-acc.cc function=main line=25 device=0 threadid=1 variable=computed bytes=8
download CUDA data file=/p73/wrf/git/parallel-research/openacc-1st/bad-acc.cc function=main line=27 device=0 threadid=1 variable=computed bytes=8
int n, parallel. correct: 2e+18, computed: 2e+18
n: 20000000000
long int n, sequential. correct: 2e+20, computed: 2e+20
n: 20000000000
upload CUDA data file=/p73/wrf/git/parallel-research/openacc-1st/bad-acc.cc function=main line=44 device=0 threadid=1 variable=computed bytes=8
[2] 415351 segmentation fault (core dumped) ./bad-acc
11.75s real 11.44s user 0.12s system 98% 0,0 socket 165 mem ./bad-acc
I’m running Ubuntu 20.04. The GPU is a Quadro RTX 5000.
If this is a known limitation, are other limitations documented so I’ll know to work around them?
Thanks.