Should I be concerned if, when I compile it, I don’t get the same messages — i.e., I’m not notified of the reductions in the loops?
The loop at line 155 is getting a “gang vector” schedule and the inner loops aren’t getting scheduled at all, hence no reductions are needed. The question is why the compiler ignores the schedule you provided. It’s because of this restriction: “Accelerator restriction: size of the GPU copy of ‘rr’ is unknown” — the compiler can’t safely parallelize loops over an array whose size it doesn’t know.
The main difference between what you have and what I did is that I was using the C compiler while you’re using C++. C++ doesn’t have true VLAs, so “rr[N][N][3]” gets turned into a pointer to a pointer to a fixed-size array, while in C it becomes a true variable-length array. Fixed-size arrays have a known size; pointers don’t. To fix this, either compile as C or linearize rr (see below).
source/n_body.cpp", line 65: warning: invalid text in pragma
#pragma acc routine
Sorry, I didn’t realize you were using C++. C++ support for “routine” will be added in 14.3 along with several other C++ improvements (like accessing class member variables in compute regions and data clauses, and single dimension VLAs).
Here’s the linearized “rr” version of the example:
% cat test_fixed.cpp
// Flattened (i,j,k) -> linear index into rr. Every argument is parenthesized
// so compound arguments such as IDX3D(j, i-1, k) expand correctly (the
// original macro would mis-expand them).
#define IDX3D(i,j,k) (((i)*(N)*3)+((j)*3)+(k))

// Right-hand side of an N-body integrator -- linearized-rr compiler repro.
//
//   r, v    - input positions / velocities: N rows of 3 components each
//   ro, vo  - output derivative arrays:     N rows of 3 components each
//   mass    - per-body masses, length N
//   eta     - per-body coefficients, length N (unused in this stripped-down
//             repro: the rij()/A3() helpers that consumed it are commented out)
//   N       - number of bodies
//
// NOTE(review): this is a compiler repro, not working physics. The helper
// calls (rij, norm3, A3) are stubbed out, so temp/temp3/rp are given benign
// placeholder values below; the original left them uninitialized, which is
// undefined behavior when they are read.
void rhs(double r[][3], double v[][3], double ro[][3], double vo[][3],
         double *mass, double *eta, int N)
{
    double vo0, vo1, vo2;
#ifdef PGI_144
    // PGI 14.4+ C++ accepts a single-dimension VLA.
    double rr[N*N*3], rp[3], temp, temp3;
#else
    double *rr, rp[3], temp, temp3;
    rr = new double[N*N*3];
#endif
    // Placeholders standing in for the commented-out helpers:
    //   rij(r, mass, eta, i, j, rp);  temp = norm3(rp);
    temp  = 1.0;
    temp3 = 1.0;
    rp[0] = rp[1] = rp[2] = 0.0;
    // The second kernel reads rr entries (j >= i) that the first kernel never
    // writes; zero the whole buffer so those reads are well defined.
    for (int m = 0; m < N*N*3; m++) rr[m] = 0.0;

#pragma acc data copyin(N, mass[:N]) create(rr[0:N*N*3])
    {
#pragma acc kernels
        {
#pragma acc loop independent //for private(rp[:3])
            for (int j = 0; j < N; j++) {
#pragma acc loop independent
                for (int i = 0; i < N; i++) {
                    if (i >= j) continue;  // only the upper triangle is filled
                    /// rij(r, mass, eta, i, j, rp);
                    /// temp = norm3(rp);
                    // Bug fix: the original stored all three components into
                    // IDX3D(i,j,0); the y and z slots were never written.
                    rr[IDX3D(i,j,0)] = (1/temp)*rp[0];
                    rr[IDX3D(i,j,1)] = (1/temp)*rp[1];
                    rr[IDX3D(i,j,2)] = (1/temp)*rp[2];
                }
            }
        }
        /// A3(ro[0], v[0]);
        // Only vo[0] is cleared here; vo[1..N-1] are accumulated into below,
        // so the caller is presumably expected to have initialized them.
        // TODO(review): confirm against the caller.
        vo[0][0] = 0; vo[0][1] = 0; vo[0][2] = 0;
#pragma acc kernels loop independent copy(vo[0:N][0:3], ro[0:N][0:3])
        for (int i = 0; i < N; i++) {
            ro[i][0] = v[i][0];  // A3(ro[i], v[i]);
            vo0 = (temp*mass[0])*rr[IDX3D(0,i,0)];
            ro[i][1] = v[i][1];
            vo1 = (temp*mass[0])*rr[IDX3D(0,i,1)];
            ro[i][2] = v[i][2];
            vo2 = (temp*mass[0])*rr[IDX3D(0,i,2)];
#pragma acc loop reduction(+:vo0,vo1,vo2) independent
            for (int j = 1; j < N; j++) {
                if (j > i-1) continue;  // only j <= i-1 contributes
                vo0 += (temp*mass[j])*rr[IDX3D(j,i,0)];
                vo1 += (temp*mass[j])*rr[IDX3D(j,i,1)];
                vo2 += (temp*mass[j])*rr[IDX3D(j,i,2)];
            }
#pragma acc loop reduction(+:vo0,vo1,vo2) independent
            for (int j = i+1; j <= N-1; j++) {
                // NOTE(review): the original trailing comment said rr[i][j]
                // but the code reads rr[j][i] -- verify which the physics
                // intends.
                vo0 += -(mass[j])*(rr[IDX3D(j,i,0)]);
                vo1 += -(mass[j])*(rr[IDX3D(j,i,1)]);
                vo2 += -(mass[j])*(rr[IDX3D(j,i,2)]);
            }
#pragma acc loop reduction(+:vo0,vo1,vo2) independent
            for (int j = 0; j <= i-1; j++) {
#pragma acc loop reduction(+:vo0,vo1,vo2) independent
                for (int k = i+1; k <= N-1; k++) {
                    // NOTE(review): original comment said rr[j][k]; the code
                    // reads rr[j][i] (k only selects mass[k]) -- confirm.
                    vo0 += (mass[k]*mass[j]*temp3)*(rr[IDX3D(j,i,0)]);
                    vo1 += (mass[k]*mass[j]*temp3)*(rr[IDX3D(j,i,1)]);
                    vo2 += (mass[k]*mass[j]*temp3)*(rr[IDX3D(j,i,2)]);
                }
            }
            vo[i][0] += vo0;
            vo[i][1] += vo1;
            vo[i][2] += vo2;
        }
    }
#ifndef PGI_144
    delete[] rr;  // was "delete rr": memory from new[] must go to delete[]
#endif
}
% pgcpp -acc -Minfo -V14.3 test_fixed.cpp -Msafeptr -c -w
rhs(double (*)[3], double (*)[3], double (*)[3], double (*)[3], double *, double *, int):
14, Generating copyin(N)
Generating copyin(mass[:N])
Generating create(rr[:(N*N)*3])
16, Generating present_or_copyin(rp[:])
Generating NVIDIA code
18, Loop is parallelizable
20, Loop is parallelizable
Accelerator kernel generated
18, #pragma acc loop gang /* blockIdx.y */
20, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
35, Generating copy(vo[:N][:])
Generating copy(ro[:N][:])
Generating present_or_copyin(v[:N][:])
Generating NVIDIA code
39, Loop is parallelizable
Accelerator kernel generated
39, #pragma acc loop gang /* blockIdx.x */
48, #pragma acc loop vector(128) /* threadIdx.x */
Sum reduction generated for vo0
Sum reduction generated for vo1
Sum reduction generated for vo2
55, #pragma acc loop vector(128) /* threadIdx.x */
Sum reduction generated for vo0
Sum reduction generated for vo1
Sum reduction generated for vo2
61, Sum reduction generated for vo0
Sum reduction generated for vo1
Sum reduction generated for vo2
63, #pragma acc loop vector(128) /* threadIdx.x */
Sum reduction generated for vo0
Sum reduction generated for vo1
Sum reduction generated for vo2
48, Loop is parallelizable
55, Loop is parallelizable
61, Loop is parallelizable
63, Loop is parallelizable