OpenACC GPU + MPI - Acceleration prevented in triple-loop C code

Hi everyone! I'm trying to accelerate a simple C subroutine with OpenACC on GPU. I've applied everything I know, and the loops themselves are not complicated at all. However, some variables are declared externally (extern), and there are a significant number of #ifdef options. The complete code is the following:

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <mpi.h>
#include <defsC.h>

extern int my_id, bbb, nsmax, *iweg, M_NsMAX_c, mini[], maxi[], minj[], maxj[], mink[], maxk[], ghost_c ;
extern int i_ind_max[], i_ind_min[], j_ind_max[], j_ind_min[], k_ind_max[], k_ind_min[];
extern double f0[100][100], mi_i[100][51], K_i[100][51], *Sc, RQW[100][100], Cpi[100][51], *chi, *Yi, *T, *p, W[100];
extern double TcritPR[], PcritPR[], TAKA[];
extern double *VISC, *COND, *DIFF, *rho, *Wmix, T_rif, *tempo;
extern double *weg ;
extern double d1_KT, b2_KT, LJij[100][100];
extern double a1_KT, a2_KT, a3_KT, a4_KT, a5_KT, a6_KT, a7_KT,
              b1_KT, b2_KT, b3_KT, b4_KT, b5_KT, b6_KT, b7_KT,
              c1_KT, c2_KT, c3_KT, c4_KT;
extern int II, JJ, KK, una_volta;
extern void var_comm_1_( int *b, double * f );
extern void var_comm_2_( int *b, double * f );

#ifdef HP
  extern void y_to_x_hp_ ( double *Wmix_ijk, double *Yi_ijk, double *x_hp);
  extern void ispure_ ( double *x_hp, int *icomp_hp);
  extern void rmix_new_ ( double *x_hp, int *icomp_hp, double *R_hp);
  extern void trnprp_new_c( double *T, double *D_hp, double *x_hp, int *icomp_hp, double *R_hp, double *VISC, double *COND, int *ierr_hp);
#endif

#define min(a,b)            (((a) < (b)) ? (a) : (b))
#define max(a,b)            (((a) > (b)) ? (a) : (b))

#ifdef TKH

  int LOCATE( double a[], int n0, int n, double x, int nmin, int nmax) {
    int jmin, jmax, jL, ju, jm;
    jmin = max(n0-1,nmin);   jmin = min(n0,jmin);
    jmax = min(n,nmax);      jmax = max(n-1,jmax);
    jL = jmin;               ju = jmax+1;
    while(ju-jL > 1){
      jm = (ju+jL)/2;
      if(((a[n] > a[1]) && (x > a[jm])) || ((a[n] < a[1]) && (x < a[jm])) )
        jL = jm;
      else
        ju=jm;
    }
    return jL;
  }



  void FINDINDEX (double x, int *i, double *frac, double a[], int n, int *iascend){

    if(n == 1)  { *i = 1;  *frac = 0.0e0; *iascend = 0; return; }
    *iascend = -1;  if( a[1]>a[0] ) *iascend = 1;
    if( (x<=a[0] && *iascend == 1)   || (x>=a[0] && *iascend == -1) ) { *i = 1;   *frac = 0.0e0; return; }
    if( (x>=a[n-1] && *iascend == 1) || (x<=a[n-1] && *iascend == -1)){ *i = n-2; *frac = 1.0e0; return; }

    *i = LOCATE( a, 1, n, x, 1, n-1);
    *frac = (x - a[*i]) / ( a[*i+1] - a[*i] );

    return;
  }

  void TKHSHC(double Tloc, double Ploc, double Xiloc[], double **FCorr){

    double w1, w2, frac, X_tot, DP_Rl, A_T, B_T, C_T, E_T, Tr_mix, Pr_mix;
    int is, iascend ;
    for (int s = 0; s<nsmax; s++)
      for (int ss = 0; ss<nsmax; ss++){
        FCorr[s][ss] = 1.0e0;
        X_tot = Xiloc[ss] + Xiloc[s];
        if( X_tot != 0.0e0 ){
          Tr_mix = (Xiloc[s] / X_tot * TcritPR[s]) + (Xiloc[ss] / X_tot * TcritPR[ss]);
          Tr_mix = Tloc / Tr_mix;
          Pr_mix = (Xiloc[s] / X_tot * PcritPR[s]) + (Xiloc[ss] / X_tot * PcritPR[ss]);
          Pr_mix = Ploc / Pr_mix;
          FINDINDEX( Pr_mix, &is, &frac, TAKA, 17, &iascend);
          w1    = 1.e0 - frac ;  w2 = frac;
          DP_Rl = w1* *(TAKA+is+17) + w2* *(TAKA+is+1+17);
          A_T   = w1* *(TAKA+is+34) + w2* *(TAKA+is+1+34);
          B_T   = w1* *(TAKA+is+51) + w2* *(TAKA+is+1+51);
          C_T   = w1* *(TAKA+is+68) + w2* *(TAKA+is+1+68);
          E_T   = w1* *(TAKA+is+85) + w2* *(TAKA+is+1+85);
          if ( Tr_mix<0.9e0 )
            FCorr[s][ss] = 1.0e0;
          else
            FCorr[s][ss] = max( 0.1e0, DP_Rl * ( 1.0e0 - A_T * pow(Tr_mix, (-B_T)) ) * (1.0e0 - C_T * pow( Tr_mix, (-E_T)) ));
        }
    }
  }

#endif

int molp (){

#ifdef HP
  double x_hp[20], Xir[nsmax], D_hp, R_hp;
  double t0_hp, rho0_hp, Tmin_mi, Tmax_mi, Dmax_mi, Pmax_mi, Tmin_k, Tmax_k, Dmax_k, Pmax_k;
  int ierr_hp, icomp_hp, ss;
  char herr_hp[255];
#ifdef TD
  int c, c_p1;
  double T_dim, DelTc;
#endif
#else
  int  c, c_p1;
  double T_dim, DelTc;
#endif
  int app_ijk_s, app_ijk;
  int ijk, s_ijk, max_k, max_j, max_i;
  int how_many_i, how_many_j, how_many_k, inizio_i, inizio_j, inizio_k, incremento;
#ifdef VW
  double d3, d4, d5, Mis[nsmax], appo1, appo2;
#endif

#ifdef CM
  double Ks[nsmax], Xir[nsmax], appo3, appo4;
#endif

#ifdef DKT
  double d2_KT;
  double appo, appo5, appo6, OMij, T_LJij, T_LJij2, T_LJij3, T_LJij4, T_LJij5;
#endif
#ifdef TKH
  double *FCorr[nsmax];
  for ( int i = 0; i < nsmax; i++ ) {
    FCorr[i] = malloc( nsmax* sizeof *FCorr[i]);
    for ( int j = 0; j < nsmax; j++ )
      FCorr[i][j] = 1.e0;
  }
#endif

#ifdef TD
  int p_c;
  double p_app, W_app,  Dtemp ;
  double *xxir = (double *) malloc(sizeof(double) * nsmax);
  double *CCp =  (double *) malloc(sizeof(double) * nsmax);
  double *d_ij = (double *) malloc(sizeof(double) * nsmax * nsmax);
#endif

  double *p_mi_i = &mi_i[0][0];
  double *p_K_i  = &K_i[0][0];
  double *p_Cpi  = &Cpi[0][0];

  max_k = (maxk[bbb]+ghost_c) - (mink[bbb]-ghost_c) + 1;
  max_j = (maxj[bbb]+ghost_c) - (minj[bbb]-ghost_c) + 1;
  max_i = (maxi[bbb]+ghost_c) - (mini[bbb]-ghost_c) + 1;

  how_many_k = (maxk[bbb]+ghost_c) - (mink[bbb]-ghost_c) + 1;
  how_many_j = (maxj[bbb]+ghost_c) - (minj[bbb]-ghost_c) + 1;
  how_many_i = (maxi[bbb]+ghost_c) - (mini[bbb]-ghost_c) + 1;

  max_k    = maxk[bbb] - mink[bbb] + 1;

  if( mink[bbb] == k_ind_min[bbb] ){
    inizio_k = 0;   max_k    = max_k + ghost_c;
  }
  else
    inizio_k = ( how_many_j * how_many_i * ghost_c );

  if( maxk[bbb] == k_ind_max[bbb] )
    max_k    = max_k + ghost_c;

  max_j    = maxj[bbb] - minj[bbb] + 1;
  if( minj[bbb] == j_ind_min[bbb] ){
    inizio_j = 0;   max_j    = max_j + ghost_c;
  }
  else
    inizio_j = ( how_many_i * ghost_c );
  if( maxj[bbb] == j_ind_max[bbb] )
    max_j    = max_j + ghost_c;

  max_i    = maxi[bbb] - mini[bbb] + 1; incremento = 2 * ghost_c;
  if( mini[bbb] == i_ind_min[bbb] ){
    inizio_i = 0;   max_i    = max_i + ghost_c; incremento = incremento - ghost_c;
  }
  else
    inizio_i =  ghost_c ;
  if( maxi[bbb] == i_ind_max[bbb] ){
    max_i    = max_i + ghost_c; incremento = incremento - ghost_c;
  }
  ijk = 0;  s_ijk = 0;


  #pragma acc data copy(mi_i,Mis,Ks,K_i,W,Xir,f0,LJij,RQW,T[0:nsmax][0:nsmax])
  #pragma acc parallel loop private(ijk,s_ijk,VISC[0:nsmax][0:nsmax],COND[0:nsmax][0:nsmax],p[0:nsmax][0:nsmax],Yi[0:nsmax][0:nsmax],Wmix[0:nsmax][0:nsmax],DIFF[0:nsmax][0:nsmax]) 
  for (int k = 0; k<max_k; k++){
    ijk = inizio_k + ( how_many_j * how_many_i ) * k + inizio_j + inizio_i;
    s_ijk = ijk * nsmax;
    for (int j = 0; j<max_j; j++){
      for (int i = 0; i<max_i; i++) {
        #ifndef HP
          T_dim = *(T+ijk) * T_rif * 0.01e0;
          c = (int)floor( T_dim );  if (c<=0 || c>50) printf(" MP proc =%5d ZONA =%5d - %d %d %d - T = %10.4e \n", my_id, bbb, i,j,k, *(T+ijk)*T_rif );
          DelTc = T_dim - (double) c;
          c_p1 = c + 1 ;
        #endif
        for (int s = 0; s< nsmax; s++){
          #ifdef VW
            Mis[s] = ( mi_i[s][c_p1] - mi_i[s][c] ) * DelTc + mi_i[s][c] ;
          #endif
          #ifdef CM
            Ks[s] = ( K_i[s][c_p1] - K_i[s][c] ) * DelTc + K_i[s][c] ;
            Xir[s] = *(Yi + s_ijk + s) / (*(Wmix + ijk) * W[s] );
          #endif
        }
        *( VISC+ijk ) = 0.0e0;
        #ifdef CM
          appo3 = 0.e0;    appo4 = 0.e0;
        #else
          *( COND+ijk ) = 0.0e0;
        #endif
        #ifdef HP         
          y_to_x_hp_( (Wmix + ijk), (Yi + s_ijk) , x_hp);
          icomp_hp=0;
          ispure_( x_hp, &icomp_hp);
          D_hp = *(rho + ijk ) * *(Wmix + ijk) / 1000.e0;
          rmix_new_( x_hp, &icomp_hp, &R_hp);
          trnprp_new_c ( T+ijk, &D_hp, x_hp, &icomp_hp, &R_hp, VISC+ijk, COND+ijk, &ierr_hp);
          *(VISC+ijk) = *(VISC+ijk) * 1.0e-6 ;
          if (*(VISC+ijk)>1.e-2)  *(VISC+ijk) = 1.0e-2;
          if (*(VISC+ijk)<0.e-0)  *(VISC+ijk) = 1.0e-4;
          if (*(COND+ijk)>1.e0)   *(COND+ijk) = 1.0e0;
          if (*(COND+ijk)<1.e-5)  *(COND+ijk) = 1.0e-5;

        #else
          for (int s = 0; s< nsmax; s++){
          #ifdef VISC_WILKE
            appo1 = Xir[s] * Mis[s];     appo2 = 0.e0;
            for (int ss = 0; ss< nsmax; ss++){
              d3 = Mis[s] / Mis[ss];
              d4 = sqrt( W[ss] / W[s]);
              d5 = d3 * d4;
              appo2 = appo2 + Xir[ss] * f0[ss][s] * (1.0e0 + d3 * d4 + 2.0e0 * sqrt( d5 ));
            }
            *( VISC+ijk ) = *( VISC+ijk ) + appo1 / appo2;
          #else
            *( VISC+ijk ) = *( VISC+ijk ) + *(Yi + s_ijk + s) * (( mi_i[s][c_p1] - mi_i[s][c] ) * DelTc + mi_i[s][c]) ;
          #endif
          #ifdef CM
            appo3 = appo3 + *(Xir+s) * *(Ks+s);
            appo4 = appo4 + *(Xir+s) / *(Ks+s);
          #else
            *(COND+ijk)  = *(COND+ijk) + *(Yi+ s_ijk + s) * (( K_i[s][c_p1] - K_i[s][c] ) * DelTc + K_i[s][c]) ;
          #endif
          }
        #endif
        #ifdef CM
          *(COND+ijk) = 0.5e0 * (appo3 + 1.e0/appo4);
        #endif

        #ifdef FIX
          *(VISC+ijk) = *(rho+ijk) * 1.57e-5;
        #endif

        #ifdef ENHANCE
          *(VISC+ijk) = 100.e0* *(VISC+ijk);
          *(COND+ijk) = 100.e0* *(COND+ijk);
        #endif

        #ifndef DKT
          for (int s = 0; s< nsmax; s++)
            *(DIFF+s+s_ijk) = *(VISC+ijk) / ( *(rho+ijk) * *(Sc+s));
        #endif

        #ifdef DKT
        #ifdef HP
          for (int s = 0; s< nsmax; s++)
            *(Xir+s) = *(Yi+s_ijk+s) / (*(Wmix+ijk) * *(W+s)) ;
          #ifdef TKH    
            TKHSHC( *(T+ijk), *(p+ijk), Xir, FCorr);
          #endif
        #endif
          d2_KT = pow( *(T+ijk), 3.0e0/2.0e0);
          appo6 = d1_KT * d2_KT / *(p+ijk);
          for (int s = 0; s< nsmax; s++)  Xir[s]= Xir[s] + 1.0e-12;

          for (int s = 0; s< nsmax; s++){
            appo = 0.0e0;        appo5= 0.0e0;
            for (int ss = 0; ss< nsmax; ss++)
              if (s != ss){
                T_LJij  = *(T+ijk) / LJij[s][ss];
                T_LJij2 = T_LJij  * T_LJij;
                T_LJij3 = T_LJij  * T_LJij2;
                T_LJij4 = T_LJij2 * T_LJij2;
                T_LJij5 = T_LJij3 * T_LJij2;
                if ( 4.5e0 < T_LJij && T_LJij <= 100.e0 )
                  OMij = b1_KT + b2_KT / T_LJij - b3_KT * T_LJij + b4_KT * T_LJij2 - b5_KT * T_LJij3 + b6_KT * T_LJij4 - b7_KT * T_LJij5;
                else if ( T_LJij <= 4.5e0)
                  OMij = a1_KT + a2_KT / T_LJij - a3_KT * T_LJij + a4_KT * T_LJij2 - a5_KT * T_LJij3 + a6_KT * T_LJij4 - a7_KT * T_LJij5;
                else
                  OMij = c1_KT + c2_KT / T_LJij - c3_KT * T_LJij + c4_KT * T_LJij3;
                #ifdef TKH
                  appo = appo + Xir[ss] / (RQW[ss][s] * FCorr[s][ss] / OMij);
                #else
                  appo = appo + Xir[ss] / (RQW[ss][s] / OMij);
                #endif
                appo5= appo5+ Xir[ss] * *(W+ss);
              }
            *(DIFF+s_ijk+s) = *(Wmix+ijk) * appo5 * appo6 / appo;
          }

          #ifdef TD
            p_app = *( p+ijk ) * 1.e1 ;  W_app = 1.0e3 / *(Wmix+ijk);
            for (int s = 0; s< nsmax; s++)
              *(xxir+s) = *(Xir+s);

            c = (int)floor( *(T+ijk) * 0.01e0 );  Dtemp = *(T+ijk) * 0.01e0 - (double) c;
            for (int s = 0; s< nsmax; s++)
              CCp[s] = (( Cpi[s][c+1] - Cpi[s][c]) * Dtemp + Cpi[s][c]) * 1.0e04 ;
            egspar_c( *(T+ijk),  xxir, *(Yi + s_ijk) , CCp, weg, iweg );
            egstd1_c(p_app, *(T+ijk), *(Yi + s_ijk) , W_app, weg, iweg, (chi + s_ijk), d_ij);
            for (int s = s_ijk; s < s_ijk + nsmax; s++)
              *(chi + s) = *(chi + s) * 1.0e-04;
          #endif
        #endif
        ijk++;  s_ijk = s_ijk + nsmax;
      }
      ijk   = ijk + incremento;
      s_ijk = ijk * nsmax;
    }
  }
#ifdef TD
  free (xxir); free(CCp); free(d_ij);
#endif
  return 0;
}

When I compile the code with:

nvc -c -acc=gpu,noautopar -target=gpu -gpu=ccall -traceback -Minfo=accel molp.c

the compiler reports the following acceleration information:

molp:
    223, Generating copy(f0[:][:],mi_i[:][:],LJij[:][:],K_i[:][:],T[:nsmax],W[:],Xir[:],Ks[:],Mis[:],RQW[:][:]) [if not already present]
         Generating NVIDIA GPU code
        228, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        231, #pragma acc loop seq
        232, #pragma acc loop seq
        240, #pragma acc loop seq
        273, #pragma acc loop seq
        277, #pragma acc loop seq
        364, #pragma acc loop seq
        366, #pragma acc loop seq
        368, #pragma acc loop seq
    231, Loop carried scalar dependence for ijk at line 416
         Complex loop carried dependence of T->,VISC->,COND->,p->,mi_i,Mis prevents parallelization
         Loop carried dependence of Mis prevents parallelization
         Loop carried dependence of Mis prevents vectorization
         Loop carried backward dependence of Mis prevents vectorization
         Complex loop carried dependence of K_i,Ks prevents parallelization
         Loop carried dependence of Ks prevents parallelization
         Loop carried dependence of Ks prevents vectorization
         Loop carried backward dependence of Ks prevents vectorization
         Complex loop carried dependence of Yi->,W,Wmix-> prevents parallelization
         Loop carried dependence of Xir prevents parallelization
         Loop carried dependence of Xir prevents vectorization
         Complex loop carried dependence of Xir prevents parallelization
         Loop carried backward dependence of Xir prevents vectorization
         Complex loop carried dependence of f0,DIFF->,LJij,RQW prevents parallelization
    232, Complex loop carried dependence of T->,VISC->,COND->,p-> prevents parallelization
         Loop carried scalar dependence for s_ijk at line 249,414,396
         Complex loop carried dependence of mi_i,Mis prevents parallelization
         Loop carried dependence of Mis prevents parallelization
         Loop carried backward dependence of Mis prevents vectorization
         Complex loop carried dependence of K_i,Ks prevents parallelization
         Loop carried dependence of Ks prevents parallelization
         Loop carried backward dependence of Ks prevents vectorization
         Complex loop carried dependence of Yi->,W,Wmix-> prevents parallelization
         Loop carried dependence of Xir prevents parallelization
         Complex loop carried dependence of Xir prevents parallelization
         Loop carried backward dependence of Xir prevents vectorization
         Complex loop carried dependence of f0,DIFF->,LJij,RQW prevents parallelization
    240, Complex loop carried dependence of Mis,Ks,Yi->,Wmix->,Xir prevents parallelization
    273, Complex loop carried dependence of Mis,Xir,VISC->,Ks,W,f0 prevents parallelization
    277, Loop is parallelizable
    364, Loop is parallelizable
    366, Complex loop carried dependence of W,DIFF->,RQW,Xir,Wmix->,LJij,T-> prevents parallelization
    368, Loop is parallelizable

I've tried changing some variables from copy to private and vice versa, but the complex loop-carried dependence problem persists. I solved a similar problem in a Fortran code using pointers, and I think the root cause here is the same; unfortunately, the same solution doesn't seem to work. Could the issue be related to the externally declared variables, or to the ifdef options?

Hi Matteo,

The code is getting parallelized on the GPU, as shown in the message:

228, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

The other messages explain why the compiler can't auto-parallelize the inner loops. They can usually be ignored if you aren't planning to parallelize the inner loops, though in this case they do reveal an issue that will prevent you from getting correct results. Let's look at a few and explain what they mean.

Loop carried scalar dependence for ijk at line 416

"ijk" is being used as a counter to create an offset into several global pointers. Its initial value is set prior to the inner loops, and it is then incremented within the loops. Since the previous iteration's value must be computed before the current one, this creates a dependency and the loops can't be parallelized.

The typical fix is to compute ijk inside the loops from both the inner and outer loop indices rather than using it as a counter (see the sketch below).

In some cases an atomic capture can be used, but atomics only guarantee that each loop iteration gets a unique value for ijk, not the order in which the values are assigned, i.e., the value of ijk will not be linear in the loop indices.
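For illustration only, here is a minimal sketch of the capture pattern. "my_ijk" is a hypothetical local name, and ijk would also have to be made shared (e.g. via a copy clause) for the atomic to act on a single counter:

#pragma acc parallel loop copy(ijk)
for (int k = 0; k < max_k; k++){
  int my_ijk;
  /* atomically read the shared counter and increment it */
  #pragma acc atomic capture
  { my_ijk = ijk; ijk = ijk + 1; }
  /* my_ijk is unique per iteration, but NOT ordered with k */
}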

The same issue occurs with s_ijk.
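Putting that together, here is a minimal sketch of the index-from-indices approach, derived from the increments in your posted loops (untested, since I can't compile the file; the per-row advance of max_i + incremento is my reading of your ijk updates):

#pragma acc parallel loop collapse(3) private(Mis,Ks,Xir)
for (int k = 0; k < max_k; k++){
  for (int j = 0; j < max_j; j++){
    for (int i = 0; i < max_i; i++){
      /* recover the counters from (k,j,i); in the original code
         each j row advances ijk by max_i + incremento */
      int ijk   = inizio_k + (how_many_j * how_many_i) * k
                + inizio_j + inizio_i
                + j * (max_i + incremento) + i;
      int s_ijk = ijk * nsmax;
      /* ... loop body unchanged ... */
    }
  }
}

Computed this way, ijk and s_ijk are ordinary locals, so they are automatically private, the loop-carried dependence disappears, and collapsing all three loops becomes an option.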

Complex loop carried dependence of T->,VISC->,COND->,p->,mi_i,Mis prevents parallelization

There are a few issues here. First, the index is computed at runtime from a variable rather than from a loop index, so the compiler can't tell whether the computed indices are the same. If they were the same, that would prevent parallelization; since the compiler can't prove they don't overlap, it must assume they do and hence can't auto-parallelize the loops.

If you add a "loop" directive on the inner loops, you tell the compiler to go ahead and parallelize them, but you are also asserting that there are no dependencies. If the indices do overlap, this becomes a race condition and an error in your code, which can be fixed using an atomic update.
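For example, if you parallelized the s loop that sums into VISC (the non-WILKE branch), the shared accumulation would need an atomic; a sketch, with "contrib" a hypothetical local:

#pragma acc loop vector
for (int s = 0; s < nsmax; s++){
  double contrib = *(Yi + s_ijk + s)
                 * (( mi_i[s][c_p1] - mi_i[s][c] ) * DelTc + mi_i[s][c]);
  /* serialize concurrent updates to the shared cell */
  #pragma acc atomic update
  *(VISC+ijk) = *(VISC+ijk) + contrib;
}

A reduction into a local scalar (reduction(+:...)) would typically be faster than atomics here.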

Loop carried dependence of Mis prevents parallelization

This one will cause your code to produce incorrect results. Mis needs to be private, otherwise every thread will use the same memory and overwrite each other's values. The same issue applies to Ks.

Complex loop carried dependence of K_i,Ks prevents parallelization

I'm not sure about K_i, but given it's a global variable that other pointers may reference (p_K_i does take its address), it's likely an aliasing problem. The compiler can't tell whether multiple pointers point to the same memory location, which could cause a race condition.

You can use the "restrict" keyword or the flag "-Msafeptr" to assert that no aliasing occurs, but that would be incorrect here given the code does use aliasing (such as p_K_i).
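For reference, the restrict assertion looks like this ("visc_r" and "diff_r" are hypothetical local names; again, this would be wrong for K_i while p_K_i exists):

/* promise the compiler these are the only pointers into this memory */
double * restrict visc_r = VISC;
double * restrict diff_r = DIFF;
visc_r[ijk] = 0.0e0;   /* accesses through visc_r may now be parallelized freely */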

The only fix here would be to explicitly add the loop directive on the inner loops to force parallelization.

The remaining messages indicate similar issues with the other variables.

The bottom line is that you probably don't want to parallelize the inner loops, but you do want to privatize Mis, Ks, and Xir (plus CCp when TD is defined).

If you don't want to see these extra messages, adding "loop seq" to the inner loops should get rid of them, since the compiler won't attempt to auto-parallelize them.

I just noticed that you're privatizing VISC, COND, p, Wmix, and DIFF. This seems incorrect: they are global arrays, you presumably need the results after the kernel, and each thread only updates a single element. With private, each thread gets a full private copy of the array, wasting memory, and the values can't be used after the kernel ends.

Given I don't have access to your full code, nor can I compile this file, I can't be sure I'm correct, but the following is the direction I'd recommend:

#pragma acc data copy(T[0:nsmax][0:nsmax], VISC[0:nsmax][0:nsmax], COND[0:nsmax][0:nsmax], \
                      p[0:nsmax][0:nsmax], Yi[0:nsmax][0:nsmax], Wmix[0:nsmax][0:nsmax],  \
                      DIFF[0:nsmax][0:nsmax], K_i, W, f0, LJij, RQW)
#pragma acc parallel loop private(Mis,Ks,Xir)
  for (int k = 0; k<max_k; k++){
    ijk = inizio_k + ( how_many_j * how_many_i ) * k + inizio_j + inizio_i;
    s_ijk = ijk * nsmax;
#pragma acc loop seq
    for (int j = 0; j<max_j; j++){
#pragma acc loop seq
      for (int i = 0; i<max_i; i++) {
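One more note: when TD is defined, you'd extend the private clause with the TD work arrays as well, e.g. private(Mis,Ks,Xir,CCp[0:nsmax]); since CCp is a malloc'd pointer, it needs the shape for the array itself to be privatized.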

-Mat

Hi Mat,

I am only responding now because I had not worked on this topic for several months but have now resumed. Thank you very much for your advice, which I always find very useful.

Kind regards,
Matteo
