Hi everyone! I’m dealing with the acceleration of a simple subroutine in C with OpenACC GPU. I implemented all the things I know about and also the loops are not complicated at all. Nevertheless some variables are called externally and there are a significant amount of ifdef options. The complete code is the following:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <mpi.h>
#include <defsC.h>
extern int my_id, bbb, nsmax, *iweg, M_NsMAX_c, mini[], maxi[], minj[], maxj[], mink[], maxk[], ghost_c ;
extern int i_ind_max[], i_ind_min[], j_ind_max[], j_ind_min[], k_ind_max[], k_ind_min[];
extern double f0[100][100], mi_i[100][51], K_i[100][51], *Sc, RQW[100][100], Cpi[100][51], *chi, *Yi, *T, *p, W[100];
extern double TcritPR[], PcritPR[], TAKA[];
extern double *VISC, *COND, *DIFF, *rho, *Wmix, T_rif, *tempo;
extern double *weg ;
extern double d1_KT, b2_KT, LJij[100][100];
extern double a1_KT, a2_KT, a3_KT, a4_KT, a5_KT, a6_KT, a7_KT,
b1_KT, b2_KT, b3_KT, b4_KT, b5_KT, b6_KT, b7_KT,
c1_KT, c2_KT, c3_KT, c4_KT;
extern int II, JJ, KK, una_volta;
extern void var_comm_1_( int *b, double * f );
extern void var_comm_2_( int *b, double * f );
#ifdef HP
extern void y_to_x_hp_ ( double *Wmix_ijk, double *Yi_ijk, double *x_hp);
extern void ispure_ ( double *x_hp, int *icomp_hp);
extern void rmix_new_ ( double *x_hp, int *icomp_hp, double *R_hp);
extern void trnprp_new_c( double *T, double *D_hp, double *x_hp, int *icomp_hp, double *R_hp, double *VISC, double *COND, int *ierr_hp);
#endif
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))
#ifdef TKH
int LOCATE( double a[], int n0, int n, double x, int nmin, int nmax) {
int jmin, jmax, jL, ju, jm;
jmin = max(n0-1,nmin); jmin = min(n0,jmin);
jmax = min(n,nmax); jmax = max(n-1,jmax);
jL = jmin; ju = jmax+1;
while(ju-jL > 1){
jm = (ju+jL)/2;
if(((a[n] > a[1]) && (x > a[jm])) || ((a[n] < a[1]) && (x < a[jm])) )
jL = jm;
else
ju=jm;
}
return jL;
}
void FINDINDEX (double x, int *i, double *frac, double a[], int n, int *iascend){
if(n == 1) { *i = 1; *frac = 0.0e0; *iascend = 0; return; }
*iascend = -1; if( a[1]>a[0] ) *iascend = 1;
if( (x<=a[0] && *iascend == 1) || (x>=a[0] && *iascend == -1) ) { *i = 1; *frac = 0.0e0; return; }
if( (x>=a[n-1] && *iascend == 1) || (x<=a[n-1] && *iascend == -1)){ *i = n-2; *frac = 1.0e0; return; }
*i = LOCATE( a, 1, n, x, 1, n-1);
*frac = (x - a[*i]) / ( a[*i+1] - a[*i] );
return;
}
void TKHSHC(double Tloc, double Ploc, double Xiloc[], double **FCorr){
double w1, w2, frac, X_tot, DP_Rl, A_T, B_T, C_T, E_T, Tr_mix, Pr_mix;
int is, iascend ;
for (int s = 0; s<nsmax; s++)
for (int ss = 0; ss<nsmax; ss++){
FCorr[s][ss] = 1.0e0;
X_tot = Xiloc[ss] + Xiloc[s];
if( X_tot != 0.0e0 ){
Tr_mix = (Xiloc[s] / X_tot * TcritPR[s]) + (Xiloc[ss] / X_tot * TcritPR[ss]);
Tr_mix = Tloc / Tr_mix;
Pr_mix = (Xiloc[s] / X_tot * PcritPR[s]) + (Xiloc[ss] / X_tot * PcritPR[ss]);
Pr_mix = Ploc / Pr_mix;
FINDINDEX( Pr_mix, &is, &frac, TAKA, 17, &iascend);
w1 = 1.e0 - frac ; w2 = frac;
DP_Rl = w1* *(TAKA+is+17) + w2* *(TAKA+is+1+17);
A_T = w1* *(TAKA+is+34) + w2* *(TAKA+is+1+34);
B_T = w1* *(TAKA+is+51) + w2* *(TAKA+is+1+51);
C_T = w1* *(TAKA+is+68) + w2* *(TAKA+is+1+68);
E_T = w1* *(TAKA+is+85) + w2* *(TAKA+is+1+85);
if ( Tr_mix<0.9e0 )
FCorr[s][ss] = 1.0e0;
else
FCorr[s][ss] = max( 0.1e0, DP_Rl * ( 1.0e0 - A_T * pow(Tr_mix, (-B_T)) ) * (1.0e0 - C_T * pow( Tr_mix, (-E_T)) ));
}
}
}
#endif
int molp (){
#ifdef HP
double x_hp[20], Xir[nsmax], D_hp, R_hp;
double t0_hp, rho0_hp, Tmin_mi, Tmax_mi, Dmax_mi, Pmax_mi, Tmin_k, Tmax_k, Dmax_k, Pmax_k;
int ierr_hp, icomp_hp, ss;
char herr_hp[255];
#ifdef TD
int c, c_p1;
double T_dim, DelTc;
#endif
#else
int c, c_p1;
double T_dim, DelTc;
#endif
int app_ijk_s, app_ijk;
int ijk, s_ijk, max_k, max_j, max_i;
int how_many_i, how_many_j, how_many_k, inizio_i, inizio_j, inizio_k, incremento;
#ifdef VW
double d3, d4, d5, Mis[nsmax], appo1, appo2;
#endif
#ifdef CM
double Ks[nsmax], Xir[nsmax], appo3, appo4;
#endif
#ifdef DKT
double d2_KT;
double appo, appo5, appo6, OMij, T_LJij, T_LJij2, T_LJij3, T_LJij4, T_LJij5;
#endif
#ifdef TKH
double *FCorr[nsmax];
for ( int i = 0; i < nsmax; i++ ) {
FCorr[i] = malloc( nsmax* sizeof *FCorr[i]);
for ( int j = 0; j < nsmax; j++ )
FCorr[i][j] = 1.e0;
}
#endif
#ifdef TD
int p_c;
double p_app, W_app, Dtemp ;
double *xxir = (double *) malloc(sizeof(double) * nsmax);
double *CCp = (double *) malloc(sizeof(double) * nsmax);
double *d_ij = (double *) malloc(sizeof(double) * nsmax * nsmax);
#endif
double *p_mi_i = &mi_i[0][0];
double *p_K_i = &K_i[0][0];
double *p_Cpi = &Cpi[0][0];
max_k = (maxk[bbb]+ghost_c) - (mink[bbb]-ghost_c) + 1;
max_j = (maxj[bbb]+ghost_c) - (minj[bbb]-ghost_c) + 1;
max_i = (maxi[bbb]+ghost_c) - (mini[bbb]-ghost_c) + 1;
how_many_k = (maxk[bbb]+ghost_c) - (mink[bbb]-ghost_c) + 1;
how_many_j = (maxj[bbb]+ghost_c) - (minj[bbb]-ghost_c) + 1;
how_many_i = (maxi[bbb]+ghost_c) - (mini[bbb]-ghost_c) + 1;
max_k = maxk[bbb] - mink[bbb] + 1;
if( mink[bbb] == k_ind_min[bbb] ){
inizio_k = 0; max_k = max_k + ghost_c;
}
else
inizio_k = ( how_many_j * how_many_i * ghost_c );
if( maxk[bbb] == k_ind_max[bbb] )
max_k = max_k + ghost_c;
max_j = maxj[bbb] - minj[bbb] + 1;
if( minj[bbb] == j_ind_min[bbb] ){
inizio_j = 0; max_j = max_j + ghost_c;
}
else
inizio_j = ( how_many_i * ghost_c );
if( maxj[bbb] == j_ind_max[bbb] )
max_j = max_j + ghost_c;
max_i = maxi[bbb] - mini[bbb] + 1; incremento = 2 * ghost_c;
if( mini[bbb] == i_ind_min[bbb] ){
inizio_i = 0; max_i = max_i + ghost_c; incremento = incremento - ghost_c;
}
else
inizio_i = ghost_c ;
if( maxi[bbb] == i_ind_max[bbb] ){
max_i = max_i + ghost_c; incremento = incremento - ghost_c;
}
ijk = 0; s_ijk = 0;
#pragma acc data copy(mi_i,Mis,Ks,K_i,W,Xir,f0,LJij,RQW,T[0:nsmax][0:nsmax])
#pragma acc parallel loop private(ijk,s_ijk,VISC[0:nsmax][0:nsmax],COND[0:nsmax][0:nsmax],p[0:nsmax][0:nsmax],Yi[0:nsmax][0:nsmax],Wmix[0:nsmax][0:nsmax],DIFF[0:nsmax][0:nsmax])
for (int k = 0; k<max_k; k++){
ijk = inizio_k + ( how_many_j * how_many_i ) * k + inizio_j + inizio_i;
s_ijk = ijk * nsmax;
for (int j = 0; j<max_j; j++){
for (int i = 0; i<max_i; i++) {
#ifndef HP
T_dim = *(T+ijk) * T_rif * 0.01e0;
c = (int)floor( T_dim ); if (c<=0 || c>50) printf(" MP proc =%5d ZONA =%5d - %d %d %d - T = %10.4e \n", my_id, bbb, i,j,k, *(T+ijk)*T_rif );
DelTc = T_dim - (double) c;
c_p1 = c + 1 ;
#endif
for (int s = 0; s< nsmax; s++){
#ifdef VW
Mis[s] = ( mi_i[s][c_p1] - mi_i[s][c] ) * DelTc + mi_i[s][c] ;
#endif
#ifdef CM
Ks[s] = ( K_i[s][c_p1] - K_i[s][c] ) * DelTc + K_i[s][c] ;
Xir[s] = *(Yi + s_ijk + s) / (*(Wmix + ijk) * W[s] );
#endif
}
*( VISC+ijk ) = 0.0e0;
#ifdef CM
appo3 = 0.e0; appo4 = 0.e0;
#else
*( COND+ijk ) = 0.0e0;
#endif
#ifdef HP
y_to_x_hp_( (Wmix + ijk), (Yi + s_ijk) , x_hp);
icomp_hp=0;
ispure_( x_hp, &icomp_hp);
D_hp = *(rho + ijk ) * *(Wmix + ijk) / 1000.e0;
rmix_new_( x_hp, &icomp_hp, &R_hp);
trnprp_new_c ( T+ijk, &D_hp, x_hp, &icomp_hp, &R_hp, VISC+ijk, COND+ijk, &ierr_hp);
*(VISC+ijk) = *(VISC+ijk) * 1.0e-6 ;
if (*(VISC+ijk)>1.e-2) *(VISC+ijk) = 1.0e-2;
if (*(VISC+ijk)<0.e-0) *(VISC+ijk) = 1.0e-4;
if (*(COND+ijk)>1.e0) *(COND+ijk) = 1.0e0;
if (*(COND+ijk)<1.e-5) *(COND+ijk) = 1.0e-5;
#else
for (int s = 0; s< nsmax; s++){
#ifdef VISC_WILKE
appo1 = Xir[s] * Mis[s]; appo2 = 0.e0;
for (int ss = 0; ss< nsmax; ss++){
d3 = Mis[s] / Mis[ss];
d4 = sqrt( W[ss] / W[s]);
d5 = d3 * d4;
appo2 = appo2 + Xir[ss] * f0[ss][s] * (1.0e0 + d3 * d4 + 2.0e0 * sqrt( d5 ));
}
*( VISC+ijk ) = *( VISC+ijk ) + appo1 / appo2;
#else
*( VISC+ijk ) = *( VISC+ijk ) + *(Yi + s_ijk + s) * (( mi_i[s][c_p1] - mi_i[s][c] ) * DelTc + mi_i[s][c]) ;
#endif
#ifdef CM
appo3 = appo3 + *(Xir+s) * *(Ks+s);
appo4 = appo4 + *(Xir+s) / *(Ks+s);
#else
*(COND+ijk) = *(COND+ijk) + *(Yi+ s_ijk + s) * (( K_i[s][c_p1] - K_i[s][c] ) * DelTc + K_i[s][c]) ;
#endif
}
#endif
#ifdef CM
*(COND+ijk) = 0.5e0 * (appo3 + 1.e0/appo4);
#endif
#ifdef FIX
*(VISC+ijk) = *(rho+ijk) * 1.57e-5;
#endif
#ifdef ENHANCE
*(VISC+ijk) = 100.e0* *(VISC+ijk)
*(COND+ijk) = 100.e0* *(COND+ijk)
#endif
#ifndef DKT
for (int s = 0; s< nsmax; s++)
*(DIFF+s+s_ijk) = *(VISC+ijk) / ( *(rho+ijk) * *(Sc+s));
#endif
#ifdef DKT
#ifdef HP
for (int s = 0; s< nsmax; s++)
*(Xir+s) = *(Yi+s_ijk+s) / (*(Wmix+ijk) * *(W+s)) ;
#ifdef TKH
TKHSHC(*(T+ijk) ,*(p+ijk) , &Xir, &FCorr);
#endif
#endif
d2_KT = pow( *(T+ijk), 3.0e0/2.0e0);
appo6 = d1_KT * d2_KT / *(p+ijk);
for (int s = 0; s< nsmax; s++) Xir[s]= Xir[s] + 1.0e-12;
for (int s = 0; s< nsmax; s++){
appo = 0.0e0; appo5= 0.0e0;
for (int ss = 0; ss< nsmax; ss++)
if (s != ss){
T_LJij = *(T+ijk) / LJij[s][ss];
T_LJij2 = T_LJij * T_LJij;
T_LJij3 = T_LJij * T_LJij2;
T_LJij4 = T_LJij2 * T_LJij2;
T_LJij5 = T_LJij3 * T_LJij2;
if ( 4.5e0 < T_LJij && T_LJij <= 100.e0 )
OMij = b1_KT + b2_KT / T_LJij - b3_KT * T_LJij + b4_KT * T_LJij2 - b5_KT * T_LJij3 + b6_KT * T_LJij4 - b7_KT * T_LJij5;
else if ( T_LJij <= 4.5e0)
OMij = a1_KT + a2_KT / T_LJij - a3_KT * T_LJij + a4_KT * T_LJij2 - a5_KT * T_LJij3 + a6_KT * T_LJij4 - a7_KT * T_LJij5;
else
OMij = c1_KT + c2_KT / T_LJij - c3_KT * T_LJij + c4_KT * T_LJij3;
#ifdef TKH
appo = appo + Xir[ss] / (RQW[ss][s] * FCorr[s][ss] / OMij);
#else
appo = appo + Xir[ss] / (RQW[ss][s] / OMij);
#endif
appo5= appo5+ Xir[ss] * *(W+ss);
}
*(DIFF+s_ijk+s) = *(Wmix+ijk) * appo5 * appo6 / appo;
}
#ifdef TD
p_app = *( p+ijk ) * 1.e1 ; W_app = 1.0e3 / *(Wmix+ijk);
for (int s = 0; s< nsmax; s++)
*(xxir+s) = *(Xir+s);
c = (int)floor( *(T+ijk) * 0.01e0 ); Dtemp = *(T+ijk) * 0.01e0 - (double) c;
for (int s = 0; s< nsmax; s++)
CCp[s] = (( Cpi[s][c+1] - Cpi[s][c]) * Dtemp + Cpi[s][c]) * 1.0e04 ;
egspar_c( *(T+ijk), xxir, *(Yi + s_ijk) , CCp, weg, iweg );
egstd1_c(p_app, *(T+ijk), *(Yi + s_ijk) , W_app, weg, iweg, (chi + s_ijk), d_ij);
for (int s = s_ijk; s < s_ijk + nsmax; s++)
*(chi + s) = *(chi + s) * 1.0e-04;
#endif
#endif
ijk++; s_ijk = s_ijk + nsmax;
}
ijk = ijk + incremento;
s_ijk = ijk * nsmax;
}
}
#ifdef therm_diff
free (xxir); free(CCp); free(d_ij);
#endif
}
When I compile the code with:
nvc -c -acc=gpu,noautopar -target=gpu -gpu=ccall -traceback -Minfo=accel molp.c
the following acceleration information arise:
molp:
223, Generating copy(f0[:][:],mi_i[:][:],LJij[:][:],K_i[:][:],T[:nsmax],W[:],Xir[:],Ks[:],Mis[:],RQW[:][:]) [if not already present]
Generating NVIDIA GPU code
228, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
231, #pragma acc loop seq
232, #pragma acc loop seq
240, #pragma acc loop seq
273, #pragma acc loop seq
277, #pragma acc loop seq
364, #pragma acc loop seq
366, #pragma acc loop seq
368, #pragma acc loop seq
231, Loop carried scalar dependence for ijk at line 416
Complex loop carried dependence of T->,VISC->,COND->,p->,mi_i,Mis prevents parallelization
Loop carried dependence of Mis prevents parallelization
Loop carried dependence of Mis prevents vectorization
Loop carried backward dependence of Mis prevents vectorization
Complex loop carried dependence of K_i,Ks prevents parallelization
Loop carried dependence of Ks prevents parallelization
Loop carried dependence of Ks prevents vectorization
Loop carried backward dependence of Ks prevents vectorization
Complex loop carried dependence of Yi->,W,Wmix-> prevents parallelization
Loop carried dependence of Xir prevents parallelization
Loop carried dependence of Xir prevents vectorization
Complex loop carried dependence of Xir prevents parallelization
Loop carried backward dependence of Xir prevents vectorization
Complex loop carried dependence of f0,DIFF->,LJij,RQW prevents parallelization
232, Complex loop carried dependence of T->,VISC->,COND->,p-> prevents parallelization
Loop carried scalar dependence for s_ijk at line 249,414,396
Complex loop carried dependence of mi_i,Mis prevents parallelization
Loop carried dependence of Mis prevents parallelization
Loop carried backward dependence of Mis prevents vectorization
Complex loop carried dependence of K_i,Ks prevents parallelization
Loop carried dependence of Ks prevents parallelization
Loop carried backward dependence of Ks prevents vectorization
Complex loop carried dependence of Yi->,W,Wmix-> prevents parallelization
Loop carried dependence of Xir prevents parallelization
Complex loop carried dependence of Xir prevents parallelization
Loop carried backward dependence of Xir prevents vectorization
Complex loop carried dependence of f0,DIFF->,LJij,RQW prevents parallelization
240, Complex loop carried dependence of Mis,Ks,Yi->,Wmix->,Xir prevents parallelization
273, Complex loop carried dependence of Mis,Xir,VISC->,Ks,W,f0 prevents parallelization
277, Loop is parallelizable
364, Loop is parallelizable
366, Complex loop carried dependence of W,DIFF->,RQW,Xir,Wmix->,LJij,T-> prevents parallelization
368, Loop is parallelizable
I’ve tried to change some variable from copy to private and viceversa, however there is always the problem of complex loop carried dependence. I solved this problem in a similar fortran code with pointer and I think the problem here is the same. Unfortunately, the solution seems not to be the same. Maybe there is something wrong with the variables that are called from outside or something related to ifdef?