Thanks for your last feedback I am accelerating the below function part of a solver, It’s hard to extract it to a minimal example, but I analyzed every part of it the problem is with the last part of it the fluxes rhou_vis_flux[I1D(i,j,k)], rhov_vis_flux[I1D(i,j,k)], rhow_vis_flux[I1D(i,j,k)] and rhoE_vis_flux[I1D(i,j,k)].
void FlowSolverRHEA::calculateViscousFluxes() {
/// Second-order central finite differences for derivatives:
/// P. Moin.
/// Fundamentals of engineering numerical analysis.
/// Cambridge University Press, 2010.
/// Inner points: rho_vis_flux, rhou_vis_flux, rhov_vis_flux, rhow_vis_flux and rhoE_vis_flux
const int local_size_x = _lNx_;
const int local_size_y = _lNy_;
const int local_size_z = _lNz_;
const int local_size = local_size_x*local_size_y*local_size_z;
double delta_x, delta_y, delta_z;
double d_u_x, d_u_y, d_u_z, d_v_x, d_v_y, d_v_z, d_w_x, d_w_y, d_w_z;
double d_T_x, d_T_y, d_T_z;
double d_mu_x, d_mu_y, d_mu_z, d_kappa_x, d_kappa_y, d_kappa_z;
double div_uvw, tau_xx, tau_xy, tau_xz, tau_yy, tau_yz, tau_zz;
double div_tau_x, div_tau_y, div_tau_z;
double div_q, div_uvw_tau;
int inix = topo->iter_common[_INNER_][_INIX_];
int iniy = topo->iter_common[_INNER_][_INIY_];
int iniz = topo->iter_common[_INNER_][_INIZ_];
int endx = topo->iter_common[_INNER_][_ENDX_];
int endy = topo->iter_common[_INNER_][_ENDY_];
int endz = topo->iter_common[_INNER_][_ENDZ_];
#pragma acc kernels
for(int i = inix; i <= endx; i++) {
for(int j = iniy; j <= endy; j++) {
for(int k = iniz; k <= endz; k++) {
/// Geometric stuff
delta_x = delta_x_field[I1D(i,j,k)];
delta_y = delta_y_field[I1D(i,j,k)];
delta_z = delta_z_field[I1D(i,j,k)];
/// Velocity derivatives
d_u_x = ( u_field[I1D(i+1,j,k)] - u_field[I1D(i-1,j,k)] )/( 2.0delta_x );
d_u_y = ( u_field[I1D(i,j+1,k)] - u_field[I1D(i,j-1,k)] )/( 2.0delta_y );
d_u_z = ( u_field[I1D(i,j,k+1)] - u_field[I1D(i,j,k-1)] )/( 2.0delta_z );
d_v_x = ( v_field[I1D(i+1,j,k)] - v_field[I1D(i-1,j,k)] )/( 2.0delta_x );
d_v_y = ( v_field[I1D(i,j+1,k)] - v_field[I1D(i,j-1,k)] )/( 2.0delta_y );
d_v_z = ( v_field[I1D(i,j,k+1)] - v_field[I1D(i,j,k-1)] )/( 2.0delta_z );
d_w_x = ( w_field[I1D(i+1,j,k)] - w_field[I1D(i-1,j,k)] )/( 2.0delta_x );
d_w_y = ( w_field[I1D(i,j+1,k)] - w_field[I1D(i,j-1,k)] )/( 2.0delta_y );
d_w_z = ( w_field[I1D(i,j,k+1)] - w_field[I1D(i,j,k-1)] )/( 2.0delta_z );
/// Temperature derivatives
d_T_x = ( T_field[I1D(i+1,j,k)] - T_field[I1D(i-1,j,k)] )/( 2.0delta_x );
d_T_y = ( T_field[I1D(i,j+1,k)] - T_field[I1D(i,j-1,k)] )/( 2.0delta_y );
d_T_z = ( T_field[I1D(i,j,k+1)] - T_field[I1D(i,j,k-1)] )/( 2.0delta_z );
/// Transport coefficients derivatives
d_mu_x = ( mu_field[I1D(i+1,j,k)] - mu_field[I1D(i-1,j,k)] )/( 2.0delta_x );
d_mu_y = ( mu_field[I1D(i,j+1,k)] - mu_field[I1D(i,j-1,k)] )/( 2.0delta_y );
d_mu_z = ( mu_field[I1D(i,j,k+1)] - mu_field[I1D(i,j,k-1)] )/( 2.0delta_z );
d_kappa_x = ( kappa_field[I1D(i+1,j,k)] - kappa_field[I1D(i-1,j,k)] )/( 2.0delta_x );
d_kappa_y = ( kappa_field[I1D(i,j+1,k)] - kappa_field[I1D(i,j-1,k)] )/( 2.0delta_y );
d_kappa_z = ( kappa_field[I1D(i,j,k+1)] - kappa_field[I1D(i,j,k-1)] )/( 2.0delta_z );
/// Divergence of velocity
div_uvw = d_u_x + d_v_y + d_w_z;
/// Viscous stresses ( symmetric tensor )
tau_xx = 2.0mu_field[I1D(i,j,k)]( d_u_x - ( div_uvw/3.0 ) );
tau_xy = mu_field[I1D(i,j,k)]( d_u_y + d_v_x );
tau_xz = mu_field[I1D(i,j,k)]( d_u_z + d_w_x );
tau_yy = 2.0mu_field[I1D(i,j,k)]( d_v_y - ( div_uvw/3.0 ) );
tau_yz = mu_field[I1D(i,j,k)]( d_v_z + d_w_y );
tau_zz = 2.0mu_field[I1D(i,j,k)]*( d_w_z - ( div_uvw/3.0 ) );
/// Divergence of viscous stresses
div_tau_x = mu_field[I1D(i,j,k)]( ( 1.00/delta_x )( ( u_field[I1D(i+1,j,k)] - u_field[I1D(i,j,k)] )/( mesh->x[i+1] - mesh->x[i] )
- ( u_field[I1D(i,j,k)] - u_field[I1D(i-1,j,k)] )/( mesh->x[i] - mesh->x[i-1] ) )
+ ( 1.00/delta_y )( ( u_field[I1D(i,j+1,k)] - u_field[I1D(i,j,k)] )/( mesh->y[j+1] - mesh->y[j] )
- ( u_field[I1D(i,j,k)] - u_field[I1D(i,j-1,k)] )/( mesh->y[j] - mesh->y[j-1] ) )
+ ( 1.00/delta_z )( ( u_field[I1D(i,j,k+1)] - u_field[I1D(i,j,k)] )/( mesh->z[k+1] - mesh->z[k] )
- ( u_field[I1D(i,j,k)] - u_field[I1D(i,j,k-1)] )/( mesh->z[k] - mesh->z[k-1] ) ) )
+ ( 1.0/3.0 )mu_field[I1D(i,j,k)]( ( 1.00/delta_x )( ( u_field[I1D(i+1,j,k)] - u_field[I1D(i,j,k)] )/( mesh->x[i+1] - mesh->x[i] )
- ( u_field[I1D(i,j,k)] - u_field[I1D(i-1,j,k)] )/( mesh->x[i] - mesh->x[i-1] ) )
+ ( 0.25/delta_x )( ( v_field[I1D(i+1,j+1,k)] - v_field[I1D(i+1,j-1,k)] )/delta_y
- ( v_field[I1D(i-1,j+1,k)] - v_field[I1D(i-1,j-1,k)] )/delta_y )
+ ( 0.25/delta_x )( ( w_field[I1D(i+1,j,k+1)] - w_field[I1D(i+1,j,k-1)] )/delta_z
- ( w_field[I1D(i-1,j,k+1)] - w_field[I1D(i-1,j,k-1)] )/delta_z ) )
+ d_mu_xtau_xx + d_mu_ytau_xy + d_mu_ztau_xz;
div_tau_y = mu_field[I1D(i,j,k)]( ( 1.00/delta_x )( ( v_field[I1D(i+1,j,k)] - v_field[I1D(i,j,k)] )/( mesh->x[i+1] - mesh->x[i] )
- ( v_field[I1D(i,j,k)] - v_field[I1D(i-1,j,k)] )/( mesh->x[i] - mesh->x[i-1] ) )
+ ( 1.00/delta_y )( ( v_field[I1D(i,j+1,k)] - v_field[I1D(i,j,k)] )/( mesh->y[j+1] - mesh->y[j] )
- ( v_field[I1D(i,j,k)] - v_field[I1D(i,j-1,k)] )/( mesh->y[j] - mesh->y[j-1] ) )
+ ( 1.00/delta_z )( ( v_field[I1D(i,j,k+1)] - v_field[I1D(i,j,k)] )/( mesh->z[k+1] - mesh->z[k] )
- ( v_field[I1D(i,j,k)] - v_field[I1D(i,j,k-1)] )/( mesh->z[k] - mesh->z[k-1] ) ) )
+ ( 1.0/3.0 )mu_field[I1D(i,j,k)]( ( 0.25/delta_y )( ( u_field[I1D(i+1,j+1,k)] - u_field[I1D(i-1,j+1,k)] )/delta_x
- ( u_field[I1D(i+1,j-1,k)] - u_field[I1D(i-1,j-1,k)] )/delta_x )
+ ( 1.00/delta_y )( ( v_field[I1D(i,j+1,k)] - v_field[I1D(i,j,k)] )/( mesh->y[j+1] - mesh->y[j] )
- ( v_field[I1D(i,j,k)] - v_field[I1D(i,j-1,k)] )/( mesh->y[j] - mesh->y[j-1] ) )
+ ( 0.25/delta_y )( ( w_field[I1D(i,j+1,k+1)] - w_field[I1D(i,j+1,k-1)] )/delta_z
- ( w_field[I1D(i,j-1,k+1)] - w_field[I1D(i,j-1,k-1)] )/delta_z ) )
+ d_mu_xtau_xy + d_mu_ytau_yy + d_mu_ztau_yz;
div_tau_z = mu_field[I1D(i,j,k)]( ( 1.00/delta_x )( ( w_field[I1D(i+1,j,k)] - w_field[I1D(i,j,k)] )/( mesh->x[i+1] - mesh->x[i] )
- ( w_field[I1D(i,j,k)] - w_field[I1D(i-1,j,k)] )/( mesh->x[i] - mesh->x[i-1] ) )
+ ( 1.00/delta_y )( ( w_field[I1D(i,j+1,k)] - w_field[I1D(i,j,k)] )/( mesh->y[j+1] - mesh->y[j] )
- ( w_field[I1D(i,j,k)] - w_field[I1D(i,j-1,k)] )/( mesh->y[j] - mesh->y[j-1] ) )
+ ( 1.00/delta_z )( ( w_field[I1D(i,j,k+1)] - w_field[I1D(i,j,k)] )/( mesh->z[k+1] - mesh->z[k] )
- ( w_field[I1D(i,j,k)] - w_field[I1D(i,j,k-1)] )/( mesh->z[k] - mesh->z[k-1] ) ) )
+ ( 1.0/3.0 )mu_field[I1D(i,j,k)]( ( 0.25/delta_z )( ( u_field[I1D(i+1,j,k+1)] - u_field[I1D(i-1,j,k+1)] )/delta_x
- ( u_field[I1D(i+1,j,k-1)] - u_field[I1D(i-1,j,k-1)] )/delta_x )
+ ( 0.25/delta_z )( ( v_field[I1D(i,j+1,k+1)] - v_field[I1D(i,j-1,k+1)] )/delta_y
- ( v_field[I1D(i,j+1,k-1)] - v_field[I1D(i,j-1,k-1)] )/delta_y )
+ ( 1.00/delta_z )( ( w_field[I1D(i,j,k+1)] - w_field[I1D(i,j,k)] )/( mesh->z[k+1] - mesh->z[k] )
- ( w_field[I1D(i,j,k)] - w_field[I1D(i,j,k-1)] )/( mesh->z[k] - mesh->z[k-1] ) ) )
+ d_mu_xtau_xz + d_mu_ytau_yz + d_mu_ztau_zz;
/// Fourier term
div_q = ( -1.0 )kappa_field[I1D(i,j,k)]( ( 1.0/delta_x )( ( T_field[I1D(i+1,j,k)] - T_field[I1D(i,j,k)] )/( mesh->x[i+1] - mesh->x[i] )
- ( T_field[I1D(i,j,k)] - T_field[I1D(i-1,j,k)] )/( mesh->x[i] - mesh->x[i-1] ) )
+ ( 1.0/delta_y )( ( T_field[I1D(i,j+1,k)] - T_field[I1D(i,j,k)] )/( mesh->y[j+1] - mesh->y[j] )
- ( T_field[I1D(i,j,k)] - T_field[I1D(i,j-1,k)] )/( mesh->y[j] - mesh->y[j-1] ) )
+ ( 1.0/delta_z )( ( T_field[I1D(i,j,k+1)] - T_field[I1D(i,j,k)] )/( mesh->z[k+1] - mesh->z[k] )
- ( T_field[I1D(i,j,k)] - T_field[I1D(i,j,k-1)] )/( mesh->z[k] - mesh->z[k-1] ) ) )
- d_kappa_xd_T_x - d_kappa_yd_T_y - d_kappa_zd_T_z;
/// Work of viscous stresses
div_uvw_tau = u_field[I1D(i,j,k)]div_tau_x + v_field[I1D(i,j,k)]div_tau_y + w_field[I1D(i,j,k)]div_tau_z
+ tau_xxd_u_x + tau_xyd_u_y + tau_xzd_u_z
+ tau_xyd_v_x + tau_yyd_v_y + tau_yzd_v_z
+ tau_xzd_w_x + tau_yzd_w_y + tau_zzd_w_z;
/// Viscous fluxes
rhou_vis_flux[I1D(i,j,k)] = div_tau_x;
rhov_vis_flux[I1D(i,j,k)] = div_tau_y;
rhow_vis_flux[I1D(i,j,k)] = div_tau_z;
rhoE_vis_flux[I1D(i,j,k)] = ( -1.0 )*div_q + div_uvw_tau;
}
}
}
/// Update halo values
//rhou_vis_flux.update();
//rhov_vis_flux.update();
//rhow_vis_flux.update();
//rhoE_vis_flux.update();
};
All the arrays in this function are arrays of objects and all of them can be copied in and out without problems and the function runs properly except for the fluxes at the end as I mentioned in the beginning when I uncomment them and run I got below message
FlowSolverRHEA::calculateViscousFluxes():
1927, Generating implicit copyin(this[:]) [if not already present]
1934, Complex loop carried dependence of lNy,lNz,mesh,mesh->x,mesh->x->,mesh->z,mesh->z->,mesh->y-> prevents parallelization
Accelerator serial kernel generated
Generating Tesla code
1934, #pragma acc loop seq
1935, #pragma acc loop seq
1936, #pragma acc loop seq
1935, Complex loop carried dependence of lNz,lNy,mesh->x->,mesh->x,mesh->z->,mesh->y->,mesh->y,mesh->z prevents parallelization
1936, Complex loop carried dependence of lNz,lNy,mesh->x->,mesh->x,mesh->z->,mesh->y->,mesh->y,mesh->z prevents parallelization
Then when executing I receive below error:
RHEA (v1.1.0): START SIMULATION
Time iteration 10: time = 6.42379e-05 [s], time-step = 6.42238e-06 [s], wall-clock time = 4.25751e-03 [h]
Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
Do you have an Idea why this could be happening only with these arrays ( rhou_vis_flux[I1D(i,j,k)], rhov_vis_flux[I1D(i,j,k)], rhow_vis_flux[I1D(i,j,k)] and rhoE_vis_flux[I1D(i,j,k)]) noting that all the other arrays in the function are the same but only these that generate this problem ?
Thanks
Ahmed