The code is as follows,
double ftkv[4], f_h[Q];
int i, j, k, n;
double F;
double ftk, rhot, ut0, ut1, uu, eu;
#pragma acc data present(f_star, f_xbud, f_xblr, L_dh, e, W, vn, dsx, dsy)
#pragma acc kernels
#pragma acc loop independent
for (i = 1; i <= NX; ++i)
{
#pragma acc loop independent
for (j = 1; j <= NY; ++j)
{
rhot = 0;
ut0 = 0;
ut1 = 0;
#pragma acc loop private(ftkv, F, n, ftk, k)
for (k = 0; k < Q; ++k)
{
ftkv[0]= f_xblr[k][i ][j - 1];
ftkv[2]= f_xblr[k][i - 1][j - 1];
ftkv[1]= f_xbud[k][i - 1][j ];
ftkv[3]= f_xbud[k][i - 1][j - 1];
//compute f_(n+1/2)
f_h[k] = 0.25 * (ftkv[0] + ftkv[1] + ftkv[2] + ftkv[3]);
F = 0;
n = 0;
f += (e[k][0] * vn[n][0] + e[k][1] * vn[n][1]) * ftkv[n] * dsy[j];
n = 1;
F += (e[k][0] * vn[n][0] + e[k][1] * vn[n][1]) * ftkv[n] * dsx[i];
n = 2;
F += (e[k][0] * vn[n][0] + e[k][1] * vn[n][1]) * ftkv[n] * dsy[j];
n = 3;
F += (e[k][0] * vn[n][0] + e[k][1] * vn[n][1]) * ftkv[n] * dsx[i];
L_dh[k][i][j] = -st / (dsx[i] * dsy[j]) * F;
ftk = f_h[k];
rhot += ftk;
ut0 += e[k][0] * ftk;
ut1 += e[k][1] * ftk;
}
uu = ut0 * ut0 + ut1 * ut1;
#pragma acc loop private(eu, k)
for (k = 0; k < Q; ++k)
{
eu = e[k][0] * ut0 + e[k][1] * ut1;
L_dh[k][i][j] += -st / tau * f_h[k];
}
}
}
where the outer loop i,j represent every mesh point, “f_h[Q]” and “ftkv[4]” are the local temporary arrays for each point (i,j). But the compiler said:
742, Generating present(f_xbud[:][:][:],f_xblr[:][:][:],L_dt[:][:][:],e[:][:],vn[:][:],dsx[:],dsy[:])
Loop is parallelizable
Generating copyout(f_h[:])
745, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
742, #pragma acc loop gang, vector(2) /* blockIdx.y threadIdx.y */
745, #pragma acc loop gang, vector(64) /* blockIdx.x threadIdx.x */
748, Loop is parallelizable
779, Loop is parallelizable
791, Complex loop carried dependence of ->,L_dt prevents parallelization
Inner sequential loop scheduled on accelerator
see that “Generating copyout(f_h[:])”, I think the compiler treat the “f_h[Q]” as a global array, and the computational results are wrong. How can I define “f_h[Q]” as a local array for each (i,j)?
Besides, Q is small which is less than 10. So I want to tell the compiler that the loop k is no need to parallelise, just run as sequential. I have tried the clause “loop seq” but it seems no use in “kernels”. What can I do if I want to force a loop run as sequential?
Thank you!