Hi
I have an SOR Poisson solver kernel that I want to try the cache directive on. However, I am a little unclear about how and where to use it, and I was wondering if anyone could give me some suggestions.
I am using the deviceptr clause because data is allocated using acc_malloc, and the mask value represents obstacles in the volume.
I am also wondering what the optimal gang/vector configuration would be when working on a 3D grid such as 128x32x128 (x,y,z). By default, the compiler partitions the two inner loops (x,z) across gangs and the innermost loop (x) across vectors.
Thanks for any suggestions.
/*
 * Flattens (X, Y, Z) into a linear array offset: X varies fastest, then Z
 * (stride dim.x), then Y (stride dim.z*dim.x).
 * The expansion reads a variable named `dim` from whatever scope the macro is
 * used in, so it only works where such a variable is visible.
 */
#define I(X,Y,Z) ((X) + (Z)*dim.x + (Y)*dim.z*dim.x)
/*
 * Performs one weighted relaxation sweep over the interior of the grid,
 * reading the previous iterate from p0 and writing the new one to p.
 *
 * p, p0, b, obs and poisson_tab are passed to the OpenACC kernels region via
 * deviceptr, i.e. they are expected to already be device addresses.
 *
 *   p            output field; only interior cells (1 .. dim-2 on every axis)
 *                are written
 *   p0           input field from the previous sweep
 *   b            right-hand side, subtracted from the neighbour sum
 *   obs          per-cell flag word; only its low 7 bits are used
 *   poisson_tab  scale factor looked up with (flags >> 1)
 *   dim          grid extents (x, y, z)
 *   w            relaxation weight: result = w * update + (1 - w) * p0
 *
 * A cell is updated only when at least one of flag bits 1..6 is clear and the
 * VOX_SELF bit is clear; every other interior cell is set to 0.
 * A neighbour whose VOX_* bit is set is multiplied by 0 rather than skipped,
 * so all six neighbours are always read.
 * NOTE(review): the VOX_* bits presumably mark obstacle faces -- confirm
 * against their definition.
 */
void solve_poisson(float *p, float *p0, float *b, int *obs, float *poisson_tab, dim_3 dim, float w) {
#pragma acc kernels deviceptr(p,p0,obs,b,poisson_tab) copyin(w)
    {
#pragma acc loop independent
        for (int iy = 1; iy < dim.y-1; ++iy) {
#pragma acc loop independent
            for (int iz = 1; iz < dim.z-1; ++iz) {
#pragma acc loop independent
                for (int ix = 1; ix < dim.x-1; ++ix) {
                    const int flags = obs[I(ix,iy,iz)] & 127;

                    /* Value stored for cells that are not relaxed. */
                    float updated = 0.0f;

                    if ((flags & 126) != 126 && !(flags & VOX_SELF)) {
                        /* Neighbour sum; a set VOX_* bit turns the factor into 0.0f. */
                        float acc = 0.0f;
                        acc += p0[I(ix-1,iy,iz)] * (float)!(flags & VOX_LEFT);
                        acc += p0[I(ix+1,iy,iz)] * (float)!(flags & VOX_RIGHT);
                        acc += p0[I(ix,iy-1,iz)] * (float)!(flags & VOX_BELOW);
                        acc += p0[I(ix,iy+1,iz)] * (float)!(flags & VOX_ABOVE);
                        acc += p0[I(ix,iy,iz-1)] * (float)!(flags & VOX_UP);
                        acc += p0[I(ix,iy,iz+1)] * (float)!(flags & VOX_DOWN);

                        acc -= b[I(ix,iy,iz)];
                        acc *= poisson_tab[flags>>1];

                        /* Blend the fresh estimate with the previous value. */
                        acc *= w;
                        acc += p0[I(ix,iy,iz)]*(1.0f - w);

                        updated = acc;
                    }

                    p[I(ix,iy,iz)] = updated;
                }
            }
        }
    }
}