Dear Mat,
I have a code with 3levels of loop. I tried to use openACC to accelerate the outside loop as the attached.
#pragma acc kernels copy(l[:N*N],u[:N*N]) copyin(a[:N*N]) local(sum)
for(i=0; i<n-1; i++)
{
for(j=0; j<n; j++)
{
if(j>i)
{
for(k=0,sum=0; k<n; k++)
{
if(k != i)
{
sum += l[j][k]*u[k][i];
}
}
l[j][i] = (float)((a[j][i]-sum)/u[i][i]);
}
}
for(j=0; j<n; j++)
{
if(j>i)
{
for(k=0,sum=0; k<n; k++)
{
if(k != i+1)
{
sum += l[i+1][k]*u[k][j];
}
}
u[i+1][j] = (float)((a[i+1][j]-sum));
}
}
}
but i found the result is not the same with CPU code. and I also try to accelerate the inner loop, but failed.
Can you give me some suggestions?