I am doing an LU decomposition and trying to improve its performance, but I have run into a problem that I don't understand.
I replaced the first line, for(int ty=1; ty<=dim-1; ty++), with if(ty>0 && ty<=dim-1), but I got a totally different answer.
Could somebody help me with that?
Thank you ^_^
Code 1
// LU factorization sweep with a unit diagonal in U.
// For each ty from 1 to dim-1, in order:
//   1. entries L[tx+ty*dim] are computed for tx >= ty,
//   2. U[ty+ty*dim] is set to 1,
//   3. entries U[ty+tx*dim] are computed for tx > ty.
// tx, dim, temp, A, L and U are defined outside this fragment.
// NOTE(review): tx is presumably the per-thread index (e.g. threadIdx.x), and
// the ty = 0 column of L / row of U is presumably filled in before this loop --
// confirm against the surrounding kernel.
for(int ty=1; ty<=dim-1; ty++) {
    // Step 1: L entry for this tx, using values from steps k < ty.
    if(tx>ty-1 && tx<=dim-1)
    {
        temp=0.0;
        for(int k=0;k<=ty-1;k++)
        {
            temp=temp+L[tx+k*dim]*U[k+ty*dim];
        }
        L[tx+ty*dim]=A[tx+ty*dim]-temp;
    }
    // Barrier outside any conditional: L[ty+ty*dim] (the pivot) must be
    // written before step 3 divides by it.
    __syncthreads();
    U[ty+ty*dim]=1.0;
    // Step 3: U entry for this tx, using values from steps k < ty and the pivot.
    if(tx>ty && tx<=dim-1)
    {
        temp=0.0;
        for(int k=0; k<=ty-1;k++)
        {
            temp=temp+L[ty+k*dim]*U[k+tx*dim];
        }
        U[ty+tx*dim]=(A[ty+tx*dim]-temp)/L[ty+ty*dim];
    }
    // This barrier used to sit inside the if above, where only some threads
    // would reach it. It now runs unconditionally at the end of the step, so
    // the U values written here are complete before the next ty step reads
    // U[k+ty*dim].
    __syncthreads();
}
Code 2
// Single LU step for one value of ty (no loop): computes L[tx+ty*dim] for
// tx >= ty, sets U[ty+ty*dim] to 1, then computes U[ty+tx*dim] for tx > ty.
// ty, tx, dim, temp, A, L and U are defined outside this fragment.
// NOTE(review): if ty is a thread index (e.g. threadIdx.y), all ty steps run
// concurrently. The sums below read L[..+k*dim] and U[k+..] for k < ty, i.e.
// values produced by steps with a smaller ty, so those inputs may not have
// been written yet. That ordering dependency is the likely reason the results
// differ from a sequential loop over ty -- confirm.
if(ty>0 && ty<=dim-1) {
if(tx>ty-1 && tx<=dim-1)
{
temp=0.0;
// Accumulates products of entries from earlier steps (k < ty).
for(int k=0;k<=ty-1;k++)
{
temp=temp+L[tx+k*dim]*U[k+ty*dim];
}
L[tx+ty*dim]=A[tx+ty*dim]-temp;
}
// NOTE(review): this barrier is inside the if(ty...) guard; if that condition
// is not the same for every thread of the block, not all threads reach it.
__syncthreads();
U[ty+ty*dim]=1.0;
if(tx>ty && tx<=dim-1)
{
temp=0.0;
for(int k=0; k<=ty-1;k++)
{
temp=temp+L[ty+k*dim]*U[k+tx*dim];
}
// NOTE(review): barrier inside a condition on tx; only threads with
// ty < tx <= dim-1 reach it -- confirm this is intended.
__syncthreads();
// Divides by the pivot L[ty+ty*dim] written in the first part of this step.
U[ty+tx*dim]=(A[ty+tx*dim]-temp)/L[ty+ty*dim];
}
}