LU decomposition: can you tell me the difference between my two versions of the code?

I am implementing an LU decomposition and trying to improve its performance. I have run into a problem whose cause I don't understand.

I replaced the first line, for(int ty=1; ty<=dim-1; ty++), with if(ty>0 && ty<=dim-1), but I get a totally different answer.

Could somebody else help me with that?

Thank you ^_^

code 1

// Sequential sweep over the pivot index ty; each iteration fills one column of L
// (entries tx >= ty) and one row of U (entries tx > ty), with U's diagonal set to 1.
// Assumes tx is this thread's index within the block and that L, U, A hold dim*dim
// values addressed as [row + col*dim] -- TODO confirm against the enclosing kernel.
// The loop starts at ty=1, so column 0 of L and row 0 of U are presumably
// initialised before this fragment -- verify.
for(int ty=1; ty<=dim-1; ty++) {

   // Phase 1: L(tx,ty) = A(tx,ty) - sum_{k<ty} L(tx,k) * U(k,ty)
   if(tx>ty-1 && tx<=dim-1)
      {
      temp=0.0;
      for(int k=0;k<=ty-1;k++)
         {
         temp=temp+L[tx+k*dim]*U[k+ty*dim];
         }
      L[tx+ty*dim]=A[tx+ty*dim]-temp;
      }

   // Every thread must see column ty of L (in particular the pivot L(ty,ty))
   // before phase 2 divides by it.
   __syncthreads();

   // All threads store the same value here, so the duplicate writes are harmless.
   U[ty+ty*dim]=1.0;

   // Phase 2: U(ty,tx) = (A(ty,tx) - sum_{k<ty} L(ty,k) * U(k,tx)) / L(ty,ty)
   if(tx>ty && tx<=dim-1)
      {
      temp=0.0;
      for(int k=0; k<=ty-1;k++)
         {
         temp=temp+L[ty+k*dim]*U[k+tx*dim];
         }
      U[ty+tx*dim]=(A[ty+tx*dim]-temp)/L[ty+ty*dim];
      }

   // Barrier moved out of the tx-dependent branch: __syncthreads() must be reached
   // by every thread of the block. Placing it after the U write also makes row ty
   // of U visible to the other threads before the next iteration's phase 1 reads it.
   __syncthreads();
   }

Code2

// NOTE(review): unlike a `for` over ty, this `if` executes the body once, for the
// single value `ty` holds here. `ty` and `tx` are not defined in this fragment --
// presumably they are per-thread indices (e.g. threadIdx.y / threadIdx.x); confirm.
// The sums below read L and U entries at indices k < ty, i.e. entries this same
// fragment writes under smaller values of ty. Nothing here orders those writes
// before these reads, which would explain results that differ from the loop version.
if(ty>0 && ty<=dim-1) {

   // Phase 1: L[tx+ty*dim] = A[tx+ty*dim] - sum over k in [0, ty-1] of
   // L[tx+k*dim] * U[k+ty*dim], for tx in [ty, dim-1].
   if(tx>ty-1 && tx<=dim-1)

      {

      temp=0.0;

      for(int k=0;k<=ty-1;k++)

         {

         temp=temp+L[tx+k*dim]*U[k+ty*dim];

         }

      L[tx+ty*dim]=A[tx+ty*dim]-temp;

      }

      // NOTE(review): this barrier is inside the `if` on ty; if ty differs between
      // threads of a block, not every thread reaches it -- confirm this is intended.
      __syncthreads();

// Diagonal entry of U for this ty is set to 1.
U[ty+ty*dim]=1.0;

      // Phase 2: U[ty+tx*dim] = (A[ty+tx*dim] - sum over k in [0, ty-1] of
      // L[ty+k*dim] * U[k+tx*dim]) / L[ty+ty*dim], for tx in [ty+1, dim-1].
      if(tx>ty && tx<=dim-1)

         {

         temp=0.0;

         for(int k=0; k<=ty-1;k++)

            {

            temp=temp+L[ty+k*dim]*U[k+tx*dim];

            }

       		// NOTE(review): barrier inside a branch that depends on tx; only threads
       		// with tx > ty reach it. Presumably tx is per-thread, in which case the
       		// condition is not uniform across the block -- verify.
       		__syncthreads();     

         U[ty+tx*dim]=(A[ty+tx*dim]-temp)/L[ty+ty*dim];

         }            

   }

Check out:
http://www.noctua-blog.co.nf/index.php/2011/04/21/lu-matrix-decomposition-in-parallel-with-cuda/

http://blog-noctua.rhcloud.com/index.php/2011/04/21/lu-matrix-decomposition-in-parallel-with-cuda/