Memory Access Pattern

I profiled the two kernels below and found that the second one performs much worse (more than 10 times slower) than the first. I suspect this has something to do with the way the code reads global memory. What causes the performance difference?

// Flattened offset i + j*pitch.
// Every argument is parenthesised so that expression arguments (e.g. `a & 1`
// or `c ? p : q`) keep their meaning after macro expansion.
// __mul24 multiplies only the least significant 24 bits of each operand, so
// j and pitch must fit in 24 bits for the product to be correct.
#define INDEX(i,j,pitch) ((i)+__mul24((j),(pitch)))

// One thread per element: writes
//     x[e] = alpha*(a + b) + (1 - alpha)*u[e],   e = thidy + thidx*pitch
// where thidy comes from the x launch dimension and thidx from the y launch
// dimension. The two terms a and b are:
//     thidy == 0     : bu[thidx]           and u[thidy + (thidx-1)*pitch]
//     thidy == n-1   : bd[thidx]           and u[thidy + (thidx+1)*pitch]
//     otherwise      : u[thidy + (thidx+1)*pitch] and u[thidy + (thidx-1)*pitch]
//
// Parameters:
//     x      output buffer
//     u      input buffer, indexed the same way as x
//     bu     values used in place of one neighbour when thidy == 0
//     bd     values used in place of one neighbour when thidy == n-1
//     n      number of valid thidy indices (0 .. n-1)
//     pitch  multiplier applied to thidx when flattening the index
//     alpha  blend weight
//
// Threads consecutive in threadIdx.x have consecutive thidy, so within one
// thidx their accesses to x and u touch adjacent elements.
//
// NOTE(review): the boundary test is on thidy while the neighbours are taken
// along thidx; thidx-1 wraps for thidx == 0 (thidx is unsigned) and thidx has
// no upper bound check here. Confirm that the launch configuration and buffer
// layout make these accesses valid.
__global__ void rhssetup2(float*x, float*u,float*bu,float*bd,int n,int pitch,float alpha)
{
	unsigned int thidy=INDEX(threadIdx.x,blockIdx.x,blockDim.x);
	unsigned int thidx=INDEX(threadIdx.y,blockIdx.y,blockDim.y);

	// Grid-tail guard: thidy is only meaningful for 0 .. n-1.
	if(thidy>=(unsigned int)n)
		return;

	const unsigned int centre=INDEX(thidy,thidx,pitch);
	float neighbours;

	if(thidy==0)
	{
		neighbours=bu[thidx]+u[INDEX(thidy,thidx-1,pitch)];
	}
	else if(thidy==(n-1))
	{
		// Indexed by thidx like bu above; bd[thidy] would always be bd[n-1] here.
		neighbours=bd[thidx]+u[INDEX(thidy,thidx+1,pitch)];
	}
	else
	{
		neighbours=u[INDEX(thidy,thidx+1,pitch)]+u[INDEX(thidy,thidx-1,pitch)];
	}

	x[centre]=alpha*neighbours+(1-alpha)*u[centre];
}

// One thread per element: writes
//     x[thidy + thidx*pitch] = alpha*(a + b) + (1 - alpha)*u[thidx + thidy*pitch]
// where thidy comes from the x launch dimension and thidx from the y launch
// dimension. Note that u is indexed with thidx and thidy swapped relative to x.
// The two terms a and b are:
//     thidy == 0     : bu[thidx]           and u[(thidx-1) + thidy*pitch]
//     thidy == n-1   : bd[thidx]           and u[(thidx+1) + thidy*pitch]
//     otherwise      : u[(thidx+1) + thidy*pitch] and u[(thidx-1) + thidy*pitch]
//
// Parameters:
//     x      output buffer
//     u      input buffer, read with the two indices swapped
//     bu     values used in place of one neighbour when thidy == 0
//     bd     values used in place of one neighbour when thidy == n-1
//     n      number of valid thidy indices (0 .. n-1)
//     pitch  multiplier applied to the second index when flattening
//     alpha  blend weight
//
// Memory access: threads consecutive in threadIdx.x have consecutive thidy, so
// their writes to x are adjacent but their reads of u are `pitch` elements
// apart. This strided read is the likely reason this kernel profiles much
// slower than rhssetup2, whose u reads are adjacent across threadIdx.x.
//
// NOTE(review): thidx-1 wraps for thidx == 0 (thidx is unsigned) and thidx has
// no upper bound check here. Confirm that the launch configuration and buffer
// layout make these accesses valid.
__global__ void rhssetup3(float*x, float*u,float*bu,float*bd,int n,int pitch,float alpha)
{
	unsigned int thidy=INDEX(threadIdx.x,blockIdx.x,blockDim.x);
	unsigned int thidx=INDEX(threadIdx.y,blockIdx.y,blockDim.y);

	// Grid-tail guard: thidy is only meaningful for 0 .. n-1.
	if(thidy>=(unsigned int)n)
		return;

	const float centre=u[INDEX(thidx,thidy,pitch)];
	float neighbours;

	if(thidy==0)
	{
		neighbours=bu[thidx]+u[INDEX(thidx-1,thidy,pitch)];
	}
	else if(thidy==(n-1))
	{
		neighbours=bd[thidx]+u[INDEX(thidx+1,thidy,pitch)];
	}
	else
	{
		neighbours=u[INDEX(thidx+1,thidy,pitch)]+u[INDEX(thidx-1,thidy,pitch)];
	}

	x[INDEX(thidy,thidx,pitch)]=alpha*neighbours+(1-alpha)*centre;
}