I profiled the two kernels below and found that the second one performs much worse (more than 10 times slower) than the first. I suspect this has something to do with the way the code reads global memory. What causes the performance difference?
// Flattens a 2-D coordinate into a linear element offset: i + j * pitch.
// Every argument is parenthesised so that expression arguments (for example a
// conditional expression passed as i) expand as a unit.
// A plain multiply is used instead of __mul24: __mul24 multiplies only the low
// 24 bits of each operand, so it silently produces wrong offsets for larger
// values, and the CUDA programming guide advises against it on current GPUs.
// For operands that fit in 24 bits the result is unchanged.
#define INDEX(i,j,pitch) ((i) + (j) * (pitch))
/*
 * rhssetup2 - one thread per element. Each thread writes
 *
 *     x[c] = alpha * (a + b) + (1 - alpha) * u[c],   c = INDEX(thidy, thidx, pitch)
 *
 * where the pair (a, b) depends on thidy:
 *     thidy == 0      : bu[thidx]  and  u at row thidx-1
 *     thidy == n - 1  : bd[thidx]  and  u at row thidx+1
 *     otherwise       : u at row thidx+1  and  u at row thidx-1
 *
 * Parameters
 *     x      output array, written at c.
 *     u      input array, read at c and at the rows thidx-1 / thidx+1.
 *     bu,bd  indexed by thidx; presumably the two boundary vectors
 *            (TODO confirm meaning and required length with the caller).
 *     n      thidy == n-1 selects the bd branch; threads with thidy >= n do nothing.
 *     pitch  row stride of x and u, used as an ELEMENT count (it scales a
 *            float index), not a byte count.
 *     alpha  blend weight.
 *
 * Launch layout: thidy is built from the x launch dimension and thidx from the
 * y launch dimension (note the names are crossed). Because thidy is the
 * unit-stride term of every x/u index, threads with consecutive threadIdx.x
 * touch consecutive addresses.
 *
 * NOTE(review): thidx is unsigned, so thidx-1 wraps when thidx == 0 and the
 * resulting u index lies far outside any plausible allocation. Likewise the
 * row thidx+1 is read without an upper bound, because the row count is not a
 * parameter. Confirm how the first and last rows are meant to be handled.
 */
__global__ void rhssetup2(float*x, float*u,float*bu,float*bd,int n,int pitch,float alpha)
{
    const unsigned int thidy = INDEX(threadIdx.x, blockIdx.x, blockDim.x);
    const unsigned int thidx = INDEX(threadIdx.y, blockIdx.y, blockDim.y);

    // Threads beyond column n-1 have no element to update; without this guard
    // an over-provisioned grid writes into padding or into the next row.
    if (thidy >= (unsigned int)n)
    {
        return;
    }

    const unsigned int centre = INDEX(thidy, thidx, pitch);
    const unsigned int rowBefore = INDEX(thidy, thidx - 1, pitch);
    const unsigned int rowAfter = INDEX(thidy, thidx + 1, pitch);

    // Only the neighbour(s) a branch actually needs are loaded.
    float pairSum;
    if (thidy == 0)
    {
        pairSum = bu[thidx] + u[rowBefore];
    }
    else if (thidy == (unsigned int)(n - 1))
    {
        // Indexed by thidx to match the bu branch and rhssetup3. The previous
        // bd[thidy] always read the single element bd[n-1] here.
        pairSum = bd[thidx] + u[rowAfter];
    }
    else
    {
        pairSum = u[rowAfter] + u[rowBefore];
    }

    x[centre] = alpha * pairSum + (1.0f - alpha) * u[centre];
}
/*
 * rhssetup3 - same update as rhssetup2, but u is read with its two indices
 * swapped. Each thread writes
 *
 *     x[INDEX(thidy, thidx, pitch)] = alpha * (a + b) + (1 - alpha) * u[t],
 *     t = INDEX(thidx, thidy, pitch)
 *
 * where the pair (a, b) depends on thidy:
 *     thidy == 0      : bu[thidx]  and  u[INDEX(thidx-1, thidy, pitch)]
 *     thidy == n - 1  : bd[thidx]  and  u[INDEX(thidx+1, thidy, pitch)]
 *     otherwise       : u[INDEX(thidx+1, thidy, pitch)]  and
 *                       u[INDEX(thidx-1, thidy, pitch)]
 *
 * Parameters
 *     x      output array.
 *     u      input array, read with swapped indices as shown above.
 *     bu,bd  indexed by thidx; presumably the two boundary vectors
 *            (TODO confirm meaning and required length with the caller).
 *     n      thidy == n-1 selects the bd branch; threads with thidy >= n do nothing.
 *     pitch  row stride, used as an ELEMENT count, not a byte count.
 *     alpha  blend weight.
 *
 * Memory access: thidy is built from threadIdx.x. The store to x uses thidy
 * as the unit-stride term, so consecutive threadIdx.x write consecutive
 * addresses. Every load from u uses thidy as the term multiplied by pitch, so
 * consecutive threadIdx.x read addresses that are pitch elements apart. Those
 * strided loads cannot be combined into few memory transactions and are the
 * likely reason this kernel is much slower than rhssetup2. Staging a tile of
 * u in shared memory and reading it transposed is the usual remedy.
 *
 * NOTE(review): thidx is unsigned, so thidx-1 wraps when thidx == 0; for
 * thidy == 0 the resulting u index is 0xFFFFFFFF. The read at thidx+1 has no
 * upper bound either. Confirm the intended handling of these edges.
 */
__global__ void rhssetup3(float*x, float*u,float*bu,float*bd,int n,int pitch,float alpha)
{
    const unsigned int thidy = INDEX(threadIdx.x, blockIdx.x, blockDim.x);
    const unsigned int thidx = INDEX(threadIdx.y, blockIdx.y, blockDim.y);

    // Threads beyond n-1 have no element to update; without this guard an
    // over-provisioned grid reads and writes outside the intended range.
    if (thidy >= (unsigned int)n)
    {
        return;
    }

    const unsigned int outPos = INDEX(thidy, thidx, pitch);
    const unsigned int inPos = INDEX(thidx, thidy, pitch);
    const unsigned int inBefore = INDEX(thidx - 1, thidy, pitch);
    const unsigned int inAfter = INDEX(thidx + 1, thidy, pitch);

    // Only the neighbour(s) a branch actually needs are loaded.
    float pairSum;
    if (thidy == 0)
    {
        pairSum = bu[thidx] + u[inBefore];
    }
    else if (thidy == (unsigned int)(n - 1))
    {
        pairSum = bd[thidx] + u[inAfter];
    }
    else
    {
        pairSum = u[inAfter] + u[inBefore];
    }

    x[outPos] = alpha * pairSum + (1.0f - alpha) * u[inPos];
}