FDTD speed problem

Hi,

I am trying to use CUDA to solve an FDTD problem, but I am having trouble getting any speed-up.

The section I highlighted in red is where the calculation takes the longest.

I use texture memory for the read-only variables and global memory for the variables that need to be updated.

[codebox]

// Number of blocks of the EField_Texture launch currently in flight that have
// finished (and published) their field update. The last block to take a ticket
// performs the voltage sampling and resets the counter to 0 for the next launch.
__device__ unsigned int efieldBlocksDone = 0;

// Truncates `value` toward zero at six decimal places by a round-trip through
// int. The arithmetic is exactly the original "(int)(v*1000000) / 1000000"
// idiom; note that it cannot represent magnitudes above roughly 2147.
__device__ inline float truncateToMicro(float value)
{
    int scaled = value * 1000000;
    return (float) scaled / 1000000;
}

// E-field update followed by the voltage sampling for one time step.
//
// Launch layout: 1D grid of 1D blocks, one thread per cell of the flattened
// field arrays, indexed as x + y*81 + z*81*76 (x fastest). Threads whose index
// falls outside the 16 z-slabs, or on an excluded boundary cell, update nothing,
// so the grid may be rounded up past the array size.
//
// Ex, Ey, Ez      field arrays updated in place (global memory)
// Hx, Hy, Hz      field arrays that are only read here
// V               source array, read at V[n-1] by the Ez update
// V1, V2          outputs; element n-1 receives -dz * (sum of Ez along z)
// xe1, xs2, yss1  select the x / y position of the first sampling line
// xe2, xs3, yss2  select the x / y position of the second sampling line
// zs, ze          sampling covers z = zs-1 .. ze-2
// n               1-based time-step number; n must be >= 1 because V, V1 and V2
//                 are indexed with n-1
//
// The per-cell multipliers are fetched from the texture references rT1..rT13,
// which are declared and bound outside this kernel.
//
// Requires global-memory atomics (atomicInc), i.e. compute capability 1.1+.
__global__ void EField_Texture( float* Ex, float* Ey, float* Ez, float* V ,float* Hx, float* Hy, float* Hz, float* V1, float* V2,int xe1,int xs2,int zs,int ze,float dz,int xe2,int xs3,int n,int yss1,int yss2)
{
    const int NX   = 81;        // cells per row (x extent)
    const int NY   = 76;        // rows per z-slab (y extent)
    const int NZ   = 16;        // number of z-slabs that are updated
    const int SLAB = NX * NY;   // cells per z-slab

    int tx = blockIdx.x * blockDim.x + threadIdx.x;

    // Decompose the flat index once. The original code searched for the slab
    // with a 16-iteration loop per field component; at most one iteration could
    // ever match, so a single division gives the same answer.
    const int slab   = tx / SLAB;
    const int inSlab = tx - slab * SLAB;   // offset inside the slab, 0 .. SLAB-1
    const int x      = tx % NX;            // SLAB is a multiple of NX

    const bool anySlab       = slab < NZ;
    const bool upperSlab     = slab >= 1 && slab < NZ;     // has a slab below (tx - SLAB is valid)
    const bool pastFirstRow  = inSlab > NX - 1;            // has a row below (tx - NX stays in the slab)
    const bool beforeLastRow = inSlab < NX * (NY - 1);     // last row of every slab is excluded
    const bool notLastX      = x != NX - 1;
    const bool interiorX     = notLastX && x != 0;

    if (upperSlab && pastFirstRow && beforeLastRow && notLastX)
    {
        Ex[tx] = truncateToMicro(tex1Dfetch( rT1, tx ) * Ex[tx]
                               + tex1Dfetch( rT2, tx ) * (Hz[tx] - Hz[tx - NX])
                               - tex1Dfetch( rT4, tx ) * (Hy[tx] - Hy[tx - SLAB]));
    }

    if (upperSlab && beforeLastRow && interiorX)
    {
        Ey[tx] = truncateToMicro(tex1Dfetch( rT6, tx ) * Ey[tx]
                               + tex1Dfetch( rT7, tx ) * (Hx[tx] - Hx[tx - SLAB])
                               - tex1Dfetch( rT9, tx ) * (Hz[tx] - Hz[tx - 1]));
    }

    if (anySlab && pastFirstRow && beforeLastRow && interiorX)
    {
        Ez[tx] = truncateToMicro(tex1Dfetch( rT10, tx ) * Ez[tx]
                               + tex1Dfetch( rT11, tx ) * (Hy[tx] - Hy[tx - 1])
                               - tex1Dfetch( rT12, tx ) * (Hx[tx] - Hx[tx - NX])
                               - tex1Dfetch( rT13, tx ) * V[n-1]);
    }

    // Voltage sampling.
    //
    // Originally every thread of the grid ran both z-loops and wrote V1/V2,
    // which multiplied the global-memory traffic by the thread count and read
    // Ez cells that other blocks might not have updated yet. Instead, each
    // thread publishes its writes, each block takes a ticket, and only the
    // block that takes the last ticket (so every other block is already done)
    // performs the sampling once.
    __threadfence();     // make this thread's Ez write visible device-wide
    __syncthreads();     // every thread of the block has published its writes

    if (threadIdx.x == 0)
    {
        const unsigned int ticket = atomicInc(&efieldBlocksDone, gridDim.x);

        if (ticket == gridDim.x - 1)
        {
            // Read through a volatile pointer so the loads observe the values
            // written by the other blocks rather than anything cached earlier.
            const volatile float* EzFinal = Ez;

            float sum1 = 0;
            float sum2 = 0;

            // NOTE(review): the division is an integer division, so the +0.5
            // never rounds up; if round-to-nearest of (xe1+xs2)/2 was intended
            // this needs a floating-point division. Kept as in the original.
            int round = ((xe1 + xs2) / 2) + 0.5;

            for (int z = zs - 1; z < ze - 1; z++)
            {
                sum1 = sum1 + EzFinal[round - 1 + (yss1 - 1) * NX + z * SLAB];
            }

            V1[n-1] = -dz * sum1;

            round = ((xe2 + xs3) / 2) + 0.5;

            for (int z = zs - 1; z < ze - 1; z++)
            {
                sum2 = sum2 + EzFinal[round - 1 + (yss2 - 1) * NX + z * SLAB];
            }

            V2[n-1] = -dz * sum2;

            // Re-arm the ticket counter for the next launch.
            efieldBlocksDone = 0;
        }
    }
}

[/codebox]

That is the first subroutine (kernel), which updates my electric field. It is followed by two more subroutines that update the data.

The loop runs 3000 times. I ran it on an 8400 card, and the calculation took 10 times longer than on the CPU.

[codebox]

// Blocks needed so that each of the `size` cells gets a thread. Integer
// ceil-division, computed once instead of a float ceil() at every launch.
const int gridBlocks = (size + BLOCK_DIM - 1) / BLOCK_DIM;

// Time stepping. n is the 1-based step number: every array below (and V, V1,
// V2 inside the kernels) is indexed with n-1, so starting at n = 0 would access
// element -1 on the first step.
// NOTE(review): with n running to 3000, nwrap is read up to nwrap[3001] --
// confirm it holds at least 3002 entries.
for (int n = 1; n <= 3000; n++)
{
   printf("\nloop%d", n);

   time = time + dt/2;

   int nn = nwrap[n-1];
   int ln = nwrap[n+2-1];

   // The three kernels are issued to the same (default) stream, so each one
   // starts only after the previous one has finished on the device.
   EField_Texture<<< gridBlocks, BLOCK_DIM >>>( dev_Ex, dev_Ey, dev_Ez, dev_V, dev_Hx, dev_Hy, dev_Hz, dev_V1, dev_V2,
                                                xe1, xs2, zs, ze, dz, xe2, xs3, n, yss1, yss2 );

   tv[n-1] = time*1e+9;  // Voltage time array in nano seconds

   ABC_Texture<<< gridBlocks, BLOCK_DIM >>>( dev_Ey, dev_Ez, dev_ezlx1, dev_ezlx2, dev_eylx1, dev_eylx2, nn, ln,
                                             dev_Ex, dev_ezly1, dev_ezly2, dev_exly1, dev_exly2, dev_exlz1, dev_exlz2,
                                             dev_eylz1, dev_eylz2, dev_nwrap, n );

   HField_Texture<<< gridBlocks, BLOCK_DIM >>>( dev_Hx, dev_Hy, dev_Hz, dev_Ex, dev_Ey, dev_Ez, dev_I1, dev_I2, xe1,
                                                xs2, yss1, ze, xe2, xs3, yss2, dz, n, dx );

   // Kernel launches do not return a status themselves; pick up any
   // launch-configuration error here instead of silently continuing.
   cudaError_t launchStatus = cudaGetLastError();
   if (launchStatus != cudaSuccess)
   {
      printf("\nCUDA launch failed at step %d: %s", n, cudaGetErrorString(launchStatus));
      break;
   }

   time = time + dt/2;

   ti[n-1] = time*1e+9;  // Current time array in nano seconds
}

[/codebox]

I also tried putting EField_Texture, ABC_Texture, and HField_Texture into a single kernel instead of calling three separate subroutines.

That gave me another problem: the program seems to start running the next calculation before it has finished the current one.

I can send you the code if you need to see it in order to help me.

Thank you very much

Eric