# FDTD speed problem

Hi,

I am trying to use CUDA to solve FDTD question. I have speed up problem.

The red section I highlight is where take the longest calculation time.

I use the texture memory for those read-only variable. I use global memory For those variable

that need update.

[codebox]

global void EField_Texture( float* Ex, float* Ey, float* Ez, float* V ,float* Hx, float* Hy, float* Hz, float* V1, float* V2,int xe1,int xs2,int zs,int ze,float dz,int xe2,int xs3,int n,int yss1,int yss2)

{

``````int tx = blockIdx.x * blockDim.x + threadIdx.x;

for (int i=1; i<16; i++)

{

if (tx-i*(81*76)>81-1 && tx-i*(81*76)<81*75)

{

if((tx+1) % 81 !=0 )

{

int test1;

test1=(tex1Dfetch( rT1, tx )* Ex[tx]+tex1Dfetch( rT2, tx )*(Hz[tx]-Hz[tx-81])-tex1Dfetch( rT4, tx )*(Hy[tx]-Hy[tx-81*76]))*1000000;

Ex[tx]=(float) test1/1000000;

}

}
``````

}

``````for (int i=1; i<16; i++)

{

if (tx-i*(81*76)>=0 && tx-i*(81*76)<81*75)

{

if((tx+1) % 81 !=0 && (tx) % 81 !=0  )

{

int test1;

test1=(tex1Dfetch( rT6, tx )*Ey[tx]+tex1Dfetch( rT7, tx )*(Hx[tx]-Hx[tx-81*76])-tex1Dfetch( rT9, tx )*(Hz[tx]-Hz[tx-1]))*1000000;

Ey[tx]=(float) test1/1000000;

}

}

}

for (int i=0; i<16; i++)

{

if (tx-i*(81*76)>81-1 && tx-i*(81*76)<81*75)

{

if((tx+1) % 81 !=0 && (tx) % 81 !=0  )

{

int test1;

test1=(tex1Dfetch( rT10, tx )* Ez[tx]+tex1Dfetch( rT11, tx )*(Hy[tx]-Hy[tx-1])-tex1Dfetch( rT12, tx )*(Hx[tx]-Hx[tx-81])-tex1Dfetch( rT13, tx )*V[n-1])*1000000;

Ez[tx]=(float) test1/1000000;

}

}

}
``````

// Voltage sampling

``````float sum1=0;

float sum2=0;

int round=((xe1+xs2)/2)+0.5;

for (int z=zs-1 ; z<ze-1; z++)

{

sum1 =sum1+ Ez[round-1+(yss1-1)*81+z*81*76];

}

V1[n-1] = -dz*sum1;

round=((xe2+xs3)/2)+0.5;

for (int z=zs-1 ; z<ze-1; z++)

{

sum2 = sum2+Ez[round-1+(yss2-1)*81+z*81*76];

}

V2[n-1] = -dz*sum2;
``````

}

[/codebox]

That is the first subroutine for my Electrical field. Then follow with another two subroutines to update the data.

Run a 3000 times loop. I ran it with a 8400 card. I had the calculation time is 10 times longer than CPU.

[codebox]

for(int n=0;n<3000;n++)

``````{

printf("\nloop%d",n);

time = time+dt/2;

int nn = nwrap[n-1];

int ln=nwrap[n+2-1];

EField_Texture<<< ceil( (float)size / BLOCK_DIM ), BLOCK_DIM >>>( dev_Ex,dev_Ey,dev_Ez,dev_V,dev_Hx,dev_Hy,dev_Hz,dev_V1,dev_V
``````

2,xe1,xs2,zs,ze,dz,xe2,xs3,n,yss1,yss2);

``````   tv[n-1] = time*1e+9;  // Voltage time array in nano seconds

ABC_Texture<<< ceil( (float)size / BLOCK_DIM ), BLOCK_DIM >>>( dev_Ey,dev_Ez,dev_ezlx1,dev_ezlx2,dev_eylx1,dev_eylx2,nn,ln,
``````

dev_Ex,dev_ezly1,dev_ezly2,dev_exly1,dev_exly2,dev_exlz1,dev_

exlz2,dev_eylz1,dev_eylz2,dev_nwrap,n);

HField_Texture<<< ceil( (float)size / BLOCK_DIM ), BLOCK_DIM >>>( dev_Hx,dev_Hy,dev_Hz,dev_Ex,dev_Ey,dev_Ez,dev_I1,dev_I2,xe1,

xs2,yss1,ze,xe2,xs3,yss2,dz,n,dx);

``````   time = time+dt/2;

ti[n-1] = time*1e+9;  // Current time array in nano seconds

}
``````

[/codebox]

I also tried to put the EField_Texture, ABC_Texture, and HField_Texture into the same kernel instead of calling 3 subroutines.

Then I had another problem which is the program seems start running the next calculation before finished the current one.

I can send a code to you if you need to see the code to help me.

Thank you very much

Eric