Hi I am new to Cuda
I got wrong o/p after compiling Fortran+CUDA. And here is my Code.
Serial 3D-FDTD Code( in fortran 77):
problem size : 1501501200
…
…
CALL EXSFLD
CALL EYSFLD
CALL EZSFLD
…
…
CALL HXSFLD
CALL HYSFLD
CALL HZSFLD
…
…
SUBROUTINE EXSFLD
DO 30 K=2,NZ1
KK = K
DO 20 J=2,NY1
JJ = J
DO 10 I=1,NX1
IF(IDONE(I,J,K).EQ.0) GO TO 100
IF(IDONE(I,J,K).EQ.1) GO TO 200
GO TO 300
C
C FREE SPACE
C
100 EXS(I,J,K)=EXS(I,J,K)+(HZS(I,J,K)-HZS(I,J-1,K))*DTEDY
$ -(HYS(I,J,K)-HYS(I,J,K-1))*DTEDZ
GO TO 10
200 II = I
EXS(I,J,K)=-EXI(II,JJ,KK)
GO TO 10
300 II = I
EXS(I,J,K)=EXS(I,J,K)*ESCTC(IDONE(I,J,K))
$ -EINCC( IDONE(I,J,K))*EXI(II,JJ,KK)
$ -EDEVCN(IDONE(I,J,K))*DEXI(II,JJ,KK)
$ +(HZS(I,J,K)-HZS(I,J-1,K))*ECRLY(IDONE(I,J,K))
$ -(HYS(I,J,K)-HYS(I,J,K-1))*ECRLZ(IDONE(I,J,K))
10 CONTINUE
20 CONTINUE
30 CONTINUE
RETURN
END
…
lly EYSFLD and EZSFLD
SUBROUTINE HYSFLD
DO 30 K=1,NZ1
DO 20 J=2,NY1
DO 10 I=1,NX1
HYS(I,J,K)=HYS(I,J,K)-(EXS(I,J,K+1)-EXS(I,J,K))*DTMDZ+(EZS(I+1,J,K)-EZS(I,J,K))*DTMDX
10 CONTINUE
20 CONTINUE
30 CONTINUE
Here, EXS, EYS, EZS, HXS, HYS, HZS are 1501501200 of size of real8. IDONE is of integer and 1501501200 size remaining arrays are of real8 of size 9.
FUNCTION EXI(I,J,K)
DIST= xxxxxxxxxxx
EXI = AMPX*SOURCE(DIST)
FUNCTION SOURCE(DIST)
TAU=T-DIST/C
omega = 2.0d0 * pi * 1.5d9
source = sin(omega * tau)
lly DEXI(I,J,K)
And my Fortran + CUDA code is
serialcu.f
…
…
CALL exsfld(EXS,IDONE,HYS,HZS,
$ ESCTC,EINCC,ESCTC,ECRLX,ECRLY,ECRLZ,T)
CALL EYSFLD
…
…
exsfld.cu
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <stdlib.h>
#include “cutil_inline.h”
#define AMPX 707.106781186548
#define AMPY 0.0
#define AMPZ -707.106781186547
#define XDISP -0.707106781186547
#define YDISP 0.0
#define ZDISP -0.707106781186548
#define C 299796408.447888
#define f 9424777960.76937
// #define d 1.925807824645716E-012
#define DELX 0.001
#define DELY 0.001
#define DELZ 0.001
#define DTEDX 217.507095622963
#define DTEDY 217.507095622963
#define DTEDZ 217.507095622963
#define NX 150
#define NY 150
#define NZ 1200
#define DELAY 0.953179941039466
global void ker_exs(double *EXS, int *IDONE, double *HYS, double *HZS, double *ESCTC, double *EINCC, double *EDEVCN, double *ECRLY, double *ECRLZ, double T1)
{
int I,J,K, II,JJ,KK;
double T=T1;
for(K=1; K<NZ; K++)
{
KK=K;
for(J=1; J<NY; J++)
{
JJ=J;
for(I=0; I<NX; I++)
{
if ( IDONE[I+J*NX+K*NX*NY]==0 )
{
EXS[I+J*NX+K*NX*NY]=EXS[I+J*NX+K*NX*NY]+(HZS[I+J*NX+K*NX*NY]-HZS[I+(J-1)*NX+K*NX*NY])*DTEDY-(HYS[I+J*NX+K*NX*NY]-HYS[I+J*NX+(K-1)*NX*NY])*DTEDZ;
}
else if (IDONE[I+J*NX+K*NX*NY]==1)
{
II = I;
EXS[I+J*NX+K*NX*NY]=-AMPX * sinf((T- ((xxxxxxxxxx)/C )*f);
}
else
{
II=I;
EXS[I+J*NX+K*NX*NY]= EXS[I+J*NX+K*NX*NY]*ESCTC[IDONE[I+J*NX+K*NX*NY]]-EINCC[IDONE[I+J*NX+K*NX*NY]]*AMPX*sinf( T-(xxxxx)*f)-EDEVCN[IDONE[I+J*NX+K*NX*NY]]* AMPX*(f * cosf( f* xxxxxx))+(HZS[I+J*NX+K*NX*NY]-HZS[I+(J-1)*NX+K*NX*NY])*ECRLY[IDONE[I+J*NX+K*NX*NY]]-(HYS[I+J*NX+K*NX*NY]-HYS[I+J*NX+(K-1)*NX*NY])*ECRLZ[IDONE[I+J*NX+K*NX*NY]]; }
}
}
}
}
extern “C”
void exsfld_(double *EXS, int *IDONE, double *HYS, double *HZS, double *ESCTC, double *EINCC, double *EDEVCN, double *ECRLY, double *ECRLZ, double *T)
{
double *d_a, *d_h, *d_i, *d_j, *d_k, *d_l, *d_n, *d_o;
int *d_d, I, J, K;
double t=*T;
size_t M= 9sizeof(double);
size_t N=NXNYNZsizeof(double);
size_t NI=NXNYNZ*sizeof(int);
cudaMalloc( (void **) &d_a, N );
cudaMalloc( (void **) &d_d, NI );
cudaMalloc( (void **) &d_h, N );
cudaMalloc( (void **) &d_i, N );
cudaMalloc( (void **) &d_j, M );
cudaMalloc( (void **) &d_k, M );
cudaMalloc( (void **) &d_l, M );
cudaMalloc( (void **) &d_n, M );
cudaMalloc( (void **) &d_o, M );
cudaMemcpy( d_a, EXS, N, cudaMemcpyHostToDevice);
cudaMemcpy( d_d, IDONE, NI, cudaMemcpyHostToDevice );
cudaMemcpy( d_h, HYS, N, cudaMemcpyHostToDevice );
cudaMemcpy( d_i, HZS, N, cudaMemcpyHostToDevice );
cudaMemcpy( d_j, ESCTC, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_k, EINCC, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_l, EDEVCN,M, cudaMemcpyHostToDevice );
cudaMemcpy( d_n, ECRLY, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_o, ECRLZ, M, cudaMemcpyHostToDevice );
ker_exs<<<1,1>>>(d_a, d_d, d_h, d_i, d_j, d_k, d_l, d_n, d_o,t);
cudaMemcpy(EXS,d_a,N,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_d);
cudaFree(d_h);
cudaFree(d_i);
cudaFree(d_j);
cudaFree(d_k);
cudaFree(d_l);
cudaFree(d_n);
cudaFree(d_o);
}
Does my conversion 3D to 1D is correct? Is there any other possible way to do these type of programs other than PGI Fortran and cublas library.
I thought communication is not possible inside exsfld. EXSFLD calls HYS,HZS and other subroutines. But HYS and HZS are arrays. subroutine contains EYS and EZS fields.
Thnaks in advance.