3D FDTD - Fortran + CUDA program help

Hi I am new to Cuda
I got wrong o/p after compiling Fortran+CUDA. And here is my Code.

Serial 3D-FDTD Code( in fortran 77):

problem size : 1501501200



CALL EXSFLD
CALL EYSFLD
CALL EZSFLD


CALL HXSFLD
CALL HYSFLD
CALL HZSFLD

SUBROUTINE EXSFLD
DO 30 K=2,NZ1
KK = K
DO 20 J=2,NY1
JJ = J
DO 10 I=1,NX1

        IF(IDONE(I,J,K).EQ.0) GO TO 100
        IF(IDONE(I,J,K).EQ.1) GO TO 200
        GO TO 300

C
C FREE SPACE
C
100 EXS(I,J,K)=EXS(I,J,K)+(HZS(I,J,K)-HZS(I,J-1,K))*DTEDY
$ -(HYS(I,J,K)-HYS(I,J,K-1))*DTEDZ

        GO TO 10

200 II = I
EXS(I,J,K)=-EXI(II,JJ,KK)

        GO TO 10

300 II = I
EXS(I,J,K)=EXS(I,J,K)*ESCTC(IDONE(I,J,K))
$ -EINCC( IDONE(I,J,K))*EXI(II,JJ,KK)
$ -EDEVCN(IDONE(I,J,K))*DEXI(II,JJ,KK)
$ +(HZS(I,J,K)-HZS(I,J-1,K))*ECRLY(IDONE(I,J,K))
$ -(HYS(I,J,K)-HYS(I,J,K-1))*ECRLZ(IDONE(I,J,K))

10 CONTINUE
20 CONTINUE
30 CONTINUE

    RETURN
    END


lly EYSFLD and EZSFLD

SUBROUTINE HYSFLD

 DO 30 K=1,NZ1
    DO 20 J=2,NY1
      DO 10 I=1,NX1
        HYS(I,J,K)=HYS(I,J,K)-(EXS(I,J,K+1)-EXS(I,J,K))*DTMDZ+(EZS(I+1,J,K)-EZS(I,J,K))*DTMDX

10 CONTINUE
20 CONTINUE
30 CONTINUE

Here, EXS, EYS, EZS, HXS, HYS, HZS are 1501501200 of size of real8. IDONE is of integer and 1501501200 size remaining arrays are of real8 of size 9.

FUNCTION EXI(I,J,K)

DIST= xxxxxxxxxxx
EXI = AMPX*SOURCE(DIST)

FUNCTION SOURCE(DIST)

TAU=T-DIST/C
omega = 2.0d0 * pi * 1.5d9
source = sin(omega * tau)

lly DEXI(I,J,K)

And my Fortran + CUDA code is

serialcu.f


CALL exsfld(EXS,IDONE,HYS,HZS,
$ ESCTC,EINCC,ESCTC,ECRLX,ECRLY,ECRLZ,T)
CALL EYSFLD

exsfld.cu

#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <stdlib.h>
#include “cutil_inline.h”

#define AMPX 707.106781186548
#define AMPY 0.0
#define AMPZ -707.106781186547

#define XDISP -0.707106781186547
#define YDISP 0.0
#define ZDISP -0.707106781186548

#define C 299796408.447888
#define f 9424777960.76937
// #define d 1.925807824645716E-012

#define DELX 0.001
#define DELY 0.001
#define DELZ 0.001

#define DTEDX 217.507095622963
#define DTEDY 217.507095622963
#define DTEDZ 217.507095622963

#define NX 150
#define NY 150
#define NZ 1200

#define DELAY 0.953179941039466

global void ker_exs(double *EXS, int *IDONE, double *HYS, double *HZS, double *ESCTC, double *EINCC, double *EDEVCN, double *ECRLY, double *ECRLZ, double T1)
{
int I,J,K, II,JJ,KK;
double T=T1;

for(K=1; K<NZ; K++)
{
KK=K;
for(J=1; J<NY; J++)
{
JJ=J;
for(I=0; I<NX; I++)
{
if ( IDONE[I+J*NX+K*NX*NY]==0 )
{
EXS[I+J*NX+K*NX*NY]=EXS[I+J*NX+K*NX*NY]+(HZS[I+J*NX+K*NX*NY]-HZS[I+(J-1)*NX+K*NX*NY])*DTEDY-(HYS[I+J*NX+K*NX*NY]-HYS[I+J*NX+(K-1)*NX*NY])*DTEDZ;
}

     else if (IDONE[I+J*NX+K*NX*NY]==1)
        {
        II = I;
        EXS[I+J*NX+K*NX*NY]=-AMPX * sinf((T- ((xxxxxxxxxx)/C )*f);
        }
 else
        {
        II=I;
        EXS[I+J*NX+K*NX*NY]= EXS[I+J*NX+K*NX*NY]*ESCTC[IDONE[I+J*NX+K*NX*NY]]-EINCC[IDONE[I+J*NX+K*NX*NY]]*AMPX*sinf( T-(xxxxx)*f)-EDEVCN[IDONE[I+J*NX+K*NX*NY]]* AMPX*(f * cosf( f* xxxxxx))+(HZS[I+J*NX+K*NX*NY]-HZS[I+(J-1)*NX+K*NX*NY])*ECRLY[IDONE[I+J*NX+K*NX*NY]]-(HYS[I+J*NX+K*NX*NY]-HYS[I+J*NX+(K-1)*NX*NY])*ECRLZ[IDONE[I+J*NX+K*NX*NY]];  } 
    }
 }

}
}

extern “C”
void exsfld_(double *EXS, int *IDONE, double *HYS, double *HZS, double *ESCTC, double *EINCC, double *EDEVCN, double *ECRLY, double *ECRLZ, double *T)
{
double *d_a, *d_h, *d_i, *d_j, *d_k, *d_l, *d_n, *d_o;
int *d_d, I, J, K;
double t=*T;

size_t M= 9sizeof(double);
size_t N=NX
NYNZsizeof(double);
size_t NI=NXNYNZ*sizeof(int);

cudaMalloc( (void **) &d_a, N );

cudaMalloc( (void **) &d_d, NI );

cudaMalloc( (void **) &d_h, N );
cudaMalloc( (void **) &d_i, N );

cudaMalloc( (void **) &d_j, M );
cudaMalloc( (void **) &d_k, M );
cudaMalloc( (void **) &d_l, M );

cudaMalloc( (void **) &d_n, M );
cudaMalloc( (void **) &d_o, M );

cudaMemcpy( d_a, EXS, N, cudaMemcpyHostToDevice);

cudaMemcpy( d_d, IDONE, NI, cudaMemcpyHostToDevice );

cudaMemcpy( d_h, HYS, N, cudaMemcpyHostToDevice );
cudaMemcpy( d_i, HZS, N, cudaMemcpyHostToDevice );

cudaMemcpy( d_j, ESCTC, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_k, EINCC, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_l, EDEVCN,M, cudaMemcpyHostToDevice );

cudaMemcpy( d_n, ECRLY, M, cudaMemcpyHostToDevice );
cudaMemcpy( d_o, ECRLZ, M, cudaMemcpyHostToDevice );

ker_exs<<<1,1>>>(d_a, d_d, d_h, d_i, d_j, d_k, d_l, d_n, d_o,t);

cudaMemcpy(EXS,d_a,N,cudaMemcpyDeviceToHost);

cudaFree(d_a);
cudaFree(d_d);
cudaFree(d_h);
cudaFree(d_i);
cudaFree(d_j);
cudaFree(d_k);
cudaFree(d_l);
cudaFree(d_n);
cudaFree(d_o);
}

Does my conversion 3D to 1D is correct? Is there any other possible way to do these type of programs other than PGI Fortran and cublas library.

I thought communication is not possible inside exsfld. EXSFLD calls HYS,HZS and other subroutines. But HYS and HZS are arrays. subroutine contains EYS and EZS fields.

Thnaks in advance.