#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define XYLENGTH 480
#define MATRIXSIZE (XYLENGTH * XYLENGTH)
#define BLOCK_LENGTH 16
#define AR_INDEX(ar, a, b) ((ar)[(a) * XYLENGTH + (b)])

// declaration of the matrices used in the host memory
float* h_ga;
float* h_dz;
float* h_ez;
float* h_hx;
float* h_hy;
float* h_ihx;
float* h_ihy;
float* h_gi2;
float* h_gj2;
float* h_gi3;
float* h_gj3;
float* h_fi1;
float* h_fj1;
float* h_fi2;
float* h_fj2;
float* h_fi3;
float* h_fj3;

// declaration of the matrices used in the kernels.
// These are host-side pointers to device memory, set by cudaMalloc and
// passed to the kernels as arguments. (The original declared them
// __device__, but a __device__ variable cannot be written by cudaMalloc
// from host code, so the kernels would have read uninitialized pointers.)
float* d_ga;
float* d_dz;
float* d_ez;
float* d_hx;
float* d_hy;
float* d_ihx;
float* d_ihy;
float* d_gi2;
float* d_gj2;
float* d_gi3;
float* d_gj3;
float* d_fi1;
float* d_fj1;
float* d_fi2;
float* d_fj2;
float* d_fi3;
float* d_fj3;

// Kernel that executes on the CUDA device
__global__ void UpdateDZfield(int time, float* d_dz, float* d_hx, float* d_hy,
                              float* d_gi2, float* d_gj2,
                              float* d_gi3, float* d_gj3)
{
    // the coordinates of the sinusoidal source
    int ic = XYLENGTH / 2 - 5;
    int jc = XYLENGTH / 2 - 5;
    float ddx = 0.01f;       // cell size
    float dt = ddx / 6e8f;   // time step
    float pi = 3.14159f;

    // block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // coordinates in the global matrix
    int x = bx * BLOCK_LENGTH + tx;
    int y = by * BLOCK_LENGTH + ty;

    if (x == 0 || y == 0) {
        // boundary condition: do nothing
    } else {
        // update the dz field
        AR_INDEX(d_dz, x, y) = d_gi3[x] * d_gj3[y] * AR_INDEX(d_dz, x, y)
            + d_gi2[x] * d_gj2[y] * 0.5f
              * ( AR_INDEX(d_hy, x, y) - AR_INDEX(d_hy, x - 1, y)
                - AR_INDEX(d_hx, x, y) + AR_INDEX(d_hx, x, y - 1) );
    }

    // sinusoidal hard source (1500 MHz) at the cell (ic, jc)
    if (x == ic && y == jc) {
        float pulse = sinf(2.0f * pi * 1500e6f * dt * time);
        AR_INDEX(d_dz, x, y) = pulse;
    }
} // end of UpdateDZfield(int time)

__global__ void UpdateEZfield(float* d_ez, float* d_ga, float* d_dz)
{
    // block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // coordinates in the global matrix
    int x = bx * BLOCK_LENGTH + tx;
    int y = by * BLOCK_LENGTH + ty;

    // calculation of the Ez field
    AR_INDEX(d_ez, x, y) = AR_INDEX(d_ga, x, y) * AR_INDEX(d_dz, x, y);

    // set the Ez edges to 0, as part of the PML
    if (x == 0 || x == XYLENGTH - 1 || y == 0 || y == XYLENGTH - 1)
        AR_INDEX(d_ez, x, y) = 0.0f;
} // end of UpdateEZfield()

__global__ void UpdateHfield(float* d_ez, float* d_hx, float* d_hy,
                             float* d_ihx, float* d_ihy,
                             float* d_fi1, float* d_fi2, float* d_fi3,
                             float* d_fj1, float* d_fj2, float* d_fj3)
{
    float curl_e;

    // block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // coordinates in the global matrix
    int x = bx * BLOCK_LENGTH + tx;
    int y = by * BLOCK_LENGTH + ty;

    // calculation of the Hx field
    if (y == XYLENGTH - 1) {
        // boundary condition: do nothing
    } else {
        curl_e = AR_INDEX(d_ez, x, y) - AR_INDEX(d_ez, x, y + 1);
        AR_INDEX(d_ihx, x, y) = AR_INDEX(d_ihx, x, y) + d_fi1[x] * curl_e;
        AR_INDEX(d_hx, x, y) = d_fj3[y] * AR_INDEX(d_hx, x, y)
            + d_fj2[y] * 0.5f * (curl_e + AR_INDEX(d_ihx, x, y));
    }

    // calculation of the Hy field
    if (x == XYLENGTH - 1) {
        // boundary condition: do nothing
    } else {
        curl_e = AR_INDEX(d_ez, x + 1, y) - AR_INDEX(d_ez, x, y);
        AR_INDEX(d_ihy, x, y) = AR_INDEX(d_ihy, x, y) + d_fj1[y] * curl_e;
        AR_INDEX(d_hy, x, y) = d_fi3[x] * AR_INDEX(d_hy, x, y)
            + d_fi2[x] * 0.5f * (curl_e + AR_INDEX(d_ihy, x, y));
    }
} // end of UpdateHfield()
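// Added helper (not in the original listing): a minimal CUDA error-checking
// wrapper. The original code ignores the status returned by every CUDA call;
// routing calls through this function, e.g.
//     checkCuda(cudaMalloc((void**) &d_ga, mem_size_matrix));
// makes allocation, copy, or launch failures visible instead of silent.
// Its use is optional; the code below is left as the author wrote it.
static void checkCuda(cudaError_t status)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}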
// main driver that executes on the host
int main(void)
{
    // variables used in the FDTD simulation
    int n, i, j, nsteps, npml, T;
    float xn, xxn, xnum, xd;
    float t0, spread;   // unused here: leftovers from a Gaussian-pulse source
    float epsz;         // unused here: ga is set to 1.0 (free space) directly

    // file used to record the result
    FILE* fp;

    // allocate host memory for the matrices and arrays
    unsigned int mem_size_matrix = sizeof(float) * MATRIXSIZE;
    h_ga  = (float*) malloc(mem_size_matrix);
    h_dz  = (float*) malloc(mem_size_matrix);
    h_ez  = (float*) malloc(mem_size_matrix);
    h_hx  = (float*) malloc(mem_size_matrix);
    h_hy  = (float*) malloc(mem_size_matrix);
    h_ihx = (float*) malloc(mem_size_matrix);
    h_ihy = (float*) malloc(mem_size_matrix);

    unsigned int mem_size_array = sizeof(float) * XYLENGTH;
    h_gi2 = (float*) malloc(mem_size_array);
    h_gj2 = (float*) malloc(mem_size_array);
    h_gi3 = (float*) malloc(mem_size_array);
    h_gj3 = (float*) malloc(mem_size_array);
    h_fi1 = (float*) malloc(mem_size_array);
    h_fj1 = (float*) malloc(mem_size_array);
    h_fi2 = (float*) malloc(mem_size_array);
    h_fj2 = (float*) malloc(mem_size_array);
    h_fi3 = (float*) malloc(mem_size_array);
    h_fj3 = (float*) malloc(mem_size_array);

    // allocate device memory for the matrices and arrays
    cudaMalloc((void**) &d_ga,  mem_size_matrix);
    cudaMalloc((void**) &d_dz,  mem_size_matrix);
    cudaMalloc((void**) &d_ez,  mem_size_matrix);
    cudaMalloc((void**) &d_hx,  mem_size_matrix);
    cudaMalloc((void**) &d_hy,  mem_size_matrix);
    cudaMalloc((void**) &d_ihx, mem_size_matrix);
    cudaMalloc((void**) &d_ihy, mem_size_matrix);
    cudaMalloc((void**) &d_gi2, mem_size_array);
    cudaMalloc((void**) &d_gj2, mem_size_array);
    cudaMalloc((void**) &d_gi3, mem_size_array);
    cudaMalloc((void**) &d_gj3, mem_size_array);
    cudaMalloc((void**) &d_fi1, mem_size_array);
    cudaMalloc((void**) &d_fj1, mem_size_array);
    cudaMalloc((void**) &d_fi2, mem_size_array);
    cudaMalloc((void**) &d_fj2, mem_size_array);
    cudaMalloc((void**) &d_fi3, mem_size_array);
    cudaMalloc((void**) &d_fj3, mem_size_array);

    epsz = 8.8e-12;   // permittivity of free space (unused, see above)

    // initialize the matrices
    for (i = 0; i < XYLENGTH; i++) {
        for (j = 0; j < XYLENGTH; j++) {
            AR_INDEX(h_dz,  i, j) = 0.0f;
            AR_INDEX(h_ez,  i, j) = 0.0f;
            AR_INDEX(h_hx,  i, j) = 0.0f;
            AR_INDEX(h_hy,  i, j) = 0.0f;
            AR_INDEX(h_ihx, i, j) = 0.0f;
            AR_INDEX(h_ihy, i, j) = 0.0f;
            AR_INDEX(h_ga,  i, j) = 1.0f;   // free space
        } // end inner for loop
    } // end outer for loop

    // calculate the PML parameters
    // (the default initialization and the npml prompt were garbled in the
    //  source; reconstructed here as the usual free-space defaults)
    for (i = 0; i < XYLENGTH; i++) {
        h_gi2[i] = 1.0f;  h_gi3[i] = 1.0f;
        h_fi1[i] = 0.0f;  h_fi2[i] = 1.0f;  h_fi3[i] = 1.0f;
        h_gj2[i] = 1.0f;  h_gj3[i] = 1.0f;
        h_fj1[i] = 0.0f;  h_fj2[i] = 1.0f;  h_fj3[i] = 1.0f;
    }
    printf("npml --> ");
    scanf("%d", &npml);

    for (i = 0; i <= npml; i++) {
        xnum = npml - i;
        xd = npml;
        xxn = xnum / xd;
        xn = 0.33 * pow(xxn, 3);
        h_gi2[i] = 1.0 / (1.0 + xn);
        h_gi2[XYLENGTH - 1 - i] = 1.0 / (1.0 + xn);
        h_gi3[i] = (1.0 - xn) / (1.0 + xn);
        h_gi3[XYLENGTH - 1 - i] = (1.0 - xn) / (1.0 + xn);
        xxn = (xnum - 0.5) / xd;
        xn = 0.25 * pow(xxn, 3);
        h_fi1[i] = xn;
        h_fi1[XYLENGTH - 2 - i] = xn;
        h_fi2[i] = 1.0 / (1.0 + xn);
        h_fi2[XYLENGTH - 2 - i] = 1.0 / (1.0 + xn);
        h_fi3[i] = (1.0 - xn) / (1.0 + xn);
        h_fi3[XYLENGTH - 2 - i] = (1.0 - xn) / (1.0 + xn);
    }

    for (j = 0; j <= npml; j++) {
        xnum = npml - j;
        xd = npml;
        xxn = xnum / xd;
        xn = 0.33 * pow(xxn, 3);
        h_gj2[j] = 1.0 / (1.0 + xn);   // this line was missing in the original
        h_gj2[XYLENGTH - 1 - j] = 1.0 / (1.0 + xn);
        h_gj3[j] = (1.0 - xn) / (1.0 + xn);
        h_gj3[XYLENGTH - 1 - j] = (1.0 - xn) / (1.0 + xn);
        xxn = (xnum - 0.5) / xd;
        xn = 0.25 * pow(xxn, 3);
        h_fj1[j] = xn;
        h_fj1[XYLENGTH - 2 - j] = xn;
        h_fj2[j] = 1.0 / (1.0 + xn);
        h_fj2[XYLENGTH - 2 - j] = 1.0 / (1.0 + xn);
        h_fj3[j] = (1.0 - xn) / (1.0 + xn);
        h_fj3[XYLENGTH - 2 - j] = (1.0 - xn) / (1.0 + xn);
    }
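    // Note on the PML coefficients (added commentary): the loops above grade
    // an artificial loss into the outer npml cells with the cubic profile
    // xn = 0.33 * ((npml - i) / npml)^3, as in Sullivan-style 2-D FDTD codes.
    // Each gi2/gj2 factor, 1/(1+xn), attenuates the Dz flux update, and each
    // gi3/gj3 factor, (1-xn)/(1+xn), decays the previous Dz value; the
    // fi/fj arrays play the same roles for the H-field updates, evaluated at
    // the half-cell positions (hence the extra 0.5 offset in xxn), with
    // fi1/fj1 scaling the accumulated curl terms ihx/ihy. In the interior,
    // where the parameters keep their free-space defaults (1 for gi2/gi3/
    // fi2/fi3, 0 for fi1/fj1), the updates reduce to the ordinary FDTD
    // equations.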
    // simulation set-up
    t0 = 40.0f;
    spread = 15.0f;
    T = 0;
    nsteps = 1;

    // copy the matrices and arrays from host memory to device memory
    cudaMemcpy(d_dz,  h_dz,  mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ez,  h_ez,  mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_hx,  h_hx,  mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_hy,  h_hy,  mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ga,  h_ga,  mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ihx, h_ihx, mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ihy, h_ihy, mem_size_matrix, cudaMemcpyHostToDevice);
    cudaMemcpy(d_gi2, h_gi2, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_gj2, h_gj2, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_gi3, h_gi3, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_gj3, h_gj3, mem_size_array, cudaMemcpyHostToDevice); // the original copied d_gj2 twice and never d_gj3
    cudaMemcpy(d_fi1, h_fi1, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_fj1, h_fj1, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_fi2, h_fi2, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_fj2, h_fj2, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_fi3, h_fi3, mem_size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(d_fj3, h_fj3, mem_size_array, cudaMemcpyHostToDevice);

    // kernel dimensions set-up
    dim3 dimBlock(BLOCK_LENGTH, BLOCK_LENGTH);
    dim3 dimGrid(XYLENGTH / BLOCK_LENGTH, XYLENGTH / BLOCK_LENGTH);

    // start simulation
    // (cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
    //  replacement. The synchronizations are redundant here, since kernels on
    //  the default stream already run in order, but they are harmless.)
    while (nsteps > 0) {
        printf("nsteps -->");
        scanf("%d", &nsteps);
        printf("%d\n", nsteps);
        for (n = 1; n <= nsteps; n++) {
            T = T + 1;
            UpdateDZfield<<<dimGrid, dimBlock>>>(T, d_dz, d_hx, d_hy,
                                                 d_gi2, d_gj2, d_gi3, d_gj3);
            cudaDeviceSynchronize();
            UpdateEZfield<<<dimGrid, dimBlock>>>(d_ez, d_ga, d_dz);
            cudaDeviceSynchronize();
            UpdateHfield<<<dimGrid, dimBlock>>>(d_ez, d_hx, d_hy,
                                                d_ihx, d_ihy,
                                                d_fi1, d_fi2, d_fi3,
                                                d_fj1, d_fj2, d_fj3);
            cudaDeviceSynchronize();
        } // end for loop
    } // end while loop

    // copy d_ez back to h_ez and record it into a file
    cudaMemcpy(h_ez, d_ez, mem_size_matrix, cudaMemcpyDeviceToHost);
    //cudaMemcpy(h_hx, d_hx, mem_size_matrix, cudaMemcpyDeviceToHost);

    // write the Ez field out to the file "Ez.txt"
    fp = fopen("Ez.txt", "w");
    for (j = 0; j < XYLENGTH; j++) {
        for (i = 0; i < XYLENGTH; i++) {
            fprintf(fp, "%6.3f ", AR_INDEX(h_ez, i, j));
        } // end inner for loop
        fprintf(fp, " \n");
    } // end outer for loop

    /*
    // write the Hx field out to a file "Hx"
    fp = fopen("Hx", "w");
    for (j = 0; j < XYLENGTH; j++) {
        for (i = 0; i < XYLENGTH; i++) {
            fprintf(fp, "%6.3f ", AR_INDEX(h_hx, i, j));
        } // end inner for loop
        fprintf(fp, " \n");
    } // end outer for loop
    */

    // close the file
    fclose(fp);

    // free the host and device memory
    free(h_ez); free(h_dz); free(h_hx); free(h_hy);
    free(h_ihx); free(h_ihy); free(h_ga);
    free(h_gi2); free(h_gj2); free(h_gi3); free(h_gj3);
    free(h_fi1); free(h_fj1); free(h_fi2); free(h_fj2);
    free(h_fi3); free(h_fj3);
    cudaFree(d_ez); cudaFree(d_dz); cudaFree(d_hx); cudaFree(d_hy);
    cudaFree(d_ihx); cudaFree(d_ihy); cudaFree(d_ga);
    cudaFree(d_gi2); cudaFree(d_gj2); cudaFree(d_gi3); cudaFree(d_gj3); // the original freed only d_gi2 and d_gj3
    cudaFree(d_fi1); cudaFree(d_fj1); cudaFree(d_fi2); cudaFree(d_fj2);
    cudaFree(d_fi3); cudaFree(d_fj3);

    system("pause");   // Windows-only; remove on other platforms
    return 0;
}
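// Build and run (added note; assumes the listing is saved as fd2d_pml.cu and
// the CUDA toolkit's nvcc compiler is on the path):
//     nvcc -o fd2d_pml fd2d_pml.cu
//     ./fd2d_pml
// The program prompts for the number of PML cells, then repeatedly for
// nsteps (enter 0 to stop stepping); the final Ez field is written to
// Ez.txt as a 480 x 480 grid of values.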