2D Transmission Line Matrix (TLM) in CUDA

I had written a CPU code and GPU code to simulation the process of 2D Transmission Line Matrix(TLM) Computing.

My CPU code used 2D array to complete the process and GPU code converted 2D array to 1D array in order to compute faster. I think my GPU code is pretty much running the same thing as CPU code. However, it resulted differently and happened accumulation at the monitored node (15,15).

If you try to run both code, you will find both of the codes output differently as well. I used cuda-memcheck as well. It did not return any errors.

Can anyone explain what made it different in two programs and happened different output?

Here is my CPU code for the 2TLM process:

#include<stdlib.h>
#include<stdio.h>
#include<iostream>
#include<fstream>
#include <ctime>
#define M_PI 3.14276
#define c 299792458
#define mu0 M_PI*4e-7
#define eta0 c*mu0

double** declare_array2D(int, int);

using namespace std;

int main() {
    std::clock_t start = std::clock();
    int NX = 100;
    int NY = 100;
    int NT =5000;
    double dl = 1;
    double dt = dl / (sqrt(2.) * c);

    //2D mesh variables
    double I = 0, tempV = 0, E0 = 0, V = 0;
    double** V1 = declare_array2D(NX, NY);
    double** V2 = declare_array2D(NX, NY);
    double** V3 = declare_array2D(NX, NY);
    double** V4 = declare_array2D(NX, NY);

    double Z = eta0 / sqrt(2.);

    //boundary coefficients
    double rXmin = -1;
    double rXmax = -1;
    double rYmin = -1;
    double rYmax = -1;

    //input / output
    double width = 20 * dt * sqrt(2.);
    double delay = 100 * dt * sqrt(2.);
    int Ein[] = { 10,10 };
    int Eout[] = { 15,15 };

    ofstream output("output.out");

    for (int n = 0; n < NT; n++) {

        //source
        E0 = (1 / sqrt(2.)) * exp(-(n * dt - delay) * (n * dt - delay) / (width * width));
        V1[Ein[0]][Ein[1]] = V1[Ein[0]][Ein[1]] + E0;
        V2[Ein[0]][Ein[1]] = V2[Ein[0]][Ein[1]] - E0;
        V3[Ein[0]][Ein[1]] = V3[Ein[0]][Ein[1]] - E0;
        V4[Ein[0]][Ein[1]] = V4[Ein[0]][Ein[1]] + E0;

        //scatter
        for (int x = 0; x < NX; x++) {
            for (int y = 0; y < NY; y++) {
                I = ( V1[x][y] + V4[x][y] -  V2[x][y] - V3[x][y]) / (2* Z);

                //port1
                V1[x][y] = V1[x][y] - I * Z;;
                //port2
                V2[x][y] = V2[x][y] + I * Z;
                 //port3
                V3[x][y] = V3[x][y] + I * Z;
                //port4    
                V4[x][y] = V4[x][y] - I * Z;
            }
        }

        //connect
        for (int x = 1; x < NX; x++) {
            for (int y = 0; y < NY; y++) {
                tempV = V2[x][y];
                V2[x][y] = V4[x - 1][y];
                V4[x - 1][y] = tempV;
            }
        }
        for (int x = 0; x < NX; x++) {
            for (int y = 1; y < NY; y++) {
                tempV = V1[x][y];
                V1[x][y] = V3[x][y - 1];
                V3[x][y - 1] = tempV;
            }
        }

        //boundary
        for (int x = 0; x < NX; x++) {
            V3[x][NY - 1] = rYmax * V3[x][NY - 1];
            V1[x][0] = rYmin * V1[x][0];
        }
        for (int y = 0; y < NY; y++) {
            V4[NX - 1][y] = rXmax * V4[NX - 1][y];
            V2[0][y] = rXmin * V2[0][y];
        }


        output << n * dt << "  " << V2[Eout[0]][Eout[1]] + V4[Eout[0]][Eout[1]] << endl;
        if (n % 100 == 0)
            cout << n * dt << "  " << V2[Eout[0]][Eout[1]] + V4[Eout[0]][Eout[1]] << endl;

    }
    output.close();
    cout << "Done";
    std::cout << ((std::clock() - start) / (double)CLOCKS_PER_SEC) << '\n';
    cin.get();


}


double** declare_array2D(int NX, int NY) {
    double** V = new double* [NX];
    for (int x = 0; x < NX; x++) {
        V[x] = new double[NY];
    }

    for (int x = 0; x < NX; x++) {
        for (int y = 0; y < NY; y++) {
            V[x][y] = 0;
        }
    }
    return V;
}

Here is my GPU CUDA code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <math.h>

#include <iostream>
#include <fstream>
#include <iomanip>  // for setprecision

using namespace std;

#define threads 1024  //define the number of thread to use
#define blocks 10 //define the number of blocks to use

#define c 299792458        // speed of light in a vacuum
#define PI 3.14276  // PI
#define mu0 PI*4e-7         // magnetic permeability in a vacuum H/m
#define eta0 c*mu0          // wave impedance in free space 

ofstream gaussian_time("gaussian_excitation.out");  // log excitation function to file
ofstream input("input.out");  // log excitation function to file
ofstream outputY("outputY.out");  // log excitation function to file
ofstream outputX("outputX.out");  // log excitation function to file
ofstream line_voltage("line_voltage_1.out");        // log probe voltage at a pint on the line versus time

//Gaussian Source

double tlmSource(double time, double delay, double width)
{
    // calculate value of gaussian ecitation voltage at time point
    double source = (1 / sqrt(2.)) * exp(-1.0 * (time - delay) * (time - delay) / (width * width));

    // log value of gaussian voltage to file
    gaussian_time << time << "  " << source << endl; //write source funtion to file, comment out for timing

    return source;
}


//Kernal Function
__global__ void Initial(double* V1, double* V2, double* V3, double* V4, double* Vp, int NX, int NY, int NT)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
    {
        
        if ((i + idx) < NX * NY)
            {
                V1[idx+i] = 0;
                V2[idx+i] = 0;
                V3[idx+i] = 0;
                V4[idx+i] = 0;
            }
        
    }

    for (int i = 0; i < NT; i += blockDim.x * gridDim.x)
    {
        if ((i + idx) < NT)
        {
            Vp[idx+i] = 0;
        }
    }
}

//Scatter Function
__global__ void scatter(double* V1, double* V2, double* V3, double* V4, int NX, int NY, int Ein, double source , double Z)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    

    //Excitation on the input node
        if (idx == 0)
        {
            V1[Ein] = V1[Ein] - source;
            V2[Ein] = V2[Ein] - source;
            V3[Ein] = V3[Ein] + source;
            V4[Ein] = V4[Ein] + source;
        }


    __syncthreads();

    for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
    {
        if ((i + idx) < (NX * NY))
        {
            double I = (-V1[i+idx] + V4[i+idx] - V2[i+idx]  + V3[i+idx]) / (2 * Z);

            //port1
            V1[i+idx] = V1[i + idx] + I * Z;
            //port2
            V2[i + idx] = V2[i + idx] + I * Z;
            //port3
            V3[i + idx] = V3[i + idx] - I * Z;
            //port4
            V4[i + idx] = V4[i + idx] - I * Z;

            
        }
    }
}



__global__ void connect(double* V1, double* V2, double* V3, double* V4, int NX, int NY)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    
    //connect part for V1 and V3
    for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
    {
        if ((idx+i) < (NX * NY - NX ))
        {
            double tmp = V1[i+idx];

            V1[i + idx] = V3[i + idx + NX];
            V3[i + idx + NX] = tmp;

        }
    }

    __syncthreads();
    

    //connect part for V2 and V4
    for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
    {
        if ((idx+i) < NX*NY)
        {
            
            double LimV2 = V2[i + idx];
            double LimV4 = V4[i + idx];
            

            double tmpV2 = V2[i + idx + 1];
            V2[i + idx +1] = V4[i + idx];
            V4[i + idx] = tmpV2;

            __syncthreads();

            for (int j =0; j < NY; j += blockDim.x * gridDim.x)
            {
                if ((j+idx) < NY)
                {
                    V2[(j + idx) * NX] = LimV2;
                    V4[(NX - 1)+ (j + idx)*NX] = LimV4;
                }
            }
        
        }
    

    }
}



__global__ void boundary(double* V1, double* V2, double* V3, double* V4, double* Vp, int NX, int NY, double rXmin, double rXmax, double rYmin, double rYmax, int n, int Eout)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = 0; i < NY; i += blockDim.x * gridDim.x)
    {
        if ((i+idx) < NY)
        {
            V2[(i + idx) * NY] = rXmin * V2[(i + idx) * NY];
            V4[(NY-1)+ (i + idx) *NY] = rXmax * V4[(NY-1)+ (i + idx)*NY];
        }
    }

    __syncthreads();

    for (int i = 0; i < NX; i += blockDim.x * gridDim.x)
    {
        if ((i+idx) < NX)
        {
            V3[(i + idx)] = rYmax * V3[(i + idx)];
            V1[NX * NY - 1 - (i + idx)] = rYmin * V1[NX * NY - 1 - (i + idx)];
        }
    }



    if (idx == Eout)
    {
        Vp[n] = V2[Eout]+V4[Eout];
    }
}





int main(void) 
{
    clock_t start, end;
    

    //number of nodes (max.5000)
    int NX = 100;       
    int NY = 100;
    int NT = 5000;

    double dl = 1;       // set node line segment length in metres
    double dt = dl / (sqrt(2.) * c);    // set time step duration

    //boundary coefficients
    double rXmin = -1;          //reflect coefficient
    double rXmax = -1;
    double rYmin = -1;
    double rYmax = -1;

    double Z = eta0 / sqrt(2.);     //permerability impedance

    // set paremeters for gaussian excitation
    double width = 20 * dt * sqrt(2.);     // gaussian width 
    double delay = 100 * dt * sqrt(2.);    // set time delay before starting excitation

    //initialise input node location
    int xin = 10;
    int yin = 10;

    int Ein = xin + NX * yin;

    //initialise monitor node location
    int xout = 15;
    int yout = 15;

    int Eout = xout + NX * yout;


    size_t bytes = 6* NX * NY * sizeof(double);  // define the memory size which needs to use in this application
    size_t bytes_probe = NT * sizeof(double);

    //declare port in host
    double* hos_V1 ;
    double* hos_V2 ;
    double* hos_V3 ;
    double* hos_V4 ;
    double* hos_Vp;

    double* dev_V1 ;
    double* dev_V2 ;
    double* dev_V3 ;
    double* dev_V4 ;
    double* dev_Vp;



    hos_V1 = (double*)malloc(bytes);
    hos_V2 = (double*)malloc(bytes);
    hos_V3 = (double*)malloc(bytes);
    hos_V4 = (double*)malloc(bytes);
    hos_Vp = (double*)malloc(bytes_probe);



    cudaMalloc(&dev_V1, bytes);
    cudaMalloc(&dev_V2, bytes);
    cudaMalloc(&dev_V3, bytes);
    cudaMalloc(&dev_V4, bytes);
    cudaMalloc(&dev_Vp, bytes_probe);


    cudaThreadSynchronize();


    start = clock();

    Initial <<< blocks , threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, dev_Vp, NX, NY, NT);

    for (int n = 0; n < NT; n++)
    {
        double source = tlmSource(n * dt, delay, width);

        scatter <<< blocks, threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, NX, NY, Ein, source, Z);
            
        connect <<< blocks, threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, NX, NY);

        boundary <<< blocks, threads >>>  (dev_V1, dev_V2, dev_V3, dev_V4, dev_Vp, NX, NY, rXmin, rXmax, rYmin, rYmax, n, Eout);
        
    }

    cudaMemcpy(hos_Vp, dev_Vp, bytes_probe, cudaMemcpyDeviceToHost);
    end = clock();


    for (int n = 0; n < NT; n++) {
        line_voltage << dt * n << "  " << hos_Vp[n] << endl;
        
        cout << dt * n << "  " << hos_Vp[n] << endl;
    }

    double TLM_Execution_Time = double(end - start) / double(CLOCKS_PER_SEC);

    cout << "Time taken by TLM algorithm : " << fixed << TLM_Execution_Time << setprecision(5);
    cout << " sec " << endl;

    input.close();
    outputX.close();
    outputY.close();



    // free the memory allocated on the GPU
    cudaFree(dev_V1);
    cudaFree(dev_V2);
    cudaFree(dev_V3);
    cudaFree(dev_V4);
    cudaFree(dev_Vp);

    return 0;
}