I had written a CPU code and GPU code to simulation the process of 2D Transmission Line Matrix(TLM) Computing.
My CPU code used 2D array to complete the process and GPU code converted 2D array to 1D array in order to compute faster. I think my GPU code is pretty much running the same thing as CPU code. However, it resulted differently and happened accumulation at the monitored node (15,15).
If you try to run both code, you will find both of the codes output differently as well. I used cuda-memcheck
as well. It did not return any errors.
Can anyone explain what made it different in two programs and happened different output?
Here is my CPU code for the 2TLM process:
#include<stdlib.h>
#include<stdio.h>
#include<iostream>
#include<fstream>
#include <ctime>
#define M_PI 3.14276
#define c 299792458
#define mu0 M_PI*4e-7
#define eta0 c*mu0
double** declare_array2D(int, int);
using namespace std;
int main() {
std::clock_t start = std::clock();
int NX = 100;
int NY = 100;
int NT =5000;
double dl = 1;
double dt = dl / (sqrt(2.) * c);
//2D mesh variables
double I = 0, tempV = 0, E0 = 0, V = 0;
double** V1 = declare_array2D(NX, NY);
double** V2 = declare_array2D(NX, NY);
double** V3 = declare_array2D(NX, NY);
double** V4 = declare_array2D(NX, NY);
double Z = eta0 / sqrt(2.);
//boundary coefficients
double rXmin = -1;
double rXmax = -1;
double rYmin = -1;
double rYmax = -1;
//input / output
double width = 20 * dt * sqrt(2.);
double delay = 100 * dt * sqrt(2.);
int Ein[] = { 10,10 };
int Eout[] = { 15,15 };
ofstream output("output.out");
for (int n = 0; n < NT; n++) {
//source
E0 = (1 / sqrt(2.)) * exp(-(n * dt - delay) * (n * dt - delay) / (width * width));
V1[Ein[0]][Ein[1]] = V1[Ein[0]][Ein[1]] + E0;
V2[Ein[0]][Ein[1]] = V2[Ein[0]][Ein[1]] - E0;
V3[Ein[0]][Ein[1]] = V3[Ein[0]][Ein[1]] - E0;
V4[Ein[0]][Ein[1]] = V4[Ein[0]][Ein[1]] + E0;
//scatter
for (int x = 0; x < NX; x++) {
for (int y = 0; y < NY; y++) {
I = ( V1[x][y] + V4[x][y] - V2[x][y] - V3[x][y]) / (2* Z);
//port1
V1[x][y] = V1[x][y] - I * Z;;
//port2
V2[x][y] = V2[x][y] + I * Z;
//port3
V3[x][y] = V3[x][y] + I * Z;
//port4
V4[x][y] = V4[x][y] - I * Z;
}
}
//connect
for (int x = 1; x < NX; x++) {
for (int y = 0; y < NY; y++) {
tempV = V2[x][y];
V2[x][y] = V4[x - 1][y];
V4[x - 1][y] = tempV;
}
}
for (int x = 0; x < NX; x++) {
for (int y = 1; y < NY; y++) {
tempV = V1[x][y];
V1[x][y] = V3[x][y - 1];
V3[x][y - 1] = tempV;
}
}
//boundary
for (int x = 0; x < NX; x++) {
V3[x][NY - 1] = rYmax * V3[x][NY - 1];
V1[x][0] = rYmin * V1[x][0];
}
for (int y = 0; y < NY; y++) {
V4[NX - 1][y] = rXmax * V4[NX - 1][y];
V2[0][y] = rXmin * V2[0][y];
}
output << n * dt << " " << V2[Eout[0]][Eout[1]] + V4[Eout[0]][Eout[1]] << endl;
if (n % 100 == 0)
cout << n * dt << " " << V2[Eout[0]][Eout[1]] + V4[Eout[0]][Eout[1]] << endl;
}
output.close();
cout << "Done";
std::cout << ((std::clock() - start) / (double)CLOCKS_PER_SEC) << '\n';
cin.get();
}
double** declare_array2D(int NX, int NY) {
double** V = new double* [NX];
for (int x = 0; x < NX; x++) {
V[x] = new double[NY];
}
for (int x = 0; x < NX; x++) {
for (int y = 0; y < NY; y++) {
V[x][y] = 0;
}
}
return V;
}
Here is my GPU CUDA code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <fstream>
#include <iomanip> // for setprecision
using namespace std;
#define threads 1024 //define the number of thread to use
#define blocks 10 //define the number of blocks to use
#define c 299792458 // speed of light in a vacuum
#define PI 3.14276 // PI
#define mu0 PI*4e-7 // magnetic permeability in a vacuum H/m
#define eta0 c*mu0 // wave impedance in free space
ofstream gaussian_time("gaussian_excitation.out"); // log excitation function to file
ofstream input("input.out"); // log excitation function to file
ofstream outputY("outputY.out"); // log excitation function to file
ofstream outputX("outputX.out"); // log excitation function to file
ofstream line_voltage("line_voltage_1.out"); // log probe voltage at a pint on the line versus time
//Gaussian Source
double tlmSource(double time, double delay, double width)
{
// calculate value of gaussian ecitation voltage at time point
double source = (1 / sqrt(2.)) * exp(-1.0 * (time - delay) * (time - delay) / (width * width));
// log value of gaussian voltage to file
gaussian_time << time << " " << source << endl; //write source funtion to file, comment out for timing
return source;
}
//Kernal Function
__global__ void Initial(double* V1, double* V2, double* V3, double* V4, double* Vp, int NX, int NY, int NT)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
{
if ((i + idx) < NX * NY)
{
V1[idx+i] = 0;
V2[idx+i] = 0;
V3[idx+i] = 0;
V4[idx+i] = 0;
}
}
for (int i = 0; i < NT; i += blockDim.x * gridDim.x)
{
if ((i + idx) < NT)
{
Vp[idx+i] = 0;
}
}
}
//Scatter Function
__global__ void scatter(double* V1, double* V2, double* V3, double* V4, int NX, int NY, int Ein, double source , double Z)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
//Excitation on the input node
if (idx == 0)
{
V1[Ein] = V1[Ein] - source;
V2[Ein] = V2[Ein] - source;
V3[Ein] = V3[Ein] + source;
V4[Ein] = V4[Ein] + source;
}
__syncthreads();
for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
{
if ((i + idx) < (NX * NY))
{
double I = (-V1[i+idx] + V4[i+idx] - V2[i+idx] + V3[i+idx]) / (2 * Z);
//port1
V1[i+idx] = V1[i + idx] + I * Z;
//port2
V2[i + idx] = V2[i + idx] + I * Z;
//port3
V3[i + idx] = V3[i + idx] - I * Z;
//port4
V4[i + idx] = V4[i + idx] - I * Z;
}
}
}
__global__ void connect(double* V1, double* V2, double* V3, double* V4, int NX, int NY)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
//connect part for V1 and V3
for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
{
if ((idx+i) < (NX * NY - NX ))
{
double tmp = V1[i+idx];
V1[i + idx] = V3[i + idx + NX];
V3[i + idx + NX] = tmp;
}
}
__syncthreads();
//connect part for V2 and V4
for (int i = 0; i < NX * NY; i += blockDim.x * gridDim.x)
{
if ((idx+i) < NX*NY)
{
double LimV2 = V2[i + idx];
double LimV4 = V4[i + idx];
double tmpV2 = V2[i + idx + 1];
V2[i + idx +1] = V4[i + idx];
V4[i + idx] = tmpV2;
__syncthreads();
for (int j =0; j < NY; j += blockDim.x * gridDim.x)
{
if ((j+idx) < NY)
{
V2[(j + idx) * NX] = LimV2;
V4[(NX - 1)+ (j + idx)*NX] = LimV4;
}
}
}
}
}
__global__ void boundary(double* V1, double* V2, double* V3, double* V4, double* Vp, int NX, int NY, double rXmin, double rXmax, double rYmin, double rYmax, int n, int Eout)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = 0; i < NY; i += blockDim.x * gridDim.x)
{
if ((i+idx) < NY)
{
V2[(i + idx) * NY] = rXmin * V2[(i + idx) * NY];
V4[(NY-1)+ (i + idx) *NY] = rXmax * V4[(NY-1)+ (i + idx)*NY];
}
}
__syncthreads();
for (int i = 0; i < NX; i += blockDim.x * gridDim.x)
{
if ((i+idx) < NX)
{
V3[(i + idx)] = rYmax * V3[(i + idx)];
V1[NX * NY - 1 - (i + idx)] = rYmin * V1[NX * NY - 1 - (i + idx)];
}
}
if (idx == Eout)
{
Vp[n] = V2[Eout]+V4[Eout];
}
}
int main(void)
{
clock_t start, end;
//number of nodes (max.5000)
int NX = 100;
int NY = 100;
int NT = 5000;
double dl = 1; // set node line segment length in metres
double dt = dl / (sqrt(2.) * c); // set time step duration
//boundary coefficients
double rXmin = -1; //reflect coefficient
double rXmax = -1;
double rYmin = -1;
double rYmax = -1;
double Z = eta0 / sqrt(2.); //permerability impedance
// set paremeters for gaussian excitation
double width = 20 * dt * sqrt(2.); // gaussian width
double delay = 100 * dt * sqrt(2.); // set time delay before starting excitation
//initialise input node location
int xin = 10;
int yin = 10;
int Ein = xin + NX * yin;
//initialise monitor node location
int xout = 15;
int yout = 15;
int Eout = xout + NX * yout;
size_t bytes = 6* NX * NY * sizeof(double); // define the memory size which needs to use in this application
size_t bytes_probe = NT * sizeof(double);
//declare port in host
double* hos_V1 ;
double* hos_V2 ;
double* hos_V3 ;
double* hos_V4 ;
double* hos_Vp;
double* dev_V1 ;
double* dev_V2 ;
double* dev_V3 ;
double* dev_V4 ;
double* dev_Vp;
hos_V1 = (double*)malloc(bytes);
hos_V2 = (double*)malloc(bytes);
hos_V3 = (double*)malloc(bytes);
hos_V4 = (double*)malloc(bytes);
hos_Vp = (double*)malloc(bytes_probe);
cudaMalloc(&dev_V1, bytes);
cudaMalloc(&dev_V2, bytes);
cudaMalloc(&dev_V3, bytes);
cudaMalloc(&dev_V4, bytes);
cudaMalloc(&dev_Vp, bytes_probe);
cudaThreadSynchronize();
start = clock();
Initial <<< blocks , threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, dev_Vp, NX, NY, NT);
for (int n = 0; n < NT; n++)
{
double source = tlmSource(n * dt, delay, width);
scatter <<< blocks, threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, NX, NY, Ein, source, Z);
connect <<< blocks, threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, NX, NY);
boundary <<< blocks, threads >>> (dev_V1, dev_V2, dev_V3, dev_V4, dev_Vp, NX, NY, rXmin, rXmax, rYmin, rYmax, n, Eout);
}
cudaMemcpy(hos_Vp, dev_Vp, bytes_probe, cudaMemcpyDeviceToHost);
end = clock();
for (int n = 0; n < NT; n++) {
line_voltage << dt * n << " " << hos_Vp[n] << endl;
cout << dt * n << " " << hos_Vp[n] << endl;
}
double TLM_Execution_Time = double(end - start) / double(CLOCKS_PER_SEC);
cout << "Time taken by TLM algorithm : " << fixed << TLM_Execution_Time << setprecision(5);
cout << " sec " << endl;
input.close();
outputX.close();
outputY.close();
// free the memory allocated on the GPU
cudaFree(dev_V1);
cudaFree(dev_V2);
cudaFree(dev_V3);
cudaFree(dev_V4);
cudaFree(dev_Vp);
return 0;
}