0
I am trying to solve this problem where I am trying to combine all separate functions into one big function call. There are now 3 function calls that need to be one. At this point, the code works good and I am getting the correct result but with merging With the second function called “GPUmatmul_nosqr_portion2_real” into the function “GPUmatmul_element”. the code gives the wrong answer. From my debugging, after the function call “GPUmatmul_element” the next function has the issue of accessing the memory as if I debug, I cannot access the values of the first function in the immediate widow as it shows . However, when I use the cudaDeviceSynchronize(); before the second function call, I can access the values. But, interestingly, even though in the debugging mode, I cannot access the value, the whole existing code gives the correct answer without issue. The issue only starts when I tried merging the second function into the first function. This tells me that there is probably an issue with synchronization. I have tried using __syncthread(); but that didn’t have any impact.
I would like to request some expert help and advice on how this can be avoided and how I can successfully merge all functions together. I am using Windows 10 and Visual Studio 2017.
I would really appreciate it if someone can please take a look and help me.
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include<cmath>
#include <sstream> //istringstream
#include <fstream> // ifstream
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include <time.h>
#include <float.h>
#include <iomanip>
using namespace std;
#define N 9
#define F 12
#define Sub 3
#define G 1
#define LDBL_EPSILON 20
__global__
void GPUmatmul_element(float *S_real, float *V_real, float *S_imag, float *V_imag, float *V_real_sqr, float *V_imag_sqr, float *IL_real, float *IL_imag, float *ILn_real, float *ILn_imag,
float *portion1_real_1, float *x, float *y, float *portion1_real_2, float *portion1_real, float *portion1_imag_1, float *portion1_imag_2, float *portion1_imag,
float *portion2_real_1_part1, float *Ysub_imag_GPU, float *Ysub_real_GPU, float *portion2_real_1_part2, float *portion2_real_1,
float *portion2_imag_1_part1, float *portion2_imag_1_part2, float *portion2_imag_1)
{
//calculates unique thread ID in the block
int t = (threadIdx.x);
int b = (blockIdx.x);
int T = blockDim.x;
int B = gridDim.x;
for (int i = b; i < F; i += B)
{
for (int j = t; j < G; j += T)
{
IL_imag[i*G + j] = ((S_imag[i*G + j] * V_real[i*G + j]) - (S_real[i*G + j] * V_imag[i*G + j])) / (V_real_sqr[i*G + j] + V_imag_sqr[i*G + j]);
IL_real[i*G + j] = -1 * ((S_real[i*G + j] * V_real[i*G + j]) + (S_imag[i*G + j] * V_imag[i*G + j])) / (V_real_sqr[i*G + j] + V_imag_sqr[i*G + j]);
}
}
for (int i = 0; i < N; i++)
{
for (int j = 0; j < G; j++)
{
ILn_imag[i*G + j] = IL_imag[i*G + j + 3];
}
}
for (int i = 0; i < N; i++)
{
for (int j = 0; j < G; j++)
{
ILn_real[i*G + j] = IL_real[i*G + j + 3];
}
}
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < N; k++)
{
portion1_real_1[i*G + j] += (x[j*N + k] * ILn_imag[k]); // x is the Ybus_imag
}
}
}
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < N; k++)
{
portion1_real_2[i*G + j] += (y[j*N + k] * ILn_real[k]); // y is the Ybus_real
}
}
}
for (int k = 0; k < N; k++)
{
portion1_real[k] = (portion1_real_2[k] - portion1_real_1[k]);
}
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < N; k++)
{
portion1_imag_1[i*G + j] += (y[j*N + k] * ILn_imag[k]);
}
}
}
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < N; k++)
{
portion1_imag_2[i*G + j] += (x[j*N + k] * ILn_real[k]);
}
}
}
for (int k = 0; k < N; k++)
{
portion1_imag[k] = (portion1_imag_1[k] + portion1_imag_2[k]);
}
for (int i = b; i < N; i += B)
{
for (int j = t; j < Sub; j += T)
{
for (int k = 0; k < N; k++)
{
portion2_real_1_part1[i*Sub + j] += (y[i*N + k] * Ysub_real_GPU[k*Sub + j]);
}
}
}
for (int i = b; i < N; i += B)
{
for (int j = t; j < Sub; j += T)
{
for (int k = 0; k < N; k++)
{
portion2_real_1_part2[i*Sub + j] += (x[i*N + k] * Ysub_imag_GPU[k*Sub + j]);
}
}
}
for (int k = 0; k < N*Sub; k++)
{
portion2_real_1[k] = (portion2_real_1_part1[k] - portion2_real_1_part2[k]);
}
for (int i = b; i < N; i += B)
{
for (int j = t; j < Sub; j += T)
{
for (int k = 0; k < N; k++)
{
portion2_imag_1_part1[i*Sub + j] += (x[i*N + k] * Ysub_real_GPU[k*Sub + j]);
}
}
}
for (int i = b; i < N; i += B)
{
for (int j = t; j < Sub; j += T)
{
for (int k = 0; k < N; k++)
{
portion2_imag_1_part2[i*Sub + j] += (y[i*N + k] * Ysub_imag_GPU[k*Sub + j]);
}
}
}
for (int k = 0; k < N*Sub; k++)
{
portion2_imag_1[k] = (portion2_imag_1_part1[k] + portion2_imag_1_part2[k]);// / 6970700000000000;
}
}
__global__
void GPUmatmul_nosqr_portion2_real(float *portion2_real_1, float *Vsub_real_GPU, float *portion2_real_part1, float *portion2_imag_1, float *Vsub_imag_GPU, float *portion2_real_part2, float *portion2_real)
{
//calculates unique thread ID in the block
int t = (threadIdx.x);
int b = (blockIdx.x);
int T = blockDim.x;
int B = gridDim.x;
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < Sub; k++)
{
portion2_real_part1[i*G + j] += (portion2_real_1[j*Sub + k] * Vsub_real_GPU[k]);
}
}
}
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < Sub; k++)
{
portion2_real_part2[i*G + j] += (portion2_imag_1[j*Sub + k] * Vsub_imag_GPU[k]);
}
}
}
for (int k = 0; k < N; k++)
{
portion2_real[k] = (portion2_real_part1[k] - portion2_real_part2[k]);
}
}
__global__
void GPUmatmul_nosqr_portion2_imag(float *portion2_real_1, float *Vsub_real_GPU, float *portion2_imag_part1, float *portion2_imag_1, float *Vsub_imag_GPU, float *portion2_imag_part2, float *portion2_imag)
{
//calculates unique thread ID in the block
int t = (threadIdx.x);
int b = (blockIdx.x);
int T = blockDim.x;
int B = gridDim.x;
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < Sub; k++)
{
portion2_imag_part1[i*G + j] += (portion2_imag_1[j*Sub + k] * Vsub_real_GPU[k]);
}
}
}
for (int i = b; i < G; i += B)
{
for (int j = t; j < N; j += T)
{
for (int k = 0; k < Sub; k++)
{
portion2_imag_part2[i*G + j] += (portion2_real_1[j*Sub + k] * Vsub_imag_GPU[k]);
}
}
}
for (int k = 0; k < N; k++)
{
portion2_imag[k] = (portion2_imag_part1[k] + portion2_imag_part2[k]);
}
}
vector<vector<float>> parse2DCsvFile(string inputFileName) {
vector<vector<float> > data;
ifstream inputFile(inputFileName);
int l = 0;
while (inputFile) {
l++;
string s;
if (!getline(inputFile, s)) break;
if (s[0] != '#') {
istringstream ss(s);
vector<float> record;
while (ss) {
string line;
if (!getline(ss, line, ','))
break;
try {
record.push_back(stof(line));
}
catch (const std::invalid_argument e) {
cout << "NaN found in file " << inputFileName << " line " << l
<< endl;
e.what();
}
}
data.push_back(record);
}
}
if (!inputFile.eof()) {
cerr << "Could not read file " << inputFileName << "\n";
//__throw_invalid_argument("File not found.");
}
return data;
}
int main()
{
vector<vector<float>> Ybus_imag = parse2DCsvFile("inv_Ybus_imag.csv");
vector<vector<float>> Ybus_real = parse2DCsvFile("inv_Ybus_real.csv");
vector<vector<float>> Ysub_real = parse2DCsvFile("Ysub_real.csv");
vector<vector<float>> Ysub_imag = parse2DCsvFile("Ysub_imag.csv");
vector<vector<float>> S_real_cpu = parse2DCsvFile("S_real.csv");
vector<vector<float>> V_real_cpu = parse2DCsvFile("V_real.csv");
vector<vector<float>> S_imag_cpu = parse2DCsvFile("S_imag.csv");
vector<vector<float>> V_imag_cpu = parse2DCsvFile("V_imag.csv");
vector<vector<float>> Vsub_real = parse2DCsvFile("Vsub_real.csv");
vector<vector<float>> Vsub_imag = parse2DCsvFile("Vsub_imag.csv");
float V_real_sqr_cpu[F][G];
memset(V_real_sqr_cpu, 0, sizeof(float)*F*G);
float V_imag_sqr_cpu[F][G];
memset(V_imag_sqr_cpu, 0, sizeof(float)*F*G);
for (int x = 0; x < F; x++) // loop 3 times for three lines
{
for (int y = 0; y < G; y++) // loop for the three elements on the line
{
V_real_sqr_cpu[x][y] = pow(V_real_cpu[x][y], 2);
}
}
for (int x = 0; x < F; x++) // loop 3 times for three lines
{
for (int y = 0; y < G; y++) // loop for the three elements on the line
{
V_imag_sqr_cpu[x][y] = pow(V_imag_cpu[x][y], 2);
}
}
float *IL_real, *IL_imag, *ILn_real, *ILn_imag, *S_real, *S_imag, *V_real, *V_imag, *V_real_sqr, *V_imag_sqr, float *IL_real_dummy;// *V_real_sqr, *V_imag_sqr;
float *Ysub_real_GPU, *Ysub_imag_GPU, *Vsub_real_GPU, *Vsub_imag_GPU;
cudaMallocManaged(&IL_real, F*G * sizeof(float)); cudaMallocManaged(&IL_imag, F*G * sizeof(float)); cudaMallocManaged(&ILn_real, F*G * sizeof(float)); cudaMallocManaged(&ILn_imag, N*G * sizeof(float)); cudaMallocManaged(&S_real, F*G * sizeof(float));
cudaMallocManaged(&S_imag, F*G * sizeof(float)); cudaMallocManaged(&V_real, F*G * sizeof(float)); cudaMallocManaged(&V_imag, F*G * sizeof(float)); cudaMallocManaged(&V_real_sqr, F*G * sizeof(float)); cudaMallocManaged(&V_imag_sqr, F*G * sizeof(float));
cudaMallocManaged(&Ysub_real_GPU, N*Sub * sizeof(float)); cudaMallocManaged(&Ysub_imag_GPU, N*Sub * sizeof(float)); cudaMallocManaged(&Vsub_real_GPU, G*Sub * sizeof(float)); cudaMallocManaged(&Vsub_imag_GPU, G*Sub * sizeof(float));
// initialize x,y and ans arrays on the host
for (int i = 0; i < F; i++)
{
for (int j = 0; j < G; j++)
{
S_real[i*G + j] = S_real_cpu[i][j]; V_real[i*G + j] = V_real_cpu[i][j]; S_imag[i*G + j] = S_imag_cpu[i][j]; V_imag[i*G + j] = V_imag_cpu[i][j]; V_real_sqr[i*G + j] = V_real_sqr_cpu[i][j]; V_imag_sqr[i*G + j] = V_imag_sqr_cpu[i][j];
IL_real[i*G + j] = 0; IL_imag[i*G + j] = 0; }
}
float *x, *y, *answer, *portion1_real_1, *portion1_real_2, *portion1_imag_1, *portion1_real, *portion1_imag_2, *portion1_imag;
float *portion2_real_1_part1, *portion2_real_1_part2, *portion2_real_1, *portion2_imag_1, *portion2_imag_2, *portion2_real_part1;
float *portion2_imag_1_part1, *portion2_imag_1_part2, *portion2_real_part2, *portion2_real, *portion2_imag_part1, *portion2_imag_part2, *portion2_imag;
//float *Vn_real;
cudaMallocManaged(&answer, N*G * sizeof(float)); cudaMallocManaged(&x, N*N * sizeof(float)); cudaMallocManaged(&y, N*N * sizeof(float));
cudaMallocManaged(&portion1_real_1, N*G * sizeof(float)); cudaMallocManaged(&portion1_real_2, N*G * sizeof(float)); cudaMallocManaged(&portion1_imag_1, N*G * sizeof(float)); cudaMallocManaged(&portion1_real, N*G * sizeof(float));
cudaMallocManaged(&portion1_imag_2, N*G * sizeof(float)); cudaMallocManaged(&portion1_imag, N*G * sizeof(float)); cudaMallocManaged(&portion2_real_1_part1, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_real_1_part2, N*Sub * sizeof(float));
cudaMallocManaged(&portion2_imag_1_part1, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_imag_1_part2, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_imag_1, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_imag_2, N*Sub * sizeof(float));
cudaMallocManaged(&portion2_real_1, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_real_part1, N*G * sizeof(float)); cudaMallocManaged(&portion2_real_part2, N*G * sizeof(float)); cudaMallocManaged(&portion2_real, N*G * sizeof(float));
cudaMallocManaged(&portion2_imag_part1, N*G * sizeof(float)); cudaMallocManaged(&portion2_imag_part2, N*G * sizeof(float)); cudaMallocManaged(&portion2_imag, N*G * sizeof(float));
for (int i = 0; i < N; i++)
{
for (int j = 0; j < Sub; j++)
{
Ysub_real_GPU[i*Sub + j] = Ysub_real[i][j]; Ysub_imag_GPU[i*Sub + j] = Ysub_imag[i][j];
}
}
for (int i = 0; i < Sub; i++)
{
for (int j = 0; j < G; j++)
{
Vsub_real_GPU[i*G + j] = Vsub_real[i][j]; Vsub_imag_GPU[i*G + j] = Vsub_imag[i][j];
}
}
float *ILn_real2;
cudaMallocManaged(&ILn_real2, N*G * sizeof(float));
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
x[i*N + j] = Ybus_imag[i][j];
}
}
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
y[i*N + j] = Ybus_real[i][j];
}
}
for (int kk = 0; kk < 5; kk++)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < G; j++)
{
ILn_real[i*G + j] = 0; ILn_real2[i*G + j] = 0; ILn_imag[i*G + j] = 0; portion1_real_1[i*G + j] = 0; portion1_real_2[i*G + j] = 0; portion1_imag_1[i*G + j] = 0; portion1_real[i*G + j] = 0;
portion1_imag_2[i*G + j] = 0; portion1_imag[i*G + j] = 0; portion2_real_part1[i*G + j] = 0; portion2_real_part2[i*G + j] = 0; portion2_real[i*G + j] = 0;
portion2_imag_part1[i*G + j] = 0; portion2_imag_part2[i*G + j] = 0; portion2_imag[i*G + j] = 0;
}
}
for (int i = 0; i < N; i++)
{
for (int j = 0; j < Sub; j++)
{
portion2_real_1_part1[i*Sub + j] = 0; portion2_real_1_part2[i*Sub + j] = 0; portion2_imag_1[i*Sub + j] = 0; portion2_imag_2[i*Sub + j] = 0; portion2_real_1[i*Sub + j] = 0;
}
}
GPUmatmul_element << <dim3(1), dim3(512) >> > (S_real, V_real, S_imag, V_imag, V_real_sqr, V_imag_sqr, IL_real, IL_imag, ILn_real, ILn_imag, portion1_real_1, x, y, portion1_real_2, portion1_real, portion1_imag_1, portion1_imag_2, portion1_imag,
portion2_real_1_part1, Ysub_imag_GPU, Ysub_real_GPU, portion2_real_1_part2, portion2_real_1, portion2_imag_1_part1, portion2_imag_1_part2, portion2_imag_1);
GPUmatmul_nosqr_portion2_real << <dim3(1), dim3(512) >> > (portion2_real_1, Vsub_real_GPU, portion2_real_part1, portion2_imag_1, Vsub_imag_GPU, portion2_real_part2, portion2_real);
GPUmatmul_nosqr_portion2_imag << <dim3(1), dim3(512) >> > (portion2_real_1, Vsub_real_GPU, portion2_imag_part1, portion2_imag_1, Vsub_imag_GPU, portion2_imag_part2, portion2_imag);
cudaDeviceSynchronize();
float Vn_imag[N];
float Vn_real[N];
for (int kk = 0; kk < N; kk++)
{
Vn_imag[kk] = portion1_imag[kk] - portion2_imag[kk];
}
for (int kk = 0; kk < N; kk++)
{
Vn_real[kk] = portion1_real[kk] - portion2_real[kk];
}
for (int kk = 0; kk < Sub; kk++)
{
V_real[kk] = Vsub_real_GPU[kk];
}
for (int kk = 0; kk < N; kk++)
{
V_real[kk + 3] = Vn_real[kk];
}
for (int kk = 0; kk < Sub; kk++)
{
V_imag[kk] = Vsub_imag_GPU[kk];
}
for (int kk = 0; kk < N; kk++)
{
V_imag[kk + 3] = Vn_imag[kk];
}
}
cout << "The Value of V_real is" << "\n";
for (int k = 0; k < F; k++)
{
cout << V_real[k] << "\n";
}
cudaFree(S_real); cudaFree(S_imag); cudaFree(V_real); cudaFree(V_imag); cudaFree(V_real_sqr); cudaFree(V_imag_sqr); cudaFree(IL_real); cudaFree(IL_imag); cudaFree(ILn_real); cudaFree(ILn_imag); cudaFree(x); cudaFree(portion1_real_1);
cudaFree(ILn_real2); cudaFree(portion2_real_part1);
system("Pause");
return 0;
}