Issue with thread Synchronization and CUDA function

0

I am trying to solve this problem where I am trying to combine all separate functions into one big function call. There are now 3 function calls that need to be one. At this point, the code works good and I am getting the correct result but with merging With the second function called “GPUmatmul_nosqr_portion2_real” into the function “GPUmatmul_element”. the code gives the wrong answer. From my debugging, after the function call “GPUmatmul_element” the next function has the issue of accessing the memory as if I debug, I cannot access the values of the first function in the immediate widow as it shows . However, when I use the cudaDeviceSynchronize(); before the second function call, I can access the values. But, interestingly, even though in the debugging mode, I cannot access the value, the whole existing code gives the correct answer without issue. The issue only starts when I tried merging the second function into the first function. This tells me that there is probably an issue with synchronization. I have tried using __syncthread(); but that didn’t have any impact.

I would like to request some expert help and advice on how this can be avoided and how I can successfully merge all functions together. I am using Windows 10 and Visual Studio 2017.

I would really appreciate it if someone can please take a look and help me.

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include<cmath>
#include <sstream> //istringstream
#include <fstream> // ifstream
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include <time.h>
#include <float.h>
#include <iomanip>


using namespace std;


#define N 9
#define F 12
#define Sub 3
#define G 1
#define LDBL_EPSILON 20


__global__
void GPUmatmul_element(float *S_real, float *V_real, float *S_imag, float *V_imag, float *V_real_sqr, float *V_imag_sqr, float *IL_real, float *IL_imag, float *ILn_real, float *ILn_imag,
	float *portion1_real_1,  float *x,  float *y, float *portion1_real_2, float *portion1_real, float *portion1_imag_1, float *portion1_imag_2, float *portion1_imag, 
	float *portion2_real_1_part1, float *Ysub_imag_GPU, float *Ysub_real_GPU, float *portion2_real_1_part2, float *portion2_real_1,
	float *portion2_imag_1_part1, float *portion2_imag_1_part2, float *portion2_imag_1)
{
	//calculates unique thread ID in the block

	int t = (threadIdx.x);
	int b =  (blockIdx.x);
	int T = blockDim.x;
	int B = gridDim.x;



	for (int i = b; i < F; i += B)
	{
		for (int j = t; j < G; j += T)
		{
			IL_imag[i*G + j] = ((S_imag[i*G + j] * V_real[i*G + j]) - (S_real[i*G + j] * V_imag[i*G + j])) / (V_real_sqr[i*G + j] + V_imag_sqr[i*G + j]);
			IL_real[i*G + j] = -1 * ((S_real[i*G + j] * V_real[i*G + j]) + (S_imag[i*G + j] * V_imag[i*G + j])) / (V_real_sqr[i*G + j] + V_imag_sqr[i*G + j]);
		}
	}

	
	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < G; j++)
		{
			ILn_imag[i*G + j] = IL_imag[i*G + j + 3];
		}
	}

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < G; j++)
		{
			ILn_real[i*G + j] = IL_real[i*G + j + 3];
		}
	}
	   	  

	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion1_real_1[i*G + j] += (x[j*N + k] * ILn_imag[k]);   // x is the Ybus_imag
			}
		}
	}
	

	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion1_real_2[i*G + j] += (y[j*N + k] * ILn_real[k]);   // y is the Ybus_real
			}
		}
	}
	

	for (int k = 0; k < N; k++)
	{
		portion1_real[k] = (portion1_real_2[k] - portion1_real_1[k]);
	}



	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion1_imag_1[i*G + j] += (y[j*N + k] * ILn_imag[k]);
			}
		}
	}
	

	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion1_imag_2[i*G + j] += (x[j*N + k] * ILn_real[k]);
			}
		}
	}
	

	for (int k = 0; k < N; k++)
	{
		portion1_imag[k] = (portion1_imag_1[k] + portion1_imag_2[k]);
	}

	
	for (int i = b; i < N; i += B)
	{
		for (int j = t; j < Sub; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion2_real_1_part1[i*Sub + j] += (y[i*N + k] * Ysub_real_GPU[k*Sub + j]);
			}
		}
	}


	for (int i = b; i < N; i += B)
	{
		for (int j = t; j < Sub; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion2_real_1_part2[i*Sub + j] += (x[i*N + k] * Ysub_imag_GPU[k*Sub + j]);
			}
		}
	}


	for (int k = 0; k < N*Sub; k++)
	{
		portion2_real_1[k] = (portion2_real_1_part1[k] - portion2_real_1_part2[k]);
	}


	for (int i = b; i < N; i += B)
	{
		for (int j = t; j < Sub; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion2_imag_1_part1[i*Sub + j] += (x[i*N + k] * Ysub_real_GPU[k*Sub + j]);
			}
		}
	}



	for (int i = b; i < N; i += B)
	{
		for (int j = t; j < Sub; j += T)
		{
			for (int k = 0; k < N; k++)
			{
				portion2_imag_1_part2[i*Sub + j] += (y[i*N + k] * Ysub_imag_GPU[k*Sub + j]);
			}
		}
	}
	

	for (int k = 0; k < N*Sub; k++)
	{
		portion2_imag_1[k] = (portion2_imag_1_part1[k] + portion2_imag_1_part2[k]);// / 6970700000000000;
	}
		
}



__global__
void GPUmatmul_nosqr_portion2_real(float *portion2_real_1, float *Vsub_real_GPU, float *portion2_real_part1, float *portion2_imag_1, float *Vsub_imag_GPU, float *portion2_real_part2, float *portion2_real)
{
	//calculates unique thread ID in the block
	int t = (threadIdx.x);
	int b = (blockIdx.x);
	int T = blockDim.x;
	int B = gridDim.x;

    for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < Sub; k++)
			{
				portion2_real_part1[i*G + j] += (portion2_real_1[j*Sub + k] * Vsub_real_GPU[k]);
			}
		}
	}

	
	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < Sub; k++)
			{
				portion2_real_part2[i*G + j] += (portion2_imag_1[j*Sub + k] * Vsub_imag_GPU[k]);
			}
		}

	}
	
	for (int k = 0; k < N; k++)
	{
		portion2_real[k] = (portion2_real_part1[k] - portion2_real_part2[k]);
	}

}



__global__
void GPUmatmul_nosqr_portion2_imag(float *portion2_real_1, float *Vsub_real_GPU, float *portion2_imag_part1, float *portion2_imag_1, float *Vsub_imag_GPU, float *portion2_imag_part2, float *portion2_imag)
{
	//calculates unique thread ID in the block
	int t = (threadIdx.x);
	int b = (blockIdx.x);
	int T = blockDim.x;
	int B = gridDim.x;

	
	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < Sub; k++)
			{
				portion2_imag_part1[i*G + j] += (portion2_imag_1[j*Sub + k] * Vsub_real_GPU[k]);
			}
		}
	}
	

	for (int i = b; i < G; i += B)
	{
		for (int j = t; j < N; j += T)
		{
			for (int k = 0; k < Sub; k++)
			{
				portion2_imag_part2[i*G + j] += (portion2_real_1[j*Sub + k] * Vsub_imag_GPU[k]);
			}
		}

	}
	
	for (int k = 0; k < N; k++)
	{
		portion2_imag[k] = (portion2_imag_part1[k] + portion2_imag_part2[k]);
	}

}





vector<vector<float>> parse2DCsvFile(string inputFileName) {

	vector<vector<float> > data;
	ifstream inputFile(inputFileName);
	int l = 0;

	while (inputFile) {
		l++;
		string s;
		if (!getline(inputFile, s)) break;
		if (s[0] != '#') {
			istringstream ss(s);
			vector<float> record;

			while (ss) {
				string line;
				if (!getline(ss, line, ','))
					break;
				try {
					record.push_back(stof(line));
				}
				catch (const std::invalid_argument e) {
					cout << "NaN found in file " << inputFileName << " line " << l
						<< endl;
					e.what();
				}
			}

			data.push_back(record);
		}
	}

	if (!inputFile.eof()) {
		cerr << "Could not read file " << inputFileName << "\n";
		//__throw_invalid_argument("File not found.");
	}

	return data;
}



int main()
{
	vector<vector<float>> Ybus_imag = parse2DCsvFile("inv_Ybus_imag.csv");
	vector<vector<float>> Ybus_real = parse2DCsvFile("inv_Ybus_real.csv");
	vector<vector<float>> Ysub_real = parse2DCsvFile("Ysub_real.csv");
	vector<vector<float>> Ysub_imag = parse2DCsvFile("Ysub_imag.csv");
	vector<vector<float>> S_real_cpu = parse2DCsvFile("S_real.csv");
	vector<vector<float>> V_real_cpu = parse2DCsvFile("V_real.csv");
	vector<vector<float>> S_imag_cpu = parse2DCsvFile("S_imag.csv");
	vector<vector<float>> V_imag_cpu = parse2DCsvFile("V_imag.csv");
	vector<vector<float>> Vsub_real = parse2DCsvFile("Vsub_real.csv");
	vector<vector<float>> Vsub_imag = parse2DCsvFile("Vsub_imag.csv");


	float  V_real_sqr_cpu[F][G];
	memset(V_real_sqr_cpu, 0, sizeof(float)*F*G);
	float  V_imag_sqr_cpu[F][G];
	memset(V_imag_sqr_cpu, 0, sizeof(float)*F*G);


	for (int x = 0; x < F; x++)  // loop 3 times for three lines
	{
		for (int y = 0; y < G; y++)  // loop for the three elements on the line
		{
			V_real_sqr_cpu[x][y] = pow(V_real_cpu[x][y], 2);	
		}
	}


	for (int x = 0; x < F; x++)  // loop 3 times for three lines
	{
		for (int y = 0; y < G; y++)  // loop for the three elements on the line
		{
			V_imag_sqr_cpu[x][y] = pow(V_imag_cpu[x][y], 2);
		}
	}



	float *IL_real, *IL_imag, *ILn_real, *ILn_imag, *S_real, *S_imag, *V_real, *V_imag, *V_real_sqr, *V_imag_sqr, float *IL_real_dummy;// *V_real_sqr, *V_imag_sqr;
	float *Ysub_real_GPU, *Ysub_imag_GPU, *Vsub_real_GPU, *Vsub_imag_GPU;



	cudaMallocManaged(&IL_real, F*G * sizeof(float));	cudaMallocManaged(&IL_imag, F*G * sizeof(float));	cudaMallocManaged(&ILn_real, F*G * sizeof(float));	cudaMallocManaged(&ILn_imag, N*G * sizeof(float));	cudaMallocManaged(&S_real, F*G * sizeof(float));

	cudaMallocManaged(&S_imag, F*G * sizeof(float));	cudaMallocManaged(&V_real, F*G * sizeof(float));	cudaMallocManaged(&V_imag, F*G * sizeof(float));	cudaMallocManaged(&V_real_sqr, F*G * sizeof(float));	cudaMallocManaged(&V_imag_sqr, F*G * sizeof(float));

	cudaMallocManaged(&Ysub_real_GPU, N*Sub * sizeof(float));	cudaMallocManaged(&Ysub_imag_GPU, N*Sub * sizeof(float));	cudaMallocManaged(&Vsub_real_GPU, G*Sub * sizeof(float));	cudaMallocManaged(&Vsub_imag_GPU, G*Sub * sizeof(float));



	// initialize x,y and ans arrays on the host
	for (int i = 0; i < F; i++)
	{
		for (int j = 0; j < G; j++)
		{
			S_real[i*G + j] = S_real_cpu[i][j];		V_real[i*G + j] = V_real_cpu[i][j];		S_imag[i*G + j] = S_imag_cpu[i][j];		V_imag[i*G + j] = V_imag_cpu[i][j];		V_real_sqr[i*G + j] = V_real_sqr_cpu[i][j];		V_imag_sqr[i*G + j] = V_imag_sqr_cpu[i][j];
			IL_real[i*G + j] = 0;		IL_imag[i*G + j] = 0;	}
	}

	float *x, *y, *answer, *portion1_real_1, *portion1_real_2, *portion1_imag_1, *portion1_real, *portion1_imag_2, *portion1_imag;

	float *portion2_real_1_part1, *portion2_real_1_part2, *portion2_real_1, *portion2_imag_1, *portion2_imag_2, *portion2_real_part1;

	float *portion2_imag_1_part1, *portion2_imag_1_part2, *portion2_real_part2, *portion2_real, *portion2_imag_part1, *portion2_imag_part2, *portion2_imag;

	//float *Vn_real;

	cudaMallocManaged(&answer, N*G * sizeof(float)); cudaMallocManaged(&x, N*N * sizeof(float));	cudaMallocManaged(&y, N*N * sizeof(float));
	cudaMallocManaged(&portion1_real_1, N*G * sizeof(float));  cudaMallocManaged(&portion1_real_2, N*G * sizeof(float));	cudaMallocManaged(&portion1_imag_1, N*G * sizeof(float));	cudaMallocManaged(&portion1_real, N*G * sizeof(float));
	cudaMallocManaged(&portion1_imag_2, N*G * sizeof(float));	cudaMallocManaged(&portion1_imag, N*G * sizeof(float));	cudaMallocManaged(&portion2_real_1_part1, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_real_1_part2, N*Sub * sizeof(float));
	cudaMallocManaged(&portion2_imag_1_part1, N*Sub * sizeof(float));	cudaMallocManaged(&portion2_imag_1_part2, N*Sub * sizeof(float));	cudaMallocManaged(&portion2_imag_1, N*Sub * sizeof(float)); cudaMallocManaged(&portion2_imag_2, N*Sub * sizeof(float));
	cudaMallocManaged(&portion2_real_1, N*Sub * sizeof(float));	cudaMallocManaged(&portion2_real_part1, N*G * sizeof(float));	cudaMallocManaged(&portion2_real_part2, N*G * sizeof(float));	cudaMallocManaged(&portion2_real, N*G * sizeof(float));
	cudaMallocManaged(&portion2_imag_part1, N*G * sizeof(float));	cudaMallocManaged(&portion2_imag_part2, N*G * sizeof(float));	cudaMallocManaged(&portion2_imag, N*G * sizeof(float));




	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < Sub; j++)
		{
			Ysub_real_GPU[i*Sub + j] = Ysub_real[i][j];			Ysub_imag_GPU[i*Sub + j] = Ysub_imag[i][j];
		}
	}

	for (int i = 0; i < Sub; i++)
	{
		for (int j = 0; j < G; j++)
		{
			Vsub_real_GPU[i*G + j] = Vsub_real[i][j];			Vsub_imag_GPU[i*G + j] = Vsub_imag[i][j];
		}
	}



	float *ILn_real2;
	cudaMallocManaged(&ILn_real2, N*G * sizeof(float));

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			x[i*N + j] = Ybus_imag[i][j];
		}
	}


	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			y[i*N + j] = Ybus_real[i][j];
		}
	}






	for (int kk = 0; kk < 5; kk++)
	{

		for (int i = 0; i < N; i++)
		{
			for (int j = 0; j < G; j++)
			{
				ILn_real[i*G + j] = 0;		ILn_real2[i*G + j] = 0;		ILn_imag[i*G + j] = 0;		portion1_real_1[i*G + j] = 0;		portion1_real_2[i*G + j] = 0;		portion1_imag_1[i*G + j] = 0;		portion1_real[i*G + j] = 0;
				portion1_imag_2[i*G + j] = 0;		portion1_imag[i*G + j] = 0;		portion2_real_part1[i*G + j] = 0;		portion2_real_part2[i*G + j] = 0;		portion2_real[i*G + j] = 0;
				portion2_imag_part1[i*G + j] = 0;		portion2_imag_part2[i*G + j] = 0;			portion2_imag[i*G + j] = 0;
			}
		}


		for (int i = 0; i < N; i++)
		{
			for (int j = 0; j < Sub; j++)
			{
				portion2_real_1_part1[i*Sub + j] = 0;	portion2_real_1_part2[i*Sub + j] = 0;	portion2_imag_1[i*Sub + j] = 0;	 portion2_imag_2[i*Sub + j] = 0;	portion2_real_1[i*Sub + j] = 0;
			}
		}

		

		GPUmatmul_element << <dim3(1), dim3(512) >> > (S_real, V_real, S_imag, V_imag, V_real_sqr, V_imag_sqr, IL_real, IL_imag, ILn_real, ILn_imag, portion1_real_1, x, y, portion1_real_2, portion1_real, portion1_imag_1, portion1_imag_2, portion1_imag,
			                                           portion2_real_1_part1, Ysub_imag_GPU, Ysub_real_GPU, portion2_real_1_part2, portion2_real_1, portion2_imag_1_part1, portion2_imag_1_part2, portion2_imag_1);

		GPUmatmul_nosqr_portion2_real << <dim3(1), dim3(512) >> > (portion2_real_1, Vsub_real_GPU, portion2_real_part1, portion2_imag_1, Vsub_imag_GPU, portion2_real_part2, portion2_real);

		GPUmatmul_nosqr_portion2_imag << <dim3(1), dim3(512) >> > (portion2_real_1, Vsub_real_GPU, portion2_imag_part1, portion2_imag_1, Vsub_imag_GPU, portion2_imag_part2, portion2_imag);

	

		cudaDeviceSynchronize();

	

		float Vn_imag[N];
		float Vn_real[N];

		for (int kk = 0; kk < N; kk++)
		{
			Vn_imag[kk] = portion1_imag[kk] - portion2_imag[kk];
		}
		for (int kk = 0; kk < N; kk++)
		{
			Vn_real[kk] = portion1_real[kk] - portion2_real[kk];
		}


		for (int kk = 0; kk < Sub; kk++)
		{
			V_real[kk] = Vsub_real_GPU[kk];
		}
		for (int kk = 0; kk < N; kk++)
		{
			V_real[kk + 3] = Vn_real[kk];
		}



		for (int kk = 0; kk < Sub; kk++)
		{
			V_imag[kk] = Vsub_imag_GPU[kk];
		}
		for (int kk = 0; kk < N; kk++)
		{
			V_imag[kk + 3] = Vn_imag[kk];
		}

	}


	cout << "The Value of V_real is" << "\n";

	for (int k = 0; k < F; k++)

	{
		cout << V_real[k] << "\n";
	}

	   
	cudaFree(S_real);	cudaFree(S_imag);	cudaFree(V_real);	cudaFree(V_imag);	cudaFree(V_real_sqr);	cudaFree(V_imag_sqr);	cudaFree(IL_real);	cudaFree(IL_imag);	cudaFree(ILn_real);	cudaFree(ILn_imag);	cudaFree(x);	cudaFree(portion1_real_1);
	cudaFree(ILn_real2);	cudaFree(portion2_real_part1);

	system("Pause");
	return 0;
}