Sending data from the host to the device and vice versa (CUDA - single GPU, MPI - multi-processes)

Hello friends, I need some help/suggestions/advice on how to do this. The idea is that MPI divides the array into parts and each process computes its own part. In the main loop in “main.c”, where the calculations for each part take place, I want to call the kernel so that the computation runs on the GPU. I think I have a problem with copying the data from the GPU back to the CPU, but I am not sure. I would like to hear from an experienced programmer whether my overall scheme is correct. Please, any comments are welcome.

I flatten the 2D arrays to 1D before sending them to the GPU, and convert back from 1D to 2D after receiving the data on the CPU.
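
The conversion itself is just two nested loops. Roughly, the helpers I have in mind look like this (only a sketch; flatten2D/unflatten2D are illustrative names, and the halo columns are ignored here):

void flatten2D(float **src, float *dst, int rows, int cols) {
	int idx = 0;
	for (int i = 0; i < rows; i++)
		for (int j = 0; j < cols; j++)
			dst[idx++] = src[i][j];   //row-major copy 2D -> 1D
}

void unflatten2D(const float *src, float **dst, int rows, int cols) {
	int idx = 0;
	for (int i = 0; i < rows; i++)
		for (int j = 0; j < cols; j++)
			dst[i][j] = src[idx++];   //row-major copy 1D -> 2D
}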

FILE “main.c”

float **T_last;
float **T;
int local_NY, local_NX;

local_NY = NY;
local_NX = calc_NX_from_rank(rank, size);

...
//Tab T with additional COLS for HALO (overlapping)
T = (float**) malloc(sizeof(float*) * local_NY);
for (i = 0; i < local_NY; i++) {
	T[i] = (float*) malloc(sizeof(float) * (local_NX + 2));
}

....
//Initialize tab T with random values
for (i = 0; i < local_NY; i++) {
	for (j = 1; j < local_NX + 2; j++) {
		T[i][j] = (float) (rand() % 10);
	}
}

//2D to 1D
int totalSize = local_NX * local_NY; //printf("%i", totalSize);
float *T1D = (float*) malloc(sizeof(float) * totalSize);
float *T1D2 = (float*) malloc(sizeof(float) * totalSize);

Allocate_Memory(&T1D, &d_a, &d_b, totalSize);

for (iter = 0; iter < ITERATIONS; iter++) {
	//MPI EXCHANGE HALOS
	for (i = 0; i < local_NY; i++)
		sendBuff[i] = T[i][1];
	MPI_Sendrecv(sendBuff, local_NY, MPI_FLOAT, left, tag, recvBuff,
				 local_NY, MPI_FLOAT, right, tag,
				 MPI_COMM_WORLD, &status);
	for (i = 0; i < local_NY; i++)
		T[i][local_NX + 1] = recvBuff[i];
	...other MPI sendrecv...

	//Calculations for each part
	for (i = 0; i < local_NY; i++) {
		if (rank == 0) {
			startCol = 0;
			endCol = local_NX;
		} else if (rank == size - 1) {
			startCol = 0;
			endCol = local_NX;
		} else {
			startCol = 0;
			endCol = local_NX;
		}
		for (j = startCol; j < endCol + 1; j++) {
			if (i == 0) {

				//Flatten 2D -> 1D before sending to the GPU
				int idx = 0;
				for (int ii = 0; ii < local_NY; ii++) {
					for (int jj = 0; jj < local_NX; jj++) {
						T1D[idx++] = T_last[ii][jj];
					}
				}

				Copy_All_To_GPU(&T1D, &d_a, &d_b, totalSize);

				GPU_Compute(&T1D, &d_a, &d_b, totalSize);

				Copy_All_From_GPU(&T1D, &d_a, &d_b, totalSize);

				//Unflatten 1D -> 2D after receiving from the GPU
				idx = 0;
				for (int ii = 0; ii < local_NY; ii++) {
					for (int jj = 0; jj < local_NX; jj++) {
						T[ii][jj] = T1D[idx++];
					}
				}
			}
		}
	}
}
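
To be clear, the per-iteration flow I am aiming for is roughly the sketch below (simplified, halo columns ignored, using the flatten2D/unflatten2D helpers from above). I am not sure whether calling the GPU inside the i/j loops as in my code is correct, or whether it should happen once per iteration like this:

for (iter = 0; iter < ITERATIONS; iter++) {
	//1. MPI halo exchange (as above)
	//2. Flatten the local part, send it to the GPU, compute, copy back
	flatten2D(T_last, T1D, local_NY, local_NX);
	Copy_All_To_GPU(&T1D, &d_a, &d_b, totalSize);
	GPU_Compute(&T1D, &d_a, &d_b, totalSize);
	Copy_All_From_GPU(&T1D, &d_a, &d_b, totalSize);
	//3. Unflatten the result back into the 2D tab
	unflatten2D(T1D, T, local_NY, local_NX);
}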

FILE “gpu_main.h”

void Allocate_Memory(float **T1D, float **d_a, float **d_b, int totalSize);

void Copy_All_To_GPU(float **T1D, float **d_a, float **d_b, int totalSize);

void GPU_Compute(float **T1D, float **d_a, float **d_b, int totalSize);

void Copy_All_From_GPU(float **T1D, float **d_a, float **d_b, int totalSize);
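
Since main.c is compiled as C (mpicc) and gpu_main.cu as C++ (nvcc), I assume the prototypes need C linkage so the names are not mangled. Something like this wrapper around the declarations is what I had in mind (is that right?):

#ifdef __cplusplus
extern "C" {
#endif

/* ...the four prototypes above... */

#ifdef __cplusplus
}
#endif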

FILE “gpu_main.cu”

__global__ void GPU_Calc_Temp(float *d_a, float *d_b, int totalSize) {
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i < totalSize)        //guard against threads beyond the array
		d_b[i] = d_a[i];  //copy tab
	//printf("%6.2f", d_b[i]);
}

void GPU_Compute(float **T1D, float **d_a, float **d_b, int totalSize) {
	cudaError_t Error;

	// Calculate the new Temperature
	// <<<1, totalSize>>> only works for totalSize <= 1024 threads, so launch a 1D grid of blocks
	int threadsPerBlock = 256;
	int blocks = (totalSize + threadsPerBlock - 1) / threadsPerBlock;
	GPU_Calc_Temp <<<blocks, threadsPerBlock>>> (*d_a, *d_b, totalSize);

	Error = cudaMemcpy(*d_a, *d_b, totalSize * sizeof(float), cudaMemcpyDeviceToDevice);  //size in bytes
	if (DEBUG) printf("CUDA Error (MemCpy d_b->d_a) = %s\n", cudaGetErrorString(Error));
}

void Allocate_Memory(float **T1D, float **d_a, float **d_b, int totalSize) {
	cudaError_t Error;
	Error = cudaSetDevice(0);
	//if (DEBUG) printf("CUDA Error (setDevice) = %s\n", cudaGetErrorString(Error));

	Error = cudaMalloc((void**)d_a, totalSize * sizeof(float));
	//if (DEBUG) printf("CUDA Error (cudaMalloc d_a) = %s\n", cudaGetErrorString(Error));

	Error = cudaMalloc((void**)d_b, totalSize * sizeof(float));
	//if (DEBUG) printf("CUDA Error (cudaMalloc d_b) = %s\n", cudaGetErrorString(Error));
}

void Copy_All_To_GPU(float **T1D, float **d_a, float **d_b, int totalSize) {
	cudaError_t Error;
	Error =  cudaMemcpy(*d_a, *T1D, totalSize * sizeof(float), cudaMemcpyHostToDevice);
	//if (DEBUG) printf("CUDA Error (MemCpy T1D->d_a) = %s\n", cudaGetErrorString(Error));
}

void Copy_All_From_GPU(float **T1D, float **d_a, float **d_b, int totalSize) {
	cudaError_t Error;
	Error =  cudaMemcpy(*T1D, *d_a, totalSize * sizeof(float), cudaMemcpyDeviceToHost);
	if (DEBUG) printf("CUDA Error (MemCpy d_a->T1D) = %s\n", cudaGetErrorString(Error));
}
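
One more thing I am unsure about: I only check the cudaMemcpy calls, not the kernel launch itself. In GPU_Compute I was thinking of adding something like this right after the launch (just a sketch):

	Error = cudaGetLastError();                //reports launch failures (bad config, etc.)
	if (DEBUG) printf("CUDA Error (kernel launch) = %s\n", cudaGetErrorString(Error));
	Error = cudaDeviceSynchronize();           //waits for the kernel and reports execution errors
	if (DEBUG) printf("CUDA Error (kernel sync) = %s\n", cudaGetErrorString(Error));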