MPI / CUDA program: copy a 2D matrix from host to device

Hello, I wrote a program that combines MPI and CUDA. I want to send the host matrix "T" (already filled with data) to the GPU; that part works. Next I want to modify the data on the GPU inside the __global__ kernel matrixCopy, but cuda-memcheck reports "The application may have hit an error when dereferencing Unified Memory from the host." I suspect I am misusing pointers somewhere, but I am not sure. Can anyone help? The full code, the cuda-memcheck output, and a small standalone sketch of what I am trying to do are below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <cuda.h>
#include "mpi.h"

#define NROWS 16
#define NCOLS 10
#define ITERATIONS 2
#define MASTER 0

#define X 20
#define Y 10

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void matrixCopy(float* A, float* C, int local_nrows) {

        int i = threadIdx.x + blockIdx.x * blockDim.x;
        int j = threadIdx.y + blockIdx.y * blockDim.y;

        C[local_nrows * j + i] = A[local_nrows * j + i] * 20;   // scale each element by 20
}

int calc_NX_from_rank(int rank, int size) {
    int ncols;
    ncols = NROWS / size;
    if ((NROWS % size) != 0) {
        if (rank == size - 1)
            ncols += NROWS % size; 
    }
    return ncols;
}
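
/* Example for calc_NX_from_rank above: it splits NROWS among the MPI ranks
   (the result is stored in local_ncols in main). With NROWS = 16 and
   size = 3 ranks, ranks 0 and 1 each get 16 / 3 = 5, and the last rank
   gets 5 + (16 % 3) = 6, so all 16 are accounted for. */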

int main(int argc, char* argv[]) {
    int i, j;
    int rank;                   // MPI rank of this process
    int size;                   // number of MPI processes

    int local_nrows;
    int local_ncols;

    float **T_last;     // receives the result copied back from the GPU
    float **T;          // host data sent to the GPU

    MPI_Init(&argc, &argv);
    MPI_Comm_size( MPI_COMM_WORLD, &size);
    MPI_Comm_rank( MPI_COMM_WORLD, &rank);

    local_nrows = NCOLS;
    local_ncols = calc_NX_from_rank(rank, size);

    T_last= (float**) malloc(sizeof(float*) * local_nrows);
    for (i = 0; i < local_nrows; i++) {
        T_last[i] = (float*) malloc(sizeof(float) * (local_ncols + 2));
    }

    T = (float**) malloc(sizeof(float*) * local_nrows);
    for (i = 0; i < local_nrows; i++) {
        T[i] = (float*) malloc(sizeof(float) * (local_ncols + 2));
    }

    for (i = 0; i < local_nrows; i++) {
        for (j = 0; j < local_ncols; j++) {
            T[i][j] = (float)(rand() % 5);
        }
    }

    float *dev_a, *dev_c;

    gpuErrchk(cudaMalloc((void **)&dev_a, local_nrows * (local_ncols) * sizeof(float)));
    gpuErrchk(cudaMalloc((void **)&dev_c, local_nrows * (local_ncols) * sizeof(float)));

    gpuErrchk(cudaMemcpy(dev_a, *T, local_nrows * (local_ncols) * sizeof(float), cudaMemcpyHostToDevice));
    matrixCopy<<< local_nrows, local_ncols >>>(dev_a, dev_c, local_nrows);
    gpuErrchk(cudaMemcpy(*T_last, dev_c, local_nrows * (local_ncols) * sizeof(float), cudaMemcpyDeviceToHost));

    for ( i = 0; i < local_nrows; i++){
         for (int j = 0; j < local_ncols; j++){
                printf("%6.2f",T_last[i][j]);
            }
        printf("\n");
    }

    MPI_Finalize();

    for (i = 0; i < local_nrows; i++) {
        free(T_last[i]);
        free(T[i]);
    }
    free(T_last);
    free(T);

    cudaFree(dev_a);
    cudaFree(dev_c);

    return EXIT_SUCCESS;
}
========= CUDA-MEMCHECK
 60.00 20.00 40.00  0.00 60.00  0.00 20.00 40.00 80.00 20.00 40.00 40.00  0.00 80.00 60.00 20.00
  0.00 20.00 40.00 20.00 20.00 60.00 40.00 80.00 40.00  0.00 40.00 60.00 40.00  0.00 80.00 40.00
 40.00 60.00 80.00 40.00 60.00 20.00 20.00 40.00 80.00 60.00 20.00 80.00 80.00 40.00 60.00 80.00
  0.00  0.00 60.00 20.00 20.00  0.00 20.00 60.00 40.00  0.00 20.00 20.00  0.00  0.00 80.00 40.00
 20.00  0.00 20.00 80.00 60.00 40.00 80.00  0.00 40.00  0.00 80.00 40.00 80.00 80.00 60.00  0.00
 40.00 60.00 20.00 60.00 60.00 80.00 60.00 20.00 80.00 80.00 40.00  0.00 20.00 60.00 80.00 40.00
 20.00 20.00 80.00 80.00  0.00  0.00 80.00 60.00 40.00 20.00 40.00 40.00 40.00 40.00 40.00 20.00
 20.00  0.00 20.00 20.00  0.00 80.00 80.00 80.00  0.00 80.00 20.00 40.00 40.00 20.00 20.00  0.00
  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00
  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00
========= Error: process didn't terminate successfully
=========        The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= Internal error (7)
========= No CUDA-MEMCHECK results found
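
To make it clearer what layout I am aiming for, here is a minimal CUDA-only sketch (no MPI) of the same operation, with the host matrix stored in one contiguous block so that a single cudaMemcpy of nrows * ncols * sizeof(float) stays inside the buffer, and with a 2D launch whose indexing matches the row-major layout. The names matrixScale, h_T, h_out and the 16x16 block size exist only in this sketch and are not part of my program above; I am not sure yet whether this is the right way to restructure the MPI version.

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Same element-wise "multiply by 20" as matrixCopy, but with a bounds check
// and row-major indexing that matches the launch configuration below.
__global__ void matrixScale(const float* A, float* C, int nrows, int ncols)
{
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (row < nrows && col < ncols)
        C[row * ncols + col] = A[row * ncols + col] * 20.0f;
}

int main(void)
{
    const int nrows = 10;                      // mirrors the 10 x 16 output above
    const int ncols = 16;
    const size_t bytes = nrows * ncols * sizeof(float);

    // One contiguous allocation for the whole matrix, indexed as h_T[i * ncols + j],
    // so copying `bytes` in a single cudaMemcpy never reads past the buffer.
    float *h_T   = (float*) malloc(bytes);
    float *h_out = (float*) malloc(bytes);
    for (int i = 0; i < nrows * ncols; i++)
        h_T[i] = (float)(rand() % 5);

    float *dev_a, *dev_c;
    gpuErrchk(cudaMalloc((void **)&dev_a, bytes));
    gpuErrchk(cudaMalloc((void **)&dev_c, bytes));
    gpuErrchk(cudaMemcpy(dev_a, h_T, bytes, cudaMemcpyHostToDevice));

    // 2D launch: x covers columns, y covers rows; round the grid up so every
    // element is covered even when the dimensions are not multiples of 16.
    dim3 block(16, 16);
    dim3 grid((ncols + block.x - 1) / block.x, (nrows + block.y - 1) / block.y);
    matrixScale<<<grid, block>>>(dev_a, dev_c, nrows, ncols);
    gpuErrchk(cudaGetLastError());

    gpuErrchk(cudaMemcpy(h_out, dev_c, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < nrows; i++) {
        for (int j = 0; j < ncols; j++)
            printf("%6.2f", h_out[i * ncols + j]);
        printf("\n");
    }

    cudaFree(dev_a);
    cudaFree(dev_c);
    free(h_T);
    free(h_out);
    return EXIT_SUCCESS;
}

I build this sketch on its own with nvcc (e.g. nvcc sketch.cu), separately from the MPI build of the full program.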