Unified memory

Hello, I am new to CUDA. I am trying to understand how UM (unified memory) behaves when it is accessed
by the CPU, the GPU, or both.

First, I define a host function and an equivalent GPU kernel.

// Fill a[0..N) with 1 on the GPU.
// Grid-stride loop: each thread starts at its flat global index and hops
// forward by the total number of threads in the grid, so the kernel is
// correct for any <<<blocks, threads>>> configuration, even one block.
__global__
void deviceKernel(int *a, int N)
{
  int globalId = blockIdx.x * blockDim.x + threadIdx.x;
  int gridSize = gridDim.x * blockDim.x;

  for (int i = globalId; i < N; i += gridSize)
    a[i] = 1;
}

// Fill a[0..N) with 1 on the CPU — the host-side counterpart of deviceKernel.
// A single sequential pass; N <= 0 is a no-op.
void hostFunction(int *a, int N)
{
  int i = 0;
  while (i < N)
  {
    a[i] = 1;
    ++i;
  }
}

Then I consider 4 cases:

(1) UM accessed only by GPU:

// Case (1): managed memory touched only by the GPU.
// Fix vs. original: the launch was fire-and-forget — a bad launch config or an
// in-kernel fault would never surface, because kernel launches are async and
// their errors only appear via cudaGetLastError() / the next synchronizing call.
int main(int argc, char *argv[])
{
  int N = 2 << 24;                 // 2^25 ints, ~128 MB
  size_t size = N * sizeof(int);

  int *a = nullptr;                // managed: same pointer valid on host and device
  if (cudaMallocManaged(&a, size) != cudaSuccess)
    return 1;                      // allocation failed

  int threadsPerBlock = 256;
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;   // ceil-div

  deviceKernel<<<blocksPerGrid, threadsPerBlock>>>(a, N);

  // Catch launch-configuration errors immediately...
  if (cudaGetLastError() != cudaSuccess) { cudaFree(a); return 1; }
  // ...and wait for the kernel so execution errors surface before exit.
  if (cudaDeviceSynchronize() != cudaSuccess) { cudaFree(a); return 1; }

  cudaFree(a);
  return 0;
}

(2) UM accessed only by CPU:

// Case (2): managed memory touched only by the CPU — no kernel is launched.
// Fixes vs. original: the launch-configuration locals (threadsPerBlock,
// blocksPerGrid) were computed but never used, and cudaMallocManaged was
// unchecked (on failure, hostFunction would have dereferenced an invalid
// pointer).
int main(int argc, char *argv[])
{
  int N = 2 << 24;                 // 2^25 ints, ~128 MB
  size_t size = N * sizeof(int);

  int *a = nullptr;                // managed: same pointer valid on host and device
  if (cudaMallocManaged(&a, size) != cudaSuccess)
    return 1;                      // allocation failed

  hostFunction(a, N);              // CPU-only access: pages fault in on the host

  cudaFree(a);
  return 0;
}

(3) UM accessed by GPU, then CPU:

// Case (3): managed memory touched by the GPU, then by the CPU.
// Fix vs. original: the kernel launch is ASYNCHRONOUS, so the original called
// hostFunction while deviceKernel could still be running — a data race on the
// managed buffer (and an outright invalid CPU access on pre-Pascal GPUs, where
// the host may not touch managed memory while a kernel is in flight). A
// cudaDeviceSynchronize() between the GPU and CPU phases is required.
int main(int argc, char *argv[])
{
  int N = 2 << 24;                 // 2^25 ints, ~128 MB
  size_t size = N * sizeof(int);

  int *a = nullptr;                // managed: same pointer valid on host and device
  if (cudaMallocManaged(&a, size) != cudaSuccess)
    return 1;                      // allocation failed

  int threadsPerBlock = 256;
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;   // ceil-div

  deviceKernel<<<blocksPerGrid, threadsPerBlock>>>(a, N);

  // Catch launch-configuration errors immediately...
  if (cudaGetLastError() != cudaSuccess) { cudaFree(a); return 1; }
  // ...and wait for the kernel to finish BEFORE the CPU touches the buffer.
  if (cudaDeviceSynchronize() != cudaSuccess) { cudaFree(a); return 1; }

  hostFunction(a, N);              // pages migrate back device -> host here

  cudaFree(a);
  return 0;
}

(4) UM accessed by CPU, then GPU:

// Case (4): managed memory touched by the CPU, then by the GPU.
// The CPU -> GPU ordering itself is safe (the launch is enqueued after the
// host writes complete), but the original never checked any API result and a
// failed launch or in-kernel fault would go unnoticed — cudaFree's implicit
// sync discards the error. Fixed by checking cudaGetLastError() and
// cudaDeviceSynchronize() after the launch.
int main(int argc, char *argv[])
{
  int N = 2 << 24;                 // 2^25 ints, ~128 MB
  size_t size = N * sizeof(int);

  int *a = nullptr;                // managed: same pointer valid on host and device
  if (cudaMallocManaged(&a, size) != cudaSuccess)
    return 1;                      // allocation failed

  int threadsPerBlock = 256;
  int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;   // ceil-div

  hostFunction(a, N);              // pages first faulted in on the host

  deviceKernel<<<blocksPerGrid, threadsPerBlock>>>(a, N);

  // Catch launch-configuration errors immediately...
  if (cudaGetLastError() != cudaSuccess) { cudaFree(a); return 1; }
  // ...and wait for the kernel so execution errors surface before exit.
  if (cudaDeviceSynchronize() != cudaSuccess) { cudaFree(a); return 1; }

  cudaFree(a);
  return 0;
}

The nvprof outputs I get are the following:

(1) for GPU only:

==30390== Unified Memory profiling result:
Device "GeForce GTX 1060 6GB (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
     384         -         -         -           -  30.83862ms  Gpu page fault groups

(2) for CPU only:

==30626== Unified Memory profiling result:
Total CPU Page faults: 384

(3) for GPU then CPU:

==30688== Unified Memory profiling result:
Device "GeForce GTX 1060 6GB (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
       8  24.000KB  4.0000KB  64.000KB  192.0000KB  24.19200us  Host To Device
     777  168.79KB  4.0000KB  0.9961MB  128.0742MB  10.73443ms  Device To Host
     386         -         -         -           -  29.73021ms  Gpu page fault groups
      16  4.0000KB  4.0000KB  4.0000KB  64.00000KB           -  Memory thrashes
Total CPU Page faults: 390
Total CPU thrashes: 16

(4) for CPU then GPU:

==30816== Unified Memory profiling result:
Device "GeForce GTX 1060 6GB (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
    1374  95.394KB  4.0000KB  0.9961MB  128.0000MB  12.34474ms  Host To Device
     396         -         -         -           -  54.74906ms  Gpu page fault groups
Total CPU Page faults: 384

I tried to understand what these mean, but I still can’t figure it out. I think I understand where the data movements (host-to-device / device-to-host) come from, but I don’t get the whole page-fault concept. Could anyone explain?

Thanks

http://on-demand.gputechconf.com/gtc/2018/presentation/s8430-everything-you-need-to-know-about-unified-memory.pdf

https://en.wikipedia.org/wiki/Page_fault

Thanks.