Question about how CUDA Unified Memory works

I want to understand how CUDA Unified Memory and its related functions work. I have already read the introductory Unified Memory blog post. I wrote the small program below:

#include <iostream>
#include <cuda_runtime.h>

// Add 3 to each element of X; launched with one thread per element.
__global__ void transfer(int *X)
{
    X[threadIdx.x] = X[threadIdx.x] + 3;
}

int main()
{
    int *x;
    size_t free_bytes, total_bytes;

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "Before cudaMallocManaged: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    // Allocate 512 ints (2048 bytes) of managed memory.
    cudaMallocManaged(&x, sizeof(int) * 512);

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "After cudaMallocManaged and Before Prefetch to GPU: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    // Migrate the range to device 0, then zero it there.
    cudaMemPrefetchAsync(x, sizeof(int) * 512, 0);
    cudaMemset(x, 0, sizeof(int) * 512);
    cudaDeviceSynchronize();

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "After Prefetch to GPU Before Kernel call: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    transfer<<<1, 512>>>(x);

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "After Kernel call Before memAdvise: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    // Hint that the preferred home of this range is host memory.
    cudaMemAdvise(x, sizeof(int) * 512, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "After memAdvise Before Prefetch to CPU: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    // Migrate the range back to the host and wait for it to finish.
    cudaMemPrefetchAsync(x, sizeof(int) * 512, cudaCpuDeviceId);
    cudaDeviceSynchronize();

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "After Prefetch Before processing in CPU: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    // Touch every element from the CPU.
    for (int i = 0; i < 512; i++)
    {
        x[i] = x[i] + 1;
        std::cout << x[i];
    }

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "\nAfter processing in CPU Before free: " << "free: " << free_bytes << " total: " << total_bytes << '\n';

    cudaFree(x);

    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::cout << "After free: " << "free: " << free_bytes << " total: " << total_bytes << '\n';
    return 0;
}
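
One thing to note: none of the CUDA calls above check their return values. For anyone reproducing this, here is a minimal error-checking wrapper I could add (just a sketch; the CHECK macro name is my own choice, not part of the CUDA API):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a message if a CUDA runtime call fails.
#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                         cudaGetErrorString(err_), __FILE__, __LINE__);    \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Usage: CHECK(cudaMallocManaged(&x, sizeof(int) * 512));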

I am running the code on Kaggle, which provides a 16 GB Tesla P100 PCIe GPU. I allocate an array x with cudaMallocManaged, prefetch it to the GPU and do some processing there, then prefetch it to the CPU and process it again. Around each transfer I print the GPU's free memory. I have two questions:

  1. Free memory decreases at the first prefetch as expected (it is unchanged right after cudaMallocManaged), but the reduction is far larger than what I allocated: I allocated 512 * sizeof(int) = 2048 bytes, yet free memory drops by 16804216832 - 16669999104 = 134217728 bytes (128 MiB). Why?

  2. Then, when I prefetch to the CPU, I see no change in the GPU's free memory. Even when I actually access the data from the CPU and operate on it, the GPU's free memory is unchanged. Why? My understanding is that with Unified Memory, pages migrate between GPU and CPU, so when I prefetch or use the data on the CPU, shouldn't the corresponding pages on the GPU be evicted and their memory show up as free? (See the residency-query sketch after this list for what I would check.)
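
To see where the driver last prefetched the range, I believe a query along these lines should work (a sketch using cudaMemRangeGetAttribute; the helper function name is my own, and I have not used this for the numbers below):

#include <iostream>
#include <cuda_runtime.h>

// Sketch: print where a managed range was last prefetched to.
// The attribute yields a device ordinal, cudaCpuDeviceId for the host,
// or cudaInvalidDeviceId if the range was never prefetched.
void printLastPrefetchLocation(const void *ptr, size_t bytes)
{
    int location = 0;
    cudaMemRangeGetAttribute(&location, sizeof(location),
                             cudaMemRangeAttributeLastPrefetchLocation,
                             ptr, bytes);
    if (location == cudaCpuDeviceId)
        std::cout << "last prefetched to: CPU\n";
    else
        std::cout << "last prefetched to: device " << location << "\n";
}

// Usage (after either prefetch call in the program above):
//   printLastPrefetchLocation(x, sizeof(int) * 512);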

The output I get is:

Before cudaMallocManaged: free: 16804216832 total: 17071734784
After cudaMallocManaged and Before Prefetch to GPU: free: 16804216832 total: 17071734784
After Prefetch to GPU Before Kernel call: free: 16669999104 total: 17071734784
After Kernel call Before memAdvise: free: 16669999104 total: 17071734784
After memAdvise Before Prefetch to CPU: free: 16669999104 total: 17071734784
After Prefetch Before processing in CPU: free: 16669999104 total: 17071734784
44444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444
After processing in CPU Before free: free: 16669999104 total: 17071734784
After free: free: 16674193408 total: 17071734784

From the output, it seems that even after accessing the managed memory from the CPU, the GPU pages are not freed.

Is my understanding of how Unified Memory works incorrect? If so, please correct me.