I wanted to know how CUDA Unified Memory and related functions work. I have already read the unified memory blog for beginners. I wrote a small code given below:
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
// Kernel: adds 3 to one element of X per thread (element index = threadIdx.x).
// Expected launch: a single block whose blockDim.x equals the element count.
// NOTE(review): there is no bounds guard, so the caller must not launch more
// threads than there are elements in X.
__global__ void transfer(int *X)
{
    const unsigned idx = threadIdx.x;
    X[idx] += 3;
}
using namespace std;
// Check every CUDA runtime call: an unchecked ("sticky") error makes every
// later call fail mysteriously, so abort with file/line context on failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Queries and prints the device's current free/total memory, prefixed by
// `label` (label is expected to end with ": " to match the original output).
static void printMemInfo(const char *label)
{
    size_t free_bytes = 0, total_bytes = 0;
    CUDA_CHECK(cudaMemGetInfo(&free_bytes, &total_bytes));
    std::cout << label << "free: " << free_bytes
              << " total: " << total_bytes << '\n';
}

// Demonstrates unified-memory page migration: allocate managed memory,
// prefetch to the GPU, run a kernel, prefetch back to the CPU, process on
// the host, and report GPU free memory at every step.
int main()
{
    constexpr size_t kCount = 512;                 // number of ints in x
    constexpr size_t kBytes = kCount * sizeof(int);

    int *x = nullptr;
    printMemInfo("Before cudaMallocManaged: ");

    // Managed allocation: device pages are not necessarily populated yet,
    // so free memory is typically unchanged at this point.
    CUDA_CHECK(cudaMallocManaged(&x, kBytes));
    printMemInfo("After cudaMallocManaged and Before Prefetch to GPU: ");

    // Migrate pages to device 0, then zero them there.
    CUDA_CHECK(cudaMemPrefetchAsync(x, kBytes, 0));
    CUDA_CHECK(cudaMemset(x, 0, kBytes));
    CUDA_CHECK(cudaDeviceSynchronize());
    printMemInfo("After Prefetch to GPU Before Kernel call: ");

    transfer<<<1, kCount>>>(x);
    CUDA_CHECK(cudaGetLastError());  // kernel launches don't return errors directly
    printMemInfo("After Kernel call Before memAdvise: ");

    // Hint that the pages' preferred home is host memory.
    CUDA_CHECK(cudaMemAdvise(x, kBytes, cudaMemAdviseSetPreferredLocation,
                             cudaCpuDeviceId));
    printMemInfo("After memAdvise Before Prefetch to CPU: ");

    // Migrate pages back to the host; the sync also surfaces any async
    // execution error from the kernel above.
    CUDA_CHECK(cudaMemPrefetchAsync(x, kBytes, cudaCpuDeviceId));
    CUDA_CHECK(cudaDeviceSynchronize());
    printMemInfo("After Prefetch Before processing in CPU: ");

    // Host-side processing: each element becomes 0 + 3 (kernel) + 1 = 4.
    for (size_t i = 0; i < kCount; i++)
    {
        x[i] = x[i] + 1;
        std::cout << x[i];
    }
    std::cout << '\n';
    printMemInfo("After processing in CPU Before free: ");

    CUDA_CHECK(cudaFree(x));
    printMemInfo("After free: ");
    return 0;
}
I am running the code on Kaggle, which uses a 16GB Tesla P100 PCIe GPU. I have an array x that I allocated using cudaMallocManaged. First I prefetch it to the GPU and do some processing, then I prefetch it to the CPU and do some processing. Around each memory transfer, I print the GPU's free memory before and after the transfer. I have two questions:
-
During the first prefetch, just after cudaMallocManaged, the free memory decreases as expected — but the reduction in free memory is much larger than the amount of memory I allocated (2 KB allocated vs. ~128 MB reduction). Why?
-
Then, when I prefetch to the CPU, I see no change in the GPU's free memory. Even when I actually access the data from the CPU and operate on it, the GPU's free memory is still unchanged. Why is this? I understand that with unified memory, pages are migrated between the GPU and CPU — so when I prefetch or use the data on the CPU, shouldn't the corresponding pages on the GPU be evicted and their memory be reflected in the GPU's free-memory count?
The output I get is:
Before cudaMallocManaged: free: 16804216832 total: 17071734784
After cudaMallocManaged and Before Prefetch to GPU: free: 16804216832 total: 17071734784
After Prefetch to GPU Before Kernel call: free: 16669999104 total: 17071734784
After Kernel call Before memAdvise: free: 16669999104 total: 17071734784
After memAdvise Before Prefetch to CPU: free: 16669999104 total: 17071734784
After Prefetch Before processing in CPU: free: 16669999104 total: 17071734784
44444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444
After processing in CPU Before free: free: 16669999104 total: 17071734784
After free: free: 16674193408 total: 17071734784
From the output, it seems even after accessing managed memory from the CPU the GPU pages are not being freed up.
Do I have a wrong understanding of how Unified Memory works? If so, please correct me.