Can a page stay in host when using cudaMemAdviseSetReadMostly?

I wrote a simple test, and the answer seems to be no.

#include <cuda_runtime.h>
#include <iostream>
using namespace std;

#define CHECK(v) if (v) { cout << "fault" << #v; exit(1); }

__global__ void func(int* a, int* b, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int tnum = gridDim.x * blockDim.x;
    for (int i=tid; i<n; i+=tnum)
        b[i] = a[i];
}

extern "C" void nvtxRangePushA(const char* s);
extern "C" void nvtxRangePop();

int main() {
    int* a, *b;
    int n = 1024*1024*60;
    CHECK(cudaMallocManaged(&a, n*4));
    CHECK(cudaMallocManaged(&b, n*4));
    CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    // CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    nvtxRangePushA("init");
    for (int i=0; i<n; i++)
        a[i] = 0;
    nvtxRangePop();
    for (int j=0; j<10; j++) {
        nvtxRangePushA("init2");
        for (int i=0; i<n; i++)
            a[i] = 0;
        nvtxRangePop();
        for (int i=0; i<100; i++)
            func<<<32, 1024>>>(a, b, n);
        CHECK(cudaDeviceSynchronize());
    }
    return 0;
}

// /usr/local/cuda/bin/nvcc ./test.cu -o a.out -lnvToolsExt && nvprof -o ~/tmp/x.prof -f ./a.out

The nvprof profiling results show that the pages of “a” migrate back to the host at each iteration.
How can I keep the pages of “a” in host memory, so that the initialization doesn’t cause page faults?

To keep the data in CPU memory and allow other processors to read it (which seems to be the intent of your code), the correct “advice” is the one you have commented out.

When I uncomment that line and comment out the preceding one, the CPU page faults (mostly) disappear.

Your code, as posted:

$ nvprof ./t1624
==29131== NVPROF is profiling process 29131, command: ./t1624
==29131== Profiling application: ./t1624
==29131== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.06408s      1000  2.0641ms  1.4181ms  94.390ms  func(int*, int*, int)
      API calls:   85.39%  2.05442s        10  205.44ms  201.03ms  239.32ms  cudaDeviceSynchronize
                   13.64%  328.16ms         2  164.08ms  172.76us  327.99ms  cudaMallocManaged
                    0.47%  11.397ms      1000  11.396us  8.4880us  389.03us  cudaLaunchKernel
                    0.22%  5.3900ms         4  1.3475ms  640.34us  3.3566ms  cuDeviceTotalMem
                    0.20%  4.9175ms       388  12.673us     325ns  527.19us  cuDeviceGetAttribute
                    0.06%  1.4861ms         4  371.51us  97.560us  1.1759ms  cuDeviceGetName
                    0.00%  27.205us         4  6.8010us  3.3270us  11.857us  cuDeviceGetPCIBusId
                    0.00%  10.506us         1  10.506us  10.506us  10.506us  cudaMemAdvise
                    0.00%  8.9220us         8  1.1150us     497ns  1.8730us  cuDeviceGet
                    0.00%  6.8350us         3  2.2780us     485ns  4.4900us  cuDeviceGetCount
                    0.00%  2.7120us         4     678ns     504ns     926ns  cuDeviceGetUuid

==29131== Unified Memory profiling result:
Device "Tesla V100-PCIE-32GB (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
   16267  151.08KB  4.0000KB  0.9766MB  2.343750GB  250.1772ms  Host To Device
    6647         -         -         -           -  709.3456ms  Gpu page fault groups
Total CPU Page faults: 614400

==29131== NVTX result:
==29131==   Thread "<unnamed>" (id = 2790779392)
==29131==     Domain "<unnamed>"
==29131==       Range "init"
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
          Range:  100.00%  1.43432s         1  1.43432s  1.43432s  1.43432s  init
No kernels were profiled in this range.
No API activities were profiled in this range.

==29131==       Range "init2"
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
          Range:  100.00%  16.4560s        10  1.64560s  175.44ms  1.87524s  init2
No kernels were profiled in this range.
No API activities were profiled in this range.

Modified:

$ cat t1624.cu
#include <cuda_runtime.h>
#include <iostream>
using namespace std;

#define CHECK(v) if (v) { cout << "fault" << #v; exit(1); }

__global__ void func(int* a, int* b, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int tnum = gridDim.x * blockDim.x;
    for (int i=tid; i<n; i+=tnum)
        b[i] = a[i];
}

extern "C" void nvtxRangePushA(const char* s);
extern "C" void nvtxRangePop();

int main() {
    int* a, *b;
    int n = 1024*1024*60;
    CHECK(cudaMallocManaged(&a, n*4));
    CHECK(cudaMallocManaged(&b, n*4));
    //CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
    CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    nvtxRangePushA("init");
    for (int i=0; i<n; i++)
        a[i] = 0;
    nvtxRangePop();
    for (int j=0; j<10; j++) {
        nvtxRangePushA("init2");
        for (int i=0; i<n; i++)
            a[i] = 0;
        nvtxRangePop();
        for (int i=0; i<100; i++)
            func<<<32, 1024>>>(a, b, n);
        CHECK(cudaDeviceSynchronize());
    }
    return 0;
}
$ nvcc t1624.cu -o t1624 -arch=sm_70 -lnvToolsExt
$ nvprof ./t1624
==29186== NVPROF is profiling process 29186, command: ./t1624
==29186== Profiling application: ./t1624
==29186== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  21.1277s      1000  21.128ms  20.920ms  179.52ms  func(int*, int*, int)
      API calls:   98.46%  21.1178s        10  2.11178s  2.09486s  2.25843s  cudaDeviceSynchronize
                    1.42%  304.52ms         2  152.26ms  173.24us  304.35ms  cudaMallocManaged
                    0.07%  14.452ms      1000  14.452us  8.4590us  2.1737ms  cudaLaunchKernel
                    0.03%  5.3939ms         4  1.3485ms  688.56us  3.3054ms  cuDeviceTotalMem
                    0.02%  5.0238ms       388  12.947us     328ns  534.47us  cuDeviceGetAttribute
                    0.00%  557.09us         4  139.27us  101.95us  241.32us  cuDeviceGetName
                    0.00%  27.399us         4  6.8490us  3.2470us  12.768us  cuDeviceGetPCIBusId
                    0.00%  15.956us         1  15.956us  15.956us  15.956us  cudaMemAdvise
                    0.00%  7.5140us         8     939ns     470ns  1.3880us  cuDeviceGet
                    0.00%  3.4030us         3  1.1340us     363ns  1.6800us  cuDeviceGetCount
                    0.00%  2.7100us         4     677ns     503ns  1.0070us  cuDeviceGetUuid

==29186== Unified Memory profiling result:
Device "Tesla V100-PCIE-32GB (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
    2427         -         -         -           -  206.5282ms  Gpu page fault groups
    9932  24.744KB  4.0000KB  128.00KB  240.0000MB           -  Remote mapping from device
Total CPU Page faults: 120
Total remote mappings to CPU: 9932

==29186== NVTX result:
==29186==   Thread "<unnamed>" (id = 3172297216)
==29186==     Domain "<unnamed>"
==29186==       Range "init"
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
          Range:  100.00%  282.52ms         1  282.52ms  282.52ms  282.52ms  init
No kernels were profiled in this range.
No API activities were profiled in this range.

==29186==       Range "init2"
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
          Range:  100.00%  1.76453s        10  176.45ms  159.69ms  193.38ms  init2
No kernels were profiled in this range.
No API activities were profiled in this range.

$

Tesla V100, CUDA 10.1.243, CentOS 7.

Thanks, Robert.
No CPU page faults occur when “cudaMemAdviseSetPreferredLocation” is uncommented, but a lot of GPU page faults happen.
Also, in your profiling results, the GPU kernel execution times are quite different (2 ms vs. 20 ms).

What I expected is that the GPU page faults would only happen once, when the read-only pages are first migrated from CPU to GPU. The “ReadMostly” advice seems not to be working.

The GPU kernel execution times are different because in the slow case you are reading the data over PCIe (every time it is accessed), whereas in the fast case it can be migrated to the GPU.

Data is not migrated in the slow case.
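
As a side note on the slow case: the CUDA documentation also describes cudaMemAdviseSetAccessedBy, which keeps host-resident data mapped in the GPU’s page tables, so the remote reads should not need to generate GPU page faults (they still travel over PCIe, so the kernel stays slow). A sketch of that variation of the modified program above, with GPU device 0 assumed and not profiled in this thread:

// Sketch only: keep the data resident on the host (preferred location CPU),
// but also advise that device 0 will access it, so the mapping is established
// up front and the GPU page faults seen above are largely avoided.
CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetAccessedBy, 0 /* assumed GPU device id */));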

Thanks for correcting me. As you said, the slow case reads the data over PCIe. Is there any way to let the pages stay on the CPU while leaving a copy on the GPU, so that the GPU reads the data from its own memory rather than over PCIe?
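
One combination that might get closer to that, sketched from the documented behavior of cudaMemAdviseSetReadMostly rather than from anything profiled in this thread: with the ReadMostly advice set, a cudaMemPrefetchAsync to the GPU creates a read-only copy in device memory while the host copy stays resident; a CPU write invalidates the GPU copy, so the prefetch has to be repeated after each host-side update. Relative to the originally posted program (GPU device 0 assumed), the main loop would look roughly like this:

// Sketch only: ReadMostly read-duplication plus an explicit prefetch, so the
// kernels read a local device copy instead of migrating pages back and forth.
CHECK(cudaMemAdvise(a, n*4, cudaMemAdviseSetReadMostly, cudaCpuDeviceId));
for (int j=0; j<10; j++) {
    for (int i=0; i<n; i++)
        a[i] = 0;                       // CPU write invalidates any GPU copy
    // Re-create the read-only copy on device 0 before the kernel launches;
    // the host copy of "a" remains valid and resident.
    CHECK(cudaMemPrefetchAsync(a, n*4, 0 /* assumed GPU device id */));
    for (int i=0; i<100; i++)
        func<<<32, 1024>>>(a, b, n);
    CHECK(cudaDeviceSynchronize());
}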