Performance drop after specifying CUDA_VISIBLE_DEVICES=0

I am running memory-swapping code on a workstation with 4 A100s; by default the code runs on the first A100. When I try to specify which GPU the code runs on, the program slows down significantly, even if I just specify CUDA_VISIBLE_DEVICES=0, which should be the same as the default case. But when I run with CUDA_VISIBLE_DEVICES=0,1,2,3 the performance is normal, although only the first A100 is active.
Here is the topology (output of nvidia-smi topo -m):

        GPU0    GPU1    GPU2    GPU3    NIC0    NIC1    CPU Affinity    NUMA Affinity   GPU NUMA ID
GPU0     X      NV12    SYS     SYS     SYS     SYS     0-63,128-191    0               N/A
GPU1    NV12     X      SYS     SYS     SYS     SYS     0-63,128-191    0               N/A
GPU2    SYS     SYS      X      NV12    PHB     PHB     64-127,192-255  1               N/A
GPU3    SYS     SYS     NV12     X      NODE    NODE    64-127,192-255  1               N/A

Is this related to the sharing of PCIe bandwidth?
Below is the test code I created:

#include <iostream>
#include <math.h>
#include <cuda_runtime.h>
#include <unistd.h>
#include <cstdlib>
#include <chrono>
#include <assert.h>

// Dummy processing kernel: grid-stride loop that doubles each element.
__global__ void process_data_chunk(float *d_data, size_t data_size){
    for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < data_size;
         tid += gridDim.x * blockDim.x) {
        d_data[tid] *= 2;
    }
}

int main(int argc, char **argv) {

    // Input: oversubscription factor (resident pages) & number of input chunks
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <granularity> <input_chunks>" << std::endl;
        return 1;
    }
    size_t granularity = atoi(argv[1]);
    size_t input_chunks = atoi(argv[2]);

    size_t mf, ma;
    cudaMemGetInfo(&mf, &ma);
    std::cout << "Free: " << mf << " Total: " << ma << std::endl;

    size_t max_total_float = (mf - 500000000) / sizeof(float); // leave ~500 MB of headroom
    size_t chunk_size = max_total_float / granularity; // per-chunk size in floats

    // print the summary of profiling config
    std::cout << "Profiling with config:\n"
            << "Device memory: " << mf/1024/1024 << " / " << ma/1024/1024 << " MB\n"
            << "Input chunks: " << input_chunks << " over " << granularity << " pages by " << input_chunks/granularity << " factor\n"
            << "Chunk size: " << chunk_size/1024/1024 << "M floats x " << input_chunks << " / " << max_total_float/1024/1024 << "M floats" << std::endl;
    
    // Allocate host memory
    float *h_data = new float[chunk_size * input_chunks];
    size_t *table = new size_t[granularity];
    std::fill_n(h_data, chunk_size * input_chunks, 1.5f); // Fill the array with 1.5
    
    // Allocate device memory
    float *d_data;
    cudaError_t err = cudaMalloc(&d_data, granularity * chunk_size * sizeof(float)); // Device pool holding `granularity` resident chunks
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    dim3 dimBlock(1024, 1, 1);
    dim3 dimGrid(1024, 1, 1);
    
    // Initialize the eviction table as if a full previous cycle had just completed
    for (size_t i = 0; i < granularity; ++i) {
        size_t selected_page = (input_chunks - i - 1) % granularity;
        table[selected_page] = (input_chunks - i - 1);
    }
    cudaDeviceSynchronize();
    std::cout << "Sleeping for 3 seconds" << std::endl;
    sleep(3);

    auto start = std::chrono::high_resolution_clock::now();

    for (size_t chunk = 0; chunk < input_chunks; ++chunk) {
        size_t selected_page = chunk % granularity; 
        size_t evicted_chunk = table[selected_page];
        std::cout << "Processing chunk: " << chunk << ", evicting chunk " << evicted_chunk << std::endl;

        // Copy previous chunk back from device to host memory
        cudaMemcpy(h_data + evicted_chunk * chunk_size, d_data+selected_page*chunk_size, chunk_size * sizeof(float), cudaMemcpyDeviceToHost);
        // Copy current chunk from host to device memory
        cudaMemcpy(d_data+selected_page*chunk_size, h_data + chunk * chunk_size, chunk_size * sizeof(float), cudaMemcpyHostToDevice);
        // Check for errors from the copies
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            std::cerr << "Memcpy failed: " << cudaGetErrorString(err) << std::endl;
            return 1;
        }
        table[selected_page] = chunk;
        // Process current chunk on the device
        process_data_chunk<<<dimGrid, dimBlock>>>(d_data+selected_page*chunk_size, chunk_size);

        // Wait for GPU to finish before accessing on host
        cudaDeviceSynchronize();
    }

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Elapsed time: " << elapsed.count() << " seconds\n";

    // Free the allocated device memory
    cudaFree(d_data);

    // Free host memory
    delete[] h_data;
    delete[] table;
    
    return 0;
}

Results:

$ nvcc mvp.cu -o mvp
# set visible device
$ CUDA_VISIBLE_DEVICES=0 ./mvp 4 8
Free: 84544192512 Total: 84987740160
Profiling with config:
Device memory: 80627 / 81050 MB
Input chunks: 8 over 4 pages by 2 factor
Chunk size: 5009M floats x 8 / 20037M floats
Sleeping for 3 seconds
Processing chunk: 0, evicting chunk 4
Processing chunk: 1, evicting chunk 5
Processing chunk: 2, evicting chunk 6
Processing chunk: 3, evicting chunk 7
Processing chunk: 4, evicting chunk 0
Processing chunk: 5, evicting chunk 1
Processing chunk: 6, evicting chunk 2
Processing chunk: 7, evicting chunk 3
Elapsed time: 57.4109 seconds

# use default device
$ ./mvp 4 8
Free: 84544192512 Total: 84987740160
Profiling with config:
Device memory: 80627 / 81050 MB
Input chunks: 8 over 4 pages by 2 factor
Chunk size: 5009M floats x 8 / 20037M floats
Sleeping for 3 seconds
Processing chunk: 0, evicting chunk 4
Processing chunk: 1, evicting chunk 5
Processing chunk: 2, evicting chunk 6
Processing chunk: 3, evicting chunk 7
Processing chunk: 4, evicting chunk 0
Processing chunk: 5, evicting chunk 1
Processing chunk: 6, evicting chunk 2
Processing chunk: 7, evicting chunk 3
Elapsed time: 43.3565 seconds

I also created a UVM version of this code, which is not affected by setting the visible device.
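Roughly, the UVM variant looks like the sketch below (simplified from what I actually ran; the chunk sizes here are just illustrative):

#include <algorithm>
#include <cuda_runtime.h>

// Same dummy kernel as above.
__global__ void process_data_chunk(float *d_data, size_t data_size){
    for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < data_size; tid += gridDim.x * blockDim.x) {
        d_data[tid] *= 2;
    }
}

int main() {
    const size_t chunk_size = 1ull << 28; // illustrative: ~1 GiB of floats per chunk
    const size_t input_chunks = 8;
    float *data;
    // One managed allocation covering all chunks; the driver pages data
    // between host and device on demand instead of using an eviction table.
    cudaMallocManaged(&data, chunk_size * input_chunks * sizeof(float));
    std::fill_n(data, chunk_size * input_chunks, 1.5f); // first touch on the host
    for (size_t chunk = 0; chunk < input_chunks; ++chunk) {
        process_data_chunk<<<1024, 1024>>>(data + chunk * chunk_size, chunk_size);
        cudaDeviceSynchronize(); // page faults/migrations are serviced during the run
    }
    cudaFree(data);
    return 0;
}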

It shouldn’t be difficult to figure out which steps are taking longer using Nsight Systems.

Unfortunately, once I run it under nsys, both cases are equally slow.

Well, all of your operations in the loop are effectively synchronous, so you could time each one of them using host-based methods.
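For example, something along these lines (a sketch; the labels and lambda wrapper are just for illustration):

#include <chrono>
#include <iostream>

// Sketch: wall-clock timing of each step in the loop body. The blocking
// cudaMemcpy calls finish before returning, so timestamps around them are
// meaningful; the kernel launch needs an explicit cudaDeviceSynchronize().
template <typename F>
void time_step(const char *label, F &&step) {
    auto t0 = std::chrono::high_resolution_clock::now();
    step();
    std::chrono::duration<double> dt =
        std::chrono::high_resolution_clock::now() - t0;
    std::cout << label << ": " << dt.count() << " s\n";
}

// Usage inside the loop, e.g.:
//   time_step("D2H copy", [&]{ cudaMemcpy(/* device-to-host args */); });
//   time_step("H2D copy", [&]{ cudaMemcpy(/* host-to-device args */); });
//   time_step("kernel",   [&]{ process_data_chunk<<<dimGrid, dimBlock>>>(
//                                  d_data + selected_page*chunk_size, chunk_size);
//                              cudaDeviceSynchronize(); });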

I could reproduce your issue. In my case it was resolved by pinning the CPU process to the NUMA node the GPU is attached to. In your case this would be

CUDA_VISIBLE_DEVICES=0 numactl -i 0 ./mvp 4 8 

(or --physcpubind=0-63).
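If you prefer to pin from inside the program, a Linux-specific sketch using sched_setaffinity (CPU range 0-63 taken from your topology output; the SMT siblings 128-191 are left out for brevity) would be:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>

// Pin the calling process to the CPUs of NUMA node 0 before any CUDA call.
static int pin_to_node0_cpus() {
    cpu_set_t set;
    CPU_ZERO(&set);
    for (int cpu = 0; cpu <= 63; ++cpu)
        CPU_SET(cpu, &set);
    return sched_setaffinity(0, sizeof(set), &set); // pid 0 = current process
}

Note this only pins execution; memory placement is what numactl -i 0 (or --membind=0) additionally controls, and the copies read host buffers, so both can matter.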

I tried allocating h_data with cudaMallocHost instead, which also equalized the latency between the two cases. So the cause is that the host workload gets scheduled onto a different NUMA node?
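For reference, the change was along these lines (sketch):

// Pinned-host-memory change: cudaMallocHost returns page-locked memory, so
// cudaMemcpy can DMA directly from/to it instead of staging the transfer
// through a bounce buffer with a host-side memcpy.
float *h_data = nullptr;
cudaError_t herr = cudaMallocHost((void**)&h_data,
                                  chunk_size * input_chunks * sizeof(float));
if (herr != cudaSuccess) { /* handle allocation failure */ }
std::fill_n(h_data, chunk_size * input_chunks, 1.5f);
// ... same swap loop as before ...
cudaFreeHost(h_data); // instead of delete[]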

That could be the reason. Does process pinning, as in the example given, make the problem disappear?