How to run cudaMemPrefetchAsync to different GPUs in parallel

I recently read about CUDA unified memory, and I want to use cudaMemPrefetchAsync as a replacement for cudaMemcpy. When using cudaMemcpy, I can easily perform parallel copy operations to different GPUs from C++ threads. However, I'm facing difficulties achieving the same parallelism with cudaMemPrefetchAsync. Can anyone help me clarify this?
This is the code using cudaMemcpy

#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <cassert>
using namespace std;

const int N = 1 << 26;
__global__ void double_arr(int* a, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { a[tid] *= 2; }
}

void thread_func_invoke(int *a, int *a_dev, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    cudaStream_t stream;
    // cudaStreamCreate(&stream);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    // cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);
    cudaMemcpy(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice);
    // cudaMemcpyAsync(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
    // cudaStreamSynchronize(stream);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a_dev, N);
}


int main(){
    int *a_host, *b_host, *c_host;
    int *a_dev, *b_dev, *c_dev;
    cudaMallocHost((void**)&a_host, N * sizeof(int));
    cudaMallocHost((void**)&b_host, N * sizeof(int));
    cudaMallocHost((void**)&c_host, N * sizeof(int));
    cudaMalloc((void**)&a_dev, N * sizeof(int));
    cudaSetDevice(1);
    cudaMalloc((void**)&b_dev, N * sizeof(int));
    cudaSetDevice(2);
    cudaMalloc((void**)&c_dev, N * sizeof(int));

    for (int i = 0; i < N; i++) {
        a_host[i] = i;
        b_host[i] = i;
        c_host[i] = i;
    }

    std::vector<std::thread> threads;
    threads.push_back(std::thread(thread_func_invoke, a_host, a_dev, 0));
    threads.push_back(std::thread(thread_func_invoke, b_host, b_dev, 1));
    threads.push_back(std::thread(thread_func_invoke, c_host, c_dev, 2));
    for (auto& t : threads) {
        t.join();
    }

    cudaDeviceSynchronize();

    cudaSetDevice(0);
    cudaMemcpy(a_host, a_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaSetDevice(1);
    cudaMemcpy(b_host, b_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaSetDevice(2);
    cudaMemcpy(c_host, c_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        assert(a_host[i] == 2 * i);
        assert(b_host[i] == 2 * i);
        assert(c_host[i] == 2 * i);
    }

    cudaFreeHost(a_host);
    cudaFreeHost(b_host);
    cudaFreeHost(c_host);
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    return 0;

}

This is the code using cudaMemPrefetchAsync

#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <cassert>
using namespace std;
atomic<int> counter(0);

__global__ void add(int* a, int* b, int* c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { c[tid] = a[tid] + b[tid]; }
}
__global__ void double_arr(int* a, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { a[tid] *= 2; }
}

const int N = 1 << 26;

void thread_func(int* a, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
}

void thread_func_invoke(int *a, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    // cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a, N);
}

int main()
{
    int *a, *b, *c;
    cudaMallocManaged(&a, N * sizeof(int));
    cudaMallocManaged(&b, N * sizeof(int));
    cudaMallocManaged(&c, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i;
        c[i] = i;
    }


    // std::this_thread::sleep_for(chrono::seconds(1));
    cout << "set device 4\n";
    cudaSetDevice(1);
    cudaMemPrefetchAsync(a, sizeof(int) * 128, 4);
    // std::this_thread::sleep_for(chrono::seconds(1));
    cout << "set device 6\n";
    cudaSetDevice(2);
    cudaMemPrefetchAsync(a, sizeof(int) * 128, 6);
    // std::this_thread::sleep_for(chrono::seconds(1));
    vector<std::thread> threads;
    // threads.push_back(thread(thread_func, a, 0));
    // threads.push_back(thread(thread_func, b, 4));
    // threads.push_back(thread(thread_func, c, 6));

    threads.push_back(thread(thread_func_invoke, b, 0));
    threads.push_back(thread(thread_func_invoke, a, 1));
    threads.push_back(thread(thread_func_invoke, c, 2));



    for (auto& t : threads) { t.join(); }
    // this_thread::sleep_for(chrono::seconds(1));
    // cudaSetDevice(4);
    // int block_size = 256;
    // int grid_size = (N + block_size - 1) / block_size;

    // add<<<grid_size, block_size>>>(a, b, c, N);

    cudaDeviceSynchronize();
    std::cout << "kernel finished\n";
    // this_thread::sleep_for(chrono::seconds(1));

    for(int i = 0; i < N; i++){
        assert(a[i] == 2 * i);
        assert(b[i] == 2 * i);
        assert(c[i] == 2 * i);

    }
    std::cout << "assertion passed\n";
    // this_thread::sleep_for(chrono::seconds(1));
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}

This is the result of cudaMemPrefetchAsync
[Nsight Systems timeline screenshot]

We can see that there is no parallelism in copying the data from the host to the GPUs.
Can anyone tell me how I can parallelize the copies via cudaMemPrefetchAsync?

These are the Nsight Systems report files:
cudaMemcpy
multi-thread-memcpy-1-stream.nsys-rep (245.1 KB)

cudaMemPrefetchAsync
multi-thread-memcpyasync-2-streams-nonblock.nsys-rep (244.4 KB)

I tried your code on a dual-GPU machine. The two transfers are serialized. However, this is simply because of API overhead, as shown in the profiler picture below.
I have modified your thread function to perform three prefetches: to the GPU, back to the CPU, and again to the GPU.
The second transfer to the GPU by one thread is then able to overlap the first transfer of the second thread.
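
A minimal sketch of that modification (reconstructed here as an illustration rather than the exact code I ran; it assumes the three prefetches are issued in order on the thread's default stream):

void thread_func_invoke(int *a, int device_id)
{
    cudaSetDevice(device_id);

    // Three prefetches in a row: to the GPU, back to the CPU, and to the GPU again.
    // Within one thread they run back to back, but the later prefetches of this
    // thread can overlap the earlier prefetches issued by the other threads.
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
    cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a, N);
}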


Is the API Overhead referring to cudaStreamCreateWithFlags in this context?

The ability to do this, and what you will actually witness, will depend on the system (PCIe) topology. Running two H->D transfers simultaneously at the expected bandwidth on each implies that each GPU is connected to a separate CPU PCIe root port. Depending on the exact topology, it may be necessary to use careful process or thread placement in order to witness the desired outcome.
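
If it is unclear how the GPUs are attached, nvidia-smi topo -m shows the connection matrix, or you can query the PCI bus IDs from the runtime. A minimal sketch using standard runtime API calls (nothing specific to your program):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; ++dev) {
        char busId[32];
        // PCI bus id in the form domain:bus:device.function, e.g. 0000:3B:00.0
        cudaDeviceGetPCIBusId(busId, (int)sizeof(busId), dev);
        std::printf("GPU %d: %s\n", dev, busId);
    }
    return 0;
}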

Thanks for your reply, but I did witness concurrent transfers when using cudaMemcpyAsync or cudaMemcpy.


Is there anything I am missing when I use cudaMemPrefetchAsync?

Yes, there are differences when using cudaMemPrefetchAsync(). See here, particularly the section “Overlapping Kernels and Prefetches”. (Yes, I realize that is not exactly what you are asking about.) I also believe that section hints at what the API overhead could be. However, I wouldn’t be able to give an exact description of what is happening in your case without more information.
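
One variant you could try is to pass the non-blocking stream you already create to cudaMemPrefetchAsync and launch the kernel on that same stream. A sketch under that assumption (not a guaranteed fix for the serialization, since the behavior also depends on the driver-side work that section describes):

void thread_func_invoke(int *a, int device_id)
{
    cudaSetDevice(device_id);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Prefetch and kernel on the same per-thread non-blocking stream, so this
    // thread's work does not synchronize with the legacy default stream.
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size, 0, stream>>>(a, N);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}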

Good luck!