I recently read about CUDA unified memory, and I want to use cudaMemPrefetchAsync as a replacement for cudaMemcpy. With cudaMemcpy, I can easily perform parallel copy operations to different GPUs from C++ threads. However, I am having difficulty achieving the same parallelism with cudaMemPrefetchAsync. Can anyone help me clarify this?
This is the code using cudaMemcpy:
#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <cassert>
using namespace std;
const int N = 1 << 26;
__global__ void double_arr(int* a, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { a[tid] *= 2; }
}
void thread_func_invoke(int *a, int *a_dev, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    cudaStream_t stream;
    // cudaStreamCreate(&stream);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    // cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);
    cudaMemcpy(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice);
    // cudaMemcpyAsync(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
    // cudaStreamSynchronize(stream);
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a_dev, N);
}
int main(){
    int *a_host, *b_host, *c_host;
    int *a_dev, *b_dev, *c_dev;
    cudaMallocHost((void**)&a_host, N * sizeof(int));
    cudaMallocHost((void**)&b_host, N * sizeof(int));
    cudaMallocHost((void**)&c_host, N * sizeof(int));
    cudaMalloc((void**)&a_dev, N * sizeof(int));
    cudaSetDevice(1);
    cudaMalloc((void**)&b_dev, N * sizeof(int));
    cudaSetDevice(2);
    cudaMalloc((void**)&c_dev, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a_host[i] = i;
        b_host[i] = i;
        c_host[i] = i;
    }
    std::vector<std::thread> threads;
    threads.push_back(std::thread(thread_func_invoke, a_host, a_dev, 0));
    threads.push_back(std::thread(thread_func_invoke, b_host, b_dev, 1));
    threads.push_back(std::thread(thread_func_invoke, c_host, c_dev, 2));
    for (auto& t : threads) {
        t.join();
    }
    cudaDeviceSynchronize();
    cudaSetDevice(0);
    cudaMemcpy(a_host, a_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaSetDevice(1);
    cudaMemcpy(b_host, b_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaSetDevice(2);
    cudaMemcpy(c_host, c_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        assert(a_host[i] == 2 * i);
        assert(b_host[i] == 2 * i);
        assert(c_host[i] == 2 * i);
    }
    cudaFreeHost(a_host);
    cudaFreeHost(b_host);
    cudaFreeHost(c_host);
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    return 0;
}
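For reference, the fully asynchronous variant I was experimenting with (the commented-out lines in thread_func_invoke above) would look roughly like this. It is only a sketch, not code I have verified; it assumes the same pinned host buffers from cudaMallocHost, and the function name is just for illustration:
// Hypothetical variant of thread_func_invoke (not in the listing above):
// asynchronous copy and kernel launch on the same non-blocking stream.
void thread_func_invoke_async(int *a, int *a_dev, int device_id)
{
    cudaSetDevice(device_id);

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Asynchronous H2D copy; each thread issues its copy on its own device/stream.
    cudaMemcpyAsync(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice, stream);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    // Launching on the same stream makes the kernel wait for the copy to finish.
    double_arr<<<grid_size, block_size, 0, stream>>>(a_dev, N);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}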
This is the code using cudaMemPrefetchAsync:
#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <cassert>
using namespace std;
atomic<int> counter(0);
__global__ void add(int* a, int* b, int* c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { c[tid] = a[tid] + b[tid]; }
}
__global__ void double_arr(int* a, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { a[tid] *= 2; }
}
const int N = 1 << 26;
void thread_func(int* a, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
}
void thread_func_invoke(int *a, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    // cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a, N);
}
int main()
{
    int *a, *b, *c;
    cudaMallocManaged(&a, N * sizeof(int));
    cudaMallocManaged(&b, N * sizeof(int));
    cudaMallocManaged(&c, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i;
        c[i] = i;
    }
    // std::this_thread::sleep_for(chrono::seconds(1));
cout << "set device 4\n";
cudaSetDevice(1);
cudaMemPrefetchAsync(a, sizeof(int) * 128, 4);
// std::this_thread::sleep_for(chrono::seconds(1));
cout << "set device 6\n";
cudaSetDevice(2);
cudaMemPrefetchAsync(a, sizeof(int) * 128, 6);
    // std::this_thread::sleep_for(chrono::seconds(1));
    vector<std::thread> threads;
    // threads.push_back(thread(thread_func, a, 0));
    // threads.push_back(thread(thread_func, b, 4));
    // threads.push_back(thread(thread_func, c, 6));
    threads.push_back(thread(thread_func_invoke, b, 0));
    threads.push_back(thread(thread_func_invoke, a, 1));
    threads.push_back(thread(thread_func_invoke, c, 2));
    for (auto& t : threads) { t.join(); }
    // this_thread::sleep_for(chrono::seconds(1));
    // cudaSetDevice(4);
    // int block_size = 256;
    // int grid_size = (N + block_size - 1) / block_size;
    // add<<<grid_size, block_size>>>(a, b, c, N);
    cudaDeviceSynchronize();
    std::cout << "kernel finished\n";
    // this_thread::sleep_for(chrono::seconds(1));
    for (int i = 0; i < N; i++) {
        assert(a[i] == 2 * i);
        assert(b[i] == 2 * i);
        assert(c[i] == 2 * i);
    }
    std::cout << "assertion passed\n";
    // this_thread::sleep_for(chrono::seconds(1));
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
This is the result of the cudaMemPrefetchAsync version: we can see that there is no parallelism when copying the data from the host to the GPUs.
Can anyone tell me how I can parallelize the copies via cudaMemPrefetchAsync?
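One thing I have been considering, based on the commented-out line in thread_func_invoke, is passing the per-device non-blocking stream to cudaMemPrefetchAsync explicitly and launching the kernel on that same stream. A rough, untested sketch of what I mean (the function name is just for illustration):
// Hypothetical variant of thread_func_invoke (not in the listing above):
// prefetch and kernel both issued on an explicit per-device stream.
void thread_func_invoke_stream(int *a, int device_id)
{
    cudaSetDevice(device_id);

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Prefetch the managed allocation to this device on the non-default stream.
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size, 0, stream>>>(a, N);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
I am not sure whether this is enough to make the prefetches to the different GPUs overlap, or whether something else serializes them.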