I recently read about CUDA unified memory, and I want to use cudaMemPrefetchAsync as a replacement for cudaMemcpy. With cudaMemcpy, I can easily perform parallel copy operations to different GPUs from C++ threads. However, I am having difficulty achieving the same parallelism with cudaMemPrefetchAsync. Can anyone help me clarify this?
This is the code using cudaMemcpy:
#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <cassert>
using namespace std;
const int N = 1 << 26;
__global__ void double_arr(int* a, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { a[tid] *= 2; }
}
void thread_func_invoke(int *a, int *a_dev, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    cudaStream_t stream;
    // cudaStreamCreate(&stream);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    // cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);
    cudaMemcpy(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice);
    // cudaMemcpyAsync(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice, stream);
    // cudaStreamSynchronize(stream);
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a_dev, N);
}
int main(){
    int *a_host, *b_host, *c_host;
    int *a_dev, *b_dev, *c_dev;
    cudaMallocHost((void**)&a_host, N * sizeof(int));
    cudaMallocHost((void**)&b_host, N * sizeof(int));
    cudaMallocHost((void**)&c_host, N * sizeof(int));
    cudaMalloc((void**)&a_dev, N * sizeof(int));
    cudaSetDevice(1);
    cudaMalloc((void**)&b_dev, N * sizeof(int));
    cudaSetDevice(2);
    cudaMalloc((void**)&c_dev, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a_host[i] = i;
        b_host[i] = i;
        c_host[i] = i;
    }
    std::vector<std::thread> threads;
    threads.push_back(std::thread(thread_func_invoke, a_host, a_dev, 0));
    threads.push_back(std::thread(thread_func_invoke, b_host, b_dev, 1));
    threads.push_back(std::thread(thread_func_invoke, c_host, c_dev, 2));
    for (auto& t : threads) {
        t.join();
    }
    cudaDeviceSynchronize();
    cudaSetDevice(0);
    cudaMemcpy(a_host, a_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaSetDevice(1);
    cudaMemcpy(b_host, b_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaSetDevice(2);
    cudaMemcpy(c_host, c_dev, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        assert(a_host[i] == 2 * i);
        assert(b_host[i] == 2 * i);
        assert(c_host[i] == 2 * i);
    }
    cudaFreeHost(a_host);
    cudaFreeHost(b_host);
    cudaFreeHost(c_host);
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    return 0;
}
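For reference, the fully asynchronous variant I was experimenting with (the commented-out lines in thread_func_invoke above) would look roughly like this. It is only a sketch, not code I have verified; it assumes the same pinned host buffers from cudaMallocHost, and the function name is just for illustration:
// Hypothetical variant of thread_func_invoke (not in the listing above):
// asynchronous copy and kernel launch on the same non-blocking stream.
void thread_func_invoke_async(int *a, int *a_dev, int device_id)
{
    cudaSetDevice(device_id);

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Asynchronous H2D copy; each thread issues its copy on its own device/stream.
    cudaMemcpyAsync(a_dev, a, N * sizeof(int), cudaMemcpyHostToDevice, stream);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    // Launching on the same stream makes the kernel wait for the copy to finish.
    double_arr<<<grid_size, block_size, 0, stream>>>(a_dev, N);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}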
This is the code using cudaMemPrefetchAsync:
#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <cassert>
using namespace std;
atomic<int> counter(0);
__global__ void add(int* a, int* b, int* c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { c[tid] = a[tid] + b[tid]; }
}
__global__ void double_arr(int* a, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { a[tid] *= 2; }
}
const int N = 1 << 26;
void thread_func(int* a, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
}
void thread_func_invoke(int *a, int device_id)
{
    cudaSetDevice(device_id);
    int current_device;
    cudaGetDevice(&current_device);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    std::cout << "Expected device: " << device_id << ", current device: " << current_device << "\n";
    assert(current_device == device_id);
    // cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id);
    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size>>>(a, N);
}
int main()
{
    int *a, *b, *c;
    cudaMallocManaged(&a, N * sizeof(int));
    cudaMallocManaged(&b, N * sizeof(int));
    cudaMallocManaged(&c, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i;
        c[i] = i;
    }
    // std::this_thread::sleep_for(chrono::seconds(1));
cout << "set device 4\n";
cudaSetDevice(1);
cudaMemPrefetchAsync(a, sizeof(int) * 128, 4);
// std::this_thread::sleep_for(chrono::seconds(1));
cout << "set device 6\n";
cudaSetDevice(2);
cudaMemPrefetchAsync(a, sizeof(int) * 128, 6);
    // std::this_thread::sleep_for(chrono::seconds(1));
    vector<std::thread> threads;
    // threads.push_back(thread(thread_func, a, 0));
    // threads.push_back(thread(thread_func, b, 4));
    // threads.push_back(thread(thread_func, c, 6));
    threads.push_back(thread(thread_func_invoke, b, 0));
    threads.push_back(thread(thread_func_invoke, a, 1));
    threads.push_back(thread(thread_func_invoke, c, 2));
    for (auto& t : threads) { t.join(); }
    // this_thread::sleep_for(chrono::seconds(1));
    // cudaSetDevice(4);
    // int block_size = 256;
    // int grid_size = (N + block_size - 1) / block_size;
    // add<<<grid_size, block_size>>>(a, b, c, N);
    cudaDeviceSynchronize();
    std::cout << "kernel finished\n";
    // this_thread::sleep_for(chrono::seconds(1));
    for (int i = 0; i < N; i++) {
        assert(a[i] == 2 * i);
        assert(b[i] == 2 * i);
        assert(c[i] == 2 * i);
    }
    std::cout << "assertion passed\n";
    // this_thread::sleep_for(chrono::seconds(1));
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
This is the result of the cudaMemPrefetchAsync version: we can see that there is no parallelism when copying the data from the host to the GPUs.
Can anyone tell me how I can parallelize the copies via cudaMemPrefetchAsync?
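One thing I have been considering, based on the commented-out line in thread_func_invoke, is passing the per-device non-blocking stream to cudaMemPrefetchAsync explicitly and launching the kernel on that same stream. A rough, untested sketch of what I mean (the function name is just for illustration):
// Hypothetical variant of thread_func_invoke (not in the listing above):
// prefetch and kernel both issued on an explicit per-device stream.
void thread_func_invoke_stream(int *a, int device_id)
{
    cudaSetDevice(device_id);

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Prefetch the managed allocation to this device on the non-default stream.
    cudaMemPrefetchAsync(a, N * sizeof(int), device_id, stream);

    int block_size = 256;
    int grid_size = (N + block_size - 1) / block_size;
    double_arr<<<grid_size, block_size, 0, stream>>>(a, N);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
I am not sure whether this is enough to make the prefetches to the different GPUs overlap, or whether something else serializes them.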