Here is the perf-test program I wrote.
#include <iostream>
#include <stdio.h>
#include <mpi.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include "utils.cuh"
// One block per row: each block is a single warp that pulls one
// dim-element row from PE p with a warp-scoped get.
__global__ void warp_copy_kernel(float *dst, float *src, int dim, int p)
{
    int idx = blockIdx.x;
    nvshmemx_float_get_warp(&dst[idx * dim], &src[idx * dim], dim, p);
}
void warp_copy(float *dst, float *src, int nrows, int dim, int p, cudaStream_t stream)
{
    dim3 blockDim(32, 1, 1); // one warp per block
    dim3 gridDim(nrows, 1, 1);
    nvshmemx_barrier_all_on_stream(stream);
    warp_copy_kernel<<<gridDim, blockDim, 0, stream>>>(dst, src, dim, p);
    nvshmemx_barrier_all_on_stream(stream);
}
int main(int argc, char *argv[])
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <dim>\n", argv[0]);
        return 1;
    }
    int dim = atoi(argv[1]);
    int rank, nranks;
    cudaStream_t stream;
    nvshmemx_init_attr_t attr;
    MPI_Comm mpi_comm = MPI_COMM_WORLD;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    attr.mpi_comm = &mpi_comm;
    printf("mpi_info: rank: %d, nranks: %d\n", rank, nranks);
    // Set up NVSHMEM on top of the MPI communicator.
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    int mype_node = nvshmem_my_pe();
    int local_gpu_num = 0;
    cudaGetDeviceCount(&local_gpu_num);
    cudaSetDevice(mype_node % local_gpu_num);
    printf("PE-%d, local_gpu_num: %d, local_gpu_id: %d\n", mype_node, local_gpu_num, mype_node % local_gpu_num);
    cudaStreamCreate(&stream);
    // test bandwidth of using nvshmem for cross-GPU data copy
    float *buffer0 = nullptr, *buffer1 = nullptr;
    int iter = 2000, warmup = 100;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int pow = 15;
    for (int i = 5; i < pow; i++)
    {
        int nrows = 2 << i;
        int perf_size = nrows * dim;
        if (mype_node == 0)
            printf("nrows: %d, dim: %d, perf_size: %d\n", nrows, dim, perf_size);
        float ms = 0;
        nvshmemx_barrier_all_on_stream(stream);
        gpuErrchk(cudaStreamSynchronize(stream));
        buffer0 = (float *)nvshmem_malloc(perf_size * sizeof(float));
        buffer1 = (float *)nvshmem_malloc(perf_size * sizeof(float));
        gpuErrchk(cudaMemset(buffer0, 0, perf_size * sizeof(float)));
        gpuErrchk(cudaMemset(buffer1, 0, perf_size * sizeof(float)));
        gpuErrchk(cudaStreamSynchronize(stream));
        // Copy data between the two PEs in a ring:
        //   buffer0 (PE0) -> buffer1 (PE1)
        //   buffer0 (PE1) -> buffer1 (PE0)
        for (int j = 0; j < warmup; j++)
        {
            nvshmemx_float_get_on_stream(buffer1, buffer0, perf_size, (mype_node + 1) % nranks, stream);
            nvshmemx_barrier_all_on_stream(stream);
        }
        gpuErrchk(cudaStreamSynchronize(stream));
        gpuErrchk(cudaEventRecord(start, stream));
        for (int j = 0; j < iter; j++)
        {
            nvshmemx_float_get_on_stream(buffer1, buffer0, perf_size, (mype_node + 1) % nranks, stream);
            nvshmemx_barrier_all_on_stream(stream);
        }
        nvshmemx_quiet_on_stream(stream);
        gpuErrchk(cudaStreamSynchronize(stream));
        gpuErrchk(cudaEventRecord(stop, stream));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&ms, start, stop));
        if (mype_node == 0)
        {
            // Both PEs pull perf_size floats per iteration, hence the factor of 2.
            float data_size = 2 * static_cast<float>(perf_size) * sizeof(float) / 1024 / 1024; // MB
            float bandwidth = data_size * iter / ms * 1000; // MB/s
            printf("nv_host_copy perf_size: %d elems, data_size: %.1f KB, bandwidth: %.3f MB/s\n", perf_size, data_size * 1024, bandwidth);
        }
        // NVSHMEM device-side warp copy
        // warmup
        for (int j = 0; j < warmup; j++)
        {
            warp_copy(buffer1, buffer0, nrows, dim, (mype_node + 1) % nranks, stream);
            nvshmemx_barrier_all_on_stream(stream);
        }
        gpuErrchk(cudaStreamSynchronize(stream));
        gpuErrchk(cudaEventRecord(start, stream));
        for (int j = 0; j < iter; j++)
        {
            warp_copy(buffer1, buffer0, nrows, dim, (mype_node + 1) % nranks, stream);
            nvshmemx_barrier_all_on_stream(stream);
        }
        nvshmemx_quiet_on_stream(stream);
        gpuErrchk(cudaStreamSynchronize(stream));
        gpuErrchk(cudaEventRecord(stop, stream));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&ms, start, stop));
        if (mype_node == 0)
        {
            float data_size = 2 * static_cast<float>(perf_size) * sizeof(float) / 1024 / 1024; // MB
            float bandwidth = data_size * iter / ms * 1000; // MB/s
            printf("nv_device_copy perf_size: %d elems, data_size: %.1f KB, bandwidth: %.3f MB/s\n", perf_size, data_size * 1024, bandwidth);
        }
        nvshmem_free(buffer0);
        nvshmem_free(buffer1);
    }
    nvshmem_finalize();
    MPI_Finalize();
    if (mype_node == 0)
        printf("=================Done!=================\n");
    return 0;
}
When I run this test on two nodes with two GPUs, the host-side nvshmemx_float_get_on_stream reaches the expected peak bandwidth of 16 GB/s, which is the bandwidth of my InfiniBand link. The GPU-side warp_copy, however, cannot reach this bandwidth; it is stuck at roughly 64 MB/s. Since my application relies on GPU-side data copies, it suffers severe performance degradation when running on two nodes with multiple GPUs, even though it runs very fast on a single node with multiple GPUs. Is this normal behavior, or have I misconfigured my environment? Does GPU-initiated data copy over a cross-node interconnect such as InfiniBand inherently have high latency?
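A quick back-of-the-envelope, based only on the numbers above and my assumption that each warp-scoped get turns into one network message: with dim = 32, every nvshmemx_float_get_warp call moves just 32 * 4 B = 128 B. At the observed ~64 MB/s plateau that is 64 MB/s / 128 B ≈ 500K gets per second, i.e. about 2 µs per get, which looks like per-message latency on the InfiniBand path rather than a bandwidth limit.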
To illustrate, here is part of my test output:
nrows: 128, dim: 32, perf_size: 4096
nv_host_copy perf_size: 4096 elems, data_size: 32.0 KB, bandwidth: 622.998 MB/s
nv_device_copy perf_size: 4096 elems, data_size: 32.0 KB, bandwidth: 79.315 MB/s
nrows: 256, dim: 32, perf_size: 8192
nv_host_copy perf_size: 8192 elems, data_size: 64.0 KB, bandwidth: 1209.587 MB/s
nv_device_copy perf_size: 8192 elems, data_size: 64.0 KB, bandwidth: 87.977 MB/s
nrows: 512, dim: 32, perf_size: 16384
nv_host_copy perf_size: 16384 elems, data_size: 128.0 KB, bandwidth: 2047.404 MB/s
nv_device_copy perf_size: 16384 elems, data_size: 128.0 KB, bandwidth: 87.772 MB/s
nrows: 1024, dim: 32, perf_size: 32768
nv_host_copy perf_size: 32768 elems, data_size: 256.0 KB, bandwidth: 3633.369 MB/s
nv_device_copy perf_size: 32768 elems, data_size: 256.0 KB, bandwidth: 68.729 MB/s
nrows: 2048, dim: 32, perf_size: 65536
nv_host_copy perf_size: 65536 elems, data_size: 512.0 KB, bandwidth: 6358.160 MB/s
nv_device_copy perf_size: 65536 elems, data_size: 512.0 KB, bandwidth: 66.602 MB/s
nrows: 4096, dim: 32, perf_size: 131072
nv_host_copy perf_size: 131072 elems, data_size: 1024.0 KB, bandwidth: 8842.070 MB/s
nv_device_copy perf_size: 131072 elems, data_size: 1024.0 KB, bandwidth: 64.663 MB/s
nrows: 8192, dim: 32, perf_size: 262144
nv_host_copy perf_size: 262144 elems, data_size: 2048.0 KB, bandwidth: 12345.572 MB/s
nv_device_copy perf_size: 262144 elems, data_size: 2048.0 KB, bandwidth: 63.868 MB/s
nrows: 16384, dim: 32, perf_size: 524288
nv_host_copy perf_size: 524288 elems, data_size: 4096.0 KB, bandwidth: 14776.895 MB/s
nv_device_copy perf_size: 524288 elems, data_size: 4096.0 KB, bandwidth: 63.069 MB/s
nrows: 32768, dim: 32, perf_size: 1048576
nv_host_copy perf_size: 1048576 elems, data_size: 8192.0 KB, bandwidth: 16232.806 MB/s
nv_device_copy perf_size: 1048576 elems, data_size: 8192.0 KB, bandwidth: 62.359 MB/s
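If small message size is indeed the limiter, one experiment would be a chunked variant of the kernel in which each warp fetches several contiguous rows with a single get, so the per-message size grows from dim * 4 B to rows_per_block * dim * 4 B. This is only a sketch of what I have in mind; warp_copy_chunked_kernel and rows_per_block are hypothetical names, not part of the test above:
// Hypothetical variant for the experiment described above: each block
// (one warp) pulls a contiguous chunk of rows_per_block rows, clamped
// at the end of the buffer, as a single warp-scoped get.
__global__ void warp_copy_chunked_kernel(float *dst, float *src,
                                         int nrows, int dim,
                                         int rows_per_block, int p)
{
    int row = blockIdx.x * rows_per_block;
    int n = min(rows_per_block, nrows - row);
    if (n > 0)
        nvshmemx_float_get_warp(&dst[row * dim], &src[row * dim],
                                (size_t)n * dim, p);
}
// Launch with (nrows + rows_per_block - 1) / rows_per_block blocks of
// 32 threads on the same stream as warp_copy_kernel.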