CUDA copies serialised when using CUDA IPC

Hello,

I am working on a piece of code that uses two MPI processes, one process per GPU, i.e. rank 0 runs on GPU 0 and rank 1 on GPU 1.

In this code, rank 1 shares the address of its buffer on GPU 1 with rank 0 using CUDA IPC. Rank 0 then has both the source buffer (on GPU 0) and the destination buffer (on GPU 1), and transfers the data by issuing multiple chunked D2H and H2D copies staged through pinned host memory. In practice these copies run serialised, but we want them to overlap with one another.
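
Concretely, the pattern we are trying to overlap is a standard chunked staging pipeline: each chunk is copied device-to-host on its own stream, an event marks when it has landed in pinned host memory, and the matching host-to-device copy waits only on that event. A stripped-down sketch of just this pattern (hypothetical names, no MPI/IPC plumbing and no error checking; the full runnable code follows below):

#include <cuda_runtime.h>

// Sketch only: src lives on GPU 0, dst on GPU 1 (mapped via IPC), host is
// pinned memory, and the d2h/h2d streams plus per-chunk events already exist.
void staged_copy(char *dst, const char *src, char *host, size_t total,
                 cudaStream_t d2h[4], cudaStream_t h2d[4], cudaEvent_t ev[4]) {
  size_t chunk = total / 4;
  for (int i = 0; i < 4; i++) {
    size_t off = i * chunk;
    // Chunk i: GPU 0 -> pinned host on its own D2H stream
    cudaMemcpyAsync(host + off, src + off, chunk, cudaMemcpyDeviceToHost, d2h[i]);
    cudaEventRecord(ev[i], d2h[i]);
    // Chunk i: pinned host -> GPU 1, gated only on its own D2H copy
    cudaStreamWaitEvent(h2d[i], ev[i], 0);
    cudaMemcpyAsync(dst + off, host + off, chunk, cudaMemcpyHostToDevice, h2d[i]);
  }
  for (int i = 0; i < 4; i++)
    cudaStreamSynchronize(h2d[i]);
}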

CUDA IPC example:

#include "example.h"

#define D2H 0
#define H2D 1

// Global Variables
int size, rank, remote_rank;
cudaIpcMemHandle_t *mem_handle;
cudaStream_t host_stream[2][4];
cudaEvent_t host_event[4];

int main() {
  // Initialize Environment
  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  remote_rank = (rank + 1) % size;
  CUDA_CHECK(cudaSetDevice(rank));

  // Allocate buffers
  char *src_buf, *dst_buf;
  char *host_buf;
  if (0 == rank) {
    CUDA_CHECK(cudaMalloc(&src_buf, MAX_MSG_SIZE));
    CUDA_CHECK(cudaMallocHost(&host_buf, MAX_MSG_SIZE));
  } else {
    CUDA_CHECK(cudaMalloc(&dst_buf, MAX_MSG_SIZE));
  }

  // Create Streams and events
  for (int i=0; i<4; i++) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&host_stream[D2H][i], cudaStreamNonBlocking));

    CUDA_CHECK(cudaSetDevice(remote_rank));
    CUDA_CHECK(cudaStreamCreateWithFlags(&host_stream[H2D][i], cudaStreamNonBlocking));

    CUDA_CHECK(cudaSetDevice(rank));
    CUDA_CHECK(cudaEventCreateWithFlags(&host_event[i], cudaEventDisableTiming));
  }

  // Share the destination buffer's location in process 1
  // with process 0 using CUDA IPC
  mem_handle = alloc_shared_mem<cudaIpcMemHandle_t>(1, "IPC");
  MPI_Barrier(MPI_COMM_WORLD);

  if (1 == rank) {
    CUDA_CHECK(cudaIpcGetMemHandle(mem_handle, (void*) dst_buf));
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if (0 == rank) {
    CUDA_CHECK(cudaIpcOpenMemHandle((void**) &dst_buf,
                                    *mem_handle,
                                    cudaIpcMemLazyEnablePeerAccess));
  }

  MPI_Barrier(MPI_COMM_WORLD);

  // Start of the code we are asking about
  if (0 == rank) {

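    // Stage each chunk into pinned host memory on its own D2H stream and record a per-chunk event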
    for (int i=0; i<4; i++) {
      int offset = i * MAX_MSG_CHUNK_SIZE;

      CUDA_CHECK(cudaMemcpyAsync((void*) &host_buf[offset],
                                 (void*) &src_buf[offset],
                                 MAX_MSG_CHUNK_SIZE,
                                 cudaMemcpyDeviceToHost,
                                 host_stream[D2H][i]));

      CUDA_CHECK(cudaEventRecord(host_event[i], host_stream[D2H][i]));
    }

    CUDA_CHECK(cudaSetDevice(remote_rank));

    for (int i=0; i<4; i++) {
      int offset = i * MAX_MSG_CHUNK_SIZE;

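      // Gate chunk i's H2D copy (into GPU 1 through the IPC mapping) on its own D2H copy finishing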
      CUDA_CHECK(cudaStreamWaitEvent(host_stream[H2D][i], host_event[i], 0));
      CUDA_CHECK(cudaMemcpyAsync((void*) &dst_buf[offset],
                                 (void*) &host_buf[offset],
                                 MAX_MSG_CHUNK_SIZE,
                                 cudaMemcpyHostToDevice,
                                 host_stream[H2D][i]));
    }

    for (int i=0; i<4; i++) {
      CUDA_CHECK(cudaStreamSynchronize(host_stream[H2D][i]));
    }

    CUDA_CHECK(cudaSetDevice(rank));
  }

  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();

  return 0;
}
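
Note that alloc_shared_mem is just a small helper from example.h; it appears to give both ranks a pointer into the same host shared-memory segment so that rank 1 can write the handle and rank 0 can read it, and it is not part of the question. Purely for illustration, an equivalent way to move the handle would be to broadcast its raw bytes over MPI (this is not what the code above does):

// Illustrative alternative to alloc_shared_mem: ship the raw handle bytes
// from rank 1 to rank 0 with MPI.
cudaIpcMemHandle_t handle;
if (1 == rank) {
  CUDA_CHECK(cudaIpcGetMemHandle(&handle, (void*) dst_buf));
}
// Rank 1 is the root, so every rank receives rank 1's handle bytes.
MPI_Bcast(&handle, sizeof(handle), MPI_BYTE, 1, MPI_COMM_WORLD);
if (0 == rank) {
  CUDA_CHECK(cudaIpcOpenMemHandle((void**) &dst_buf, handle,
                                  cudaIpcMemLazyEnablePeerAccess));
}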

We can see in the profiler output that these CUDA copies are serialised:

Something I have observed is that if I allocate both the source and destination buffers in the same process, the copies pipeline perfectly as expected. You can see in the code below that there are only minimal changes to the actual copy logic compared to the IPC version:

#include "example.h"

#define D2H 0
#define H2D 1

// Global Variables
int size, rank, remote_rank;
cudaIpcMemHandle_t *mem_handle;
cudaStream_t host_stream[2][4];
cudaEvent_t host_event[4];

int main() {
  // Initialize Environment
  MPI_Init(NULL, NULL);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  remote_rank = (rank + 1) % size;

  CUDA_CHECK(cudaSetDevice(rank));

  // Allocate buffers
  char *src_buf, *dst_buf, *host_buf;
  if (0 == rank) {
    CUDA_CHECK(cudaMalloc(&src_buf, MAX_MSG_SIZE));

    CUDA_CHECK(cudaSetDevice(remote_rank));
    CUDA_CHECK(cudaMalloc(&dst_buf, MAX_MSG_SIZE));
    CUDA_CHECK(cudaSetDevice(rank));

    CUDA_CHECK(cudaMallocHost(&host_buf, MAX_MSG_SIZE));

    // Create Streams and events
    for (int i=0; i<4; i++) {
      CUDA_CHECK(cudaStreamCreateWithFlags(&host_stream[D2H][i], cudaStreamNonBlocking));

      CUDA_CHECK(cudaSetDevice(remote_rank));
      CUDA_CHECK(cudaStreamCreateWithFlags(&host_stream[H2D][i], cudaStreamNonBlocking));
      CUDA_CHECK(cudaSetDevice(rank));

      CUDA_CHECK(cudaEventCreateWithFlags(&host_event[i], cudaEventDisableTiming));
    }

    // Start of the code we are asking about

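    // Stage each chunk into pinned host memory on its own D2H stream and record a per-chunk event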
    for (int i=0; i<4; i++) {
      int offset = i * MAX_MSG_CHUNK_SIZE;

      CUDA_CHECK(cudaMemcpyAsync((void*) &host_buf[offset],
                                 (void*) &src_buf[offset],
                                 MAX_MSG_CHUNK_SIZE,
                                 cudaMemcpyDeviceToHost,
                                 host_stream[D2H][i]));

      CUDA_CHECK(cudaEventRecord(host_event[i], host_stream[D2H][i]));
    }

    CUDA_CHECK(cudaSetDevice(remote_rank));

    for (int i=0; i<4; i++) {
      int offset = i * MAX_MSG_CHUNK_SIZE;

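      // Gate chunk i's H2D copy into GPU 1 on its own D2H copy finishing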
      CUDA_CHECK(cudaStreamWaitEvent(host_stream[H2D][i], host_event[i], 0));
      CUDA_CHECK(cudaMemcpyAsync((void*) &dst_buf[offset],
                                 (void*) &host_buf[offset],
                                 MAX_MSG_CHUNK_SIZE,
                                 cudaMemcpyHostToDevice,
                                 host_stream[H2D][i]));
    }

    for (int i=0; i<4; i++) {
      CUDA_CHECK(cudaStreamSynchronize(host_stream[H2D][i]));
    }

    CUDA_CHECK(cudaSetDevice(rank));
  }

  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();

  return 0;
}

With this single-process version, the profiler shows the copies pipelining as expected:

Question:
Is it possible to get my CUDA IPC code to pipeline like the example that only transfers data within a single process?