Is concurrent cudaMemcpyAsync possible?

I first asked this question on Stack Overflow[1] but it looks like this is a better place to find some help. Sorry for duplicating the question.

When I tried to issue concurrent cudaMemcpyAsync calls in a single context, the copy operations queued up and were executed one by one with a throughput of 12.4 GB/s, which is consistent with the answer here[2]. The screenshot is here[3].

#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

/**
 * Reports a failed CUDA runtime call and terminates the program.
 *
 * @param result  status code returned by a CUDA runtime API call
 * @return        `result` unchanged when it equals cudaSuccess
 *
 * For any other status the CUDA error string is written to stderr and the
 * process is aborted. abort() is used instead of assert() so the check is
 * not compiled out when the program is built with -DNDEBUG.
 */
inline
cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    abort();
  }
  return result;
}

// Benchmark configuration and state shared by main() and the worker threads.
const int nStreams = 4;        // number of streams, device buffers and worker threads
const int N = 100000000;       // number of int elements in each transfer
// Size of one transfer in bytes. Computed and stored as size_t (the type the
// CUDA allocation/copy calls take) so the product is never narrowed to int.
const size_t bytes = (size_t)N * sizeof(int);
int* arr_H;                    // host source buffer, allocated with cudaMallocHost in main()
int* arr_D[nStreams];          // one device destination buffer per stream
cudaStream_t stream[nStreams]; // one stream per worker thread
int args[nStreams];            // per-thread index, passed by address to worker()
pthread_t threads[nStreams];   // handles of the worker threads

/**
 * Thread entry point: enqueues a single host-to-device copy.
 *
 * @param arg  pointer to an int holding the index of the stream and device
 *             buffer this thread should use
 * @return     always NULL
 *
 * The copy is only enqueued on stream[index]; this function does not wait
 * for it to complete.
 */
void* worker(void *arg)
{
  const int *slot = static_cast<const int *>(arg);
  const int idx = *slot;

  int *dst = arr_D[idx];
  cudaStream_t queue = stream[idx];
  checkCuda(cudaMemcpyAsync(dst, arr_H, bytes, cudaMemcpyHostToDevice, queue));

  return NULL;
}

/**
 * Single-process experiment: nStreams threads each enqueue one host-to-device
 * cudaMemcpyAsync of `bytes` bytes from the same page-locked host buffer, each
 * on its own stream.
 *
 * @return 0 on success; the process exits with a failure status if a thread
 *         cannot be created or joined.
 */
int main()
{
  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaStreamCreate(&stream[i]));

  // Page-locked host buffer used as the source of every copy.
  checkCuda(cudaMallocHost((void**)&arr_H, bytes));
  for (int i = 0; i < N; i++)
    arr_H[i] = random();

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

  for (int i = 0; i < nStreams; i++) {
    args[i] = i;
    int rc = pthread_create(&threads[i], NULL, worker, &args[i]);
    if (rc != 0) {
      fprintf(stderr, "pthread_create failed: %d\n", rc);
      exit(EXIT_FAILURE);
    }
  }

  for (int i = 0; i < nStreams; i++) {
    int rc = pthread_join(threads[i], NULL);
    if (rc != 0) {
      fprintf(stderr, "pthread_join failed: %d\n", rc);
      exit(EXIT_FAILURE);
    }
  }

  // The workers only enqueue their copies. Wait for every stream explicitly
  // before releasing the buffers instead of relying on any implicit
  // synchronization in the free calls; this also surfaces asynchronous
  // copy errors through checkCuda.
  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaStreamSynchronize(stream[i]));

  checkCuda(cudaFreeHost(arr_H));
  for (int i = 0; i < nStreams; i++) {
    checkCuda(cudaStreamDestroy(stream[i]));
    checkCuda(cudaFree(arr_D[i]));
  }

  return 0;
}

But when I tried to do concurrent cudaMemcpyAsync in different contexts (by separating them into 4 processes), it seems that the first and the last one are running concurrently. The first 2 sequential cudaMemcpyAsync calls run with a throughput of 12.4 GB/s, while the last 2 concurrent ones run with a throughput of 5.3 GB/s. The screenshot is here[4].

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

/**
 * Reports a failed CUDA runtime call and terminates the program.
 *
 * @param result  status code returned by a CUDA runtime API call
 * @return        `result` unchanged when it equals cudaSuccess
 *
 * For any other status the CUDA error string is written to stderr and the
 * process is aborted. abort() is used instead of assert() so the check is
 * not compiled out when the program is built with -DNDEBUG.
 */
inline
cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    abort();
  }
  return result;
}

/**
 * Per-process experiment: enqueues one host-to-device cudaMemcpyAsync of
 * `bytes` bytes on each of nStreams streams (nStreams is 1 here), waits for
 * the copies to finish, and releases all resources.
 *
 * @return 0 on success; checkCuda handles any failing CUDA call.
 */
int main()
{
  const int nStreams = 1;
  const int N = 100000000;
  // size_t is the type the CUDA allocation/copy calls take; computing the
  // size in size_t avoids narrowing the product to a signed int.
  const size_t bytes = (size_t)N * sizeof(int);
  int* arr_H;
  int* arr_D[nStreams];
  cudaStream_t stream[nStreams];

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaStreamCreate(&stream[i]));

  // Page-locked host buffer used as the source of every copy.
  checkCuda(cudaMallocHost((void**)&arr_H, bytes));
  for (int i = 0; i < N; i++)
    arr_H[i] = random();

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes, cudaMemcpyHostToDevice, stream[i]));

  // The copies above are only enqueued. Wait for them explicitly before
  // releasing the buffers instead of relying on any implicit synchronization
  // in the free calls; this also surfaces asynchronous copy errors.
  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaStreamSynchronize(stream[i]));

  checkCuda(cudaFreeHost(arr_H));
  for (int i = 0; i < nStreams; i++) {
    checkCuda(cudaStreamDestroy(stream[i]));
    checkCuda(cudaFree(arr_D[i]));
  }

  return 0;
}

I used a python script to run multiple processes concurrently:

#!/usr/bin/env python3
"""Launch N copies of ./a.out concurrently and wait for all of them."""
import subprocess
import sys

N = 4  # number of processes started at the same time

# The argument-list form starts the binary directly, without an intermediate
# shell process between this script and ./a.out.
processes = [subprocess.Popen(['./a.out']) for _ in range(N)]

# Wait for every child (not just up to the first failure) and keep the exit
# codes so a crashed or aborted run is not silently ignored.
exit_codes = [process.wait() for process in processes]

if any(exit_codes):
    sys.exit(1)

I’m just wondering whether it is possible to do concurrent cudaMemcpyAsync within a single context.

The background is that I’m writing a concurrent, single-context CUDA program that issues memcpys for both large chunks and really small chunks. The small chunks may get delayed if a larger one gets transferred first, so it would be much easier if several cudaMemcpyAsync calls could be executed concurrently.

I’m using CUDA 9.0 on TITAN Xp, which has 2 copy engines.

[1] https://stackoverflow.com/questions/55467942/is-concurrent-cudamemcpyasync-possible
[2] https://stackoverflow.com/questions/18893479/cudamemcpyasync-and-streams-behaviour-understanding
[3] https://i.stack.imgur.com/1llhj.png
[4] https://i.stack.imgur.com/0nQTO.png

Any updates on this question?

I noticed data less than 64KB can be transferred via in-band transport[1]. But I’m copying 400MB data, which probably has to be sent through copy engine.

[1] https://devtalk.nvidia.com/default/topic/1045814/cuda-programming-and-performance/two-concurrent-htod-copies-in-titan-x-pascal-with-2-copy-engines/

It’s not possible. And the supposed concurrency you are witnessing in the multi-process case is an artifact of the profiler coupled with your interpretation assumptions, and the fact that the 2 supposed concurrent copies are each running at half the normal bandwidth should be a good clue that there is no magic here.

There are various questions/answers on the web stating that with 2 copy engines, only one transfer per direction is possible. Multi-process doesn’t change that, and even if it could, it’s not clear to me that it would provide any benefit without some stated specification and guarantees of behavior in that scenario. There are none.