I first asked this question on Stack Overflow[1] but it looks like this is a better place to find some help. Sorry for duplicating the question.
When I try to issue concurrent cudaMemcpyAsync calls within a single context, the copies queue up and execute one by one at 12.4 GB/s, which is consistent with the answer here[2]. The screenshot is here[3].
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cuda_runtime.h>

inline cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
    return result;
}

const int nStreams = 4;
const int N = 100000000;
const int bytes = N * sizeof(int);

int* arr_H;
int* arr_D[nStreams];
cudaStream_t stream[nStreams];
int args[nStreams];
pthread_t threads[nStreams];

// Each thread enqueues one async host-to-device copy on its own stream.
void* worker(void *arg)
{
    int i = *((int *)arg);
    checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes, cudaMemcpyHostToDevice, stream[i]));
    return NULL;
}

int main()
{
    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaStreamCreate(&stream[i]));

    checkCuda(cudaMallocHost((void**)&arr_H, bytes));   // pinned host buffer
    for (int i = 0; i < N; i++)
        arr_H[i] = random();

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

    for (int i = 0; i < nStreams; i++) {
        args[i] = i;
        pthread_create(&threads[i], NULL, worker, &args[i]);
    }
    for (int i = 0; i < nStreams; i++)
        pthread_join(threads[i], NULL);

    // Joining the threads only guarantees the copies were enqueued;
    // wait for them to finish before freeing the pinned buffer.
    checkCuda(cudaDeviceSynchronize());

    cudaFreeHost(arr_H);
    for (int i = 0; i < nStreams; i++) {
        checkCuda(cudaStreamDestroy(stream[i]));
        cudaFree(arr_D[i]);
    }
    return 0;
}
But when I issue the cudaMemcpyAsync calls from different contexts (by splitting them across 4 processes), some of the copies do overlap: the first two cudaMemcpyAsync transfers run sequentially at 12.4 GB/s each, while the last two run concurrently at 5.3 GB/s each. The screenshot is here[4].
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cuda_runtime.h>

inline cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
    return result;
}

int main()
{
    const int nStreams = 1;
    const int N = 100000000;
    const int bytes = N * sizeof(int);

    int* arr_H;
    int* arr_D[nStreams];
    cudaStream_t stream[nStreams];

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaStreamCreate(&stream[i]));

    checkCuda(cudaMallocHost((void**)&arr_H, bytes));   // pinned host buffer
    for (int i = 0; i < N; i++)
        arr_H[i] = random();

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes, cudaMemcpyHostToDevice, stream[i]));

    // Wait for the async copies to finish before freeing the pinned buffer.
    checkCuda(cudaDeviceSynchronize());

    cudaFreeHost(arr_H);
    for (int i = 0; i < nStreams; i++) {
        checkCuda(cudaStreamDestroy(stream[i]));
        cudaFree(arr_D[i]);
    }
    return 0;
}
I used a Python script to run multiple processes concurrently:
#!/usr/bin/env python3
import subprocess
N = 4
processes = [subprocess.Popen('./a.out', shell=True) for _ in range(N)]
for process in processes:
    process.wait()
I’m just wondering: is it possible to do concurrent cudaMemcpyAsync within a single context?
The background is that I’m writing a concurrent, single-context CUDA program that issues memcpys for both large chunks and really small chunks. The small chunks may get delayed if a large one is transferred first, so it would be much easier if several cudaMemcpyAsync calls could run concurrently.
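To make the scenario concrete, here is a minimal sketch of what I mean; the sizes, buffer names and stream names are made up for illustration and are not from the real program:

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    const size_t bigBytes   = 256UL * 1024 * 1024;  // illustrative large chunk
    const size_t smallBytes = 64UL * 1024;          // illustrative small chunk

    char *big_H, *small_H, *big_D, *small_D;
    cudaMallocHost((void**)&big_H, bigBytes);       // pinned host buffers
    cudaMallocHost((void**)&small_H, smallBytes);
    cudaMalloc((void**)&big_D, bigBytes);
    cudaMalloc((void**)&small_D, smallBytes);

    cudaStream_t bigStream, smallStream;
    cudaStreamCreate(&bigStream);
    cudaStreamCreate(&smallStream);

    // The large copy is enqueued first; the small copy sits in a different
    // stream, but both go in the same (host-to-device) direction, so the
    // small one queues behind the large one instead of overlapping with it.
    cudaMemcpyAsync(big_D, big_H, bigBytes, cudaMemcpyHostToDevice, bigStream);
    cudaMemcpyAsync(small_D, small_H, smallBytes, cudaMemcpyHostToDevice, smallStream);

    cudaDeviceSynchronize();

    cudaStreamDestroy(bigStream);
    cudaStreamDestroy(smallStream);
    cudaFree(big_D);      cudaFree(small_D);
    cudaFreeHost(big_H);  cudaFreeHost(small_H);
    return 0;
}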
I’m using CUDA 9.0 on a TITAN Xp, which has 2 copy engines.
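For reference, the copy-engine count can be confirmed by querying cudaDeviceProp::asyncEngineCount; this is just a small standalone check, separate from the programs above:

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // device 0
    // asyncEngineCount reports how many async copy engines the GPU exposes.
    printf("%s: %d copy engine(s)\n", prop.name, prop.asyncEngineCount);
    return 0;
}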
[1] https://stackoverflow.com/questions/55467942/is-concurrent-cudamemcpyasync-possible
[2] https://stackoverflow.com/questions/18893479/cudamemcpyasync-and-streams-behaviour-understanding
[3] https://i.stack.imgur.com/1llhj.png
[4] https://i.stack.imgur.com/0nQTO.png