Is cudaMemcpyAsync + cudaStreamSynchronize on default stream equal to cudaMemcpy (non-async)

Hi,
Someone on GitHub told me that cudaMemcpyAsync + cudaStreamSynchronize on the default stream is equal to cudaMemcpy (non-async); below is the supposed implementation of cudaMemcpy.

// Model of the non-async cudaMemcpy in terms of the async API:
// enqueue on the legacy default stream (stream 0), then block until it drains.
__host__ cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ){
    // A synchronous failure (e.g. cudaErrorInvalidValue) must not be masked:
    // if nothing was enqueued, cudaStreamSynchronize(0) could still return
    // cudaSuccess, so propagate the enqueue error explicitly.
    cudaError_t err = cudaMemcpyAsync(dst,src,count,kind,0);
    if (err != cudaSuccess) return err;
    return cudaStreamSynchronize(0);
}

However, I have doubts about it.
I find description of default stream on this site
https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html#stream-sync-behavior.
The legacy default stream is an implicit stream which synchronizes with all other streams in the same CUcontext except for non-blocking streams
So, I think cudaMemcpyAsync + cudaStreamSynchronize will cost more than cudaMemcpy (non-async).
Am I right ?
looking forward to any reply.
Thank you.

What performance differences did you find when you tried these two variants? Practical experiments are a valuable check on thought experiments.

My memory is a bit hazy (and hopefully someone more knowledgeable will correct me if I am wrong here), but I am reasonably sure cudaMemcpy() includes an implicit cudaDeviceSynchronize(). Which means it waits for everything. As a consequence, I would generally expect higher performance from cudaMemcpyAsync() + cudaStreamSynchronize() versus use of cudaMemcpy(). But in many use cases, there may not be a discernible performance difference outside of measurement noise (±2%).

Hi, njuffa
Thank you for the reply. I have studied cudaMemcpy, and it seems that cudaMemcpy doesn’t include an implicit cudaDeviceSynchronize.
This is my test code.

#include <cuda_runtime.h>
#include <cassert>
#include <iostream>

typedef unsigned long long int ullint;

// Busy-work kernel: every launched thread atomically bumps *ptr 100000 times,
// so the total is (gridDim.x * blockDim.x * 100000) once the kernel finishes.
// The heavy atomic contention is deliberate — it keeps the kernel running long
// enough for the host to observe whether a later API call waited for it.
__global__ void sum_test(ullint *ptr){
    int remaining = 100000;
    while (remaining-- > 0) {
        atomicAdd(ptr, 1ULL);
    }
}

int main(){
    // Managed counter that the kernel increments; the host polls it to see
    // whether cudaMemcpy waited for the non-blocking stream.
    ullint* counter;
    assert(cudaMallocManaged(&counter,sizeof(ullint))==cudaSuccess);
    *counter = 0;

    // Dummy host->device copy endpoints, used only to have something for
    // cudaMemcpy to transfer.
    ullint host_src = 0;
    ullint *device_dst;
    assert(cudaMalloc(&device_dst,sizeof(ullint))==cudaSuccess);

    // Non-blocking stream: work here is NOT synchronized by operations on the
    // legacy default stream.
    cudaStream_t work_stream;
    assert(cudaStreamCreateWithFlags(&work_stream,cudaStreamNonBlocking)==cudaSuccess);

    sum_test<<<128,128,0,work_stream>>>(counter);
    std::cout << "start kernel, sum = " << *counter << std::endl;

    // cudaMemcpy synchronizes only the legacy default stream; the kernel on
    // the non-blocking stream keeps running, so the read below can still see 0.
    // NOTE(review): touching managed memory on the host while a kernel is
    // in flight requires concurrent managed access support — confirm the
    // target device/OS before relying on this demo.
    assert(cudaMemcpy(device_dst,&host_src,sizeof(ullint),cudaMemcpyDefault)==cudaSuccess);

    std::cout << "finish, sum = " << *counter << std::endl;

    assert(cudaStreamDestroy(work_stream)==cudaSuccess);
    assert(cudaFree(counter)==cudaSuccess);
    assert(cudaFree(device_dst)==cudaSuccess);
    return 0;
}

If cudaMemcpy included an implicit cudaDeviceSynchronize, the code should output something like this:

finish, sum = 1638400000

However the actual output is this:

start kernel, sum = 0
finish, sum = 0

Then I replaced cudaMemcpy with cudaDeviceSynchronize, and the output is correct.

#include <cuda_runtime.h>
#include <cassert>
#include <iostream>

typedef unsigned long long int ullint;

// Long-running kernel: each thread performs 100000 atomic increments of *ptr.
// With a <<<128,128>>> launch the final value is 128*128*100000 = 1638400000.
__global__ void sum_test(ullint *ptr){
    for (int iter = 0; iter != 100000; ++iter) {
        atomicAdd(ptr, 1ULL);
    }
}

int main(){
    // Managed counter incremented by the kernel; read from the host to check
    // whether the sync call below actually waited for the kernel.
    ullint* sum_counter;
    assert(cudaMallocManaged(&sum_counter,sizeof(ullint))==cudaSuccess);
    *sum_counter = 0;

    // Copy endpoints kept from the previous variant (unused while the
    // cudaMemcpy line stays commented out).
    ullint host_val = 0;
    ullint *dev_val;
    assert(cudaMalloc(&dev_val,sizeof(ullint))==cudaSuccess);

    // The kernel runs on a non-blocking stream, which default-stream
    // synchronization would NOT wait for.
    cudaStream_t nb_stream;
    assert(cudaStreamCreateWithFlags(&nb_stream,cudaStreamNonBlocking)==cudaSuccess);

    sum_test<<<128,128,0,nb_stream>>>(sum_counter);
    std::cout << "start kernel, sum = " << *sum_counter << std::endl;

    // Unlike the cudaMemcpy variant (commented out below),
    // cudaDeviceSynchronize waits for ALL prior work on the device —
    // including the non-blocking stream — so the next read sees the full sum.
    // assert(cudaMemcpy(dev_val,&host_val,sizeof(ullint),cudaMemcpyDefault)==cudaSuccess);
    assert(cudaDeviceSynchronize()==cudaSuccess);

    std::cout << "finish, sum = " << *sum_counter << std::endl;

    assert(cudaStreamDestroy(nb_stream)==cudaSuccess);
    assert(cudaFree(sum_counter)==cudaSuccess);
    assert(cudaFree(dev_val)==cudaSuccess);
    return 0;
}

So, I believe cudaMemcpy doesn’t include an implicit cudaDeviceSynchronize.

@yefu.chen.

You are correct, cudaMemcpy doesn’t include an implicit cudaDeviceSynchronize. If you’re writing straight CUDA, nothing I can think of at the moment includes an implicit cudaDeviceSynchronize. Maybe some CUDA libraries do. That’s not to say a function can’t be blocking, like cudaMemcpy.

Please read https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/ for more details.

@mnicely
thanks for your reply,
but I’m still confused: is cudaMemcpyAsync + cudaStreamSynchronize on the default stream equal to cudaMemcpy (non-async)?

cudaMemcpy includes a synchronization on the default stream. It does not include the equivalent of cudaDeviceSynchronize().

Normally, a synchronization on the default stream synchronizes all other created streams on that device. However, if you create a stream with the cudaStreamNonBlocking flag, that stream will not be synchronized by a synchronization in the default stream.

However, the non-blocking stream will be synchronized by cudaDeviceSynchronize(), which synchronizes all previously issued work to that device, regardless of stream.

Thanks for your reply.
So cudaMemcpy is equal to cudaMemcpyAsync + cudaStreamSynchronize, and the code below is the implementation of cudaMemcpy.

// cudaMemcpy modeled as an async enqueue on the legacy default stream (0)
// followed by a blocking wait on that stream.
__host__ cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ){
    // Propagate an immediate enqueue failure rather than discarding it;
    // otherwise cudaStreamSynchronize(0) could report cudaSuccess even though
    // no copy was ever issued.
    cudaError_t err = cudaMemcpyAsync(dst,src,count,kind,0);
    if (err != cudaSuccess) return err;
    return cudaStreamSynchronize(0);
}

Okay, I’ve modified the example at https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/ to create two simple scenarios that indeed show cudaMemcpy is equal to cudaMemcpyAsync + cudaStreamSynchronize. One is with default stream creation and one with the cudaStreamNonBlocking flag. I’ve added screenshots of Nsight Systems as well.

I can’t guarantee that it is true in every scenario.

...

// Create one stream per chunk. With cudaStreamNonBlocking these streams do
// NOT synchronize with the legacy default stream; swap in the commented-out
// cudaStreamDefault line to get the legacy (blocking) semantics instead.
// NOTE(review): nStreams, stream[], and checkCuda are defined outside this
// excerpt — presumably as in the linked devblogs example; confirm there.
for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking) );
//  checkCuda( cudaStreamCreateWithFlags(&stream[i], cudaStreamDefault) );

...

  // Variant 1: fully asynchronous pipeline. Each of the four chunks gets its
  // own stream for H2D copy -> kernel -> D2H copy, so chunks can overlap.
  // Events on stream 0 bracket the whole run for timing.
  int offset { };
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent,0) );
  // Chunk 0 on stream[0].
  offset = 0 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[0]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[0]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[0]) );
  // Chunk 1 on stream[1].
  offset = 1 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[1]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[1]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[1]) );
  // Chunk 2 on stream[2].
  offset = 2 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[2]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[2]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[2]) );
  // Chunk 3 on stream[3].
  offset = 3 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[3]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[3]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[3]) );
  // Stop the clock once all queued work has drained.
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));
  
  // Variant 2: same pipeline, but chunk 2 uses blocking cudaMemcpy and a
  // default-stream kernel launch, so its behavior relative to the other
  // streams depends on whether they were created with cudaStreamNonBlocking.
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent,0) );
  // Chunk 0 on stream[0] (async).
  offset = 0 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[0]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[0]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[0]) );
  // Chunk 1 on stream[1] (async).
  offset = 1 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[1]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[1]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[1]) );
  // Chunk 2: blocking cudaMemcpy + legacy default stream on purpose — this is
  // the case under discussion.
  offset = 2 * streamSize;
  checkCuda( cudaMemcpy(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice) );
  kernel<<<streamSize/blockSize, blockSize, 0>>>(d_a, offset);
  checkCuda( cudaMemcpy(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost) );
  // Chunk 3 on stream[3] (async).
  offset = 3 * streamSize;
  checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[3]) );
  kernel<<<streamSize/blockSize, blockSize, 0, stream[3]>>>(d_a, offset);
  checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[3]) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  // Fixed copy-paste bug: this label previously said "V1", making the two
  // experiments' timing outputs indistinguishable.
  printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));