cudaMemcpyAsync makes code faster even when using the default stream 0

To verify the guidance on CUDA streams in the CUDA Programming Guide, I wrote up and tested a relatively simple program whose outputs seems to go against what the Guide says:

Kernel launches and host ↔ device memory copies that do not specify any stream parameter, or equivalently that set the stream parameter to zero, are issued to the default stream. They are therefore executed in order.

From the above, I assumed that if my program issues kernel launches and memory copies using cudaMemcpyAsync() to the default stream, then it will not be any faster than just using cudaMemcpy.

My program tests this assumption as follows (pseudocode below, full code at the very bottom of my post):

pinned_memory = // from cudaMallocHost()

num_iter = // from command line

// time_synchronous()
for num_iter
   dummy_compute_kernel()
   cudaMemcpy()
   cudaMemcpy()
end

// time_asynchronous()
for num_iter
   dummy_compute_kernel()
   cudaMemcpyAsync()
   cudaMemcpyAsync()
end

// time_async_then_sync()
for num_iter
   dummy_compute_kernel()
   cudaMemcpyAsync()
   cudaMemcpy()
end

I do not specify any stream parameter, so I would expect time_synchronous(), time_asynchronous() and time_async_then_sync() to all have similar execution times but this is not the case. Instead, the execution times ranked from lowest to highest were (on a GTX 2070):

  1. time_asynchronous()
  2. time_async_then_sync()
  3. time_synchronous().

Why is cudaMemcpyAsync() faster than cudaMemcpy() when I am only using the default stream 0?

As an important follow up question, if there is a cudaMemcpy(..., cudaMemcpyDefault) call in some code, will it serialise any further cudaMemcpyAsync(...,cudaMemcpyDefault) calls?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cstdio"
#include "cstdlib"
#include "cstring"

#define ELEMS (1 << 20)
#define THREADS_PER_BLOCK 256

#define CHECK_CUDA_ERROR(ans) { cuda_assert( ans, __FILE__, __LINE__ ); }

inline
void cuda_assert
(
	cudaError_t error,
	const char* file,
	int         line
)
{
	if (error != cudaSuccess)
	{
		fprintf(stderr, "CUDA error: %s, %s, %d\n", cudaGetErrorString(error), file, line);

		exit(error);
	}
}

void* malloc_device(size_t bytes);

void* malloc_pinned(size_t bytes);

cudaError_t cu_copy
(
	void*  dst,
	void*  src,
	size_t bytes
);

cudaError_t cu_copy_async
(
	void*  dst,
	void*  src,
	size_t bytes
);

__global__
void dummy_compute_kernel(double* d_array);

void time_synchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
);

void time_asynchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
);

void time_async_then_sync
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
);

int main
(
	int    argc,
	char** argv
)
{
	if (argc < 2)
	{
		fprintf(stderr, "\nPlease enter the number of iterations as a command line argument.\n");
		exit(-1);
	}
	
	const int    num_iter   = strtol(argv[1], nullptr, 10);
	const int    num_blocks = ELEMS / THREADS_PER_BLOCK;
	const size_t bytes      = sizeof(double) * ELEMS;

	double* h_array = (double*)malloc_pinned(bytes);

	double* d_array = (double*)malloc_device(bytes);

	CHECK_CUDA_ERROR( cu_copy(d_array, h_array, bytes) );

	time_synchronous
	(
		d_array, 
		h_array, 
		num_iter,
		num_blocks, 
		bytes
	);

	time_asynchronous
	(
		d_array,
		h_array,
		num_iter,
		num_blocks,
		bytes
	);
	
	time_async_then_sync
	(
		d_array,
		h_array,
		num_iter,
		num_blocks,
		bytes
	);

	CHECK_CUDA_ERROR( cudaFreeHost(h_array) );
	CHECK_CUDA_ERROR( cudaFree(d_array) );	

	return 0;
}

void* malloc_device(size_t bytes)
{
	void* ptr;

	CHECK_CUDA_ERROR( cudaMalloc(&ptr, bytes) );

	return ptr;
}

void* malloc_pinned(size_t bytes)
{
	void* ptr;

	CHECK_CUDA_ERROR( cudaMallocHost(&ptr, bytes) );

	memset(ptr, 0, bytes);

	return ptr;
}

cudaError_t cu_copy
(
	void*  dst,
	void*  src,
	size_t bytes
)
{
	return cudaMemcpy(dst, src, bytes, cudaMemcpyDefault);
}

cudaError_t cu_copy_async
(
	void*  dst,
	void*  src,
	size_t bytes
)
{
	return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault);
}

__global__
void dummy_compute_kernel(double* d_array)
{
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;

	d_array[idx] += cos( (double)idx );
}

void time_synchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
)
{
	cudaEvent_t cuda_begin, cuda_end;
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_end) );

	cudaEventRecord(cuda_begin);

	for (int i = 0; i < num_iter; i++)
	{
		dummy_compute_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(d_array);

		CHECK_CUDA_ERROR( cu_copy(h_array, d_array, bytes) );
		CHECK_CUDA_ERROR( cu_copy(h_array, d_array, bytes) );
	}

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_end) );
	CHECK_CUDA_ERROR( cudaEventSynchronize(cuda_end) );

	float cuda_time = 0;

	CHECK_CUDA_ERROR( cudaEventElapsedTime(&cuda_time, cuda_begin, cuda_end) );

	printf("\nSynchronous runtime: %f ms\n", cuda_time);
}

void time_asynchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
)
{
	cudaEvent_t cuda_begin, cuda_end;
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_end) );

	cudaEventRecord(cuda_begin);

	for (int i = 0; i < num_iter; i++)
	{
		dummy_compute_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(d_array);

		CHECK_CUDA_ERROR( cu_copy_async(h_array, d_array, bytes) );
		CHECK_CUDA_ERROR( cu_copy_async(h_array, d_array, bytes) );
	}

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_end) );
	CHECK_CUDA_ERROR( cudaEventSynchronize(cuda_end) );

	float cuda_time = 0;

	CHECK_CUDA_ERROR( cudaEventElapsedTime(&cuda_time, cuda_begin, cuda_end) );

	printf("\nAsynchronous runtime: %f ms\n", cuda_time);
}

void time_async_then_sync
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
)
{
	cudaEvent_t cuda_begin, cuda_end;
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_end) );

	cudaEventRecord(cuda_begin);

	for (int i = 0; i < num_iter; i++)
	{
		dummy_compute_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(d_array);

		CHECK_CUDA_ERROR( cu_copy_async(h_array, d_array, bytes) );
		CHECK_CUDA_ERROR( cu_copy      (h_array, d_array, bytes) );
	}

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_end) );
	CHECK_CUDA_ERROR( cudaEventSynchronize(cuda_end) );

	float cuda_time = 0;

	CHECK_CUDA_ERROR( cudaEventElapsedTime(&cuda_time, cuda_begin, cuda_end) );

	printf("\nAsynchronous then synchronous runtime: %f ms\n", cuda_time);
}

cudaMemcpyAsync can be asynchronous, as the name suggests. It can return before the transfer is finished. This allows better overlap between gpu work and cpu work (cuda api overhead). In contrast, cudaMemcpy will block the current cpu thread until the transfer is complete.

This is not directly related to cuda stream semantics.

You should be able to verify the different behaviours in a profile such as nsight-systems