cudaMemcpyAsync makes code faster even when using the default stream 0

al0vya · January 10, 2022, 5:04pm

To verify the guidance on CUDA streams in the CUDA Programming Guide, I wrote and tested a relatively simple program whose output seems to contradict what the Guide says:

Kernel launches and host ↔ device memory copies that do not specify any stream parameter, or equivalently that set the stream parameter to zero, are issued to the default stream. They are therefore executed in order.

From the above, I assumed that if my program issues kernel launches and memory copies using cudaMemcpyAsync() to the default stream, then it will not be any faster than just using cudaMemcpy.

My program tests this assumption as follows (pseudocode below, full code at the very bottom of my post):

pinned_memory = // from cudaMallocHost()

num_iter = // from command line

// time_synchronous()
for num_iter
   dummy_compute_kernel()
   cudaMemcpy()
   cudaMemcpy()
end

// time_asynchronous()
for num_iter
   dummy_compute_kernel()
   cudaMemcpyAsync()
   cudaMemcpyAsync()
end

// time_async_then_sync()
for num_iter
   dummy_compute_kernel()
   cudaMemcpyAsync()
   cudaMemcpy()
end

I do not specify any stream parameter, so I would expect time_synchronous(), time_asynchronous() and time_async_then_sync() to all have similar execution times, but this is not the case. Instead, the execution times ranked from lowest to highest were (on an RTX 2070):

time_asynchronous()
time_async_then_sync()
time_synchronous().

Why is cudaMemcpyAsync() faster than cudaMemcpy() when I am only using the default stream 0?

As an important follow-up question: if there is a cudaMemcpy(..., cudaMemcpyDefault) call in some code, will it serialise any further cudaMemcpyAsync(..., cudaMemcpyDefault) calls?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cstdio"
#include "cstdlib"
#include "cstring"

#define ELEMS (1 << 20)
#define THREADS_PER_BLOCK 256

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if the call did not return cudaSuccess.
//
// The expansion is wrapped in do { ... } while (0) so that
// `CHECK_CUDA_ERROR(x);` is exactly one statement; a bare `{ ... }` followed
// by `;` would break when used as the body of an unbraced if/else.
#define CHECK_CUDA_ERROR(ans) do { cuda_assert( (ans), __FILE__, __LINE__ ); } while (0)

// Reports a failed CUDA runtime call and exits.
//
// error: status returned by the CUDA runtime call being checked.
// file:  source file of the call site (supplied by CHECK_CUDA_ERROR).
// line:  source line of the call site (supplied by CHECK_CUDA_ERROR).
//
// Returns normally only when error == cudaSuccess; otherwise prints the
// error string and call site to stderr and exits with the error code as the
// process exit status.
inline
void cuda_assert
(
	cudaError_t error,
	const char* file,
	int         line
)
{
	if (error == cudaSuccess)
	{
		return;
	}

	fprintf(stderr, "CUDA error: %s, %s, %d\n", cudaGetErrorString(error), file, line);

	exit(error);
}

// Allocates `bytes` bytes with cudaMalloc(); the program exits if the allocation fails.
void* malloc_device(size_t bytes);

// Allocates `bytes` bytes with cudaMallocHost() and zero-fills them with memset();
// the program exits if the allocation fails.
void* malloc_pinned(size_t bytes);

// Copies `bytes` bytes from src to dst with cudaMemcpy() using cudaMemcpyDefault.
// Returns the status of the cudaMemcpy() call.
cudaError_t cu_copy
(
	void*  dst,
	void*  src,
	size_t bytes
);

// Copies `bytes` bytes from src to dst with cudaMemcpyAsync() using cudaMemcpyDefault;
// no stream argument is passed. Returns the status of the cudaMemcpyAsync() call.
cudaError_t cu_copy_async
(
	void*  dst,
	void*  src,
	size_t bytes
);

// Each thread adds cos(idx) to d_array[idx], where idx is the thread's global 1D index.
__global__
void dummy_compute_kernel(double* d_array);

// Runs num_iter iterations of: kernel launch, then two cu_copy() calls from d_array to h_array.
// Prints the elapsed time in milliseconds measured with CUDA events.
void time_synchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
);

// Runs num_iter iterations of: kernel launch, then two cu_copy_async() calls from d_array to h_array.
// Prints the elapsed time in milliseconds measured with CUDA events.
void time_asynchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
);

// Runs num_iter iterations of: kernel launch, one cu_copy_async(), then one cu_copy(),
// both from d_array to h_array.
// Prints the elapsed time in milliseconds measured with CUDA events.
void time_async_then_sync
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
);

// Benchmark driver.
//
// Usage: <program> <num_iter>
//
// Allocates one pinned host buffer and one device buffer of ELEMS doubles,
// performs an initial host-to-device copy, then runs the three timing
// variants (synchronous, asynchronous, async-then-sync) with the same
// buffers and iteration count before releasing the buffers.
//
// Exits with -1 if the iteration count is missing or is not a positive
// integer that fits in an int.
int main
(
	int    argc,
	char** argv
)
{
	if (argc < 2)
	{
		fprintf(stderr, "\nPlease enter the number of iterations as a command line argument.\n");
		exit(-1);
	}

	// Parse strictly: strtol() returns 0 for non-numeric text and accepts
	// trailing garbage, either of which would silently yield a meaningless run.
	char*      parse_end = nullptr;
	const long requested = strtol(argv[1], &parse_end, 10);

	const bool not_a_number = (parse_end == argv[1]) || (*parse_end != '\0');
	const bool out_of_range = (requested < 1) || (requested != (long)(int)requested);

	if (not_a_number || out_of_range)
	{
		fprintf(stderr, "\nThe number of iterations must be a positive integer, got \"%s\".\n", argv[1]);
		exit(-1);
	}

	const int    num_iter   = (int)requested;
	const int    num_blocks = ELEMS / THREADS_PER_BLOCK;
	const size_t bytes      = sizeof(double) * ELEMS;

	double* h_array = (double*)malloc_pinned(bytes);

	double* d_array = (double*)malloc_device(bytes);

	// Initialise the device buffer from the zero-filled host buffer.
	CHECK_CUDA_ERROR( cu_copy(d_array, h_array, bytes) );

	time_synchronous
	(
		d_array,
		h_array,
		num_iter,
		num_blocks,
		bytes
	);

	time_asynchronous
	(
		d_array,
		h_array,
		num_iter,
		num_blocks,
		bytes
	);

	time_async_then_sync
	(
		d_array,
		h_array,
		num_iter,
		num_blocks,
		bytes
	);

	CHECK_CUDA_ERROR( cudaFreeHost(h_array) );
	CHECK_CUDA_ERROR( cudaFree(d_array) );

	return 0;
}

// Returns a buffer of `bytes` bytes obtained from cudaMalloc().
// A failed allocation is reported and terminates the program via CHECK_CUDA_ERROR.
void* malloc_device(size_t bytes)
{
	void* d_buffer;
	const cudaError_t status = cudaMalloc(&d_buffer, bytes);

	CHECK_CUDA_ERROR( status );

	return d_buffer;
}

// Returns a zero-filled buffer of `bytes` bytes obtained from cudaMallocHost().
// A failed allocation is reported and terminates the program via CHECK_CUDA_ERROR.
void* malloc_pinned(size_t bytes)
{
	void* h_buffer;
	const cudaError_t status = cudaMallocHost(&h_buffer, bytes);

	CHECK_CUDA_ERROR( status );

	// memset() returns its destination pointer, i.e. the freshly zeroed buffer.
	return memset(h_buffer, 0, bytes);
}

// Thin wrapper over cudaMemcpy(): copies `bytes` bytes from src to dst and
// lets the runtime infer the transfer direction (cudaMemcpyDefault).
// Returns the status reported by cudaMemcpy().
cudaError_t cu_copy
(
	void*  dst,
	void*  src,
	size_t bytes
)
{
	const cudaError_t status = cudaMemcpy(dst, src, bytes, cudaMemcpyDefault);

	return status;
}

// Thin wrapper over cudaMemcpyAsync(): enqueues a copy of `bytes` bytes from
// src to dst on stream 0 and lets the runtime infer the transfer direction
// (cudaMemcpyDefault).
// Returns the status reported by cudaMemcpyAsync().
cudaError_t cu_copy_async
(
	void*  dst,
	void*  src,
	size_t bytes
)
{
	// Spelled out for readability; 0 is the value the defaulted stream
	// parameter of cudaMemcpyAsync() takes when it is omitted.
	const cudaStream_t default_stream = 0;

	const cudaError_t status = cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault, default_stream);

	return status;
}

// Dummy workload: each thread adds cos(idx) to one element of d_array, where
// idx is the thread's global 1D index.
//
// Launch layout: 1D grid of 1D blocks; one thread per element.
// d_array: array of ELEMS doubles.
// Shared memory: none.
__global__
void dummy_compute_kernel(double* d_array)
{
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads past the end of the array do nothing, so the kernel stays
	// in bounds even if the launch covers more than ELEMS threads.
	if (idx >= ELEMS)
	{
		return;
	}

	d_array[idx] += cos( (double)idx );
}

// Times num_iter iterations of: kernel launch followed by two blocking
// cu_copy() transfers from d_array to h_array, and prints the elapsed time.
//
// d_array:    device buffer the kernel updates and the copies read from.
// h_array:    host buffer the copies write to.
// num_iter:   number of loop iterations to time.
// num_blocks: grid size for the kernel launch (block size is THREADS_PER_BLOCK).
// bytes:      size of each copy in bytes.
//
// Any CUDA error terminates the program via CHECK_CUDA_ERROR.
void time_synchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
)
{
	cudaEvent_t cuda_begin, cuda_end;
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_end) );

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_begin) );

	for (int i = 0; i < num_iter; i++)
	{
		dummy_compute_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(d_array);

		// A kernel launch returns no status; launch failures must be queried explicitly.
		CHECK_CUDA_ERROR( cudaGetLastError() );

		CHECK_CUDA_ERROR( cu_copy(h_array, d_array, bytes) );
		CHECK_CUDA_ERROR( cu_copy(h_array, d_array, bytes) );
	}

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_end) );
	CHECK_CUDA_ERROR( cudaEventSynchronize(cuda_end) );

	float cuda_time = 0;

	CHECK_CUDA_ERROR( cudaEventElapsedTime(&cuda_time, cuda_begin, cuda_end) );

	printf("\nSynchronous runtime: %f ms\n", cuda_time);

	// Release the events so repeated calls do not leak them.
	CHECK_CUDA_ERROR( cudaEventDestroy(cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventDestroy(cuda_end) );
}

// Times num_iter iterations of: kernel launch followed by two
// cu_copy_async() transfers from d_array to h_array, and prints the elapsed
// time.
//
// d_array:    device buffer the kernel updates and the copies read from.
// h_array:    host buffer the copies write to.
// num_iter:   number of loop iterations to time.
// num_blocks: grid size for the kernel launch (block size is THREADS_PER_BLOCK).
// bytes:      size of each copy in bytes.
//
// Any CUDA error terminates the program via CHECK_CUDA_ERROR.
void time_asynchronous
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
)
{
	cudaEvent_t cuda_begin, cuda_end;
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_end) );

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_begin) );

	for (int i = 0; i < num_iter; i++)
	{
		dummy_compute_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(d_array);

		// A kernel launch returns no status; launch failures must be queried explicitly.
		CHECK_CUDA_ERROR( cudaGetLastError() );

		CHECK_CUDA_ERROR( cu_copy_async(h_array, d_array, bytes) );
		CHECK_CUDA_ERROR( cu_copy_async(h_array, d_array, bytes) );
	}

	// The end event is recorded after all enqueued work; waiting on it
	// ensures the measured interval covers every iteration.
	CHECK_CUDA_ERROR( cudaEventRecord(cuda_end) );
	CHECK_CUDA_ERROR( cudaEventSynchronize(cuda_end) );

	float cuda_time = 0;

	CHECK_CUDA_ERROR( cudaEventElapsedTime(&cuda_time, cuda_begin, cuda_end) );

	printf("\nAsynchronous runtime: %f ms\n", cuda_time);

	// Release the events so repeated calls do not leak them.
	CHECK_CUDA_ERROR( cudaEventDestroy(cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventDestroy(cuda_end) );
}

// Times num_iter iterations of: kernel launch, one cu_copy_async() transfer,
// then one blocking cu_copy() transfer (both from d_array to h_array), and
// prints the elapsed time.
//
// d_array:    device buffer the kernel updates and the copies read from.
// h_array:    host buffer the copies write to.
// num_iter:   number of loop iterations to time.
// num_blocks: grid size for the kernel launch (block size is THREADS_PER_BLOCK).
// bytes:      size of each copy in bytes.
//
// Any CUDA error terminates the program via CHECK_CUDA_ERROR.
void time_async_then_sync
(
	double* d_array,
	double* h_array,
	int     num_iter,
	int     num_blocks,
	size_t  bytes
)
{
	cudaEvent_t cuda_begin, cuda_end;
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventCreate(&cuda_end) );

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_begin) );

	for (int i = 0; i < num_iter; i++)
	{
		dummy_compute_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(d_array);

		// A kernel launch returns no status; launch failures must be queried explicitly.
		CHECK_CUDA_ERROR( cudaGetLastError() );

		CHECK_CUDA_ERROR( cu_copy_async(h_array, d_array, bytes) );
		CHECK_CUDA_ERROR( cu_copy      (h_array, d_array, bytes) );
	}

	CHECK_CUDA_ERROR( cudaEventRecord(cuda_end) );
	CHECK_CUDA_ERROR( cudaEventSynchronize(cuda_end) );

	float cuda_time = 0;

	CHECK_CUDA_ERROR( cudaEventElapsedTime(&cuda_time, cuda_begin, cuda_end) );

	printf("\nAsynchronous then synchronous runtime: %f ms\n", cuda_time);

	// Release the events so repeated calls do not leak them.
	CHECK_CUDA_ERROR( cudaEventDestroy(cuda_begin) );
	CHECK_CUDA_ERROR( cudaEventDestroy(cuda_end) );
}

striker159 · January 10, 2022, 7:49pm

cudaMemcpyAsync can be asynchronous, as the name suggests. It can return before the transfer is finished. This allows better overlap between GPU work and CPU work (CUDA API overhead). In contrast, cudaMemcpy will block the current CPU thread until the transfer is complete.

This is not directly related to cuda stream semantics.

You should be able to verify the different behaviours with a profiler such as Nsight Systems.

Topic		Replies	Views
cudaMemcpyAsync CUDA Programming and Performance	10	20517	October 16, 2015
Is cudaMemcpyAsync + cudaStreamSynchronize on default stream equal to cudaMemcpy (non-async) CUDA Programming and Performance	7	3728	December 12, 2019
cudaMemcpyAsync, unexpected behaviour while using cudaStreamNonBlocking? CUDA Programming and Performance	6	2042	May 29, 2018
Memory copy/set async to kernel execution in different stream CUDA Programming and Performance	5	1001	December 15, 2022
is kernel in stream 0 asynchronous? CUDA Programming and Performance	10	3710	April 23, 2011
cudaDeviceSynchronize needed between kernel launch and cudaMemcpy ? CUDA Programming and Performance	15	16172	September 29, 2017
some cuda question CUDA Programming and Performance	6	980	December 23, 2015
Is cudaMemcpy() real-time safe? CUDA Programming and Performance cuda	11	511	March 30, 2024
Much slower async memcpy in a separate stream than in stream 0 CUDA Programming and Performance	4	5193	July 23, 2015
Fail to sync the cudaMemcpyAsync using the cudaEvent in two streams CUDA Programming and Performance	4	241	April 1, 2024

cudaMemcpyAsync makes code faster even when using the default stream 0

Related topics