Slow device to device memory copy


I am trying device to device copies on V100 like this:

typedef unsigned long long uint64;

// Device-to-device copy kernel: each thread copies its own contiguous
// 4096-element slice from mm2 to mm1.
// NOTE(review): with this layout the 32 threads of a warp touch addresses
// 4096*8 bytes apart on every iteration, so warp accesses are uncoalesced;
// that, plus the tiny total thread count, explains the low throughput
// discussed in the thread.
__global__ void kernel(uint64* mm1, uint64 * mm2) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int slice = 4096;            // elements copied per thread
    mm1 += idx*slice;            // start of this thread's destination slice
    mm2 += idx*slice;            // start of this thread's source slice
    uint64 * end = mm1+slice;
    while(mm1<end) *mm1++ = *mm2++;
}

int main() {
    void *mm1,*mm2;
    // ... rest of host code elided in the post; per the surrounding text,
    // mm1/mm2 are allocated with cudaMallocManaged and passed to the kernel,
    // which is launched with a single block of 32 threads.

Kernel is launched with 32 threads per block, single block. mm1 and mm2 are allocated with cudaMallocManaged and passed into the kernel. Total time measured in the while loop is around ~900us with ~1ms for kernel execution as measured from host, giving a rate of little over 1GB/sec, which is quite pathetic. Is there any faster way to copy memory in the device? This throughput is way too small.

That would seem to be your problem right there: too few threads. For device-to-device copies (e.g. DCOPY, see code below), you would want to use on the order of several tens of thousands of threads.

#include <stdlib.h>
#include <stdio.h>

#define DCOPY_THREADS  128
#define DCOPY_DEFLEN   20000000
#define DCOPY_ITER     10           // as in STREAM benchmark

// Macro to catch CUDA errors in CUDA runtime calls.
// Wrap every runtime API call (cudaMalloc, cudaMemset, cudaFree, ...); on
// failure it prints the file/line and the CUDA error string, then exits.
// The do/while(0) wrapper makes the macro behave like a single statement.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Kernel launches return no error code directly: cudaGetLastError() picks up
// synchronous (pre-launch / bad-configuration) errors, while the subsequent
// cudaDeviceSynchronize() blocks until the kernel finishes and surfaces
// asynchronous execution failures (e.g. an unspecified launch failure).
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// A routine to give access to a high precision timer on most systems.
// Returns wall-clock time in seconds as a double. On Windows it uses the
// high-resolution performance counter when available (falling back to
// GetTickCount); on Linux/macOS it uses gettimeofday (microsecond granularity).
// The forum paste dropped the braces and several preprocessor lines; they are
// restored here.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;                 // 1 / counter frequency, cached
    static int checkedForHighResTimer;    // one-time initialization flag
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;  // millisecond fallback
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif

// Copy len doubles from src to dst using a grid-stride loop, so any
// grid/block configuration covers the whole array and adjacent threads
// access adjacent elements (fully coalesced warp accesses).
// __restrict__ tells the compiler src and dst do not alias.
__global__ void dcopy (const double * __restrict__ src, 
                       double * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = src[i];
    }
}

// Command-line options for the benchmark.
struct dcopyOpts {
    int len;   // number of doubles to copy; 0 means "use DCOPY_DEFLEN"
};

// Parse command-line switches of the form -n<len> into opts.
// Unknown switches are reported to stderr and counted.
// Returns the number of errors encountered (0 on success).
static int processArgs (int argc, char *argv[], struct dcopyOpts *opts)
{
    int error = 0;
    memset (opts, 0, sizeof(*opts));   // defaults: len = 0 (caller substitutes DCOPY_DEFLEN)
    while (argc) {
        if (*argv[0] == '-') {
            switch (*(argv[0]+1)) {
            case 'n':
                opts->len = atol(argv[0]+2);   // e.g. "-n20000000"
                break;
            default:
                fprintf (stderr, "Unknown switch '%c%s'\n", '-', argv[0]+1);
                error++;
                break;
            }
        }
        argc--;
        argv++;
    }
    return error;
}

// Benchmark driver: allocates two device arrays, runs the dcopy kernel
// DCOPY_ITER times, and reports the best-case throughput (read + write,
// hence the factor 2 in the GB/sec computation).
int main (int argc, char *argv[])
{
    double start, stop, elapsed, mintime;
    double *d_a, *d_b;
    int errors;
    struct dcopyOpts opts;

    errors = processArgs (argc, argv, &opts);
    if (errors) {
        return EXIT_FAILURE;
    }
    opts.len = (opts.len) ? opts.len : DCOPY_DEFLEN;

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * opts.len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * opts.len));
    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * opts.len)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, sizeof(d_b[0]) * opts.len)); // NaN

    /* Compute execution configuration: one thread per element, capped at
       65520 blocks; the grid-stride loop in dcopy handles any remainder */
    dim3 dimBlock(DCOPY_THREADS);
    int threadBlocks = (opts.len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    printf ("dcopy: operating on vectors of %d doubles (= %.3e bytes)\n", 
            opts.len, (double)sizeof(d_a[0]) * opts.len);
    printf ("dcopy: using %d threads per block, %d blocks\n", 
            dimBlock.x, dimGrid.x);

    /* Time DCOPY_ITER launches and keep the fastest, as in STREAM.
       CHECK_LAUNCH_ERROR() contains a cudaDeviceSynchronize(), so the
       interval covers the full kernel execution, not just the async launch. */
    mintime = fabs(log(0.0));   // +infinity as initial minimum
    for (int k = 0; k < DCOPY_ITER; k++) {
        start = second();
        dcopy<<<dimGrid,dimBlock>>>(d_a, d_b, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("dcopy: mintime = %.3f msec  throughput = %.2f GB/sec\n",
            1.0e3 * mintime, (2.0e-9 * sizeof(d_a[0]) * opts.len) / mintime);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));

    return EXIT_SUCCESS;
}

I tried increasing threads and blocks, but it is capping at 4 GB/sec. Is that expected?

No. Try running the code I posted. What memory throughput does it report? Note also the pattern in which threads are assigned to data elements in my code.

I notice belatedly that your code uses cudaMallocManaged(). Are you sure that you are copying between memory objects that are resident on the device?

BTW, it is better practice to #include <stdint.h> (or <cstdint> in C++) if you want a specific-width integer type like uint64_t, instead of typedef’ing your own.

Yes, I am using UM. I tried looping the kernel many times and averaging, so at least after the 2nd iteration everything should be on device.

I ran your code; it gets up to 750 GB/sec. But you are using thousands of blocks — I will try that.

750 GB/sec seems about right, based on the theoretical bandwidth of 900 GB/sec that NVIDIA specifies, so 83.3% efficiency. You may be able to tweak that by a couple of percent by trying different block / thread partitionings, and/or changing the size of the memory block being copied.

Yes, there is a lot of tuning. I will need to use this for dynamic copy (ie. different size). Any recommendations on the maximum bytes I should copy per thread? Would going as far as an 8 bytes copy per thread yield the best result?

My code is supposed to be universal across all GPUs, and therefore does not go to configuration extremes.

I would certainly try assigning only one data element per thread on modern hardware. Note that GPUs have native 16-byte copy instructions, so try copying using uint4 elements as well.

In general, if copy performance becomes important to app performance, it is a good time to rethink one’s software design. Pure copies without data processing should be avoided, regardless of platform.