How to achieve peak memory bandwidth?

BlueWanderer · April 16, 2017, 3:53pm

I found I can only reach about 360 GB/s on a 1080 Ti, which has a peak bandwidth of 484 GB/s, while I can easily get 500 GB/s on a Fury card with a peak of 512 GB/s. What’s wrong here?

BlueWanderer · April 16, 2017, 3:54pm

I’m stupid… again… that 500GB/s is because of cache hits. Raw speed is about 360GB/s, too.

njuffa · April 17, 2017, 3:57am

It’s unlikely that a simple copy benchmark achieves more than 80% of theoretical bandwidth. Your numbers are a little below that mark. You might want to cross-check your results with my little program zcopy below:

#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Number of threads per thread block used for the copy kernel launch
#define ZCOPY_THREADS  128
// Default vector length, in double2 elements, when none is given via -n
#define ZCOPY_DEFLEN   10000000
// Number of timed repetitions; the fastest repetition is the one reported
#define ZCOPY_ITER     10           // as in STREAM benchmark

// Macro to catch CUDA errors in CUDA runtime calls.
// Evaluates 'call' exactly once; if the result is not cudaSuccess, prints the
// source location and the CUDA error string to stderr and terminates the
// program with EXIT_FAILURE. The argument is parenthesized and the local uses
// a trailing-underscore name so that an expression that itself mentions a
// variable called 'err' is not accidentally shadowed.
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err_ = (call);                                        \
    if (cudaSuccess != err_) {                                        \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err_) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Macro to catch CUDA errors in kernel launches.
// Place directly after a kernel launch. It first queries the last recorded
// error (launch-time failures), then blocks until the device is idle so that
// failures occurring while the kernel runs are reported as well. On any
// error it prints the source location and error string to stderr and
// terminates the program with EXIT_FAILURE.
// Uses cudaDeviceSynchronize(); the older cudaThreadSynchronize() is
// deprecated in the CUDA runtime API.
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
/*
 * Return a timestamp in seconds (Windows implementation).
 *
 * On the first call the performance-counter frequency is queried and its
 * reciprocal cached. If the query succeeds and reports a positive frequency,
 * timestamps come from QueryPerformanceCounter(); otherwise the function
 * falls back to GetTickCount(), which reports milliseconds.
 * Only differences between two return values are meaningful.
 * The one-time initialization uses unsynchronized static variables.
 */
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        /* Only touch t when the query succeeded; on failure its content is
           not defined, and a zero frequency would give an infinite scale. */
        if (hasHighResTimer && (t.QuadPart > 0)) {
            oofreq = 1.0 / (double)t.QuadPart;
        } else {
            hasHighResTimer = FALSE;
        }
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
/*
 * Return the current time of day in seconds, as reported by gettimeofday(),
 * combining the whole-second and microsecond fields into one double.
 * Only differences between two return values are meaningful for timing.
 */
double second (void)
{
    struct timeval now;
    double wholeSeconds;
    double fractionalSeconds;

    gettimeofday(&now, NULL);
    wholeSeconds = (double)now.tv_sec;
    fractionalSeconds = (double)now.tv_usec * 1.0e-6;
    return wholeSeconds + fractionalSeconds;
}
#else
#error unsupported platform
#endif

/*
 * Copy 'len' double2 elements from 'src' to 'dst'.
 *
 * Launch layout: 1D grid of 1D blocks; any grid and block size is valid
 * because each thread starts at its global index and advances by the total
 * number of launched threads until it passes 'len'. No shared memory is used.
 * 'src' and 'dst' are declared __restrict__, so they must not overlap.
 * A non-positive 'len' copies nothing.
 *
 * Indices are computed in 64 bits: with 32-bit int, gridDim.x * blockDim.x
 * can exceed INT_MAX for large grids, and 'i += stride' can overflow when
 * 'len' is close to INT_MAX.
 */
__global__ void zcopy (const double2 * __restrict__ src, 
                       double2 * __restrict__ dst, int len)
{
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long tid = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    for (long long i = tid; i < len; i += stride) {
        dst[i] = src[i];
    }
}

// Options gathered from the command line by processArgs()
struct zcopyOpts {
    int len;    // vector length from the -n switch; 0 when the switch was not given
};

/*
 * Parse command-line switches into *opts.
 *
 * *opts is zeroed first. Every element of argv (including argv[0]) that
 * starts with '-' is treated as a switch; other arguments are ignored.
 * Recognized switch:
 *   -n<count>   vector length, a decimal integer in [0, INT_MAX] written
 *               directly after the 'n' (e.g. -n5000000)
 *
 * The length is parsed with strtol() and validated, so missing digits,
 * trailing garbage, negative values, and values that do not fit in an int
 * are reported instead of being silently truncated or turned into 0.
 *
 * Returns the number of errors encountered (0 on success); a message is
 * printed to stderr for each one.
 */
static int processArgs (int argc, char *argv[], struct zcopyOpts *opts)
{
    int error = 0;
    memset (opts, 0, sizeof(*opts));
    while (argc) {
        if (*argv[0] == '-') {
            switch (*(argv[0]+1)) {
            case 'n': {
                const char *text = argv[0] + 2;
                char *end = NULL;
                long val;

                errno = 0;
                val = strtol (text, &end, 10);
                if ((end == text) || (*end != '\0') || (errno == ERANGE) ||
                    (val < 0) || (val > INT_MAX)) {
                    fprintf (stderr, "Invalid length '%s' for switch '-n'\n",
                             text);
                    error++;
                } else {
                    opts->len = (int)val;
                }
                break;
            }
            default:
                fprintf (stderr, "Unknown switch '%c%s'\n", '-', argv[0]+1);
                error++;
                break;
            }
        }
        argc--;
        argv++;
    }
    return error;
}

/*
 * Benchmark driver: measures device-to-device copy throughput.
 *
 * Parses the command line, allocates two device vectors of opts.len double2
 * elements, launches the zcopy kernel ZCOPY_ITER times, and reports the
 * fastest run. Each timed interval covers the launch plus the device
 * synchronization performed inside CHECK_LAUNCH_ERROR(). Throughput counts
 * every byte twice (one read and one write).
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on a bad command line; CUDA errors
 * terminate the program from within the checking macros.
 */
int main (int argc, char *argv[])
{
    double start, stop, elapsed, mintime;
    double2 *d_a, *d_b;
    int errors;
    struct zcopyOpts opts;

    errors = processArgs (argc, argv, &opts);
    if (errors) {
        return EXIT_FAILURE;
    }
    opts.len = (opts.len) ? opts.len : ZCOPY_DEFLEN;
    /* A negative length would turn into an enormous size_t below */
    if (opts.len < 0) {
        fprintf (stderr, "zcopy: vector length must be positive (got %d)\n",
                 opts.len);
        return EXIT_FAILURE;
    }
    const size_t bytes = sizeof(double2) * (size_t)opts.len;

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, bytes));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, bytes));
    
    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, bytes)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, bytes)); // NaN

    /* Compute execution configuration */
    dim3 dimBlock(ZCOPY_THREADS);
    int threadBlocks = (opts.len + (dimBlock.x - 1)) / dimBlock.x;
    /* Cap the grid size; the kernel loops over any remaining elements */
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    
    printf ("zcopy: operating on vectors of %d double2s (= %.3e bytes)\n", 
            opts.len, (double)bytes);
    /* dim3 members are unsigned, hence %u */
    printf ("zcopy: using %u threads per block, %u blocks\n", 
            dimBlock.x, dimGrid.x);

    /* Start from +infinity so that the first measurement always wins */
    mintime = HUGE_VAL;
    for (int k = 0; k < ZCOPY_ITER; k++) {
        start = second();
        zcopy<<<dimGrid,dimBlock>>>(d_a, d_b, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("zcopy: mintime = %.3f msec  throughput = %.2f GB/sec\n",
            1.0e3 * mintime, (2.0e-9 * (double)bytes) / mintime);

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));

    return EXIT_SUCCESS;
}

BlueWanderer · April 17, 2017, 11:18am

njuffa,

Don’t quite get it :| Your code is 360GB/s on my card, too.

And actually cudaMemcpy does no better. (On an AMD Fury card, the copy-buffer API is worse than a kernel, though…)

I ran various tests on it, and I found that as long as the accesses are coalesced, the speed doesn’t differ much (only on CUDA cards). What’s limiting the performance? The profiler doesn’t seem to give much information on memory performance.

njuffa · April 17, 2017, 3:16pm

Your results are as expected, nothing is wrong. Note that I mentioned 80% efficiency as an upper limit for copy operations. I do not know your code, and have not used a GTX 1080 Ti. I thought your test app may leave a few percent of throughput on the table. Based on your latest information that is apparently not the case.

The specified theoretical DRAM throughput is never achievable in practice, whether on CPUs or GPUs. Your x86 processor may have a four-channel DDR4 memory subsystem with a theoretical throughput of 76.8 GB/sec, but in reality you will see a copy throughput of around 60 GB/sec. Again about 80% efficiency.

The maximum achievable memory throughput will differ slightly with the mix of read and write operations. You may see slightly higher throughput for a TRIAD operation (see STREAM benchmark) due to the higher percentage of read traffic. You could also play with the size of the array used in the zcopy test (it takes a command-line argument for the size), and see performance fluctuate a few percent as you make the array bigger.

GigaCore · July 17, 2020, 9:13am

Note that, at least with the Windows + GeForce combination, CUDA workloads run in the P2 power state, which has a reduced memory clock.

Topic		Replies	Views
Device to Device cudaMemcpy performance CUDA Programming and Performance cuda	5	10214	March 24, 2021
Why nvidia gpu vectorized mem bandwidth drop a lot after a specific problem size CUDA Programming and Performance	5	158	May 22, 2024
NVIDIA Quadro RTX 8000 bandwidthTest Theoretical Max Results CUDA Programming and Performance	4	922	August 9, 2021
Quadro 4000 Bandwidth The device to device bandwidth obtained with CUDA Programming and Performance	8	3516	March 7, 2011
PCIe A100s - Slow PCIe speed tests? CUDA Programming and Performance	2	1284	April 12, 2021
How to reach HBM Peak bandwidth performance CUDA Programming and Performance	2	875	November 30, 2021
Memory-intensive applications? How to know the resource bottleneck? CUDA Programming and Performance	6	8979	January 30, 2016
Low Bandwidth with simple data copy CUDA Programming and Performance	4	9116	December 7, 2011
theoretical/real shared/dram peak memory throughput CUDA Programming and Performance	12	4967	January 5, 2017
A few questions on CUDA performance with pictures! CUDA Programming and Performance	6	3349	January 10, 2009

How to achieve peak memory bandwidth?

Related topics