cudaMemcpy latency exhibits significant variation

I’ve observed inconsistent cudaMemcpy performance, as shown in the image below. What factors should I investigate first?

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits>
#include <time.h>
#include <string>
#include <sys/time.h>
#include <sys/mman.h>
 
//#include <rte_cycles.h>
 
#define RTE_STD_C11 __extension__
#define LOOP_COUNT 128
#define VECTOR_LENGTH 512
#define SCALAR_LENGTH 64
#define NUM_ELEMENTS (VECTOR_LENGTH/SCALAR_LENGTH)
 
using namespace std;
 
//--------------------------------------------------------------------------------------------------------------------------
 
//Computing CPU Cycles consumed using rdtsc
 
static inline uint64_t rte_rdtsc(void) {
  union {
    uint64_t tsc_64;
    RTE_STD_C11
    struct {
      uint32_t lo_32;
      uint32_t hi_32;
    };
  } tsc;
 
  asm volatile("rdtsc"
               : "=a" (tsc.lo_32),
                 "=d" (tsc.hi_32));
  return tsc.tsc_64;
}
 
#define GET_CYCLES()         rte_rdtsc()
 
//--------------------------------------------------------------------------------------------------------------------------
 
//Computing CPU Cycles consumed using rdtscp
 
// Note: this macro expects a union named `tsc` (with hi_32/lo_32 members) to
// be in scope at the expansion site. The trailing CPUID is a serializing
// instruction, preventing later code from starting before RDTSCP completes.
#define RDTSCP()            \
  __asm__ volatile("RDTSCP\n\t" \
                   "mov %%edx, %0\n\t" \
                   "mov %%eax, %1\n\t" \
                   "CPUID\n\t" \
                   : "=r" (tsc.hi_32), "=r" (tsc.lo_32) \
                   :: "%rax", "%rbx", "%rcx", "%rdx");
 
 
static inline uint64_t
GET_PRECISE_CYCLES(void) {
  union {
    uint64_t tsc_64;
    struct {
      uint32_t lo_32;
      uint32_t hi_32;
    };
  } tsc;
  RDTSCP();
  return tsc.tsc_64;
}
 
//----------------------------------------------------------------------------------------
 
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
 
volatile sig_atomic_t status = 0;
 
static void catch_function(int signo) {
  status = signo;
}
 
#define N (1024)
#define THREADS_PER_BLOCK 32
 
// Dot product: each block accumulates its products in shared memory, then
// thread 0 adds the block's partial sum into *c with atomicAdd (note that
// dev_c is never zeroed on the host, so the result accumulates across launches).
__global__ void dot(int *a, int *b, int *c) {
  __shared__ int temp[THREADS_PER_BLOCK];
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  temp[threadIdx.x] = a[index] * b[index];
 
  __syncthreads();
 
  if (threadIdx.x == 0) {
    int sum = 0;
    for(int i = 0 ; i < THREADS_PER_BLOCK ; i++){
      sum += temp[i];
    }
    atomicAdd(c, sum);
  }
}
 
#define PINNED_MEMCPY
#define D2H
 
int main(int argc, char* argv[])
{
  // For statistics
  if(argc == 1) {
    printf("Please enter the Iteration Count\n");
    return 1;
  }
  // Set above function as signal handler for the SIGINT signal:
  if (signal(SIGINT, catch_function) == SIG_ERR) {
    fputs("An error occurred while setting a signal handler.\n", stderr);
    return EXIT_FAILURE;
  }
  int iter_count = stoi(argv[1]);
 
  uint64_t min_c = numeric_limits<uint64_t>::max();
  uint64_t max_c = numeric_limits<uint64_t>::min();
  double avg_c = 0.0;
  double accum = 0.0;
  uint64_t start_c, stop_c, diff_c, diff_c_cold_cache = 0;
 
  int *a, *b, *c;
#if defined(DEFAULT_MEMCPY) || defined(PINNED_MEMCPY) || defined(DEFAULT_MEMCPY_ALIGNED) || defined(UNIFIED_MEMCPY)
  int *dev_a, *dev_b, *dev_c;
#endif
  //int size = N * sizeof(int);
  int size = 288 * 64 * 4; // 73728 bytes (72 KiB) per buffer
 
  // allocate host memories
#if defined(DEFAULT_MEMCPY)
  a = (int *)malloc(size + 1);
  b = (int *)malloc(size + 1);
  //c = (int *)malloc(sizeof(char) + 1);
  c = (int *)malloc(size + 1);
#elif defined(PINNED_MEMCPY)
  cudaMallocHost(&a, size + 1);
  cudaMallocHost(&b, size + 1);
  //cudaMallocHost(&c, sizeof(char) + 1);
  cudaMallocHost(&c, size + 1);
#elif defined(DEFAULT_MEMCPY_ALIGNED)
  a = (int *)aligned_alloc(sizeof(char), size + 1);
  b = (int *)aligned_alloc(sizeof(char), size + 1);
  //c = (int *)aligned_alloc(sizeof(char), sizeof(char) + 1);
  c = (int *)aligned_alloc(sizeof(char), size + 1);
#endif
 
  // allocate device memories
#if defined(DEFAULT_MEMCPY) || defined(PINNED_MEMCPY)
  cudaMalloc(&dev_a, size + 1);
  cudaMalloc(&dev_b, size + 1);
  //cudaMalloc(&dev_c, sizeof(char) + 1);
  cudaMalloc(&dev_c, size + 1);
#elif defined(UNIFIED_MEMCPY)
  cudaHostAlloc(&dev_a, size + 1, cudaHostAllocMapped);
  cudaHostAlloc(&dev_b, size + 1, cudaHostAllocMapped);
  cudaHostAlloc(&dev_c, sizeof(char) + 1, cudaHostAllocMapped);
#endif
  srand(GET_PRECISE_CYCLES());
 
  mlockall(MCL_CURRENT | MCL_FUTURE);
#if 0
  // Cold Cache case
  start_c = GET_PRECISE_CYCLES();
  //----------------------------------------------------------------------------------------
  for(int j = 0; j < iter_count; ++j) {
    // copy host memories to device memories
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
 
    // run dot with N threads
    dot <<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
 
    // copy device memories sum result(dev_c) to host memories(c)
    cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
  }
  //----------------------------------------------------------------------------------------
  stop_c = GET_PRECISE_CYCLES();
  diff_c_cold_cache = stop_c - start_c;
#endif
 
  for(int i = 0; i < iter_count; ++i) {
    // initialize the host buffers (size is in bytes, so index over ints)
    for (int k = 0; k < size / (int)sizeof(int); ++k) {
      a[k] = rand(), b[k] = rand();//, dev_c[k] = rand();
    }
    //----------------------------------------------------------------------------------------
    start_c = GET_PRECISE_CYCLES(); //Cycle measure start
    //----------------------------------------------------------------------------------------
 
    // copy host memories to device memories
#if defined(H2D)
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    //printf("1\n");
#endif
 
#if defined(KERNEL_LAUNCH)
    // run dot with N threads
    dot <<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
    //printf("2\n");
#endif
    // copy device memories sum result(dev_c) to host memories(c)
#if defined(D2H)
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
    //printf("3\n");
#endif
    //----------------------------------------------------------------------------------------
    stop_c = GET_PRECISE_CYCLES(); // Cycle measure stop
    //----------------------------------------------------------------------------------------
 
    diff_c = stop_c - start_c;
    avg_c += diff_c;
    //printf("avg_c = %.0lf diff_c = %" PRIu64 "\n", avg_c, diff_c);
    if(diff_c < min_c) {
      min_c = diff_c;
    }
    if(diff_c > max_c) {
      max_c = diff_c;
    }
    //sleep(1); // to check whether the Linux kernel scheduler gets involved
  }
#if defined(DEFAULT_MEMCPY) || defined(DEFAULT_MEMCPY_ALIGNED)
  free(a); free(b); free(c);                          // matches malloc/aligned_alloc
#elif defined(PINNED_MEMCPY)
  cudaFreeHost(a); cudaFreeHost(b); cudaFreeHost(c);  // matches cudaMallocHost
#endif
#if defined(DEFAULT_MEMCPY) || defined(PINNED_MEMCPY)
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);
#elif defined(UNIFIED_MEMCPY)
  cudaFreeHost(dev_a); cudaFreeHost(dev_b); cudaFreeHost(dev_c); // matches cudaHostAlloc
#endif
 
  avg_c /= iter_count;
 
  printf("Average Cycles Consumed: %.0lf\n", avg_c);
  printf("Minimum Cycles Consumed: %" PRIu64 "\n", min_c);
  printf("Maximum Cycles Consumed: %" PRIu64 "\n", max_c);
  printf("Cold Cache Cycles Consumed: %" PRIu64 "\n", diff_c_cold_cache);
 
  printf("\nDummy Print: %lf\n", accum);
 
  return 0;
}

I see fairly consistent behavior after the first few iterations:

# ./t245 100
avg_c = 82293 diff_c = 82293
avg_c = 109806 diff_c = 27513
avg_c = 133827 diff_c = 24021
avg_c = 158547 diff_c = 24720
avg_c = 181995 diff_c = 23448
avg_c = 205407 diff_c = 23412
avg_c = 228990 diff_c = 23583
avg_c = 253197 diff_c = 24207
avg_c = 276708 diff_c = 23511
avg_c = 299607 diff_c = 22899
avg_c = 323523 diff_c = 23916
avg_c = 347448 diff_c = 23925
avg_c = 371016 diff_c = 23568
avg_c = 394800 diff_c = 23784
avg_c = 418254 diff_c = 23454
avg_c = 441990 diff_c = 23736
avg_c = 465576 diff_c = 23586
avg_c = 488961 diff_c = 23385
avg_c = 512439 diff_c = 23478
avg_c = 536391 diff_c = 23952
avg_c = 559836 diff_c = 23445
avg_c = 583506 diff_c = 23670
avg_c = 604677 diff_c = 21171
avg_c = 628569 diff_c = 23892
avg_c = 652236 diff_c = 23667
avg_c = 675678 diff_c = 23442
avg_c = 699297 diff_c = 23619
avg_c = 723042 diff_c = 23745
avg_c = 746370 diff_c = 23328
avg_c = 769797 diff_c = 23427
avg_c = 793668 diff_c = 23871
avg_c = 817575 diff_c = 23907
avg_c = 841020 diff_c = 23445
avg_c = 864483 diff_c = 23463
avg_c = 888126 diff_c = 23643
avg_c = 912057 diff_c = 23931
avg_c = 935130 diff_c = 23073
avg_c = 958587 diff_c = 23457
avg_c = 982044 diff_c = 23457
avg_c = 1006203 diff_c = 24159
avg_c = 1029939 diff_c = 23736
avg_c = 1053561 diff_c = 23622
avg_c = 1077618 diff_c = 24057
avg_c = 1101183 diff_c = 23565
avg_c = 1124277 diff_c = 23094
avg_c = 1147827 diff_c = 23550
avg_c = 1171455 diff_c = 23628
avg_c = 1195299 diff_c = 23844
avg_c = 1219419 diff_c = 24120
avg_c = 1242879 diff_c = 23460
avg_c = 1265739 diff_c = 22860
avg_c = 1289505 diff_c = 23766
avg_c = 1313148 diff_c = 23643
avg_c = 1336668 diff_c = 23520
avg_c = 1358859 diff_c = 22191
avg_c = 1382595 diff_c = 23736
avg_c = 1406403 diff_c = 23808
avg_c = 1430337 diff_c = 23934
avg_c = 1453851 diff_c = 23514
avg_c = 1477773 diff_c = 23922
avg_c = 1499538 diff_c = 21765
avg_c = 1523187 diff_c = 23649
avg_c = 1546968 diff_c = 23781
avg_c = 1570422 diff_c = 23454
avg_c = 1594365 diff_c = 23943
avg_c = 1617744 diff_c = 23379
avg_c = 1641570 diff_c = 23826
avg_c = 1665984 diff_c = 24414
avg_c = 1689384 diff_c = 23400
avg_c = 1712733 diff_c = 23349
avg_c = 1736484 diff_c = 23751
avg_c = 1760367 diff_c = 23883
avg_c = 1783974 diff_c = 23607
avg_c = 1807362 diff_c = 23388
avg_c = 1830963 diff_c = 23601
avg_c = 1854603 diff_c = 23640
avg_c = 1876554 diff_c = 21951
avg_c = 1900104 diff_c = 23550
avg_c = 1923567 diff_c = 23463
avg_c = 1947351 diff_c = 23784
avg_c = 1971312 diff_c = 23961
avg_c = 1994766 diff_c = 23454
avg_c = 2018256 diff_c = 23490
avg_c = 2041983 diff_c = 23727
avg_c = 2066154 diff_c = 24171
avg_c = 2089734 diff_c = 23580
avg_c = 2113266 diff_c = 23532
avg_c = 2136993 diff_c = 23727
avg_c = 2160537 diff_c = 23544
avg_c = 2183901 diff_c = 23364
avg_c = 2207967 diff_c = 24066
avg_c = 2231247 diff_c = 23280
avg_c = 2254584 diff_c = 23337
avg_c = 2277984 diff_c = 23400
avg_c = 2301531 diff_c = 23547
avg_c = 2325768 diff_c = 24237
avg_c = 2349252 diff_c = 23484
avg_c = 2372574 diff_c = 23322
avg_c = 2395701 diff_c = 23127
avg_c = 2419344 diff_c = 23643
Average Cycles Consumed: 24193
Minimum Cycles Consumed: 21171
Maximum Cycles Consumed: 82293
Cold Cache Cycles Consumed: 0

Dummy Print: 0.000000

@isk These are transfers between host and device, correct? Are you on a machine with NUMA characteristics, by any chance; that is, a system with multiple CPU sockets, or a CPU constructed internally from multiple physical chiplets? If so, try re-running your tests with processor and memory affinity set such that the GPU is “talking” to the “near” CPU and system memory.
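If it does apply, affinity can be set from the shell (e.g. numactl --cpunodebind=0 --membind=0 ./t245 100) or in code. Here is a minimal programmatic sketch, assuming Linux and that core 0 sits on the NUMA node nearest the GPU (nvidia-smi topo -m shows the topology); it covers only the CPU pinning, memory binding would additionally need numactl or libnuma:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for cpu_set_t, CPU_ZERO/CPU_SET and sched_setaffinity
#endif
#include <sched.h>
#include <stdio.h>

// Pin the calling thread to a single core; pass a core on the NUMA node
// closest to the GPU (core 0 here is only an assumption).
static int pin_to_core(int core) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(core, &mask);
  return sched_setaffinity(0 /* calling thread */, sizeof(mask), &mask);
}

int main(void) {
  if (pin_to_core(0) != 0)
    perror("sched_setaffinity");
  // ... benchmark loop goes here ...
  return 0;
}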

Thanks for the comment. The server has one CPU (6443N) and one GPU (L4), so it seems unlikely to be related to NUMA.

As shown in the graph, about 80% of the samples exhibit consistent latency, while approximately 20% show significantly higher-than-normal latency. I would like to investigate the cause of this.

Since we have established that my machine doesn’t show the variability with your code, we could hypothesize that the machine matters. The word “machine”, as I am using it here, includes the hardware setup, the software stack, the operating system, cloud/virtualization, and so on; i.e. everything except the code you have shown.

If I were investigating this issue, knowing that someone else had demonstrated a low-variability case, I would seek to understand the differences between the two setups, possibly set up a similar low-variability environment, see if I could reproduce the low-variability results, and then carefully consider all the platform differences, perhaps eliminating them one by one, to see if I could identify the culprit.
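One concrete aid for that comparison: record every per-iteration diff_c instead of only min/avg/max, and print percentiles, so the tail behavior of the two environments can be compared directly. A minimal sketch (report_percentiles is a hypothetical helper, fed here with dummy data so the sketch is self-contained):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <vector>

// Sort the recorded cycle counts and print a few percentiles; a ~20% tail of
// slow iterations shows up clearly at p90/p99 even when the average looks flat.
static void report_percentiles(std::vector<uint64_t> samples) {
  std::sort(samples.begin(), samples.end());
  for (double p : {50.0, 90.0, 99.0, 99.9}) {
    size_t idx = (size_t)(p / 100.0 * (samples.size() - 1));
    printf("p%g = %llu cycles\n", p, (unsigned long long)samples[idx]);
  }
}

int main() {
  // In the real benchmark, do samples.push_back(diff_c) each iteration;
  // the values below are dummies so the sketch compiles and runs on its own.
  std::vector<uint64_t> samples = {23500, 23400, 23600, 82000, 23450};
  report_percentiles(samples);
  return 0;
}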

You could also check whether Nsight Systems gives any hint about what is happening at the outliers.
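For example, each iteration can be wrapped in an NVTX range so the slow ones are easy to locate on the timeline. A minimal sketch, assuming the header-only NVTX v3 that ships with recent CUDA toolkits:

#include <cuda_runtime.h>
#include <nvtx3/nvToolsExt.h>

int main() {
  const int size = 288 * 64 * 4; // same payload as the original test
  char *c, *dev_c;
  cudaMallocHost(&c, size);
  cudaMalloc(&dev_c, size);
  for (int i = 0; i < 100; ++i) {
    nvtxRangePushA("d2h_iter"); // appears as a named range on the nsys timeline
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
    nvtxRangePop();
  }
  cudaFree(dev_c);
  cudaFreeHost(c);
  return 0;
}

Run it under nsys profile ./a.out and look at what else coincides with the long d2h_iter ranges (other processes, OS activity, etc.).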

Also, does your CPU utilization go up at regular intervals? Or is there periodic I/O? (Tools like mpstat 1 or iostat 1 running alongside the test can show this.)

Does it change if you give your process a higher priority?
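From the shell, chrt -f 50 ./t245 100 is the quickest way to test that (SCHED_FIFO, priority 50). A minimal in-code sketch, Linux-only and requiring root or CAP_SYS_NICE:

#include <sched.h>
#include <stdio.h>

int main(void) {
  // Request real-time FIFO scheduling so ordinary time-sharing tasks cannot
  // preempt the benchmark thread (valid priorities are 1..99; 50 is arbitrary).
  struct sched_param sp;
  sp.sched_priority = 50;
  if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
    perror("sched_setscheduler");
  // ... benchmark loop goes here ...
  return 0;
}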