I’ve observed inconsistent CUDA memcpy performance, as shown in the image below: the cycle counts measured for the same device-to-host copy fluctuate considerably between iterations and runs. What factors should I investigate first? Here is the benchmark code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits>
#include <time.h>
#include <string>
#include <sys/time.h>
#include <sys/mman.h>
//#include <rte_cycles.h>
#define RTE_STD_C11 __extension__
#define LOOP_COUNT 128
#define VECTOR_LENGTH 512
#define SCALAR_LENGTH 64
#define NUM_ELEMENTS (VECTOR_LENGTH/SCALAR_LENGTH)
using namespace std;
//--------------------------------------------------------------------------------------------------------------------------
//Computing CPU Cycles consumed using rdtsc
static inline uint64_t rte_rdtsc(void) {
    union {
        uint64_t tsc_64;
        RTE_STD_C11
        struct {
            uint32_t lo_32;
            uint32_t hi_32;
        };
    } tsc;
    asm volatile("rdtsc" :
                 "=a" (tsc.lo_32),
                 "=d" (tsc.hi_32));
    return tsc.tsc_64;
}
#define GET_CYCLES() rte_rdtsc()
//--------------------------------------------------------------------------------------------------------------------------
//Computing CPU Cycles consumed using rdtscp
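// Note: RDTSCP waits for all earlier instructions to finish before reading the TSC, and the trailing
// CPUID is meant to keep later instructions from starting before the read. The macro writes into a
// union named tsc that must already exist in the calling scope (see GET_PRECISE_CYCLES below).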
#define RDTSCP() \
    __asm__ volatile("RDTSCP\n\t" \
                     "mov %%edx, %0\n\t" \
                     "mov %%eax, %1\n\t" \
                     "CPUID\n\t" \
                     : "=r" (tsc.hi_32), "=r" (tsc.lo_32) \
                     :: "%rax", "%rbx", "%rcx", "%rdx");
static inline uint64_t
GET_PRECISE_CYCLES(void) {
    union {
        uint64_t tsc_64;
        struct {
            uint32_t lo_32;
            uint32_t hi_32;
        };
    } tsc;
    RDTSCP();
    return tsc.tsc_64;
}
//----------------------------------------------------------------------------------------
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
volatile sig_atomic_t status = 0;
static void catch_function(int signo) {
    status = signo;
}
#define N (1024)
#define THREADS_PER_BLOCK 32
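// dot: each thread multiplies one element pair into shared memory; thread 0 then sums the block's
// 32 products and atomicAdd()s the partial result into *c.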
__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];
    __syncthreads();
    if (threadIdx.x == 0) {
        int sum = 0;
        for (int i = 0; i < THREADS_PER_BLOCK; i++) {
            sum += temp[i];
        }
        atomicAdd(c, sum);
    }
}
#define PINNED_MEMCPY
#define D2H
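// With only PINNED_MEMCPY and D2H defined, each timed iteration below performs a single
// device-to-host cudaMemcpy into pinned host memory; define H2D and/or KERNEL_LAUNCH to
// time the other stages as well.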
int main(int argc, char* argv[])
{
    // For statistics
    if (argc == 1) {
        printf("Please enter the Iteration Count\n");
        return 1;
    }
    // Set above function as signal handler for the SIGINT signal:
    if (signal(SIGINT, catch_function) == SIG_ERR) {
        fputs("An error occurred while setting a signal handler.\n", stderr);
        return EXIT_FAILURE;
    }
    int iter_count = stoi(argv[1]);
    uint64_t min_c = numeric_limits<uint64_t>::max();
    uint64_t max_c = numeric_limits<uint64_t>::min();
    double avg_c = 0.0;
    double accum = 0.0;
    uint64_t start_c, stop_c, diff_c, diff_c_cold_cache = 0;
    int *a, *b, *c;
#if defined(DEFAULT_MEMCPY) || defined(PINNED_MEMCPY) || defined(DEFAULT_MEMCPY_ALIGNED) || defined(UNIFIED_MEMCPY)
    int *dev_a, *dev_b, *dev_c;
#endif
    //int size = N * sizeof(int);
    int size = 288 * 64 * 4;
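    // i.e. 73,728 bytes (~72 KB) per buffer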
    // allocate host memories
#if defined(DEFAULT_MEMCPY)
    a = (int *)malloc(size + 1);
    b = (int *)malloc(size + 1);
    //c = (int *)malloc(sizeof(char) + 1);
    c = (int *)malloc(size + 1);
#elif defined(PINNED_MEMCPY)
    cudaMallocHost(&a, size + 1);
    cudaMallocHost(&b, size + 1);
    //cudaMallocHost(&c, sizeof(char) + 1);
    cudaMallocHost(&c, size + 1);
#elif defined(DEFAULT_MEMCPY_ALIGNED)
    a = (int *)aligned_alloc(sizeof(char), size + 1);
    b = (int *)aligned_alloc(sizeof(char), size + 1);
    //c = (int *)aligned_alloc(sizeof(char), sizeof(char) + 1);
    c = (int *)aligned_alloc(sizeof(char), size + 1);
#endif
    // allocate device memories
#if defined(DEFAULT_MEMCPY) || defined(PINNED_MEMCPY) || defined(DEFAULT_MEMCPY_ALIGNED)
    cudaMalloc(&dev_a, size + 1);
    cudaMalloc(&dev_b, size + 1);
    //cudaMalloc(&dev_c, sizeof(char) + 1);
    cudaMalloc(&dev_c, size + 1);
#elif defined(UNIFIED_MEMCPY)
    cudaHostAlloc(&dev_a, size + 1, cudaHostAllocMapped);
    cudaHostAlloc(&dev_b, size + 1, cudaHostAllocMapped);
    cudaHostAlloc(&dev_c, sizeof(char) + 1, cudaHostAllocMapped);
#endif
    srand(GET_PRECISE_CYCLES());
    mlockall(MCL_CURRENT | MCL_FUTURE);
#if 0
    // Cold Cache case
    start_c = GET_PRECISE_CYCLES();
    //----------------------------------------------------------------------------------------
    for (int j = 0; j < iter_count; ++j) {
        // copy host memories to device memories
        cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
        // run dot with N threads
        dot <<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
        // copy device memories sum result(dev_c) to host memories(c)
        cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    }
    //----------------------------------------------------------------------------------------
    stop_c = GET_PRECISE_CYCLES();
    diff_c_cold_cache = stop_c - start_c;
#endif
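    // Warm case: re-randomize the host buffers each iteration, then time only the region
    // between the two GET_PRECISE_CYCLES() calls.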
    for (int i = 0; i < iter_count; ++i) {
        // re-initialize the host buffers (size is in bytes, so loop over size/sizeof(int) ints)
        for (int k = 0; k < size / (int)sizeof(int); ++k) {
            a[k] = rand(), b[k] = rand();//, dev_c[k] = rand();
        }
        //----------------------------------------------------------------------------------------
        start_c = GET_PRECISE_CYCLES(); // Cycle measure start
        //----------------------------------------------------------------------------------------
        // copy host memories to device memories
#if defined(H2D)
        cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
        //printf("1\n");
#endif
#if defined(KERNEL_LAUNCH)
        // run dot with N threads
        dot <<< N / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (dev_a, dev_b, dev_c);
        //printf("2\n");
#endif
        // copy device memories sum result(dev_c) to host memories(c)
#if defined(D2H)
        cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
        //printf("3\n");
#endif
        //----------------------------------------------------------------------------------------
        stop_c = GET_PRECISE_CYCLES(); // Cycle measure stop
        //----------------------------------------------------------------------------------------
        diff_c = stop_c - start_c;
        avg_c += diff_c;
        //printf("avg_c = %.0lf diff_c = %" PRIu64 "\n", avg_c, diff_c);
        if (diff_c < min_c) {
            min_c = diff_c;
        }
        if (diff_c > max_c) {
            max_c = diff_c;
        }
        //sleep(1); // to check whether the Linux kernel scheduler gets invoked or not
    }
#if defined(PINNED_MEMCPY)
    cudaFreeHost(a); cudaFreeHost(b); cudaFreeHost(c);
#elif defined(DEFAULT_MEMCPY) || defined(DEFAULT_MEMCPY_ALIGNED)
    free(a); free(b); free(c); // host buffers came from malloc/aligned_alloc, not cudaMallocHost
#endif
#if defined(DEFAULT_MEMCPY) || defined(PINNED_MEMCPY) || defined(DEFAULT_MEMCPY_ALIGNED)
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
#elif defined(UNIFIED_MEMCPY)
    cudaFreeHost(dev_a); cudaFreeHost(dev_b); cudaFreeHost(dev_c);
#endif
    avg_c /= iter_count;
    printf("Average Cycles Consumed: %.0lf\n", avg_c);
    printf("Minimum Cycles Consumed: %" PRIu64 "\n", min_c);
    printf("Maximum Cycles Consumed: %" PRIu64 "\n", max_c);
    printf("Cold Cache Cycles Consumed: %" PRIu64 "\n", diff_c_cold_cache);
    printf("\nDummy Print: %lf\n", accum);
    return 0;
}
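To make the question concrete: with the macros as defined above (PINNED_MEMCPY and D2H), each measured iteration reduces to the fragment below. This is just a trimmed restatement of the timed region, not additional code; here c was allocated with cudaMallocHost, dev_c with cudaMalloc, and size is 288 * 64 * 4 bytes.

// Effective timed region per iteration (PINNED_MEMCPY + D2H):
start_c = GET_PRECISE_CYCLES();
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost); // synchronous ~72 KB copy into pinned host memory
stop_c = GET_PRECISE_CYCLES();
diff_c = stop_c - start_c; // this per-copy cycle count is what varies so much between iterations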