For a second opinion, you could try my zcopy.cu program below, which uses a 128-bit vector type (double2) to maximize memory throughput. Sample output:
C:\Users\Norbert\My Programs>zcopy -d1 -n100000000
CUDA initialization: 0.296 seconds
zcopy: running on device 1 (Quadro P2000)
zcopy: using vectors of 100000000 double2 elements (= 1.600e+09 bytes)
zcopy: using 128 threads per block, 781250 blocks
zcopy: mintime = 26.155 msec; eqv. memory bandwidth = 122.35 GB/sec
C:\Users\Norbert\My Programs>zcopy -d0 -n100000000
CUDA initialization: 0.292 seconds
zcopy: running on device 0 (Quadro RTX 4000)
zcopy: using vectors of 100000000 double2 elements (= 1.600e+09 bytes)
zcopy: using 128 threads per block, 781250 blocks
zcopy: mintime = 8.440 msec; eqv. memory bandwidth = 379.13 GB/sec
According to the TechPowerUp database, the theoretical memory bandwidth is 416 GB/sec for the Quadro RTX 4000 and 140 GB/sec for the Quadro P2000, so these runs achieve about 91% and 87% of theoretical, respectively. (The reported bandwidth counts each double2 element as 16 bytes read plus 16 bytes written during the fastest of the timed iterations.)
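Since double2 elements are 16 bytes each, every load and store the kernel issues is a 128-bit memory instruction. For comparison only (this kernel is not part of zcopy.cu, and the name dcopy is just illustrative), a scalar-double version of the same grid-stride copy would move half as many bytes per instruction:

__global__ void dcopy (const double * __restrict__ src,
                       double * __restrict__ dst, int len)
{
    // same grid-stride loop as zcopy below, but each iteration moves
    // 8 bytes (64-bit load/store) instead of 16 bytes (128-bit load/store)
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = src[i];
    }
}

On recent GPUs the scalar version can often come close in bandwidth once enough blocks are in flight, but the 128-bit version issues half the number of memory instructions for the same data volume. The full zcopy.cu listing follows: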
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   // memset
#include <math.h>
#define ZCOPY_THREADS 128
#define ZCOPY_DEFLEN 30000000
#define ZCOPY_DEFDEV 0
#define ZCOPY_ITER 10 // as in STREAM benchmark
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                              \
do {                                                                      \
    cudaError_t err = call;                                               \
    if (cudaSuccess != err) {                                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(err) );           \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                              \
do {                                                                      \
    /* Check synchronous errors, i.e. pre-launch */                       \
    cudaError_t err = cudaGetLastError();                                 \
    if (cudaSuccess != err) {                                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(err) );           \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */             \
    err = cudaDeviceSynchronize();                                        \
    if (cudaSuccess != err) {                                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(err) );           \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif
__global__ void zcopy (const double2 * __restrict__ src,
                       double2 * __restrict__ dst, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        dst[i] = src[i];
    }
}
struct zcopyOpts {
    int len;
    int dev;
};
static int processArgs (int argc, char *argv[], struct zcopyOpts *opts)
{
    int error = 0;
    memset (opts, 0, sizeof(*opts));
    while (argc) {
        if (*argv[0] == '-') {
            switch (*(argv[0]+1)) {
            case 'n':
                opts->len = atol(argv[0]+2);
                break;
            case 'd':
                opts->dev = atol(argv[0]+2);
                break;
            default:
                fprintf (stderr, "Unknown switch '%c%s'\n", '-', argv[0]+1);
                error++;
                break;
            }
        }
        argc--;
        argv++;
    }
    return error;
}
int main (int argc, char *argv[])
{
    double start, stop, init_start, init_stop, elapsed, mintime;
    double2 *d_a, *d_b;
    int errors;
    struct zcopyOpts opts;

    errors = processArgs (argc, argv, &opts);
    if (errors) {
        return EXIT_FAILURE;
    }
    opts.len = (opts.len) ? opts.len : ZCOPY_DEFLEN;
    opts.dev = (opts.dev) ? opts.dev : ZCOPY_DEFDEV;

    /* Trigger CUDA context creation */
    init_start = second();
    CUDA_SAFE_CALL (cudaFree (0));
    init_stop = second();
    printf ("CUDA initialization: %.3f seconds\n", init_stop - init_start);

    /* Select GPU to run on */
    struct cudaDeviceProp props;
    CUDA_SAFE_CALL (cudaSetDevice (opts.dev));
    CUDA_SAFE_CALL (cudaGetDeviceProperties (&props, opts.dev));
    printf ("zcopy: running on device %d (%s)\n", opts.dev, props.name);

    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * opts.len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * opts.len));

    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0x00, sizeof(d_a[0]) * opts.len)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_b, 0xff, sizeof(d_b[0]) * opts.len)); // NaN

    /* Compute execution configuration */
    dim3 dimBlock(ZCOPY_THREADS);
    int threadBlocks = (opts.len + (dimBlock.x - 1)) / dimBlock.x;
    dim3 dimGrid(threadBlocks);
    printf ("zcopy: using vectors of %d double2 elements (= %.3e bytes)\n",
            opts.len, (double)sizeof(d_a[0]) * opts.len);
    printf ("zcopy: using %d threads per block, %d blocks\n",
            dimBlock.x, dimGrid.x);

    mintime = fabs(log(0.0));  // +infinity
    for (int k = 0; k < ZCOPY_ITER; k++) {
        start = second();
        zcopy<<<dimGrid,dimBlock>>>(d_a, d_b, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("zcopy: mintime = %.3f msec; eqv. memory bandwidth = %.2f GB/sec\n",
            1.0e3 * mintime, (2 * sizeof(d_a[0]) * opts.len) / (1e9 * mintime));

    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    return EXIT_SUCCESS;
}
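To build it, something along the lines of

nvcc -o zcopy zcopy.cu

should be sufficient; adding an -arch flag matching your GPU (e.g. -arch=sm_75 for the Quadro RTX 4000) is optional but avoids PTX JIT overhead on the first launch. As in the runs above, -d selects the device and -n the number of double2 elements to copy.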