When I suggested experiments, I meant experiments for measuring performance, not for "proving" correctness.
Your point, as I understood it, was: the description of __restrict__ in the documentation is unnecessarily restrictive, because it does not take into account that in certain special cases aliasing is harmless even when the programmer has told the compiler there will be no aliasing; following the documentation therefore causes CUDA code to leave performance on the table.
My point is: yes, in certain instances aliasing via __restrict__ pointers appears to be harmless. However, in those situations there is also no performance benefit from using __restrict__ pointers. If that is the case, unnecessarily complicating the documentation and deviating from the simple semantics introduced by ISO C99 provides no benefit to anyone and may actually be harmful by confusing programmers.
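To sketch why a kernel like vector addition cannot benefit from the no-aliasing promise (this kernel is just my own illustration, not part of the benchmark below): in vecadd every element of b and c is read exactly once before a[i] is written, so there is no load the compiler could keep in a register or move across the store. In a kernel like the following, where b[i] is used both before and after the store to a[i], the __restrict__ qualifiers may let the compiler keep b[i] in a register and fold the work into a single store; without them it has to assume the store to a[i] could have modified b[i].

__global__ void scale_then_add (float * __restrict__ a,
                                const float * __restrict__ b, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        a[i] = 2.0f * b[i];  // store to a[i]
        a[i] = a[i] + b[i];  // second use of b[i]: without __restrict__ the
                             // compiler must assume the store above may have
                             // changed b[i] and reload it from memory
    }
}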
Here is my experiment: vector addition with and without __restrict__ pointers. No performance difference outside measurement noise level.
GPU1:
vecadd [foo]: operating on vectors of 50000000 float (= 2.000e+008 bytes)
vecadd [foo]: using 128 threads per block, 65520 blocks
vecadd [foo]: mintime = 24.639 msec throughput = 24.35 GB/sec
vecadd [bar]: operating on vectors of 50000000 float (= 2.000e+008 bytes)
vecadd [bar]: using 128 threads per block, 65520 blocks
vecadd [bar]: mintime = 24.645 msec throughput = 24.35 GB/sec
GPU2:
vecadd [foo]: operating on vectors of 50000000 float (= 2.000e+08 bytes)
vecadd [foo]: using 128 threads per block, 65520 blocks
vecadd [foo]: mintime = 1.604 msec throughput = 373.97 GB/sec
vecadd [bar]: operating on vectors of 50000000 float (= 2.000e+08 bytes)
vecadd [bar]: using 128 threads per block, 65520 blocks
vecadd [bar]: mintime = 1.602 msec throughput = 374.41 GB/sec
If I change the kernel calls from {foo|bar}<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, opts.len)
to {foo|bar}<<<dimGrid,dimBlock>>>(d_a, d_b, d_b, opts.len)
to introduce aliasing, I get this on GPU2:
vecadd [foo]: operating on vectors of 50000000 float (= 2.000e+08 bytes)
vecadd [foo]: using 128 threads per block, 65520 blocks
vecadd [foo]: mintime = 1.145 msec throughput = 524.15 GB/sec
vecadd [bar]: operating on vectors of 50000000 float (= 2.000e+08 bytes)
vecadd [bar]: using 128 threads per block, 65520 blocks
vecadd [bar]: mintime = 1.142 msec throughput = 525.39 GB/sec
No difference in performance. (Note that the GB/sec numbers are off because the calculation assumes the code is operating on three vectors, while we are only using two in this case; scaled by 2/3, the actual throughput works out to roughly 350 GB/sec.)
The program:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>  // for memset()
#include <math.h>    // for fabs(), log()
#define VECADD_THREADS 128
#define VECADD_DEFLEN 50000000
#define VECADD_ITER 10 // as in STREAM benchmark
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                              \
do {                                                                      \
    cudaError_t err = call;                                               \
    if (cudaSuccess != err) {                                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(err) );           \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                              \
do {                                                                      \
    /* Check synchronous errors, i.e. pre-launch */                       \
    cudaError_t err = cudaGetLastError();                                 \
    if (cudaSuccess != err) {                                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(err) );           \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */             \
    err = cudaDeviceSynchronize();                                        \
    if (cudaSuccess != err) {                                             \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                 __FILE__, __LINE__, cudaGetErrorString(err) );           \
        exit(EXIT_FAILURE);                                               \
    }                                                                     \
} while (0)
// A routine to give access to a high precision timer on most systems.
#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;
    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif
__global__ void foo (float *a, const float *b, const float *c, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        a[i] = b[i] + c[i];
    }
}
__global__ void bar (float * __restrict__ a, const float * __restrict__ b,
                     const float * __restrict__ c, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
        a[i] = b[i] + c[i];
    }
}
struct vecaddOpts {
    int len;
};
static int processArgs (int argc, char *argv[], struct vecaddOpts *opts)
{
    int error = 0;
    memset (opts, 0, sizeof(*opts));
    while (argc) {
        if (*argv[0] == '-') {
            switch (*(argv[0]+1)) {
            case 'n':
                opts->len = atol(argv[0]+2);
                break;
            default:
                fprintf (stderr, "Unknown switch '%c%s'\n", '-', argv[0]+1);
                error++;
                break;
            }
        }
        argc--;
        argv++;
    }
    return error;
}
int main (int argc, char *argv[])
{
    double start, stop, elapsed, mintime;
    float *d_a = 0, *d_b = 0, *d_c = 0;
    int errors;
    struct vecaddOpts opts;
    errors = processArgs (argc, argv, &opts);
    if (errors) {
        return EXIT_FAILURE;
    }
    opts.len = (opts.len) ? opts.len : VECADD_DEFLEN;
    /* Allocate memory on device */
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * opts.len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * opts.len));
    CUDA_SAFE_CALL (cudaMalloc((void**)&d_c, sizeof(d_c[0]) * opts.len));
    /* Initialize device memory */
    CUDA_SAFE_CALL (cudaMemset(d_a, 0xff, sizeof(d_a[0]) * opts.len)); // NAN
    CUDA_SAFE_CALL (cudaMemset(d_b, 0x00, sizeof(d_b[0]) * opts.len)); // zero
    CUDA_SAFE_CALL (cudaMemset(d_c, 0x00, sizeof(d_c[0]) * opts.len)); // zero
    /* Compute execution configuration */
    dim3 dimBlock(VECADD_THREADS);
    int threadBlocks = (opts.len + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);
    printf ("vecadd [foo]: operating on vectors of %d float (= %.3e bytes)\n",
            opts.len, (double)sizeof(d_a[0]) * opts.len);
    printf ("vecadd [foo]: using %d threads per block, %d blocks\n",
            dimBlock.x, dimGrid.x);
    mintime = fabs(log(0.0)); // initialize mintime to +infinity
    for (int k = 0; k < VECADD_ITER; k++) {
        start = second();
        foo<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("vecadd [foo]: mintime = %.3f msec throughput = %.2f GB/sec\n",
            1.0e3 * mintime, (3 * 1.e-9 * sizeof(d_a[0]) * opts.len) / mintime);
    printf ("\n");
    printf ("vecadd [bar]: operating on vectors of %d float (= %.3e bytes)\n",
            opts.len, (double)sizeof(d_a[0]) * opts.len);
    printf ("vecadd [bar]: using %d threads per block, %d blocks\n",
            dimBlock.x, dimGrid.x);
    mintime = fabs(log(0.0)); // initialize mintime to +infinity
    for (int k = 0; k < VECADD_ITER; k++) {
        start = second();
        bar<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, opts.len);
        CHECK_LAUNCH_ERROR();
        stop = second();
        elapsed = stop - start;
        if (elapsed < mintime) mintime = elapsed;
    }
    printf ("vecadd [bar]: mintime = %.3f msec throughput = %.2f GB/sec\n",
            1.0e3 * mintime, (3 * 1.e-9 * sizeof(d_a[0]) * opts.len) / mintime);
    CUDA_SAFE_CALL (cudaFree(d_a));
    CUDA_SAFE_CALL (cudaFree(d_b));
    CUDA_SAFE_CALL (cudaFree(d_c));
    return EXIT_SUCCESS;
}
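If anyone wants to check the code-generation side of this in addition to the timings: after building the program with a plain nvcc invocation for your GPU's architecture, dumping the machine code with cuobjdump -sass (a standard CUDA toolkit tool) should show whether the __restrict__ qualifiers changed the code generated for bar relative to foo at all.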