It doesn’t seem to, based on my testing, although it certainly makes an improvement. The performance deficit on the CUDA side is eliminated if we supply an appropriate #pragma unroll directive and reverse the order of the parameters of the fmax3 function:
$ cat t1966.cu
#include <cstdlib>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
// compile using
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_cuda_kernel.cu -o test_cuda_kernel
#ifdef OPT
__device__ __forceinline__ long fmax3(long c, long b, long a)
#else
__device__ __forceinline__ long fmax3(long a, long b, long c)
#endif
{
long r = a;
if (b > r)
r = b;
if (c > r)
r = c;
return r;
}
#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((c), max2((b), (a)))
__global__ void test(long* pin, long* pout, long n)
{
long gid = blockIdx.x;
long sum = 0;
#ifdef OPT
#pragma unroll 10
#endif
for (long i = 0; i < n; i++)
{
long idx = gid - n;
long idx2 = idx + 1;
if (idx > 0 && idx2 < gid)
sum = fmax3(sum, pin[idx], pin[idx2]);
}
pout[gid] = sum;
}
struct timeval tnow;
double dtime(){
gettimeofday(&tnow, NULL);
return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}
int main(int argc, char** argv)
{
size_t size = 1024 * 1024 * 100; // 100Mi elements (800 MiB per buffer as long)
long* pin;
long* pout;
cudaMalloc(&pin, size * sizeof(long));
cudaMalloc(&pout, size * sizeof(long));
long n = 1000;
double t0 = dtime();
test<<<size, 1>>>(pin, pout, n);
cudaDeviceSynchronize();
double tf = dtime();
printf("n: %ld Elapsed: %f\n", n, (tf-t0));
return 0;
}
$ nvcc -arch=sm_70 -o t1966 t1966.cu
$ ./t1966
n: 1000 Elapsed: 3.807186
$ nvcc -arch=sm_70 -o t1966 t1966.cu -DOPT
$ ./t1966
n: 1000 Elapsed: 1.170802
$
(Tesla V100, CUDA 11.4, CentOS 7, 470.57.02)
(I don’t know whether this has a similar effect on the cc8.6 or cc7.5 GPUs that the OP is using.)
The selection of the #pragma unroll value seems to be important. I tried values of 4, 5, 6, 8, 10, 12, 13, 14, 15, 20, and 100. Values from 13 through 100 provided no significant benefit. Values from 4 through 8 were almost as good as 10 and 12, which were the best (12 seems a few percent better than 10).
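If you want to repeat that sweep without editing the source each time, one possibility (just a sketch; the UNROLL_FACTOR, STR_, and UNROLL_LOOP names are mine, not anything supplied by the CUDA toolkit) is to route the factor through the standard two-step _Pragma idiom and select it on the nvcc command line:
// added near the top of t1966.cu; pass e.g. -DUNROLL_FACTOR=12, default is 10
#ifndef UNROLL_FACTOR
#define UNROLL_FACTOR 10
#endif
// two-step idiom: the argument is macro-expanded first, then stringized into the pragma text
#define STR_(x) #x
#define UNROLL_LOOP(n) _Pragma(STR_(unroll n))
and in the kernel:
#ifdef OPT
UNROLL_LOOP(UNROLL_FACTOR) // e.g. expands to _Pragma("unroll 12")
#endif
for (long i = 0; i < n; i++)
after which a shell loop does the sweep:
$ for u in 4 8 10 12 16; do nvcc -arch=sm_70 -DOPT -DUNROLL_FACTOR=$u -o t1966 t1966.cu; ./t1966; done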
It might be worthwhile to file a bug for this. I’m not sure what could be done to make this outcome achievable without the additional coaxing, but the OpenCL example certainly suggests it should be possible. There may be several things going on here.
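I haven’t studied the SASS here, but if someone wants to verify what the compiler actually did with the loop, cuobjdump will dump the generated machine code from the executable; counting the global-load instructions in the kernel body is a crude but quick way to gauge the degree of unrolling:
$ nvcc -arch=sm_70 -o t1966 t1966.cu -DOPT
$ cuobjdump -sass ./t1966 | grep -c LDG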
Here’s my equivalent run of the provided OpenCL code on the same machine for reference:
$ cat t1967.cpp
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_opencl_kernel.cpp -l OpenCL -o test_opencl_kernel
#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
using namespace std;
const char* gCLSource =
"#define max2(a,b) ((a)>(b))? (a):(b)\n"
"#define max3(a,b,c) max2((a), max2((b), (c)))\n"
"__kernel void test(\n"
" __global long* pin,\n"
" __global long* pout,\n"
" long n)\n"
"{\n"
" long gid = get_global_id(0);\n"
// "    printf(\"gid: %d\", gid);"
"\n"
"    long sum = 0;\n"
"    for (long i = 0; i < n; i++)\n"
"    {\n"
"        long idx = gid - n;\n"
"        long idx2 = idx + 1;\n"
" if (idx > 0 && idx2 < gid)\n"
" sum = max3(sum, pin[idx], pin[idx2]);\n"
" }\n"
" pout[gid] = sum;\n"
"}\n";
struct timeval tnow;
double dtime()
{
gettimeofday(&tnow, NULL);
return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}
int main(int argc, char** argv)
{
cl_int err;
cl_uint pnum = 0;
err = clGetPlatformIDs(0, 0, &pnum);
assert(err == CL_SUCCESS);
if (pnum <= 0){printf("No OpenCL platform\n"); exit(-1);}
cl_platform_id pid;
err = clGetPlatformIDs(1, &pid, 0);
size_t plen = 0;
err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, 0, &plen);
assert(err == CL_SUCCESS);
char pname[plen];
err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, plen, pname, 0 );
assert(err == CL_SUCCESS);
printf("OpenCL Platform Name: %s\n", pname);
cl_uint dnum = 0;
err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, 0, &dnum );
assert(err == CL_SUCCESS);
if (dnum <= 0){printf("No OpenCL device\n"); exit(-1);}
cl_device_id did;
err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &did, 0 );
assert(err == CL_SUCCESS);
size_t dlen = 0;
err = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, 0, &dlen );
assert(err == CL_SUCCESS);
char dname[dlen];
err = clGetDeviceInfo(did, CL_DEVICE_NAME, dlen, dname, 0 );
assert(err == CL_SUCCESS);
printf("OpenCL Device Name: %s\n", dname);
cl_context_properties cprops[3];
cprops[0] = CL_CONTEXT_PLATFORM;
cprops[1] = cl_context_properties(pid);
cprops[2] = 0;
cl_context context = clCreateContext(&cprops[0], 1, &did, NULL, 0, &err);
assert(err == CL_SUCCESS);
size_t size = 1024 * 1024 * 100; // 100Mi elements (800 MiB per buffer as long)
cl_mem buf_in = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
assert(err == CL_SUCCESS);
cl_mem buf_out = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
assert(err == CL_SUCCESS);
cl_command_queue_properties prop = {0};
cl_command_queue queue = clCreateCommandQueue(context, did, prop, &err);
assert(err == CL_SUCCESS);
const char* sources[] = {gCLSource};
const size_t sourcesLen[] = {strlen(gCLSource)};
printf("PROGRAM:\n%s\n", gCLSource);
cl_program program = clCreateProgramWithSource(context, 1, sources, sourcesLen, &err);
assert(err == CL_SUCCESS);
const char* options = ""; // "-cl-nv-verbose"
cl_device_id dids[] = {did};
err = clBuildProgram(program, 1, dids, options, NULL, NULL);
assert(err == CL_SUCCESS);
const char* name = "test";
cl_kernel kernel = clCreateKernel(program, name, &err);
assert(err == CL_SUCCESS);
//err = clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, size, dst, 0, NULL, NULL);
//assert(err == CL_SUCCESS);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
assert(err == CL_SUCCESS);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&buf_out);
assert(err == CL_SUCCESS);
long n = 1000;
err = clSetKernelArg(kernel, 2, sizeof(cl_long), (void *)&n);
assert(err == CL_SUCCESS);
size_t wgSize[3] = {1, 1, 1};
size_t gSize[3] = {size, 1, 1};
double t0 = dtime();
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL);
assert(err == CL_SUCCESS);
err = clFinish(queue);
assert(err == CL_SUCCESS);
double tf = dtime();
printf("n: %ld Elapsed: %f\n", n, (tf-t0));
return 0;
}
$ nvcc t1967.cpp -o t1967 -lOpenCL
$ ./t1967
OpenCL Platform Name: NVIDIA CUDA
OpenCL Device Name: Tesla V100-PCIE-32GB
PROGRAM:
#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((a), max2((b), (c)))
__kernel void test(
__global long* pin,
__global long* pout,
long n)
{
long gid = get_global_id(0);
long sum = 0;
for (long i = 0; i < n; i++)
{
long idx = gid - n;
long idx2 = idx + 1;
if (idx > 0 && idx2 < gid)
sum = max3(sum, pin[idx], pin[idx2]);
}
pout[gid] = sum;
}
n: 1000 Elapsed: 1.672007
$
(I did make a change to the supplied OpenCL code to address a compiler warning; however, it has no effect on the observation.)
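As an aside, the timing in both test cases is host-side gettimeofday around the launch plus a synchronize, which is fine here because the kernel dominates. For a device-side cross-check on the CUDA case, a minimal sketch using CUDA events (replacing the dtime() calls in t1966.cu) would look like this:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
test<<<size, 1>>>(pin, pout, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // milliseconds between the two recorded events
printf("n: %ld Elapsed: %f\n", n, ms / 1000.0);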