Significant speedup of OpenCL vs CUDA

It wasn’t what I was looking for, however. I was looking for -G. I agree it’s not conclusive to look at a snippet of a build “makefile”. Better to see the actual compilation command.

I agree it is best (most conclusive) to see the actual compiler component invocations.

My logic here is that where there is a -g in CUDA compilation, a -G is typically right behind it. And I don’t think I have ever encountered a really large performance difference like this without an inadvertent comparison of debug with release build.

FWIW, I am not sure what the -O4 is supposed to do. Best I know, that is not something nvcc can make use of. Maybe -Xcompiler -O4 was intended?

I included some of your suggestions, still the same behaviour.
Here is the actual invocation

/usr/local/cuda/bin/nvcc -Xcompiler -O4 -g -gencode arch=compute_86,code=sm_86 -use_fast_math -I . -c app.cpp -o app.o

/usr/local/cuda/bin/nvcc -Xcompiler -O4 -g -gencode arch=compute_86,code=sm_86 -use_fast_math app.o -o app

The data provided so far does not make sense to me. I don’t have a mental model that would explain the observation. Maybe Robert Crovella has some additional ideas.

What happens if you remove -g?
What happens if you change -Xcompiler -O4 to -Xcompiler -O3 and -Xcompiler -O2?

Same behaviour.
I understand it’s hard to tell without looking at code.
I will try to replicate the behaviour with an example I can share…
It will take me some days.

Well, at least that confirms that the performance of host code is highly unlikely to play any role in this issue, and that it all comes down to what is happening on the GPU.

That couldn’t possibly contain any CUDA device code.

Sorry, it’s app.cu… I avoided writing the original file names.

I think there is a decent chance you will discover the root cause for your observation as you work on isolating and reducing the relevant code into a minimal self-contained example.

The intriguing part for me is that the code (based on the data made available) is apparently bottlenecked on integer-intensive computation. Other than the issue of integer types that I raised (GPUs are 32-bit architectures and 64-bit integer operations are emulated, so gratuitous 64-bit integer computation should be avoided) I cannot think of anything in that area that would be affected by OpenCL / CUDA differences.

It looks like I’ve been able to replicate the behaviour in a much simpler kernel.
Find it below…

I started doing accumulation over a range of values, then selecting the max using a max2 macro, and finally using a max3 macro.

CUDA is typically faster than OpenCL … until I use the max3 macro; then it turns out to be much slower than OpenCL.

On an RTX 3080 the execution times I get are 2.5 s (OpenCL) vs 7.4 s (CUDA).

#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>


// compile using 
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_cuda_kernel.cu -o test_cuda_kernel

#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((a), max2((b), (c)))

__global__  void test(long* pin, long* pout, long n)
{
   long gid = blockIdx.x;

   long sum = 0;
   for (long i = 0; i < n; i++)
   {
       long idx = gid - n;
       long idx2 = idx +1;
       if (idx > 0 && idx2 < gid)
           sum = max3(sum, pin[idx], pin[idx2]);
   }
   pout[gid] = sum;
}

struct timeval tnow;

double dtime(){
    gettimeofday(&tnow, NULL);
    return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;}

int main(int argc, char** argv)
{
    size_t size = 1024 * 1024 * 100;  // 100 Mi elements (800 MiB per buffer)
    long* pin;
    long* pout;    
    cudaMalloc(&pin,  size * sizeof(long));
    cudaMalloc(&pout, size * sizeof(long));
    long n = 1000;    
    double t0 = dtime();
    test<<<size, 1>>>(pin, pout, n);    
    cudaDeviceSynchronize();            
    double tf = dtime();
    printf("n: %ld Elapsed: %f\n", n, (tf-t0));
    
    return 0;
}

Surprisingly, using the following inlined function (fmax3) instead of the macro reduces the execution time to 4.9 s (CUDA).

__device__ __forceinline__ long fmax3(long a, long b, long c)
{
    long r = a;
    if (b > r)
        r = b;
    if (c > r)
        r = c;
    return r;
}

These behaviors seem strange to me. Maybe I’m missing something. I don’t think possible differences in the long type (OpenCL vs CUDA) can be the reason, as they had no effect at all when using a single max2 macro. Does anyone have a reasonable explanation?

FWIW this:

test<<<size, 1>>>

is not the way to write performant CUDA code.
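For context, a minimal sketch (editorial, not from the original posts) of the more conventional launch configuration: one thread per block leaves 31 of the 32 lanes in every warp idle, so the usual pattern is many threads per block with a global index derived from block and thread indices. The kernel name `test_blocked` and the extra `size` parameter are hypothetical:

```cuda
#include <cuda_runtime.h>

// Hypothetical rework: same kernel body as 'test', but launched with 256
// threads per block, so the global index must be derived from block and
// thread indices. The 'size' parameter guards the partially filled tail block.
__global__ void test_blocked(long* pin, long* pout, long n, long size)
{
    long gid = (long)blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= size) return;

    long sum = 0;
    for (long i = 0; i < n; i++)
    {
        long idx  = gid - n;
        long idx2 = idx + 1;
        if (idx > 0 && idx2 < gid)
        {
            if (pin[idx]  > sum) sum = pin[idx];
            if (pin[idx2] > sum) sum = pin[idx2];
        }
    }
    pout[gid] = sum;
}

// Launch with, e.g.:
//   int block = 256;
//   test_blocked<<<(size + block - 1) / block, block>>>(pin, pout, n, size);
```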

Regarding this:

According to my testing the max3 and fmax3 test cases are equivalent performance-wise if I define the max3 macro like this:

#define max3(a,b,c) max2((c), max2((b), (a)))

(I’m not suggesting that is a detailed, in-depth explanation, mostly just an observation.)

If I were looking for insight into the OpenCL/CUDA perf difference, I would need to see the equivalent OpenCL test case also.

There aren’t any differences in the long datatype on linux between CUDA and OpenCL that I am aware of.

Friendly but urgent advice: Do not use long in your CUDA programs, ever.

CUDA maintains compatibility of types between host and device. This allows straightforward creation of __host__ __device__ code. On 64-bit Windows platforms, long is a 32-bit data type. On 64-bit Linux platforms, long is a 64-bit data type. Nasty surprises are almost guaranteed if CUDA code using long needs to be moved between Windows and Linux platforms. Use int and long long int (and their unsigned counterparts) as appropriate.

Per the OpenCL 1.2 specification (6.1.1 Built-in Scalar Data Types), long is always a 64-bit type. OpenCL has its own type system that is independent of the host platform.

General note on performance: Because GPUs use 32-bit architectures (with minimal extensions to handle 64-bit addressing), one would not want to use 64-bit integers unless one has to. All 64-bit integer operations with the exception of conversions from and to floating-point types are emulated.

I do not have time to dive into the details of code generation right now. From a first glance, using an older version of CUDA (and thus the CUDA toolchain), the two variants, max3() macro and fmax3() inline function, generate quite different code and the code generated from the macro looks like … worthy of improvement. It would probably be worthwhile to perform a detailed comparison using the latest CUDA toolchain, and (depending on the result), pass this to the CUDA compiler team as an enhancement request.

Since max() is a built-in function in CUDA equivalent to std::max in regular C++, I would suggest coding:

#define max3(a,b,c) max(max(a,b),c)

If the pin[] are restricted in range (e.g. always positive), it might be possible to craft a custom version of max() that is faster than the built-in function. The following general-purpose implementation may also be worth trying:

__device__ long long int my_max (long long int a, long long int b)
{
    return ((a ^ b) & (- (a >= b))) ^ b;
}

[Later:]

Using a recent toolchain, I don’t see any inefficiency in the generated code when using CUDA’s built-in max() for max3(). I tried both int and long long int for pin, pout.


You are right, the difference between 4.9 and 7.4 is caused by the order of the comparisons.

About the block size, I’ve tried to remove grid issues from the comparison.
I assume OpenCL’s “workgroup size” is equivalent to CUDA’s “block size”.

Here you have the OpenCL code

// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_opencl_kernel.cpp -l OpenCL -o test_opencl_kernel

#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

using namespace std;

const char* gCLSource = 
"#define max2(a,b) ((a)>(b))? (a):(b)\n"
"#define max3(a,b,c) max2((a), max2((b), (c)))\n"
"__kernel void test(\n"
"   __global long* pin,\n"
"   __global long* pout,\n"
"   long n)\n"
"{\n"
"   long gid = get_global_id(0);\n"
// "   printf(\"gid: %d\", gid);"
"\n"
"   long sum = 0;\n"
"   for (long i = 0; i < n; i++)\n"
"   {\n"
"       long idx = gid - n;\n"
"       long idx2 = idx +1;"
"       if (idx > 0 && idx2 < gid)\n"
"           sum = max3(sum, pin[idx], pin[idx2]);\n"
"   }\n"
"   pout[gid] = sum;\n"
"}\n";

struct timeval tnow;

double dtime()
{
    gettimeofday(&tnow, NULL);
    return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}

/*
 * 
 */
int main(int argc, char** argv)
{
    cl_int err;
    cl_uint pnum = 0;
    err = clGetPlatformIDs(0, 0, &pnum);
    assert(err == CL_SUCCESS);

    if (pnum <= 0){printf("No OpenCL platform\n"); exit(-1);}
    
    cl_platform_id pid;
    err = clGetPlatformIDs(1, &pid, 0);
    size_t plen = 0;
    err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, 0, &plen);
    assert(err == CL_SUCCESS);

    char pname[plen];
    err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, plen, pname, 0 );
    assert(err == CL_SUCCESS);

    printf("OpenCL Platform Name: %s\n", pname);
    

    cl_uint dnum = 0;
    err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, 0, &dnum );
    assert(err == CL_SUCCESS);

    if (dnum <= 0){printf("No OpenCL device\n"); exit(-1);}
    
    cl_device_id did;

    err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &did, 0 );
    assert(err == CL_SUCCESS);

    size_t dlen = 0;
    err = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, 0, &dlen );
    assert(err == CL_SUCCESS);
    
    char dname[dlen];
    err = clGetDeviceInfo(did, CL_DEVICE_NAME, dlen, dname, 0 );
    assert(err == CL_SUCCESS);
    
    printf("OpenCL Device Name: %s\n", dname);
    
    cl_context_properties cprops[3];
    cprops[0] = CL_CONTEXT_PLATFORM;
    cprops[1] = cl_context_properties(pid);
    cprops[2] = 0;
    
    cl_context context = clCreateContext(&cprops[0], 1, &did, NULL, 0, &err);
    assert(err == CL_SUCCESS);

    
    size_t size = 1024 * 1024 * 100;  // 100 Mi elements (800 MiB per buffer)
    
    cl_mem buf_in = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
    assert(err == CL_SUCCESS);
   
    cl_mem buf_out = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
    assert(err == CL_SUCCESS);

    cl_command_queue queue = clCreateCommandQueue(context, did, NULL, &err);
    assert(err == CL_SUCCESS);

    const char* sources[] = {gCLSource};
    const size_t sourcesLen[] = {strlen(gCLSource)};
    
    printf("PROGRAM:\n%s\n", gCLSource);
    
    cl_program program = clCreateProgramWithSource(context, 1, sources, sourcesLen, &err);
    assert(err == CL_SUCCESS);

    const char* options = ""; // -cl-nv-verbose"; 

    cl_device_id dids[] = {did};
    
    err = clBuildProgram(program, 1, dids, options, NULL, NULL);
    assert(err == CL_SUCCESS);

    const char* name = "test";
    cl_kernel kernel = clCreateKernel(program, name, &err);
    assert(err == CL_SUCCESS);

    //err = clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, size, dst, 0, NULL, NULL);
    //assert(err == CL_SUCCESS);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
    assert(err == CL_SUCCESS);
    
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&buf_out);
    assert(err == CL_SUCCESS);

    long n = 1000;
    err = clSetKernelArg(kernel, 2, sizeof(cl_long), (void *)&n);
    assert(err == CL_SUCCESS);

    size_t wgSize[3] = {1, 1, 1};
    size_t gSize[3] = {size, 1, 1};

    double t0 = dtime();
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL);
    assert(err == CL_SUCCESS);

    err = clFinish(queue);
    assert(err == CL_SUCCESS);

    double tf = dtime();
    
    printf("n: %ld Elapsed: %f\n", n, (tf-t0));
    
    
    return 0;
}

It would be best to avoid explicit comparisons and use built-in function max() instead. From the OpenCL 1.2 spec (6.12.3 Integer Functions), this is also available in OpenCL:

gentype max (gentype x, gentype y)

Sure, but the question here is “why does the same kernel code take more time in CUDA than in OpenCL?”

Are you suggesting that the OpenCL compiler is able to infer the use of the built-in max function while the CUDA compiler is not?

Maybe. I do not know the internals of either compiler. Any number of compiler-internal code transformations could be an underlying cause, idiom recognition being just one of them. Knowing a little bit of the genesis of the two compilers (from my time working at NVIDIA from 2003 to 2014) I would not be surprised if there are numerous differences between the two compilers at this level of detail.

If you deem the code generation of the CUDA compiler sub-optimal in any particular instance, a good way to address this is to file an RFE (enhancement request) with NVIDIA through the bug reporting mechanism, pointing out the machine code that is currently being generated and indicating what should be generated instead.

My assumption here was that, orthogonal to any potential code generation issues, there is also a practical interest in getting the highest performing code for the research paper, as soon as possible, as it seemed like you are gearing up for publication.

Does addressing the max3()-issue remove the entire observed 7x performance difference between OpenCL and CUDA, or are there additional issues?

It doesn’t seem to, according to my testing. It certainly makes an improvement, however.

According to my testing, the performance deficit on the CUDA side is eliminated if we supply an appropriate pragma unroll directive, and modify the order of parameters of the fmax3 function:

$ cat t1966.cu
#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>


// compile using
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_cuda_kernel.cu -o test_cuda_kernel
#ifdef OPT
__device__ __forceinline__ long fmax3(long c, long b, long a)
#else
__device__ __forceinline__ long fmax3(long a, long b, long c)
#endif
{
    long r = a;
    if (b > r)
        r = b;
    if (c > r)
        r = c;
    return r;
}

#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((c), max2((b), (a)))

__global__  void test(long* pin, long* pout, long n)
{
   long gid = blockIdx.x;

   long sum = 0;
#ifdef OPT
#pragma unroll 10
#endif
   for (long i = 0; i < n; i++)
   {
       long idx = gid - n;
       long idx2 = idx +1;
       if (idx > 0 && idx2 < gid)
           sum = fmax3(sum, pin[idx], pin[idx2]);
   }
   pout[gid] = sum;
}

struct timeval tnow;

double dtime(){
    gettimeofday(&tnow, NULL);
    return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;}

int main(int argc, char** argv)
{
    size_t size = 1024 * 1024 * 100;  // 100 Mi elements (800 MiB per buffer)
    long* pin;
    long* pout;
    cudaMalloc(&pin,  size * sizeof(long));
    cudaMalloc(&pout, size * sizeof(long));
    long n = 1000;
    double t0 = dtime();
    test<<<size, 1>>>(pin, pout, n);
    cudaDeviceSynchronize();
    double tf = dtime();
    printf("n: %ld Ellapsed: %f\n", n, (tf-t0));

    return 0;
}
$ nvcc -arch=sm_70 -o t1966 t1966.cu
$ ./t1966
n: 1000 Ellapsed: 3.807186
$ nvcc -arch=sm_70 -o t1966 t1966.cu -DOPT
$ ./t1966
n: 1000 Ellapsed: 1.170802
$

(Tesla V100, CUDA 11.4, CentOS 7, 470.57.02)
(I don’t know if this has a similar effect on the cc8.6 or cc7.5 GPUs that OP is using.)

The selection of the pragma unroll value seems to be important. I tried values of 4, 5, 6, 8, 10, 12, 13, 14, 15, 20, and 100. 13 - 100 provided no significant benefit. 4 - 8 are almost as good as 10 and 12, which were the best (12 seems a few percent better than 10).

It might be worthwhile to file a bug for this. I’m not sure what could be done to make this outcome achievable without this additional coaxing, but the OpenCL example certainly makes it seem possible. There may be several things going on here.

Here’s my equivalent run of the provided OpenCL code on the same machine for reference:

$ cat t1967.cpp
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_opencl_kernel.cpp -l OpenCL -o test_opencl_kernel

#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

using namespace std;

const char* gCLSource =
"#define max2(a,b) ((a)>(b))? (a):(b)\n"
"#define max3(a,b,c) max2((a), max2((b), (c)))\n"
"__kernel void test(\n"
"   __global long* pin,\n"
"   __global long* pout,\n"
"   long n)\n"
"{\n"
"   long gid = get_global_id(0);\n"
// "   printf(\"gid: %d\", gid);"
"\n"
"   long sum = 0;\n"
"   for (long i = 0; i < n; i++)\n"
"   {\n"
"       long idx = gid - n;\n"
"       long idx2 = idx +1;"
"       if (idx > 0 && idx2 < gid)\n"
"           sum = max3(sum, pin[idx], pin[idx2]);\n"
"   }\n"
"   pout[gid] = sum;\n"
"}\n";

struct timeval tnow;

double dtime()
{
    gettimeofday(&tnow, NULL);
    return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}

/*
 *
 */
int main(int argc, char** argv)
{
    cl_int err;
    cl_uint pnum = 0;
    err = clGetPlatformIDs(0, 0, &pnum);
    assert(err == CL_SUCCESS);

    if (pnum <= 0){printf("No OpenCL platform\n"); exit(-1);}

    cl_platform_id pid;
    err = clGetPlatformIDs(1, &pid, 0);
    size_t plen = 0;
    err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, 0, &plen);
    assert(err == CL_SUCCESS);

    char pname[plen];
    err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, plen, pname, 0 );
    assert(err == CL_SUCCESS);

    printf("OpenCL Platform Name: %s\n", pname);


    cl_uint dnum = 0;
    err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, 0, &dnum );
    assert(err == CL_SUCCESS);

    if (dnum <= 0){printf("No OpenCL device\n"); exit(-1);}

    cl_device_id did;

    err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &did, 0 );
    assert(err == CL_SUCCESS);

    size_t dlen = 0;
    err = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, 0, &dlen );
    assert(err == CL_SUCCESS);

    char dname[dlen];
    err = clGetDeviceInfo(did, CL_DEVICE_NAME, dlen, dname, 0 );
    assert(err == CL_SUCCESS);

    printf("OpenCL Device Name: %s\n", dname);

    cl_context_properties cprops[3];
    cprops[0] = CL_CONTEXT_PLATFORM;
    cprops[1] = cl_context_properties(pid);
    cprops[2] = 0;

    cl_context context = clCreateContext(&cprops[0], 1, &did, NULL, 0, &err);
    assert(err == CL_SUCCESS);


    size_t size = 1024 * 1024 * 100;  // 100 Mi elements (800 MiB per buffer)

    cl_mem buf_in = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
    assert(err == CL_SUCCESS);

    cl_mem buf_out = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
    assert(err == CL_SUCCESS);
    cl_command_queue_properties prop = {0};
    cl_command_queue queue = clCreateCommandQueue(context, did, prop, &err);
    assert(err == CL_SUCCESS);

    const char* sources[] = {gCLSource};
    const size_t sourcesLen[] = {strlen(gCLSource)};

    printf("PROGRAM:\n%s\n", gCLSource);

    cl_program program = clCreateProgramWithSource(context, 1, sources, sourcesLen, &err);
    assert(err == CL_SUCCESS);

    const char* options = ""; // -cl-nv-verbose";

    cl_device_id dids[] = {did};

    err = clBuildProgram(program, 1, dids, options, NULL, NULL);
    assert(err == CL_SUCCESS);

    const char* name = "test";
    cl_kernel kernel = clCreateKernel(program, name, &err);
    assert(err == CL_SUCCESS);

    //err = clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, size, dst, 0, NULL, NULL);
    //assert(err == CL_SUCCESS);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
    assert(err == CL_SUCCESS);

    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&buf_out);
    assert(err == CL_SUCCESS);

    long n = 1000;
    err = clSetKernelArg(kernel, 2, sizeof(cl_long), (void *)&n);
    assert(err == CL_SUCCESS);

    size_t wgSize[3] = {1, 1, 1};
    size_t gSize[3] = {size, 1, 1};

    double t0 = dtime();
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL);
    assert(err == CL_SUCCESS);

    err = clFinish(queue);
    assert(err == CL_SUCCESS);

    double tf = dtime();

    printf("n: %ld Ellapsed: %f\n", n, (tf-t0));


    return 0;
}
$ nvcc t1967.cpp -o t1967 -lOpenCL
$ ./t1967
OpenCL Platform Name: NVIDIA CUDA
OpenCL Device Name: Tesla V100-PCIE-32GB
PROGRAM:
#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((a), max2((b), (c)))
__kernel void test(
   __global long* pin,
   __global long* pout,
   long n)
{
   long gid = get_global_id(0);

   long sum = 0;
   for (long i = 0; i < n; i++)
   {
       long idx = gid - n;
       long idx2 = idx +1;       if (idx > 0 && idx2 < gid)
           sum = max3(sum, pin[idx], pin[idx2]);
   }
   pout[gid] = sum;
}

n: 1000 Ellapsed: 1.672007
$

(I did make a change to the supplied OpenCL code to address a compiler warning, however it has no effect on the observation.)


I noticed what I considered unusually massive unrolling in the code generated by the CUDA compiler and was wondering whether that was helpful or harmful.

In the past the CUDA compiler contained multiple “unrollers” which sometimes were in conflict with each other, producing weird code-generation artifacts. I could imagine something like that playing a role here, but speculating about it without analyzing it in detail is not beneficial.

Given the not insignificant performance impact, having the compiler team root cause this would appear worthwhile. That requires filing a bug report as a starting point.

What about the block size? Could it be possible that the OpenCL compiler automatically ends up using a block size greater than 1, even though the specified “workgroup size” (in OpenCL terms) is 1?

I think the short answer is no.

Going beyond that, if you meant simply increasing the workgroup size without making any other changes, what purpose would that serve? There is no reason to think that alone makes code run faster. Also, it could easily break code, since it would increase the total number of threads (work-items, in OpenCL speak) being launched.

If you meant increasing the workgroup size and decreasing the grid size or whatever OpenCL calls it, a few points.

First of all, you don’t trust the language specifications? Where does it say that a compiler is allowed to do that?

Second, this could also break indexing calculations. Probably not a calculation based on get_global_id but that is not the only way to do indexing calculations in OpenCL. That is a convenience function.

It doesn’t seem like we need to speculate that OpenCL is doing something incredible. We already have a data point that shows that the CUDA compiler can be coaxed into a competitive position.
