It doesn’t seem to, based on my testing, although it certainly makes an improvement. The performance deficit on the CUDA side is eliminated if we supply an appropriate #pragma unroll directive and reverse the order of the parameters of the fmax3 function:
$ cat t1966.cu
#include <cstdlib>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
// compile using
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_cuda_kernel.cu -o test_cuda_kernel
#ifdef OPT
__device__ __forceinline__ long fmax3(long c, long b, long a)
#else
__device__ __forceinline__ long fmax3(long a, long b, long c)
#endif
{
long r = a;
if (b > r)
r = b;
if (c > r)
r = c;
return r;
}
#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((c), max2((b), (a)))
__global__ void test(long* pin, long* pout, long n)
{
long gid = blockIdx.x;
long sum = 0;
#ifdef OPT
#pragma unroll 10
#endif
for (long i = 0; i < n; i++)
{
long idx = gid - n;
long idx2 = idx + 1;
if (idx > 0 && idx2 < gid)
sum = fmax3(sum, pin[idx], pin[idx2]);
}
pout[gid] = sum;
}
struct timeval tnow;
double dtime(){
gettimeofday(&tnow, NULL);
return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}
int main(int argc, char** argv)
{
size_t size = 1024 * 1024 * 100; // 100Mi elements (800 MiB per buffer as long)
long* pin;
long* pout;
cudaMalloc(&pin, size * sizeof(long));
cudaMalloc(&pout, size * sizeof(long));
long n = 1000;
double t0 = dtime();
test<<<size, 1>>>(pin, pout, n);
cudaDeviceSynchronize();
double tf = dtime();
printf("n: %ld Elapsed: %f\n", n, (tf-t0));
return 0;
}
$ nvcc -arch=sm_70 -o t1966 t1966.cu
$ ./t1966
n: 1000 Elapsed: 3.807186
$ nvcc -arch=sm_70 -o t1966 t1966.cu -DOPT
$ ./t1966
n: 1000 Elapsed: 1.170802
$
(Tesla V100, CUDA 11.4, CentOS 7, 470.57.02)
(I don’t know whether this has a similar effect on the cc8.6 or cc7.5 GPUs that the OP is using.)
The selection of the #pragma unroll value seems to be important. I tried values of 4, 5, 6, 8, 10, 12, 13, 14, 15, 20, and 100. Values from 13 through 100 provided no significant benefit. Values from 4 through 8 were almost as good as 10 and 12, which were the best (12 seems a few percent better than 10).
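If you want to repeat that sweep without editing the source each time, one possibility (just a sketch; the UNROLL_FACTOR, STR_, and UNROLL_LOOP names are mine, not anything supplied by the CUDA toolkit) is to route the factor through the standard two-step _Pragma idiom and select it on the nvcc command line:
// added near the top of t1966.cu; pass e.g. -DUNROLL_FACTOR=12, default is 10
#ifndef UNROLL_FACTOR
#define UNROLL_FACTOR 10
#endif
// two-step idiom: the argument is macro-expanded first, then stringized into the pragma text
#define STR_(x) #x
#define UNROLL_LOOP(n) _Pragma(STR_(unroll n))
and in the kernel:
#ifdef OPT
UNROLL_LOOP(UNROLL_FACTOR) // e.g. expands to _Pragma("unroll 12")
#endif
for (long i = 0; i < n; i++)
after which a shell loop does the sweep:
$ for u in 4 8 10 12 16; do nvcc -arch=sm_70 -DOPT -DUNROLL_FACTOR=$u -o t1966 t1966.cu; ./t1966; done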
It might be worthwhile to file a bug for this. I’m not sure what could be done to make this outcome achievable without the additional coaxing, but the OpenCL example certainly suggests it should be possible. There may be several things going on here.
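I haven’t studied the SASS here, but if someone wants to verify what the compiler actually did with the loop, cuobjdump will dump the generated machine code from the executable; counting the global-load instructions in the kernel body is a crude but quick way to gauge the degree of unrolling:
$ nvcc -arch=sm_70 -o t1966 t1966.cu -DOPT
$ cuobjdump -sass ./t1966 | grep -c LDG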
Here’s my equivalent run of the provided OpenCL code on the same machine for reference:
$ cat t1967.cpp
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_opencl_kernel.cpp -l OpenCL -o test_opencl_kernel
#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
using namespace std;
const char* gCLSource =
"#define max2(a,b) ((a)>(b))? (a):(b)\n"
"#define max3(a,b,c) max2((a), max2((b), (c)))\n"
"__kernel void test(\n"
" __global long* pin,\n"
" __global long* pout,\n"
" long n)\n"
"{\n"
" long gid = get_global_id(0);\n"
// "    printf(\"gid: %d\", gid);"
"\n"
"    long sum = 0;\n"
"    for (long i = 0; i < n; i++)\n"
"    {\n"
"        long idx = gid - n;\n"
"        long idx2 = idx + 1;\n"
" if (idx > 0 && idx2 < gid)\n"
" sum = max3(sum, pin[idx], pin[idx2]);\n"
" }\n"
" pout[gid] = sum;\n"
"}\n";
struct timeval tnow;
double dtime()
{
gettimeofday(&tnow, NULL);
return (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6;
}
int main(int argc, char** argv)
{
cl_int err;
cl_uint pnum = 0;
err = clGetPlatformIDs(0, 0, &pnum);
assert(err == CL_SUCCESS);
if (pnum <= 0){printf("No OpenCL platform\n"); exit(-1);}
cl_platform_id pid;
err = clGetPlatformIDs(1, &pid, 0);
size_t plen = 0;
err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, 0, &plen);
assert(err == CL_SUCCESS);
char pname[plen];
err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, plen, pname, 0 );
assert(err == CL_SUCCESS);
printf("OpenCL Platform Name: %s\n", pname);
cl_uint dnum = 0;
err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, 0, &dnum );
assert(err == CL_SUCCESS);
if (dnum <= 0){printf("No OpenCL device\n"); exit(-1);}
cl_device_id did;
err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &did, 0 );
assert(err == CL_SUCCESS);
size_t dlen = 0;
err = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, 0, &dlen );
assert(err == CL_SUCCESS);
char dname[dlen];
err = clGetDeviceInfo(did, CL_DEVICE_NAME, dlen, dname, 0 );
assert(err == CL_SUCCESS);
printf("OpenCL Device Name: %s\n", dname);
cl_context_properties cprops[3];
cprops[0] = CL_CONTEXT_PLATFORM;
cprops[1] = cl_context_properties(pid);
cprops[2] = 0;
cl_context context = clCreateContext(&cprops[0], 1, &did, NULL, 0, &err);
assert(err == CL_SUCCESS);
size_t size = 1024 * 1024 * 100; // 100Mi elements (800 MiB per buffer as long)
cl_mem buf_in = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
assert(err == CL_SUCCESS);
cl_mem buf_out = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
assert(err == CL_SUCCESS);
cl_command_queue_properties prop = {0};
cl_command_queue queue = clCreateCommandQueue(context, did, prop, &err);
assert(err == CL_SUCCESS);
const char* sources[] = {gCLSource};
const size_t sourcesLen[] = {strlen(gCLSource)};
printf("PROGRAM:\n%s\n", gCLSource);
cl_program program = clCreateProgramWithSource(context, 1, sources, sourcesLen, &err);
assert(err == CL_SUCCESS);
const char* options = ""; // "-cl-nv-verbose"
cl_device_id dids[] = {did};
err = clBuildProgram(program, 1, dids, options, NULL, NULL);
assert(err == CL_SUCCESS);
const char* name = "test";
cl_kernel kernel = clCreateKernel(program, name, &err);
assert(err == CL_SUCCESS);
//err = clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, size, dst, 0, NULL, NULL);
//assert(err == CL_SUCCESS);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
assert(err == CL_SUCCESS);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&buf_out);
assert(err == CL_SUCCESS);
long n = 1000;
err = clSetKernelArg(kernel, 2, sizeof(cl_long), (void *)&n);
assert(err == CL_SUCCESS);
size_t wgSize[3] = {1, 1, 1};
size_t gSize[3] = {size, 1, 1};
double t0 = dtime();
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL);
assert(err == CL_SUCCESS);
err = clFinish(queue);
assert(err == CL_SUCCESS);
double tf = dtime();
printf("n: %ld Elapsed: %f\n", n, (tf-t0));
return 0;
}
$ nvcc t1967.cpp -o t1967 -lOpenCL
$ ./t1967
OpenCL Platform Name: NVIDIA CUDA
OpenCL Device Name: Tesla V100-PCIE-32GB
PROGRAM:
#define max2(a,b) ((a)>(b))? (a):(b)
#define max3(a,b,c) max2((a), max2((b), (c)))
__kernel void test(
__global long* pin,
__global long* pout,
long n)
{
long gid = get_global_id(0);
long sum = 0;
for (long i = 0; i < n; i++)
{
long idx = gid - n;
long idx2 = idx + 1;
if (idx > 0 && idx2 < gid)
sum = max3(sum, pin[idx], pin[idx2]);
}
pout[gid] = sum;
}
n: 1000 Elapsed: 1.672007
$
(I did make a change to the supplied OpenCL code to address a compiler warning; however, it has no effect on the observation.)
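As an aside, the timing in both test cases is host-side gettimeofday around the launch plus a synchronize, which is fine here because the kernel dominates. For a device-side cross-check on the CUDA case, a minimal sketch using CUDA events (replacing the dtime() calls in t1966.cu) would look like this:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
test<<<size, 1>>>(pin, pout, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // milliseconds between the two recorded events
printf("n: %ld Elapsed: %f\n", n, ms / 1000.0);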