You are right, the difference between 4.9 and 7.4 is caused by the order of the comparisons.
About the block size, I’ve tried to remove grid issues from the comparison.
I assume OpenCL’s “workgroup size” is equivalent to CUDA’s “block size”.
Here is the OpenCL code:
// /usr/local/cuda/bin/nvcc -g -Xcompiler -O3 -gencode arch=compute_86,code=sm_86 -use_fast_math test_opencl_kernel.cpp -l OpenCL -o test_opencl_kernel
#include <cstdlib>
#include <assert.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
using namespace std;
/*
 * OpenCL C source of the benchmark kernel "test".
 *
 * Each work-item gid loops n times; on every iteration it folds
 * pin[gid - n] and pin[gid - n + 1] into a running maximum (only when
 * gid - n > 0 and gid - n + 1 < gid) and finally stores the result in
 * pout[gid]. The indices do not depend on the loop counter, so every
 * iteration of a given work-item reads the same two elements.
 *
 * max2 is wrapped in outer parentheses so that it expands safely inside
 * any larger expression.
 */
const char* gCLSource =
"#define max2(a,b) (((a)>(b))? (a):(b))\n"
"#define max3(a,b,c) max2((a), max2((b), (c)))\n"
"__kernel void test(\n"
" __global long* pin,\n"
" __global long* pout,\n"
" long n)\n"
"{\n"
" long gid = get_global_id(0);\n"
// Debug aid, disabled: " printf(\"gid: %ld\\n\", gid);\n"
"\n"
" long sum = 0;\n"
" for (long i = 0; i < n; i++)\n"
" {\n"
" long idx = gid - n;\n"
" long idx2 = idx +1;\n"
" if (idx > 0 && idx2 < gid)\n"
" sum = max3(sum, pin[idx], pin[idx2]);\n"
" }\n"
" pout[gid] = sum;\n"
"}\n";
/* Most recent reading taken by dtime(). */
struct timeval tnow;

/*
 * Returns the time of day reported by gettimeofday() as a single double,
 * in seconds, with the microsecond field folded in as the fraction.
 * The raw reading is left in the file-scope variable tnow.
 */
double dtime()
{
    gettimeofday(&tnow, NULL);

    const double whole_seconds = (double)tnow.tv_sec;
    const double microseconds = (double)tnow.tv_usec;

    return whole_seconds + microseconds * 1.0e-6;
}
/*
*
*/
int main(int argc, char** argv)
{
cl_int err;
cl_uint pnum = 0;
err = clGetPlatformIDs(0, 0, &pnum);
assert(err == CL_SUCCESS);
if (pnum <= 0){printf("No OpenCL platform\n"); exit(-1);}
cl_platform_id pid;
err = clGetPlatformIDs(1, &pid, 0);
size_t plen = 0;
err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, 0, &plen);
assert(err == CL_SUCCESS);
char pname[plen];
err = clGetPlatformInfo(pid, CL_PLATFORM_NAME, plen, pname, 0 );
assert(err == CL_SUCCESS);
printf("OpenCL Platform Name: %s\n", pname);
cl_uint dnum = 0;
err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, 0, &dnum );
assert(err == CL_SUCCESS);
if (dnum <= 0){printf("No OpenCL device\n"); exit(-1);}
cl_device_id did;
err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &did, 0 );
assert(err == CL_SUCCESS);
size_t dlen = 0;
err = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, 0, &dlen );
assert(err == CL_SUCCESS);
char dname[dlen];
err = clGetDeviceInfo(did, CL_DEVICE_NAME, dlen, dname, 0 );
assert(err == CL_SUCCESS);
printf("OpenCL Device Name: %s\n", dname);
cl_context_properties cprops[3];
cprops[0] = CL_CONTEXT_PLATFORM;
cprops[1] = cl_context_properties(pid);
cprops[2] = 0;
cl_context context = clCreateContext(&cprops[0], 1, &did, NULL, 0, &err);
assert(err == CL_SUCCESS);
size_t size = 1024 * 1024 * 100; // 1 GiB
cl_mem buf_in = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
assert(err == CL_SUCCESS);
cl_mem buf_out = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(long), NULL, &err);
assert(err == CL_SUCCESS);
cl_command_queue queue = clCreateCommandQueue(context, did, NULL, &err);
assert(err == CL_SUCCESS);
const char* sources[] = {gCLSource};
const size_t sourcesLen[] = {strlen(gCLSource)};
printf("PROGRAM:\n%s\n", gCLSource);
cl_program program = clCreateProgramWithSource(context, 1, sources, sourcesLen, &err);
assert(err == CL_SUCCESS);
const char* options = ""; // -cl-nv-verbose";
cl_device_id dids[] = {did};
err = clBuildProgram(program, 1, dids, options, NULL, NULL);
assert(err == CL_SUCCESS);
const char* name = "test";
cl_kernel kernel = clCreateKernel(program, name, &err);
assert(err == CL_SUCCESS);
//err = clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, size, dst, 0, NULL, NULL);
//assert(err == CL_SUCCESS);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
assert(err == CL_SUCCESS);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&buf_out);
assert(err == CL_SUCCESS);
long n = 1000;
err = clSetKernelArg(kernel, 2, sizeof(cl_long), (void *)&n);
assert(err == CL_SUCCESS);
size_t wgSize[3] = {1, 1, 1};
size_t gSize[3] = {size, 1, 1};
double t0 = dtime();
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL);
assert(err == CL_SUCCESS);
err = clFinish(queue);
assert(err == CL_SUCCESS);
double tf = dtime();
printf("n: %ld Ellapsed: %f\n", n, (tf-t0));
return 0;
}