At least in CUDA, the first time you launch a particular kernel, there may be some overheads. I don’t think they would normally amount to milliseconds, but I haven’t surveyed the space much.
What I observe with a simple OpenCL test case (derived from the code here) is a somewhat longer first-time kernel launch overhead of ~150 microseconds followed by subsequent launches in the range I would expect, around 15 microseconds:
# cat t3.cpp
#include <CL/opencl.h>
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
const char source[] =
"__kernel void test_rotate(__global ulong *d_count, ulong loops, ulong patt)"
"{"
" ulong n = patt;"
" for (ulong i = 0; i<loops; i++)"
" n &= (107 << (patt+(i%7)));"
" d_count[0] = n + loops;"
"}"
;
int main(int argc, char *argv[])
{
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue1, queue2;
cl_program program;
cl_mem mem1, mem2;
cl_kernel kernel;
bool two_kernels = false;
unsigned long long loops = 1000;
if (argc > 1) loops *= atoi(argv[1]);
if (argc > 2) two_kernels = true;
if (two_kernels) printf("running two kernels\n");
else printf("running one kernel\n");
printf("running %lu loops\n", loops);
unsigned long long pattern = 1;
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
queue2 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
const char *sources[1] = {source};
program = clCreateProgramWithSource(context, 1, sources, NULL, NULL);
clBuildProgram(program, 1, &device, NULL, NULL, NULL);
mem1 = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(cl_ulong), NULL, NULL);
mem2 = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(cl_ulong), NULL, NULL);
kernel = clCreateKernel(program, "test_rotate", NULL);
const size_t work_size[1] = {1};
clSetKernelArg(kernel, 0, sizeof(mem1), &mem1);
clSetKernelArg(kernel, 1, sizeof(loops), &loops);
clSetKernelArg(kernel, 2, sizeof(pattern), &pattern);
unsigned long long dt = dtime_usec(0);
clEnqueueNDRangeKernel(queue1, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
dt = dtime_usec(dt);
std::cout << "enqueue time: " << dt << " microseconds" << std::endl;
if (two_kernels){
clSetKernelArg(kernel, 0, sizeof(mem2), &mem2);
clSetKernelArg(kernel, 1, sizeof(loops), &loops);
clSetKernelArg(kernel, 2, sizeof(pattern), &pattern);
clEnqueueNDRangeKernel(queue2, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
}
cl_ulong *buf1 = (cl_ulong *)clEnqueueMapBuffer(queue1, mem1, true, CL_MAP_READ, 0, 1*sizeof(cl_ulong), 0, NULL, NULL, NULL);
cl_ulong *buf2 = (cl_ulong *)clEnqueueMapBuffer(queue2, mem2, true, CL_MAP_READ, 0, 1*sizeof(cl_ulong), 0, NULL, NULL, NULL);
printf("result1: %lu\n", buf1[0]);
printf("result2: %lu\n", buf2[0]);
dt = dtime_usec(0);
clEnqueueNDRangeKernel(queue1, kernel, 1, NULL, work_size, work_size, 0, NULL, NULL);
dt = dtime_usec(dt);
std::cout << "enqueue time: " << dt << " microseconds" << std::endl;
clEnqueueUnmapMemObject(queue1, mem1, buf1, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue2, mem2, buf2, 0, NULL, NULL);
return 0;
}
# nvcc -o t3 t3.cpp -lOpenCL
In file included from /usr/local/cuda/bin/../targets/x86_64-linux/include/CL/opencl.h:24,
from t3.cpp:1:
/usr/local/cuda/bin/../targets/x86_64-linux/include/CL/cl.h:26:104: note: ‘#pragma message: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)’
26 | _TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)")
| ^
t3.cpp: In function ‘int main(int, char**)’:
t3.cpp:44:22: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 2 has type ‘long long unsigned int’ [-Wformat=]
44 | printf("running %lu loops\n", loops);
| ~~^ ~~~~~
| | |
| | long long unsigned int
| long unsigned int
| %llu
t3.cpp:49:32: warning: ‘_cl_command_queue* clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int*)’ is deprecated [-Wdeprecated-declarations]
49 | queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
| ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/local/cuda/bin/../targets/x86_64-linux/include/CL/opencl.h:24,
from t3.cpp:1:
/usr/local/cuda/bin/../targets/x86_64-linux/include/CL/cl.h:1980:1: note: declared here
1980 | clCreateCommandQueue(cl_context context,
| ^~~~~~~~~~~~~~~~~~~~
t3.cpp:50:32: warning: ‘_cl_command_queue* clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int*)’ is deprecated [-Wdeprecated-declarations]
50 | queue2 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
| ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/local/cuda/bin/../targets/x86_64-linux/include/CL/opencl.h:24,
from t3.cpp:1:
/usr/local/cuda/bin/../targets/x86_64-linux/include/CL/cl.h:1980:1: note: declared here
1980 | clCreateCommandQueue(cl_context context,
| ^~~~~~~~~~~~~~~~~~~~
# ./t3
running one kernel
running 1000 loops
enqueue time: 144 microseconds
result1: 1000
result2: 0
enqueue time: 13 microseconds
#
(CUDA 12.2, driver 535.86.10, L4 GPU)