When I do clSetEventCallback
with CL_COMPLETE
on an event returned by clEnqueueNDRangeKernel
the GPU stops generating callbacks after four iterations.
The code example below illustrates the problem. It works as expected on AMD and Intel GPUs but on Nvidia GPUs the program freezes after four iterations (no more callbacks are received). If I instead of calling clEnqueueNDRangeKernel
do clEnqueueReadBuffer
it works as expected on Nvidia GPUs as well.
If I flush the command queue just before or just after the I set the callback, all callbacks are received; however, the runtime is approximately 10x slower on Nvidia GPUs compared to GPUs from AMD or Intel.
Does anybody have an idea as to why I see this behavior? Any help will be appreciated.
I have seen this problem on all Nvidia GPU I have tested on Windows 10. Currently I use:
Name: NVIDIA RTX A4000
Vendor: NVIDIA Corporation
Device OpenCL C version: OpenCL C 1.2
Driver version: 516.25
Profile: FULL_PROFILE
Version: OpenCL 3.0 CUDA
#include <atomic>
#include <iostream>
#include <vector>
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/opencl.hpp>
static constexpr auto clSource{ R"CLC(
__kernel void copy(__global uchar *i, __global uchar *o)
{
const int gid = get_global_id(0);
o[gid] = i[gid];
}
)CLC" };
int main()
{
constexpr auto vendor = "NVIDIA";
//constexpr auto vendor = "AMD";
//constexpr auto vendor = "Intel";
std::cout << "OpenCL Callback Test\n\n";
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
auto platform = find_if(begin(platforms), end(platforms), [](const auto& platform) { return platform.getInfo<CL_PLATFORM_NAME>().starts_with(vendor); });
std::cout << "Selected platform: " << platform->getInfo<CL_PLATFORM_NAME>() << "\n";
std::vector<cl::Device> devices;
platform->getDevices(CL_DEVICE_TYPE_GPU, &devices);
auto device = devices.front();
std::cout << "Selected device: " << device.getInfo<CL_DEVICE_NAME>() << "\n\n";
std::vector<cl::Device> contextDevices{ device };
cl_context_properties props[3]{ CL_CONTEXT_PLATFORM, (cl_context_properties)(*platform)(), 0 };
cl::Context context(contextDevices, props);
cl::CommandQueue queue(context, device/*, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE*/);
std::vector<std::byte> input(10);
auto output = input;
cl::Buffer inputBuffer{ context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, input.size(), nullptr };
cl::Buffer outputBuffer{ context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, output.size(), nullptr };
cl::Program program(context, clSource, true);
cl::Kernel kernel(program, "copy");
kernel.setArg(0, inputBuffer);
kernel.setArg(1, outputBuffer);
for (int i = 0; i < 10; i++)
{
cl::Event event;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, { input.size() }, cl::NullRange, nullptr, &event);
//queue.enqueueReadBuffer(outputBuffer, CL_FALSE, 0, output.size(), output.data(), nullptr, &event);
std::atomic_bool flag{ false };
event.setCallback(CL_COMPLETE, [](cl_event, cl_int, void* user_data)
{
auto& flag = *((std::atomic_bool*)user_data);
flag = true;
flag.notify_one();
}, &flag);
flag.wait(false);
std::cout << "Completed loop " << i << std::endl;
}
std::cout << "The End!\n";
}