Buffer allocations in OpenCL can be deferred:
https://devtalk.nvidia.com/default/topic/493543/best-practice-for-memory-managment-in-opencl/
I think what you are seeing is expected behavior. If you want to witness an out of memory condition, you have to actually use those buffers.
Here’s a simple test case demonstrating this:
$ cat t6.cpp
#include <CL/opencl.h>
#include <stdio.h>
const int nblk = 256;
int main(int argc, char *argv[])
{
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_mem mem1[nblk];
cl_int err;
cl_command_queue queue1;
cl_event event1[nblk];
err = clGetPlatformIDs(1, &platform, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
size_t mem_size = 0x20000000; // 512MB
unsigned char pattern = 0;
int i = 0;
while ((i < nblk)&&(err == CL_SUCCESS)){
mem1[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size, NULL, &err);
#ifdef USE_FILL
if (i > 0)
err += clEnqueueFillBuffer(queue1, mem1[i], &pattern, 1, 0, mem_size, 1, event1+i-1, event1+i);
else
err += clEnqueueFillBuffer(queue1, mem1[i], &pattern, 1, 0, mem_size, 0, NULL, event1+i);
#endif
i++;}
if (err != CL_SUCCESS)
printf("ocl error: %d at iteration: %d\n", err, i);
else
printf("%d loops finished with no error\n", i);
}
$ g++ t6.cpp -I/usr/local/cuda/include -lOpenCL -o t6
$ ./t6
256 loops finished with no error
$ g++ t6.cpp -I/usr/local/cuda/include -lOpenCL -o t6 -DUSE_FILL
$ ./t6
ocl error: -4 at iteration: 63
$
This was run on a Tesla V100 32GB GPU, Linux, driver 410.48.
We see that if we don’t use the buffers, 256 loops complete successfully, where each buffer is 512MB in size.
If we actually use the buffers as we allocate them, the loop fails on the 63rd iteration (-4 = CL_MEM_OBJECT_ALLOCATION_FAILURE). This makes sense because 62 successful iterations at 512MB/iteration is 31GB, which is reasonable for the 32GB GPU.