CL_OUT_OF_RESSOURCES with the clEnqueueReadBuffer function

Hello,

I’m trying to get running a program in CL, but i have some weird errors.

I have a picture with a laser line in it. I have to get some points of that laser line to compute distances to the objects.

I’m giving an array of all the lines i need to check, the parameters, and i need to compute the distances, and i want to get back an array of the coordinates of the pixels choosen to represent the laser.

I’m not computing the distances yet because the call to the findLaserPoint function returns an error. Here is the opencl code :

(it transforms a pixel into the HLS mode and then i get the pixel that represents the laser in the line)

[codebox]typedef struct cameraParams

{

float h;

float d;

float rcp1;

float r0_1;

float rcp2;

float r0_2;

} cameraParams;

enum bgr

{

BLUE = 0,

GREEN,

RED

};

typedef struct size

{

int height;

int width;

} size;

typedef struct coord

{

int x;

int y;

} coord;

typedef struct uchar3

{

// With opencv : c1 = blue, c2 = green, c3 = red

// In RGB mode : c1 = red, c2 = green, c3 = blue

// In HLS mode : c1 = hue, c2 = lightness, c3 = saturation

unsigned char c1;

unsigned char c2;

unsigned char c3;

} uchar3;

typedef struct pixel

{

coord xy;

uchar3 value;

} pixel;

typedef struct line

{

pixel * pixels;

float angle;

} line;

coord getCoord(pixel pix)

{

return pix.xy;

}

uchar3 transformToHls(pixel pix)

{

uchar3 hls;

float blue = pix.value.c1 / 255.;

float green = pix.value.c2 / 255.;

float red = pix.value.c3 / 255.;

float vMax, vMin;

float h, l, s;

int maxColor;

if(blue > red)

{

    if(red > green)

    {

        vMax = blue;

        vMin = green;

        maxColor = BLUE;

    }

    else if(blue > green)

    {

        vMax = blue;

        vMin = red;

        maxColor = BLUE;

   }

    else

    {

        vMax = green;

        vMin = red;

        maxColor = GREEN;

    }

}

else if(blue > green)

{

    vMax = red;

    vMin = green;

    maxColor = RED;

}

else if(red > green)

{

    vMax = red;

    vMin = green;

    maxColor = RED;

}

else

{

    vMax = green;

    vMin = blue;

    maxColor = GREEN;

}

l = (vMin + vMax) / 2.;

if(vMin == vMax)

{

    h = 0;

    s = 0;

}

else if(maxColor == RED)

    h = (int)((60 * (green - blue) / (vMax - vMin) + 360.)) % 360;

else if(maxColor == GREEN)

    h = 60 * (blue - red) / (vMax - vMin) + 120;

else

    h = 60 * (red - green) / (vMax - vMin) + 240;

if(l > 0.5)

    s = (vMax - vMin) / (vMax + vMin);

else if(l <= 0.5)

    s = (vMax - vMin) / (2 - (vMax + vMin));

hls.c2 = (int)(l * 255.);

hls.c1 = (int)(h / (360. / 255.));

hls.c3 = (int)(s * 255.);

return hls;

}

bool checkPixel(unsigned char hue, unsigned char saturation)

{

if(hue > 30 && hue < 170)

    return false;

if(saturation < 170)

    return false;

return true;

}

coord findLaserPoint(line l)

{

coord point;

point.x = -1;

point.y = -1;

int index = -1;

uchar3 hls;

unsigned char lightness_prev = 0;

int nValidPixels = 0;

int before = 0;

for(int i = 0; i < 240 ; i++)

{

    hls = transformToHls(l.pixels[i]);

    if(!checkPixel(hls.c1, hls.c3))

        continue;

l.pixels[nValidPixels] = l.pixels[i];

    nValidPixels++;

if(point.x == -1)

    {

        point.x = l.pixels[i].xy.x;

        point.y = l.pixels[i].xy.y;

        lightness_prev = hls.c2;

    }

    else if(hls.c2 > lightness_prev)

    {

        point.x = l.pixels[i].xy.x;

        point.y = l.pixels[i].xy.y;

        lightness_prev = hls.c2;

        index = nValidPixels - 1;

    }

}

if(index >= 0)

{

    int laserMin = index;

    int laserMax = index;

    int start = max(0, index - 1);

if(index > 0)

        for(int i = start; i >= 0; i--)

        {

            int tx = abs((l.pixels[i].xy.x - l.pixels[i + 1].xy.x));

            int ty = abs((l.pixels[i].xy.y - l.pixels[i + 1].xy.y));

            if(tx == 1 || ty == 1)

                laserMin = i;

	        else

                break;

        }

start = min(nValidPixels - 1, index + 1);

if(index < nValidPixels - 1)

       for(int i = start; i < nValidPixels; i++)

        {

            int tx = abs((l.pixels[i].xy.x - l.pixels[i - 1].xy.x));

            int ty = abs((l.pixels[i].xy.y - l.pixels[i - 1].xy.y));

            if(tx == 1 || ty == 1)

                laserMax = i;

            else

                break;

                }

    int laserMid = (laserMax + laserMin) / 2;

    point.x = l.pixels[laserMid].xy.x;

    point.y = l.pixels[laserMid].xy.y;

}

return point;

}

__kernel void

analyse(__global line * lines,

    __global line * hlsPixels,

    const cameraParams params,

    __global coord * pixelCoords)

{

int nIndex = get_global_id(0);

pixelCoords[nIndex] = findLaserPoint(lines[nIndex]);

}[/codebox]

And now the code C++ (yeah i know it’s mainly C code) :

[codebox]#define __CL_ENABLE_EXCEPTIONS

#define __NO_STD_VECTOR

#define __NO_STD_STRING

#include <stdlib.h>

#include <CL/opencl.h>

#include “Image.h”

#include “Source.h”

#include “utils.h”

void randomInit(float * data, int size)

{

for (int i = 0; i < size; ++i)

    data[i] = rand() % 100 + 1 / (float)RAND_MAX;

}

void showErrors(int err)

{

switch(err)

{

    case CL_SUCCESS:

        std::cout << "OK" << std::endl;

        break;

case CL_INVALID_PROGRAM_EXECUTABLE:

        std::cout << "error : CL_INVALID_PROGRAM_EXECUTABLE" << std::endl;

        break;

case CL_INVALID_COMMAND_QUEUE:

        std::cout << "error : CL_INVALID_COMMAND_QUEUE" << std::endl;

        break;

case CL_INVALID_KERNEL:

        std::cout << "error : CL_INVALID_KERNEL" << std::endl;

        break;

case CL_INVALID_CONTEXT:

        std::cout << "error : CL_INVALID_CONTEXT" << std::endl;

        break;

case CL_INVALID_KERNEL_ARGS:

        std::cout << "error : CL_INVALID_KERNEL_ARGS" << std::endl;

        break;

case CL_INVALID_WORK_DIMENSION:

        std::cout << "error : CL_INVALID_WORK_DIMENSION" << std::endl;

        break;

case CL_INVALID_WORK_GROUP_SIZE:

        std::cout << "error : CL_INVALID_WORK_GROUP_SIZE" << std::endl;

        break;

case CL_INVALID_WORK_ITEM_SIZE:

        std::cout << "error : CL_INVALID_WORK_ITEM_SIZE" << std::endl;

        break;

case CL_INVALID_GLOBAL_OFFSET:

        std::cout << "error : CL_INVALID_GLOBAL_OFFSET" << std::endl;

        break;

case CL_OUT_OF_RESOURCES:

        std::cout << "error : CL_OUT_OF_RESOURCES" << std::endl;

        break;

case CL_MEM_OBJECT_ALLOCATION_FAILURE:

        std::cout << "error : CL_MEM_OBJECT_ALLOCATION_FAILURE" << std::endl;

        break;

case CL_INVALID_EVENT_WAIT_LIST:

        std::cout << "error : CL_INVALID_EVENT_WAIT_LIST" << std::endl;

        break;

case CL_OUT_OF_HOST_MEMORY:

        std::cout << "error : CL_OUT_OF_HOST_MEMORY" << std::endl;

        break;

case CL_INVALID_MEM_OBJECT:

        std::cout << "error : CL_INVALID_MEM_OBJECT" << std::endl;

case CL_INVALID_VALUE:

        std::cout << "error : CL_INVALID_VALUE" << std::endl;

default:

        std::cout << "error not found" << std::endl;

}

}

int main()

{

cl_int err;

const int nPoints = 1;

const float angle = (float)(60. * PI / 180.);

const cameraParams params = {3., -1.9, 0.00132331, -0.03630043, 0.00089212,

                             -0.01771631};

cv::VideoCapture cap(0);

cap.set(CV_CAP_PROP_FRAME_WIDTH, 640);

cap.set(CV_CAP_PROP_FRAME_HEIGHT, 480);

while(42)

{

    cv::Mat image;

    cap >> image;

    cv::Mat image_ = image.clone();

    cv::line(image_, cv::Point(image_.cols / 2, 0),

             cv::Point(image_.cols / 2, image_.rows - 1),

             cv::Scalar(255, 255, 255));

    cv::line(image_, cv::Point(0, image_.rows / 2),

             cv::Point(image_.cols - 1, image_.rows / 2),

             cv::Scalar(255, 255, 255));

    cv::namedWindow("test", 0);

    cv::imshow("test", image_);

    char c = cvWaitKey(40);

    if(c == 'q' || c == 27)

    {

        cap.release();

        break;

    }

    else if(c == 'c')

    {

        imwrite("img_test.png", image);

        Image * img = new Image(image.size().height, image.size().width,

                                image.channels());

        img->setData(image.data);

const size imgSize = {img->getHeight(), img->getWidth()};

const unsigned int cnBlockSize = sizeof(pixel) * img->getHeight() / 2;

        const unsigned int cnBlocks = nPoints;

        const unsigned int cnDimension = cnBlocks * cnBlockSize;

bool even = false;

        float a = 0.;

if(nPoints > 1)

        {

            if(!(nPoints % 2))

            {

                even = true;

                a = angle / nPoints;

            }

            else

                a = angle / (nPoints - 1);

        }

// Get platforms & device

        cl_platform_id clPlatform;

        cl_device_id clDeviceId;

        err = clGetPlatformIDs(1, &clPlatform, NULL);

        std::cout << "clGetPlatformIDs... ";

        showErrors(err);

        err = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDeviceId, NULL);

        std::cout << "clGetDeviceIDs... ";

        showErrors(err);

Source program(“test_tmp.cl”);

        const char * programSource = program.get();

// create OpenCL context

        cl_context hContext;

        hContext = clCreateContext(0, 1, &clDeviceId, NULL, NULL, &err);

        std::cout << "clCreateContext... ";

        showErrors(err);

// query all devices available to the context

        size_t nContextDescriptorSize;

        clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0,

                         &nContextDescriptorSize);

        cl_device_id * aDevices = (cl_device_id *)malloc(nContextDescriptorSize);

        clGetContextInfo(hContext, CL_CONTEXT_DEVICES,

                         nContextDescriptorSize, aDevices, 0);

// create a command queue for first device the context reported

        cl_command_queue hCmdQueue;

        hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, &err);

        std::cout << "clCreateCommandQueue... ";

        showErrors(err);

// create & compile program

        cl_program hProgram;

        //const char * s = programSource.get();

        hProgram = clCreateProgramWithSource(hContext, 1,

                                             (const char **) &programSource,

                                             0, &err);

        std::cout << "clCreateProgramWithSource... ";

        showErrors(err);

        err = clBuildProgram(hProgram, 0, NULL, NULL, NULL, NULL);

        std::cout << "clBuildProgram... ";

        showErrors(err);

// create kernel

        cl_kernel hKernel;

        hKernel = clCreateKernel(hProgram, "analyse", &err);

        std::cout << "clCreateKernel... ";

        showErrors(err);

// allocate host vectors

        line * lines = new line[cnBlocks];

        coord * pixelCoords = new coord[cnBlocks];

// initialize host memory

        for(int i = 0; i < nPoints / 2; i++)

        {

            int dist = (int)fabs(tan(a * (double)(i + 1.)) * img->getHeight() / 2.);

            lines[i].pixels = img->Bresenham(img->getWidth() / 2, 0,

                                             img->getWidth() / 2 - dist, 

                                             img->getHeight() / 2);

            lines[i].angle = a;

lines[nPoints - 1 - i].pixels = img->Bresenham(img->getWidth() / 2, 0,

                                         img->getWidth() / 2 + dist,

                                         img->getHeight() / 2);

            lines[nPoints - 1 - i].angle = a;

        }

        if(!even)

        {

            lines[nPoints / 2].pixels = img->Bresenham(img->getWidth() / 2, 0,

                                                       img->getWidth() / 2,

                                                       img->getHeight() / 2);

            lines[nPoints / 2].angle = a;

        }

// allocate device memory

        cl_mem hDeviceMemLines, hDeviceMemPixelCoords;

        hDeviceMemLines = clCreateBuffer(hContext, CL_MEM_READ_WRITE,

                                         cnBlocks * sizeof(line), lines,

                                         &err);

        std::cout << "hDeviceMemLines clCreateBuffer... ";

        showErrors(err);

        err = clEnqueueWriteBuffer(hCmdQueue, hDeviceMemLines, CL_TRUE, 0, 

                                   cnBlocks * sizeof(line), lines, 0,

                                   NULL, NULL);

        std::cout << "hDeviceMemLines clEnqueueWriteBuffer... ";

        showErrors(err);

hDeviceMemPixelCoords = clCreateBuffer(hContext, CL_MEM_READ_WRITE,

                                               cnBlocks * sizeof(coord), 0,

                                               &err);

        std::cout << "hDeviceMemPixelCoords clCreateBuffer... ";

        showErrors(err);

// setup parameter values

        err = clSetKernelArg(hKernel, 0, sizeof(cl_mem),

                             (void *)&hDeviceMemLines);

        std::cout << "hDeviceMemLines clSetKernelArg... ";

        showErrors(err);

        err = clSetKernelArg(hKernel, 1, sizeof(cameraParams),

                             (void *)&params);

        std::cout << "params clSetKernelArg... ";

        showErrors(err);

        err = clSetKernelArg(hKernel, 2, sizeof(cl_mem),

                             (void *)&hDeviceMemPixelCoords);

        std::cout << "hDeviceMemPixelCoords clSetKernelArg... ";

        showErrors(err);

// execute kernel

        err = clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, 0, &cnDimension,

                                     NULL, 0, 0, 0);

        std::cout << "clEnqueueNDRangeKernel... ";

        showErrors(err);

err = clEnqueueReadBuffer(hCmdQueue, hDeviceMemPixelCoords, CL_TRUE,

                                  0, cnBlocks * sizeof(coord),

                                  (void *)pixelCoords, NULL, NULL, NULL);

        std::cout << "hDeviceMemPixelCoords... ";

        showErrors(err);

delete lines;

        delete[] pixelCoords;

clReleaseMemObject(hDeviceMemLines);

        clReleaseMemObject(hDeviceMemPixelCoords);

    }

}

return 0;

}[/codebox]

For now i’m using opencl to get a picture. I checked if the lines were properly passed by making a copy of the lines and print it in the main and there was no problem (i.e all functions used return CL_SUCCESS).

I tryed to look everywhere, to use different way to implement this, but still i get this error :(

clGetPlatformIDs… OK

clGetDeviceIDs… OK

clCreateContext… OK

clCreateCommandQueue… OK

clCreateProgramWithSource… OK

clBuildProgram… OK

clCreateKernel… OK

hDeviceMemLines clCreateBuffer… OK

hDeviceMemLines clEnqueueWriteBuffer… OK

hDeviceMemPixelCoords clCreateBuffer… OK

hDeviceMemLines clSetKernelArg… OK

params clSetKernelArg… OK

hDeviceMemPixelCoords clSetKernelArg… OK

clEnqueueNDRangeKernel… OK

hDeviceMemPixelCoords clEnqueueReadBuffer… error : CL_OUT_OF_RESOURCES

The funny thing is that this function should not return this error…

I tryed to debug a bit the opencl code. When the findLaserPoint function is cut as

[codebox]coord findLaserPoint(line l)

{

coord point;

point.x = -1;

point.y = -1;

int index = -1;

uchar3 hls;

unsigned char lightness_prev = 0;

int nValidPixels = 0;

int before = 0;

for(int i = 0; i < 240 ; i++)

{

    hls = transformToHls(l.pixels[i]);

    if(!checkPixel(hls.c1, hls.c3))

        continue;

}

return points

}[/codebox]

everything works well.

I have a similar problem with my kernel here. Nobody has really answered, yet.
The only thing that helped me was to split my kernel into two or to simplify it (not really an option :D).

First try passing clEnqueueNDRangeKernel a concrete local workgroup size instead of NULL. Try 128 or whatever clGetKernelWorkGroupInfo(…CL_KERNEL_WORK_GROUP_SIZE…) gives you, or less.

Maybe you really do run out of resources. I would recommend replacing your uchar3 and similar constructs with standard uchar4 (or uint4), fix the code accordingly and save lots of computations using 4-wide SIMD vector instruction. I think the compiler won’t figure that out. However, that might still not help :(

Anyone with better ideas?

I tried to specify a local size (128 and 512, the max) but still it doesnt work and i’m getting the CL_INVALID_WORK_GROUP_SIZE at the clEnqueueNDRangeKernel. I read that the local size need to be a divisor of the argument just before so i had to put 64. No more problem with the work group size but i get again the CL_OU_OF_RESSOURCES just like before.

I’ll follow your advice and try to remove as much structures as i can and see if things go better.

I’ll post again when i’m done and i hope someone can help us with this weird thing ^^"

I’m finally done with this code. I didn’t have to simplify my kernel, just to remove the big functions.
I replace as well all my structures bt base types like unsigned char *, etc.

I have more parameters in my kernel and it’s a bit more complicated since i have to split all my strutures into easier types, but everything is working well now.

What i recommand :

  • avoid structures and try to use opencl types
  • avoid complicated functions (small function like add 2 vectors, return the biggest value from 2, etc, are ok)

In the end, its kinda hard to program in opencl in terms of structuration of the code. I would never program like this if it was a simple C program.

I hope this will help someone.

So, in the end we might assume that the compiler still needs some work and doesn’t handle some optimisations well yet? :)
I wonder how does it do register allocation and register spilling, since it really seems that many constructs fool it easily :(

Is there anyone who can tell us anything on the optimising compilers topic here?