CL_OUT_OF_RESOURCES

Hi!

I’m trying to get my simple raytracer work in CL, but I can’t proceed, since even with this simple kernel, which finally compiles OK, I get CL_OUT_OF_RECOURCES when waiting on event from clEnqueueNDRangeKernel.

I tried to run it on worksize even 1x1, does not help. I have few guesses why, all of which might be easily totally incorrect, since I’m new to CL:

  • I use too many host CL memory objects (look at my __kernel void rayTrace function). I need all of them. Would packing them into a single one help? Why? :o

  • The kernel is too long, the register allocator doesn’t know what to do

  • Some other strange error?

  • Does “inline” or “static” attribude of a function actually change anything?

As you can see in my code, I even replaced parameters to my functions to use __global space (for example before I did: BVHNode node = bvhNodes[…], now I do: __global const BVHNode* node = &bvhNodes[…]) – I guess that won’t help anything, will it?. I guess I won’t be smarter than the optimising compiler. Btw, all the arrays are only like 15000 long, so in sum I don’t send more than few dozen megabytes. That cannot be the problem.

[codebox]#define EPSILON 1e-3f

#define MAX_STACK 24

#define WORKGROUP_SIZE 4*4

typedef struct _Ray {

float4 o;

float4 d;

int4 sign;

float4 invDir;

float minDist;

float maxDist;

} Ray;

typedef struct _Camera {

float4 orig;

float4 dir;

float4 up;

float4 right;

float4 dimFov;

} Camera;

typedef struct _KERNELPARAMS {

float4 camOrig;

float4 camDir;

float4 camUp;

float4 camRight;

float4 camDimFov;

uint maxBVHdepth;

} KERNELPARAMS;

typedef struct _Vertex {

float4 position; // 4B

float4 normal; // 4B

float4 texcoord12; // 4B

float4 texcoord34; // 4B

} Vertex;

typedef struct _AABB {

float4 corners[2];

} AABB;

typedef struct _Triangle {

AABB aabb; // 32 = 16+16

uint vertexAidx; // 4

uint vertexBidx; // 4

uint vertexCidx; // 4

uint padding; // 4

} Triangle;

typedef struct _TriangleHit {

float u;

float v;

float t;

int triIndex;

} TriangleHit;

typedef struct _BVHNODE {

AABB aabb;									// 8B

uint firstTriangleIndex; // 4B

uint packedNumTrisDepthLeaf; // 4B

uint padding[2]; // 4B, total 48

} BVHNode;

typedef struct _Light {

float4 pos;

float4 color;

} Light;

static inline float4 AABBextents(const AABB* aabb) { return aabb->corners[1] - aabb->corners[0]; }

static inline float4 AABBcentre(const AABB* aabb) { return (aabb->corners[0] + aabb->corners[1]) * 0.5f; }

static inline void createRayFromPixel(const uint x, const uint y, __constant const KERNELPARAMS* kernelparams, Ray* ray) {

float4 dir, dx, dy, forward;

float tanCamFov = tan(kernelparams->camDimFov.z);

float mx = (float)((x - kernelparams->camDimFov.x * 0.5f) * (1.0f / kernelparams->camDimFov.y) * tanCamFov);

float my = (float)((y - kernelparams->camDimFov.y * 0.5f) * (1.0f / kernelparams->camDimFov.y) * tanCamFov);

dx = kernelparams->camRight * mx;

dy = kernelparams->camUp * my;

forward = normalize(cross(kernelparams->camUp, kernelparams->camRight));

dir = forward + dx2.0f + dy2.0f;

ray->o = kernelparams->camOrig;

ray->d = normalize(dir); 



ray->minDist = 0.001f;

ray->maxDist = MAXFLOAT;

ray->invDir = native_divide((float4)(1.0f), ray->d);	

ray->sign = signbit(ray->invDir) + (int4)(1,1,1,1);

}

static inline void makeShadowRay(const float4 pos, const float4 lightPos, Ray* ray) {

float4 dir = lightPos - pos;

float dd = length(dir);

ray->d = dir / dd;

ray->o = pos + ray->d * 0.001f;

ray->invDir = native_divide((float4)(1.0f), ray->d);	

ray->sign = signbit(ray->invDir) + (int4)(1,1,1,1);

ray->minDist = 0.001f;

ray->maxDist = dd;

}

static inline void rayTriangle(const Ray* r, const float4 A, const float4 B, const float4 C, TriangleHit* th) {

float4 edge1 = B - A;	// TODO - optimise

float4 edge2 = C - A;

float4 tvec = r->o - A;  

float4 pvec = cross(r->d, edge2);  

float  det  = dot(edge1, pvec); 

th->t = r->maxDist; 

//det = __fdividef(1.0f, det);  

det = 1.0f / det;

th->u = dot(tvec, pvec) * det;  

if (th->u < 0.0f || th->u > 1.0f)  

	return;// -1.0f;  

float4 qvec = cross(tvec, edge1);  

th->v = dot(r->d, qvec) * det;  

if (th->v < 0.0f || (th->u + th->v) > 1.0f)  

	return;// -1.0f;  

//return dot(edge2, qvec) * det;  

th->t = dot(edge2, qvec) * det;

}

static inline bool getAABBRayIntersection(const Ray* r, __global const AABB* aabb, float* tIn) { // old, remove

float4 tmin, tmax;

tmin.x = (aabb->corners[r->sign.x][0] - r->o.x) * r->invDir.x;

tmax.x = (aabb->corners[1-r->sign.x][0] - r->o.x) * r->invDir.x;

tmin.y = (aabb->corners[r->sign.y][1] - r->o.y) * r->invDir.y;

tmax.y = (aabb->corners[1-r->sign.y][1] - r->o.y) * r->invDir.y;

if ( (tmin.x > tmax.y) || (tmin.y > tmax.x) )

return false;

if (tmin.y > tmin.x)

tmin.x = tmin.y;

if (tmax.y < tmax.x)

tmax.x = tmax.y;

tmin.z = (aabb->corners[r->sign.z][2] - r->o.z) * r->invDir.z;

tmax.z = (aabb->corners[1-r->sign.z][2] - r->o.z) * r->invDir.z;

if ( (tmin.x > tmax.z) || (tmin.z > tmax.x) )

return false;

if (tmin.x > tmin.x)

tmin.x = tmin.z;

if (tmax.z < tmax.x)

tmax.x = tmax.z;

*tIn=tmin.x;

return (tmin.x < MAXFLOAT) && (tmax.x > EPSILON);

}

static int intersectBox(const Ray* r, __global const AABB* aabb, float *tnear) {

// compute intersection of ray with all six bbox planes

//float4 invR = (float4)(1.0f,1.0f,1.0f,1.0f) / r_d;

float4 tbot = r->invDir * (aabb->corners[0] - r->o);

float4 ttop = r->invDir * (aabb->corners[1] - r->o);

// re-order intersections to find smallest and largest on each axis

float4 tmin = fmin(ttop, tbot);

float4 tmax = fmax(ttop, tbot);

// find the largest tmin and the smallest tmax

float largest_tmin = fmax(fmax(tmin.x, tmin.y), fmax(tmin.x, tmin.z));

float smallest_tmax = fmin(fmin(tmax.x, tmax.y), fmin(tmax.x, tmax.z));

*tnear = largest_tmin;

//*tfar = smallest_tmax;

return smallest_tmax > largest_tmin;

}

bool intersectAllTriangles(

const Ray* ray,

const uint numTris,

const uint firstTriIndex, 

__global const Triangle* triangles, 

__global const unsigned int* triangleIndices, 

__global const Vertex* vertices, 

TriangleHit* thOut)

{

TriangleHit minHit, th;

minHit.t = MAXFLOAT;

for(uint f = 0; f < numTris; f++) {

uint triIdx = triangleIndices[firstTriIndex+f];

__global const Vertex* A = &vertices[triangles[triIdx].vertexAidx];

__global const Vertex* B = &vertices[triangles[triIdx].vertexBidx];

__global const Vertex* C = &vertices[triangles[triIdx].vertexCidx];

rayTriangle(ray, A->position, B->position, C->position, &th);

if(th.t > 0.0f) {	// TODO > ray.minDist?

  if(th.t < minHit.t) {

    minHit = th;

    minHit.triIndex = triIdx;

  }

}

}

if(minHit.t < MAXFLOAT) { // TODO < ray.maxDist

*thOut = minHit;

return true;

}

return false;

}

bool intersectAnyTriangles(

const Ray* ray,

const uint numTris,

const uint firstTriIndex, 

__global const Triangle* triangles, 

__global const unsigned int* triangleIndices, 

__global const Vertex* vertices)

{

TriangleHit th;

for(uint f = 0; f < numTris; f++) {

uint triIdx = triangleIndices[firstTriIndex+f];

__global const Vertex* A = &vertices[triangles[triIdx].vertexAidx];  // optimise!!

__global const Vertex* B = &vertices[triangles[triIdx].vertexBidx];	// TODO - global!?

__global const Vertex* C = &vertices[triangles[triIdx].vertexCidx];

rayTriangle(ray, A->position, B->position, C->position, & th);

if(th.t > ray->minDist && th.t < ray->maxDist) {

		return true;

}

}

return false;

}

inline void stackPush(__local uint* stack, uint* head, uint val) {

stack[++(*head)] = val;

}

inline uint stackPop(__local uint* stack, uint* head) {

return stack[(*head)--];

}

inline bool stackEmpty(__local uint* stack, uint* head, uint localId) {

return (*head) == localId * MAX_STACK;

}

inline void calcTriangleCoords(

const TriangleHit* th, 

__global const Vertex* vertices, 

__global const Triangle* triangles,

float4* P, float4* N, float2* TC) 

{

__global const Vertex* A = &vertices[triangles[th->triIndex].vertexAidx];

__global const Vertex* B = &vertices[triangles[th->triIndex].vertexBidx];

__global const Vertex* C = &vertices[triangles[th->triIndex].vertexCidx];

*P = A->position * (1.0f - th->u - th->v) + B->position * th->u + C->position * th->v;

*N = fast_normalize(A->normal * (1.0f - th->u - th->v) + B->normal * th->u + C->normal * th->v);

*TC = A->texcoord12.xy * (1.0f - th->u - th->v) + B->texcoord12.xy * th->u + C->texcoord12.xy * th->v;

}

TriangleHit tracePrimaryRay(

Ray* ray,

const uint maxBVHdepth,

__global const Triangle* triangles,

__global const uint* triangleIndices,

__global const Vertex* vertices,

__global const BVHNode* bvhNodes)			

{

__local uint stack[MAX_STACK * WORKGROUP_SIZE];

uint lidx = get_local_id(0); // TODO

uint lidy = get_local_id(1); // TODO

uint localId = lidx + get_local_size(0)*lidy;

uint stackHead = localId * MAX_STACK;

TriangleHit minTh, th;

minTh.t = MAXFLOAT;

stackPush(stack, &stackHead, 0);

while(!stackEmpty(stack, &stackHead, localId)) {

unsigned int nodeIndex = stackPop(stack, &stackHead);

__global const BVHNode* node = &bvhNodes[nodeIndex];

float hitDist = 0;;

bool hit = intersectBox(ray, &node->aabb, &hitDist);

if(!hit || hitDist > minTh.t)

  continue;

uint leftRightLeaf = (node->packedNumTrisDepthLeaf) & 0x00000003;

	if(leftRightLeaf != 0) {

		uint depth = (node->packedNumTrisDepthLeaf >> 2) & 0x0000003F;

		uint leftIndex = nodeIndex + 1;						

		uint rightIndex = nodeIndex + (1 << (maxBVHdepth - depth));

		stackPush(stack, &stackHead, leftIndex);

		stackPush(stack, &stackHead, rightIndex);

	} else {

		//minTh.t = hitDist;

		uint numtris = (node->packedNumTrisDepthLeaf >> 8);

		bool hitLeaf = intersectAllTriangles(ray, numtris, node->firstTriangleIndex, triangles, triangleIndices, vertices, &th);

  if(hitLeaf) {

    if(th.t < minTh.t) {

      minTh = th;

    }

  }

	}

} // while

return minTh;

}

bool traceShadowRay(

Ray* ray,

const uint maxBVHdepth,

__global const Triangle* triangles,

__global const uint* triangleIndices,

__global const Vertex* vertices,

__global const BVHNode* bvhNodes)			

{

__local uint stack[MAX_STACK * WORKGROUP_SIZE];

uint lidx = get_local_id(0); // TODO

uint lidy = get_local_id(1); // TODO

uint localId = lidx + get_local_size(0)*lidy;

uint stackHead = localId * MAX_STACK;

stackPush(stack, &stackHead, 0);

while(!stackEmpty(stack, &stackHead, localId)) {

unsigned int nodeIndex = stackPop(stack, &stackHead);

__global const BVHNode* node = &bvhNodes[nodeIndex];

float hitDist = 0;;

bool hit = intersectBox(ray, &node->aabb, &hitDist);

if(!hit)

  continue;

uint leftRightLeaf = (node->packedNumTrisDepthLeaf) & 0x00000003;

	if(leftRightLeaf != 0) {

		uint depth = (node->packedNumTrisDepthLeaf >> 2) & 0x0000003F;

		uint leftIndex = nodeIndex + 1;						

		uint rightIndex = nodeIndex + (1 << (maxBVHdepth - depth));

		stackPush(stack, &stackHead, leftIndex);

		stackPush(stack, &stackHead, rightIndex);

	} else {

		uint numtris = (node->packedNumTrisDepthLeaf >> 8);

		bool hitLeaf = intersectAnyTriangles(ray, numtris, node->firstTriangleIndex, triangles, triangleIndices, vertices);

  if(hitLeaf) {

    return true;

  }

	}

} // while

return false;

}

inline uint camMap2Drow(__constant const KERNELPARAMS* kernelparams, uint x, uint y) {

return y*kernelparams->camDimFov.x + x;

}

inline uint camMap2Dcol(__constant const KERNELPARAMS* kernelparams, uint x, uint y) {

return x*kernelparams->camDimFov.y + y;

}

__kernel void raytrace(

__constant const KERNELPARAMS* kernelparams,

__global const Triangle* triangles,

__global const uint* triangleIndices,

__global const Vertex* vertices,

__global const BVHNode* bvhNodes,

__global float4 * img) 

{

unsigned int tidx = get_global_id(0);

unsigned int tidy = get_global_id(1);

	float4 color = (float4)1;

Ray ray;

createRayFromPixel(tidx, tidy, kernelparams, &ray);

	

TriangleHit th = tracePrimaryRay(&ray, kernelparams->maxBVHdepth, triangles, triangleIndices, vertices, bvhNodes);

if(th.t > ray.minDist && th.t < ray.maxDist) {

		float4 P, N;

		float2 TC;

		calcTriangleCoords(&th, vertices, triangles, &P, &N, &TC);

		Ray shadowRay;

		makeShadowRay(P, (float4)(10,1000,10,0), &shadowRay);

		

		bool lightOccluded = traceShadowRay(&shadowRay, kernelparams->maxBVHdepth, triangles, triangleIndices, vertices, bvhNodes);

		

		if(!lightOccluded) {

			float d = dot(N, shadowRay.d);

			if(d < 0.0f)

				d = 0.0f;

			color = (float4)d;

		} else {

			color.y = 1.0f;

		}

}

	img[camMap2Drow(kernelparams, tidx, tidy)] = color;

}

[/codebox]

And lastly - if I comment out the traceShadowRay() in the end of rayTrace() or simplify traceShadowRay() (like only doing nothing, or hitting the depth 0 AABB only), it starts working (sometimes) AND SOMETIMES IT HANGS the OS UP :-( That gave me the impression that my program is too long, which is kinda ridiculous to me :(

Geforce 8800GT, SDK 3.0beta1, Windows Server 2003 x64 R2 SP2, 196.21 drivers.

Thank you for any ideas on how to make this run!

Here’s what clGetKernelWorkGroupInfo() returns after building the kernel:

CL_KERNEL_COMPILE_WORK_GROUP_SIZE= 0 x 0 x 0
CL_KERNEL_LOCAL_MEM_SIZE= 3108
CL_KERNEL_WORK_GROUP_SIZE= 192

EDIT: After commenting out the shadow tracing at the end, I get:

CL_KERNEL_LOCAL_MEM_SIZE= 1572
CL_KERNEL_WORK_GROUP_SIZE= 192
and it works, as I announced. So really a problem with the number of registers? How to get around? If I allocate more __local memory than 16kB, I get completely different errors (and BUILD fails), so that’s not the case. Here it dropped because there’s another stack in the shadow tracing.

CL_OUT_OF_RECOURCES may happen even your register/shared memory are well within the device limit. See my post at http://forums.nvidia.com/index.php?showtopic=160743

When that happens, it is basically the OpenCL version of the “kernel launch timed out” in CUDA. In fact, it is THE most frequently encountered error when running CUDA/OpenCL for some serious calculations on a non-dedicated GPU. You can find more details at here:

http://mcx.sourceforge.net/cgi-bin/index.c…or_what_is_that

I want to add that this type of error does not happen for ATI’s hardware when running OpenCL. This driver watchdog time limit appeared to me an nVidia specific thing (at least for Linux).

The watchdog should be an OS level thing for any video adapter.

I ran my OpenCL kernel on a ATI card (used for both display and computing) for over 30 seconds (Ubuntu 9.10), it was not killed and gave me the correct results. I don’t know what magic ATI used in their driver.

Thanks guys :( But, how should I overcome this stuff? :-O
I can’t understand why a 5 sec watchdog would do any bad here, since my calculation doesn’t take more than few 100ms on a single core CPU!!! :-O