Hi!
I’m trying to get my simple raytracer work in CL, but I can’t proceed, since even with this simple kernel, which finally compiles OK, I get CL_OUT_OF_RECOURCES when waiting on event from clEnqueueNDRangeKernel.
I tried to run it on worksize even 1x1, does not help. I have few guesses why, all of which might be easily totally incorrect, since I’m new to CL:
-
I use too many host CL memory objects (look at my __kernel void rayTrace function). I need all of them. Would packing them into a single one help? Why? :o
-
The kernel is too long, the register allocator doesn’t know what to do
-
Some other strange error?
-
Does “inline” or “static” attribude of a function actually change anything?
As you can see in my code, I even replaced parameters to my functions to use __global space (for example before I did: BVHNode node = bvhNodes[…], now I do: __global const BVHNode* node = &bvhNodes[…]) – I guess that won’t help anything, will it?. I guess I won’t be smarter than the optimising compiler. Btw, all the arrays are only like 15000 long, so in sum I don’t send more than few dozen megabytes. That cannot be the problem.
[codebox]#define EPSILON 1e-3f
#define MAX_STACK 24
#define WORKGROUP_SIZE 4*4
typedef struct _Ray {
float4 o;
float4 d;
int4 sign;
float4 invDir;
float minDist;
float maxDist;
} Ray;
typedef struct _Camera {
float4 orig;
float4 dir;
float4 up;
float4 right;
float4 dimFov;
} Camera;
typedef struct _KERNELPARAMS {
float4 camOrig;
float4 camDir;
float4 camUp;
float4 camRight;
float4 camDimFov;
uint maxBVHdepth;
} KERNELPARAMS;
typedef struct _Vertex {
float4 position; // 4B
float4 normal; // 4B
float4 texcoord12; // 4B
float4 texcoord34; // 4B
} Vertex;
typedef struct _AABB {
float4 corners[2];
} AABB;
typedef struct _Triangle {
AABB aabb; // 32 = 16+16
uint vertexAidx; // 4
uint vertexBidx; // 4
uint vertexCidx; // 4
uint padding; // 4
} Triangle;
typedef struct _TriangleHit {
float u;
float v;
float t;
int triIndex;
} TriangleHit;
typedef struct _BVHNODE {
AABB aabb; // 8B
uint firstTriangleIndex; // 4B
uint packedNumTrisDepthLeaf; // 4B
uint padding[2]; // 4B, total 48
} BVHNode;
typedef struct _Light {
float4 pos;
float4 color;
} Light;
static inline float4 AABBextents(const AABB* aabb) { return aabb->corners[1] - aabb->corners[0]; }
static inline float4 AABBcentre(const AABB* aabb) { return (aabb->corners[0] + aabb->corners[1]) * 0.5f; }
static inline void createRayFromPixel(const uint x, const uint y, __constant const KERNELPARAMS* kernelparams, Ray* ray) {
float4 dir, dx, dy, forward;
float tanCamFov = tan(kernelparams->camDimFov.z);
float mx = (float)((x - kernelparams->camDimFov.x * 0.5f) * (1.0f / kernelparams->camDimFov.y) * tanCamFov);
float my = (float)((y - kernelparams->camDimFov.y * 0.5f) * (1.0f / kernelparams->camDimFov.y) * tanCamFov);
dx = kernelparams->camRight * mx;
dy = kernelparams->camUp * my;
forward = normalize(cross(kernelparams->camUp, kernelparams->camRight));
dir = forward + dx2.0f + dy2.0f;
ray->o = kernelparams->camOrig;
ray->d = normalize(dir);
ray->minDist = 0.001f;
ray->maxDist = MAXFLOAT;
ray->invDir = native_divide((float4)(1.0f), ray->d);
ray->sign = signbit(ray->invDir) + (int4)(1,1,1,1);
}
static inline void makeShadowRay(const float4 pos, const float4 lightPos, Ray* ray) {
float4 dir = lightPos - pos;
float dd = length(dir);
ray->d = dir / dd;
ray->o = pos + ray->d * 0.001f;
ray->invDir = native_divide((float4)(1.0f), ray->d);
ray->sign = signbit(ray->invDir) + (int4)(1,1,1,1);
ray->minDist = 0.001f;
ray->maxDist = dd;
}
static inline void rayTriangle(const Ray* r, const float4 A, const float4 B, const float4 C, TriangleHit* th) {
float4 edge1 = B - A; // TODO - optimise
float4 edge2 = C - A;
float4 tvec = r->o - A;
float4 pvec = cross(r->d, edge2);
float det = dot(edge1, pvec);
th->t = r->maxDist;
//det = __fdividef(1.0f, det);
det = 1.0f / det;
th->u = dot(tvec, pvec) * det;
if (th->u < 0.0f || th->u > 1.0f)
return;// -1.0f;
float4 qvec = cross(tvec, edge1);
th->v = dot(r->d, qvec) * det;
if (th->v < 0.0f || (th->u + th->v) > 1.0f)
return;// -1.0f;
//return dot(edge2, qvec) * det;
th->t = dot(edge2, qvec) * det;
}
static inline bool getAABBRayIntersection(const Ray* r, __global const AABB* aabb, float* tIn) { // old, remove
float4 tmin, tmax;
tmin.x = (aabb->corners[r->sign.x][0] - r->o.x) * r->invDir.x;
tmax.x = (aabb->corners[1-r->sign.x][0] - r->o.x) * r->invDir.x;
tmin.y = (aabb->corners[r->sign.y][1] - r->o.y) * r->invDir.y;
tmax.y = (aabb->corners[1-r->sign.y][1] - r->o.y) * r->invDir.y;
if ( (tmin.x > tmax.y) || (tmin.y > tmax.x) )
return false;
if (tmin.y > tmin.x)
tmin.x = tmin.y;
if (tmax.y < tmax.x)
tmax.x = tmax.y;
tmin.z = (aabb->corners[r->sign.z][2] - r->o.z) * r->invDir.z;
tmax.z = (aabb->corners[1-r->sign.z][2] - r->o.z) * r->invDir.z;
if ( (tmin.x > tmax.z) || (tmin.z > tmax.x) )
return false;
if (tmin.x > tmin.x)
tmin.x = tmin.z;
if (tmax.z < tmax.x)
tmax.x = tmax.z;
*tIn=tmin.x;
return (tmin.x < MAXFLOAT) && (tmax.x > EPSILON);
}
static int intersectBox(const Ray* r, __global const AABB* aabb, float *tnear) {
// compute intersection of ray with all six bbox planes
//float4 invR = (float4)(1.0f,1.0f,1.0f,1.0f) / r_d;
float4 tbot = r->invDir * (aabb->corners[0] - r->o);
float4 ttop = r->invDir * (aabb->corners[1] - r->o);
// re-order intersections to find smallest and largest on each axis
float4 tmin = fmin(ttop, tbot);
float4 tmax = fmax(ttop, tbot);
// find the largest tmin and the smallest tmax
float largest_tmin = fmax(fmax(tmin.x, tmin.y), fmax(tmin.x, tmin.z));
float smallest_tmax = fmin(fmin(tmax.x, tmax.y), fmin(tmax.x, tmax.z));
*tnear = largest_tmin;
//*tfar = smallest_tmax;
return smallest_tmax > largest_tmin;
}
bool intersectAllTriangles(
const Ray* ray,
const uint numTris,
const uint firstTriIndex,
__global const Triangle* triangles,
__global const unsigned int* triangleIndices,
__global const Vertex* vertices,
TriangleHit* thOut)
{
TriangleHit minHit, th;
minHit.t = MAXFLOAT;
for(uint f = 0; f < numTris; f++) {
uint triIdx = triangleIndices[firstTriIndex+f];
__global const Vertex* A = &vertices[triangles[triIdx].vertexAidx];
__global const Vertex* B = &vertices[triangles[triIdx].vertexBidx];
__global const Vertex* C = &vertices[triangles[triIdx].vertexCidx];
rayTriangle(ray, A->position, B->position, C->position, &th);
if(th.t > 0.0f) { // TODO > ray.minDist?
if(th.t < minHit.t) {
minHit = th;
minHit.triIndex = triIdx;
}
}
}
if(minHit.t < MAXFLOAT) { // TODO < ray.maxDist
*thOut = minHit;
return true;
}
return false;
}
bool intersectAnyTriangles(
const Ray* ray,
const uint numTris,
const uint firstTriIndex,
__global const Triangle* triangles,
__global const unsigned int* triangleIndices,
__global const Vertex* vertices)
{
TriangleHit th;
for(uint f = 0; f < numTris; f++) {
uint triIdx = triangleIndices[firstTriIndex+f];
__global const Vertex* A = &vertices[triangles[triIdx].vertexAidx]; // optimise!!
__global const Vertex* B = &vertices[triangles[triIdx].vertexBidx]; // TODO - global!?
__global const Vertex* C = &vertices[triangles[triIdx].vertexCidx];
rayTriangle(ray, A->position, B->position, C->position, & th);
if(th.t > ray->minDist && th.t < ray->maxDist) {
return true;
}
}
return false;
}
inline void stackPush(__local uint* stack, uint* head, uint val) {
stack[++(*head)] = val;
}
inline uint stackPop(__local uint* stack, uint* head) {
return stack[(*head)--];
}
inline bool stackEmpty(__local uint* stack, uint* head, uint localId) {
return (*head) == localId * MAX_STACK;
}
inline void calcTriangleCoords(
const TriangleHit* th,
__global const Vertex* vertices,
__global const Triangle* triangles,
float4* P, float4* N, float2* TC)
{
__global const Vertex* A = &vertices[triangles[th->triIndex].vertexAidx];
__global const Vertex* B = &vertices[triangles[th->triIndex].vertexBidx];
__global const Vertex* C = &vertices[triangles[th->triIndex].vertexCidx];
*P = A->position * (1.0f - th->u - th->v) + B->position * th->u + C->position * th->v;
*N = fast_normalize(A->normal * (1.0f - th->u - th->v) + B->normal * th->u + C->normal * th->v);
*TC = A->texcoord12.xy * (1.0f - th->u - th->v) + B->texcoord12.xy * th->u + C->texcoord12.xy * th->v;
}
TriangleHit tracePrimaryRay(
Ray* ray,
const uint maxBVHdepth,
__global const Triangle* triangles,
__global const uint* triangleIndices,
__global const Vertex* vertices,
__global const BVHNode* bvhNodes)
{
__local uint stack[MAX_STACK * WORKGROUP_SIZE];
uint lidx = get_local_id(0); // TODO
uint lidy = get_local_id(1); // TODO
uint localId = lidx + get_local_size(0)*lidy;
uint stackHead = localId * MAX_STACK;
TriangleHit minTh, th;
minTh.t = MAXFLOAT;
stackPush(stack, &stackHead, 0);
while(!stackEmpty(stack, &stackHead, localId)) {
unsigned int nodeIndex = stackPop(stack, &stackHead);
__global const BVHNode* node = &bvhNodes[nodeIndex];
float hitDist = 0;;
bool hit = intersectBox(ray, &node->aabb, &hitDist);
if(!hit || hitDist > minTh.t)
continue;
uint leftRightLeaf = (node->packedNumTrisDepthLeaf) & 0x00000003;
if(leftRightLeaf != 0) {
uint depth = (node->packedNumTrisDepthLeaf >> 2) & 0x0000003F;
uint leftIndex = nodeIndex + 1;
uint rightIndex = nodeIndex + (1 << (maxBVHdepth - depth));
stackPush(stack, &stackHead, leftIndex);
stackPush(stack, &stackHead, rightIndex);
} else {
//minTh.t = hitDist;
uint numtris = (node->packedNumTrisDepthLeaf >> 8);
bool hitLeaf = intersectAllTriangles(ray, numtris, node->firstTriangleIndex, triangles, triangleIndices, vertices, &th);
if(hitLeaf) {
if(th.t < minTh.t) {
minTh = th;
}
}
}
} // while
return minTh;
}
bool traceShadowRay(
Ray* ray,
const uint maxBVHdepth,
__global const Triangle* triangles,
__global const uint* triangleIndices,
__global const Vertex* vertices,
__global const BVHNode* bvhNodes)
{
__local uint stack[MAX_STACK * WORKGROUP_SIZE];
uint lidx = get_local_id(0); // TODO
uint lidy = get_local_id(1); // TODO
uint localId = lidx + get_local_size(0)*lidy;
uint stackHead = localId * MAX_STACK;
stackPush(stack, &stackHead, 0);
while(!stackEmpty(stack, &stackHead, localId)) {
unsigned int nodeIndex = stackPop(stack, &stackHead);
__global const BVHNode* node = &bvhNodes[nodeIndex];
float hitDist = 0;;
bool hit = intersectBox(ray, &node->aabb, &hitDist);
if(!hit)
continue;
uint leftRightLeaf = (node->packedNumTrisDepthLeaf) & 0x00000003;
if(leftRightLeaf != 0) {
uint depth = (node->packedNumTrisDepthLeaf >> 2) & 0x0000003F;
uint leftIndex = nodeIndex + 1;
uint rightIndex = nodeIndex + (1 << (maxBVHdepth - depth));
stackPush(stack, &stackHead, leftIndex);
stackPush(stack, &stackHead, rightIndex);
} else {
uint numtris = (node->packedNumTrisDepthLeaf >> 8);
bool hitLeaf = intersectAnyTriangles(ray, numtris, node->firstTriangleIndex, triangles, triangleIndices, vertices);
if(hitLeaf) {
return true;
}
}
} // while
return false;
}
inline uint camMap2Drow(__constant const KERNELPARAMS* kernelparams, uint x, uint y) {
return y*kernelparams->camDimFov.x + x;
}
inline uint camMap2Dcol(__constant const KERNELPARAMS* kernelparams, uint x, uint y) {
return x*kernelparams->camDimFov.y + y;
}
__kernel void raytrace(
__constant const KERNELPARAMS* kernelparams,
__global const Triangle* triangles,
__global const uint* triangleIndices,
__global const Vertex* vertices,
__global const BVHNode* bvhNodes,
__global float4 * img)
{
unsigned int tidx = get_global_id(0);
unsigned int tidy = get_global_id(1);
float4 color = (float4)1;
Ray ray;
createRayFromPixel(tidx, tidy, kernelparams, &ray);
TriangleHit th = tracePrimaryRay(&ray, kernelparams->maxBVHdepth, triangles, triangleIndices, vertices, bvhNodes);
if(th.t > ray.minDist && th.t < ray.maxDist) {
float4 P, N;
float2 TC;
calcTriangleCoords(&th, vertices, triangles, &P, &N, &TC);
Ray shadowRay;
makeShadowRay(P, (float4)(10,1000,10,0), &shadowRay);
bool lightOccluded = traceShadowRay(&shadowRay, kernelparams->maxBVHdepth, triangles, triangleIndices, vertices, bvhNodes);
if(!lightOccluded) {
float d = dot(N, shadowRay.d);
if(d < 0.0f)
d = 0.0f;
color = (float4)d;
} else {
color.y = 1.0f;
}
}
img[camMap2Drow(kernelparams, tidx, tidy)] = color;
}
[/codebox]
And lastly - if I comment out the traceShadowRay() in the end of rayTrace() or simplify traceShadowRay() (like only doing nothing, or hitting the depth 0 AABB only), it starts working (sometimes) AND SOMETIMES IT HANGS the OS UP :-( That gave me the impression that my program is too long, which is kinda ridiculous to me :(
Geforce 8800GT, SDK 3.0beta1, Windows Server 2003 x64 R2 SP2, 196.21 drivers.
Thank you for any ideas on how to make this run!