I’m getting a CL_OUT_OF_RESOURCES error when I try to run this code:
// Host-side sequence: upload the rays, bind the kernel arguments, (launch the
// kernel), read the per-ray results back and release the two buffers.
// NOTE(review): `err` is reassigned by almost every call below and is only
// printed once, so most failures in this sequence would go unnoticed.

// Device buffer for the input rays: ray_count elements of ray_t.
cl_mem ocl_rays = clCreateBuffer (ocl_context, CL_MEM_READ_ONLY , sizeof(ray_t)*ray_count, NULL, &err);
// Blocking write (CL_TRUE) of local_rays into ocl_rays.
// NOTE(review): this overwrites the clCreateBuffer status before it is checked.
err = clEnqueueWriteBuffer(ocl_cmd_queue, ocl_rays, CL_TRUE, 0, sizeof(ray_t)*ray_count, local_rays, 0, NULL, NULL);
err=clFinish(ocl_cmd_queue);
// Device buffer for the output: one ocl_isect per ray.
cl_mem ocl_return = clCreateBuffer (ocl_context, CL_MEM_WRITE_ONLY, ray_count*sizeof(ocl_isect), NULL, &err);
// Bind the five kernel arguments. The statuses are OR-ed into `err`, which at
// this point still holds the status of the clCreateBuffer call above.
err |= clSetKernelArg(ocl_kernel, 0, sizeof(cl_mem), (void *)&triangles_gpu_mem);
err |= clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void *)&ocl_rays);
err |= clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void *)&ocl_return);
err |= clSetKernelArg(ocl_kernel, 3, sizeof(unsigned int), (void *)&triangles_count);
// NOTE(review): the kernel bounds-checks its index against this value, while
// both buffers are sized with ray_count — confirm original_count <= ray_count.
err |= clSetKernelArg(ocl_kernel, 4, sizeof(unsigned int), (void *)&original_count);
// NOTE(review): this assignment discards the OR-ed clSetKernelArg status.
err=clFinish(ocl_cmd_queue);
// The two launch variants that were tried: explicit local size, then NULL
// (implementation-chosen local size). Both are commented out in this listing.
//err = clEnqueueNDRangeKernel(ocl_cmd_queue, ocl_kernel, dim, NULL, dims,local_dims, 0, NULL, NULL);
//err = clEnqueueNDRangeKernel(ocl_cmd_queue, ocl_kernel, dim, NULL, dims,NULL, 0, NULL, NULL);
// Prints the most recent status; with a launch line enabled that is the
// enqueue status, otherwise it is the status of the clFinish above.
printf("%d\n",err);
// Wait for all queued work to complete.
err = clFinish(ocl_cmd_queue);
// Blocking read (CL_TRUE) of ray_count results from ocl_return into ris.
err=clEnqueueReadBuffer(ocl_cmd_queue, ocl_return, CL_TRUE, 0,sizeof(ocl_isect)*ray_count,ris, 0, 0, 0);
err=clFinish(ocl_cmd_queue);
// Release both device buffers created above.
err=clReleaseMemObject(ocl_return);
err=clReleaseMemObject(ocl_rays);
The error is returned by the clFinish() call just after the clEnqueueNDRangeKernel call…
I run this code with 100000 work items, arranged as a [100,100,10] global work size. I’ve tried setting the work-group (local) size to a NULL pointer (which should let the implementation choose the sizes), and then fixing it at [2,5,1].
This function:
// Query CL_KERNEL_WORK_GROUP_SIZE for ocl_kernel on ocl_device (the maximum
// work-group size usable for this kernel on that device); the value is
// written to local_work_size and the call status to err.
err= clGetKernelWorkGroupInfo(ocl_kernel,ocl_device,CL_KERNEL_WORK_GROUP_SIZE ,sizeof(size_t),(void *)&local_work_size,NULL);
returns a local_work_size of 192, so I think I’m well within the hardware/implementation limits…
I suppose the OpenCL code is OK, as I’ve tried running it with the AMD OpenCL CPU implementation and it works just fine…
Everything I found about the CL_OUT_OF_RESOURCES error points to
- sampler problems (I don’t use any — or am I missing something?)
- wrong/invalid work-group dimensions
Any idea???
PS:
the hardware is a Quadro FX 570M;
the kernel code is:
/*
 * Tests one ray against one triangle and, on a hit that is not farther than
 * the current best, records it in *ris.
 *
 * first_idx : value stored in ris->idx when the hit is accepted.
 * ris       : in/out record; ris->distance is read as the current upper bound
 *             on the hit distance and is overwritten on an accepted hit,
 *             together with idx, u and v. Left untouched on any rejection.
 * ray       : uses ray.o and ray.d.
 * triangle  : uses triangle.v0, triangle.e1 and triangle.e2.
 */
void isect_ray_triangle(unsigned int first_idx, ocl_isect *ris, ray_t ray, triangle_t triangle) {
    float4 p = cross(ray.d, triangle.e2);
    float determinant = dot(triangle.e1, p);

    /* Determinant inside (-EPSILON, EPSILON): reject. */
    if (determinant > -EPSILON && determinant < EPSILON) {
        return;
    }
    float inv_det = 1.0f / determinant;

    float4 origin_to_v0 = ray.o - triangle.v0;

    /* First coordinate must lie in [0, 1]. */
    float bary_u = dot(origin_to_v0, p) * inv_det;
    if (bary_u < 0.0f || bary_u > 1.0f) {
        return;
    }

    /* Second coordinate must be non-negative and u + v must not exceed 1. */
    float4 q = cross(origin_to_v0, triangle.e1);
    float bary_v = dot(ray.d, q) * inv_det;
    if (bary_v < 0.0f || (bary_u + bary_v) > 1.0f) {
        return;
    }

    /* Reject negative distances and hits beyond the current best. */
    float hit_distance = dot(triangle.e2, q) * inv_det;
    if (hit_distance < 0 || hit_distance > ris->distance) {
        return;
    }

    /* Accepted: this becomes the new best hit. */
    ris->distance = hit_distance;
    ris->idx = first_idx;
    ris->u = bary_u;
    ris->v = bary_v;
}
/*
 * Brute-force closest-hit kernel: each work-item handles one ray, tests it
 * against every triangle and writes the best hit to results[ray index].
 *
 * triangles  : tcount triangles.
 * rays       : at least rays_count rays.
 * results    : at least rays_count output records, one per ray.
 * tcount     : number of triangles to test.
 * rays_count : number of rays to process; work-items whose linear index is
 *              >= rays_count do nothing.
 *
 * The ray index is the row-major linearisation of the 3-D global id using the
 * actual NDRange extents, so any global size whose product covers rays_count
 * maps work-items onto the contiguous range [0, rays_count).
 */
__kernel void intersect (__global triangle_t *triangles,
                         __global ray_t *rays,
                         __global ocl_isect *results,
                         __const unsigned int tcount,
                         __const unsigned int rays_count
) {
    /* Use get_global_size() rather than a fixed stride: a hard-coded stride
     * that differs from the launch dimensions leaves gaps in the index range,
     * so some rays would never be processed. */
    const unsigned int idx = (unsigned int)(
        (get_global_id(2) * get_global_size(1) + get_global_id(1))
            * get_global_size(0)
        + get_global_id(0));

    /* Strict bound: idx == rays_count would be one element past the end of
     * both the rays and the results buffers. */
    if (idx >= rays_count) {
        return;
    }

    /* Start with a far distance bound and an all-ones index (presumably the
     * "no hit" sentinel); isect_ray_triangle only overwrites them when it
     * finds a hit no farther than the current distance. */
    ocl_isect r;
    r.distance = 999999.0f;
    r.idx = 0xFFFFFFFFu;

    ray_t ray = rays[idx];
    for (unsigned int i = 0; i < tcount; i++) {
        triangle_t t = triangles[i];
        isect_ray_triangle(i, &r, ray, t);
    }

    results[idx] = r;
}