clEnqueueNDRangeKernel throws CL_OUT_OF_RESOURCES

I have a kernel running very well on Intel HD graphics card. But, when I want to run the kernel on my GeForce 960 it gives the CL_OUT_OF_RESOURCES error.

I have tried for different local sizes and made sure to not go beyond the array indices, but still have no clue why this error is happening. Do you know why my code runs fine on Intel and doesn’t work on NVIDIA?

One weird thing that is happening in my code is that I have a 13 itrations of similar operations. For performance purposes, I have repeated the same operations for 13 times and avoided writing a loop just to save some additional operations that loops have. The code works on NVIDIA when I reach to the 11th operation. But, when I include the 12th operation in the code it gives the above error and the 11th and 12th operations are similar! Any ideas why such thing is happening?

I have attached the c-code and kernel codes in two separate text files. The code reads the coords array, passes it to the kernel. The correct result is an array of length 16 in which all the elements are equal to 1 expect the last one which is zero.

Here is the kernel code which is stored in a separate .cl file:

float2 projectCube(float3 axis, float3 vertex){
	
	float voxelSize = 0.5f;
	float2 projection = (float2)(0.0f, 0.0f);

	float temp;
	//1
	temp = axis.x;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }

	//2
	temp = axis.x + axis.y;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }
	
	//3
	temp = axis.y;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }
	
	//4
	temp = axis.z;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }
	
	//5
	temp = axis.x + axis.z;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }
	
	//6
	temp = axis.y + axis.z;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }
	
	//7
	temp = axis.x + axis.y + axis.z;
	if (projection.x > temp){ projection.x = temp; }
	else if (projection.y < temp){ projection.y = temp; }

	float product = dot(axis, vertex);
	projection.x = voxelSize * projection.x + product;
	projection.y = voxelSize * projection.y + product;
	return projection;
}

float2 projectTriangle(float3 axis, float3 v0, float3 v1, float3 v2){
	
	float2 projection;
	projection.x = dot(axis, v0);
	projection.y = projection.x;

	float temp = dot(axis, v1);
	if(projection.x > temp){
		projection.x = temp;
	}
	else if(projection.y < temp){
		projection.y = temp;
	}
	temp = dot(axis, v2);
	if (projection.x > temp){
		projection.x = temp;
	}
	else if (projection.y < temp){
		projection.y = temp;
	}
	return projection;
}

float tester(float3 axis, float3 voxel, float3 v0, float3 v1, float3 v2){
	
	float2 voxelProjection = projectCube(axis, voxel);
	float2 faceProjection = projectTriangle(axis, v0, v1, v2);
	float minProjection = fmin(voxelProjection.x, faceProjection.x);
	float maxProjection = fmax(voxelProjection.y, faceProjection.y);
	float testResult = maxProjection - minProjection - voxelProjection.y + voxelProjection.x
		- faceProjection.y + faceProjection.x;
	return testResult;
}

__kernel void voxelizer(int global_size,
						float h_voxelSize,
						__global float* h_minBoundsGrid,
						__global int *h_dimGrid,
						__global float* coords,
						__global int* density)
{

	//printf("local size is: %d\n", get_num_groups(0));
	int i = get_global_id(0) * 9;
	if (i <= global_size * 9){
		
		float voxelSize = h_voxelSize;
		float3 minBoundsGrid;
		minBoundsGrid.x = h_minBoundsGrid[0];
		minBoundsGrid.y = h_minBoundsGrid[1];
		minBoundsGrid.z = h_minBoundsGrid[2];
		int3 dimGrid;
		dimGrid.x = h_dimGrid[0];
		dimGrid.y = h_dimGrid[1];
		dimGrid.z = h_dimGrid[2];

		if ( i %9 == 0){
			
			/*Triangle vertices*/
			float3 v0;
			v0 = (float3)(coords[i], coords[i + 1], coords[i + 2]);
			float3 v1;
			v1 = (float3)(coords[i + 3], coords[i + 4], coords[i + 5]);
			float3 v2;
			v2 = (float3)(coords[i + 6], coords[i + 7], coords[i + 8]);
			//printf("i = %d. v0: %f, %f, %f\n", i, v0.x, v0.y, v0.z);
			//printf("i = %d. v1: %f, %f, %f\n", i, v1.x, v1.y, v1.z);
			//printf("i = %d. v2: %f, %f, %f\n", i, v2.x, v2.y, v2.z);
			
			/*Normal vectors of the each voxel*/
			float3 e0;
			e0 = (float3)(0.5f, 0.0f, 0.0f);
			float3 e1;
			e1 = (float3)(0.0f, 0.5f, 0.0f);
			float3 e2;
			e2 = (float3)(0.0f, 0.0f, 0.5f);

			/*Edges of a traingle*/
			float3 f0;
			f0 = v1 - v0;
			float3 f1;
			f1 = v2 - v1;
			float3 f2;
			f2 = v0 - v2;

			float3 minLocalGrid;
			minLocalGrid.x = fmin(v0.x, fmin(v1.x, v2.x));			
			minLocalGrid.y = fmin(v0.y, fmin(v1.y, v2.y));			
			minLocalGrid.z = fmin(v0.z, fmin(v1.z, v2.z));			
			minLocalGrid.x = voxelSize * floor(minLocalGrid.x / voxelSize);
			minLocalGrid.y = voxelSize * floor(minLocalGrid.y / voxelSize);
			minLocalGrid.z = voxelSize * floor(minLocalGrid.z / voxelSize);
			//printf("i = %d. minLocalGrid = %f, %f, %f.\n", i, minLocalGrid.x, minLocalGrid.y, minLocalGrid.z);

			float3 maxLocalGrid;
			maxLocalGrid.x = fmax(v0.x, fmax(v1.x, v2.x));
			maxLocalGrid.y = fmax(v0.y, fmax(v1.y, v2.y));
			maxLocalGrid.z = fmax(v0.z, fmax(v1.z, v2.z));
			maxLocalGrid.x = voxelSize * ceil(maxLocalGrid.x / voxelSize);
			maxLocalGrid.y = voxelSize * ceil(maxLocalGrid.y / voxelSize);
			maxLocalGrid.z = voxelSize * ceil(maxLocalGrid.z / voxelSize);
			if (maxLocalGrid.x == minLocalGrid.x){ maxLocalGrid.x += voxelSize; }
			if (maxLocalGrid.y == minLocalGrid.y){ maxLocalGrid.y += voxelSize; }
			if (maxLocalGrid.z == minLocalGrid.z){ maxLocalGrid.z += voxelSize; }
			//printf("i = %d. maxLocalGrid = %f, %f, %f.\n", i, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);

			//printf("i = %d\n v0 = %f, %f, %f\n v1 = %f, %f, %f\n v2 = %f, %f, %f\n minLocalGrid = %f, %f, %f\n===============\n",
			//	i, v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);

			float j = minLocalGrid.z;
			while(j < maxLocalGrid.z){
				float k = minLocalGrid.y;
				while(k < maxLocalGrid.y){
					float l = minLocalGrid.x;
					while (l < maxLocalGrid.x){

						float3 firstVertexOfVoxel = (float3)(l, k, j);
						//printf("l,k,j: %f, %f, %f\n", l, k, j);
						float3 globalCoordOffset = (firstVertexOfVoxel - minBoundsGrid) / voxelSize;
						int3 globalDimOffset = convert_int3_rtz(globalCoordOffset);
						//printf("i = %d. globalCoordOffset: %f, %f, %f\n", i, globalCoordOffset.x, globalCoordOffset.y, globalCoordOffset.z);
						//printf("i = %d. globalDimOffset: %d, %d, %d\n", i, globalDimOffset.x, globalDimOffset.y, globalDimOffset.z);

						int voxelIndexGlobalGrid = globalDimOffset.x + dimGrid.x * (globalDimOffset.y +
							dimGrid.y * globalDimOffset.z);
						//printf("i = %d. voxelIndexGlobalGrid = %d\n", i, voxelIndexGlobalGrid);

						if (density[voxelIndexGlobalGrid] != 1){

							/*The famous 13-axes test*/
							float3 axis;
							float testResult = 0;
							int overlapCount = 0;

							//1
							testResult = tester(e0, firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//2
							testResult = tester(e1, firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//3
							testResult = tester(e2, firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//4
							//axis = ;
							testResult = tester(cross(-f2, f0), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//5
							/*axis = cross(e0, f0);*/
							testResult = tester(cross(e0, f0), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//6
							//axis = cross(e0, f0);
							testResult = tester(cross(e0, f1), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//7
							//axis = cross(e0, f0);
							testResult = tester(cross(e0, f2), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//8
							//axis = cross(e1, f0);
							testResult = tester(cross(e1, f0), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//9
							//axis = cross(e1, f1);
							testResult = tester(cross(e1, f1), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//10
							//axis = cross(e1, f2);
							testResult = tester(cross(e1, f2), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//11
							//axis = cross(e2, f0);
							testResult = tester(cross(e2, f0), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//12
							//axis = cross(e2, f1);
							testResult = tester(cross(e2, f1), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}

							//13
							//axis = cross(e2, f2);
							testResult = tester(cross(e2, f2), firstVertexOfVoxel, v0, v1, v2);
							if (testResult <= 0){

								overlapCount++;
							}
							if (overlapCount == 13){ 
							
								density[voxelIndexGlobalGrid] = 1;
							}
							
						}
					l = l + voxelSize;
					}// while for l
					k = k + voxelSize;
				}// while for k
				j = j + voxelSize;
			}//while for j

			//printf("Here are the max of the %d-th face: %f, %f, %f\n", i / 9, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
			//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e1.x, e1.y, e1.z);
			//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e2.x, e2.y, e2.z);

			//printf("\n==================KERNEL COMPUTED==================\n");
			//barrier(CLK_LOCAL_MEM_FENCE);

		}
		
	}
}

And the c-code is here:

#define DEVICE_SELECTOR 1 //0 for Intel and 1 for Nvidia in my computer
#define _CRT_SECURE_NO_WARNINGS
#define KERNEL_FILE "..\\voxelizerKernel.cl"
#define WORK_DIM 1

#define VOXEL_SIZE 0.5f
#define HALF_VOXEL_SIZE VOXEL_SIZE/2.0f;

//C header files
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <algorithm>
//OpenCL header files
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

cl_device_id create_device() {

	cl_platform_id *platform;
	cl_device_id dev;
	cl_uint num_platform;
	int err;

	/* Identify a platform */
	err = clGetPlatformIDs(0, NULL, &num_platform);
	if (err < 0) {
		printf("Error code: %d. Couldn't identify a platform\n", err);
		exit(1);
	}
	platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
	clGetPlatformIDs(num_platform, platform, NULL);
	/* Access a device */
	err = clGetDeviceIDs(platform[DEVICE_SELECTOR], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);

	if (err < 0) {
		printf("Error code: %d. Couldn't access any devices\n", err);
		exit(1);
	}

	return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

	cl_program program;
	FILE *program_handle;
	char *program_buffer, *program_log;
	size_t program_size, log_size;
	int err;

	/* Read program file and place content into buffer */
	program_handle = fopen(filename, "r");
	if (program_handle == NULL) {
		printf("Couldn't find the program file\n");
		exit(1);
	}
	fseek(program_handle, 0, SEEK_END);
	program_size = ftell(program_handle);
	rewind(program_handle);
	program_buffer = (char*)malloc(program_size + 1);
	program_buffer[program_size] = '\0';
	fread(program_buffer, sizeof(char), program_size, program_handle);
	fclose(program_handle);

	/* Create program from file */
	program = clCreateProgramWithSource(ctx, 1,
		(const char**)&program_buffer, &program_size, &err);
	if (err < 0) {
		printf("Error code: %d. Couldn't create the program\n", err);
		exit(1);
	}
	free(program_buffer);

	/* Build program */
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err < 0) {

		/* Find size of log and print to std output */
		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
			0, NULL, &log_size);
		program_log = (char*)malloc(log_size + 1);
		program_log[log_size] = '\0';
		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
			log_size + 1, program_log, NULL);
		printf("%s\n", program_log);
		free(program_log);
		exit(1);
	}

	return program;
}
void print_device_info(cl_device_id dev){

	cl_ulong glob_mem_size, local_mem_size;
	cl_uint clock_freq, num_core, work_item_dim, time_res;
	size_t local_size, work_item_size[3];
	char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];

	clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
	clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);

	printf("==========================================================\n");
	printf("Device Sepc without consideration of kernels:\n");
	printf("CL_DEVICE_VENDOR:                     %s\n", dev_vendor);
	printf("CL_DEVICE_NAME:                       %s\n", dev_name);
	printf("CL_DEVICE_GLOBAL_MEM_SIZE:            %I64u GB\n", glob_mem_size / 1073741824);
	printf("CL_DEVICE_LOCAL_MEM_SIZE:             %I64u KB\n", local_mem_size / 1024);
	printf("CL_DRIVER_VERSION:                    %s\n", driver_version);
	printf("CL_DEVICE_VERSION:                    %s\n", device_version);
	printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:        %I32u MHz\n", clock_freq);
	printf("CL_DEVICE_MAX_COMPUTE_UNITS:          %I32u\n", num_core);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE         %u\n", local_size);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:        {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:   %I32u\n", work_item_dim);
	printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
	printf("==========================================================\n");

}

int main()
{

	/*OpenCL variables*/
	cl_int i, j, err, num_groups;
	size_t local_size, max_local_size, global_size, processed_global_size;
	
	cl_context context;
	cl_command_queue queue;

	cl_program program;
	cl_device_id device;
	cl_kernel voxelization_kernel, reduction_kernel, reduction_complete_kernel;
	
	cl_mem coords_buffer, density_buffer, dimGrid_buffer, h_minBoundsGrid_buffer, fullVxelsCount_buffer, group_sums_buffer;
	void *density_mapped_memory;
	
	cl_event prof_event;
	cl_ulong time_start, time_end, total_time;

	float h_voxelSize = VOXEL_SIZE;

	float fullVxelsCount = 0;

	/*Read mesh data*/
	float coords[54] =
	{ 0.300500,
	1.300000,
	0.000500,
	1.200500,
	1.600000,
	0.000500,
	1.600500,
	0.600000,
	0.000500,
	0.300500,
	1.300000,
	0.000500,
	0.500500,
	1.900000,
	0.000500,
	1.200500,
	1.600000,
	0.000500,
	0.300500,
	1.300000,
	0.000500,
	1.600500,
	0.600000,
	0.000500,
	0.100500,
	0.700000,
	0.000500,
	0.100500,
	0.700000,
	0.000500,
	1.600500,
	0.600000,
	0.000500,
	0.000500,
	0.200000,
	0.000500,
	0.000500,
	0.200000,
	0.000500,
	1.600500,
	0.600000,
	0.000500,
	1.600500,
	0.100000,
	0.000500,
	1.200500,
	1.600000,
	0.000500,
	1.600500,
	1.300000,
	0.000500,
	1.600500,
	0.600000,
	0.000500 };

	/*Get the voxel count*/
	float boundsGrid[6] = {0,2,0,2,0,0.5};

	int dimGrid[3] = {
		(boundsGrid[1] - boundsGrid[0]) / VOXEL_SIZE,
		(boundsGrid[3] - boundsGrid[2]) / VOXEL_SIZE,
		(boundsGrid[5] - boundsGrid[4]) / VOXEL_SIZE
	};
	if (dimGrid[0] == 0) dimGrid[0] = 1;
	if (dimGrid[1] == 0) dimGrid[1] = 1;
	if (dimGrid[2] == 0) dimGrid[2] = 1;
	float h_minBoundsGrid[3];
	h_minBoundsGrid[0] = boundsGrid[0];
	h_minBoundsGrid[1] = boundsGrid[2];
	h_minBoundsGrid[2] = boundsGrid[4];
	int voxelCounts = dimGrid[0] * dimGrid[1] * dimGrid[2];

	/*Prepare kernel output : build an array for storing voxles' density info*/
	int *density = (int*)malloc(sizeof(int)*voxelCounts);
	for (int i = 0; i < voxelCounts; i++){
		density[i] = 0;
	}

	/*OpenCL essentials*/
	device = create_device();
	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
	//print_device_info(device);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
	if (err < 0) {
		printf("Error code: %d. Couldn't create a context\n", err);
		exit(1);
	}
	program = build_program(context, device, KERNEL_FILE);
	queue = clCreateCommandQueue(context, device,
		CL_QUEUE_PROFILING_ENABLE, &err);
	if (err < 0) {
		printf("Error code: %d. Couldn't create a command queue\n", err);
		exit(1);
	};
	voxelization_kernel = clCreateKernel(program, "voxelizer", &err);
	if (err < 0) {
		printf("Error code: %d. Couldn't create a kernel\n", err);
		exit(1);
	};

	int numberOfFaces = 6;
	global_size = numberOfFaces;
	local_size = max_local_size;
	if (global_size % local_size != 0){
		processed_global_size = (global_size / local_size + 1) * local_size;
		//int padding = processed_global_size - global_size;
		//int *working_data = (int*)malloc((voxelCounts + padding)*sizeof(int));
		//memcpy(working_data, density, voxelCounts);
		//memset(working_data + voxelCounts, 0.0, padding);
	}
	else{
		processed_global_size = global_size;
	}

	/* Create host-device data exchange interface*/
	dimGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
		CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, dimGrid, &err);
	h_minBoundsGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
		CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, h_minBoundsGrid, &err);
	coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
		CL_MEM_COPY_HOST_PTR, sizeof(coords) * numberOfFaces * 9, coords, &err);
	density_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY |
		CL_MEM_COPY_HOST_PTR, sizeof(density) * voxelCounts, density, &err);
	if (err < 0) {
		printf("Error code: %d. Couldn't create a buffer\n", err);
		exit(1);
	};

	err = clSetKernelArg(voxelization_kernel, 0, sizeof(global_size), &global_size);
	err |= clSetKernelArg(voxelization_kernel, 1, sizeof(h_voxelSize), &h_voxelSize);
	err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &h_minBoundsGrid_buffer);
	err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dimGrid_buffer);
	err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
	err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
	if (err < 0) {
		printf("Error code: %d. Couldn't create an argument for voxelization_kernel\n", err);
		exit(1);
	}

	/* Do the voxelization magic */
	err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &processed_global_size,
		&local_size, 0, NULL, &prof_event);
	if (err < 0) {
		printf("Error code: %d. Couldn't enqueue the voxelization_kernel\n", err);
		exit(1);
	}

	/* Read the results */
	density_mapped_memory = clEnqueueMapBuffer(queue, density_buffer, CL_TRUE,
		CL_MAP_READ, 0, sizeof(density), 0, NULL, NULL, &err);
	if (err < 0) {
		printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
		exit(1);
	}
	memcpy(density, density_mapped_memory, sizeof(density)* voxelCounts);
	err = clEnqueueUnmapMemObject(queue, density_buffer, density_mapped_memory,
		0, NULL, NULL);
	if (err < 0) {
		printf("Error code: %d. Couldn't unmap the density_buffer\n", err);
		exit(1);
	}
	for (int i = 0; i < voxelCounts; i++){
		printf("%d\n", density[i]);
	}
	

	/*Clean up*/
	clReleaseKernel(voxelization_kernel);
	clReleaseMemObject(dimGrid_buffer);
	clReleaseMemObject(h_minBoundsGrid_buffer);
	clReleaseMemObject(coords_buffer);
	clReleaseMemObject(density_buffer);
	clReleaseCommandQueue(queue);
	clReleaseProgram(program);
	clReleaseContext(context);

	return 0;
}

Can you shrink the problem down to a short, complete code that someone else could compile and run? It might be a registers/thread (workitem) issue. One of the ways to empirically test for a registers/thread issue is to run your code with fewer work items per workgroup (say, cut it in half) to see if the previously failing case now works. That would be an additional datapoint.

Thanks for you answer. I have added the source code.

I don’t see it. If the files are not too long, you should be able to copy and paste them directly into the thread here - no need to attach anything. There is a code formatting block in the toolbar, select the code after you have pasted it in and then click the </> symbol in the toolbar.

It took a long time for scanning the attachments. I added them to the question. Thanks!

I’m getting a seg fault in your code on this line:

coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
		CL_MEM_COPY_HOST_PTR, sizeof(coords) * numberOfFaces * 9, coords, &err);

According to the OCL documentation:

https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clCreateBuffer.html

clCreateBuffer, when used with CL_MEM_COPY_HOST_PTR will initialize the buffer with the contents pointed to by coords (the host pointer). However you’ve specified a buffer size of:

sizeof(coords) * numberOfFaces * 9

which is larger than the size of the coords allocation:

float coords[54] = ...

It seg faults within the call to clCreateBuffer as it tries to copy from the coords buffer and goes beyond the end of it, because your specified buffer size is much larger than the coords buffer.

That looks like broken code to me.

Thanks for looking into my code. Can you tell me what will be the right way of doing so? I replaced sizeof(coords) with sizeof(float) but still get the same error.

This seg fault is not the same as your CL_OUT_OF_RESOURCES error. Its a separate issue. It occurs before the enqueue of the kernel. But I can’t get past that seg fault without making changes to your code. Changing your code requires an understanding of what the size of that buffer should be and how it should be initialized. I don’t have that knowledge.

Here’s another bug in your code. The first kernel parameter is an int:

__kernel void voxelizer(int global_size, ...

But in your main program, you are declaring global_size as a size_t and passing that as the first kernel argument:

size_t local_size, max_local_size, global_size, processed_global_size;
        ...
	err = clSetKernelArg(voxelization_kernel, 0, sizeof(global_size), &global_size);

On a 64-bit platform, which is what I happen to be using, size_t and int are not the same thing, so that clSetKernelArg command throws an error.

Anyway, the CL_OUT_OF_RESOURCES is due I believe to a registers/thread issue. This can be worked around by reducing the size of your workgroup:

local_size = max_local_size/2;   // makes local_size 512 instead of 1024

You also have bugs in your code in the handling of the buffers after enqueue of the kernel, around the sizes of the mapped buffer. Finally, your kernel is accessing data out-of-bounds.

I have put some description regarding the above points that you mentioned. Hope that it will explain what you wanted.

The function that this code is supposed to do is to receive a triangular mesh and voxelize it. Reading the mesh and displaying the voxels are done using VTK. I removed those parts here to avoid the need for installing VTK in case you didn’t have it.

The array “coords” will contain the coordinates of vertices of a triangle in the mesh. So, for a mesh containing “numberOfFaces” triangles, the coords must contain “numberOfFaces * 9” elements simply because there are three vertices for each face and each vertex has 3-dimensional coordinates. As including all the coordinates of the mesh here was not possible, I put 54 numbers which represent a simple mesh with “numberOfFaces = 6” triangles. But, for real geometries this number can be very large.

In the implementation that I had previously in pure C, I loop over all the triangles of the mesh which was very time-consuming. Therefore, I wanted to create a work-item for every trinagle of the mesh here and parallelize the algorithm in this way. That’s why global size is set to “numberOfFaces”. However, since the global size must be a multiple of local size I had created the processed global size as in practice there is no guarantee that the number of faces of a mesh will a multiple of local size.

Yes, I forgot the global size must be a multiple of the local size, so I removed that comment. Your code has many other issues apart from that. You may want to re-read my comments above.

Sorry I forgot to answer the upper questions.

The coords_buffer needs to transfer all the elements of coords array. Therefore, in this case it should transfer all the 54 numbers. So, I think in order to initialize that buffer the following code will be fine: (In the original code when I use VTK, I find the number of triangles from mesh and dynamically allocate coords with “numberOfFaces * 9”).

coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
		CL_MEM_COPY_HOST_PTR, sizeof(float) * 54, coords, &err);

Thanks to your comment I have changed the first argument’s type to size_t.

One weird thing about my code is in the kernel, when I comment the codes between the lines 265-287 it doesn’t give the error. It is even working when I uncomment those lines but comment for two other blocks on top of them! That’s why I thought may be the number of instructions is getting too much in the kernel but I don’t think that GPU programming is this much limited cause my kernel does not seem to be that big!

In my comment #9 I indicated the reason for the CL_OUT_OF_RESOURCES_ERROR It is due to the compiler using up a large number of registers. When you modify the kernel code, it modifies the register usage.

Great! This led to running the kernel. Thanks!

But, I noticed another issue. The results are different when I run the code on Intel compared to NVIDIA. The correct result must be an array of length 16 with all elements equal to one except the last one. The Intel HD gives the correct result, but NVIDIA is doing it wrong! There results should be similar, right?

Your kernel code is making access errors. It is accessing data out-of-range. I mentioned this already.

Okay. I take a closer look into it. I am using printf to debug the code. Do you know any better way to do so?

Your buffer mapping at the end is broken. did you fix that?

Oh, thanks so much for pointing out this issue. I fixed it with the following code. This fixed the issue and the code runs well on NVIDIA for relatively meshes.

density_mapped_memory = clEnqueueMapBuffer(queue, density_buffer, CL_TRUE,
		CL_MAP_READ, 0, sizeof(int) *  voxelCounts, 0, NULL, NULL, &err);
	if (err < 0) {
		printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
		exit(1);
	}
	memcpy(density, density_mapped_memory, sizeof(int)* voxelCounts);
	err = clEnqueueUnmapMemObject(queue, density_buffer, density_mapped_memory,
		0, NULL, NULL);
	if (err < 0) {
		printf("Error code: %d. Couldn't unmap the density_buffer\n", err);
		exit(1);

I really appreciate your time to solve this issue!