clEnqueueNDRangeKernel() returns CL_OUT_OF_RESOURCES and I have no idea why

I’m trying to run my program, and it worked at first with these args. If I change the size of unifiedSrcValid, it may (or may not) return the CL_OUT_OF_RESOURCES error. I can’t find a reason why it wouldn’t work. I’m running on a GTX 285, so I should have plenty of GPU memory. My machine is a Mac Pro with 16 GB of RAM running 10.6.3. It gives the same error if I run it on the CPU vs the GPU.

Does anyone else get this error (or another error) with this code? Am I doing anything wrong?

#include <fcntl.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include <unistd.h>

#include <sys/types.h>

#include <sys/stat.h>

#include <OpenCL/opencl.h>

#define TRUE 1

#define FALSE 0

////////////////////////////////////////////////////////////////////////////////

struct oclWarper {

	cl_command_queue queue;

	cl_context context;

	cl_kernel kern;

	cl_device_id dev;

	

	int srcWidth;

	int srcHeight;

	int dstWidth;

	int dstHeight;

	

	unsigned int xyChSize;

	cl_channel_order xyChOrder;

	unsigned int imgChSize;

	cl_channel_order imgChOrder;

	cl_channel_type imageFormat;

};

// Simple compute kernel which computes the square of an input array 

//

const char *KernelSource = "\n"

"__kernel void resamp(read_only image2d_t srcCoords,\n"

					"read_only image2d_t srcReal,\n"

					"read_only image2d_t srcImag,\n"

					"__constant float *fUnifiedSrcDensity,\n"

					"__constant int *nUnifiedSrcValid,\n"

					"__constant char *useBandSrcValid,\n"

					"__constant int *nBandSrcValid,\n"

					"__global char *dstReal,\n"

					"__global char *dstImag,\n"

					"__constant float *fDstNoDataReal,\n"

					"__constant float *dstDensity,\n"

					"__constant int *nDstValid,\n"

					"const int bandNum)\n"

"{}\n";

#define handleErr(err) if((err) != CL_SUCCESS) { \

printf("Error at file %s line %d; Err val: %d\n", __FILE__, __LINE__, err); \

printCLErr(err); \

while(1){}\

return err; \

}

void printCLErr(cl_int err)

{

	switch (err)

	{

		case CL_SUCCESS:

			printf("CL_SUCCESS\n");

			break;

		case CL_DEVICE_NOT_FOUND:

			printf("CL_DEVICE_NOT_FOUND\n");

			break;

		case CL_DEVICE_NOT_AVAILABLE:

			printf("CL_DEVICE_NOT_AVAILABLE\n");

			break;

		case CL_COMPILER_NOT_AVAILABLE:

			printf("CL_COMPILER_NOT_AVAILABLE\n");

			break;

		case CL_MEM_OBJECT_ALLOCATION_FAILURE:

			printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n");

			break;

		case CL_OUT_OF_RESOURCES:

			printf("CL_OUT_OF_RESOURCES\n");

			break;

		case CL_OUT_OF_HOST_MEMORY:

			printf("CL_OUT_OF_HOST_MEMORY\n");

			break;

		case CL_PROFILING_INFO_NOT_AVAILABLE:

			printf("CL_PROFILING_INFO_NOT_AVAILABLE\n");

			break;

		case CL_MEM_COPY_OVERLAP:

			printf("CL_MEM_COPY_OVERLAP\n");

			break;

		case CL_IMAGE_FORMAT_MISMATCH:

			printf("CL_IMAGE_FORMAT_MISMATCH\n");

			break;

		case CL_IMAGE_FORMAT_NOT_SUPPORTED:

			printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n");

			break;

		case CL_BUILD_PROGRAM_FAILURE:

			printf("CL_BUILD_PROGRAM_FAILURE\n");

			break;

		case CL_MAP_FAILURE:

			printf("CL_MAP_FAILURE\n");

			break;

		case CL_INVALID_VALUE:

			printf("CL_INVALID_VALUE\n");

			break;

		case CL_INVALID_DEVICE_TYPE:

			printf("CL_INVALID_DEVICE_TYPE\n");

			break;

		case CL_INVALID_PLATFORM:

			printf("CL_INVALID_PLATFORM\n");

			break;

		case CL_INVALID_DEVICE:

			printf("CL_INVALID_DEVICE\n");

			break;

		case CL_INVALID_CONTEXT:

			printf("CL_INVALID_CONTEXT\n");

			break;

		case CL_INVALID_QUEUE_PROPERTIES:

			printf("CL_INVALID_QUEUE_PROPERTIES\n");

			break;

		case CL_INVALID_COMMAND_QUEUE:

			printf("CL_INVALID_COMMAND_QUEUE\n");

			break;

		case CL_INVALID_HOST_PTR:

			printf("CL_INVALID_HOST_PTR\n");

			break;

		case CL_INVALID_MEM_OBJECT:

			printf("CL_INVALID_MEM_OBJECT\n");

			break;

		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:

			printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n");

			break;

		case CL_INVALID_IMAGE_SIZE:

			printf("CL_INVALID_IMAGE_SIZE\n");

			break;

		case CL_INVALID_SAMPLER:

			printf("CL_INVALID_SAMPLER\n");

			break;

		case CL_INVALID_BINARY:

			printf("CL_INVALID_BINARY\n");

			break;

		case CL_INVALID_BUILD_OPTIONS:

			printf("CL_INVALID_BUILD_OPTIONS\n");

			break;

		case CL_INVALID_PROGRAM:

			printf("CL_INVALID_PROGRAM\n");

			break;

		case CL_INVALID_PROGRAM_EXECUTABLE:

			printf("CL_INVALID_PROGRAM_EXECUTABLE\n");

			break;

		case CL_INVALID_KERNEL_NAME:

			printf("CL_INVALID_KERNEL_NAME\n");

			break;

		case CL_INVALID_KERNEL_DEFINITION:

			printf("CL_INVALID_KERNEL_DEFINITION\n");

			break;

		case CL_INVALID_KERNEL:

			printf("CL_INVALID_KERNEL\n");

			break;

		case CL_INVALID_ARG_INDEX:

			printf("CL_INVALID_ARG_INDEX\n");

			break;

		case CL_INVALID_ARG_VALUE:

			printf("CL_INVALID_ARG_VALUE\n");

			break;

		case CL_INVALID_ARG_SIZE:

			printf("CL_INVALID_ARG_SIZE\n");

			break;

		case CL_INVALID_KERNEL_ARGS:

			printf("CL_INVALID_KERNEL_ARGS\n");

			break;

		case CL_INVALID_WORK_DIMENSION:

			printf("CL_INVALID_WORK_DIMENSION\n");

			break;

		case CL_INVALID_WORK_GROUP_SIZE:

			printf("CL_INVALID_WORK_GROUP_SIZE\n");

			break;

		case CL_INVALID_WORK_ITEM_SIZE:

			printf("CL_INVALID_WORK_ITEM_SIZE\n");

			break;

		case CL_INVALID_GLOBAL_OFFSET:

			printf("CL_INVALID_GLOBAL_OFFSET\n");

			break;

		case CL_INVALID_EVENT_WAIT_LIST:

			printf("CL_INVALID_EVENT_WAIT_LIST\n");

			break;

		case CL_INVALID_EVENT:

			printf("CL_INVALID_EVENT\n");

			break;

		case CL_INVALID_OPERATION:

			printf("CL_INVALID_OPERATION\n");

			break;

		case CL_INVALID_GL_OBJECT:

			printf("CL_INVALID_GL_OBJECT\n");

			break;

		case CL_INVALID_BUFFER_SIZE:

			printf("CL_INVALID_BUFFER_SIZE\n");

			break;

		case CL_INVALID_MIP_LEVEL:

			printf("CL_INVALID_MIP_LEVEL\n");

			break;

		case CL_INVALID_GLOBAL_WORK_SIZE:

			printf("CL_INVALID_GLOBAL_WORK_SIZE\n");

			break;

	}

}

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv)

{

	cl_device_id device_id;			 // compute device id 

	cl_context context;				 // compute context

	cl_command_queue commands;		  // compute command queue

	cl_program program;				 // compute program

	cl_kernel kernel;				   // compute kernel

	

	struct oclWarper warperStack;

	struct oclWarper *warper = &warperStack;

	cl_image_format imgFmt;

	

	warper->srcWidth = 2703;

	warper->srcHeight = 2685;

	warper->dstWidth = 2248;

	warper->dstHeight = 3086;

	

	warper->xyChSize = 4;

	warper->xyChOrder = 4277;

	warper->imgChOrder = 4280;

	warper->imageFormat = 4306;

	warper->imgChSize = 1;

	

	cl_int err = CL_SUCCESS;

	size_t numSrcPx = warper->srcWidth * warper->srcHeight;

	int validSrcSz = sizeof(int) * (1 + (numSrcPx >> 5));

	size_t numDstPx = warper->dstWidth * warper->dstHeight;

	

	cl_mem xy, unifiedSrcDensityCL, unifiedSrcValidCL;

	cl_mem dstDensityCL, dstValidCL, dstNoDataRealCL;

	cl_mem useBandSrcValidCL, nBandSrcValidCL;

	

	cl_mem srcImag, srcReal;

	cl_mem dstReal, dstImag;

	

	size_t ceil_runs[2];

	size_t group_size[2];

	

	int bandNum = 0;

	

	// Connect to a compute device

	//

	int gpu = 0;

	err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);

	if (err != CL_SUCCESS)

	{

		printf("Error: Failed to create a device group!\n");

		return EXIT_FAILURE;

	}

	// Create a compute context 

	//

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

	if (!context)

	{

		printf("Error: Failed to create a compute context!\n");

		return EXIT_FAILURE;

	}

	// Create a command commands

	//

	commands = clCreateCommandQueue(context, device_id, 0, &err);

	if (!commands)

	{

		printf("Error: Failed to create a command commands!\n");

		return EXIT_FAILURE;

	}

	// Create the compute program from the source buffer

	//

	program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);

	if (!program)

	{

		printf("Error: Failed to create compute program!\n");

		return EXIT_FAILURE;

	}

	// Build the program executable

	//

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	if (err != CL_SUCCESS)

	{

		size_t len;

		char buffer[2048];

		printf("Error: Failed to build program executable!\n");

		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

		printf("%s\n", buffer);

		exit(1);

	}

	// Create the compute kernel in the program we wish to run

	//

	kernel = clCreateKernel(program, "resamp", &err);

	if (!kernel || err != CL_SUCCESS)

	{

		printf("Error: Failed to create compute kernel!\n");

		printCLErr(err);

		exit(1);

	}

	

	//************************************************************

***

	

	warper->context = context;

	warper->queue = commands;

	warper->dev = device_id;

	warper->kern = kernel;

	

	//Copy coord data to the device

	imgFmt.image_channel_order = warper->xyChOrder;

	imgFmt.image_channel_data_type = CL_FLOAT;

	xy = clCreateImage2D(warper->context, CL_MEM_READ_ONLY, &imgFmt,

						 (size_t) warper->dstWidth, (size_t) warper->dstHeight,

						 (size_t) sizeof(float) * warper->xyChSize * warper->dstWidth,

						 NULL, &err);

	handleErr(err);

	

	//Set up argument

	handleErr(err = clSetKernelArg(warper->kern, 0, sizeof(cl_mem), &xy));

	

	//Set up image vars

	unifiedSrcDensityCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY,

										 sizeof(float) * numSrcPx, NULL, &err);

	handleErr(err);

	

	//Copy unifiedSrcValid if it exists

	if (FALSE) {

		//Alloc dummy device RAM

		unifiedSrcValidCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, NULL, &err);

		handleErr(err);

	} else {

		//Alloc & copy all validity data

		unifiedSrcValidCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY,

										   validSrcSz, NULL, &err);

		handleErr(err);

	}

	

	//Make a fake image so we don't have a NULL pointer

	useBandSrcValidCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, NULL, &err);

	handleErr(err);

	nBandSrcValidCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, NULL, &err);

	handleErr(err);

	

	//Set up arguments

	handleErr(err = clSetKernelArg(warper->kern, 3, sizeof(cl_mem), &unifiedSrcDensityCL));

	handleErr(err = clSetKernelArg(warper->kern, 4, sizeof(cl_mem), &unifiedSrcValidCL));

	handleErr(err = clSetKernelArg(warper->kern, 5, sizeof(cl_mem), &useBandSrcValidCL));

	handleErr(err = clSetKernelArg(warper->kern, 6, sizeof(cl_mem), &nBandSrcValidCL));

	

	//Set up image vars

	imgFmt.image_channel_order = warper->imgChOrder;

	imgFmt.image_channel_data_type = warper->imageFormat;

	srcReal = clCreateImage2D(warper->context, CL_MEM_READ_ONLY, &imgFmt,

							  (size_t) warper->srcWidth, (size_t) warper->srcHeight,

							  warper->srcWidth * warper->imgChSize * sizeof(char),

							  NULL, &err);

	handleErr(err);

	srcImag = clCreateImage2D(warper->context, CL_MEM_READ_ONLY, &imgFmt,

							  1, 1, warper->imgChSize * sizeof(char), NULL, &err);

	handleErr(err);

	

	//Set up per-band arguments

	handleErr(err = clSetKernelArg(warper->kern, 1, sizeof(cl_mem), &srcReal));

	handleErr(err = clSetKernelArg(warper->kern, 2, sizeof(cl_mem), &srcImag));

	

	//Make dummy memory

	dstReal = clCreateBuffer(warper->context, CL_MEM_READ_WRITE,

							 numDstPx * warper->imgChSize * sizeof(char), NULL, &err);

	handleErr(err);

	dstImag = clCreateBuffer(warper->context, CL_MEM_READ_WRITE, 1, NULL, &err);

	handleErr(err);

	

	//Set up per-band arguments

	handleErr(err = clSetKernelArg(warper->kern, 7, sizeof(cl_mem), &dstReal));

	handleErr(err = clSetKernelArg(warper->kern, 8, sizeof(cl_mem), &dstImag));

	

	//Make dummy memory

	dstNoDataRealCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, NULL, &err);

	handleErr(err);

	dstDensityCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, NULL, &err);

	handleErr(err);

	dstValidCL = clCreateBuffer(warper->context, CL_MEM_READ_ONLY, 1, NULL, &err);

	handleErr(err);

	

	//Set up arguments

	handleErr(err = clSetKernelArg(warper->kern,  9, sizeof(cl_mem), &dstNoDataRealCL));

	handleErr(err = clSetKernelArg(warper->kern, 10, sizeof(cl_mem), &dstDensityCL));

	handleErr(err = clSetKernelArg(warper->kern, 11, sizeof(cl_mem), &dstValidCL));

	

	handleErr(err = clSetKernelArg(warper->kern, 12, sizeof(int), &bandNum));

	

	ceil_runs[0] = 1;

	ceil_runs[1] = 1;

	group_size[0] = 1;

	group_size[1] = 1;

	

	handleErr(err = clEnqueueNDRangeKernel(warper->queue, warper->kern, 2, NULL, 

										   ceil_runs, group_size, 0, NULL, NULL));

	

	return 0;

}

Hey all, I figured it out. I’m overusing ‘__constant’. I removed the ‘__constant’ qualifiers on the larger arrays in the kernel and it works fine now.