read_imagef() always returns zeros. Why?

I’m trying to do some work with image memory, except everything I do returns the image texture as zeros. I’ve been working on this for far too many hours. Help! Why can’t I get the read_imagef() functions to return anything useful?

Here is my complete code worked into a simple example:

#include <stdio.h>

#include <assert.h>

#include <sys/sysctl.h>

#include <sys/stat.h>

#include <stdlib.h>

#include <stdio.h>

#include <OpenCL/OpenCL.h>

#pragma mark -

#pragma mark Utilities

char * load_program_source(const char *filename)

{ 

	

	struct stat statbuf;

	FILE *fh; 

	char *source; 

	

	fh = fopen(filename, "r");

	if (fh == 0)

		return 0; 

	

	stat(filename, &statbuf);

	source = (char *) malloc(statbuf.st_size + 1);

	fread(source, statbuf.st_size, 1, fh);

	source[statbuf.st_size] = '

#include <stdio.h>

#include <assert.h>

#include <sys/sysctl.h>

#include <sys/stat.h>

#include <stdlib.h>

#include <stdio.h>

#include <OpenCL/OpenCL.h>

#pragma mark -

#pragma mark Utilities

char * load_program_source(const char *filename)

{

struct stat statbuf;

FILE *fh; 

char *source; 



fh = fopen(filename, "r");

if (fh == 0)

	return 0; 



stat(filename, &statbuf);

source = (char *) malloc(statbuf.st_size + 1);

fread(source, statbuf.st_size, 1, fh);

source[statbuf.st_size] = '\0'; 



return source; 

}

#pragma mark -

#pragma mark Main OpenCL Routine

int runCL(char * a, char * b, float * results, int n)

{

cl_program program[1];

cl_kernel kernel[2];



cl_command_queue cmd_queue;

cl_context   context;



cl_device_id cpu = NULL, device = NULL;



cl_int err = 0;

size_t returned_size = 0;



cl_mem a_mem, b_mem, ans_mem;

#pragma mark Device Information

{

	// List all devices

	cl_uint num_dev;

	cl_device_id dev_list[5];

	// Find the CPU CL device

	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

	assert(err == CL_SUCCESS);

	assert(device);

	

	// Get some information about the returned device

	cl_char vendor_name[1024] = {0};

	cl_char device_name[1024] = {0};

	err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 

						  vendor_name, &returned_size);

	err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 

						  device_name, &returned_size);

	assert(err == CL_SUCCESS);

	printf("Connecting to %s %s...\n", vendor_name, device_name);

}

#pragma mark Context and Command Queue

{

	// Now create a context to perform our calculation with the 

	// specified device 

	context = clCreateContext(0, 1, &device, NULL, NULL, &err);

	assert(err == CL_SUCCESS);

	

	// And also a command queue for the context

	cmd_queue = clCreateCommandQueue(context, device, 0, NULL);

}

#pragma mark Program and Kernel Creation

{

	// Load the program source from disk

	// The kernel/program is the project directory and in Xcode the executable

	// is set to launch from that directory hence we use a relative path

	const char * filename = "example.cl";

	char *program_source = load_program_source(filename);

	program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,

										   NULL, &err);

	assert(err == CL_SUCCESS);

	

	err = clBuildProgram(program[0], 1, &device, "-cl-fast-relaxed-math", NULL, NULL);

	assert(err == CL_SUCCESS);

	

	// Now create the kernel "objects" that we want to use in the example file 

	kernel[0] = clCreateKernel(program[0], "add", &err);

}

#pragma mark Memory Allocation

{

	cl_image_format imgFmt;

	imgFmt.image_channel_order = CL_A;

	imgFmt.image_channel_data_type = CL_UNORM_INT8;

	

	// Allocate memory on the device to hold our data and store the results into

	a_mem = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,

							10, 10, sizeof(char) * 10, a, &err);

	assert(err == CL_SUCCESS);

	b_mem = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,

							10, 10, sizeof(char) * 10, b, &err);

	assert(err == CL_SUCCESS);

	

	// Results array

	ans_mem	= clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*100, NULL, NULL);

	

	// Get all of the stuff written and allocated 

	clFinish(cmd_queue);

}

#pragma mark Kernel Arguments

{

	// Now setup the arguments to our kernel

	err  = clSetKernelArg(kernel[0],  0, sizeof(cl_mem), &a_mem);

	err |= clSetKernelArg(kernel[0],  1, sizeof(cl_mem), &b_mem);

	err |= clSetKernelArg(kernel[0],  2, sizeof(cl_mem), &ans_mem);

	assert(err == CL_SUCCESS);

}

#pragma mark Execution and Read

{

	// Run the calculation by enqueuing it and forcing the 

	// command queue to complete the task

	size_t global_work_size = n;

	err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, 

								 &global_work_size, NULL, 0, NULL, NULL);

	assert(err == CL_SUCCESS);

	clFinish(cmd_queue);

	

	// Once finished read back the results from the answer 

	// array into the results array

	err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, sizeof(float)*100, 

							  results, 0, NULL, NULL);

	assert(err == CL_SUCCESS);

	clFinish(cmd_queue);

}

#pragma mark Teardown

{

	clReleaseMemObject(a_mem);

	clReleaseMemObject(b_mem);

	clReleaseMemObject(ans_mem);

	

	clReleaseCommandQueue(cmd_queue);

	clReleaseContext(context);

}

return CL_SUCCESS;

}

int main (int argc, const char * argv) {

// Problem size

int n = 100;

int i;



// Allocate some memory and a place for the results

char * a = (char *)malloc(n*sizeof(char));

char * b = (char *)malloc(n*sizeof(char));

float * results = (float *)malloc(n*sizeof(float));



// Fill in the values

for(i=0;i<n;i++){

	a[i] = (char)i;

	b[i] = (char)n-i;

	results[i] = 0.f;

}



// Do the OpenCL calculation

runCL(a, b, results, n);



// Print out some results. For this example the values of all elements

// should be the same as the value of n

for(i=0;i<n && i<32;i++) printf("%f\n",results[i]);



// Free up memory

free(a);

free(b);

free(results);



return 0;

}

'; 

	

	return source; 

} 

#pragma mark -

#pragma mark Main OpenCL Routine

int runCL(char * a, char * b, float * results, int n)

{

	cl_program program[1];

	cl_kernel kernel[2];

	

	cl_command_queue cmd_queue;

	cl_context   context;

	

	cl_device_id cpu = NULL, device = NULL;

	

	cl_int err = 0;

	size_t returned_size = 0;

	

	cl_mem a_mem, b_mem, ans_mem;

	

#pragma mark Device Information

	{

		// List all devices

		cl_uint num_dev;

		cl_device_id dev_list[5];

		// Find the CPU CL device

		err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);

		assert(err == CL_SUCCESS);

		assert(device);

		

		// Get some information about the returned device

		cl_char vendor_name[1024] = {0};

		cl_char device_name[1024] = {0};

		err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), 

							  vendor_name, &returned_size);

		err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), 

							  device_name, &returned_size);

		assert(err == CL_SUCCESS);

		printf("Connecting to %s %s...\n", vendor_name, device_name);

	}

	

#pragma mark Context and Command Queue

	{

		// Now create a context to perform our calculation with the 

		// specified device 

		context = clCreateContext(0, 1, &device, NULL, NULL, &err);

		assert(err == CL_SUCCESS);

		

		// And also a command queue for the context

		cmd_queue = clCreateCommandQueue(context, device, 0, NULL);

	}

	

#pragma mark Program and Kernel Creation

	{

		// Load the program source from disk

		// The kernel/program is the project directory and in Xcode the executable

		// is set to launch from that directory hence we use a relative path

		const char * filename = "example.cl";

		char *program_source = load_program_source(filename);

		program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,

											   NULL, &err);

		assert(err == CL_SUCCESS);

		

		err = clBuildProgram(program[0], 1, &device, "-cl-fast-relaxed-math", NULL, NULL);

		assert(err == CL_SUCCESS);

		

		// Now create the kernel "objects" that we want to use in the example file 

		kernel[0] = clCreateKernel(program[0], "add", &err);

	}

		

#pragma mark Memory Allocation

	{

		cl_image_format imgFmt;

		imgFmt.image_channel_order = CL_A;

		imgFmt.image_channel_data_type = CL_UNORM_INT8;

		

		// Allocate memory on the device to hold our data and store the results into

		a_mem = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,

								10, 10, sizeof(char) * 10, a, &err);

		assert(err == CL_SUCCESS);

		b_mem = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &imgFmt,

								10, 10, sizeof(char) * 10, b, &err);

		assert(err == CL_SUCCESS);

		

		// Results array

		ans_mem	= clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*100, NULL, NULL);

		

		// Get all of the stuff written and allocated 

		clFinish(cmd_queue);

	}

	

#pragma mark Kernel Arguments

	{

		// Now setup the arguments to our kernel

		err  = clSetKernelArg(kernel[0],  0, sizeof(cl_mem), &a_mem);

		err |= clSetKernelArg(kernel[0],  1, sizeof(cl_mem), &b_mem);

		err |= clSetKernelArg(kernel[0],  2, sizeof(cl_mem), &ans_mem);

		assert(err == CL_SUCCESS);

	}

	

#pragma mark Execution and Read

	{

		// Run the calculation by enqueuing it and forcing the 

		// command queue to complete the task

		size_t global_work_size = n;

		err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, 

									 &global_work_size, NULL, 0, NULL, NULL);

		assert(err == CL_SUCCESS);

		clFinish(cmd_queue);

		

		// Once finished read back the results from the answer 

		// array into the results array

		err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, sizeof(float)*100, 

								  results, 0, NULL, NULL);

		assert(err == CL_SUCCESS);

		clFinish(cmd_queue);

	}

	

#pragma mark Teardown

	{

		clReleaseMemObject(a_mem);

		clReleaseMemObject(b_mem);

		clReleaseMemObject(ans_mem);

		

		clReleaseCommandQueue(cmd_queue);

		clReleaseContext(context);

	}

	return CL_SUCCESS;

}

int main (int argc, const char * argv[]) {

	

	// Problem size

	int n = 100;

	int i;

	

	// Allocate some memory and a place for the results

	char * a = (char *)malloc(n*sizeof(char));

	char * b = (char *)malloc(n*sizeof(char));

	float * results = (float *)malloc(n*sizeof(float));

	

	// Fill in the values

	for(i=0;i<n;i++){

		a[i] = (char)i;

		b[i] = (char)n-i;

		results[i] = 0.f;

	}

	

	// Do the OpenCL calculation

	runCL(a, b, results, n);

	

	// Print out some results. For this example the values of all elements

	// should be the same as the value of n

	for(i=0;i<n && i<32;i++) printf("%f\n",results[i]);

	

	// Free up memory

	free(a);

	free(b);

	free(results);

	

	return 0;

}
__kernel void 

add(read_only image2d_t a,

	read_only image2d_t b,

	__global float *answer)

{

	int gid = get_global_id(0);

	const sampler_t samp =  CLK_NORMALIZED_COORDS_FALSE |

							CLK_ADDRESS_CLAMP_TO_EDGE |

							CLK_FILTER_NEAREST;

	float4 img = read_imagef(a, samp, (float2)(gid%10, gid/10));

	

	answer[gid] = (read_imagef(a, samp, (float2)(gid%10, gid/10))).x +

				  (read_imagef(b, samp, (float2)(gid%10, gid/10))).x;

	

	printf("%10f\n", img.x);

}

The answer is that which element of the float4 vector returned by read_imagef() is set according to img.image_channel_order. So if I specify CL_A, then the value will be returned in img.w. If I specify CL_R, then the value will be returned in img.x. I was just looking in the wrong spot for my data.