OpenCL Sample Code | DD STOSIC. Simplified, straightforward, OpenCL source code.

Hello fellow programmers. When I started learning OpenCL, nearly a month ago, I was disturbed by the long source codes I have encountered in Nvidia’s SDK and on the web. Therefore, I simplified some of Nvidia’s sample codes, more specifically: oclSimpleGL and oclVectorAdd, and have adapted them to my own use. Although these samples lack many comments, most of the code is straightforward and easy to understand.

I am providing source code for four of my projects:

    [*]simplecl (data set of 100 elements; mutliply each element by an integer) | adapted from oclVectorAdd

    [*]add (adds two buffers)

    [*]profiler (calculates and displays time required for work-items to execute kernel in simplecl)

    [*]wavecl (OpenCL/OpenGL Interoperability) adapted from oclSimpleGL

simplecl

simplecl.cpp (host code):

#include <oclUtils.h>

#include <stdlib.h>

#include <stdio.h>

int main (int argc, const char **argv) {

	cl_platform_id		platform;	  

	cl_device_id		device;

	cl_context			context;

	cl_command_queue	queue;

	cl_program			program;

	cl_kernel			kernel;

	cl_mem				buffer;		

	size_t				kernelsize;

	size_t				global=100;		// global work size

	size_t				local;			// local work size

	cl_float			*data;

	char				*pathname = NULL;	  

	char				*source = NULL;

	int					i;

	

	data = (cl_float*)malloc(sizeof(cl_float) * global);

	for (i=0; i<global; i++)	//	insert values in 'data'.

		data[i] = i;				

	

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, 0, NULL);

	buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, data, NULL);

	

	pathname = shrFindFilePath("simplecl.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "add", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);

	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);

	clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_float) * global, data, 0, NULL, NULL);

	

	for (i=0; i<global; i++)

		printf("%i = %.0f\n", i, data[i]);

}

simplecl.cl (kernel code):

__kernel void add(__global float* data)

{

	int id = get_global_id(0);

	

	data[id] = 5 * data[id];

}

add

addv.cpp (host code):

#include <oclUtils.h>

#include <stdlib.h>

#include <stdio.h>

int main (int argc, const char **argv) {

	// OpenCL Objects

	cl_platform_id		platform;	  

	cl_device_id		device;

	cl_context			context;

	cl_command_queue	queue;

	cl_program			program;

	cl_kernel			kernel;

	cl_mem				a, b, c;	

	size_t				kernelsize;

	size_t				global=100000;		// global work size

	size_t				local=256;		// local work size

	cl_float			*ahost, *bhost, *result;

	cl_ulong			start, end;

	double				total;

	cl_event			event;

	char				*pathname = NULL;	  

	char				*source = NULL;

	int					i;

	

	local=10;

	ahost = (cl_float*)malloc(sizeof(cl_float) * global);

	bhost = (cl_float*)malloc(sizeof(cl_float) * global);

	result = (cl_float*)malloc(sizeof(cl_float) * global);

	for (i = 0; i < global; i++) {	//	insert values in 'ahost' and 'bhost'.

		ahost[i] = i;

		bhost[i] = 5*i;	

	}

		// Setup OpenCL

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); 

	

	a = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, ahost, NULL);

	b = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, bhost, NULL);

	c = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * global, NULL, NULL);

	

	pathname = shrFindFilePath("addv.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "addv", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a);

	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&b);

	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&c);

	

	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);

	

	clWaitForEvents(1, &event);

	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);

	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

	total = (double)(end - start) / 1e9;	/* Convert nanoseconds to seconds */

	clEnqueueReadBuffer(queue, c, CL_TRUE, 0, sizeof(cl_float) * global, result, 0, NULL, NULL);

	

	

//	for (i=0; i < global; i++)

//		printf("%.0f + %.0f = %.0f\n", ahost[i], bhost[i], result[i]);	

	printf("\nProfiling: Total kernel time was %f seconds.\n\n", total);

}

addv.cl (kernel code):

__kernel void addv(__global const float* a, __global const float* b, __global float* c)

{

	int id = get_global_id(0);

	

	c[id] = a[id] + b[id];

}

profiler

profiler.cpp (host code):

#include <oclUtils.h>

#include <stdlib.h>

#include <stdio.h>

int main (int argc, const char **argv) {

	cl_platform_id		platform;	  

	cl_device_id		device;

	cl_context			context;

	cl_command_queue	queue;

	cl_program			program;

	cl_kernel			kernel;

	cl_mem				buffer;		

	size_t				kernelsize;

	size_t				global=256000;		// global work size

	size_t				local=128;			// local work size

	cl_float			*data;

	cl_ulong			start, end;

	double				total;

	cl_event			event;

	char				*pathname = NULL;	  

	char				*source = NULL;

	int					i, err;

	

	data = (cl_float*)malloc(sizeof(cl_float) * global);

	for (i=0; i<global; i++)	//	insert values in 'data'.

		data[i] = i;				

	

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); 

	buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, data, NULL);

	

	pathname = shrFindFilePath("profiler.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "add", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);

	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);

		clWaitForEvents(1, &event);

		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);

		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

		total = (double)(end - start) / 1e9;	/* Convert nanoseconds to seconds */

	clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_float) * global, data, 0, NULL, NULL);

	

	printf("\nProfiling: Total kernel time was %f seconds.\n\n", total);

}

profiler.cl (kernel code):

__kernel void add(__global float* data)

{

	int id = get_global_id(0);

	

	data[id] = sin(data[id]) * cos(data[id]);

}

wavecl

wavecl.cpp (host code):

#include <oclUtils.h>

#include <GL/glew.h>

#include <GL/glut.h>

#define	WIDTH					1408

#define HEIGHT					1024

const unsigned int	xsize = 64, ysize = 64;

const unsigned int	size = xsize * ysize * 4 * sizeof(float); 

cl_platform_id		platform;	  

cl_device_id		device;

cl_context			context;

cl_command_queue	queue;

cl_program			program;

cl_kernel			kernel;

size_t				kernelsize;

cl_mem				vbocl;

size_t				global[] = {xsize, ysize};

char				*pathname = NULL;

char				*source = NULL;	

GLuint	vbo;

float	anim = 0.0;

void display();

void initgl(int argc, const char** argv);

int main(int argc, const char **argv) {

	initgl(argc, argv);

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

		// Win32 OS-specific contex properties and create OpenCL context

	cl_context_properties	props[] = {

		CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), 

		CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 

		CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 

		0};

	context = clCreateContext(props, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, 0, NULL); 

		// create vbo

	glGenBuffers(1, &vbo);

	glBindBuffer(GL_ARRAY_BUFFER, vbo);

	glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);	// initialize buffer object

	vbocl = clCreateFromGLBuffer(context, CL_MEM_WRITE_ONLY, vbo, NULL);	// create OpenCL buffer from GL VBO

	pathname = shrFindFilePath("wavecl.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "sinewave", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&vbocl);

	clSetKernelArg(kernel, 1, sizeof(unsigned int), &xsize);

	clSetKernelArg(kernel, 2, sizeof(unsigned int), &ysize);

	

	glutMainLoop();

}

void initgl(int argc, const char** argv) {

	glutInit(&argc, (char**)argv);

	glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);

	glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - WIDTH/2, glutGet(GLUT_SCREEN_HEIGHT)/2 - HEIGHT/2);

	glutInitWindowSize(WIDTH, HEIGHT);

	glutCreateWindow("OpenCL/GL Interop");

	glutDisplayFunc(display);	// register GLUT callback functions

	glewInit();

	glClearColor(0.0, 0.0, 0.0, 1.0);

	glDisable(GL_DEPTH_TEST);

	glViewport(0, 0, WIDTH, HEIGHT);

	glMatrixMode(GL_PROJECTION);

	glLoadIdentity();

	gluPerspective(10000.0, (GLfloat)WIDTH / (GLfloat)HEIGHT, 0.1, 10.0);

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

	glMatrixMode(GL_MODELVIEW);

	glLoadIdentity();

	glTranslatef(0.0, 0.0, -2.5);

	return;

}

void display() {

	anim += 0.01f;

		// map OpenGL buffer object for writing from OpenCL

	glFinish();

	clEnqueueAcquireGLObjects(queue, 1, &vbocl, 0,0,0);

		// Set arg 3 and execute the kernel

	clSetKernelArg(kernel, 3, sizeof(float), &anim);

	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, 0, 0);

		// unmap buffer object

	clEnqueueReleaseGLObjects(queue, 1, &vbocl, 0,0,0);

	clFinish(queue);

		// clear graphics then render from the vbo

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);	

	glBindBuffer(GL_ARRAY_BUFFER, vbo);

	glVertexPointer(4, GL_FLOAT, 0, 0);

	glEnableClientState(GL_VERTEX_ARRAY);

	glColor3f(0.4, 0.4, 0.4);

	glDrawArrays(GL_POINTS, 0, xsize * ysize);

	glDisableClientState(GL_VERTEX_ARRAY);

	glutSwapBuffers();

	glutPostRedisplay();

}

wavecl.cl (kernel code):

__kernel void sinewave(__global float4* pos, unsigned int width, unsigned int height, float time)

{

	unsigned int x = get_global_id(0);

	unsigned int y = get_global_id(1);

	// calculate uv coordinates

	float u = x / (float) width;

	float v = y / (float) height;

	u = u*2.0f - 1.0f;

	v = v*2.0f - 1.0f;

	// calculate simple sine wave pattern

	float freq = 2.0f;

	float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;

	// write output vertex

	pos[y*width+x] = (float4)(u, w, v, 1.0f);

}

PS: If you wish to create a project without having to deal with all that ‘including libraries’ bureaucracy, do the following:

[list=1]

[*]Locate the sample OpenCL projects from Nvidia’s SDK on your directory

[*]Copy one of those projects and paste it in the same folder (the directory should be something like this: C:\CUDA\OpenCL\src)

[*]Modlify the source code, change the project name, et cetera


DD STOSIC.

Hi dd.stosic,

Your first project didn’t work for me at first due to missing num_devices parameter. Setting 5th parameter of clGetDeviceIDs() to num_devices instead of NULL and using this variable in clCreateContext and clBuildProgram solved the problem. I’m wondering if this is specific with my environment. Could you please verify? I’m using MBP with Mac OSX 10.6.6 with OpenCL 1.0 (I guess).

Thanks a lot for these codes! They are really helpful.

Thanks for posting this. You don’t do error checking, but you should IMHO… Also I looked at the source of deviceQuery which is very complicated for what it does… It would be great to have a simplified version of that too…