OpenCL Sample Code | DD STOSIC. Simplified, straightforward, OpenCL source code.

dd.stosic · June 25, 2010, 8:32pm

Hello fellow programmers. When I started learning OpenCL, nearly a month ago, I was disturbed by the long source codes I have encountered in Nvidia’s SDK and on the web. Therefore, I simplified some of Nvidia’s sample codes, more specifically: oclSimpleGL and oclVectorAdd, and have adapted them to my own use. Although these samples lack many comments, most of the code is straightforward and easy to understand.

I am providing source code for four of my projects:

[*]simplecl (data set of 100 elements; mutliply each element by an integer) | adapted from oclVectorAdd

[*]add (adds two buffers)

[*]profiler (calculates and displays time required for work-items to execute kernel in simplecl)

[*]wavecl (OpenCL/OpenGL Interoperability) adapted from oclSimpleGL

simplecl

simplecl.cpp (host code):

#include <oclUtils.h>

#include <stdlib.h>

#include <stdio.h>

int main (int argc, const char **argv) {

	cl_platform_id		platform;	  

	cl_device_id		device;

	cl_context			context;

	cl_command_queue	queue;

	cl_program			program;

	cl_kernel			kernel;

	cl_mem				buffer;		

	size_t				kernelsize;

	size_t				global=100;		// global work size

	size_t				local;			// local work size

	cl_float			*data;

	char				*pathname = NULL;	  

	char				*source = NULL;

	int					i;

	

	data = (cl_float*)malloc(sizeof(cl_float) * global);

	for (i=0; i<global; i++)	//	insert values in 'data'.

		data[i] = i;				

	

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, 0, NULL);

	buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, data, NULL);

	

	pathname = shrFindFilePath("simplecl.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "add", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);

	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);

	clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_float) * global, data, 0, NULL, NULL);

	

	for (i=0; i<global; i++)

		printf("%i = %.0f\n", i, data[i]);

}

simplecl.cl (kernel code):

__kernel void add(__global float* data)

{

	int id = get_global_id(0);

	

	data[id] = 5 * data[id];

}

add

addv.cpp (host code):

#include <oclUtils.h>

#include <stdlib.h>

#include <stdio.h>

int main (int argc, const char **argv) {

	// OpenCL Objects

	cl_platform_id		platform;	  

	cl_device_id		device;

	cl_context			context;

	cl_command_queue	queue;

	cl_program			program;

	cl_kernel			kernel;

	cl_mem				a, b, c;	

	size_t				kernelsize;

	size_t				global=100000;		// global work size

	size_t				local=256;		// local work size

	cl_float			*ahost, *bhost, *result;

	cl_ulong			start, end;

	double				total;

	cl_event			event;

	char				*pathname = NULL;	  

	char				*source = NULL;

	int					i;

	

	local=10;

	ahost = (cl_float*)malloc(sizeof(cl_float) * global);

	bhost = (cl_float*)malloc(sizeof(cl_float) * global);

	result = (cl_float*)malloc(sizeof(cl_float) * global);

	for (i = 0; i < global; i++) {	//	insert values in 'ahost' and 'bhost'.

		ahost[i] = i;

		bhost[i] = 5*i;	

	}

		// Setup OpenCL

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); 

	

	a = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, ahost, NULL);

	b = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, bhost, NULL);

	c = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * global, NULL, NULL);

	

	pathname = shrFindFilePath("addv.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "addv", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a);

	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&b);

	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&c);

	

	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);

	

	clWaitForEvents(1, &event);

	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);

	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

	total = (double)(end - start) / 1e9;	/* Convert nanoseconds to seconds */

	clEnqueueReadBuffer(queue, c, CL_TRUE, 0, sizeof(cl_float) * global, result, 0, NULL, NULL);

	

	

//	for (i=0; i < global; i++)

//		printf("%.0f + %.0f = %.0f\n", ahost[i], bhost[i], result[i]);	

	printf("\nProfiling: Total kernel time was %f seconds.\n\n", total);

}

addv.cl (kernel code):

__kernel void addv(__global const float* a, __global const float* b, __global float* c)

{

	int id = get_global_id(0);

	

	c[id] = a[id] + b[id];

}

profiler

profiler.cpp (host code):

#include <oclUtils.h>

#include <stdlib.h>

#include <stdio.h>

int main (int argc, const char **argv) {

	cl_platform_id		platform;	  

	cl_device_id		device;

	cl_context			context;

	cl_command_queue	queue;

	cl_program			program;

	cl_kernel			kernel;

	cl_mem				buffer;		

	size_t				kernelsize;

	size_t				global=256000;		// global work size

	size_t				local=128;			// local work size

	cl_float			*data;

	cl_ulong			start, end;

	double				total;

	cl_event			event;

	char				*pathname = NULL;	  

	char				*source = NULL;

	int					i, err;

	

	data = (cl_float*)malloc(sizeof(cl_float) * global);

	for (i=0; i<global; i++)	//	insert values in 'data'.

		data[i] = i;				

	

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

	context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); 

	buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, data, NULL);

	

	pathname = shrFindFilePath("profiler.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "add", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);

	clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);

		clWaitForEvents(1, &event);

		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);

		clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

		total = (double)(end - start) / 1e9;	/* Convert nanoseconds to seconds */

	clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_float) * global, data, 0, NULL, NULL);

	

	printf("\nProfiling: Total kernel time was %f seconds.\n\n", total);

}

profiler.cl (kernel code):

__kernel void add(__global float* data)

{

	int id = get_global_id(0);

	

	data[id] = sin(data[id]) * cos(data[id]);

}

wavecl

wavecl.cpp (host code):

#include <oclUtils.h>

#include <GL/glew.h>

#include <GL/glut.h>

#define	WIDTH					1408

#define HEIGHT					1024

const unsigned int	xsize = 64, ysize = 64;

const unsigned int	size = xsize * ysize * 4 * sizeof(float); 

cl_platform_id		platform;	  

cl_device_id		device;

cl_context			context;

cl_command_queue	queue;

cl_program			program;

cl_kernel			kernel;

size_t				kernelsize;

cl_mem				vbocl;

size_t				global[] = {xsize, ysize};

char				*pathname = NULL;

char				*source = NULL;	

GLuint	vbo;

float	anim = 0.0;

void display();

void initgl(int argc, const char** argv);

int main(int argc, const char **argv) {

	initgl(argc, argv);

	clGetPlatformIDs(1, &platform, NULL);

	clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

		// Win32 OS-specific contex properties and create OpenCL context

	cl_context_properties	props[] = {

		CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), 

		CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 

		CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 

		0};

	context = clCreateContext(props, 1, &device, NULL, NULL, NULL);

	queue = clCreateCommandQueue(context, device, 0, NULL); 

		// create vbo

	glGenBuffers(1, &vbo);

	glBindBuffer(GL_ARRAY_BUFFER, vbo);

	glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);	// initialize buffer object

	vbocl = clCreateFromGLBuffer(context, CL_MEM_WRITE_ONLY, vbo, NULL);	// create OpenCL buffer from GL VBO

	pathname = shrFindFilePath("wavecl.cl", argv[0]);

	source = oclLoadProgSource(pathname, "", &kernelsize);

	program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);

	clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "sinewave", NULL);

	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&vbocl);

	clSetKernelArg(kernel, 1, sizeof(unsigned int), &xsize);

	clSetKernelArg(kernel, 2, sizeof(unsigned int), &ysize);

	

	glutMainLoop();

}

void initgl(int argc, const char** argv) {

	glutInit(&argc, (char**)argv);

	glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);

	glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - WIDTH/2, glutGet(GLUT_SCREEN_HEIGHT)/2 - HEIGHT/2);

	glutInitWindowSize(WIDTH, HEIGHT);

	glutCreateWindow("OpenCL/GL Interop");

	glutDisplayFunc(display);	// register GLUT callback functions

	glewInit();

	glClearColor(0.0, 0.0, 0.0, 1.0);

	glDisable(GL_DEPTH_TEST);

	glViewport(0, 0, WIDTH, HEIGHT);

	glMatrixMode(GL_PROJECTION);

	glLoadIdentity();

	gluPerspective(10000.0, (GLfloat)WIDTH / (GLfloat)HEIGHT, 0.1, 10.0);

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

	glMatrixMode(GL_MODELVIEW);

	glLoadIdentity();

	glTranslatef(0.0, 0.0, -2.5);

	return;

}

void display() {

	anim += 0.01f;

		// map OpenGL buffer object for writing from OpenCL

	glFinish();

	clEnqueueAcquireGLObjects(queue, 1, &vbocl, 0,0,0);

		// Set arg 3 and execute the kernel

	clSetKernelArg(kernel, 3, sizeof(float), &anim);

	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, 0, 0);

		// unmap buffer object

	clEnqueueReleaseGLObjects(queue, 1, &vbocl, 0,0,0);

	clFinish(queue);

		// clear graphics then render from the vbo

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);	

	glBindBuffer(GL_ARRAY_BUFFER, vbo);

	glVertexPointer(4, GL_FLOAT, 0, 0);

	glEnableClientState(GL_VERTEX_ARRAY);

	glColor3f(0.4, 0.4, 0.4);

	glDrawArrays(GL_POINTS, 0, xsize * ysize);

	glDisableClientState(GL_VERTEX_ARRAY);

	glutSwapBuffers();

	glutPostRedisplay();

}

wavecl.cl (kernel code):

__kernel void sinewave(__global float4* pos, unsigned int width, unsigned int height, float time)

{

	unsigned int x = get_global_id(0);

	unsigned int y = get_global_id(1);

	// calculate uv coordinates

	float u = x / (float) width;

	float v = y / (float) height;

	u = u*2.0f - 1.0f;

	v = v*2.0f - 1.0f;

	// calculate simple sine wave pattern

	float freq = 2.0f;

	float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;

	// write output vertex

	pos[y*width+x] = (float4)(u, w, v, 1.0f);

}

PS: If you wish to create a project without having to deal with all that ‘including libraries’ bureaucracy, do the following:

[list=1]

[*]Locate the sample OpenCL projects from Nvidia’s SDK on your directory

[*]Copy one of those projects and paste it in the same folder (the directory should be something like this: C:\CUDA\OpenCL\src)

[*]Modlify the source code, change the project name, et cetera

DD STOSIC.

fbuyukkececi · February 8, 2011, 4:22pm

Hi dd.stosic,

Your first project didn’t work for me at first due to missing num_devices parameter. Setting 5th parameter of clGetDeviceIDs() to num_devices instead of NULL and using this variable in clCreateContext and clBuildProgram solved the problem. I’m wondering if this is specific with my environment. Could you please verify? I’m using MBP with Mac OSX 10.6.6 with OpenCL 1.0 (I guess).

Thanks a lot for these codes! They are really helpful.

MrNightLifeLover · February 17, 2011, 11:14am

Thanks for posting this. You don’t do error checking, but you should IMHO… Also I looked at the source of deviceQuery which is very complicated for what it does… It would be great to have a simplified version of that too…

Topic		Replies	Views
CL_BUILD_PROGRAM_FAILURE CUDA Programming and Performance	6	21908	February 5, 2012
Problem with get_global_id(1); CUDA Programming and Performance	5	4032	May 19, 2014
OpenCL and Ubuntu 10.10 CUDA Programming and Performance	7	80077	January 25, 2011
CL / GL interop. clCreateFromGLBuffer return CL_OUT_OF_RESOURCES on GeForce GTX 1070 (Linux with Nvi... CUDA Programming and Performance	3	1738	July 20, 2017
Simple OpenCL Example from OpenCL_GettingStartedLinux.pdf wrong output - only zeros CUDA Programming and Performance	3	22284	July 11, 2010
A problem in reading opencl kernel results in a simple reduction implementation CUDA Programming and Performance	0	809	August 4, 2016
openCL-openGL interop 4-5x slower than CG-shaders performance issues with OpenCL CUDA Programming and Performance	6	7471	June 13, 2012
performance question CUDA Programming and Performance	9	9933	August 4, 2010
Intel Opencl sample astonishing godrays implementation CUDA Programming and Performance	8	19348	November 29, 2010
OpenCL Driver BUG? clEnqueueReadImage fails to copy data from Image Object CUDA Programming and Performance	1	4291	April 6, 2011

OpenCL Sample Code | DD STOSIC. Simplified, straightforward, OpenCL source code.

Related topics