Hello fellow programmers. When I started learning OpenCL, nearly a month ago, I was disturbed by the long source codes I have encountered in Nvidia’s SDK and on the web. Therefore, I simplified some of Nvidia’s sample codes, more specifically: oclSimpleGL and oclVectorAdd, and have adapted them to my own use. Although these samples lack many comments, most of the code is straightforward and easy to understand.
I am providing source code for four of my projects:
[*]simplecl (data set of 100 elements; mutliply each element by an integer) | adapted from oclVectorAdd
[*]add (adds two buffers)
[*]profiler (calculates and displays time required for work-items to execute kernel in simplecl)
[*]wavecl (OpenCL/OpenGL Interoperability) adapted from oclSimpleGL
simplecl
simplecl.cpp (host code):
#include <oclUtils.h>
#include <stdlib.h>
#include <stdio.h>
int main (int argc, const char **argv) {
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem buffer;
size_t kernelsize;
size_t global=100; // global work size
size_t local; // local work size
cl_float *data;
char *pathname = NULL;
char *source = NULL;
int i;
data = (cl_float*)malloc(sizeof(cl_float) * global);
for (i=0; i<global; i++) // insert values in 'data'.
data[i] = i;
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
queue = clCreateCommandQueue(context, device, 0, NULL);
buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, data, NULL);
pathname = shrFindFilePath("simplecl.cl", argv[0]);
source = oclLoadProgSource(pathname, "", &kernelsize);
program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, "add", NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_float) * global, data, 0, NULL, NULL);
for (i=0; i<global; i++)
printf("%i = %.0f\n", i, data[i]);
}
simplecl.cl (kernel code):
__kernel void add(__global float* data)
{
int id = get_global_id(0);
data[id] = 5 * data[id];
}
add
addv.cpp (host code):
#include <oclUtils.h>
#include <stdlib.h>
#include <stdio.h>
int main (int argc, const char **argv) {
// OpenCL Objects
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem a, b, c;
size_t kernelsize;
size_t global=100000; // global work size
size_t local=256; // local work size
cl_float *ahost, *bhost, *result;
cl_ulong start, end;
double total;
cl_event event;
char *pathname = NULL;
char *source = NULL;
int i;
local=10;
ahost = (cl_float*)malloc(sizeof(cl_float) * global);
bhost = (cl_float*)malloc(sizeof(cl_float) * global);
result = (cl_float*)malloc(sizeof(cl_float) * global);
for (i = 0; i < global; i++) { // insert values in 'ahost' and 'bhost'.
ahost[i] = i;
bhost[i] = 5*i;
}
// Setup OpenCL
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
a = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, ahost, NULL);
b = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, bhost, NULL);
c = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * global, NULL, NULL);
pathname = shrFindFilePath("addv.cl", argv[0]);
source = oclLoadProgSource(pathname, "", &kernelsize);
program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, "addv", NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&b);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&c);
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
clWaitForEvents(1, &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
total = (double)(end - start) / 1e9; /* Convert nanoseconds to seconds */
clEnqueueReadBuffer(queue, c, CL_TRUE, 0, sizeof(cl_float) * global, result, 0, NULL, NULL);
// for (i=0; i < global; i++)
// printf("%.0f + %.0f = %.0f\n", ahost[i], bhost[i], result[i]);
printf("\nProfiling: Total kernel time was %f seconds.\n\n", total);
}
addv.cl (kernel code):
__kernel void addv(__global const float* a, __global const float* b, __global float* c)
{
int id = get_global_id(0);
c[id] = a[id] + b[id];
}
profiler
profiler.cpp (host code):
#include <oclUtils.h>
#include <stdlib.h>
#include <stdio.h>
int main (int argc, const char **argv) {
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem buffer;
size_t kernelsize;
size_t global=256000; // global work size
size_t local=128; // local work size
cl_float *data;
cl_ulong start, end;
double total;
cl_event event;
char *pathname = NULL;
char *source = NULL;
int i, err;
data = (cl_float*)malloc(sizeof(cl_float) * global);
for (i=0; i<global; i++) // insert values in 'data'.
data[i] = i;
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * global, data, NULL);
pathname = shrFindFilePath("profiler.cl", argv[0]);
source = oclLoadProgSource(pathname, "", &kernelsize);
program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, "add", NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer);
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
clWaitForEvents(1, &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
total = (double)(end - start) / 1e9; /* Convert nanoseconds to seconds */
clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_float) * global, data, 0, NULL, NULL);
printf("\nProfiling: Total kernel time was %f seconds.\n\n", total);
}
profiler.cl (kernel code):
__kernel void add(__global float* data)
{
int id = get_global_id(0);
data[id] = sin(data[id]) * cos(data[id]);
}
wavecl
wavecl.cpp (host code):
#include <oclUtils.h>
#include <GL/glew.h>
#include <GL/glut.h>
#define WIDTH 1408
#define HEIGHT 1024
const unsigned int xsize = 64, ysize = 64;
const unsigned int size = xsize * ysize * 4 * sizeof(float);
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
size_t kernelsize;
cl_mem vbocl;
size_t global[] = {xsize, ysize};
char *pathname = NULL;
char *source = NULL;
GLuint vbo;
float anim = 0.0;
void display();
void initgl(int argc, const char** argv);
int main(int argc, const char **argv) {
initgl(argc, argv);
clGetPlatformIDs(1, &platform, NULL);
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Win32 OS-specific contex properties and create OpenCL context
cl_context_properties props[] = {
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
0};
context = clCreateContext(props, 1, &device, NULL, NULL, NULL);
queue = clCreateCommandQueue(context, device, 0, NULL);
// create vbo
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); // initialize buffer object
vbocl = clCreateFromGLBuffer(context, CL_MEM_WRITE_ONLY, vbo, NULL); // create OpenCL buffer from GL VBO
pathname = shrFindFilePath("wavecl.cl", argv[0]);
source = oclLoadProgSource(pathname, "", &kernelsize);
program = clCreateProgramWithSource(context, 1, (const char**)&source, &kernelsize, NULL);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sinewave", NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&vbocl);
clSetKernelArg(kernel, 1, sizeof(unsigned int), &xsize);
clSetKernelArg(kernel, 2, sizeof(unsigned int), &ysize);
glutMainLoop();
}
void initgl(int argc, const char** argv) {
glutInit(&argc, (char**)argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - WIDTH/2, glutGet(GLUT_SCREEN_HEIGHT)/2 - HEIGHT/2);
glutInitWindowSize(WIDTH, HEIGHT);
glutCreateWindow("OpenCL/GL Interop");
glutDisplayFunc(display); // register GLUT callback functions
glewInit();
glClearColor(0.0, 0.0, 0.0, 1.0);
glDisable(GL_DEPTH_TEST);
glViewport(0, 0, WIDTH, HEIGHT);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(10000.0, (GLfloat)WIDTH / (GLfloat)HEIGHT, 0.1, 10.0);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glTranslatef(0.0, 0.0, -2.5);
return;
}
void display() {
anim += 0.01f;
// map OpenGL buffer object for writing from OpenCL
glFinish();
clEnqueueAcquireGLObjects(queue, 1, &vbocl, 0,0,0);
// Set arg 3 and execute the kernel
clSetKernelArg(kernel, 3, sizeof(float), &anim);
clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, 0, 0);
// unmap buffer object
clEnqueueReleaseGLObjects(queue, 1, &vbocl, 0,0,0);
clFinish(queue);
// clear graphics then render from the vbo
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_VERTEX_ARRAY);
glColor3f(0.4, 0.4, 0.4);
glDrawArrays(GL_POINTS, 0, xsize * ysize);
glDisableClientState(GL_VERTEX_ARRAY);
glutSwapBuffers();
glutPostRedisplay();
}
wavecl.cl (kernel code):
__kernel void sinewave(__global float4* pos, unsigned int width, unsigned int height, float time)
{
unsigned int x = get_global_id(0);
unsigned int y = get_global_id(1);
// calculate uv coordinates
float u = x / (float) width;
float v = y / (float) height;
u = u*2.0f - 1.0f;
v = v*2.0f - 1.0f;
// calculate simple sine wave pattern
float freq = 2.0f;
float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;
// write output vertex
pos[y*width+x] = (float4)(u, w, v, 1.0f);
}
PS: If you wish to create a project without having to deal with all that ‘including libraries’ bureaucracy, do the following:
[list=1]
[*]Locate the sample OpenCL projects from Nvidia’s SDK on your directory
[*]Copy one of those projects and paste it in the same folder (the directory should be something like this: C:\CUDA\OpenCL\src)
[*]Modlify the source code, change the project name, et cetera
DD STOSIC.