There is not yet a forum for OpenCL, so I am posting this thread here, as it is still related to CUDA.

Dear All

I am porting an OpenCL program to CUDA, but I have to get it working in OpenCL first.

The OpenCL examples I found on the NVIDIA site all gave the same two errors in Visual Studio 2015:

exception.h cannot be found (is it an NVIDIA or a Microsoft include?)
identifier “RUNTIME_EXCEPTION” is undefined

The original projects are for Visual 2008

Thanks

Luis Gonçalves

Another question: to run OpenCL in Visual Studio 2015, does the project need to be a CUDA project?

Thanks

Luis Gonçalves

It should not be necessary to make a CUDA project.

You should be able to drop an OpenCL code into a new project in VS, then provide the include path to the cl.h header file, and provide the linker path to the OpenCL library. That’s all that should be needed.

On VS 2015, on a system with CUDA 8 and a proper GPU install, I did the following:

  • start a new empty “general” project - console application
  • set the build configuration to x64 Release
  • in the project explorer window, select Source Files (folder). Right-click, select Add…C++ file (.cpp)
  • double-click on the new source file (Source.cpp)
  • paste in an OpenCL program, such as:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <CL/cl.h>

/* Turn the macro argument into a string literal. The preprocessor removes
 * comments and collapses each run of whitespace (including newlines) to a
 * single space before stringifying, so the kernel below ends up as one line. */
#define STRINGIFY(s) #s

/* Terminate the program unless `e` equals CL_SUCCESS.
 *
 * This deliberately does not use assert(): assert() expands to nothing when
 * NDEBUG is defined (the default for Release builds), which would silently
 * turn every check into a no-op. The argument is parenthesised so that
 * expressions with low-precedence operators are compared as a whole, and it
 * is evaluated exactly once. The macro is still usable as an expression. */
#define CL_SUCCEED(e) ((e) == CL_SUCCESS ? (void)0 : abort())

/* OpenCL C source of the vector_add kernel: each work-item adds one pair of
 * elements, C[i] = A[i] + B[i], with i taken from get_global_id(0). The
 * kernel has no bounds check, so the host must launch exactly one work-item
 * per element. */
const char *kernel_cl = STRINGIFY(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {

    // Get the index of the current element to be processed
    int i = get_global_id(0);

    // Do the operation
    C[i] = A[i] + B[i];
});

int main(void) {
    printf("started running\n");

    // Create the two input vectors
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }

    // Load the kernel source code into the array source_str
    const char *source_str = kernel_cl;
    size_t source_size;
    source_size = strlen(source_str);
    printf("kernel loading done\n");
    // Get platform and device information
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;

cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
    cl_platform_id *platforms = NULL;
    platforms = (cl_platform_id*)malloc(ret_num_platforms*sizeof(cl_platform_id));

    ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    ret = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_ALL, 1,
            &device_id, &ret_num_devices);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}
    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    // Create memory buffers on the device for each vector
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);

    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    printf("before building\n");
    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    printf("after building\n");
    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    if (ret != CL_SUCCESS) {printf("ret at %d is %d\n", __LINE__, ret); CL_SUCCEED(ret);}

    printf("before execution\n");
    // Execute the OpenCL kernel on the list
    size_t global_item_size = LIST_SIZE; // Process the entire lists
    size_t local_item_size = 64; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);
 printf("after execution\n");
    // Read the memory buffer C on the device to the local variable C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
 printf("after copying\n");
    // Display the result to the screen
    for(i = 0; i < 10; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    getchar();
    return 0;
}
  • In project properties…C/C++…General…Additional Include Directories, add the path to your CL/cl.h file, for me I added C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include
  • In project properties…Linker…Input…Additional Dependencies, add your OpenCL lib, for me I added OpenCL.lib
  • In project properties…Linker…General…Additional Library Directories, add the path to your opencl lib, for me I added C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\x64

Then build the code and run it.