This is actually an opencl code not cuda. I have an array of 16 chars named density and I want to reduce them into an array of 4 integers called scalar_sums. The kernel is working fine and when I printf the results within kernel I get proper results. However, when I map the memory to print them in my c-code, I am only able to print the first element. I think there might be a problem with my memory mapping like segfaulting but I cannot figure that out where that is.
One wired thing when I comment the last comments of the kernel, i.e. those lines that manipulate scalar_nums, the mapping works fine. It returns their initial value properly. But, when the kernel operate on scalar_nums` the mapping has problem although kernel printf is printing proper numbers.
Do you have any idea why this is not working.
Here are the C-code and the Kernel.
__kernel void reduction(int voxelCounts,
__global char* density,
__local int* partialSums,
__global int* scalar_sum){
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupSize = get_local_size(0);
int i = ceil(convert_float(groupSize / 2));
//printf("%d\n", i);
//partialSums = (convert_int(density[gid]) + 128) / 255;
partialSums[lid] = convert_int(density[gid]);
//printf("partialSums in kernel: %d\n", partialSums);
barrier(CLK_LOCAL_MEM_FENCE);
while (i != 0){
if (gid + i < voxelCounts && lid < i){
partialSums[lid] += partialSums[lid + i];
}
i /= 2;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (gid < voxelCounts && lid == 0){
int temp = partialSums[lid];
barrier(CLK_LOCAL_MEM_FENCE);
scalar_sum[gid] = temp;
printf("At the end: %d --> %d\n", gid, scalar_sum[gid]);
}
}
//This program performs reduction for an array of arbitrary size.
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction_arbitrary.cl"
#define KERNEL_FUNC "reduction"
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '
//This program performs reduction for an array of arbitrary size.
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE “reduction_arbitrary.cl”
#define KERNEL_FUNC “reduction”
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void get_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main() {
/* Host/device data structures */
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int i, err;
/* Program/kernel data structures */
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;
/* Data and buffers */
void *scalar_sum_mapped_memory;
cl_mem density_buffer, scalar_sum_buffer;
char density[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 };
int scalar_sum[4] = {1,2,3,4};
device = create_device();
get_info(device);
/* Create the context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
program = build_program(context, device, PROGRAM_FILE);
/* Create kernel for the mat_vec_mult function */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if (err < 0) {
perror("Couldn't create the kernel");
exit(1);
}
size_t global_work_size = 16;
size_t local_work_size = 4;
density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(char)* 16, density, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(int)* 4, scalar_sum, &err);
if (err < 0){
perror("Couldn't create a buffer");
exit(1);
}
/* Create kernel arguments from the CL buffers */
err = clSetKernelArg(kernel, 0, sizeof(size_t), &global_work_size);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &density_buffer);
err |= clSetKernelArg(kernel, 2, sizeof(int), NULL);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &scalar_sum_buffer);
if (err < 0) {
perror("Couldn't set the kernel argument");
exit(1);
}
/* Create a CL command queue for the device*/
queue = clCreateCommandQueue(context, device, 0, &err);
if (err < 0) {
perror("Couldn't create the command queue");
exit(1);
}
/* Enqueue the command queue to the device */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't enqueue the kernel execution command");
printf("Here is the error code: %d\n", err);
exit(1);
}
/*Read the results*/
scalar_sum_mapped_memory = clEnqueueMapBuffer(queue, scalar_sum_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(int)*4, 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(scalar_sum, scalar_sum_mapped_memory, sizeof(int)*4);
err = clEnqueueUnmapMemObject(queue, scalar_sum_buffer, scalar_sum_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the scalar_sum_buffer\n", err);
exit(1);
}
printf("Mapping is done!\n");
for (int i = 0; i < 4; i++){
printf("%d\n", *(scalar_sum+i));
}
/* Deallocate resources */
clReleaseMemObject(density_buffer);
clReleaseMemObject(scalar_sum_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '
//This program performs reduction for an array of arbitrary size.
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE “reduction_arbitrary.cl”
#define KERNEL_FUNC “reduction”
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void get_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main() {
/* Host/device data structures */
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int i, err;
/* Program/kernel data structures */
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;
/* Data and buffers */
void *scalar_sum_mapped_memory;
cl_mem density_buffer, scalar_sum_buffer;
char density[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 };
int scalar_sum[4] = {1,2,3,4};
device = create_device();
get_info(device);
/* Create the context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
program = build_program(context, device, PROGRAM_FILE);
/* Create kernel for the mat_vec_mult function */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if (err < 0) {
perror("Couldn't create the kernel");
exit(1);
}
size_t global_work_size = 16;
size_t local_work_size = 4;
density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(char)* 16, density, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(int)* 4, scalar_sum, &err);
if (err < 0){
perror("Couldn't create a buffer");
exit(1);
}
/* Create kernel arguments from the CL buffers */
err = clSetKernelArg(kernel, 0, sizeof(size_t), &global_work_size);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &density_buffer);
err |= clSetKernelArg(kernel, 2, sizeof(int), NULL);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &scalar_sum_buffer);
if (err < 0) {
perror("Couldn't set the kernel argument");
exit(1);
}
/* Create a CL command queue for the device*/
queue = clCreateCommandQueue(context, device, 0, &err);
if (err < 0) {
perror("Couldn't create the command queue");
exit(1);
}
/* Enqueue the command queue to the device */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't enqueue the kernel execution command");
printf("Here is the error code: %d\n", err);
exit(1);
}
/*Read the results*/
scalar_sum_mapped_memory = clEnqueueMapBuffer(queue, scalar_sum_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(int)*4, 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(scalar_sum, scalar_sum_mapped_memory, sizeof(int)*4);
err = clEnqueueUnmapMemObject(queue, scalar_sum_buffer, scalar_sum_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the scalar_sum_buffer\n", err);
exit(1);
}
printf("Mapping is done!\n");
for (int i = 0; i < 4; i++){
printf("%d\n", *(scalar_sum+i));
}
/* Deallocate resources */
clReleaseMemObject(density_buffer);
clReleaseMemObject(scalar_sum_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void get_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main() {
/* Host/device data structures */
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int i, err;
/* Program/kernel data structures */
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;
/* Data and buffers */
void *scalar_sum_mapped_memory;
cl_mem density_buffer, scalar_sum_buffer;
char density[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 };
int scalar_sum[4] = {1,2,3,4};
device = create_device();
get_info(device);
/* Create the context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
perror("Couldn't create a context");
exit(1);
}
program = build_program(context, device, PROGRAM_FILE);
/* Create kernel for the mat_vec_mult function */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if (err < 0) {
perror("Couldn't create the kernel");
exit(1);
}
size_t global_work_size = 16;
size_t local_work_size = 4;
density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(char)* 16, density, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(int)* 4, scalar_sum, &err);
if (err < 0){
perror("Couldn't create a buffer");
exit(1);
}
/* Create kernel arguments from the CL buffers */
err = clSetKernelArg(kernel, 0, sizeof(size_t), &global_work_size);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &density_buffer);
err |= clSetKernelArg(kernel, 2, sizeof(int), NULL);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &scalar_sum_buffer);
if (err < 0) {
perror("Couldn't set the kernel argument");
exit(1);
}
/* Create a CL command queue for the device*/
queue = clCreateCommandQueue(context, device, 0, &err);
if (err < 0) {
perror("Couldn't create the command queue");
exit(1);
}
/* Enqueue the command queue to the device */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
if (err < 0) {
perror("Couldn't enqueue the kernel execution command");
printf("Here is the error code: %d\n", err);
exit(1);
}
/*Read the results*/
scalar_sum_mapped_memory = clEnqueueMapBuffer(queue, scalar_sum_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(int)*4, 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(scalar_sum, scalar_sum_mapped_memory, sizeof(int)*4);
err = clEnqueueUnmapMemObject(queue, scalar_sum_buffer, scalar_sum_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the scalar_sum_buffer\n", err);
exit(1);
}
printf("Mapping is done!\n");
for (int i = 0; i < 4; i++){
printf("%d\n", *(scalar_sum+i));
}
/* Deallocate resources */
clReleaseMemObject(density_buffer);
clReleaseMemObject(scalar_sum_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}