A problem in reading opencl kernel results in a simple reduction implementation

This is actually an opencl code not cuda. I have an array of 16 chars named density and I want to reduce them into an array of 4 integers called scalar_sums. The kernel is working fine and when I printf the results within kernel I get proper results. However, when I map the memory to print them in my c-code, I am only able to print the first element. I think there might be a problem with my memory mapping like segfaulting but I cannot figure that out where that is.

One wired thing when I comment the last comments of the kernel, i.e. those lines that manipulate scalar_nums, the mapping works fine. It returns their initial value properly. But, when the kernel operate on scalar_nums` the mapping has problem although kernel printf is printing proper numbers.
Do you have any idea why this is not working.
Here are the C-code and the Kernel.

__kernel void reduction(int voxelCounts,
	__global char* density,
	__local int* partialSums,
	__global int* scalar_sum){

	int gid = get_global_id(0);
	int lid = get_local_id(0);
	int groupSize = get_local_size(0);
	int i = ceil(convert_float(groupSize / 2));
	//printf("%d\n", i);
	//partialSums = (convert_int(density[gid]) + 128) / 255;
	partialSums[lid] = convert_int(density[gid]);
	//printf("partialSums in kernel: %d\n", partialSums);
	barrier(CLK_LOCAL_MEM_FENCE);

	while (i != 0){

		if (gid + i < voxelCounts && lid < i){

			partialSums[lid] += partialSums[lid + i];
		}
		i /= 2;
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	if (gid < voxelCounts && lid == 0){

		int temp = partialSums[lid];
		barrier(CLK_LOCAL_MEM_FENCE);
		scalar_sum[gid] = temp;
		printf("At the end: %d --> %d\n", gid, scalar_sum[gid]);
	}
}
//This program performs reduction  for an array of arbitrary size.

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction_arbitrary.cl"
#define KERNEL_FUNC "reduction"

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else  
#include <CL/cl.h>
#endif

cl_device_id create_device() {

	cl_platform_id *platform;
	cl_device_id dev;
	cl_uint num_platform;
	int err;

	/* Identify a platform */
	err = clGetPlatformIDs(0, NULL, &num_platform);
	if (err < 0) {
		perror("Couldn't identify a platform");
		exit(1);
	}
	platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
	clGetPlatformIDs(num_platform, platform, NULL);
	/* Access a device */
	err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);

	if (err < 0) {
		perror("Couldn't access any devices");
		exit(1);
	}

	return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

	cl_program program;
	FILE *program_handle;
	char *program_buffer, *program_log;
	size_t program_size, log_size;
	int err;

	/* Read program file and place content into buffer */
	program_handle = fopen(filename, "r");
	if (program_handle == NULL) {
		perror("Couldn't find the program file");
		exit(1);
	}
	fseek(program_handle, 0, SEEK_END);
	program_size = ftell(program_handle);
	rewind(program_handle);
	program_buffer = (char*)malloc(program_size + 1);
	program_buffer[program_size] = '

//This program performs reduction for an array of arbitrary size.

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE “reduction_arbitrary.cl”
#define KERNEL_FUNC “reduction”

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

cl_device_id create_device() {

cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;

/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
	perror("Couldn't identify a platform");
	exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);

if (err < 0) {
	perror("Couldn't access any devices");
	exit(1);
}

return dev;

}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;

/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
	perror("Couldn't find the program file");
	exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);

/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
	(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
	perror("Couldn't create the program");
	exit(1);
}
free(program_buffer);

/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {

	/* Find size of log and print to std output */
	clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
		0, NULL, &log_size);
	program_log = (char*)malloc(log_size + 1);
	program_log[log_size] = '\0';
	clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
		log_size + 1, program_log, NULL);
	printf("%s\n", program_log);
	free(program_log);
	exit(1);
}

return program;

}
void get_info(cl_device_id dev){

cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];

clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);

printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR:                     %s\n", dev_vendor);
printf("CL_DEVICE_NAME:                       %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE:            %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE:             %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION:                    %s\n", driver_version);
printf("CL_DEVICE_VERSION:                    %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:        %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS:          %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE         %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:        {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:   %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");

}

int main() {

/* Host/device data structures */
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int i, err;

/* Program/kernel data structures */
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;

/* Data and buffers */
void *scalar_sum_mapped_memory;
cl_mem density_buffer, scalar_sum_buffer;
char density[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 };
int scalar_sum[4] = {1,2,3,4};

device = create_device();
get_info(device);

/* Create the context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
	perror("Couldn't create a context");
	exit(1);
}

program = build_program(context, device, PROGRAM_FILE);

/* Create kernel for the mat_vec_mult function */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if (err < 0) {
	perror("Couldn't create the kernel");
	exit(1);
}

size_t global_work_size = 16;
size_t local_work_size = 4;

density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
	CL_MEM_COPY_HOST_PTR, sizeof(char)* 16, density, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
	CL_MEM_COPY_HOST_PTR, sizeof(int)* 4, scalar_sum, &err);
if (err < 0){
	perror("Couldn't create a buffer");
	exit(1);
}

/* Create kernel arguments from the CL buffers */
err = clSetKernelArg(kernel, 0, sizeof(size_t), &global_work_size);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &density_buffer);
err |= clSetKernelArg(kernel, 2, sizeof(int), NULL);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &scalar_sum_buffer);
if (err < 0) {
	perror("Couldn't set the kernel argument");
	exit(1);
}

/* Create a CL command queue for the device*/
queue = clCreateCommandQueue(context, device, 0, &err);
if (err < 0) {
	perror("Couldn't create the command queue");
	exit(1);
}

/* Enqueue the command queue to the device */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size,
	&local_work_size, 0, NULL, NULL);
if (err < 0) {
	perror("Couldn't enqueue the kernel execution command");
	printf("Here is the error code: %d\n", err);
	exit(1);
}

/*Read the results*/
scalar_sum_mapped_memory = clEnqueueMapBuffer(queue, scalar_sum_buffer, CL_TRUE,
	CL_MAP_READ, 0, sizeof(int)*4, 0, NULL, NULL, &err);
if (err < 0) {
	printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
	exit(1);
}
memcpy(scalar_sum, scalar_sum_mapped_memory, sizeof(int)*4);
err = clEnqueueUnmapMemObject(queue, scalar_sum_buffer, scalar_sum_mapped_memory,
	0, NULL, NULL);
if (err < 0) {
	printf("Error code: %d. Couldn't unmap the scalar_sum_buffer\n", err);
	exit(1);
}
printf("Mapping is done!\n");
for (int i = 0; i < 4; i++){
	printf("%d\n", *(scalar_sum+i));
}
/* Deallocate resources */
clReleaseMemObject(density_buffer);
clReleaseMemObject(scalar_sum_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);

return 0;

}

';
	fread(program_buffer, sizeof(char), program_size, program_handle);
	fclose(program_handle);

	/* Create program from file */
	program = clCreateProgramWithSource(ctx, 1,
		(const char**)&program_buffer, &program_size, &err);
	if (err < 0) {
		perror("Couldn't create the program");
		exit(1);
	}
	free(program_buffer);

	/* Build program */
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err < 0) {

		/* Find size of log and print to std output */
		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
			0, NULL, &log_size);
		program_log = (char*)malloc(log_size + 1);
		program_log[log_size] = '

//This program performs reduction for an array of arbitrary size.

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE “reduction_arbitrary.cl”
#define KERNEL_FUNC “reduction”

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

cl_device_id create_device() {

cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;

/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
	perror("Couldn't identify a platform");
	exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);

if (err < 0) {
	perror("Couldn't access any devices");
	exit(1);
}

return dev;

}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;

/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
	perror("Couldn't find the program file");
	exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);

/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
	(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
	perror("Couldn't create the program");
	exit(1);
}
free(program_buffer);

/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {

	/* Find size of log and print to std output */
	clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
		0, NULL, &log_size);
	program_log = (char*)malloc(log_size + 1);
	program_log[log_size] = '\0';
	clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
		log_size + 1, program_log, NULL);
	printf("%s\n", program_log);
	free(program_log);
	exit(1);
}

return program;

}
void get_info(cl_device_id dev){

cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];

clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);

printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR:                     %s\n", dev_vendor);
printf("CL_DEVICE_NAME:                       %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE:            %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE:             %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION:                    %s\n", driver_version);
printf("CL_DEVICE_VERSION:                    %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:        %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS:          %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE         %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:        {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:   %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");

}

int main() {

/* Host/device data structures */
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_int i, err;

/* Program/kernel data structures */
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
cl_kernel kernel;

/* Data and buffers */
void *scalar_sum_mapped_memory;
cl_mem density_buffer, scalar_sum_buffer;
char density[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 };
int scalar_sum[4] = {1,2,3,4};

device = create_device();
get_info(device);

/* Create the context */
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
	perror("Couldn't create a context");
	exit(1);
}

program = build_program(context, device, PROGRAM_FILE);

/* Create kernel for the mat_vec_mult function */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if (err < 0) {
	perror("Couldn't create the kernel");
	exit(1);
}

size_t global_work_size = 16;
size_t local_work_size = 4;

density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
	CL_MEM_COPY_HOST_PTR, sizeof(char)* 16, density, &err);
scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
	CL_MEM_COPY_HOST_PTR, sizeof(int)* 4, scalar_sum, &err);
if (err < 0){
	perror("Couldn't create a buffer");
	exit(1);
}

/* Create kernel arguments from the CL buffers */
err = clSetKernelArg(kernel, 0, sizeof(size_t), &global_work_size);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &density_buffer);
err |= clSetKernelArg(kernel, 2, sizeof(int), NULL);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &scalar_sum_buffer);
if (err < 0) {
	perror("Couldn't set the kernel argument");
	exit(1);
}

/* Create a CL command queue for the device*/
queue = clCreateCommandQueue(context, device, 0, &err);
if (err < 0) {
	perror("Couldn't create the command queue");
	exit(1);
}

/* Enqueue the command queue to the device */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size,
	&local_work_size, 0, NULL, NULL);
if (err < 0) {
	perror("Couldn't enqueue the kernel execution command");
	printf("Here is the error code: %d\n", err);
	exit(1);
}

/*Read the results*/
scalar_sum_mapped_memory = clEnqueueMapBuffer(queue, scalar_sum_buffer, CL_TRUE,
	CL_MAP_READ, 0, sizeof(int)*4, 0, NULL, NULL, &err);
if (err < 0) {
	printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
	exit(1);
}
memcpy(scalar_sum, scalar_sum_mapped_memory, sizeof(int)*4);
err = clEnqueueUnmapMemObject(queue, scalar_sum_buffer, scalar_sum_mapped_memory,
	0, NULL, NULL);
if (err < 0) {
	printf("Error code: %d. Couldn't unmap the scalar_sum_buffer\n", err);
	exit(1);
}
printf("Mapping is done!\n");
for (int i = 0; i < 4; i++){
	printf("%d\n", *(scalar_sum+i));
}
/* Deallocate resources */
clReleaseMemObject(density_buffer);
clReleaseMemObject(scalar_sum_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);

return 0;

}

';
		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
			log_size + 1, program_log, NULL);
		printf("%s\n", program_log);
		free(program_log);
		exit(1);
	}

	return program;
}
void get_info(cl_device_id dev){

	cl_ulong glob_mem_size, local_mem_size;
	cl_uint clock_freq, num_core, work_item_dim, time_res;
	size_t local_size, work_item_size[3];
	char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];

	clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
	clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
	clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);

	printf("==========================================================\n");
	printf("Device Sepc without consideration of kernels:\n");
	printf("CL_DEVICE_VENDOR:                     %s\n", dev_vendor);
	printf("CL_DEVICE_NAME:                       %s\n", dev_name);
	printf("CL_DEVICE_GLOBAL_MEM_SIZE:            %I64u GB\n", glob_mem_size / 1073741824);
	printf("CL_DEVICE_LOCAL_MEM_SIZE:             %I64u KB\n", local_mem_size / 1024);
	printf("CL_DRIVER_VERSION:                    %s\n", driver_version);
	printf("CL_DEVICE_VERSION:                    %s\n", device_version);
	printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:        %I32u MHz\n", clock_freq);
	printf("CL_DEVICE_MAX_COMPUTE_UNITS:          %I32u\n", num_core);
	printf("CL_DEVICE_MAX_WORK_GROUP_SIZE         %u\n", local_size);
	printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:        {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
	printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:   %I32u\n", work_item_dim);
	printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
	printf("==========================================================\n");

}

int main() {

	/* Host/device data structures */
	cl_platform_id platform;
	cl_device_id device;
	cl_context context;
	cl_command_queue queue;
	cl_int i, err;

	/* Program/kernel data structures */
	cl_program program;
	FILE *program_handle;
	char *program_buffer, *program_log;
	size_t program_size, log_size;
	cl_kernel kernel;

	/* Data and buffers */
	void *scalar_sum_mapped_memory;
	cl_mem density_buffer, scalar_sum_buffer;
	char density[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 };
	int scalar_sum[4] = {1,2,3,4};

	device = create_device();
	get_info(device);

	/* Create the context */
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
	if (err < 0) {
		perror("Couldn't create a context");
		exit(1);
	}

	program = build_program(context, device, PROGRAM_FILE);

	/* Create kernel for the mat_vec_mult function */
	kernel = clCreateKernel(program, KERNEL_FUNC, &err);
	if (err < 0) {
		perror("Couldn't create the kernel");
		exit(1);
	}

	size_t global_work_size = 16;
	size_t local_work_size = 4;

	density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR, sizeof(char)* 16, density, &err);
	scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
		CL_MEM_COPY_HOST_PTR, sizeof(int)* 4, scalar_sum, &err);
	if (err < 0){
		perror("Couldn't create a buffer");
		exit(1);
	}

	/* Create kernel arguments from the CL buffers */
	err = clSetKernelArg(kernel, 0, sizeof(size_t), &global_work_size);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &density_buffer);
	err |= clSetKernelArg(kernel, 2, sizeof(int), NULL);
	err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &scalar_sum_buffer);
	if (err < 0) {
		perror("Couldn't set the kernel argument");
		exit(1);
	}

	/* Create a CL command queue for the device*/
	queue = clCreateCommandQueue(context, device, 0, &err);
	if (err < 0) {
		perror("Couldn't create the command queue");
		exit(1);
	}

	/* Enqueue the command queue to the device */
	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size,
		&local_work_size, 0, NULL, NULL);
	if (err < 0) {
		perror("Couldn't enqueue the kernel execution command");
		printf("Here is the error code: %d\n", err);
		exit(1);
	}

	/*Read the results*/
	scalar_sum_mapped_memory = clEnqueueMapBuffer(queue, scalar_sum_buffer, CL_TRUE,
		CL_MAP_READ, 0, sizeof(int)*4, 0, NULL, NULL, &err);
	if (err < 0) {
		printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
		exit(1);
	}
	memcpy(scalar_sum, scalar_sum_mapped_memory, sizeof(int)*4);
	err = clEnqueueUnmapMemObject(queue, scalar_sum_buffer, scalar_sum_mapped_memory,
		0, NULL, NULL);
	if (err < 0) {
		printf("Error code: %d. Couldn't unmap the scalar_sum_buffer\n", err);
		exit(1);
	}
	printf("Mapping is done!\n");
	for (int i = 0; i < 4; i++){
		printf("%d\n", *(scalar_sum+i));
	}
	/* Deallocate resources */
	clReleaseMemObject(density_buffer);
	clReleaseMemObject(scalar_sum_buffer);
	clReleaseKernel(kernel);
	clReleaseCommandQueue(queue);
	clReleaseProgram(program);
	clReleaseContext(context);

	return 0;
}