A problem in reading opencl kernel results in a simple reduction implementation

This is actually an opencl code not cuda. I have an array of 16 chars named density and I want to reduce them into an array of 4 integers called scalar_sums. The kernel is working fine and when I printf the results within kernel I get proper results. However, when I map the memory to print them in my c-code, I am only able to print the first element. I think there might be a problem with my memory mapping like segfaulting but I cannot figure that out where that is.

One wired thing when I comment the last comments of the kernel, i.e. those lines that manipulate scalar_nums, the mapping works fine. It returns their initial value properly. But, when the kernel operate on scalar_nums` the mapping has problem although kernel printf is printing proper numbers.
Do you have any idea why this is not working.
Here are the C-code and the Kernel.

__kernel void reduction(int voxelCounts,
	__global char* density,
	__local int* partialSums,
	__global int* scalar_sum){

	int gid = get_global_id(0);
	int lid = get_local_id(0);
	int groupSize = get_local_size(0);
	int i = ceil(convert_float(groupSize / 2));
	//printf("%d\n", i);
	//partialSums = (convert_int(density[gid]) + 128) / 255;
	partialSums[lid] = convert_int(density[gid]);
	//printf("partialSums in kernel: %d\n", partialSums);

	while (i != 0){

		if (gid + i < voxelCounts && lid < i){

			partialSums[lid] += partialSums[lid + i];
		i /= 2;

	if (gid < voxelCounts && lid == 0){

		int temp = partialSums[lid];
		scalar_sum[gid] = temp;
		printf("At the end: %d --> %d\n", gid, scalar_sum[gid]);
//This program performs reduction  for an array of arbitrary size.

#define PROGRAM_FILE "reduction_arbitrary.cl"
#define KERNEL_FUNC "reduction"

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>

#ifdef MAC
#include <OpenCL/cl.h>
#include <CL/cl.h>

cl_device_id create_device() {

	cl_platform_id *platform;
	cl_device_id dev;
	cl_uint num_platform;
	int err;

	/* Identify a platform */
	err = clGetPlatformIDs(0, NULL, &num_platform);
	if (err < 0) {
		perror("Couldn't identify a platform");
	platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
	clGetPlatformIDs(num_platform, platform, NULL);
	/* Access a device */
	err = clGetDeviceIDs(platform[1], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);

	if (err < 0) {
		perror("Couldn't access any devices");

	return dev;
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

	cl_program program;
	FILE *program_handle;
	char *program_buffer, *program_log;
	size_t program_size, log_size;
	int err;

	/* Read program file and place content into buffer */
	program_handle = fopen(filename, "r");
	if (program_handle == NULL) {
		perror("Couldn't find the program file");
	fseek(program_handle, 0, SEEK_END);
	program_size = ftell(program_handle);
	program_buffer = (char*)malloc(program_size + 1);
	program_buffer[program_size] = '

