OpenCL application segmentation fault when building program with driver 343.22 and GTX660

When building the OpenCL program, the double free or corruption happened.
Is it a bug?

below attach the backtrace
double free or corruption (fasttop): 0x0000000000d7f440 ***
======= Backtrace: =========
/lib64/libc.so.6(+0x79033)[0x7f8444e41033]
/lib64/libc.so.6(+0x7ebc6)[0x7f8444e46bc6]
/lib64/libc.so.6(+0x7fd92)[0x7f8444e47d92]
/usr/lib64/libnvidia-compiler.so.343.22(+0x1010f2b)[0x7f84398a4f2b]
/usr/lib64/libnvidia-compiler.so.343.22(+0x1011654)[0x7f84398a5654]
/usr/lib64/libnvidia-compiler.so.343.22(+0xecb400)[0x7f843975f400]
/usr/lib64/libnvidia-compiler.so.343.22(+0xecb506)[0x7f843975f506]
/usr/lib64/libnvidia-compiler.so.343.22(+0xecb6ec)[0x7f843975f6ec]
/usr/lib64/libnvidia-compiler.so.343.22(+0x162774)[0x7f84389f6774]
/usr/lib64/libnvidia-compiler.so.343.22(+0x1537ba)[0x7f84389e77ba]
/usr/lib64/libnvidia-compiler.so.343.22(+0x14df22)[0x7f84389e1f22]
/usr/lib64/libnvidia-compiler.so.343.22(+0x14f09d)[0x7f84389e309d]
/usr/lib64/libnvidia-compiler.so.343.22(+0x14ae5d)[0x7f84389dee5d]
/usr/lib64/libnvidia-compiler.so.343.22(NvCliCompileProgram+0x147)[0x7f84389d9c27]
/usr/lib64/libnvidia-opencl.so.1(+0x153f62)[0x7f8443b5df62]
/usr/lib64/libnvidia-opencl.so.1(+0x148195)[0x7f8443b52195]
/home/tedyu002/src/cl_program_builder/a.out[0x401f27]
/home/tedyu002/src/cl_program_builder/a.out[0x402288]
/lib64/libc.so.6(__libc_start_main+0xf5)[0x7f8444decdb5]
/home/tedyu002/src/cl_program_builder/a.out[0x401319]
======= Memory map: ========
00400000-00403000 r-xp 00000000 08:02 12060955 /home/tedyu002/src/cl_program_builder/a.out
00603000-00604000 r–p 00003000 08:02 12060955 /home/tedyu002/src/cl_program_builder/a.out
00604000-00605000 rw-p 00004000 08:02 12060955 /home/tedyu002/src/cl_program_builder/a.out
00605000-01761000 rw-p 00000000 00:00 0 [heap]
200000000-900000000 —p 00000000 00:00 0
7f8438894000-7f8439ded000 r-xp 00000000 08:02 1323320 /usr/lib64/libnvidia-compiler.so.343.22
7f8439ded000-7f8439fec000 —p 01559000 08:02 1323320 /usr/lib64/libnvidia-compiler.so.343.22
7f8439fec000-7f843a7d9000 rw-p 01558000 08:02 1323320 /usr/lib64/libnvidia-compiler.so.343.22
7f843a7d9000-7f843a7fd000 rw-p 00000000 00:00 0
7f843a7fd000-7f843a7fe000 —p 00000000 00:00 0
7f843a7fe000-7f843affe000 rwxp 00000000 00:00 0 [stack:545]
7f843affe000-7f843afff000 —p 00000000 00:00 0
7f843afff000-7f843b7ff000 rwxp 00000000 00:00 0 [stack:544]
7f843b7ff000-7f843b800000 —p 00000000 00:00 0
7f843b800000-7f843c000000 rwxp 00000000 00:00 0 [stack:543]
7f843c000000-7f843c021000 rw-p 00000000 00:00 0
7f843c021000-7f8440000000 —p 00000000 00:00 0
7f84401fe000-7f84401ff000 —p 00000000 00:00 0
7f84401ff000-7f84409ff000 rwxp 00000000 00:00 0 [stack:542]
7f84409ff000-7f8440a00000 —p 00000000 00:00 0
7f8440a00000-7f8441200000 rwxp 00000000 00:00 0 [stack:541]
7f8441200000-7f8441281000 rw-s 86db6000 00:05 4052314 /dev/nvidiactl
7f8441281000-7f8441381000 rw-s 0319d000 00:05 4052314 /dev/nvidiactl
7f8441381000-7f8441481000 rw-s 10e4a0000 00:05 4052314 /dev/nvidiactl
7f8441481000-7f8441581000 rw-s 20c5a9000 00:05 4052314 /dev/nvidiactl
7f8441581000-7f8441681000 rw-s 1d5e10000 00:05 4052314 /dev/nvidiactl
7f8441681000-7f8441781000 rw-s 205621000 00:05 4052314 /dev/nvidiactl
7f8441781000-7f8441881000 rw-s 205476000 00:05 4052314 /dev/nvidiactl
7f8441881000-7f8441c81000 rw-s 20c41f000 00:05 4052314 /dev/nvidiactl
7f8441c81000-7f8442081000 rw-s 2d3d8000 00:05 4052314 /dev/nvidiactl
7f8442081000-7f8442082000 —p 00000000 00:00 0
7f8442082000-7f8442882000 rwxp 00000000 00:00 0 [stack:540]
7f8442882000-7f8443423000 r-xp 00000000 08:02 1322931 /usr/lib64/libcuda.so.343.22
7f8443423000-7f8443622000 —p 00ba1000 08:02 1322931 /usr/lib64/libcuda.so.343.22
7f8443622000-7f84437ee000 rw-p 00ba0000 08:02 1322931 /usr/lib64/libcuda.so.343.22
7f84437ee000-7f8443802000 rw-p 00000000 00:00 0
7f8443802000-7f8443809000 r-xp 00000000 08:02 30147785 /lib64/librt-2.19.so
7f8443809000-7f8443a08000 —p 00007000 08:02 30147785 /lib64/librt-2.19.so
7f8443a08000-7f8443a09000 r–p 00006000 08:02 30147785 /lib64/librt-2.19.so
7f8443a09000-7f8443a0a000 rw-p 00007000 08:02 30147785 /lib64/librt-2.19.so
7f8443a0a000-7f84445c3000 r-xp 00000000 08:02 1325273 /usr/lib64/libnvidia-opencl.so.343.22
7f84445c3000-7f84447c3000 —p 00bb9000 08:02 1325273 /usr/lib64/libnvidia-opencl.so.343.22
7f84447c3000-7f8444992000 rw-p 00bb9000 08:02 1325273 /usr/lib64/libnvidia-opencl.so.343.22
7f8444992000-7f84449a6000 rw-p 00000000 00:00 0
7f84449a6000-7f84449bf000 r-xp 00000000 08:02 30147801 /lib64/libpthread-2.19.so
7f84449bf000-7f8444bbe000 —p 00019000 08:02 30147801 /lib64/libpthread-2.19.so
7f8444bbe000-7f8444bbf000 r–p 00018000 08:02 30147801 /lib64/libpthread-2.19.so
7f8444bbf000-7f8444bc0000 rw-p 00019000 08:02 30147801 /lib64/libpthread-2.19.so
7f8444bc0000-7f8444bc4000 rw-p 00000000 00:00 0
7f8444bc4000-7f8444bc7000 r-xp 00000000 08:02 30147782 /lib64/libdl-2.19.so
7f8444bc7000-7f8444dc6000 —p 00003000 08:02 30147782 /lib64/libdl-2.19.so
7f8444dc6000-7f8444dc7000 r–p 00002000 08:02 30147782 /lib64/libdl-2.19.so
7f8444dc7000-7f8444dc8000 rw-p 00003000 08:02 30147782 /lib64/libdl-2.19.so
7f8444dc8000-7f8444f65000 r-xp 00000000 08:02 30147799 /lib64/libc-2.19.so
7f8444f65000-7f8445164000 —p 0019d000 08:02 30147799 /lib64/libc-2.19.so
7f8445164000-7f8445168000 r–p 0019c000 08:02 30147799 /lib64/libc-2.19.so
7f8445168000-7f844516a000 rw-p 001a0000 08:02 30147799 /lib64/libc-2.19.so
7f844516a000-7f844516e000 rw-p 00000000 00:00 0
7f844516e000-7f8445183000 r-xp 00000000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445183000-7f8445383000 —p 00015000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445383000-7f8445384000 r–p 00015000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445384000-7f8445385000 rw-p 00016000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445385000-7f844547e000 r-xp 00000000 08:02 30146880 /lib64/libm-2.19.so
7f844547e000-7f844567d000 —p 000f9000 08:02 30146880 /lib64/libm-2.19.so
7f844567d000-7f844567e000 r–p 000f8000 08:02 30146880 /lib64/libm-2.19.so
7f844567e000-7f844567f000 rw-p 000f9000 08:02 30146880 /lib64/libm-2.19.so
7f844567f000-7f844576b000 r-xp 00000000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f844576b000-7f844596a000 —p 000ec000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f844596a000-7f8445972000 r–p 000eb000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f8445972000-7f8445974000 rw-p 000f3000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f8445974000-7f8445989000 rw-p 00000000 00:00 0
7f8445989000-7f844598d000 r-xp 00000000 08:02 8658493 /usr/lib64/OpenCL/vendors/nvidia/libOpenCL.so.1.0.0
7f844598d000-7f8445b8d000 —p 00004000 08:02 8658493 /usr/lib64/OpenCL/vendors/nvidia/libOpenCL.so.1.0.0
7f8445b8d000-7f8445b8e000 rw-p 00004000 08:02 8658493 /usr/lib64/OpenCL/vendors/nvidia/libOpenCL.so.1.0.0
7f8445b8e000-7f8445baf000 r-xp 00000000 08:02 30147800 /lib64/ld-2.19.so
7f8445c1f000-7f8445c20000 rw-p 00000000 00:00 0
7f8445c20000-7f8445c21000 rw-s e7d60000 00:05 4053007 /dev/nvidia0
7f8445c21000-7f8445c22000 rw-s 1ee46e000 00:05 4052314 /dev/nvidiactl
7f8445c22000-7f8445c42000 rw-s e0020000 00:05 4053007 /dev/nvidia0
7f8445c42000-7f8445c62000 rw-s e0000000 00:05 4053007 /dev/nvidia0
7f8445c62000-7f8445d62000 rw-s 2d24c000 00:05 4052314 /dev/nvidiactl
7f8445d62000-7f8445d68000 rw-p 00000000 00:00 0
7f8445d68000-7f8445d69000 rw-s e7d60000 00:05 4053007 /dev/nvidia0
7f8445d69000-7f8445d6a000 rw-s 20c5a4000 00:05 4052314 /dev/nvidiactl
7f8445d6a000-7f8445dae000 rw-p 00000000 00:00 0
7f8445dae000-7f8445daf000 r–p 00020000 08:02 30147800 /lib64/ld-2.19.so
7f8445daf000-7f8445db0000 rw-p 00021000 08:02 30147800 /lib64/ld-2.19.so
7f8445db0000-7f8445db1000 rw-p 00000000 00:00 0
7fff0099a000-7fff009ba000 rwxp 00000000 00:00 0 [stack]
7fff009ba000-7fff009bc000 rw-p 00000000 00:00 0
7fff009fd000-7fff009fe000 r-xp 00000000 00:00 0 [vdso]
7fff009fe000-7fff00a00000 r–p 00000000 00:00 0 [vvar]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]

And the program

struct __virtcl_struct_dynamic_kernel_gpu_opencl_0{ulong2 range[4]; };
struct __virtcl_struct_dynamic_kernel_gpu_opencl_1{ulong2 range[3]; };
struct __virtcl_struct_dynamic_kernel_gpu_opencl_2{ulong2 range[23]; };
struct __virtcl_validator_binary_tree_value{
 ulong logical_ptr;
 ulong physical_ptr;
 ulong size;
 ulong buffer_offset;
};


struct __virtcl_validator_binary_tree_buffer_node{
 struct __virtcl_validator_binary_tree_value value;
 ulong left;
 ulong right;
};

struct __virtcl_validator_binary_tree_buffer_head{
 ulong root;
};
inline __global void* __virtcl_validator_dereference_validate(
const __constant struct __virtcl_validator_binary_tree_buffer_node * const restrict root,
__global void * const buffer,
__private ulong2 *segments, size_t segments_size,
__global const void * const lower, const size_t size
){
 __global const void * const upper = lower + size;

 for( uint i = 0 ; i < segments_size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= segments[i].x + segments[i].y ){
   return (__global void*)lower;
  }
 }






 return buffer;

}

inline __global void* __virtcl_validator_dereference_validate_spec_1(
const __constant struct __virtcl_validator_binary_tree_buffer_node * const restrict root, __global void * const buffer,
__global const void * const base, const size_t base_size,
__global const void * const lower, const size_t size ){
 __global const void * const upper = lower + size;

 if( lower >= base && upper <= (base + base_size ) ){
  return (__global void*)lower;
 }






 return buffer;

}

inline __global void* __virtcl_validator_dereference_validate_spec_2(
const __constant struct __virtcl_validator_binary_tree_buffer_node * const restrict root, __global void * const buffer,
__global const void * const base_1, const size_t base_1_size,
__global const void * const base_2, const size_t base_2_size,
__global const void * const lower, const size_t size ){
 __global const void * const upper = lower + size;

 if( lower >= base_1 && upper <= (base_1 + base_1_size ) ){
  return (__global void*)lower;
 }else if( lower >= base_2 && upper <= (base_2 + base_2_size) ){
  return (__global void*)lower;
 }






 return buffer;

}

inline __global void* __virtcl_validator_dereference_validate_spec_3(
__constant const struct __virtcl_validator_binary_tree_buffer_node * const restrict root, __global void * const buffer,
__global const void *base_1, const size_t base_1_size,
__global const void *base_2, const size_t base_2_size,
__global const void *base_3, const size_t base_3_size,
__global const void *lower, const size_t size ){
 __global const void * const upper = lower + size;

 if( lower >= base_1 && upper <= (base_1 + base_1_size ) ){
  return (__global void*)lower;
 }else if( lower >= base_2 && upper <= (base_2 + base_2_size) ){
  return (__global void*)lower;
 }else if( lower >= base_3 && upper <= (base_3 + base_3_size ) ){
  return (__global void*)lower;
 }






 return buffer;

}

__global void* __virtcl_validator_address_of_translation( __global const void *ptr ){
 return (__global void*)ptr;
}



inline __private void* __virtcl_validator_dereference_validate_private_spec_1(
  __private const void * const base_1, const size_t base_1_size,
  __private const void * const failed_buffer,
  __private const void * const lower, const size_t access_size
){
 __private const void *upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__private void*)lower;
 }
 return (__private void*)failed_buffer;
}

inline __private void* __virtcl_validator_dereference_validate_private_spec_2(
  __private const void * const base_1, const size_t base_1_size,
  __private const void * const base_2, const size_t base_2_size,
  __private const void * const failed_buffer,
  __private const void * const lower, const size_t access_size
){
 __private const void *upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__private void*)lower;
 }else if( lower >= base_2 && upper <= base_2 + base_2_size ){
  return (__private void*)lower;
 }
 return (__private void*)failed_buffer;
}

inline __private void* __virtcl_validator_dereference_validate_private_spec_3(
  __private const void * const base_1, const size_t base_1_size,
  __private const void * const base_2, const size_t base_2_size,
  __private const void * const base_3, const size_t base_3_size,
  __private const void * const failed_buffer,
  __private const void * const lower, const size_t access_size
){
 __private const void *upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__private void*)lower;
 }else if( lower >= base_2 && upper <= base_2 + base_2_size ){
  return (__private void*)lower;
 }else if( lower >= base_3 && upper <= base_3 + base_3_size ){
  return (__private void*)lower;
 }
 return (__private void*)failed_buffer;
}

inline __private void* __virtcl_validator_dereference_validate_private_dynamic(
__private ulong2 *segments, size_t size,
__private const void * const failed_buffer,
__private const void *lower, size_t access_size){

 __private const void *upper = lower + access_size;
 for( uint i = 0 ; i < size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
   return (__private void*)lower;
  }
 }
 return (__private void*)failed_buffer;
}

inline private void* __virtcl_validator_dereference_validate_private_static_dynamic(
__private const void * const base_1, const size_t base_1_size,
__private ulong2 *segments, size_t size,
__private const void * const failed_buffer,
__private const void *lower, size_t access_size){
 __private const void *upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__private void*)lower;
 }
 for( uint i = 0 ; i < size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
   return (__private void*)lower;
  }
 }
 return (__private void*)failed_buffer;
}



inline __local void* __virtcl_validator_dereference_validate_local_spec_1(
  __local const void * const base_1, const size_t base_1_size,
 __local const void * const failed_buffer,
 __local const void * const lower, const size_t access_size
){
 const __local void * const upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__local void*)lower;
 }

 return (__local void*)failed_buffer;
}

inline __local void* __virtcl_validator_dereference_validate_local_spec_2(
  __local const void * const base_1, const size_t base_1_size,
 __local const void * const base_2, const size_t base_2_size,
 __local const void * const failed_buffer,
 __local const void * const lower, const size_t access_size
){
 const __local void * const upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__local void*)lower;
 }else if( lower >= base_2 && upper <= base_2 + base_2_size ){
  return (__local void*)lower;
 }

 return (__local void*)failed_buffer;
}

inline __local void* __virtcl_validator_dereference_validate_local_spec_3(
  __local const void * const base_1, const size_t base_1_size,
 __local const void * const base_2, const size_t base_2_size,
 __local const void * const base_3, const size_t base_3_size,
 __local const void * const failed_buffer,
 __local const void * const lower, const size_t access_size
){
 const __local void * const upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__local void*)lower;
 }else if( lower >= base_2 && upper <= base_2 + base_2_size ){
  return (__local void*)lower;
 }else if( lower >= base_3 && upper <= base_3 + base_3_size ){
  return (__local void*)lower;
 }

 return (__local void*)failed_buffer;
}

inline __local void* __virtcl_validator_dereference_validate_local_dynamic(
__private ulong2 *segments, size_t size,
__local const void * const failed_buffer,
__local const void *lower, size_t access_size){

 __local const void *upper = lower + access_size;
 for( uint i = 0 ; i < size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
   return (__local void*)lower;
  }
 }
 return (__local void*)failed_buffer;
}

inline __local void* __virtcl_validator_dereference_validate_local_static_dynamic(
__local const void * const base_1, const size_t base_1_size,
__private ulong2 *segments, size_t size,
__local const void * const failed_buffer,
__local const void *lower, size_t access_size){
 __local const void *upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__local void*)lower;
 }
 for( uint i = 0 ; i < size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
   return (__local void*)lower;
  }
 }
 return (__local void*)failed_buffer;
}



inline __constant void* __virtcl_validator_dereference_validate_constant_spec_1(
  __constant const void * const base_1, const size_t base_1_size,
 __constant const void * const failed_buffer,
  __constant const void * const lower, const size_t access_size
){
 __constant const void * const upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ) {
  return (__constant void*)lower;
 }
 return (__constant void*)failed_buffer;
}

inline __constant void* __virtcl_validator_dereference_validate_constant_spec_2(
  __constant const void * const base_1, const size_t base_1_size,
  __constant const void * const base_2, const size_t base_2_size,
 __constant const void * const failed_buffer,
  __constant const void * const lower, const size_t access_size
){
 __constant const void * const upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ) {
  return (__constant void*)lower;
 }else if( lower >= base_2 && upper <= base_2 + base_2_size ){
  return (__constant void*)lower;
 }
 return (__constant void*)failed_buffer;
}

inline __constant void* __virtcl_validator_dereference_validate_constant_spec_3(
  __constant const void * const base_1, const size_t base_1_size,
  __constant const void * const base_2, const size_t base_2_size,
  __constant const void * const base_3, const size_t base_3_size,
 __constant const void * const failed_buffer,
  __constant const void * const lower, const size_t access_size
){
 __constant const void * const upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ) {
  return (__constant void*)lower;
 }else if( lower >= base_2 && upper <= base_2 + base_2_size ){
  return (__constant void*)lower;
 }else if( lower >= base_3 && upper <= base_3 + base_3_size){
  return (__constant void*)lower;
 }
 return (__constant void*)failed_buffer;
}

inline __constant void* __virtcl_validator_dereference_validate_constant_dynamic(
__private ulong2 *segments, size_t size,
__constant const void * const failed_buffer,
__constant const void *lower, size_t access_size){

 __constant const void *upper = lower + access_size;
 for( uint i = 0 ; i < size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
   return (__constant void*)lower;
  }
 }
 return (__constant void*)failed_buffer;
}

inline __constant void* __virtcl_validator_dereference_validate_constant_static_dynamic(
__constant const void * const base_1, const size_t base_1_size,
__private ulong2 *segments, size_t size,
__constant const void * const failed_buffer,
__constant const void *lower, size_t access_size){
 __constant const void *upper = lower + access_size;
 if( lower >= base_1 && upper <= base_1 + base_1_size ){
  return (__constant void*)lower;
 }
 for( uint i = 0 ; i < size ; ++i ){
  if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
   return (__constant void*)lower;
  }
 }
 return (__constant void*)failed_buffer;
}




__local void*
checked_address_use_def_1(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_2(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_3(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_4(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_5(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_6(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_7(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_8(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
 if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }

typedef struct
{
 float x, y, z;

} THREE_VECTOR;

typedef struct
{
 float v, x, y, z;

} FOUR_VECTOR;

typedef struct nei_str
{


 int x, y, z;
 int number;
 long offset;

} nei_str;

typedef struct box_str
{


 int x, y, z;
 int number;
 long offset;


 int nn;
 nei_str nei[26];

} box_str;

typedef struct par_str
{

 float alpha;

} par_str;

typedef struct dim_str
{


 int cur_arg;
 int arch_arg;
 int cores_arg;
 int boxes1d_arg;


 long number_boxes;
 long box_mem;
 long space_elem;
 long space_mem;
 long space_mem2;

} dim_str;





__kernel void kernel_gpu_opencl( __constant struct __virtcl_validator_binary_tree_buffer_node * restrict const __virtcl_validator_root, __constant ulong * restrict const __virtcl_validator_arg_info, __global void * const __virtcl_validator_buffer,par_str d_par_gpu,
     dim_str d_dim_gpu,
     __global box_str *d_box_gpu,
     __global FOUR_VECTOR *d_rv_gpu,
     __global float *d_qv_gpu,
     __global FOUR_VECTOR *d_fv_gpu)
{__local void *__virtcl_ck_1 = 0;__global void *__virtcl_ck_2 = 0;__local void *__virtcl_ck_3 = 0;__local void *__virtcl_ck_4 = 0;__global void *__virtcl_ck_10 = 0;






 int bx = get_group_id(0);
 int tx = get_local_id(0);
 int wtx = tx;





 if(bx<d_dim_gpu.number_boxes){






  float a2 = 2*d_par_gpu.alpha*d_par_gpu.alpha;


  int first_i;

  __local FOUR_VECTOR rA_shared[100];


  int pointer;
  int k = 0;
  int first_j;
  int j = 0;

  __local FOUR_VECTOR rB_shared[100];
  __local float qB_shared[100];


  float r2;
  float u2;
  float vij;
  float fs;
  float fxij;
  float fyij;
  float fzij;
  THREE_VECTOR d;
  first_i = (*( (__virtcl_ck_10 = (((__global box_str *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_box_gpu,(__virtcl_validator_arg_info)[2],(d_box_gpu+bx ),sizeof(__global box_str))))), ((__global box_str *)(__virtcl_ck_10)) )).offset;







  while(wtx<100){
   (*((__local FOUR_VECTOR *)__virtcl_validator_dereference_validate_local_spec_1(&rA_shared,sizeof(rA_shared),&qB_shared,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))) = (*((__global FOUR_VECTOR *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_rv_gpu,(__virtcl_validator_arg_info)[3],d_rv_gpu+first_i+wtx ,sizeof(__global FOUR_VECTOR))));
   wtx = wtx + 128;
  }
  wtx = tx;



  barrier(CLK_LOCAL_MEM_FENCE);






  for (k=0; k<(1+(*( ((d_box_gpu[bx])), ((__global box_str *)(__virtcl_ck_10)) )).nn); k++){





   if(k==0){
    pointer = bx;
   }
   else{
    pointer = (*((__global nei_str *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_box_gpu,(__virtcl_validator_arg_info)[2],(*( ((d_box_gpu[bx])), ((__global box_str *)(__virtcl_ck_10)) )).nei+k-1 ,sizeof(__global nei_str)))).number;
   }






   first_j = (*((__global box_str *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_box_gpu,(__virtcl_validator_arg_info)[2],d_box_gpu+pointer ,sizeof(__global box_str)))).offset;







   while(wtx<100){
    (*((__local FOUR_VECTOR *)__virtcl_validator_dereference_validate_local_spec_1(&rB_shared,sizeof(rB_shared),&qB_shared,rB_shared+wtx ,sizeof(__local FOUR_VECTOR)))) = (*((__global FOUR_VECTOR *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_rv_gpu,(__virtcl_validator_arg_info)[3],d_rv_gpu+first_j+wtx ,sizeof(__global FOUR_VECTOR))));
    (*((__local float *)__virtcl_validator_dereference_validate_local_spec_1(&qB_shared,sizeof(qB_shared),&qB_shared,qB_shared+wtx ,sizeof(__local float)))) = (*((__global float *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_qv_gpu,(__virtcl_validator_arg_info)[4],d_qv_gpu+first_j+wtx ,sizeof(__global float))));
    wtx = wtx + 128;
   }
   wtx = tx;



   barrier(CLK_LOCAL_MEM_FENCE);






   while(wtx<100){


    for (j=0; j<100; j++){
     (__virtcl_ck_1 = 0),((__virtcl_ck_4 = 0),(r2 = (*((__local FOUR_VECTOR *)checked_address_use_def_1(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).v + (*((__local FOUR_VECTOR *)checked_address_use_def_2(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).v - (((*((__local FOUR_VECTOR *)checked_address_use_def_3(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).x)*((*((__local FOUR_VECTOR *)checked_address_use_def_4(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).x)+((*((__local FOUR_VECTOR *)checked_address_use_def_5(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).y)*((*((__local FOUR_VECTOR *)checked_address_use_def_6(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).y)+((*((__local FOUR_VECTOR *)checked_address_use_def_7(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).z)*((*((__local FOUR_VECTOR *)checked_address_use_def_8(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).z))));
     u2 = a2*r2;
     vij= exp(-u2);
     fs = 2*vij;
     d.x = (*( ((rA_shared[wtx])), ((__local FOUR_VECTOR *)(__virtcl_ck_4)) )).x - (*( ((rB_shared[j])), ((__local FOUR_VECTOR *)(__virtcl_ck_1)) )).x;
     fxij=fs*d.x;
     d.y = (*( ((rA_shared[wtx])), ((__local FOUR_VECTOR *)(__virtcl_ck_4)) )).y - (*( ((rB_shared[j])), ((__local FOUR_VECTOR *)(__virtcl_ck_1)) )).y;
     fyij=fs*d.y;
     d.z = (*( ((rA_shared[wtx])), ((__local FOUR_VECTOR *)(__virtcl_ck_4)) )).z - (*( ((rB_shared[j])), ((__local FOUR_VECTOR *)(__virtcl_ck_1)) )).z;
     fzij=fs*d.z;
     (*( (__virtcl_ck_2 = (((__global FOUR_VECTOR *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_fv_gpu,(__virtcl_validator_arg_info)[5],(d_fv_gpu+first_i+wtx ),sizeof(__global FOUR_VECTOR))))), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).v += (*( (__virtcl_ck_3 = (((__local float *)__virtcl_validator_dereference_validate_local_spec_1(&qB_shared,sizeof(qB_shared),&qB_shared,(qB_shared+j ),sizeof(__local float))))), ((__local float *)(__virtcl_ck_3)) ))*vij;
     (*( ((d_fv_gpu[first_i+wtx])), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).x += (*( ((qB_shared[j])), ((__local float *)(__virtcl_ck_3)) ))*fxij;
     (*( ((d_fv_gpu[first_i+wtx])), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).y += (*( ((qB_shared[j])), ((__local float *)(__virtcl_ck_3)) ))*fyij;
     (*( ((d_fv_gpu[first_i+wtx])), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).z += (*( ((qB_shared[j])), ((__local float *)(__virtcl_ck_3)) ))*fzij;

    }


    wtx = wtx + 128;

   }


   wtx = tx;


   barrier(CLK_LOCAL_MEM_FENCE);





  }





 }

}

The builder program.

#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include "CL/cl.h"
#endif

#include<cstdlib>
#include<fstream>
#include<cassert>
#include<cstring>
#include<cstdio>
#include<utility>
#include<iostream>
#include<vector>
#include<sstream>

bool is_NVIDIA;

struct cl_str{
  char *str;
  size_t size;
};

static inline void print_platform_info(const cl_platform_id platform ){
  char info[4096];
  size_t ret_size;
  clGetPlatformInfo( platform, CL_PLATFORM_NAME, sizeof(info), info, &ret_size );
  printf("%s\n", info);

  is_NVIDIA = strstr( info, "NVIDIA") != NULL;
}

static inline void print_device_info( const cl_device_id device ){
  char info[4096];
  size_t ret_size;
  clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(info), info, &ret_size );
  printf("%s\n", info);
  size_t profiler_resolution;
  clGetDeviceInfo( device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(profiler_resolution), &profiler_resolution, &ret_size);
  printf("profiler timer resloution in nano seconds %zu\n", profiler_resolution);
}

static inline struct cl_str cl_read_file( const char *file_name ) {
  std::fstream file(file_name, std::ios_base::binary | std::ios_base::in );
  std::streampos file_size;
  {
  file.seekg(0, std::ios_base::end);
  file_size = file.tellg();
  file.seekg(0);
  }

  struct cl_str str;
  str.str = (char*)malloc( sizeof(char) * file_size);
  str.size = file_size;

  file.read( str.str, file_size);
  file.close();
  return str;
}

static inline void cl_dump_program( cl_program program, char *output_file ){
  cl_uint num_devices = 0;
  cl_int result;
  result = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof( cl_uint), &num_devices, NULL);
  assert( result == CL_SUCCESS);
  assert( num_devices != 0);
  size_t *program_size = new size_t[num_devices];
  unsigned char **binary_program = new unsigned char*[num_devices];

  result = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( size_t ) * num_devices, program_size, NULL);
  assert( result == CL_SUCCESS);

  for( cl_uint i = 0 ; i < num_devices ; ++i){
    binary_program[i] = new unsigned char[ std::max(program_size[i],(size_t)2) ];
  }

  result = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) * num_devices, binary_program, NULL);
  assert( result == CL_SUCCESS);

  for( cl_uint i = 0 ; i < num_devices ; ++i ){
    if( program_size[i] == 0 ){
      continue;
    }
    std::fstream out( output_file, std::ios_base::binary | std::ios_base::out | std::ios_base::trunc );
    out.write( (const char*)binary_program[i], program_size[i] );
    out.close();
  }

  for( cl_uint i = 0 ; i < num_devices ; ++i){
    delete []binary_program[i];
  }

  delete[] binary_program;
  delete[] program_size;
}

static inline void cl_print_program_build_info( cl_program program, cl_device_id device_id ){
	char tempString[1000000];
	tempString[0] = '\0';
	size_t tempLength;
	cl_int status = clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,sizeof(tempString),tempString,&tempLength);
	printf("Build Log : %s\n",tempString);
}


static inline void cl_print_error(const char *name,cl_int status, const cl_program *program = NULL, const cl_device_id *device_id = NULL){
  switch(status){
    case CL_INVALID_PROGRAM_EXECUTABLE:
      printf("%s CL_INVALID_PROGRAM_EXECUTABLE\n",name);break;
    case CL_INVALID_COMMAND_QUEUE:
      printf("%s CL_INVALID_COMMAND_QUEUE\n",name);break;
    case CL_INVALID_KERNEL:
      printf("%s CL_INVALID_KERNEL\n",name);break;
    case CL_INVALID_CONTEXT:
       printf("%s CL_INVALID_CONTEXT\n",name);break;
    case CL_INVALID_KERNEL_ARGS:
       printf("%s CL_INVALID_KERNEL_ARGS\n",name);break;
    case CL_INVALID_WORK_DIMENSION:
       printf("%s CL_INVALID_WORK_DIMENSION\n",name);break;
    case CL_INVALID_WORK_GROUP_SIZE:
       printf("%s CL_INVALID_WORK_GROUP_SIZE\n",name);break;
    case CL_INVALID_WORK_ITEM_SIZE:
       printf("%s CL_INVALID_WORK_ITEM_SIZE\n",name);break;
    case CL_INVALID_GLOBAL_OFFSET:
       printf("%s CL_INVALID_GLOBAL_OFFSET\n",name);break;
    case CL_OUT_OF_RESOURCES:
       printf("%s CL_OUT_OF_RESOURCES\n",name);break;
    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
       printf("%s CL_MEM_OBJECT_ALLOCATION_FAILURE\n",name);break;
    case CL_INVALID_EVENT_WAIT_LIST:
      printf("%s CL_INVALID_EVENT_WAIT_LIST\n",name);break;
    case CL_OUT_OF_HOST_MEMORY:
      printf("%s CL_OUT_OF_HOST_MEMORY\n",name);break;                   
    case CL_INVALID_PROGRAM:
      printf("CL_INVALID_PROGRAM\n");break;
    case CL_INVALID_VALUE:
      printf("CL_INVALID_VALUE\n");break;
    case CL_INVALID_DEVICE:
      printf("CL_INVALID_DEVICE:\n");break;
    case CL_INVALID_BINARY:
      printf("CL_INVALID_BINARY:\n");break;
    case CL_INVALID_BUILD_OPTIONS:
      printf("CL_INVALID_BUILD_OPTIONS:\n");break;
    case CL_INVALID_OPERATION:
      printf("CL_INVALID_OPERATION:\n");break;
    case CL_COMPILER_NOT_AVAILABLE:
      printf("CL_COMPILER_NOT_AVAILABLE:\n");break;
		case CL_INVALID_ARG_VALUE:
			printf("%s CL_INVALID_ARG_VALUE\n", name);break;
    case CL_BUILD_PROGRAM_FAILURE:
      printf("CL_BUILD_PROGRAM_FAILURE: Below is messages\n");
      char tempString[1000000];
      tempString[0] = '\0';
      size_t tempLength;
      status = clGetProgramBuildInfo(*program,*device_id,CL_PROGRAM_BUILD_LOG,sizeof(tempString),tempString,&tempLength);
      printf("%s\n",tempString);
      cl_print_error("", status);
      break;
    case CL_PROFILING_INFO_NOT_AVAILABLE:
      printf("%s %s\n", name, "CL_PROFILING_INFO_NOT_AVAILABLE");
    case CL_SUCCESS:
      break;
  }
}

static inline void display_program_size(cl_program program){
  const size_t ALLOCATE_SIZE = sizeof(size_t) * 1;
  size_t program_binary_sizes ;
  cl_int result = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, ALLOCATE_SIZE, &program_binary_sizes, NULL);
	if( result != CL_SUCCESS){
		cl_print_error("get program size", result);
		assert(0);
	}
  std::cout << "Program Size : " << program_binary_sizes << std::endl;
}

static inline void display_kernel_info(cl_kernel kernel, cl_device_id device_id){
  size_t work_group_size;
  cl_ulong local_mem_size;
  clGetKernelWorkGroupInfo(kernel, device_id,  CL_KERNEL_WORK_GROUP_SIZE , sizeof(work_group_size), &work_group_size,NULL);
  clGetKernelWorkGroupInfo(kernel, device_id,  CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local_mem_size), &local_mem_size, NULL);
  std::cout << "Maximum work group size : " << work_group_size <<  " Local mem size : " << local_mem_size << std::endl;
}

static inline cl_ulong display_device_exec_time(cl_event event){
  cl_ulong start,end;
  clWaitForEvents(1, &event);
  cl_int result = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof( start), &start, NULL);
  if( result != CL_SUCCESS){
    cl_print_error("get exec time error", result);
    assert(0);
  }
  result = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof( end), &end, NULL);
  if( result != CL_SUCCESS){
    cl_print_error("get exec time error", result);
    assert(0);
  }
  std::cout << start << " " << end << std::endl;
  return end - start;
}


cl_context context;
cl_device_id device_id;
cl_uint device_number;
char *output_file;
char *input_file;


int run(){
  cl_int result;

	struct cl_str source = cl_read_file( input_file );
	cl_program program = clCreateProgramWithSource( context, 1, (const char**)&source.str, &source.size	, &result);
	assert(result == CL_SUCCESS);
	const char *build_option = NULL;
	if( is_NVIDIA ){
		build_option = "-cl-nv-verbose";
	}
	result = clBuildProgram(program, 1, &device_id, build_option, NULL, NULL);
	if( result != CL_SUCCESS){
		cl_print_error("build program", result, &program, &device_id);
		cl_print_program_build_info(program, device_id);
		assert( result == CL_SUCCESS);
	}
	cl_print_program_build_info(program, device_id);
	cl_dump_program(program, output_file);
  return 0;
}

void init_(){
  cl_int result;
  cl_platform_id platform_id[4];
  cl_uint num_platform ;
  result = clGetPlatformIDs( 4, platform_id, &num_platform);
  printf("%d\n", result );
  assert( result == CL_SUCCESS);
  printf("Num of platforms %u\n", num_platform);
  cl_platform_id target_platform = platform_id[ num_platform - 1];

  print_platform_info( target_platform );
  cl_device_id temp_id[32];

  cl_uint num_device;
  result = clGetDeviceIDs( target_platform , CL_DEVICE_TYPE_ALL, 32, temp_id, &num_device);
  printf("Num of devices %u\n", num_device);
	if( device_number >= num_device ){
		std::cout << "Selected " << device_number << " exceeds " << num_device << std::endl;
		exit(-1);
	}
  device_id = temp_id[device_number];
  assert( result == CL_SUCCESS);
  print_device_info( device_id);

  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &result);
  assert( result == CL_SUCCESS);
}

void destroy(){
  clReleaseContext(context);
}

bool check_option(int argc, char *argv[]){
  if( argc != 5 ){
		goto ERROR;
	}
	if( strcmp( argv[3], "-o" ) != 0 ){
		goto ERROR;
	}
	input_file = argv[1];
	output_file = argv[4];
	device_number = atoi(argv[2]);
	return true;
ERROR:
	std::cerr << "Usage ./a.out clfile selected_device_number -o output_file\n";
	return false;
}

int main(int argc, char *argv[]){
	if( check_option(argc, argv) == false ){
		return -1;
	}
  init_();
  //unsigned long long res = TM_stop(&tm);
  run();
  //std::cout << res << std::endl;
  destroy();
	return 0;
}