When building the OpenCL program, the double free or corruption happened.
Is it a bug?
below attach the backtrace
double free or corruption (fasttop): 0x0000000000d7f440 ***
======= Backtrace: =========
/lib64/libc.so.6(+0x79033)[0x7f8444e41033]
/lib64/libc.so.6(+0x7ebc6)[0x7f8444e46bc6]
/lib64/libc.so.6(+0x7fd92)[0x7f8444e47d92]
/usr/lib64/libnvidia-compiler.so.343.22(+0x1010f2b)[0x7f84398a4f2b]
/usr/lib64/libnvidia-compiler.so.343.22(+0x1011654)[0x7f84398a5654]
/usr/lib64/libnvidia-compiler.so.343.22(+0xecb400)[0x7f843975f400]
/usr/lib64/libnvidia-compiler.so.343.22(+0xecb506)[0x7f843975f506]
/usr/lib64/libnvidia-compiler.so.343.22(+0xecb6ec)[0x7f843975f6ec]
/usr/lib64/libnvidia-compiler.so.343.22(+0x162774)[0x7f84389f6774]
/usr/lib64/libnvidia-compiler.so.343.22(+0x1537ba)[0x7f84389e77ba]
/usr/lib64/libnvidia-compiler.so.343.22(+0x14df22)[0x7f84389e1f22]
/usr/lib64/libnvidia-compiler.so.343.22(+0x14f09d)[0x7f84389e309d]
/usr/lib64/libnvidia-compiler.so.343.22(+0x14ae5d)[0x7f84389dee5d]
/usr/lib64/libnvidia-compiler.so.343.22(NvCliCompileProgram+0x147)[0x7f84389d9c27]
/usr/lib64/libnvidia-opencl.so.1(+0x153f62)[0x7f8443b5df62]
/usr/lib64/libnvidia-opencl.so.1(+0x148195)[0x7f8443b52195]
/home/tedyu002/src/cl_program_builder/a.out[0x401f27]
/home/tedyu002/src/cl_program_builder/a.out[0x402288]
/lib64/libc.so.6(__libc_start_main+0xf5)[0x7f8444decdb5]
/home/tedyu002/src/cl_program_builder/a.out[0x401319]
======= Memory map: ========
00400000-00403000 r-xp 00000000 08:02 12060955 /home/tedyu002/src/cl_program_builder/a.out
00603000-00604000 r–p 00003000 08:02 12060955 /home/tedyu002/src/cl_program_builder/a.out
00604000-00605000 rw-p 00004000 08:02 12060955 /home/tedyu002/src/cl_program_builder/a.out
00605000-01761000 rw-p 00000000 00:00 0 [heap]
200000000-900000000 —p 00000000 00:00 0
7f8438894000-7f8439ded000 r-xp 00000000 08:02 1323320 /usr/lib64/libnvidia-compiler.so.343.22
7f8439ded000-7f8439fec000 —p 01559000 08:02 1323320 /usr/lib64/libnvidia-compiler.so.343.22
7f8439fec000-7f843a7d9000 rw-p 01558000 08:02 1323320 /usr/lib64/libnvidia-compiler.so.343.22
7f843a7d9000-7f843a7fd000 rw-p 00000000 00:00 0
7f843a7fd000-7f843a7fe000 —p 00000000 00:00 0
7f843a7fe000-7f843affe000 rwxp 00000000 00:00 0 [stack:545]
7f843affe000-7f843afff000 —p 00000000 00:00 0
7f843afff000-7f843b7ff000 rwxp 00000000 00:00 0 [stack:544]
7f843b7ff000-7f843b800000 —p 00000000 00:00 0
7f843b800000-7f843c000000 rwxp 00000000 00:00 0 [stack:543]
7f843c000000-7f843c021000 rw-p 00000000 00:00 0
7f843c021000-7f8440000000 —p 00000000 00:00 0
7f84401fe000-7f84401ff000 —p 00000000 00:00 0
7f84401ff000-7f84409ff000 rwxp 00000000 00:00 0 [stack:542]
7f84409ff000-7f8440a00000 —p 00000000 00:00 0
7f8440a00000-7f8441200000 rwxp 00000000 00:00 0 [stack:541]
7f8441200000-7f8441281000 rw-s 86db6000 00:05 4052314 /dev/nvidiactl
7f8441281000-7f8441381000 rw-s 0319d000 00:05 4052314 /dev/nvidiactl
7f8441381000-7f8441481000 rw-s 10e4a0000 00:05 4052314 /dev/nvidiactl
7f8441481000-7f8441581000 rw-s 20c5a9000 00:05 4052314 /dev/nvidiactl
7f8441581000-7f8441681000 rw-s 1d5e10000 00:05 4052314 /dev/nvidiactl
7f8441681000-7f8441781000 rw-s 205621000 00:05 4052314 /dev/nvidiactl
7f8441781000-7f8441881000 rw-s 205476000 00:05 4052314 /dev/nvidiactl
7f8441881000-7f8441c81000 rw-s 20c41f000 00:05 4052314 /dev/nvidiactl
7f8441c81000-7f8442081000 rw-s 2d3d8000 00:05 4052314 /dev/nvidiactl
7f8442081000-7f8442082000 —p 00000000 00:00 0
7f8442082000-7f8442882000 rwxp 00000000 00:00 0 [stack:540]
7f8442882000-7f8443423000 r-xp 00000000 08:02 1322931 /usr/lib64/libcuda.so.343.22
7f8443423000-7f8443622000 —p 00ba1000 08:02 1322931 /usr/lib64/libcuda.so.343.22
7f8443622000-7f84437ee000 rw-p 00ba0000 08:02 1322931 /usr/lib64/libcuda.so.343.22
7f84437ee000-7f8443802000 rw-p 00000000 00:00 0
7f8443802000-7f8443809000 r-xp 00000000 08:02 30147785 /lib64/librt-2.19.so
7f8443809000-7f8443a08000 —p 00007000 08:02 30147785 /lib64/librt-2.19.so
7f8443a08000-7f8443a09000 r–p 00006000 08:02 30147785 /lib64/librt-2.19.so
7f8443a09000-7f8443a0a000 rw-p 00007000 08:02 30147785 /lib64/librt-2.19.so
7f8443a0a000-7f84445c3000 r-xp 00000000 08:02 1325273 /usr/lib64/libnvidia-opencl.so.343.22
7f84445c3000-7f84447c3000 —p 00bb9000 08:02 1325273 /usr/lib64/libnvidia-opencl.so.343.22
7f84447c3000-7f8444992000 rw-p 00bb9000 08:02 1325273 /usr/lib64/libnvidia-opencl.so.343.22
7f8444992000-7f84449a6000 rw-p 00000000 00:00 0
7f84449a6000-7f84449bf000 r-xp 00000000 08:02 30147801 /lib64/libpthread-2.19.so
7f84449bf000-7f8444bbe000 —p 00019000 08:02 30147801 /lib64/libpthread-2.19.so
7f8444bbe000-7f8444bbf000 r–p 00018000 08:02 30147801 /lib64/libpthread-2.19.so
7f8444bbf000-7f8444bc0000 rw-p 00019000 08:02 30147801 /lib64/libpthread-2.19.so
7f8444bc0000-7f8444bc4000 rw-p 00000000 00:00 0
7f8444bc4000-7f8444bc7000 r-xp 00000000 08:02 30147782 /lib64/libdl-2.19.so
7f8444bc7000-7f8444dc6000 —p 00003000 08:02 30147782 /lib64/libdl-2.19.so
7f8444dc6000-7f8444dc7000 r–p 00002000 08:02 30147782 /lib64/libdl-2.19.so
7f8444dc7000-7f8444dc8000 rw-p 00003000 08:02 30147782 /lib64/libdl-2.19.so
7f8444dc8000-7f8444f65000 r-xp 00000000 08:02 30147799 /lib64/libc-2.19.so
7f8444f65000-7f8445164000 —p 0019d000 08:02 30147799 /lib64/libc-2.19.so
7f8445164000-7f8445168000 r–p 0019c000 08:02 30147799 /lib64/libc-2.19.so
7f8445168000-7f844516a000 rw-p 001a0000 08:02 30147799 /lib64/libc-2.19.so
7f844516a000-7f844516e000 rw-p 00000000 00:00 0
7f844516e000-7f8445183000 r-xp 00000000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445183000-7f8445383000 —p 00015000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445383000-7f8445384000 r–p 00015000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445384000-7f8445385000 rw-p 00016000 08:02 8660078 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libgcc_s.so.1
7f8445385000-7f844547e000 r-xp 00000000 08:02 30146880 /lib64/libm-2.19.so
7f844547e000-7f844567d000 —p 000f9000 08:02 30146880 /lib64/libm-2.19.so
7f844567d000-7f844567e000 r–p 000f8000 08:02 30146880 /lib64/libm-2.19.so
7f844567e000-7f844567f000 rw-p 000f9000 08:02 30146880 /lib64/libm-2.19.so
7f844567f000-7f844576b000 r-xp 00000000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f844576b000-7f844596a000 —p 000ec000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f844596a000-7f8445972000 r–p 000eb000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f8445972000-7f8445974000 rw-p 000f3000 08:02 8660088 /usr/lib64/gcc/x86_64-pc-linux-gnu/4.8.3/libstdc++.so.6.0.19
7f8445974000-7f8445989000 rw-p 00000000 00:00 0
7f8445989000-7f844598d000 r-xp 00000000 08:02 8658493 /usr/lib64/OpenCL/vendors/nvidia/libOpenCL.so.1.0.0
7f844598d000-7f8445b8d000 —p 00004000 08:02 8658493 /usr/lib64/OpenCL/vendors/nvidia/libOpenCL.so.1.0.0
7f8445b8d000-7f8445b8e000 rw-p 00004000 08:02 8658493 /usr/lib64/OpenCL/vendors/nvidia/libOpenCL.so.1.0.0
7f8445b8e000-7f8445baf000 r-xp 00000000 08:02 30147800 /lib64/ld-2.19.so
7f8445c1f000-7f8445c20000 rw-p 00000000 00:00 0
7f8445c20000-7f8445c21000 rw-s e7d60000 00:05 4053007 /dev/nvidia0
7f8445c21000-7f8445c22000 rw-s 1ee46e000 00:05 4052314 /dev/nvidiactl
7f8445c22000-7f8445c42000 rw-s e0020000 00:05 4053007 /dev/nvidia0
7f8445c42000-7f8445c62000 rw-s e0000000 00:05 4053007 /dev/nvidia0
7f8445c62000-7f8445d62000 rw-s 2d24c000 00:05 4052314 /dev/nvidiactl
7f8445d62000-7f8445d68000 rw-p 00000000 00:00 0
7f8445d68000-7f8445d69000 rw-s e7d60000 00:05 4053007 /dev/nvidia0
7f8445d69000-7f8445d6a000 rw-s 20c5a4000 00:05 4052314 /dev/nvidiactl
7f8445d6a000-7f8445dae000 rw-p 00000000 00:00 0
7f8445dae000-7f8445daf000 r–p 00020000 08:02 30147800 /lib64/ld-2.19.so
7f8445daf000-7f8445db0000 rw-p 00021000 08:02 30147800 /lib64/ld-2.19.so
7f8445db0000-7f8445db1000 rw-p 00000000 00:00 0
7fff0099a000-7fff009ba000 rwxp 00000000 00:00 0 [stack]
7fff009ba000-7fff009bc000 rw-p 00000000 00:00 0
7fff009fd000-7fff009fe000 r-xp 00000000 00:00 0 [vdso]
7fff009fe000-7fff00a00000 r–p 00000000 00:00 0 [vvar]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
And the program
struct __virtcl_struct_dynamic_kernel_gpu_opencl_0{ulong2 range[4]; };
struct __virtcl_struct_dynamic_kernel_gpu_opencl_1{ulong2 range[3]; };
struct __virtcl_struct_dynamic_kernel_gpu_opencl_2{ulong2 range[23]; };
struct __virtcl_validator_binary_tree_value{
ulong logical_ptr;
ulong physical_ptr;
ulong size;
ulong buffer_offset;
};
struct __virtcl_validator_binary_tree_buffer_node{
struct __virtcl_validator_binary_tree_value value;
ulong left;
ulong right;
};
struct __virtcl_validator_binary_tree_buffer_head{
ulong root;
};
inline __global void* __virtcl_validator_dereference_validate(
const __constant struct __virtcl_validator_binary_tree_buffer_node * const restrict root,
__global void * const buffer,
__private ulong2 *segments, size_t segments_size,
__global const void * const lower, const size_t size
){
__global const void * const upper = lower + size;
for( uint i = 0 ; i < segments_size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= segments[i].x + segments[i].y ){
return (__global void*)lower;
}
}
return buffer;
}
inline __global void* __virtcl_validator_dereference_validate_spec_1(
const __constant struct __virtcl_validator_binary_tree_buffer_node * const restrict root, __global void * const buffer,
__global const void * const base, const size_t base_size,
__global const void * const lower, const size_t size ){
__global const void * const upper = lower + size;
if( lower >= base && upper <= (base + base_size ) ){
return (__global void*)lower;
}
return buffer;
}
inline __global void* __virtcl_validator_dereference_validate_spec_2(
const __constant struct __virtcl_validator_binary_tree_buffer_node * const restrict root, __global void * const buffer,
__global const void * const base_1, const size_t base_1_size,
__global const void * const base_2, const size_t base_2_size,
__global const void * const lower, const size_t size ){
__global const void * const upper = lower + size;
if( lower >= base_1 && upper <= (base_1 + base_1_size ) ){
return (__global void*)lower;
}else if( lower >= base_2 && upper <= (base_2 + base_2_size) ){
return (__global void*)lower;
}
return buffer;
}
inline __global void* __virtcl_validator_dereference_validate_spec_3(
__constant const struct __virtcl_validator_binary_tree_buffer_node * const restrict root, __global void * const buffer,
__global const void *base_1, const size_t base_1_size,
__global const void *base_2, const size_t base_2_size,
__global const void *base_3, const size_t base_3_size,
__global const void *lower, const size_t size ){
__global const void * const upper = lower + size;
if( lower >= base_1 && upper <= (base_1 + base_1_size ) ){
return (__global void*)lower;
}else if( lower >= base_2 && upper <= (base_2 + base_2_size) ){
return (__global void*)lower;
}else if( lower >= base_3 && upper <= (base_3 + base_3_size ) ){
return (__global void*)lower;
}
return buffer;
}
__global void* __virtcl_validator_address_of_translation( __global const void *ptr ){
return (__global void*)ptr;
}
inline __private void* __virtcl_validator_dereference_validate_private_spec_1(
__private const void * const base_1, const size_t base_1_size,
__private const void * const failed_buffer,
__private const void * const lower, const size_t access_size
){
__private const void *upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__private void*)lower;
}
return (__private void*)failed_buffer;
}
inline __private void* __virtcl_validator_dereference_validate_private_spec_2(
__private const void * const base_1, const size_t base_1_size,
__private const void * const base_2, const size_t base_2_size,
__private const void * const failed_buffer,
__private const void * const lower, const size_t access_size
){
__private const void *upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__private void*)lower;
}else if( lower >= base_2 && upper <= base_2 + base_2_size ){
return (__private void*)lower;
}
return (__private void*)failed_buffer;
}
inline __private void* __virtcl_validator_dereference_validate_private_spec_3(
__private const void * const base_1, const size_t base_1_size,
__private const void * const base_2, const size_t base_2_size,
__private const void * const base_3, const size_t base_3_size,
__private const void * const failed_buffer,
__private const void * const lower, const size_t access_size
){
__private const void *upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__private void*)lower;
}else if( lower >= base_2 && upper <= base_2 + base_2_size ){
return (__private void*)lower;
}else if( lower >= base_3 && upper <= base_3 + base_3_size ){
return (__private void*)lower;
}
return (__private void*)failed_buffer;
}
inline __private void* __virtcl_validator_dereference_validate_private_dynamic(
__private ulong2 *segments, size_t size,
__private const void * const failed_buffer,
__private const void *lower, size_t access_size){
__private const void *upper = lower + access_size;
for( uint i = 0 ; i < size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
return (__private void*)lower;
}
}
return (__private void*)failed_buffer;
}
inline private void* __virtcl_validator_dereference_validate_private_static_dynamic(
__private const void * const base_1, const size_t base_1_size,
__private ulong2 *segments, size_t size,
__private const void * const failed_buffer,
__private const void *lower, size_t access_size){
__private const void *upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__private void*)lower;
}
for( uint i = 0 ; i < size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
return (__private void*)lower;
}
}
return (__private void*)failed_buffer;
}
inline __local void* __virtcl_validator_dereference_validate_local_spec_1(
__local const void * const base_1, const size_t base_1_size,
__local const void * const failed_buffer,
__local const void * const lower, const size_t access_size
){
const __local void * const upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__local void*)lower;
}
return (__local void*)failed_buffer;
}
inline __local void* __virtcl_validator_dereference_validate_local_spec_2(
__local const void * const base_1, const size_t base_1_size,
__local const void * const base_2, const size_t base_2_size,
__local const void * const failed_buffer,
__local const void * const lower, const size_t access_size
){
const __local void * const upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__local void*)lower;
}else if( lower >= base_2 && upper <= base_2 + base_2_size ){
return (__local void*)lower;
}
return (__local void*)failed_buffer;
}
inline __local void* __virtcl_validator_dereference_validate_local_spec_3(
__local const void * const base_1, const size_t base_1_size,
__local const void * const base_2, const size_t base_2_size,
__local const void * const base_3, const size_t base_3_size,
__local const void * const failed_buffer,
__local const void * const lower, const size_t access_size
){
const __local void * const upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__local void*)lower;
}else if( lower >= base_2 && upper <= base_2 + base_2_size ){
return (__local void*)lower;
}else if( lower >= base_3 && upper <= base_3 + base_3_size ){
return (__local void*)lower;
}
return (__local void*)failed_buffer;
}
inline __local void* __virtcl_validator_dereference_validate_local_dynamic(
__private ulong2 *segments, size_t size,
__local const void * const failed_buffer,
__local const void *lower, size_t access_size){
__local const void *upper = lower + access_size;
for( uint i = 0 ; i < size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
return (__local void*)lower;
}
}
return (__local void*)failed_buffer;
}
inline __local void* __virtcl_validator_dereference_validate_local_static_dynamic(
__local const void * const base_1, const size_t base_1_size,
__private ulong2 *segments, size_t size,
__local const void * const failed_buffer,
__local const void *lower, size_t access_size){
__local const void *upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__local void*)lower;
}
for( uint i = 0 ; i < size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
return (__local void*)lower;
}
}
return (__local void*)failed_buffer;
}
inline __constant void* __virtcl_validator_dereference_validate_constant_spec_1(
__constant const void * const base_1, const size_t base_1_size,
__constant const void * const failed_buffer,
__constant const void * const lower, const size_t access_size
){
__constant const void * const upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ) {
return (__constant void*)lower;
}
return (__constant void*)failed_buffer;
}
inline __constant void* __virtcl_validator_dereference_validate_constant_spec_2(
__constant const void * const base_1, const size_t base_1_size,
__constant const void * const base_2, const size_t base_2_size,
__constant const void * const failed_buffer,
__constant const void * const lower, const size_t access_size
){
__constant const void * const upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ) {
return (__constant void*)lower;
}else if( lower >= base_2 && upper <= base_2 + base_2_size ){
return (__constant void*)lower;
}
return (__constant void*)failed_buffer;
}
inline __constant void* __virtcl_validator_dereference_validate_constant_spec_3(
__constant const void * const base_1, const size_t base_1_size,
__constant const void * const base_2, const size_t base_2_size,
__constant const void * const base_3, const size_t base_3_size,
__constant const void * const failed_buffer,
__constant const void * const lower, const size_t access_size
){
__constant const void * const upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ) {
return (__constant void*)lower;
}else if( lower >= base_2 && upper <= base_2 + base_2_size ){
return (__constant void*)lower;
}else if( lower >= base_3 && upper <= base_3 + base_3_size){
return (__constant void*)lower;
}
return (__constant void*)failed_buffer;
}
inline __constant void* __virtcl_validator_dereference_validate_constant_dynamic(
__private ulong2 *segments, size_t size,
__constant const void * const failed_buffer,
__constant const void *lower, size_t access_size){
__constant const void *upper = lower + access_size;
for( uint i = 0 ; i < size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
return (__constant void*)lower;
}
}
return (__constant void*)failed_buffer;
}
inline __constant void* __virtcl_validator_dereference_validate_constant_static_dynamic(
__constant const void * const base_1, const size_t base_1_size,
__private ulong2 *segments, size_t size,
__constant const void * const failed_buffer,
__constant const void *lower, size_t access_size){
__constant const void *upper = lower + access_size;
if( lower >= base_1 && upper <= base_1 + base_1_size ){
return (__constant void*)lower;
}
for( uint i = 0 ; i < size ; ++i ){
if( (segments[i]).x <= (ulong)lower && (ulong)upper <= (segments[i].x + segments[i].y) ){
return (__constant void*)lower;
}
}
return (__constant void*)failed_buffer;
}
__local void*
checked_address_use_def_1(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_2(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_3(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_4(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_5(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_6(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_7(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
__local void*
checked_address_use_def_8(
const __local void* v0,size_t v1,__local void* v2,__local void **temp
,__local void *addr, size_t size){
if( *temp == 0 ){ *temp = __virtcl_validator_dereference_validate_local_spec_1(v0,v1,v2, addr, size); } return *temp; }
typedef struct
{
float x, y, z;
} THREE_VECTOR;
typedef struct
{
float v, x, y, z;
} FOUR_VECTOR;
typedef struct nei_str
{
int x, y, z;
int number;
long offset;
} nei_str;
typedef struct box_str
{
int x, y, z;
int number;
long offset;
int nn;
nei_str nei[26];
} box_str;
typedef struct par_str
{
float alpha;
} par_str;
typedef struct dim_str
{
int cur_arg;
int arch_arg;
int cores_arg;
int boxes1d_arg;
long number_boxes;
long box_mem;
long space_elem;
long space_mem;
long space_mem2;
} dim_str;
__kernel void kernel_gpu_opencl( __constant struct __virtcl_validator_binary_tree_buffer_node * restrict const __virtcl_validator_root, __constant ulong * restrict const __virtcl_validator_arg_info, __global void * const __virtcl_validator_buffer,par_str d_par_gpu,
dim_str d_dim_gpu,
__global box_str *d_box_gpu,
__global FOUR_VECTOR *d_rv_gpu,
__global float *d_qv_gpu,
__global FOUR_VECTOR *d_fv_gpu)
{__local void *__virtcl_ck_1 = 0;__global void *__virtcl_ck_2 = 0;__local void *__virtcl_ck_3 = 0;__local void *__virtcl_ck_4 = 0;__global void *__virtcl_ck_10 = 0;
int bx = get_group_id(0);
int tx = get_local_id(0);
int wtx = tx;
if(bx<d_dim_gpu.number_boxes){
float a2 = 2*d_par_gpu.alpha*d_par_gpu.alpha;
int first_i;
__local FOUR_VECTOR rA_shared[100];
int pointer;
int k = 0;
int first_j;
int j = 0;
__local FOUR_VECTOR rB_shared[100];
__local float qB_shared[100];
float r2;
float u2;
float vij;
float fs;
float fxij;
float fyij;
float fzij;
THREE_VECTOR d;
first_i = (*( (__virtcl_ck_10 = (((__global box_str *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_box_gpu,(__virtcl_validator_arg_info)[2],(d_box_gpu+bx ),sizeof(__global box_str))))), ((__global box_str *)(__virtcl_ck_10)) )).offset;
while(wtx<100){
(*((__local FOUR_VECTOR *)__virtcl_validator_dereference_validate_local_spec_1(&rA_shared,sizeof(rA_shared),&qB_shared,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))) = (*((__global FOUR_VECTOR *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_rv_gpu,(__virtcl_validator_arg_info)[3],d_rv_gpu+first_i+wtx ,sizeof(__global FOUR_VECTOR))));
wtx = wtx + 128;
}
wtx = tx;
barrier(CLK_LOCAL_MEM_FENCE);
for (k=0; k<(1+(*( ((d_box_gpu[bx])), ((__global box_str *)(__virtcl_ck_10)) )).nn); k++){
if(k==0){
pointer = bx;
}
else{
pointer = (*((__global nei_str *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_box_gpu,(__virtcl_validator_arg_info)[2],(*( ((d_box_gpu[bx])), ((__global box_str *)(__virtcl_ck_10)) )).nei+k-1 ,sizeof(__global nei_str)))).number;
}
first_j = (*((__global box_str *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_box_gpu,(__virtcl_validator_arg_info)[2],d_box_gpu+pointer ,sizeof(__global box_str)))).offset;
while(wtx<100){
(*((__local FOUR_VECTOR *)__virtcl_validator_dereference_validate_local_spec_1(&rB_shared,sizeof(rB_shared),&qB_shared,rB_shared+wtx ,sizeof(__local FOUR_VECTOR)))) = (*((__global FOUR_VECTOR *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_rv_gpu,(__virtcl_validator_arg_info)[3],d_rv_gpu+first_j+wtx ,sizeof(__global FOUR_VECTOR))));
(*((__local float *)__virtcl_validator_dereference_validate_local_spec_1(&qB_shared,sizeof(qB_shared),&qB_shared,qB_shared+wtx ,sizeof(__local float)))) = (*((__global float *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_qv_gpu,(__virtcl_validator_arg_info)[4],d_qv_gpu+first_j+wtx ,sizeof(__global float))));
wtx = wtx + 128;
}
wtx = tx;
barrier(CLK_LOCAL_MEM_FENCE);
while(wtx<100){
for (j=0; j<100; j++){
(__virtcl_ck_1 = 0),((__virtcl_ck_4 = 0),(r2 = (*((__local FOUR_VECTOR *)checked_address_use_def_1(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).v + (*((__local FOUR_VECTOR *)checked_address_use_def_2(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).v - (((*((__local FOUR_VECTOR *)checked_address_use_def_3(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).x)*((*((__local FOUR_VECTOR *)checked_address_use_def_4(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).x)+((*((__local FOUR_VECTOR *)checked_address_use_def_5(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).y)*((*((__local FOUR_VECTOR *)checked_address_use_def_6(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).y)+((*((__local FOUR_VECTOR *)checked_address_use_def_7(&rA_shared,sizeof(rA_shared),&qB_shared,&__virtcl_ck_4,rA_shared+wtx ,sizeof(__local FOUR_VECTOR)))).z)*((*((__local FOUR_VECTOR *)checked_address_use_def_8(&rB_shared,sizeof(rB_shared),&qB_shared,&__virtcl_ck_1,rB_shared+j ,sizeof(__local FOUR_VECTOR)))).z))));
u2 = a2*r2;
vij= exp(-u2);
fs = 2*vij;
d.x = (*( ((rA_shared[wtx])), ((__local FOUR_VECTOR *)(__virtcl_ck_4)) )).x - (*( ((rB_shared[j])), ((__local FOUR_VECTOR *)(__virtcl_ck_1)) )).x;
fxij=fs*d.x;
d.y = (*( ((rA_shared[wtx])), ((__local FOUR_VECTOR *)(__virtcl_ck_4)) )).y - (*( ((rB_shared[j])), ((__local FOUR_VECTOR *)(__virtcl_ck_1)) )).y;
fyij=fs*d.y;
d.z = (*( ((rA_shared[wtx])), ((__local FOUR_VECTOR *)(__virtcl_ck_4)) )).z - (*( ((rB_shared[j])), ((__local FOUR_VECTOR *)(__virtcl_ck_1)) )).z;
fzij=fs*d.z;
(*( (__virtcl_ck_2 = (((__global FOUR_VECTOR *)__virtcl_validator_dereference_validate_spec_1(__virtcl_validator_root,__virtcl_validator_buffer,d_fv_gpu,(__virtcl_validator_arg_info)[5],(d_fv_gpu+first_i+wtx ),sizeof(__global FOUR_VECTOR))))), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).v += (*( (__virtcl_ck_3 = (((__local float *)__virtcl_validator_dereference_validate_local_spec_1(&qB_shared,sizeof(qB_shared),&qB_shared,(qB_shared+j ),sizeof(__local float))))), ((__local float *)(__virtcl_ck_3)) ))*vij;
(*( ((d_fv_gpu[first_i+wtx])), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).x += (*( ((qB_shared[j])), ((__local float *)(__virtcl_ck_3)) ))*fxij;
(*( ((d_fv_gpu[first_i+wtx])), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).y += (*( ((qB_shared[j])), ((__local float *)(__virtcl_ck_3)) ))*fyij;
(*( ((d_fv_gpu[first_i+wtx])), ((__global FOUR_VECTOR *)(__virtcl_ck_2)) )).z += (*( ((qB_shared[j])), ((__local float *)(__virtcl_ck_3)) ))*fzij;
}
wtx = wtx + 128;
}
wtx = tx;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
The builder program.
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include "CL/cl.h"
#endif
#include<cstdlib>
#include<fstream>
#include<cassert>
#include<cstring>
#include<cstdio>
#include<utility>
#include<iostream>
#include<vector>
#include<sstream>
bool is_NVIDIA;
struct cl_str{
char *str;
size_t size;
};
static inline void print_platform_info(const cl_platform_id platform ){
char info[4096];
size_t ret_size;
clGetPlatformInfo( platform, CL_PLATFORM_NAME, sizeof(info), info, &ret_size );
printf("%s\n", info);
is_NVIDIA = strstr( info, "NVIDIA") != NULL;
}
static inline void print_device_info( const cl_device_id device ){
char info[4096];
size_t ret_size;
clGetDeviceInfo( device, CL_DEVICE_NAME, sizeof(info), info, &ret_size );
printf("%s\n", info);
size_t profiler_resolution;
clGetDeviceInfo( device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(profiler_resolution), &profiler_resolution, &ret_size);
printf("profiler timer resloution in nano seconds %zu\n", profiler_resolution);
}
static inline struct cl_str cl_read_file( const char *file_name ) {
std::fstream file(file_name, std::ios_base::binary | std::ios_base::in );
std::streampos file_size;
{
file.seekg(0, std::ios_base::end);
file_size = file.tellg();
file.seekg(0);
}
struct cl_str str;
str.str = (char*)malloc( sizeof(char) * file_size);
str.size = file_size;
file.read( str.str, file_size);
file.close();
return str;
}
static inline void cl_dump_program( cl_program program, char *output_file ){
cl_uint num_devices = 0;
cl_int result;
result = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof( cl_uint), &num_devices, NULL);
assert( result == CL_SUCCESS);
assert( num_devices != 0);
size_t *program_size = new size_t[num_devices];
unsigned char **binary_program = new unsigned char*[num_devices];
result = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( size_t ) * num_devices, program_size, NULL);
assert( result == CL_SUCCESS);
for( cl_uint i = 0 ; i < num_devices ; ++i){
binary_program[i] = new unsigned char[ std::max(program_size[i],(size_t)2) ];
}
result = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) * num_devices, binary_program, NULL);
assert( result == CL_SUCCESS);
for( cl_uint i = 0 ; i < num_devices ; ++i ){
if( program_size[i] == 0 ){
continue;
}
std::fstream out( output_file, std::ios_base::binary | std::ios_base::out | std::ios_base::trunc );
out.write( (const char*)binary_program[i], program_size[i] );
out.close();
}
for( cl_uint i = 0 ; i < num_devices ; ++i){
delete []binary_program[i];
}
delete[] binary_program;
delete[] program_size;
}
static inline void cl_print_program_build_info( cl_program program, cl_device_id device_id ){
char tempString[1000000];
tempString[0] = '\0';
size_t tempLength;
cl_int status = clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,sizeof(tempString),tempString,&tempLength);
printf("Build Log : %s\n",tempString);
}
static inline void cl_print_error(const char *name,cl_int status, const cl_program *program = NULL, const cl_device_id *device_id = NULL){
switch(status){
case CL_INVALID_PROGRAM_EXECUTABLE:
printf("%s CL_INVALID_PROGRAM_EXECUTABLE\n",name);break;
case CL_INVALID_COMMAND_QUEUE:
printf("%s CL_INVALID_COMMAND_QUEUE\n",name);break;
case CL_INVALID_KERNEL:
printf("%s CL_INVALID_KERNEL\n",name);break;
case CL_INVALID_CONTEXT:
printf("%s CL_INVALID_CONTEXT\n",name);break;
case CL_INVALID_KERNEL_ARGS:
printf("%s CL_INVALID_KERNEL_ARGS\n",name);break;
case CL_INVALID_WORK_DIMENSION:
printf("%s CL_INVALID_WORK_DIMENSION\n",name);break;
case CL_INVALID_WORK_GROUP_SIZE:
printf("%s CL_INVALID_WORK_GROUP_SIZE\n",name);break;
case CL_INVALID_WORK_ITEM_SIZE:
printf("%s CL_INVALID_WORK_ITEM_SIZE\n",name);break;
case CL_INVALID_GLOBAL_OFFSET:
printf("%s CL_INVALID_GLOBAL_OFFSET\n",name);break;
case CL_OUT_OF_RESOURCES:
printf("%s CL_OUT_OF_RESOURCES\n",name);break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
printf("%s CL_MEM_OBJECT_ALLOCATION_FAILURE\n",name);break;
case CL_INVALID_EVENT_WAIT_LIST:
printf("%s CL_INVALID_EVENT_WAIT_LIST\n",name);break;
case CL_OUT_OF_HOST_MEMORY:
printf("%s CL_OUT_OF_HOST_MEMORY\n",name);break;
case CL_INVALID_PROGRAM:
printf("CL_INVALID_PROGRAM\n");break;
case CL_INVALID_VALUE:
printf("CL_INVALID_VALUE\n");break;
case CL_INVALID_DEVICE:
printf("CL_INVALID_DEVICE:\n");break;
case CL_INVALID_BINARY:
printf("CL_INVALID_BINARY:\n");break;
case CL_INVALID_BUILD_OPTIONS:
printf("CL_INVALID_BUILD_OPTIONS:\n");break;
case CL_INVALID_OPERATION:
printf("CL_INVALID_OPERATION:\n");break;
case CL_COMPILER_NOT_AVAILABLE:
printf("CL_COMPILER_NOT_AVAILABLE:\n");break;
case CL_INVALID_ARG_VALUE:
printf("%s CL_INVALID_ARG_VALUE\n", name);break;
case CL_BUILD_PROGRAM_FAILURE:
printf("CL_BUILD_PROGRAM_FAILURE: Below is messages\n");
char tempString[1000000];
tempString[0] = '\0';
size_t tempLength;
status = clGetProgramBuildInfo(*program,*device_id,CL_PROGRAM_BUILD_LOG,sizeof(tempString),tempString,&tempLength);
printf("%s\n",tempString);
cl_print_error("", status);
break;
case CL_PROFILING_INFO_NOT_AVAILABLE:
printf("%s %s\n", name, "CL_PROFILING_INFO_NOT_AVAILABLE");
case CL_SUCCESS:
break;
}
}
static inline void display_program_size(cl_program program){
const size_t ALLOCATE_SIZE = sizeof(size_t) * 1;
size_t program_binary_sizes ;
cl_int result = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, ALLOCATE_SIZE, &program_binary_sizes, NULL);
if( result != CL_SUCCESS){
cl_print_error("get program size", result);
assert(0);
}
std::cout << "Program Size : " << program_binary_sizes << std::endl;
}
static inline void display_kernel_info(cl_kernel kernel, cl_device_id device_id){
size_t work_group_size;
cl_ulong local_mem_size;
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE , sizeof(work_group_size), &work_group_size,NULL);
clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local_mem_size), &local_mem_size, NULL);
std::cout << "Maximum work group size : " << work_group_size << " Local mem size : " << local_mem_size << std::endl;
}
static inline cl_ulong display_device_exec_time(cl_event event){
cl_ulong start,end;
clWaitForEvents(1, &event);
cl_int result = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof( start), &start, NULL);
if( result != CL_SUCCESS){
cl_print_error("get exec time error", result);
assert(0);
}
result = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof( end), &end, NULL);
if( result != CL_SUCCESS){
cl_print_error("get exec time error", result);
assert(0);
}
std::cout << start << " " << end << std::endl;
return end - start;
}
cl_context context;
cl_device_id device_id;
cl_uint device_number;
char *output_file;
char *input_file;
int run(){
cl_int result;
struct cl_str source = cl_read_file( input_file );
cl_program program = clCreateProgramWithSource( context, 1, (const char**)&source.str, &source.size , &result);
assert(result == CL_SUCCESS);
const char *build_option = NULL;
if( is_NVIDIA ){
build_option = "-cl-nv-verbose";
}
result = clBuildProgram(program, 1, &device_id, build_option, NULL, NULL);
if( result != CL_SUCCESS){
cl_print_error("build program", result, &program, &device_id);
cl_print_program_build_info(program, device_id);
assert( result == CL_SUCCESS);
}
cl_print_program_build_info(program, device_id);
cl_dump_program(program, output_file);
return 0;
}
void init_(){
cl_int result;
cl_platform_id platform_id[4];
cl_uint num_platform ;
result = clGetPlatformIDs( 4, platform_id, &num_platform);
printf("%d\n", result );
assert( result == CL_SUCCESS);
printf("Num of platforms %u\n", num_platform);
cl_platform_id target_platform = platform_id[ num_platform - 1];
print_platform_info( target_platform );
cl_device_id temp_id[32];
cl_uint num_device;
result = clGetDeviceIDs( target_platform , CL_DEVICE_TYPE_ALL, 32, temp_id, &num_device);
printf("Num of devices %u\n", num_device);
if( device_number >= num_device ){
std::cout << "Selected " << device_number << " exceeds " << num_device << std::endl;
exit(-1);
}
device_id = temp_id[device_number];
assert( result == CL_SUCCESS);
print_device_info( device_id);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &result);
assert( result == CL_SUCCESS);
}
void destroy(){
clReleaseContext(context);
}
bool check_option(int argc, char *argv[]){
if( argc != 5 ){
goto ERROR;
}
if( strcmp( argv[3], "-o" ) != 0 ){
goto ERROR;
}
input_file = argv[1];
output_file = argv[4];
device_number = atoi(argv[2]);
return true;
ERROR:
std::cerr << "Usage ./a.out clfile selected_device_number -o output_file\n";
return false;
}
int main(int argc, char *argv[]){
if( check_option(argc, argv) == false ){
return -1;
}
init_();
//unsigned long long res = TM_stop(&tm);
run();
//std::cout << res << std::endl;
destroy();
return 0;
}