float AddOnActivationFunction1(const float x) { const float input_value2 = native_exp(x * 1.333332e+000F); return native_divide(input_value2 - 1.0F, input_value2 + 1.0F) * 1.715900e+000F; } float AddOnLayer2(const float x) { return x; } float AddOnActivationFunction2(const float x) { return fabs(x); } #define output_width 92 #define output_height 92 #define output_feature_map_count 12 #define input_feature_map_count 2 #define source_width 5 #define source_height 5 #define input_width 96 #define input_height 96 #define weights_size 2400 #define weights_count 600 #define weights_pairs_size 192 #define biases_size 48 #define weights_offsets_size 96 #define output_entry_size 101568 #define input_entry_size 18432 #define backprop_intermediate_width 9 #define hessian_intermediate_width 9 #define backprop_biases_update_batch_size1 20 #define backprop_biases_update_batch_count1 424 #define backprop_biases_update_batch_size2 20 #define backprop_biases_update_batch_count2 22 #define forward_register_local_work_width 31 #define forward_register_local_work_height 31 #define forward_register_workload_width 3 #define forward_register_workload_height 1 #define testing_forward_register_reqd_work_group_size_qualifier __attribute__((reqd_work_group_size(31, 31, 1))) #define backward_register_local_work_width 32 #define backward_register_local_work_height 32 #define backward_register_workload_width 3 #define backward_register_workload_height 1 #define backprop_backward_register_reqd_work_group_size_qualifier __attribute__((reqd_work_group_size(32, 32, 1))) #define backprop_neuron_group_width_size 31 #define backprop_neuron_group_height_size 31 #define backprop_neuron_group_width_count 3 #define backprop_neuron_group_height_count 3 #define backprop_weight_group_size_local 800 #define backprop_weight_group_size_global 800 #define backprop_weight_group_size_inside_multiplier 32 #define backprop_update_reqd_work_group_size_qualifier __attribute__((reqd_work_group_size(800, 1, 1))) #define weights_with_biases_and_offsets_constant_qualifier1 __constant #define weights_with_biases_and_offsets_constant_qualifier2 #define weights_with_biases_and_offsets_constant_size_attribute __attribute__((max_constant_size(2400))) #define biases_after_weights_with_offsets_constant_qualifier1 __constant #define biases_after_weights_with_offsets_constant_qualifier2 #define biases_after_weights_with_offsets_constant_size_attribute __attribute__((max_constant_size(48))) #define offsets_after_weights_and_biases_constant_qualifier1 __constant #define offsets_after_weights_and_biases_constant_qualifier2 #define offsets_after_weights_and_biases_constant_size_attribute __attribute__((max_constant_size(96))) #define weights_with_offsets_constant_qualifier1 __constant #define weights_with_offsets_constant_qualifier2 #define weights_with_offsets_constant_size_attribute __attribute__((max_constant_size(2400))) #define offsets_after_weights_constant_qualifier1 __constant #define offsets_after_weights_constant_qualifier2 #define offsets_after_weights_constant_size_attribute __attribute__((max_constant_size(96))) #define biases_with_offsets_constant_qualifier1 __constant #define biases_with_offsets_constant_qualifier2 #define biases_with_offsets_constant_size_attribute __attribute__((max_constant_size(48))) #define offsets_after_biases_constant_qualifier1 __constant #define offsets_after_biases_constant_qualifier2 #define offsets_after_biases_constant_size_attribute __attribute__((max_constant_size(96))) #define pairs_only_constant_qualifier1 __constant #define pairs_only_constant_qualifier2 #define pairs_only_constant_size_attribute __attribute__((max_constant_size(192))) #define weights_only_constant_qualifier1 __constant #define weights_only_constant_qualifier2 #define weights_only_constant_size_attribute __attribute__((max_constant_size(2400))) #define biases_only_constant_qualifier1 __constant #define biases_only_constant_qualifier2 #define biases_only_constant_size_attribute __attribute__((max_constant_size(48))) #define offsets_only_constant_qualifier1 __constant #define offsets_only_constant_qualifier2 #define offsets_only_constant_size_attribute __attribute__((max_constant_size(96))) __kernel testing_forward_register_reqd_work_group_size_qualifier void ConvolutionRegister( const __global float * restrict input, __global float * restrict output, const __global float * restrict weights, const __global int * restrict weights_offsets, const __global float * restrict biases ) { __local float weights_buffer[((source_width * source_height) + (forward_register_local_work_width * forward_register_local_work_height - 1)) / (forward_register_local_work_width * forward_register_local_work_height) * (forward_register_local_work_width * forward_register_local_work_height)]; __local float input_buffer[((forward_register_local_work_width * forward_register_workload_width + source_width - 1) * (forward_register_local_work_height * forward_register_workload_height + source_height - 1) + (forward_register_local_work_width * forward_register_local_work_height - 1)) / (forward_register_local_work_width * forward_register_local_work_height) * (forward_register_local_work_width * forward_register_local_work_height)]; const uint dest_x_base = mul24((uint)(get_group_id(0)), (uint)(forward_register_local_work_width * forward_register_workload_width)); const uint dest_y_base = mul24((uint)(get_group_id(1)), (uint)(forward_register_local_work_height * forward_register_workload_height)); const uint input_window_width = forward_register_local_work_width * forward_register_workload_width + source_width - 1; const uint input_window_height = forward_register_local_work_height * forward_register_workload_height + source_height - 1; const uint input_window_size = input_window_width * input_window_height; const uint fill_weights_buffer_iteration_count = ((source_width * source_height) + (forward_register_local_work_width * forward_register_local_work_height - 1)) / (forward_register_local_work_width * forward_register_local_work_height); const uint dd = get_global_id(2); const uint entry_id = dd / output_feature_map_count; const uint output_feature_map_id = dd % output_feature_map_count; const uint dest_x_local = get_local_id(0); const uint dest_y_local = get_local_id(1); const uint dest_local = mad24(dest_y_local, (uint)forward_register_local_work_width, dest_x_local); float sum[forward_register_workload_width * forward_register_workload_height]; for(uint i = 0; i < forward_register_workload_height * forward_register_workload_width; i++) { sum[i] = 0.0F; } uint input_buffer_fill_input_base_offset = (entry_id * input_entry_size) + mad24(dest_y_base, (uint)input_width, dest_x_base); const uint input_window_y_backoff = dest_local / input_window_width; const uint input_window_x_backoff = dest_local % input_window_width; const uint max_iteration_count = (input_window_size + (forward_register_local_work_width * forward_register_local_work_height - 1)) / (forward_register_local_work_width * forward_register_local_work_height); const uint input_window_y_step = (forward_register_local_work_width * forward_register_local_work_height) / input_window_width; const uint input_window_x_step = (forward_register_local_work_width * forward_register_local_work_height) % input_window_width; const uint input_buffer_fill_max_y = input_height - dest_y_base; const uint input_buffer_fill_max_x = input_width - dest_x_base; const uint start_input_local = mad24(dest_y_local, (uint)(forward_register_workload_height * input_window_width), mul24(dest_x_local, (uint)forward_register_workload_width)); const uint weights_offset_offset = mul24(output_feature_map_id, (uint)input_feature_map_count); #pragma unroll 1 for(uint input_feature_map_id = 0; input_feature_map_id < input_feature_map_count; input_feature_map_id++) { const int weights_offset = weights_offsets[weights_offset_offset + input_feature_map_id]; if (weights_offset >= 0) { if (dest_local < input_window_size) { uint input_window_y = input_window_y_backoff; uint input_window_x = input_window_x_backoff; uint local_it = dest_local; #pragma unroll for(uint i = 0; i < max_iteration_count; i++) { const bool input_in_bounds = (input_window_y < input_buffer_fill_max_y) && (input_window_x < input_buffer_fill_max_x); const uint offset = input_buffer_fill_input_base_offset + mad24(input_window_y, (uint)input_width, input_window_x); const uint actual_offset = input_in_bounds ? offset : dest_local; input_buffer[local_it] = input[actual_offset]; local_it += forward_register_local_work_width * forward_register_local_work_height; input_window_x += input_window_x_step; const bool is_next_line = input_window_x >= input_window_width; input_window_x -= is_next_line ? input_window_width : 0; input_window_y += is_next_line ? input_window_y_step + 1 : input_window_y_step; } } uint ind = dest_local; for(uint fill_weights_buffer_iteration = 0; fill_weights_buffer_iteration < fill_weights_buffer_iteration_count; fill_weights_buffer_iteration++) { if (ind < source_width * source_height) { weights_buffer[ind] = weights[weights_offset + ind]; } ind += forward_register_local_work_width * forward_register_local_work_height; } barrier(CLK_LOCAL_MEM_FENCE); uint weights_base_offset2 = 0; uint input_base_offset2 = start_input_local; #pragma unroll for(uint source_y = 0; source_y < source_height; source_y++) { #pragma unroll for(uint source_x = 0; source_x < source_width; source_x++) { const float w = weights_buffer[weights_base_offset2 + source_x]; uint input_base_offset3 = input_base_offset2 + source_x; uint reg_base_offset = 0; #pragma unroll for(uint y = 0; y < forward_register_workload_height; y++) { #pragma unroll for(uint x = 0; x < forward_register_workload_width; x++) { sum[reg_base_offset + x] += input_buffer[input_base_offset3 + x] * w; } input_base_offset3 += input_window_width; reg_base_offset += forward_register_workload_width; } } weights_base_offset2 += source_width; input_base_offset2 += input_window_width; } } input_buffer_fill_input_base_offset += input_width * input_height; } const uint dest_x_initial = mul24((uint)(get_global_id(0)), (uint)forward_register_workload_width); const uint dest_y_initial = mul24((uint)(get_global_id(1)), (uint)forward_register_workload_height); const bool is_actual_neuron_initial = (dest_x_initial < output_width) && (dest_y_initial < output_height); if (is_actual_neuron_initial) { const float bias = biases[output_feature_map_id]; const uint max_y = output_height - dest_y_initial; const uint max_x = output_width - dest_x_initial; const uint initial_offset = (entry_id * output_entry_size) + (output_feature_map_id * (output_width * output_height)) + mad24(dest_y_initial, (uint)output_width, dest_x_initial); uint offset = initial_offset; uint buffer_offset = 0; for(uint y = 0; y < forward_register_workload_height; y++) { const bool valid_y = (y < max_y); for(uint x = 0; x < forward_register_workload_width; x++) { const bool valid_full = valid_y && (x < max_x); const uint actual_offset = valid_full ? offset + x : initial_offset; const uint actual_buffer_offset = valid_full ? buffer_offset + x : 0; output[actual_offset] = AddOnActivationFunction2(AddOnLayer2(AddOnActivationFunction1((sum[actual_buffer_offset] + bias)))); } offset += output_width; buffer_offset += forward_register_workload_width; } } }