diff --git a/0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch b/0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch
new file mode 100644
index 0000000000000000000000000000000000000000..fe0b533c5a1f7ac0e89473948706ea2e13b3ab2e
--- /dev/null
+++ b/0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch
@@ -0,0 +1,342 @@
+From cf0f086ec274d794a2a180047123920bf8a5224b Mon Sep 17 00:00:00 2001
+From: dingguangya
+Date: Mon, 17 Jan 2022 21:03:47 +0800
+Subject: [PATCH 01/12] [ccmp] Add another optimization opportunity for ccmp
+ instruction
+
+Add the flag -fccmp2.
+It enables use of the ccmp instruction by creating a new conflict
+relationship for cases where temporary-expression replacement cannot
+be created effectively.
+---
+ gcc/ccmp.c                                |  33 ++++
+ gcc/ccmp.h                                |   1 +
+ gcc/common.opt                            |   4 +
+ gcc/testsuite/gcc.target/aarch64/ccmp_3.c |  15 ++
+ gcc/tree-ssa-coalesce.c                   | 197 ++++++++++++++++++++++
+ 5 files changed, 250 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/ccmp_3.c
+
+diff --git a/gcc/ccmp.c b/gcc/ccmp.c
+index ca77375a9..8d2d73e52 100644
+--- a/gcc/ccmp.c
++++ b/gcc/ccmp.c
+@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "cfgexpand.h"
+ #include "ccmp.h"
+ #include "predict.h"
++#include "gimple-iterator.h"
+ 
+ /* Check whether T is a simple boolean variable or a SSA name
+    set by a comparison operator in the same basic block.  */
+@@ -129,6 +130,38 @@ ccmp_candidate_p (gimple *g)
+   return false;
+ }
+ 
++/* Check whether BB is a potential conditional compare candidate.  */
++bool
++check_ccmp_candidate (basic_block bb)
++{
++  gimple_stmt_iterator gsi;
++  gimple *bb_last_stmt, *stmt;
++  tree op0, op1;
++
++  gsi = gsi_last_bb (bb);
++  bb_last_stmt = gsi_stmt (gsi);
++
++  if (bb_last_stmt && gimple_code (bb_last_stmt) == GIMPLE_COND)
++    {
++      op0 = gimple_cond_lhs (bb_last_stmt);
++      op1 = gimple_cond_rhs (bb_last_stmt);
++
++      if (TREE_CODE (op0) == SSA_NAME
++	  && TREE_CODE (TREE_TYPE (op0)) == BOOLEAN_TYPE
++	  && TREE_CODE (op1) == INTEGER_CST
++	  && ((gimple_cond_code (bb_last_stmt) == NE_EXPR)
++	      || (gimple_cond_code (bb_last_stmt) == EQ_EXPR)))
++	{
++	  stmt = SSA_NAME_DEF_STMT (op0);
++	  if (stmt && gimple_code (stmt) == GIMPLE_ASSIGN)
++	    {
++	      return ccmp_candidate_p (stmt);
++	    }
++	}
++    }
++  return false;
++}
++
+ /* Extract the comparison we want to do from the tree.  */
+ void
+ get_compare_parts (tree t, int *up, rtx_code *rcode,
+diff --git a/gcc/ccmp.h b/gcc/ccmp.h
+index 199dd581d..ac862f0f6 100644
+--- a/gcc/ccmp.h
++++ b/gcc/ccmp.h
+@@ -21,5 +21,6 @@ along with GCC; see the file COPYING3.  If not see
+ #define GCC_CCMP_H
+ 
+ extern rtx expand_ccmp_expr (gimple *, machine_mode);
++extern bool check_ccmp_candidate (basic_block bb);
+ 
+ #endif /* GCC_CCMP_H */
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 24834cf60..4dd566def 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1942,6 +1942,10 @@ fira-verbose=
+ Common RejectNegative Joined UInteger Var(flag_ira_verbose) Init(5)
+ -fira-verbose= Control IRA's level of diagnostic messages.
+ 
++fccmp2
++Common Report Var(flag_ccmp2) Init(0) Optimization
++Optimize potential ccmp instructions in complex scenarios.
++
+ fivopts
+ Common Report Var(flag_ivopts) Init(1) Optimization
+ Optimize induction variables on trees.
+diff --git a/gcc/testsuite/gcc.target/aarch64/ccmp_3.c b/gcc/testsuite/gcc.target/aarch64/ccmp_3.c
+new file mode 100644
+index 000000000..b509ba810
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/ccmp_3.c
+@@ -0,0 +1,15 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-options "-O -fdump-rtl-expand-details -fccmp2" } */
++
++int func (int a, int b, int c)
++{
++  while(1)
++    {
++      if(a-- == 0 || b >= c)
++	{
++	  return 1;
++	}
++    }
++}
++
++/* { dg-final { scan-assembler-times "\tccmp\t" 1} } */
+diff --git a/gcc/tree-ssa-coalesce.c b/gcc/tree-ssa-coalesce.c
+index 0b0b1b18d..e0120a4a4 100644
+--- a/gcc/tree-ssa-coalesce.c
++++ b/gcc/tree-ssa-coalesce.c
+@@ -38,6 +38,9 @@ along with GCC; see the file COPYING3.  If not see
+ #include "explow.h"
+ #include "tree-dfa.h"
+ #include "stor-layout.h"
++#include "ccmp.h"
++#include "target.h"
++#include "tree-outof-ssa.h"
+ 
+ /* This set of routines implements a coalesce_list.  This is an object which
+    is used to track pairs of ssa_names which are desirable to coalesce
+@@ -854,6 +857,198 @@ live_track_clear_base_vars (live_track *ptr)
+   bitmap_clear (&ptr->live_base_var);
+ }
+ 
++/* Return true if GS is a copy assignment between SSA names.  */
++
++static inline bool
++gimple_is_assign_copy_p (gimple *gs)
++{
++  return (is_gimple_assign (gs) && gimple_assign_copy_p (gs)
++	  && TREE_CODE (gimple_assign_lhs (gs)) == SSA_NAME
++	  && TREE_CODE (gimple_assign_rhs1 (gs)) == SSA_NAME);
++}
++
++/* Threshold on the number of added conflicts beyond which the ccmp
++   conflict graph is considered too costly and is dropped.  */
++#define MAX_CCMP_CONFLICT_NUM 5
++
++/* Clear high-cost conflict graphs.  */
++
++static void
++remove_high_cost_graph_for_ccmp (ssa_conflicts *conflict_graph)
++{
++  unsigned x = 0;
++  int add_conflict_num = 0;
++  bitmap b;
++  FOR_EACH_VEC_ELT (conflict_graph->conflicts, x, b)
++    {
++      if (b)
++	{
++	  add_conflict_num++;
++	}
++    }
++  if (add_conflict_num >= MAX_CCMP_CONFLICT_NUM)
++    {
++      conflict_graph->conflicts.release ();
++    }
++}
++
++/* Add a new conflict graph to the original graph.  */
++
++static void
++process_add_graph (live_track *live, basic_block bb,
++		   ssa_conflicts *conflict_graph)
++{
++  tree use, def;
++  ssa_op_iter iter;
++  gimple *first_visit_stmt = NULL;
++  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
++       gsi_next (&gsi))
++    {
++      if (gimple_visited_p (gsi_stmt (gsi)))
++	{
++	  first_visit_stmt = gsi_stmt (gsi);
++	  break;
++	}
++    }
++  if (!first_visit_stmt)
++    return;
++
++  for (gimple_stmt_iterator gsi = gsi_last_bb (bb);
++       gsi_stmt (gsi) != first_visit_stmt; gsi_prev (&gsi))
++    {
++      gimple *stmt = gsi_stmt (gsi);
++      if (gimple_visited_p (gsi_stmt (gsi)) && is_gimple_debug (stmt))
++	{
++	  continue;
++	}
++      if (gimple_is_assign_copy_p (stmt))
++	{
++	  live_track_clear_var (live, gimple_assign_rhs1 (stmt));
++	}
++      FOR_EACH_SSA_TREE_OPERAND (def, stmt, iter, SSA_OP_DEF)
++	{
++	  live_track_process_def (live, def, conflict_graph);
++	}
++      FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE)
++	{
++	  live_track_process_use (live, use);
++	}
++    }
++}
++
++/* Build a conflict graph based on ccmp candidate.  */
++
++static void
++add_ccmp_conflict_graph (ssa_conflicts *conflict_graph,
++			 tree_live_info_p liveinfo, var_map map, basic_block bb)
++{
++  live_track *live;
++  tree use, def;
++  ssa_op_iter iter;
++  live = new_live_track (map);
++  live_track_init (live, live_on_exit (liveinfo, bb));
++
++  gimple *last_stmt = gsi_stmt (gsi_last_bb (bb));
++  gcc_assert (gimple_cond_lhs (last_stmt));
++
++  auto_vec<tree> stack;
++  stack.safe_push (gimple_cond_lhs (last_stmt));
++  while (!stack.is_empty ())
++    {
++      tree op = stack.pop ();
++      gimple *op_stmt = SSA_NAME_DEF_STMT (op);
++      if (!op_stmt || gimple_bb (op_stmt) != bb
++	  || !is_gimple_assign (op_stmt)
++	  || !ssa_is_replaceable_p (op_stmt))
++	{
++	  continue;
++	}
++      if (gimple_is_assign_copy_p (op_stmt))
++	{
++	  live_track_clear_var (live, gimple_assign_rhs1 (op_stmt));
++	}
++      gimple_set_visited (op_stmt, true);
++      FOR_EACH_SSA_TREE_OPERAND (def, op_stmt, iter, SSA_OP_DEF)
++	{
++	  live_track_process_def (live, def, conflict_graph);
++	}
++      FOR_EACH_SSA_TREE_OPERAND (use, op_stmt, iter, SSA_OP_USE)
++	{
++	  stack.safe_push (use);
++	  live_track_process_use (live, use);
++	}
++    }
++
++  process_add_graph (live, bb, conflict_graph);
++  delete_live_track (live);
++  remove_high_cost_graph_for_ccmp (conflict_graph);
++}
++
++/* Determine whether the ccmp conflict graph can be added, e.g.:
++
++   ;; basic block 3, loop depth 1
++   ;;    pred:       2
++   ;;                3
++   # ivtmp.5_10 = PHI <...>
++   _7 = b_4 (D) >= c_5 (D);
++   _8 = ivtmp.5_10 == 0;
++   _9 = _7 | _8;
++   ivtmp.5_11 = ivtmp.5_10 - 1;
++   if (_9 != 0)
++     goto <bb 4>; [10.70%]
++   else
++     goto <bb 3>; [89.30%]
++
++   In the above loop, the expressions will be replaced:
++
++   _7 replaced by b_4 (D) >= c_5 (D)
++   _8 replaced by ivtmp.5_10 == 0
++
++   If the current case wants to use the ccmp instruction, then
++
++   _9 can be replaced by _7 | _8
++
++   This requires that ivtmp.5_11 and ivtmp.5_10 be placed in different
++   partitions, which is what this function achieves.  */
++
++static void
++determine_add_ccmp_conflict_graph (basic_block bb, tree_live_info_p liveinfo,
++				   var_map map, ssa_conflicts *graph)
++{
++  if (!flag_ccmp2 || !targetm.gen_ccmp_first || !check_ccmp_candidate (bb))
++    return;
++  for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
++       gsi_next (&bsi))
++    {
++      gimple_set_visited (gsi_stmt (bsi), false);
++    }
++  ssa_conflicts *ccmp_conflict_graph;
++  ccmp_conflict_graph = ssa_conflicts_new (num_var_partitions (map));
++  add_ccmp_conflict_graph (ccmp_conflict_graph, liveinfo, map, bb);
++  unsigned x;
++  bitmap b;
++  if (ccmp_conflict_graph)
++    {
++      FOR_EACH_VEC_ELT (ccmp_conflict_graph->conflicts, x, b)
++	{
++	  if (!b)
++	    continue;
++	  unsigned y = bitmap_first_set_bit (b);
++	  if (!graph->conflicts[x] || !bitmap_bit_p (graph->conflicts[x], y))
++	    {
++	      ssa_conflicts_add (graph, x, y);
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "potential ccmp: add additional "
++			   "conflict-ssa : bb[%d] %d:%d\n",
++			   bb->index, x, y);
++		}
++	    }
++	}
++    }
++  ssa_conflicts_delete (ccmp_conflict_graph);
++}
+ 
+ /* Build a conflict graph based on LIVEINFO.  Any partitions which are in the
+    partition view of the var_map liveinfo is based on get entries in the
+@@ -938,6 +1133,8 @@ build_ssa_conflict_graph (tree_live_info_p liveinfo)
+ 	      live_track_process_use (live, var);
+ 	    }
+ 
++	  determine_add_ccmp_conflict_graph (bb, liveinfo, map, graph);
++
+ 	  /* If result of a PHI is unused, looping over the statements will not
+ 	     record any conflicts since the def was never live.  Since the PHI node
+ 	     is going to be translated out of SSA form, it will insert a copy.
+-- 
+2.27.0.windows.1
+
diff --git a/0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch b/0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch
new file mode 100644
index 0000000000000000000000000000000000000000..41d01f763824d13eaff6260543a19fc59b7fe6af
--- /dev/null
+++ b/0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch
@@ -0,0 +1,1115 @@
+From 3c06a2cda7220a48866ae2dbe3f365e300cbaeca Mon Sep 17 00:00:00 2001
+From: liyancheng <412998149@qq.com>
+Date: Wed, 1 Jun 2022 17:22:12 +0800
+Subject: [PATCH 02/12] [StructReorg] Refactoring reorder fields to struct
+ layout optimization
+
+Refactor the reorder-fields optimization into the struct layout
+optimization.  Add flag -fipa-struct-reorg=[0,1,2] to enable none,
+struct-reorg, and reorder-fields optimizations, respectively.
+---
+ gcc/common.opt                                |   6 +-
+ gcc/ipa-struct-reorg/ipa-struct-reorg.c       | 167 +++++++++---------
+ gcc/ipa-struct-reorg/ipa-struct-reorg.h       |   2 +-
+ gcc/opts.c                                    |  12 ++
+ gcc/passes.def                                |   2 +-
+ gcc/symbol-summary.h                          |   4 +-
+ .../struct/rf_DTE_struct_instance_field.c     |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c   |   2 +-
+ .../gcc.dg/struct/rf_check_ptr_layers_bug.c   |   2 +-
+ .../gcc.dg/struct/rf_create_fields_bug.c      |   2 +-
+ .../gcc.dg/struct/rf_create_new_func_bug.c    |   2 +-
+ .../gcc.dg/struct/rf_ele_minus_verify.c       |   2 +-
+ .../gcc.dg/struct/rf_escape_by_base.c         |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c |   2 +-
+ .../gcc.dg/struct/rf_mem_ref_offset.c         |   2 +-
+ .../struct/rf_mul_layer_ptr_record_bug.c      |   2 +-
+ .../gcc.dg/struct/rf_pass_conflict.c          |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c     |   2 +-
+ .../gcc.dg/struct/rf_ptr_negate_expr.c        |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c   |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c      |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c  |   2 +-
+ .../gcc.dg/struct/rf_rescusive_type.c         |   2 +-
+ .../struct/rf_rewrite_assign_more_cmp.c       |   2 +-
+ .../gcc.dg/struct/rf_rewrite_cond_bug.c       |   2 +-
+ .../gcc.dg/struct/rf_rewrite_cond_more_cmp.c  |   2 +-
+ .../gcc.dg/struct/rf_rewrite_phi_bug.c        |   2 +-
+ gcc/testsuite/gcc.dg/struct/rf_visible_func.c |   2 +-
+ .../gcc.dg/struct/rf_void_ptr_param_func.c    |   2 +-
+ .../gcc.dg/struct/sr_pointer_minus.c          |   2 +-
+ gcc/testsuite/gcc.dg/struct/struct-reorg.exp  |  19 ++
+ gcc/timevar.def                               |   2 +-
+ gcc/tree-pass.h                               |   2 +-
+ gcc/tree.c                                    |   4 +-
+ 35 files changed, 153 insertions(+), 117 deletions(-)
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 4dd566def..7fc075d35 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1876,13 +1876,17 @@ Common Ignore
+ Does nothing. Preserved for backward compatibility.
+ 
+ fipa-reorder-fields
+-Common Report Var(flag_ipa_reorder_fields) Init(0) Optimization
++Common Report Var(flag_ipa_struct_layout) Init(0) Optimization
+ Perform structure fields reorder optimizations.
+ 
+ fipa-struct-reorg
+ Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization
+ Perform structure layout optimizations.
+ 
++fipa-struct-reorg=
++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 2)
++-fipa-struct-reorg=[0,1,2] adding none, struct-reorg, reorder-fields optimizations.
++
+ fipa-extend-auto-profile
+ Common Report Var(flag_ipa_extend_auto_profile)
+ Use sample profile information for source code.
+diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c
+index 2bf41e0d8..9214ee74a 100644
+--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c
++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c
+@@ -235,7 +235,7 @@ enum srmode
+ {
+   NORMAL = 0,
+   COMPLETE_STRUCT_RELAYOUT,
+-  STRUCT_REORDER_FIELDS
++  STRUCT_LAYOUT_OPTIMIZE
+ };
+ 
+ static bool is_result_of_mult (tree arg, tree *num, tree struct_size);
+@@ -552,7 +552,7 @@ void
+ srtype::simple_dump (FILE *f)
+ {
+   print_generic_expr (f, type);
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       fprintf (f, "(%d)", TYPE_UID (type));
+     }
+@@ -593,9 +593,9 @@ srfield::create_new_fields (tree newtype[max_split],
+ 			    tree newfields[max_split],
+ 			    tree newlast[max_split])
+ {
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+-      create_new_reorder_fields (newtype, newfields, newlast);
++      create_new_optimized_fields (newtype, newfields, newlast);
+       return;
+     }
+ 
+@@ -689,15 +689,15 @@ srfield::reorder_fields (tree newfields[max_split], tree newlast[max_split],
+     }
+ }
+ 
+-/* Create the new reorder fields for this field.
++/* Create the new optimized fields for this field.
+    newtype[max_split]: srtype's member variable,
+    newfields[max_split]: created by create_new_type func,
+    newlast[max_split]: created by create_new_type func.  */
+ 
+ void
+-srfield::create_new_reorder_fields (tree newtype[max_split],
+-				    tree newfields[max_split],
+-				    tree newlast[max_split])
++srfield::create_new_optimized_fields (tree newtype[max_split],
++				      tree newfields[max_split],
++				      tree newlast[max_split])
+ {
+   /* newtype, corresponding to newtype[max_split] in srtype.  */
+   tree nt = NULL_TREE;
+@@ -794,7 +794,7 @@ srtype::create_new_type (void)
+      we are not splitting the struct into two clusters,
+      then just return false and don't change the type.  */
+   if (!createnewtype && maxclusters == 0
+-      && current_mode != STRUCT_REORDER_FIELDS)
++      && current_mode != STRUCT_LAYOUT_OPTIMIZE)
+     {
+       newtype[0] = type;
+       return false;
+@@ -822,8 +822,8 @@ srtype::create_new_type (void)
+       sprintf(id, "%d", i);
+       if (tname)
+ 	{
+-	  name = concat (tname, current_mode == STRUCT_REORDER_FIELDS
+-			 ? ".reorder." : ".reorg.", id, NULL);
++	  name = concat (tname, current_mode == STRUCT_LAYOUT_OPTIMIZE
++			 ? ".slo." : ".reorg.", id, NULL);
+ 	  TYPE_NAME (newtype[i]) = build_decl (UNKNOWN_LOCATION, TYPE_DECL,
+ 					       get_identifier (name), newtype[i]);
+ 	  free (name);
+@@ -969,8 +969,8 @@ srfunction::create_new_decls (void)
+ 	  sprintf(id, "%d", j);
+ 	  if (tname)
+ 	    {
+-	      name = concat (tname, current_mode == STRUCT_REORDER_FIELDS
+-			     ? ".reorder." : ".reorg.", id, NULL);
++	      name = concat (tname, current_mode == STRUCT_LAYOUT_OPTIMIZE
++			     ? ".slo." : ".reorg.", id, NULL);
+ 	      new_name = get_identifier (name);
+ 	      free (name);
+ 	    }
+@@ -2718,7 +2718,7 @@ escape_type escape_type_volatile_array_or_ptrptr (tree type)
+     return escape_volatile;
+   if (isarraytype (type))
+     return escape_array;
+-  if (isptrptr (type) && (current_mode != STRUCT_REORDER_FIELDS))
++  if (isptrptr (type) && (current_mode != STRUCT_LAYOUT_OPTIMIZE))
+     return escape_ptr_ptr;
+   return does_not_escape;
+ }
+@@ -2740,13 +2740,13 @@ ipa_struct_reorg::record_field_type (tree field, srtype *base_srtype)
+       field_srtype->add_field_site (field_srfield);
+     }
+   if (field_srtype == base_srtype && current_mode != COMPLETE_STRUCT_RELAYOUT
+-      && current_mode != STRUCT_REORDER_FIELDS)
++      && current_mode != STRUCT_LAYOUT_OPTIMIZE)
+     {
+       base_srtype->mark_escape (escape_rescusive_type, NULL);
+     }
+   /* Types of non-pointer field are difficult to track the correctness
+      of the rewrite when it used by the escaped type.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE
+       && TREE_CODE (field_type) == RECORD_TYPE)
+     {
+       field_srtype->mark_escape (escape_instance_field, NULL);
+@@ -2781,7 +2781,7 @@ ipa_struct_reorg::record_struct_field_types (tree base_type,
+     }
+   /* Types of non-pointer field are difficult to track the correctness
+      of the rewrite when it used by the escaped type.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE
+       && TREE_CODE (field_type) == RECORD_TYPE)
+     {
+       base_srtype->mark_escape (escape_instance_field, NULL);
+@@ -2966,7 +2966,7 @@ ipa_struct_reorg::record_var (tree decl, escape_type escapes, int arg)
+       /* Separate instance is hard to trace in complete struct
+ 	 relayout optimization.  */
+       if ((current_mode == COMPLETE_STRUCT_RELAYOUT
+-	   || current_mode == STRUCT_REORDER_FIELDS)
++	   || current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	  && TREE_CODE (TREE_TYPE (decl)) == RECORD_TYPE)
+ 	{
+ 	  e = escape_separate_instance;
+@@ -3071,7 +3071,7 @@ ipa_struct_reorg::find_vars (gimple *stmt)
+ 	/* Add a safe func mechanism.  */
+ 	bool l_find = true;
+ 	bool r_find = true;
+-	if (current_mode == STRUCT_REORDER_FIELDS)
++	if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	  {
+ 	    l_find = !(current_function->is_safe_func
+ 		       && TREE_CODE (lhs) == SSA_NAME
+@@ -3117,7 +3117,7 @@ ipa_struct_reorg::find_vars (gimple *stmt)
+ 	      }
+ 	  }
+       }
+-      else if ((current_mode == STRUCT_REORDER_FIELDS)
++      else if ((current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	       && (gimple_assign_rhs_code (stmt) == LE_EXPR
+ 		   || gimple_assign_rhs_code (stmt) == LT_EXPR
+ 		   || gimple_assign_rhs_code (stmt) == GE_EXPR
+@@ -3128,7 +3128,7 @@
+ 	  find_var (gimple_assign_rhs2 (stmt), stmt);
+ 	}
+       /* find void ssa_name from stmt such as: _2 = _1 - old_arcs_1.  */
+-      else if ((current_mode == STRUCT_REORDER_FIELDS)
++      else if ((current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	       && gimple_assign_rhs_code (stmt) == POINTER_DIFF_EXPR
+ 	       && types_compatible_p (
+ 		  TYPE_MAIN_VARIANT (TREE_TYPE (gimple_assign_rhs1 (stmt))),
+@@ -3391,11 +3391,12 @@ is_result_of_mult (tree arg, tree *num, tree struct_size)
+ 	  arg = gimple_assign_rhs1 (size_def_stmt);
+ 	  size_def_stmt = SSA_NAME_DEF_STMT (arg);
+ 	}
+-      else if (rhs_code == NEGATE_EXPR && current_mode == STRUCT_REORDER_FIELDS)
++      else if (rhs_code == NEGATE_EXPR
++	       && current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  return trace_calculate_negate (size_def_stmt, num, struct_size);
+ 	}
+-      else if (rhs_code == NOP_EXPR && current_mode == STRUCT_REORDER_FIELDS)
++      else if (rhs_code == NOP_EXPR && current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  return trace_calculate_diff (size_def_stmt, num);
+ 	}
+@@ -3415,7 +3416,7 @@
+ bool
+ ipa_struct_reorg::handled_allocation_stmt (gimple *stmt)
+ {
+-  if ((current_mode == STRUCT_REORDER_FIELDS)
++  if ((current_mode == STRUCT_LAYOUT_OPTIMIZE)
+       && (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)
+ 	  || gimple_call_builtin_p (stmt, BUILT_IN_MALLOC)
+ 	  || gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)))
+@@ -3548,7 +3549,7 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple
+   /* x_1 = y.x_nodes; void *x;
+      Directly mark the structure pointer type assigned
+      to the void* variable as escape.  */
+-  else if (current_mode == STRUCT_REORDER_FIELDS
++  else if (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 	   && TREE_CODE (side) == SSA_NAME
+ 	   && VOID_POINTER_P (TREE_TYPE (side))
+ 	   && SSA_NAME_VAR (side)
+@@ -3815,7 +3816,7 @@ ipa_struct_reorg::get_type_field (tree expr, tree &base, bool &indirect,
+      and doesn't mark escape follow.).  */
+   /* _1 = MEM[(struct arc_t * *)a_1].
+      then base a_1: ssa_name - pointer_type - integer_type.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       bool is_int_ptr = POINTER_TYPE_P (TREE_TYPE (base))
+ 			&& (TREE_CODE (inner_type (TREE_TYPE (base)))
+@@ -4031,7 +4032,7 @@ ipa_struct_reorg::maybe_record_call (cgraph_node *node, gcall *stmt)
+       /* callee_func (_1, _2);
+ 	 Check the callee func, instead of current func.  */
+       if (!(free_or_realloc
+-	    || (current_mode == STRUCT_REORDER_FIELDS
++	    || (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 		&& safe_functions.contains (
+ 		     node->get_edge (stmt)->callee)))
+ 	  && VOID_POINTER_P (argtypet))
+@@ -4063,9 +4064,9 @@ ipa_struct_reorg::record_stmt_expr (tree expr, cgraph_node *node, gimple *stmt)
+ 			realpart, imagpart, address, escape_from_base))
+     return;
+ 
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+-      if (!opt_for_fn (current_function_decl, flag_ipa_reorder_fields))
++      if (!opt_for_fn (current_function_decl, flag_ipa_struct_layout))
+ 	{
+ 	  type->mark_escape (escape_non_optimize, stmt);
+ 	}
+@@ -4287,7 +4288,7 @@ ipa_struct_reorg::check_definition_call (srdecl *decl, vec<srdecl*> &worklist)
+       check_type_and_push (gimple_call_arg (stmt, 0), decl, worklist, stmt);
+     }
+ 
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       if (!handled_allocation_stmt (stmt))
+ 	{
+@@ -4341,7 +4342,7 @@ ipa_struct_reorg::check_definition (srdecl *decl, vec<srdecl*> &worklist)
+ 	}
+       return;
+     }
+-  if (current_mode == STRUCT_REORDER_FIELDS && SSA_NAME_VAR (ssa_name)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE && SSA_NAME_VAR (ssa_name)
+       && VOID_POINTER_P (TREE_TYPE (SSA_NAME_VAR (ssa_name))))
+     {
+       type->mark_escape (escape_cast_void, SSA_NAME_DEF_STMT (ssa_name));
+@@ -4442,7 +4443,7 @@ ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec<
+   if (!get_type_field (other, base, indirect, type1, field,
+ 		       realpart, imagpart, address, escape_from_base))
+     {
+-      if (current_mode == STRUCT_REORDER_FIELDS)
++      if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  /* release INTEGER_TYPE cast to struct pointer.  */
+ 	  bool cast_from_int_ptr = current_function->is_safe_func && base
+@@ -4498,7 +4499,7 @@ get_base (tree &base, tree expr)
+ void
+ ipa_struct_reorg::check_ptr_layers (tree a_expr, tree b_expr, gimple* stmt)
+ {
+-  if (current_mode != STRUCT_REORDER_FIELDS || current_function->is_safe_func
++  if (current_mode != STRUCT_LAYOUT_OPTIMIZE || current_function->is_safe_func
+       || !(POINTER_TYPE_P (TREE_TYPE (a_expr)))
+       || !(POINTER_TYPE_P (TREE_TYPE (b_expr)))
+       || !handled_type (TREE_TYPE (a_expr))
+@@ -4579,7 +4580,7 @@ ipa_struct_reorg::check_use (srdecl *decl, gimple *stmt, vec<srdecl*> &worklist)
+ 	   && (code != EQ_EXPR && code != NE_EXPR
+ 	       && code != LT_EXPR && code != LE_EXPR
+ 	       && code != GT_EXPR && code != GE_EXPR))
+-	  || (current_mode == STRUCT_REORDER_FIELDS
++	  || (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 	      && (code != EQ_EXPR && code != NE_EXPR
+ 		  && code != LT_EXPR && code != LE_EXPR
+ 		  && code != GT_EXPR && code != GE_EXPR)))
+@@ -4618,7 +4619,7 @@ ipa_struct_reorg::check_use (srdecl *decl, gimple *stmt, vec<srdecl*> &worklist)
+ 	   && (code != EQ_EXPR && code != NE_EXPR
+ 	       && code != LT_EXPR && code != LE_EXPR
+ 	       && code != GT_EXPR && code != GE_EXPR))
+-	  || (current_mode == STRUCT_REORDER_FIELDS
++	  || (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 	      && (code != EQ_EXPR && code != NE_EXPR
+ 		  && code != LT_EXPR && code != LE_EXPR
+ 		  && code != GT_EXPR && code != GE_EXPR)))
+@@ -4740,11 +4741,11 @@ ipa_struct_reorg::record_function (cgraph_node *node)
+     escapes = escape_marked_as_used;
+   else if (!node->local)
+     {
+-      if (current_mode != STRUCT_REORDER_FIELDS)
++      if (current_mode != STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  escapes = escape_visible_function;
+ 	}
+-      if (current_mode == STRUCT_REORDER_FIELDS && node->externally_visible)
++      if (current_mode == STRUCT_LAYOUT_OPTIMIZE && node->externally_visible)
+ 	{
+ 	  escapes = escape_visible_function;
+ 	}
+@@ -4754,9 +4755,9 @@ ipa_struct_reorg::record_function (cgraph_node *node)
+   else if (!tree_versionable_function_p (node->decl))
+     escapes = escape_noclonable_function;
+ 
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+-      if (!opt_for_fn (node->decl, flag_ipa_reorder_fields))
++      if (!opt_for_fn (node->decl, flag_ipa_struct_layout))
+ 	{
+ 	  escapes = escape_non_optimize;
+ 	}
+@@ -4773,7 +4774,7 @@ ipa_struct_reorg::record_function (cgraph_node *node)
+   gimple_stmt_iterator si;
+ 
+   /* Add a safe func mechanism.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       current_function->is_safe_func = safe_functions.contains (node);
+       if (dump_file)
+@@ -4989,7 +4990,7 @@ ipa_struct_reorg::record_accesses (void)
+     }
+ 
+   /* Add a safe func mechanism.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       record_safe_func_with_void_ptr_parm ();
+     }
+@@ -5188,7 +5189,7 @@ void
+ ipa_struct_reorg::prune_escaped_types (void)
+ {
+   if (current_mode != COMPLETE_STRUCT_RELAYOUT
+-      && current_mode != STRUCT_REORDER_FIELDS)
++      && current_mode != STRUCT_LAYOUT_OPTIMIZE)
+     {
+       /* Detect recusive types and mark them as escaping.  */
+       detect_cycles ();
+@@ -5196,7 +5197,7 @@ ipa_struct_reorg::prune_escaped_types (void)
+ 	 mark them as escaping.  */
+       propagate_escape ();
+     }
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       propagate_escape_via_original ();
+       propagate_escape_via_empty_with_no_original ();
+@@ -5256,7 +5257,7 @@ ipa_struct_reorg::prune_escaped_types (void)
+ 	  if (function->args.is_empty ()
+ 	      && function->decls.is_empty ()
+ 	      && function->globals.is_empty ()
+-	      && current_mode != STRUCT_REORDER_FIELDS)
++	      && current_mode != STRUCT_LAYOUT_OPTIMIZE)
+ 	    {
+ 	      delete function;
+ 	      functions.ordered_remove (i);
+@@ -5281,10 +5282,10 @@ ipa_struct_reorg::prune_escaped_types (void)
+ 
+   /* Prune types that escape, all references to those types
+      will have been removed in the above loops.  */
+-  /* The escape type is not deleted in STRUCT_REORDER_FIELDS,
++  /* The escape type is not deleted in STRUCT_LAYOUT_OPTIMIZE,
+      Then the type that contains the escaped type fields
+      can find complete information.  */
+-  if (current_mode != STRUCT_REORDER_FIELDS)
++  if (current_mode != STRUCT_LAYOUT_OPTIMIZE)
+     {
+       for (unsigned i = 0; i < types.length ();)
+ 	{
+@@ -5334,7 +5335,7 @@ ipa_struct_reorg::create_new_types (void)
+   for (unsigned i = 0; i < types.length (); i++)
+     newtypes += types[i]->create_new_type ();
+ 
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       for (unsigned i = 0; i < types.length (); i++)
+ 	{
+@@ -5458,8 +5459,8 @@ ipa_struct_reorg::create_new_args (cgraph_node *new_node)
+       char *name = NULL;
+       if (tname)
+ 	{
+-	  name = concat (tname, current_mode == STRUCT_REORDER_FIELDS
+-			 ? ".reorder.0" : ".reorg.0", NULL);
++	  name = concat (tname, current_mode == STRUCT_LAYOUT_OPTIMIZE
++			 ? ".slo.0" : ".reorg.0", NULL);
+ 	  new_name = get_identifier (name);
+ 	  free (name);
+ 	}
+@@ -5547,8 +5548,8 @@ ipa_struct_reorg::create_new_functions (void)
+       statistics_counter_event (NULL, "Create new function", 1);
+       new_node = node->create_version_clone_with_body (
+ 		   vNULL, NULL, NULL, NULL, NULL,
+-		   current_mode == STRUCT_REORDER_FIELDS
+-		   ? "struct_reorder" : "struct_reorg");
++		   current_mode == STRUCT_LAYOUT_OPTIMIZE
++		   ? "slo" : "struct_reorg");
+       new_node->can_change_signature = node->can_change_signature;
+       new_node->make_local ();
+       f->newnode = new_node;
+@@ -5666,13 +5667,13 @@ ipa_struct_reorg::rewrite_expr (tree expr, tree newexpr[max_split], bool ignore_
+     newbase1 = build_fold_addr_expr (newbase1);
+   if (indirect)
+     {
+-      if (current_mode == STRUCT_REORDER_FIELDS)
++      if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  /* Supports the MEM_REF offset.
+ 	     _1 = MEM[(struct arc *)ap_1 + 72B].flow;
+-	     Old rewrite: _1 = ap.reorder.0_8->flow;
++	     Old rewrite: _1 = ap.slo.0_8->flow;
+ 	     New rewrite: _1
+-	       = MEM[(struct arc.reorder.0 *)ap.reorder.0_8 + 64B].flow;
++	       = MEM[(struct arc.slo.0 *)ap.slo.0_8 + 64B].flow;
+ 	  */
+ 	  HOST_WIDE_INT offset_tmp = 0;
+ 	  HOST_WIDE_INT mem_offset = 0;
+@@ -5738,10 +5739,10 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
+       return remove;
+     }
+ 
+-  if ((current_mode != STRUCT_REORDER_FIELDS
++  if ((current_mode != STRUCT_LAYOUT_OPTIMIZE
+        && (gimple_assign_rhs_code (stmt) == EQ_EXPR
+ 	   || gimple_assign_rhs_code (stmt) == NE_EXPR))
+-      || (current_mode == STRUCT_REORDER_FIELDS
++      || (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 	  && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
+ 	      == tcc_comparison)))
+     {
+@@ -5751,7 +5752,7 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
+       tree newrhs2[max_split];
+       tree_code rhs_code = gimple_assign_rhs_code (stmt);
+       tree_code code = rhs_code == EQ_EXPR ? BIT_AND_EXPR : BIT_IOR_EXPR;
+-      if (current_mode == STRUCT_REORDER_FIELDS
++      if (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 	  && rhs_code != EQ_EXPR && rhs_code != NE_EXPR)
+ 	{
+ 	  code = rhs_code;
+@@ -5798,8 +5799,8 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
+ 	 _6 = _4 + _5;
+ 	 _5 = (long unsigned int) _3;
+ 	 _3 = _1 - old_2.  */
+-      if (current_mode != STRUCT_REORDER_FIELDS
+-	  || (current_mode == STRUCT_REORDER_FIELDS && (num != NULL)))
++      if (current_mode != STRUCT_LAYOUT_OPTIMIZE
++	  || (current_mode == STRUCT_LAYOUT_OPTIMIZE && (num != NULL)))
+ 	{
+ 	  num = gimplify_build1 (gsi, NOP_EXPR, sizetype, num);
+ 	}
+@@ -5827,7 +5828,7 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
+     }
+ 
+   /* Support POINTER_DIFF_EXPR rewriting.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE
+       && gimple_assign_rhs_code (stmt) == POINTER_DIFF_EXPR)
+     {
+       tree rhs1 = gimple_assign_rhs1 (stmt);
+@@ -6014,7 +6015,7 @@ ipa_struct_reorg::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi)
+   srfunction *f = find_function (node);
+ 
+   /* Add a safe func mechanism.  */
+-  if (current_mode == STRUCT_REORDER_FIELDS && f && f->is_safe_func)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE && f && f->is_safe_func)
+     {
+       tree expr = gimple_call_arg (stmt, 0);
+       tree newexpr[max_split];
+@@ -6141,9 +6142,9 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi)
+   tree_code rhs_code = gimple_cond_code (stmt);
+ 
+   /* Handle only equals or not equals conditionals.  */
+-  if ((current_mode != STRUCT_REORDER_FIELDS
++  if ((current_mode != STRUCT_LAYOUT_OPTIMIZE
+        && (rhs_code != EQ_EXPR && rhs_code != NE_EXPR))
+-      || (current_mode == STRUCT_REORDER_FIELDS
++      || (current_mode == STRUCT_LAYOUT_OPTIMIZE
+ 	  && TREE_CODE_CLASS (rhs_code) != tcc_comparison))
+     return false;
+   tree lhs = gimple_cond_lhs (stmt);
+@@ -6171,10 +6172,10 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi)
+     }
+ 
+   /* Old rewrite: if (x_1 != 0B)
+-		    -> _1 = x.reorder.0_1 != 0B; if (_1 != 1)
++		    -> _1 = x.slo.0_1 != 0B; if (_1 != 1)
+      The logic is incorrect.
+      New rewrite: if (x_1 != 0B)
+-		    -> if (x.reorder.0_1 != 0B);*/
++		    -> if (x.slo.0_1 != 0B);*/
+   for (unsigned i = 0; i < max_split && (newlhs[i] || newrhs[i]); i++)
+     {
+       if (newlhs[i])
+@@ -6203,7 +6204,7 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi)
+ bool
+ ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *)
+ {
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       /* Delete debug gimple now.  */
+       return true;
+@@ -6367,7 +6368,7 @@ ipa_struct_reorg::rewrite_functions (void)
+      then don't rewrite any accesses.  */
+   if (!create_new_types ())
+     {
+-      if (current_mode == STRUCT_REORDER_FIELDS)
++      if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  for (unsigned i = 0; i < functions.length (); i++)
+ 	    {
+@@ -6386,7 +6387,7 @@ ipa_struct_reorg::rewrite_functions (void)
+       return 0;
+     }
+ 
+-  if (current_mode == STRUCT_REORDER_FIELDS && dump_file)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE && dump_file)
+     {
+       fprintf (dump_file, "=========== all created newtypes: ===========\n\n");
+       dump_newtypes (dump_file);
+@@ -6396,13 +6397,13 @@ ipa_struct_reorg::rewrite_functions (void)
+     {
+       retval = TODO_remove_functions;
+       create_new_functions ();
+-      if (current_mode == STRUCT_REORDER_FIELDS)
++      if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+ 	{
+ 	  prune_escaped_types ();
+ 	}
+     }
+ 
+-  if (current_mode == STRUCT_REORDER_FIELDS)
++  if (current_mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       for (unsigned i = 0; i < functions.length (); i++)
+ 	{
+@@ -6572,7 +6573,7 @@ ipa_struct_reorg::execute (enum srmode mode)
+ {
+   unsigned int ret = 0;
+ 
+-  if (mode == NORMAL || mode == STRUCT_REORDER_FIELDS)
++  if (mode == NORMAL || mode == STRUCT_LAYOUT_OPTIMIZE)
+     {
+       current_mode = mode;
+       /* If there is a top-level inline-asm,
+@@ -6660,12 +6661,12 @@ pass_ipa_struct_reorg::gate (function *)
+ 	  && (in_lto_p || flag_whole_program));
+ }
+ 
+-const pass_data pass_data_ipa_reorder_fields =
++const pass_data pass_data_ipa_struct_layout =
+ {
+   SIMPLE_IPA_PASS, // type
+-  "reorder_fields", // name
++  "struct_layout", // name
+   OPTGROUP_NONE, // optinfo_flags
+-  TV_IPA_REORDER_FIELDS, // tv_id
++  TV_IPA_STRUCT_LAYOUT, // tv_id
+   0, // properties_required
+   0, // properties_provided
+   0, // properties_destroyed
+@@ -6673,11 +6674,11 @@ const pass_data pass_data_ipa_reorder_fields =
+   0, // todo_flags_finish
+ };
+ 
+-class pass_ipa_reorder_fields : public simple_ipa_opt_pass
++class pass_ipa_struct_layout : public simple_ipa_opt_pass
+ {
+ public:
+-  pass_ipa_reorder_fields (gcc::context *ctxt)
+-    : simple_ipa_opt_pass (pass_data_ipa_reorder_fields, ctxt)
++  pass_ipa_struct_layout (gcc::context *ctxt)
++    : simple_ipa_opt_pass (pass_data_ipa_struct_layout, ctxt)
+   {}
+ 
+   /* opt_pass methods: */
+@@ -6685,17 +6686,17 @@ public:
+   virtual unsigned int execute (function *)
+   {
+     unsigned int ret = 0;
+-    ret = ipa_struct_reorg ().execute (STRUCT_REORDER_FIELDS);
++    ret = ipa_struct_reorg ().execute (STRUCT_LAYOUT_OPTIMIZE);
+     return ret;
+   }
+ 
+-}; // class pass_ipa_reorder_fields
++}; // class pass_ipa_struct_layout
+ 
+ bool
+-pass_ipa_reorder_fields::gate (function *)
++pass_ipa_struct_layout::gate (function *)
+ {
+   return (optimize >= 3
+-	  && flag_ipa_reorder_fields
++	  && flag_ipa_struct_layout
+ 	  /* Don't bother doing anything if the program has errors.  */
+ 	  && !seen_error ()
+ 	  && flag_lto_partition == LTO_PARTITION_ONE
+@@ -6715,7 +6716,7 @@ make_pass_ipa_struct_reorg (gcc::context *ctxt)
+ }
+ 
+ simple_ipa_opt_pass *
+-make_pass_ipa_reorder_fields (gcc::context *ctxt)
++make_pass_ipa_struct_layout (gcc::context *ctxt)
+ {
+-  return new pass_ipa_reorder_fields (ctxt);
++  return new pass_ipa_struct_layout (ctxt);
+ }
+diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h
+index 8fb6ce9c4..54b0dc655 100644
+--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h
++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h
+@@ -187,7 +187,7 @@ struct srfield
+ 			  tree newlast[max_split]);
+   void reorder_fields (tree newfields[max_split], tree newlast[max_split],
+ 		       tree &field);
+-  void create_new_reorder_fields (tree newtype[max_split],
++  void create_new_optimized_fields (tree newtype[max_split],
+ 				  tree newfields[max_split],
+ 				  tree newlast[max_split]);
+ };
+diff --git a/gcc/opts.c b/gcc/opts.c
+index 479d726df..c3877c24e 100644
+--- a/gcc/opts.c
++++ b/gcc/opts.c
+@@ -2695,6 +2695,18 @@ common_handle_option (struct gcc_options *opts,
+       }
+       break;
+ 
++    case OPT_fipa_struct_reorg_:
++      opts->x_struct_layout_optimize_level = value;
++      if (value > 1)
++	{
++	  SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_layout, value);
++	}
++      /* No break here - do -fipa-struct-reorg processing.  */
++      /* FALLTHRU.  */
++    case OPT_fipa_struct_reorg:
++      opts->x_flag_ipa_struct_reorg = value;
++      break;
++
+     case OPT_fprofile_generate_:
+       opts->x_profile_data_prefix = xstrdup (arg);
+       value = true;
+diff --git a/gcc/passes.def b/gcc/passes.def
+index e9c91d26e..eea4d7808 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -174,7 +174,7 @@ along with GCC; see the file COPYING3.  If not see
+   INSERT_PASSES_AFTER (all_late_ipa_passes)
+   NEXT_PASS (pass_materialize_all_clones);
+   NEXT_PASS (pass_ipa_pta);
+-  NEXT_PASS (pass_ipa_reorder_fields);
++  NEXT_PASS (pass_ipa_struct_layout);
+   /* FIXME: this should a normal IP pass */
+   NEXT_PASS (pass_ipa_struct_reorg);
+   NEXT_PASS (pass_omp_simd_clone);
+diff --git a/gcc/symbol-summary.h b/gcc/symbol-summary.h
+index ddf5e3577..f62222a96 100644
+--- a/gcc/symbol-summary.h
++++ b/gcc/symbol-summary.h
+@@ -61,7 +61,7 @@ protected:
+   {
+     /* In structure optimizatons, we call new to ensure that
+        the allocated memory is initialized to 0.  */
+-    if (flag_ipa_reorder_fields || flag_ipa_struct_reorg)
++    if (flag_ipa_struct_layout || flag_ipa_struct_reorg)
+       return is_ggc () ? new (ggc_internal_alloc (sizeof (T))) T ()
+ 			: new T ();
+     /* Call gcc_internal_because we do not want to call finalizer for
+@@ -77,7 +77,7 @@ protected:
+       ggc_delete (item);
+     else
+       {
+-	if (flag_ipa_reorder_fields || flag_ipa_struct_reorg)
++	if (flag_ipa_struct_layout || flag_ipa_struct_reorg)
+ 	  delete item;
+ 	else
+ 	  m_allocator.remove (item);
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c b/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c
+index b95be2dab..882a695b0 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c
+@@ -72,4 +72,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c b/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c
+index 3d243313b..20ecee545 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c
+@@ -91,4 +91,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c b/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c
+index a5477dcc9..ad879fc11 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c
+@@ -21,4 +21,4 @@ main()
+ {
+   g();
+ }
+-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c b/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c
+index 886706ae9..f0c9d8f39 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c
+@@ -79,4 +79,4 @@ main()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c b/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c
+index f3785f392..fa5e6c2d0 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c
+@@ -53,4 +53,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c b/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c
+index 1415d759a..2966869e7 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c
+@@ -57,4 +57,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c b/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c
+index 003da0b57..b74b9e5e9 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c
+@@ -80,4 +80,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c b/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c
+index 10dcf098c..cf85c6109 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c
+@@ -69,4 +69,4 @@ main()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c b/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c
+index 8d1a9a114..61fd9f755 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c
+@@ -55,4 +55,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c b/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c
+index 23765fc56..2c115da02 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c
+@@ -27,4 +27,4 @@ main() {
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c b/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c
+index 54e737ee8..c7646d8b7 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c
+@@ -106,4 +106,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c b/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c
+index 2ae46fb31..01c000375 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c
+@@ -84,4 +84,4 @@ main ()
+   return cnt;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c
+index 3a3c10b70..f962163fe 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c
+@@ -68,4 +68,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c
+index 7b7d110df..6558b1797 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c
+@@ -52,4 +52,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c
+index 317aafa5f..6d528ed5b 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c
+@@ -31,4 +31,4 @@ main ()
+   printf (" Tree.\n");
+ }
+ 
+-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c
+index 01a33f669..e95cf2e5d 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c
+@@ -52,4 +52,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c
+index a38556533..cb4054522 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c
+@@ -55,4 +55,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c b/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c
+index 5c17ee528..38bddbae5 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c
+@@ -54,4 +54,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c
+index 710517ee9..86034f042 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c
+@@ -62,4 +62,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c
+index 6ed0a5d2d..aae7c4bc9 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c
+@@ -69,4 +69,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c
+index 5a2dd964f..8672e7552 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c
+@@ -55,4 +55,4 @@ main()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c
+index faa90b42d..2d67434a0 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c
+@@ -78,4 +78,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_visible_func.c b/gcc/testsuite/gcc.dg/struct/rf_visible_func.c
+index 8f2da99cc..a8cf2b63c 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_visible_func.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_visible_func.c
+@@ -89,4 +89,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c b/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c
+index 723142c59..b6cba3c34 100644
+--- a/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c
++++ b/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c
+@@ -51,4 +51,4 @@ main()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "reorder_fields" } } */
+\ No newline at end of file
++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_layout" } } */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c b/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c
+index 9a82da0d6..a0614a1ba 100644
+--- a/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c
++++ b/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c
+@@ -30,4 +30,4 @@ main ()
+   return 0;
+ }
+ 
+-/* { dg-final { scan-ipa-dump "struct node has escaped: \"Type escapes via a unhandled rewrite stmt\"" "struct_reorg" } } */
++/* { dg-final { scan-ipa-dump "has escaped: \"Type escapes via a unhandled rewrite stmt\"" "struct_reorg" } } */
+diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp
+index c8db4675f..67b3ac2d5 100644
+--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp
++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp
+@@ -47,6 +47,25 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout
+ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/rf*.c]] \
+ 	"" "-fipa-reorder-fields -fdump-ipa-all -flto-partition=one -fwhole-program"
+ 
++# -fipa-struct-reorg=1
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wo_prof_*.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_ratio_*.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_prof_*.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/struct_reorg*.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/sr_*.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/csr_*.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout.c]] \
++	"" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program"
++
++# -fipa-struct-reorg=2
++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
++	"" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program"
+ # All done.
+ torture-finish
+ dg-finish
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index e873747a8..b179f62bb 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -80,7 +80,7 @@ DEFTIMEVAR (TV_IPA_CONSTANT_PROP     , "ipa cp")
+ DEFTIMEVAR (TV_IPA_INLINING          , "ipa inlining heuristics")
+ DEFTIMEVAR (TV_IPA_FNSPLIT           , "ipa function splitting")
+ DEFTIMEVAR (TV_IPA_COMDATS           , "ipa comdats")
+-DEFTIMEVAR (TV_IPA_REORDER_FIELDS    , "ipa struct reorder fields optimization")
++DEFTIMEVAR (TV_IPA_STRUCT_LAYOUT     , "ipa struct layout optimization")
+ DEFTIMEVAR (TV_IPA_STRUCT_REORG      , "ipa struct reorg optimization")
+ DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile")
+ DEFTIMEVAR (TV_IPA_OPT               , "ipa various optimizations")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index be6387768..187f1a85c 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -509,7 +509,7 @@ extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt);
+-extern simple_ipa_opt_pass *make_pass_ipa_reorder_fields (gcc::context *ctxt);
++extern simple_ipa_opt_pass *make_pass_ipa_struct_layout (gcc::context *ctxt);
+ extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt);
+ extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context
+ 							       *ctxt);
+diff --git a/gcc/tree.c b/gcc/tree.c
+index 89fa469c3..c2075d735 100644
+--- a/gcc/tree.c
++++ b/gcc/tree.c
+@@ -5219,7 +5219,7 @@ fld_simplified_type_name (tree type)
+   /* Simplify type will cause that struct A and struct A within
+      struct B are different type pointers, so skip it in structure
+      optimizations.  */
+-  if (flag_ipa_reorder_fields || flag_ipa_struct_reorg)
++  if (flag_ipa_struct_layout || flag_ipa_struct_reorg)
+     return TYPE_NAME (type);
+ 
+   if (!TYPE_NAME (type) || TREE_CODE (TYPE_NAME (type)) != TYPE_DECL)
+@@ -5463,7 +5463,7 @@ fld_simplified_type (tree t, class free_lang_data_d *fld)
+   /* Simplify type will cause that struct A and struct A within
+      struct B are different type pointers, so skip it in structure
+      optimizations.  */
+-  if (flag_ipa_reorder_fields || flag_ipa_struct_reorg)
++  if (flag_ipa_struct_layout || flag_ipa_struct_reorg)
+     return t;
+   if (POINTER_TYPE_P (t))
+     return fld_incomplete_type_of (t, fld);
+-- 
+2.27.0.windows.1
+
diff --git a/0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch b/0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b30030e29b9cd3e9f2b7eeada2b55c01204eb832
--- /dev/null
+++ b/0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch
@@ -0,0 +1,83 @@
+From 897d637aec3b077eb9ef95b2f4a5f7656e36ebd6 Mon Sep 17 00:00:00 2001
+From: benniaobufeijiushiji
+Date: Wed, 15 Jun 2022 11:33:03 +0800
+Subject: [PATCH 03/12] [Backport] loop-invariant: Don't move cold bb
+ instructions to preheader in RTL
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=dc1969dab392661cdac1170bbb8c9f83f388580d
+
+When an inner loop is unlikely to execute, loop-invariant motion would
+move cold instructions to a hotter loop.  This patch adds profile-count
+checking to fix the problem.
+---
+ gcc/loop-invariant.c                    | 17 ++++++++++++++---
+ gcc/testsuite/gcc.dg/loop-invariant-2.c | 20 ++++++++++++++++++++
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/loop-invariant-2.c
+
+diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c
+index 37ae6549e..24b9bcb11 100644
+--- a/gcc/loop-invariant.c
++++ b/gcc/loop-invariant.c
+@@ -1184,9 +1184,21 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed)
+    call.  */
+ 
+ static void
+-find_invariants_bb (basic_block bb, bool always_reached, bool always_executed)
++find_invariants_bb (class loop *loop, basic_block bb, bool always_reached,
++		    bool always_executed)
+ {
+   rtx_insn *insn;
++  basic_block preheader = loop_preheader_edge (loop)->src;
++
++  /* Don't move insn of cold BB out of loop to preheader to reduce calculations
++     and register live range in hot loop with cold BB.  */
++  if (!always_executed && preheader->count > bb->count)
++    {
++      if (dump_file)
++	fprintf (dump_file, "Don't move invariant from bb: %d out of loop %d\n",
++		 bb->index, loop->num);
++      return;
++    }
+ 
+   FOR_BB_INSNS (bb, insn)
+     {
+@@ -1215,8 +1227,7 @@ find_invariants_body (class loop *loop, basic_block *body,
+   unsigned i;
+ 
+   for (i = 0; i < loop->num_nodes; i++)
+-    find_invariants_bb (body[i],
+-			bitmap_bit_p (always_reached, i),
++    find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i),
+ 			bitmap_bit_p (always_executed, i));
+ }
+ 
+diff --git a/gcc/testsuite/gcc.dg/loop-invariant-2.c b/gcc/testsuite/gcc.dg/loop-invariant-2.c
+new file mode 100644
+index 000000000..df3d84585
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/loop-invariant-2.c
+@@ -0,0 +1,20 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -fdump-rtl-loop2_invariant" } */
++
++volatile int x;
++void
++bar (int, char *, char *);
++void
++foo (int *a, int n, int k)
++{
++  int i;
++
++  for (i = 0; i < n; i++)
++    {
++      if (__builtin_expect (x, 0))
++	bar (k / 5, "one", "two");
++      a[i] = k;
++    }
++}
++
++/* { dg-final { scan-rtl-dump "Don't move invariant from bb: .*out of loop" "loop2_invariant" } } */
+-- 
+2.27.0.windows.1
+
diff --git a/0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch b/0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch
new file mode 100644
index 0000000000000000000000000000000000000000..6b913ae5b294b8fd3631526567728a7dea70b153
--- /dev/null
+++ b/0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch
@@ -0,0 +1,902 @@
+From edd4200e2b3e94d5c124900657b91c22dfe9c557 Mon Sep 17 00:00:00 2001
+From: Mingchuan Wu
+Date: Wed, 15 Jun 2022 16:00:25 +0800
+Subject: [PATCH 04/12] [DFE] Add Dead Field Elimination in Struct-Reorg.
+
+We can transform gimple to eliminate fields that are never read
+and remove their redundant stmts.
+We also adapted the partial escape_cast_another_ptr handling for
+struct relayout.
+Add flag -fipa-struct-reorg=3 to enable dead field elimination.
+---
+ gcc/common.opt                                |   4 +-
+ gcc/ipa-struct-reorg/ipa-struct-reorg.c       | 209 ++++++++++++++++--
+ gcc/ipa-struct-reorg/ipa-struct-reorg.h       |   9 +-
+ gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c  |  86 +++++++
+ .../gcc.dg/struct/dfe_ele_minus_verify.c      |  60 +++++
+ .../gcc.dg/struct/dfe_mem_ref_offset.c        |  58 +++++
+ .../struct/dfe_mul_layer_ptr_record_bug.c     |  30 +++
+ gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c    |  71 ++++++
+ .../gcc.dg/struct/dfe_ptr_negate_expr.c       |  55 +++++
+ gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c     |  55 +++++
+ gcc/testsuite/gcc.dg/struct/struct-reorg.exp  |  21 +-
+ 11 files changed, 639 insertions(+), 19 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 7fc075d35..b5ea3c7a1 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1884,8 +1884,8 @@ Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization
+ Perform structure layout optimizations.
+ + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 2) +--fipa-struct-reorg=[0,1,2] adding none, struct-reorg, reorder-fields optimizations. ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 3) ++-fipa-struct-reorg=[0,1,2,3] adding none, struct-reorg, reorder-fields, dfe optimizations. + + fipa-extend-auto-profile + Common Report Var(flag_ipa_extend_auto_profile) +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 9214ee74a..2fa560239 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see + #include "tree-pretty-print.h" + #include "gimple-pretty-print.h" + #include "gimple-iterator.h" ++#include "gimple-walk.h" + #include "cfg.h" + #include "ssa.h" + #include "tree-dfa.h" +@@ -238,11 +239,44 @@ enum srmode + STRUCT_LAYOUT_OPTIMIZE + }; + ++/* Enum the struct layout optimize level, ++ which should be the same as the option -fstruct-reorg=. */ ++ ++enum struct_layout_opt_level ++{ ++ NONE = 0, ++ STRUCT_REORG, ++ STRUCT_REORDER_FIELDS, ++ DEAD_FIELD_ELIMINATION ++}; ++ + static bool is_result_of_mult (tree arg, tree *num, tree struct_size); + bool isptrptr (tree type); + + srmode current_mode; + ++hash_map replace_type_map; ++ ++/* Return true if one of these types is created by struct-reorg. */ ++ ++static bool ++is_replace_type (tree type1, tree type2) ++{ ++ if (replace_type_map.is_empty ()) ++ return false; ++ if (type1 == NULL_TREE || type2 == NULL_TREE) ++ return false; ++ tree *type_value = replace_type_map.get (type1); ++ if (type_value) ++ if (types_compatible_p (*type_value, type2)) ++ return true; ++ type_value = replace_type_map.get (type2); ++ if (type_value) ++ if (types_compatible_p (*type_value, type1)) ++ return true; ++ return false; ++} ++ + } // anon namespace + + namespace struct_reorg { +@@ -318,12 +352,13 @@ srfunction::simple_dump (FILE *file) + /* Constructor of FIELD. */ + + srfield::srfield (tree field, srtype *base) +- : offset(int_byte_position (field)), ++ : offset (int_byte_position (field)), + fieldtype (TREE_TYPE (field)), + fielddecl (field), +- base(base), +- type(NULL), +- clusternum(0) ++ base (base), ++ type (NULL), ++ clusternum (0), ++ field_access (EMPTY_FIELD) + { + for(int i = 0;i < max_split; i++) + newfield[i] = NULL_TREE; +@@ -362,6 +397,25 @@ srtype::srtype (tree type) + } + } + ++/* Check it if all fields in the RECORD_TYPE are referenced. */ ++ ++bool ++srtype::has_dead_field (void) ++{ ++ bool may_dfe = false; ++ srfield *this_field; ++ unsigned i; ++ FOR_EACH_VEC_ELT (fields, i, this_field) ++ { ++ if (!(this_field->field_access & READ_FIELD)) ++ { ++ may_dfe = true; ++ break; ++ } ++ } ++ return may_dfe; ++} ++ + /* Mark the type as escaping type E at statement STMT. 
*/ + + void +@@ -833,6 +887,10 @@ srtype::create_new_type (void) + for (unsigned i = 0; i < fields.length (); i++) + { + srfield *f = fields[i]; ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION ++ && !(f->field_access & READ_FIELD)) ++ continue; + f->create_new_fields (newtype, newfields, newlast); + } + +@@ -854,6 +912,16 @@ srtype::create_new_type (void) + + warn_padded = save_warn_padded; + ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && replace_type_map.get (this->newtype[0]) == NULL) ++ replace_type_map.put (this->newtype[0], this->type); ++ if (dump_file) ++ { ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION ++ && has_dead_field ()) ++ fprintf (dump_file, "Dead field elimination.\n"); ++ } + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Created %d types:\n", maxclusters); +@@ -1128,12 +1196,12 @@ csrtype::init_type_info (void) + + /* Close enough to pad to improve performance. + 33~63 should pad to 64 but 33~48 (first half) are too far away, and +- 65~127 should pad to 128 but 65~96 (first half) are too far away. */ ++ 65~127 should pad to 128 but 65~80 (first half) are too far away. */ + if (old_size > 48 && old_size < 64) + { + new_size = 64; + } +- if (old_size > 96 && old_size < 128) ++ if (old_size > 80 && old_size < 128) + { + new_size = 128; + } +@@ -1272,6 +1340,7 @@ public: + bool has_rewritten_type (srfunction*); + void maybe_mark_or_record_other_side (tree side, tree other, gimple *stmt); + unsigned execute_struct_relayout (void); ++ bool remove_dead_field_stmt (tree lhs); + }; + + struct ipa_struct_relayout +@@ -3206,6 +3275,90 @@ ipa_struct_reorg::find_vars (gimple *stmt) + } + } + ++/* Update field_access in srfield. */ ++ ++static void ++update_field_access (tree record, tree field, unsigned access, void *data) ++{ ++ srtype *this_srtype = ((ipa_struct_reorg *)data)->find_type (record); ++ if (this_srtype == NULL) ++ return; ++ srfield *this_srfield = this_srtype->find_field (int_byte_position (field)); ++ if (this_srfield == NULL) ++ return; ++ ++ this_srfield->field_access |= access; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "record field access %d:", access); ++ print_generic_expr (dump_file, record); ++ fprintf (dump_file, " field:"); ++ print_generic_expr (dump_file, field); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++} ++ ++/* A callback for walk_stmt_load_store_ops to visit store. */ ++ ++static bool ++find_field_p_store (gimple *, tree node, tree op, void *data) ++{ ++ if (TREE_CODE (op) != COMPONENT_REF) ++ return false; ++ tree node_type = TREE_TYPE (node); ++ if (!handled_type (node_type)) ++ return false; ++ ++ update_field_access (node_type, TREE_OPERAND (op, 1), WRITE_FIELD, data); ++ ++ return false; ++} ++ ++/* A callback for walk_stmt_load_store_ops to visit load. */ ++ ++static bool ++find_field_p_load (gimple *, tree node, tree op, void *data) ++{ ++ if (TREE_CODE (op) != COMPONENT_REF) ++ return false; ++ tree node_type = TREE_TYPE (node); ++ if (!handled_type (node_type)) ++ return false; ++ ++ update_field_access (node_type, TREE_OPERAND (op, 1), READ_FIELD, data); ++ ++ return false; ++} ++ ++/* Determine whether the stmt should be deleted. 
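++
++ A hedged illustration (field names invented): if access tracking saw
++ only writes to a field, e.g.
++
++ struct arc { long cost; long org_cost; };
++ ...
++ ap_1->org_cost = _2;
++
++ then org_cost gets no slot in the reordered type (newfield[0] stays
++ NULL with only WRITE_FIELD recorded) and the store is flagged here
++ for removal.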
*/ ++ ++bool ++ipa_struct_reorg::remove_dead_field_stmt (tree lhs) ++{ ++ tree base = NULL_TREE; ++ bool indirect = false; ++ srtype *t = NULL; ++ srfield *f = NULL; ++ bool realpart = false; ++ bool imagpart = false; ++ bool address = false; ++ bool escape_from_base = false; ++ if (!get_type_field (lhs, base, indirect, t, f, realpart, imagpart, ++ address, escape_from_base)) ++ return false; ++ if (t ==NULL) ++ return false; ++ if (t->newtype[0] == t->type) ++ return false; ++ if (f == NULL) ++ return false; ++ if (f->newfield[0] == NULL ++ && (f->field_access & WRITE_FIELD)) ++ return true; ++ return false; ++} ++ + /* Maybe record access of statement for further analaysis. */ + + void +@@ -3227,6 +3380,13 @@ ipa_struct_reorg::maybe_record_stmt (cgraph_node *node, gimple *stmt) + default: + break; + } ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION) ++ { ++ /* Look for loads and stores. */ ++ walk_stmt_load_store_ops (stmt, this, find_field_p_load, ++ find_field_p_store); ++ } + } + + /* Calculate the multiplier. */ +@@ -3543,8 +3703,11 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple + } + else if (type != d->type) + { +- type->mark_escape (escape_cast_another_ptr, stmt); +- d->type->mark_escape (escape_cast_another_ptr, stmt); ++ if (!is_replace_type (d->type->type, type->type)) ++ { ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ d->type->mark_escape (escape_cast_another_ptr, stmt); ++ } + } + /* x_1 = y.x_nodes; void *x; + Directly mark the structure pointer type assigned +@@ -4131,8 +4294,9 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, + } + /* If we have a non void* or a decl (which is hard to track), + then mark the type as escaping. */ +- if (!VOID_POINTER_P (TREE_TYPE (newdecl)) +- || DECL_P (newdecl)) ++ if (replace_type_map.get (type->type) == NULL ++ && (!VOID_POINTER_P (TREE_TYPE (newdecl)) ++ || DECL_P (newdecl))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -4142,7 +4306,7 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, + print_generic_expr (dump_file, TREE_TYPE (newdecl)); + fprintf (dump_file, "\n"); + } +- type->mark_escape (escape_cast_another_ptr, stmt); ++ type->mark_escape (escape_cast_another_ptr, stmt); + return; + } + /* At this point there should only be unkown void* ssa names. 
*/ +@@ -4465,11 +4629,13 @@ ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec< + + return; + } ++ if (!is_replace_type (t1->type, type->type)) ++ { ++ if (t1) ++ t1->mark_escape (escape_cast_another_ptr, stmt); + +- if (t1) +- t1->mark_escape (escape_cast_another_ptr, stmt); +- +- type->mark_escape (escape_cast_another_ptr, stmt); ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ } + } + + +@@ -5722,6 +5888,19 @@ bool + ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + { + bool remove = false; ++ ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION ++ && remove_dead_field_stmt (gimple_assign_lhs (stmt))) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n rewriting statement (remove): \n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ return true; ++ } ++ + if (gimple_clobber_p (stmt)) + { + tree lhs = gimple_assign_lhs (stmt); +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +index 54b0dc655..936c0fa6f 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +@@ -142,6 +142,7 @@ public: + + bool create_new_type (void); + void analyze (void); ++ bool has_dead_field (void); + void mark_escape (escape_type, gimple *stmt); + bool has_escaped (void) + { +@@ -163,6 +164,12 @@ public: + } + }; + ++/* Bitflags used for determining if a field ++ is never accessed, read or written. */ ++const unsigned EMPTY_FIELD = 0x0u; ++const unsigned READ_FIELD = 0x01u; ++const unsigned WRITE_FIELD = 0x02u; ++ + struct srfield + { + unsigned HOST_WIDE_INT offset; +@@ -174,7 +181,7 @@ struct srfield + unsigned clusternum; + + tree newfield[max_split]; +- ++ unsigned field_access; /* FIELD_DECL -> bitflag (use for dfe). */ + // Constructors + srfield (tree field, srtype *base); + +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c b/gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c +new file mode 100644 +index 000000000..4261d2352 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c +@@ -0,0 +1,86 @@ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++ network_t* net_add; ++}; ++ ++ ++const int MAX = 100; ++ ++/* let it escape_array, "Type is used in an array [not handled yet]". 
*/ ++network_t* net[2]; ++arc_p stop_arcs = NULL; ++ ++int ++main () ++{ ++ net[0] = (network_t*) calloc (1, sizeof(network_t)); ++ net[0]->arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ stop_arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ ++ net[0]->arcs->id = 100; ++ ++ for (unsigned i = 0; i < 3; i++) ++ { ++ net[0]->arcs->id = net[0]->arcs->id + 2; ++ stop_arcs->cost = net[0]->arcs->id / 2; ++ stop_arcs->net_add = net[0]; ++ printf("stop_arcs->cost = %ld\n", stop_arcs->cost); ++ net[0]->arcs++; ++ stop_arcs++; ++ } ++ ++ if( net[1] != 0 && stop_arcs != 0) ++ { ++ return -1; ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c b/gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c +new file mode 100644 +index 000000000..42d38c63a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c +@@ -0,0 +1,60 @@ ++// verify newarc[cmp-1].flow ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++const int MAX = 100; ++arc_p ap = NULL; ++ ++int ++main () ++{ ++ ap = (arc_p) calloc(MAX, sizeof(arc_t)); ++ printf("%d\n", ap[0].id); ++ for (int i = 1; i < MAX; i++) ++ { ++ ap[i-1].id = 500; ++ } ++ printf("%d\n", ap[0].id); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c b/gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c +new file mode 100644 +index 000000000..53583fe82 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c +@@ -0,0 +1,58 @@ ++/* Supports the MEM_REF offset. ++ _1 = MEM[(struct arc *)ap_4 + 72B].flow; ++ Old rewrite:_1 = ap.reorder.0_8->flow; ++ New rewrite:_1 = MEM[(struct arc.reorder.0 *)ap.reorder.0_8 + 64B].flow. */ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++int ++main () ++{ ++ const int MAX = 100; ++ /* A similar scenario can be reproduced only by using local variables. 
*/ ++ arc_p ap = NULL; ++ ap = (arc_p) calloc(MAX, sizeof(arc_t)); ++ printf("%d\n", ap[1].flow); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c b/gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c +new file mode 100644 +index 000000000..fd675ec2e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct T_HASH_ENTRY ++{ ++ unsigned int hash; ++ unsigned int klen; ++ char *key; ++} iHashEntry; ++ ++typedef struct T_HASH ++{ ++ unsigned int size; ++ unsigned int fill; ++ unsigned int keys; ++ ++ iHashEntry **array; ++} uHash; ++ ++uHash *retval; ++ ++int ++main() { ++ retval->array = (iHashEntry **)calloc(sizeof(iHashEntry *), retval->size); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c b/gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c +new file mode 100644 +index 000000000..600e7908b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c +@@ -0,0 +1,71 @@ ++// support POINTER_DIFF_EXPR & NOP_EXPR to avoid ++// escape_unhandled_rewrite, "Type escapes via a unhandled rewrite stmt" ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++int ++main () ++{ ++ arc_t *old_arcs; ++ node_t *node; ++ node_t *stop; ++ size_t off; ++ network_t* net; ++ ++ for( ; node->number < stop->number; node++ ) ++ { ++ off = node->basic_arc - old_arcs; ++ node->basic_arc = (arc_t *)(net->arcs + off); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 3 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c b/gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c +new file mode 100644 +index 000000000..f411364a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c +@@ -0,0 +1,55 @@ ++// support NEGATE_EXPR rewriting ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++int ++main () ++{ ++ int64_t susp = 0; ++ const int MAX = 100; ++ arc_p ap = (arc_p) calloc(MAX, sizeof(arc_t)); ++ ap -= 
susp; ++ printf("%d\n", ap[1].flow); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c b/gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c +new file mode 100644 +index 000000000..a4e723763 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c +@@ -0,0 +1,55 @@ ++// release escape_ptr_ptr, "Type is used in a pointer to a pointer [not handled yet]"; ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++const int MAX = 100; ++arc_t **ap = NULL; ++ ++int ++main () ++{ ++ ap = (arc_t**) malloc(MAX * sizeof(arc_t*)); ++ (*ap)[0].id = 300; ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +index 67b3ac2d5..ac5585813 100644 +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +@@ -64,8 +64,27 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout + "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" + + # -fipa-struct-reorg=2 +-gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/rf*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wo_prof_*.c]] \ + "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_ratio_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_prof_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/struct_reorg*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/sr_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/csr_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++ ++# -fipa-struct-reorg=3 ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/dfe*.c]] \ ++ "" "-fipa-struct-reorg=3 -fdump-ipa-all -flto-partition=one -fwhole-program" ++ + # All done. 
torture-finish
+ dg-finish
+--
+2.27.0.windows.1
+
diff --git a/0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch b/0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d63a8efa43583eb3bee074f21b0bc485e13374ff
--- /dev/null
+++ b/0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch
@@ -0,0 +1,143 @@
+From d8753de2129d230afc9a887d5804747c69824a68 Mon Sep 17 00:00:00 2001
+From: zhaowenyu <804544223@qq.com>
+Date: Mon, 20 Jun 2022 11:24:45 +0800
+Subject: [PATCH 05/12] [Backport] ipa-sra: Fix thinko when overriding
+ safe_to_import_accesses (PR 101066)
+
+Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=5aa28c8cf15cd254cc5a3a12278133b93b8b017f
+
+ipa-sra: Fix thinko when overriding safe_to_import_accesses (PR 101066)
+
+The "new" IPA-SRA has a more difficult job than the previous
+not-truly-IPA version when identifying situations in which a parameter
+passed by reference can be passed into a third function and only there
+converted to one passed by value (and possibly "split" at the same
+time).
+
+In order to allow this, two conditions must be fulfilled. First the
+call to the third function must happen before any modifications of
+memory, because it could change the value passed by reference.
+Second, in order to make sure we do not introduce new (invalid)
+dereferences, the call must postdominate the entry BB.
+
+The second condition is actually not necessary if the caller function
+is also certain to dereference the pointer but the first one must
+still hold. Unfortunately, the code making this overriding decision
+also happens to trigger when the first condition is not fulfilled.
+This is fixed in the following patch.
+
+gcc/ChangeLog:
+
+2021-06-16 Martin Jambor
+
+(cherry picked from commit 763121ccd908f52bc666f277ea2cf42110b3aad9)
+---
+ gcc/ipa-sra.c | 15 +++++++++++++--
+ gcc/testsuite/gcc.dg/ipa/pr101066.c | 20 ++++++++++++++++++++
+ 2 files changed, 33 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/ipa/pr101066.c
+
+diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c
+index b706fceff..1cb30afc3 100644
+--- a/gcc/ipa-sra.c
++++ b/gcc/ipa-sra.c
+@@ -340,7 +340,7 @@ class isra_call_summary
+ public:
+ isra_call_summary ()
+ : m_arg_flow (), m_return_ignored (false), m_return_returned (false),
+- m_bit_aligned_arg (false)
++ m_bit_aligned_arg (false), m_before_any_store (false)
+ {}
+
+ void init_inputs (unsigned arg_count);
+@@ -359,6 +359,10 @@ public:
+
+ /* Set when any of the call arguments are not byte-aligned. */
+ unsigned m_bit_aligned_arg : 1;
++
++ /* Set to true if the call happened before any (other) store to memory in the
++ caller. */
++ unsigned m_before_any_store : 1;
+ };
+
+ /* Class to manage function summaries.
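+
+ A hedged sketch of the hazard m_before_any_store guards against
+ (names invented):
+
+ static int third (int *p) { return *p; }
+ static int second (int *p)
+ {
+ *p = 5;
+ return third (p);
+ }
+
+ The store precedes the call, so converting the argument of third to
+ pass-by-value would read *p too early; the flag may only be set when
+ the call happens before any other store in the caller.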
*/ +@@ -472,6 +476,8 @@ isra_call_summary::dump (FILE *f) + fprintf (f, " return value ignored\n"); + if (m_return_returned) + fprintf (f, " return value used only to compute caller return value\n"); ++ if (m_before_any_store) ++ fprintf (f, " happens before any store to memory\n"); + for (unsigned i = 0; i < m_arg_flow.length (); i++) + { + fprintf (f, " Parameter %u:\n", i); +@@ -516,6 +522,7 @@ ipa_sra_call_summaries::duplicate (cgraph_edge *, cgraph_edge *, + new_sum->m_return_ignored = old_sum->m_return_ignored; + new_sum->m_return_returned = old_sum->m_return_returned; + new_sum->m_bit_aligned_arg = old_sum->m_bit_aligned_arg; ++ new_sum->m_before_any_store = old_sum->m_before_any_store; + } + + +@@ -2355,6 +2362,7 @@ process_scan_results (cgraph_node *node, struct function *fun, + unsigned count = gimple_call_num_args (call_stmt); + isra_call_summary *csum = call_sums->get_create (cs); + csum->init_inputs (count); ++ csum->m_before_any_store = uses_memory_as_obtained; + for (unsigned argidx = 0; argidx < count; argidx++) + { + if (!csum->m_arg_flow[argidx].pointer_pass_through) +@@ -2601,6 +2609,7 @@ isra_write_edge_summary (output_block *ob, cgraph_edge *e) + bp_pack_value (&bp, csum->m_return_ignored, 1); + bp_pack_value (&bp, csum->m_return_returned, 1); + bp_pack_value (&bp, csum->m_bit_aligned_arg, 1); ++ bp_pack_value (&bp, csum->m_before_any_store, 1); + streamer_write_bitpack (&bp); + } + +@@ -2719,6 +2728,7 @@ isra_read_edge_summary (struct lto_input_block *ib, cgraph_edge *cs) + csum->m_return_ignored = bp_unpack_value (&bp, 1); + csum->m_return_returned = bp_unpack_value (&bp, 1); + csum->m_bit_aligned_arg = bp_unpack_value (&bp, 1); ++ csum->m_before_any_store = bp_unpack_value (&bp, 1); + } + + /* Read intraprocedural analysis information about NODE and all of its outgoing +@@ -3475,7 +3485,8 @@ param_splitting_across_edge (cgraph_edge *cs) + } + else if (!ipf->safe_to_import_accesses) + { +- if (!all_callee_accesses_present_p (param_desc, arg_desc)) ++ if (!csum->m_before_any_store ++ || !all_callee_accesses_present_p (param_desc, arg_desc)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, " %u->%u: cannot import accesses.\n", +diff --git a/gcc/testsuite/gcc.dg/ipa/pr101066.c b/gcc/testsuite/gcc.dg/ipa/pr101066.c +new file mode 100644 +index 000000000..1ceb6e431 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ipa/pr101066.c +@@ -0,0 +1,20 @@ ++/* { dg-do run } */ ++/* { dg-options "-Os -fno-ipa-cp -fno-inline" } */ ++ ++int a = 1, c, d, e; ++int *b = &a; ++static int g(int *h) { ++ c = *h; ++ return d; ++} ++static void f(int *h) { ++ e = *h; ++ *b = 0; ++ g(h); ++} ++int main() { ++ f(b); ++ if (c) ++ __builtin_abort(); ++ return 0; ++} +-- +2.27.0.windows.1 + diff --git a/0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch b/0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch new file mode 100644 index 0000000000000000000000000000000000000000..23e3b89074b04ca1895a4468e0c699cd7c340bd7 --- /dev/null +++ b/0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch @@ -0,0 +1,59 @@ +From ea059ab02ac79eba1c05d6e05cbb2590c47d7c1f Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Thu, 23 Jun 2022 10:16:08 +0800 +Subject: [PATCH 06/12] [Backport] ifcvt: Allow constants for + noce_convert_multiple. + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=9b8eaa282250ad260e01d164093b597579db00d9 + +This lifts the restriction of not allowing constants for noce_convert_multiple. 
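+
+A hedged example of the shape this now allows (invented, not from the
+patch):
+
+ int f (int a, int b, int c)
+ {
+ if (a > 0)
+ {
+ b = 3;
+ c = 4;
+ }
+ return b + c;
+ }
+
+The two sets from the constants 3 and 4 previously failed the REG_P
+test in bb_ok_for_noce_convert_multiple_sets, blocking conversion of
+the block into conditional moves.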
+The code later checks if a valid sequence is produced anyway. +gcc/ChangeLog: + + * ifcvt.cc (noce_convert_multiple_sets): Allow constants. + (bb_ok_for_noce_convert_multiple_sets): Likewise. +--- + gcc/ifcvt.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c +index 977dd1bd4..2452f231c 100644 +--- a/gcc/ifcvt.c ++++ b/gcc/ifcvt.c +@@ -3252,7 +3252,9 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) + we'll end up trying to emit r4:HI = cond ? (r1:SI) : (r3:HI). + Wrap the two cmove operands into subregs if appropriate to prevent + that. */ +- if (GET_MODE (new_val) != GET_MODE (temp)) ++ ++ if (!CONSTANT_P (new_val) ++ && GET_MODE (new_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (new_val); + machine_mode dst_mode = GET_MODE (temp); +@@ -3263,7 +3265,8 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) + } + new_val = lowpart_subreg (dst_mode, new_val, src_mode); + } +- if (GET_MODE (old_val) != GET_MODE (temp)) ++ if (!CONSTANT_P (old_val) ++ && GET_MODE (old_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (old_val); + machine_mode dst_mode = GET_MODE (temp); +@@ -3392,9 +3395,9 @@ bb_ok_for_noce_convert_multiple_sets (basic_block test_bb) + if (!REG_P (dest)) + return false; + +- if (!(REG_P (src) +- || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src)) +- && subreg_lowpart_p (src)))) ++ if (!((REG_P (src) || CONSTANT_P (src)) ++ || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src)) ++ && subreg_lowpart_p (src)))) + return false; + + /* Destination must be appropriate for a conditional write. */ +-- +2.27.0.windows.1 + diff --git a/0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch b/0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch new file mode 100644 index 0000000000000000000000000000000000000000..441b5c5afd5789f77ad62563686585c584286c5c --- /dev/null +++ b/0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch @@ -0,0 +1,40 @@ +From beeb0fb50c7e40ee3d79044abc6408f760d6584a Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Thu, 23 Jun 2022 10:40:46 +0800 +Subject: [PATCH 07/12] [Backport] Register --sysroot in the driver switches + table + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=48e2d9b7b88dabed336cd098cd212d0e53c5125f + +This change adjusts the processing of --sysroot to save the option in the internal "switches" +array, which lets self-specs test for it and provide a default value possibly dependent on +environment variables, as in + + --with-specs=%{!-sysroot*:--sysroot=%:getenv("WIND_BASE" /target)} + +2021-12-20 Olivier Hainque + + gcc/ + * gcc.c (driver_handle_option): do_save --sysroot. +--- + gcc/gcc.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/gcc/gcc.c b/gcc/gcc.c +index b55075b14..655beffcc 100644 +--- a/gcc/gcc.c ++++ b/gcc/gcc.c +@@ -4190,7 +4190,9 @@ driver_handle_option (struct gcc_options *opts, + case OPT__sysroot_: + target_system_root = arg; + target_system_root_changed = 1; +- do_save = false; ++ /* Saving this option is useful to let self-specs decide to ++ provide a default one. 
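++
++ For example, a configure-time self-spec like the one quoted in the
++ commit message,
++
++ --with-specs=%{!-sysroot*:--sysroot=%:getenv("WIND_BASE" /target)}
++
++ can now observe whether --sysroot was given and substitute an
++ environment-derived default when it was not.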
*/
++ do_save = true;
+ break;
+
+ case OPT_time_:
+--
+2.27.0.windows.1
+
diff --git a/0042-DFE-Fix-bugs.patch b/0042-DFE-Fix-bugs.patch
new file mode 100644
index 0000000000000000000000000000000000000000..58015585cbf5c6fa3929316690a92adfff0757c3
--- /dev/null
+++ b/0042-DFE-Fix-bugs.patch
@@ -0,0 +1,665 @@
+From f8308a2b440efe124cd6ff59924f135e85e53888 Mon Sep 17 00:00:00 2001
+From: Mingchuan Wu
+Date: Sat, 18 Jun 2022 17:51:04 +0800
+Subject: [PATCH 08/12] [DFE] Fix bugs
+
+Fix the following bugs:
+1. Fixed a bug in the replace-type check (is_replace_type).
+2. Update field accesses through the base declaration, so MEM_REF
+accesses are recorded as well as COMPONENT_REF ones.
+3. We now replace the dead fields in a stmt by creating a new SSA name.
+4. The replaced type is no longer optimized in NORMAL mode.
+
+We also added 5 DejaGnu test cases.
+---
+ gcc/ipa-struct-reorg/ipa-struct-reorg.c | 77 ++++++---
+ gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c | 56 ++++++
+ gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c | 162 ++++++++++++++++++
+ gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c | 126 ++++++++++++++
+ .../gcc.dg/struct/dfe_extr_tcp_usrreq.c | 58 +++++++
+ .../gcc.dg/struct/dfe_extr_ui_main.c | 61 +++++++
+ 6 files changed, 516 insertions(+), 24 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c
+ create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c
+
+diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c
+index 2fa560239..00dc4bf1d 100644
+--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c
++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c
+@@ -252,6 +252,7 @@ enum struct_layout_opt_level
+
+ static bool is_result_of_mult (tree arg, tree *num, tree struct_size);
+ bool isptrptr (tree type);
++void get_base (tree &base, tree expr);
+
+ srmode current_mode;
+
+@@ -631,7 +632,15 @@ srtype::analyze (void)
+ into 2 different structures. In future we intend to add profile
+ info and/or static heuristics to differentiate splitting process. */
+ if (fields.length () == 2)
+- fields[1]->clusternum = 1;
++ {
++ for (hash_map<tree, tree>::iterator it = replace_type_map.begin ();
++ it != replace_type_map.end (); ++it)
++ {
++ if (types_compatible_p ((*it).second, this->type))
++ return;
++ }
++ fields[1]->clusternum = 1;
++ }
+
+ /* Otherwise we do nothing. */
+ if (fields.length () >= 3)
+@@ -3278,12 +3287,33 @@ ipa_struct_reorg::find_vars (gimple *stmt)
+ /* Update field_access in srfield.
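+
+ Roughly, the two access shapes recognized below (SSA names
+ invented): a COMPONENT_REF such as _1 = ap_2->id, whose offset
+ comes from int_byte_position, and a MEM_REF such as
+ _3 = MEM[(struct arc *)ap_2 + 8B], whose offset is its constant
+ operand 1. The byte offset then selects the srfield whose
+ field_access bits are updated through the base declaration's srtype.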
*/ + + static void +-update_field_access (tree record, tree field, unsigned access, void *data) ++update_field_access (tree node, tree op, unsigned access, void *data) + { +- srtype *this_srtype = ((ipa_struct_reorg *)data)->find_type (record); ++ HOST_WIDE_INT offset = 0; ++ switch (TREE_CODE (op)) ++ { ++ case COMPONENT_REF: ++ { ++ offset = int_byte_position (TREE_OPERAND (op, 1)); ++ break; ++ } ++ case MEM_REF: ++ { ++ offset = tree_to_uhwi (TREE_OPERAND (op, 1)); ++ break; ++ } ++ default: ++ return; ++ } ++ tree base = node; ++ get_base (base, node); ++ srdecl *this_srdecl = ((ipa_struct_reorg *)data)->find_decl (base); ++ if (this_srdecl == NULL) ++ return; ++ srtype *this_srtype = this_srdecl->type; + if (this_srtype == NULL) + return; +- srfield *this_srfield = this_srtype->find_field (int_byte_position (field)); ++ srfield *this_srfield = this_srtype->find_field (offset); + if (this_srfield == NULL) + return; + +@@ -3291,9 +3321,9 @@ update_field_access (tree record, tree field, unsigned access, void *data) + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "record field access %d:", access); +- print_generic_expr (dump_file, record); ++ print_generic_expr (dump_file, this_srtype->type); + fprintf (dump_file, " field:"); +- print_generic_expr (dump_file, field); ++ print_generic_expr (dump_file, this_srfield->fielddecl); + fprintf (dump_file, "\n"); + } + return; +@@ -3302,15 +3332,10 @@ update_field_access (tree record, tree field, unsigned access, void *data) + /* A callback for walk_stmt_load_store_ops to visit store. */ + + static bool +-find_field_p_store (gimple *, tree node, tree op, void *data) ++find_field_p_store (gimple *stmt ATTRIBUTE_UNUSED, ++ tree node, tree op, void *data) + { +- if (TREE_CODE (op) != COMPONENT_REF) +- return false; +- tree node_type = TREE_TYPE (node); +- if (!handled_type (node_type)) +- return false; +- +- update_field_access (node_type, TREE_OPERAND (op, 1), WRITE_FIELD, data); ++ update_field_access (node, op, WRITE_FIELD, data); + + return false; + } +@@ -3318,15 +3343,10 @@ find_field_p_store (gimple *, tree node, tree op, void *data) + /* A callback for walk_stmt_load_store_ops to visit load. */ + + static bool +-find_field_p_load (gimple *, tree node, tree op, void *data) ++find_field_p_load (gimple *stmt ATTRIBUTE_UNUSED, ++ tree node, tree op, void *data) + { +- if (TREE_CODE (op) != COMPONENT_REF) +- return false; +- tree node_type = TREE_TYPE (node); +- if (!handled_type (node_type)) +- return false; +- +- update_field_access (node_type, TREE_OPERAND (op, 1), READ_FIELD, data); ++ update_field_access (node, op, READ_FIELD, data); + + return false; + } +@@ -4629,7 +4649,7 @@ ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec< + + return; + } +- if (!is_replace_type (t1->type, type->type)) ++ if (!is_replace_type (inner_type (t), type->type)) + { + if (t1) + t1->mark_escape (escape_cast_another_ptr, stmt); +@@ -5898,7 +5918,16 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + fprintf (dump_file, "\n rewriting statement (remove): \n"); + print_gimple_stmt (dump_file, stmt, 0); + } +- return true; ++ /* Replace the dead field in stmt by creating a dummy ssa. 
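++
++ Roughly (SSA names invented): a store to a never-read field,
++
++ ap_1->org_cost = _2;
++
++ becomes
++
++ dummy_3 = _2;
++
++ keeping the RHS in place while the eliminated field, which has no
++ slot in the new layout, is no longer referenced; the dead copy is
++ left for later DCE to clean up.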
*/ ++ tree dummy_ssa = make_ssa_name (TREE_TYPE (gimple_assign_lhs (stmt))); ++ gimple_assign_set_lhs (stmt, dummy_ssa); ++ update_stmt (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "To: \n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ return false; + } + + if (gimple_clobber_p (stmt)) +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c +new file mode 100644 +index 000000000..13a226ee8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c +@@ -0,0 +1,56 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++typedef struct TYPE_4__ TYPE_2__; ++typedef struct TYPE_3__ TYPE_1__; ++ ++typedef int uint8_t; ++typedef int uint16_t; ++ ++struct TYPE_4__ ++{ ++ size_t cpu_id; ++}; ++ ++struct TYPE_3__ ++{ ++ int cpuc_dtrace_flags; ++}; ++ ++TYPE_2__ *CPU; ++volatile int CPU_DTRACE_FAULT; ++TYPE_1__ *cpu_core; ++scalar_t__ dtrace_load8 (uintptr_t); ++ ++__attribute__((used)) static int ++dtrace_bcmp (const void *s1, const void *s2, size_t len) ++{ ++ volatile uint16_t *flags; ++ flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; ++ if (s1 == s2) ++ return (0); ++ if (s1 == NULL || s2 == NULL) ++ return (1); ++ if (s1 != s2 && len != 0) ++ { ++ const uint8_t *ps1 = s1; ++ const uint8_t *ps2 = s2; ++ do ++ { ++ if (dtrace_load8 ((uintptr_t)ps1++) != *ps2++) ++ return (1); ++ } ++ while (--len != 0 && !(*flags & CPU_DTRACE_FAULT)); ++ } ++ return (0); ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c +new file mode 100644 +index 000000000..1fff2cb9d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c +@@ -0,0 +1,162 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++struct mrb_context ++{ ++ size_t stack; ++ size_t stbase; ++ size_t stend; ++ size_t eidx; ++ int *ci; ++ int *cibase; ++ int status; ++}; ++ ++struct RObject ++{ ++ int dummy; ++}; ++ ++struct RHash ++{ ++ int dummy; ++}; ++ ++struct RFiber ++{ ++ struct mrb_context *cxt; ++}; ++ ++struct RClass ++{ ++ int dummy; ++}; ++ ++struct RBasic ++{ ++ int tt; ++}; ++ ++struct RArray ++{ ++ int dummy; ++}; ++ ++typedef int mrb_state; ++typedef int mrb_gc; ++typedef int mrb_callinfo; ++size_t ARY_LEN (struct RArray *); ++size_t MRB_ENV_STACK_LEN (struct RBasic *); ++int MRB_FIBER_TERMINATED; ++ ++#define MRB_TT_ARRAY 140 ++#define MRB_TT_CLASS 139 ++#define MRB_TT_DATA 138 ++#define MRB_TT_ENV 137 ++#define MRB_TT_EXCEPTION 136 ++#define MRB_TT_FIBER 135 ++#define MRB_TT_HASH 134 ++#define MRB_TT_ICLASS 133 ++#define MRB_TT_MODULE 132 ++#define MRB_TT_OBJECT 131 ++#define MRB_TT_PROC 130 ++#define MRB_TT_RANGE 129 ++#define MRB_TT_SCLASS 128 ++ ++size_t ci_nregs (int *); ++int gc_mark_children (int *, int *, struct RBasic *); ++size_t mrb_gc_mark_hash_size (int *, struct RHash *); ++size_t mrb_gc_mark_iv_size (int *, struct RObject *); ++size_t mrb_gc_mark_mt_size (int *, struct RClass *); ++ ++__attribute__((used)) static size_t ++gc_gray_mark (mrb_state *mrb, mrb_gc *gc, struct RBasic *obj) ++{ ++ 
size_t children = 0; ++ gc_mark_children (mrb, gc, obj); ++ switch (obj->tt) ++ { ++ case MRB_TT_ICLASS: ++ children++; ++ break; ++ ++ case MRB_TT_CLASS: ++ case MRB_TT_SCLASS: ++ case MRB_TT_MODULE: ++ { ++ struct RClass *c = (struct RClass *)obj; ++ children += mrb_gc_mark_iv_size (mrb, (struct RObject *)obj); ++ children += mrb_gc_mark_mt_size (mrb, c); ++ children ++; ++ } ++ break; ++ ++ case MRB_TT_OBJECT: ++ case MRB_TT_DATA: ++ case MRB_TT_EXCEPTION: ++ children += mrb_gc_mark_iv_size (mrb, (struct RObject *)obj); ++ break; ++ ++ case MRB_TT_ENV: ++ children += MRB_ENV_STACK_LEN (obj); ++ break; ++ ++ case MRB_TT_FIBER: ++ { ++ struct mrb_context *c = ((struct RFiber *)obj)->cxt; ++ size_t i; ++ mrb_callinfo *ci; ++ if (!c || c->status == MRB_FIBER_TERMINATED) ++ break; ++ ++ i = c->stack - c->stbase; ++ if (c->ci) ++ { ++ i += ci_nregs (c->ci); ++ } ++ if (c->stbase + i > c->stend) ++ i = c->stend - c->stbase; ++ ++ children += i; ++ children += c->eidx; ++ if (c->cibase) ++ { ++ for (i = 0, ci = c->cibase; ci <= c->ci; i++, ci++) ++ ; ++ } ++ children += i; ++ } ++ break; ++ ++ case MRB_TT_ARRAY: ++ { ++ struct RArray *a = (struct RArray *)obj; ++ children += ARY_LEN (a); ++ } ++ break; ++ ++ case MRB_TT_HASH: ++ children += mrb_gc_mark_iv_size (mrb, (struct RObject *)obj); ++ children += mrb_gc_mark_hash_size (mrb, (struct RHash *)obj); ++ break; ++ ++ case MRB_TT_PROC: ++ case MRB_TT_RANGE: ++ children += 2; ++ break; ++ default: ++ break; ++ } ++ ++ return children; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c +new file mode 100644 +index 000000000..0f577667c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c +@@ -0,0 +1,126 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++typedef struct TYPE_6__ TYPE_3__; ++typedef struct TYPE_5__ TYPE_2__; ++typedef struct TYPE_4__ TYPE_1__; ++ ++struct io_accel2_cmd ++{ ++ int dummy; ++}; ++ ++struct hpsa_tmf_struct ++{ ++ int it_nexus; ++}; ++ ++struct hpsa_scsi_dev_t ++{ ++ int nphysical_disks; ++ int ioaccel_handle; ++ struct hpsa_scsi_dev_t **phys_disk; ++}; ++ ++struct ctlr_info ++{ ++ TYPE_3__ *pdev; ++ struct io_accel2_cmd *ioaccel2_cmd_pool; ++}; ++struct TYPE_4__ ++{ ++ int LunAddrBytes; ++}; ++ ++struct TYPE_5__ ++{ ++ TYPE_1__ LUN; ++}; ++ ++struct CommandList ++{ ++ size_t cmdindex; ++ int cmd_type; ++ struct hpsa_scsi_dev_t *phys_disk; ++ TYPE_2__ Header; ++}; ++ ++struct TYPE_6__ ++{ ++ int dev; ++}; ++ ++int BUG (); ++#define CMD_IOACCEL1 132 ++#define CMD_IOACCEL2 131 ++#define CMD_IOCTL_PEND 130 ++#define CMD_SCSI 129 ++#define IOACCEL2_TMF 128 ++int dev_err (int *, char *, int); ++scalar_t__ hpsa_is_cmd_idle (struct CommandList *); ++int le32_to_cpu (int); ++int test_memcmp (unsigned char *, int *, int); ++ ++__attribute__((used)) static bool ++hpsa_cmd_dev_match (struct ctlr_info *h, struct CommandList *c, ++ struct hpsa_scsi_dev_t *dev, unsigned char *scsi3addr) ++{ ++ int i; ++ bool match = false; ++ struct io_accel2_cmd * c2 = &h->ioaccel2_cmd_pool[c->cmdindex]; ++ struct hpsa_tmf_struct *ac = (struct hpsa_tmf_struct *)c2; ++ ++ if (hpsa_is_cmd_idle (c)) ++ return false; ++ ++ switch (c->cmd_type) ++ { ++ case CMD_SCSI: ++ case CMD_IOCTL_PEND: ++ match = !test_memcmp 
(scsi3addr, &c->Header.LUN.LunAddrBytes, ++ sizeof (c->Header.LUN.LunAddrBytes)); ++ break; ++ ++ case CMD_IOACCEL1: ++ case CMD_IOACCEL2: ++ if (c->phys_disk == dev) ++ { ++ match = true; ++ } ++ else ++ { ++ for (i = 0; i < dev->nphysical_disks && !match; i++) ++ { ++ match = dev->phys_disk[i] == c->phys_disk; ++ } ++ } ++ break; ++ ++ case IOACCEL2_TMF: ++ for (i = 0; i < dev->nphysical_disks && !match; i++) ++ { ++ match = dev->phys_disk[i]->ioaccel_handle == ++ le32_to_cpu (ac->it_nexus); ++ } ++ break; ++ ++ case 0: ++ match = false; ++ break; ++ default: ++ dev_err (&h->pdev->dev, "unexpected cmd_type: %d\n", c->cmd_type); ++ BUG (); ++ } ++ ++ return match; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c +new file mode 100644 +index 000000000..5570c762e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++struct tcpcb ++{ ++ int t_state; ++}; ++ ++struct socket ++{ ++ int dummy; ++}; ++ ++struct proc ++{ ++ int dummy; ++}; ++ ++struct inpcb ++{ ++ scalar_t__ inp_lport; ++}; ++ ++int COMMON_END (int); ++int COMMON_START (); ++int PRU_LISTEN; ++int TCPS_LISTEN; ++int in_pcbbind (struct inpcb *, int *, struct proc *); ++struct inpcb* sotoinpcb (struct socket *); ++ ++__attribute__((used)) static void ++tcp_usr_listen (struct socket *so, struct proc *p) ++{ ++ int error = 0; ++ struct inpcb *inp = sotoinpcb (so); ++ struct tcpcb *tp; ++ ++ COMMON_START (); ++ if (inp->inp_lport == 0) ++ { ++ error = in_pcbbind (inp, NULL, p); ++ } ++ if (error == 0) ++ { ++ tp->t_state = TCPS_LISTEN; ++ } ++ COMMON_END (PRU_LISTEN); ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 1 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c +new file mode 100644 +index 000000000..50ab9cc24 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++typedef struct TYPE_4__ TYPE_2__; ++typedef struct TYPE_3__ TYPE_1__; ++ ++struct TYPE_4__ ++{ ++ size_t modCount; ++ TYPE_1__ *modList; ++}; ++ ++struct TYPE_3__ ++{ ++ void *modDescr; ++ void *modName; ++}; ++ ++size_t MAX_MODS; ++void *String_Alloc (char *); ++int test_strlen (char *); ++int trap_FD_GetFileList (char *, char *, char *, int); ++TYPE_2__ uiInfo; ++ ++__attribute__((used)) static void ++UI_LoadMods () ++{ ++ int numdirs; ++ char dirlist[2048]; ++ char *dirptr; ++ char *descptr; ++ int i; ++ int dirlen; ++ ++ uiInfo.modCount = 0; ++ numdirs = trap_FD_GetFileList ("$modelist", "", dirlist, sizeof (dirlist)); ++ dirptr = dirlist; ++ for (i = 0; i < numdirs; i++) ++ { ++ dirlen = test_strlen (dirptr) + 1; ++ descptr = dirptr + dirlen; ++ uiInfo.modList[uiInfo.modCount].modName = String_Alloc (dirptr); ++ uiInfo.modList[uiInfo.modCount].modDescr = String_Alloc (descptr); ++ dirptr += dirlen + test_strlen (descptr) + 1; ++ uiInfo.modCount++; ++ if (uiInfo.modCount >= 
MAX_MODS) ++ { ++ break; ++ } ++ } ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 1 "struct_layout" } } */ +-- +2.27.0.windows.1 + diff --git a/0043-Backport-Extend-special_memory_constraint.patch b/0043-Backport-Extend-special_memory_constraint.patch new file mode 100644 index 0000000000000000000000000000000000000000..3ca2d4356030779eeea926c1c626929a67253b3f --- /dev/null +++ b/0043-Backport-Extend-special_memory_constraint.patch @@ -0,0 +1,165 @@ +From b4770dd95fa342671d53c9de2077d77ee07b68dd Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Sat, 25 Jun 2022 00:41:50 +0800 +Subject: [PATCH 09/12] [Backport] Extend special_memory_constraint. + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=4de7b010038933dd6ca96bf186ca49f243d0def6 + +For operand with special_memory_constraint, there could be a wrapper for memory_operand. +Extract mem for operand for conditional judgement like MEM_P, also for record_address_regs. +--- + gcc/ira-costs.c | 12 +++++++----- + gcc/ira.c | 2 +- + gcc/lra-constraints.c | 28 +++++++++++++++++++++++----- + gcc/recog.c | 7 +++++-- + gcc/rtl.h | 1 + + 5 files changed, 37 insertions(+), 13 deletions(-) + +diff --git a/gcc/ira-costs.c b/gcc/ira-costs.c +index 6891156b5..aeda6588b 100644 +--- a/gcc/ira-costs.c ++++ b/gcc/ira-costs.c +@@ -781,7 +781,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops, + + case CT_SPECIAL_MEMORY: + insn_allows_mem[i] = allows_mem[i] = 1; +- if (MEM_P (op) && constraint_satisfied_p (op, cn)) ++ if (MEM_P (extract_mem_from_operand (op)) ++ && constraint_satisfied_p (op, cn)) + win = 1; + break; + +@@ -1397,15 +1398,16 @@ record_operand_costs (rtx_insn *insn, enum reg_class *pref) + commutative. */ + for (i = 0; i < recog_data.n_operands; i++) + { ++ rtx op_mem = extract_mem_from_operand (recog_data.operand[i]); + memcpy (op_costs[i], init_cost, struct_costs_size); + + if (GET_CODE (recog_data.operand[i]) == SUBREG) + recog_data.operand[i] = SUBREG_REG (recog_data.operand[i]); + +- if (MEM_P (recog_data.operand[i])) +- record_address_regs (GET_MODE (recog_data.operand[i]), +- MEM_ADDR_SPACE (recog_data.operand[i]), +- XEXP (recog_data.operand[i], 0), ++ if (MEM_P (op_mem)) ++ record_address_regs (GET_MODE (op_mem), ++ MEM_ADDR_SPACE (op_mem), ++ XEXP (op_mem, 0), + 0, MEM, SCRATCH, frequency * 2); + else if (constraints[i][0] == 'p' + || (insn_extra_address_constraint +diff --git a/gcc/ira.c b/gcc/ira.c +index 681ec2f46..c13650229 100644 +--- a/gcc/ira.c ++++ b/gcc/ira.c +@@ -1868,7 +1868,7 @@ ira_setup_alts (rtx_insn *insn) + + case CT_MEMORY: + case CT_SPECIAL_MEMORY: +- if (MEM_P (op)) ++ if (MEM_P (extract_mem_from_operand (op))) + goto op_success; + win_p = true; + break; +diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c +index 7cc479b30..df75c7b94 100644 +--- a/gcc/lra-constraints.c ++++ b/gcc/lra-constraints.c +@@ -409,14 +409,34 @@ valid_address_p (rtx op, struct address_info *ad, + return valid_address_p (ad->mode, *ad->outer, ad->as); + } + ++/* For special_memory_operand, it could be false for MEM_P (op), ++ i.e. bcst_mem_operand in i386 backend. ++ Extract and return real memory operand or op. */ ++rtx ++extract_mem_from_operand (rtx op) ++{ ++ for (rtx x = op;; x = XEXP (x, 0)) ++ { ++ if (MEM_P (x)) ++ return x; ++ if (GET_RTX_LENGTH (GET_CODE (x)) != 1 ++ || GET_RTX_FORMAT (GET_CODE (x))[0] != 'e') ++ break; ++ } ++ return op; ++} ++ + /* Return true if the eliminated form of memory reference OP satisfies + extra (special) memory constraint CONSTRAINT. 
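+
+ For instance (assumed from the i386 bcst_mem_operand case named in
+ the commit message), an operand such as
+
+ (vec_duplicate:V8SF (mem:SF (reg:DI r)))
+
+ is not itself a MEM, so the MEM is first dug out of the unary
+ wrapper before its address is decomposed and eliminated.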
*/
+ static bool
+ satisfies_memory_constraint_p (rtx op, enum constraint_num constraint)
+ {
+ struct address_info ad;
++ rtx mem = extract_mem_from_operand (op);
++ if (!MEM_P (mem))
++ return false;
+
+- decompose_mem_address (&ad, op);
++ decompose_mem_address (&ad, mem);
+ address_eliminator eliminator (&ad);
+ return constraint_satisfied_p (op, constraint);
+ }
+@@ -2344,8 +2364,7 @@ process_alt_operands (int only_alternative)
+ break;
+
+ case CT_MEMORY:
+- if (MEM_P (op)
+- && satisfies_memory_constraint_p (op, cn))
++ if (satisfies_memory_constraint_p (op, cn))
+ win = true;
+ else if (spilled_pseudo_p (op))
+ win = true;
+@@ -2386,8 +2405,7 @@ process_alt_operands (int only_alternative)
+ break;
+
+ case CT_SPECIAL_MEMORY:
+- if (MEM_P (op)
+- && satisfies_memory_constraint_p (op, cn))
++ if (satisfies_memory_constraint_p (op, cn))
+ win = true;
+ else if (spilled_pseudo_p (op))
+ win = true;
+diff --git a/gcc/recog.c b/gcc/recog.c
+index 2720aaaac..8674054b9 100644
+--- a/gcc/recog.c
++++ b/gcc/recog.c
+@@ -1798,7 +1798,8 @@ asm_operand_ok (rtx op, const char *constraint, const char **constraints)
+ case CT_MEMORY:
+ case CT_SPECIAL_MEMORY:
+ /* Every memory operand can be reloaded to fit. */
+- result = result || memory_operand (op, VOIDmode);
++ result = result || memory_operand (extract_mem_from_operand (op),
++ VOIDmode);
+ break;
+
+ case CT_ADDRESS:
+@@ -2584,7 +2585,9 @@ constrain_operands (int strict, alternative_mask alternatives)
+
+ /* A unary operator may be accepted by the predicate, but it
+ is irrelevant for matching constraints. */
+- if (UNARY_P (op))
++ /* For special_memory_operand, there could be a memory operand inside,
++ and it would cause a mismatch for constraint_satisfied_p. */
++ if (UNARY_P (op) && op == extract_mem_from_operand (op))
+ op = XEXP (op, 0);
+
+ if (GET_CODE (op) == SUBREG)
+diff --git a/gcc/rtl.h b/gcc/rtl.h
+index b29afca8d..35fb6ba73 100644
+--- a/gcc/rtl.h
++++ b/gcc/rtl.h
+@@ -4323,6 +4323,7 @@ extern rtx gen_hard_reg_clobber (machine_mode, unsigned int);
+ extern rtx get_reg_known_value (unsigned int);
+ extern bool get_reg_known_equiv_p (unsigned int);
+ extern rtx get_reg_base_value (unsigned int);
++extern rtx extract_mem_from_operand (rtx);
+
+ #ifdef STACK_REGS
+ extern int stack_regs_mentioned (const_rtx insn);
+--
+2.27.0.windows.1
+
diff --git a/0044-Backport-ira-Fix-unnecessary-register-spill.patch b/0044-Backport-ira-Fix-unnecessary-register-spill.patch
new file mode 100644
index 0000000000000000000000000000000000000000..63103fe82dc20c9802fdd80623dd9bcb6fec48a0
--- /dev/null
+++ b/0044-Backport-ira-Fix-unnecessary-register-spill.patch
@@ -0,0 +1,73 @@
+From 95d8a6545bef39f5deff376c60c38e4e3c13c8f5 Mon Sep 17 00:00:00 2001
+From: zhaowenyu <804544223@qq.com>
+Date: Sat, 25 Jun 2022 00:45:24 +0800
+Subject: [PATCH 10/12] [Backport] ira: Fix unnecessary register spill
+
+Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=edf95e51e53697f3050f076675c26a4cece17741
+
+The variables first_moveable_pseudo and last_moveable_pseudo aren't reset after compiling a function,
+which means they leak into the first scheduler pass of the following function. In some cases, this
+can cause an extra spill during register allocation of the second function.
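+
+A minimal sketch of the failure mode (illustrative only, not GCC code;
+names assumed): per-function cursors kept in globals must be cleared
+once a function is compiled, which is what the one-line fix does at the
+end of move_unallocated_pseudos ():
+
+ static int first_moveable_pseudo, last_moveable_pseudo;
+
+ void finish_one_function (void)
+ {
+ /* Stale values would mark pseudos of the next function, which
+ reuses the same numbering, as moveable and can force a spill. */
+ first_moveable_pseudo = last_moveable_pseudo = 0;
+ }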
+---
+ gcc/ira.c                                  |  2 ++
+ gcc/testsuite/gcc.target/aarch64/nospill.c | 35 ++++++++++++++++++++++
+ 2 files changed, 37 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/nospill.c
+
+diff --git a/gcc/ira.c b/gcc/ira.c
+index 681ec2f46..77e4bb988 100644
+--- a/gcc/ira.c
++++ b/gcc/ira.c
+@@ -5130,6 +5130,8 @@ move_unallocated_pseudos (void)
+ 	      INSN_UID (newinsn), i);
+ 	  SET_REG_N_REFS (i, 0);
+ 	}
++
++  first_moveable_pseudo = last_moveable_pseudo = 0;
+ }
+
+ /* If the backend knows where to allocate pseudos for hard
+diff --git a/gcc/testsuite/gcc.target/aarch64/nospill.c b/gcc/testsuite/gcc.target/aarch64/nospill.c
+new file mode 100644
+index 000000000..968a4267e
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/nospill.c
+@@ -0,0 +1,35 @@
++/* { dg-do compile } */
++/* { dg-options "-O3" } */
++
++/* The pseudo for P is marked as moveable in the IRA pass.  */
++float
++func_0 (float a, float b, float c)
++{
++  float p = c / a;
++
++  if (b > 1)
++    {
++      b /= p;
++      if (c > 2)
++	a /= 3;
++    }
++
++  return b / c * a;
++}
++
++/* If first_moveable_pseudo and last_moveable_pseudo are not reset correctly,
++   they will carry over and spill the pseudo for Q.  */
++float
++func_1 (float a, float b, float c)
++{
++  float q = a + b;
++
++  c *= a / (b + b);
++  if (a > 0)
++    c *= q;
++
++  return a * b * c;
++}
++
++/* We have plenty of spare registers, so check nothing has been spilled.  */
++/* { dg-final { scan-assembler-not "\tstr\t" } } */
+--
+2.27.0.windows.1
+
diff --git a/0045-Transposed-SLP-Enable-Transposed-SLP.patch b/0045-Transposed-SLP-Enable-Transposed-SLP.patch
new file mode 100644
index 0000000000000000000000000000000000000000..6323e8d915b6e25cbc3b1e905930dabc5683da80
--- /dev/null
+++ b/0045-Transposed-SLP-Enable-Transposed-SLP.patch
@@ -0,0 +1,3009 @@
+From 639b5248cbab1806618545fc30215ed9d1a019e7 Mon Sep 17 00:00:00 2001
+From: luohailing
+Date: Fri, 17 Jun 2022 22:38:55 +0800
+Subject: [PATCH 11/12] [Transposed SLP] Enable Transposed SLP
+
+Enable Transposed SLP when memory accesses are not contiguous, with
+-ftree-slp-transpose-vectorize.
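+
+The shape of kernel this targets looks like the following sketch (the
+shipped transpose-*.c tests below exercise the same pattern); each loop
+iteration stores one element into several distinct arrays, so the store
+group is "transposed" relative to memory order and ordinary basic-block
+SLP cannot treat it as one contiguous access:
+
+  void
+  kernel (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2,
+          unsigned c0[4], unsigned c1[4], unsigned c2[4], unsigned c3[4])
+  {
+    for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+      {
+        c0[i] = pix1[0] - pix2[0];
+        c1[i] = pix1[1] - pix2[1];
+        c2[i] = pix1[2] - pix2[2];
+        c3[i] = pix1[3] - pix2[3];
+      }
+  }
+
+With -ftree-slp-transpose-vectorize the grouped stores c0[i] .. c3[i] are
+merged into one group and transposed in registers, so that both the strided
+loads and the per-array stores become vectorizable.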
+ +--- + gcc/common.opt | 4 + + gcc/testsuite/gcc.dg/vect/transpose-1.c | 53 ++ + gcc/testsuite/gcc.dg/vect/transpose-2.c | 50 ++ + gcc/testsuite/gcc.dg/vect/transpose-3.c | 54 ++ + gcc/testsuite/gcc.dg/vect/transpose-4.c | 53 ++ + gcc/testsuite/gcc.dg/vect/transpose-5.c | 73 ++ + gcc/testsuite/gcc.dg/vect/transpose-6.c | 67 ++ + gcc/testsuite/gcc.dg/vect/transpose-7.c | 53 ++ + gcc/testsuite/gcc.dg/vect/transpose-8.c | 53 ++ + gcc/testsuite/gcc.dg/vect/vect.exp | 7 + + gcc/tree-vect-data-refs.c | 236 +++++ + gcc/tree-vect-slp.c | 1090 ++++++++++++++++++++++- + gcc/tree-vect-stmts.c | 763 +++++++++++++++- + gcc/tree-vectorizer.h | 89 ++ + 14 files changed, 2641 insertions(+), 4 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 24834cf60..d38401b71 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -3049,6 +3049,10 @@ ftree-vect-analyze-slp-group + Common Report Var(flag_tree_slp_group) Init(0) + Disable SLP vectorization for reduction chain on tree. + ++ftree-slp-transpose-vectorize ++Common Report Var(flag_tree_slp_transpose_vectorize) Optimization Init(0) ++Enable basic block vectorization (SLP) for transposed stores and loads on trees. ++ + fvect-cost-model= + Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization + -fvect-cost-model=[unlimited|dynamic|cheap] Specifies the cost model for vectorization. 
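+
+The following standalone C program (not part of the patch; it only mirrors
+the scheme) demonstrates the two-stage permutation that the new function
+vect_transpose_store_chain, added later in this patch, generates.  The
+constants match the worked example in that function's comment: LENGTH = 4
+vectors of NELT = 8 elements, ARRAY_NUM = 4.  Running it checks that the
+first-stage "grouping" masks plus the second-stage "interleaving" masks
+really produce the transposed output:
+
+  #include <assert.h>
+  #include <string.h>
+
+  #define NELT 8
+  #define LENGTH 4
+  #define ARRAY_NUM 4
+  #define GROUP_NUM (NELT / ARRAY_NUM)
+
+  /* res = VEC_PERM_EXPR <a, b, sel>: sel[i] < NELT picks from a,
+     otherwise from b.  */
+  static void
+  vec_perm (const int *a, const int *b, const int *sel, int *res)
+  {
+    for (int i = 0; i < NELT; i++)
+      res[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
+  }
+
+  int
+  main (void)
+  {
+    int dr[LENGTH][NELT], res[LENGTH][NELT];
+    int hi1[NELT], lo1[NELT], hi[NELT], lo[NELT];
+    int i = 0;
+
+    /* First-stage (grouping) masks, as in vect_indices_encoding_first.  */
+    for (int a = 0; a < ARRAY_NUM / 2; a++)
+      for (int g = 0; g < GROUP_NUM * 2; g++)
+        hi1[i++] = a + ARRAY_NUM * g;
+    i = 0;
+    for (int a = ARRAY_NUM / 2; a < ARRAY_NUM; a++)
+      for (int g = 0; g < GROUP_NUM * 2; g++)
+        lo1[i++] = a + ARRAY_NUM * g;
+
+    /* Following-stage (interleaving) masks, as in vect_indices_encoding.  */
+    i = 0;
+    for (int a = 0; a < ARRAY_NUM / 2; a++)
+      {
+        for (int g = 0; g < GROUP_NUM; g++)
+          hi[i++] = g + GROUP_NUM * a;
+        for (int g = 0; g < GROUP_NUM; g++)
+          hi[i++] = NELT + g + GROUP_NUM * a;
+      }
+    i = 0;
+    for (int a = ARRAY_NUM / 2; a < ARRAY_NUM; a++)
+      {
+        for (int g = 0; g < GROUP_NUM; g++)
+          lo[i++] = g + GROUP_NUM * a;
+        for (int g = 0; g < GROUP_NUM; g++)
+          lo[i++] = NELT + g + GROUP_NUM * a;
+      }
+
+    /* Input: vector K holds elements K*NELT .. K*NELT+7.  */
+    for (int k = 0; k < LENGTH; k++)
+      for (int e = 0; e < NELT; e++)
+        dr[k][e] = k * NELT + e;
+
+    /* log2 (LENGTH) = 2 permute stages, exactly as in the new function.  */
+    for (int stage = 0; stage < 2; stage++)
+      {
+        for (int k = 0; k < LENGTH / 2; k++)
+          {
+            vec_perm (dr[k], dr[k + LENGTH / 2],
+                      stage == 0 ? hi1 : hi, res[2 * k]);
+            vec_perm (dr[k], dr[k + LENGTH / 2],
+                      stage == 0 ? lo1 : lo, res[2 * k + 1]);
+          }
+        memcpy (dr, res, sizeof (dr));
+      }
+
+    /* Transposed result: vector K now holds K, K+4, K+8, ..., K+28.  */
+    for (int k = 0; k < LENGTH; k++)
+      for (int e = 0; e < NELT; e++)
+        assert (dr[k][e] == k + e * LENGTH);
+
+    return 0;
+  }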
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c
+new file mode 100644
+index 000000000..8237a8b9e
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++      c4[i] = pix1[4] - pix2[4];
++      c5[i] = pix1[5] - pix2[5];
++      c6[i] = pix1[6] - pix2[6];
++      c7[i] = pix1[7] - pix2[7];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 16;
++  int i2 = 8;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 2;
++      input2[i] = i;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 1264)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c
+new file mode 100644
+index 000000000..b01a0410e
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c
+@@ -0,0 +1,50 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 8
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 5;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 4;
++      input2[i] = i * 2;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 1440)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c
+new file mode 100644
+index 000000000..529581c59
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c
+@@ -0,0 +1,54 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++      c4[i] = pix1[4] - pix2[4];
++      c5[i] = pix1[5] - pix2[5];
++      c6[i] = pix1[6] - pix2[6];
++      c7[i] = pix1[7] - pix2[7];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned short input1[M];
++  unsigned short input2[M];
++  int i1 = 8;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 4;
++      input2[i] = i;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 1680)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c
+new file mode 100644
+index 000000000..0b4adea9b
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++      c4[i] = pix1[4] - pix2[4];
++      c5[i] = pix1[5] - pix2[5];
++      c6[i] = pix1[6] - pix2[6];
++      c7[i] = pix1[7] - pix2[7];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned input1[M];
++  unsigned input2[M];
++  int i1 = 12;
++  int i2 = 6;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 7;
++      input2[i] = i * 3;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 3616)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c
+new file mode 100644
+index 000000000..81a248840
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c
+@@ -0,0 +1,73 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include <math.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++#define eps 1e-8
++
++double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  unsigned a0[N];
++  unsigned a1[N];
++  unsigned a2[N];
++  unsigned a3[N];
++
++  int b0[N];
++  int b1[N];
++  int b2[N];
++  int b3[N];
++
++  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16);
++      a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16);
++      a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16);
++      a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16);
++    }
++
++  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]);
++      b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]);
++      b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]);
++      b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]);
++    }
++
++  double sum = 0;
++  for (int i = 0; i < N; i++)
++    {
++      sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 8;
++  int i2 = 3;
++  unsigned char m = 2;
++  unsigned short n = 12;
++  float t = 3.0;
++  double k = 4.2;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 6;
++      input2[i] = i * 3;
++    }
++  double sum = foo (input1, i1, input2, i2);
++  if (fabs (sum - 78648144) > eps)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c
+new file mode 100644
+index 000000000..3e134ac02
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c
+@@ -0,0 +1,67 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_float } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include <math.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++#define eps 1e-8
++
++float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  unsigned a0[N];
++  unsigned a1[N];
++  unsigned a2[N];
++  unsigned a3[N];
++
++  float c0[N];
++  float c1[N];
++  float c2[N];
++  float c3[N];
++
++  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
++      a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
++      a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
++      a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
++
++      c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]);
++      c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]);
++      c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]);
++      c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]);
++    }
++
++  float sum = 0;
++  for (int i = 0; i < N; i++)
++    {
++      sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 18;
++  int i2 = 6;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 4;
++      input2[i] = i * 2;
++    }
++  float sum = foo (input1, i1, input2, i2);
++  if (fabs (sum - 106041168) > eps)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c
+new file mode 100644
+index 000000000..2074d9aa8
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 16
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned char c0[N], c1[N];
++  for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 6;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 5;
++      input2[i] = i * 2;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 3280)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c
+new file mode 100644
+index 000000000..a154f012a
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdarg.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 32
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned char c0[N], c1[N];
++  for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 6;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 5;
++      input2[i] = i * 2;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 7584)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
+index efe17ac6f..d92e1ba5b 100644
+--- a/gcc/testsuite/gcc.dg/vect/vect.exp
++++ b/gcc/testsuite/gcc.dg/vect/vect.exp
+@@ -114,6 +114,13 @@ et-dg-runtest dg-runtest [lsort \
+ 	[glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
+ 	"" $DEFAULT_VECTCFLAGS
+ 
++# -ftree-slp-transpose-vectorize SLP tests
++set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
++lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize"
++et-dg-runtest dg-runtest [lsort \
++	[glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \
++	"" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3"
++
+ # -ffast-math tests
+ set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+ lappend DEFAULT_VECTCFLAGS "-ffast-math"
+diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
+index fcc0726bd..d78b06455 100644
+--- a/gcc/tree-vect-data-refs.c
++++ b/gcc/tree-vect-data-refs.c
+@@ -2647,6 +2647,9 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info)
+ 	DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
+ 
+       DR_GROUP_SIZE (stmt_info) = groupsize;
++
++      DR_GROUP_SLP_TRANSPOSE (stmt_info) = false;
++
+       if (dump_enabled_p ())
+ 	{
+ 	  dump_printf_loc (MSG_NOTE, vect_location,
+@@ -2676,6 +2679,20 @@
+ 			 DR_GROUP_GAP (stmt_info));
+ 	}
+ 
++      /* SLP: create an SLP data structure for every interleaving group of
++	 loads for further analysis in vect_analyse_slp.  */
++      if (DR_IS_READ (dr) && !slp_impossible)
++	{
++	  if (loop_vinfo)
++	    {
++	      LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info);
++	    }
++	  if (bb_vinfo)
++	    {
++	      BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info);
++	    }
++	}
++
+       /* SLP: create an SLP data structure for every interleaving group of
+ 	 stores for further analysis in vect_analyse_slp.  */
+       if (DR_IS_WRITE (dr) && !slp_impossible)
+@@ -5413,6 +5430,225 @@ vect_permute_store_chain (vec<tree> dr_chain,
+     }
+ }
+ 
++/* Encoding the PERM_MASK_FIRST.  */
++
++static void
++vect_indices_encoding_first (tree vectype, unsigned int array_num,
++			     tree &perm_mask_high_first,
++			     tree &perm_mask_low_first)
++{
++  unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
++  vec_perm_builder sel (nelt, nelt, 1);
++  sel.quick_grow (nelt);
++  unsigned int group_num = nelt / array_num;
++  unsigned int index = 0;
++  unsigned int array = 0;
++  unsigned int group = 0;
++
++  /* The encoding has 1 pattern in the first stage.  */
++  for (array = 0; array < array_num / 2; array++)
++    {
++      for (group = 0; group < group_num * 2; group++)
++	{
++	  sel[index++] = array + array_num * group;
++	}
++    }
++  vec_perm_indices indices (sel, 2, nelt);
++  perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices);
++
++  index = 0;
++  for (array = array_num / 2; array < array_num; array++)
++    {
++      for (group = 0; group < group_num * 2; group++)
++	{
++	  sel[index++] = array + array_num * group;
++	}
++    }
++  indices.new_vector (sel, 2, nelt);
++  perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices);
++}
++
++/* Encoding the PERM_MASK.  */
++
++static void
++vect_indices_encoding (tree vectype, unsigned int array_num,
++		       tree &perm_mask_high, tree &perm_mask_low)
++{
++  unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
++  vec_perm_builder sel (nelt, nelt, 1);
++  sel.quick_grow (nelt);
++  unsigned int group_num = nelt / array_num;
++  unsigned int index = 0;
++  unsigned int array = 0;
++  unsigned int group = 0;
++
++  /* The encoding has 2 patterns in the following stages.  */
++  for (array = 0; array < array_num / 2; array++)
++    {
++      for (group = 0; group < group_num; group++)
++	{
++	  sel[index++] = group + group_num * array;
++	}
++      for (group = 0; group < group_num; group++)
++	{
++	  sel[index++] = nelt + group + group_num * array;
++	}
++    }
++  vec_perm_indices indices (sel, 2, nelt);
++  perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
++
++  index = 0;
++  for (array = array_num / 2; array < array_num; array++)
++    {
++      for (group = 0; group < group_num; group++)
++	{
++	  sel[index++] = group + group_num * array;
++	}
++      for (group = 0; group < group_num; group++)
++	{
++	  sel[index++] = nelt + group + group_num * array;
++	}
++    }
++  indices.new_vector (sel, 2, nelt);
++  perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
++}
++
++/* Function vect_transpose_store_chain.
++
++   Given a chain of interleaved stores in DR_CHAIN, where LENGTH and
++   ARRAY_NUM must be powers of 2, generate interleave_high/low stmts to
++   reorder the data correctly for the stores.  Return the final references
++   for stores in RESULT_CHAIN.  This function is similar to
++   vect_permute_store_chain (); we interleave the contents of the vectors
++   in their order.
++
++   E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM
++   is 4.  That is, the input is 4 vectors each containing 8 elements.
++   And 2 (VF / ARRAY_NUM) of 8 elements come from the same array.
++   We interleave the contents of the four vectors in their order.  We assign
++   a number to each element, the input sequence is:
++
++   1st vec:   0  1  2  3  4  5  6  7
++   2nd vec:   8  9 10 11 12 13 14 15
++   3rd vec:  16 17 18 19 20 21 22 23
++   4th vec:  24 25 26 27 28 29 30 31
++
++   The output sequence should be:
++
++   1st vec:  0  4  8 12 16 20 24 28
++   2nd vec:  1  5  9 13 17 21 25 29
++   3rd vec:  2  6 10 14 18 22 26 30
++   4th vec:  3  7 11 15 19 23 27 31
++
++   In our example,
++   we get 2 (VF / ARRAY_NUM) elements together in every vector.
++
++   I1:  0  4  1  5  2  6  3  7
++   I2:  8 12  9 13 10 14 11 15
++   I3: 16 20 17 21 18 22 19 23
++   I4: 24 28 25 29 26 30 27 31
++
++   Then, we use interleave_high/low instructions to create such output.
++   Every 2 (VF / ARRAY_NUM) elements are regarded as a whole.  The
++   permutation is done in log LENGTH stages.
++
++   I1: interleave_high (1st vec, 3rd vec)
++   I2: interleave_low  (1st vec, 3rd vec)
++   I3: interleave_high (2nd vec, 4th vec)
++   I4: interleave_low  (2nd vec, 4th vec)
++
++   The first stage of the sequence should be:
++
++   I1:  0  4 16 20  1  5 17 21
++   I2:  2  6 18 22  3  7 19 23
++   I3:  8 12 24 28  9 13 25 29
++   I4: 10 14 26 30 11 15 27 31
++
++   The following stage sequence should be, i.e. the final result is:
++
++   I1: 0 4  8 12 16 20 24 28
++   I2: 1 5  9 13 17 21 25 29
++   I3: 2 6 10 14 18 22 26 30
++   I4: 3 7 11 15 19 23 27 31.  */
++
++void
++vect_transpose_store_chain (vec<tree> dr_chain, unsigned int length,
++			    unsigned int array_num, stmt_vec_info stmt_info,
++			    gimple_stmt_iterator *gsi, vec<tree> *result_chain)
++{
++  gimple *perm_stmt = NULL;
++  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
++  tree perm_mask_low_first = NULL;
++  tree perm_mask_high_first = NULL;
++  tree perm_mask_low = NULL;
++  tree perm_mask_high = NULL;
++  unsigned int log_length = exact_log2 (length);
++
++  /* Only power of 2 is supported.  */
++  gcc_assert (pow2p_hwi (length));
++
++  /* The encoding has 2 types, one for the grouped pattern in the first stage,
++     another for the interleaved patterns in the following stages.  */
++  gcc_assert (array_num != 0);
++
++  /* Create grouped stmt (in the first stage):
++     group = nelt / array_num;
++     high_first = VEC_PERM_EXPR
++     low_first = VEC_PERM_EXPR  */
++  vect_indices_encoding_first (vectype, array_num, perm_mask_high_first,
++			       perm_mask_low_first);
++
++  /* Create interleaving stmt (in the following stages):
++     high = VEC_PERM_EXPR
++     low = VEC_PERM_EXPR  */
++  vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low);
++
++  for (unsigned int perm_time = 0; perm_time < log_length; perm_time++)
++    {
++      for (unsigned int index = 0; index < length / 2; index++)
++	{
++	  tree vect1 = dr_chain[index];
++	  tree vect2 = dr_chain[index + length / 2];
++
++	  tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
++	  perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2,
++					   perm_time == 0 ? perm_mask_high_first
++							  : perm_mask_high);
++	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
++	  (*result_chain)[2 * index] = high;
++
++	  tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
++	  perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2,
++					   perm_time == 0 ?
perm_mask_low_first
++							  : perm_mask_low);
++	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
++	  (*result_chain)[2 * index + 1] = low;
++	}
++      memcpy (dr_chain.address (), result_chain->address (),
++	      length * sizeof (tree));
++    }
++}
++
+ /* Function vect_setup_realignment
+ 
+    This function is called when vectorizing an unaligned load using
+diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
+index 476b32370..d30463b96 100644
+--- a/gcc/tree-vect-slp.c
++++ b/gcc/tree-vect-slp.c
+@@ -2414,11 +2414,13 @@ vect_analyze_slp_instance (vec_info *vinfo,
+ 
+   /* For basic block SLP, try to break the group up into multiples of the
+      vector size.  */
++  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
+   unsigned HOST_WIDE_INT const_nunits;
+   if (is_a <bb_vec_info> (vinfo)
+       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+       && DR_GROUP_FIRST_ELEMENT (stmt_info)
+-      && nunits.is_constant (&const_nunits))
++      && nunits.is_constant (&const_nunits)
++      && !bb_vinfo->transposed)
+     {
+       /* We consider breaking the group only on VF boundaries from the existing
+ 	 start.  */
+@@ -2455,6 +2457,898 @@
+   return false;
+ }
+ 
++static inline bool
++is_const_assign (stmt_vec_info store_elem)
++{
++  if (store_elem == NULL)
++    {
++      gcc_unreachable ();
++    }
++  gimple *stmt = store_elem->stmt;
++  gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt);
++  return rhs_class == GIMPLE_SINGLE_RHS
++	 && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt));
++}
++
++/* Push inits to INNERMOST_INITS and check const assign.  */
++
++static bool
++record_innermost (vec<tree> &innermost_inits,
++		  vec<tree> &innermost_offsets,
++		  stmt_vec_info stmt_vinfo)
++{
++  if (!stmt_vinfo)
++    {
++      return false;
++    }
++  stmt_vec_info next_info = stmt_vinfo;
++  while (next_info)
++    {
++      /* No need to vectorize constant assign in a transposed version.  */
++      if (is_const_assign (next_info))
++	{
++	  if (dump_enabled_p ())
++	    {
++	      dump_printf_loc (MSG_NOTE, vect_location,
++			       "no need to vectorize, store is const assign: %G",
++			       next_info->stmt);
++	    }
++	  return false;
++	}
++      innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info));
++      innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info));
++      next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++    }
++  return true;
++}
++
++/* Compare inits to INNERMOST_INITS, return FALSE if inits do not match
++   the first grouped_store.  And check const assign meanwhile.  */
++
++static bool
++compare_innermost (const vec<tree> &innermost_inits,
++		   const vec<tree> &innermost_offsets,
++		   stmt_vec_info stmt_vinfo)
++{
++  if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size)
++    {
++      return false;
++    }
++  stmt_vec_info next_info = stmt_vinfo;
++  unsigned int i = 0;
++  while (next_info)
++    {
++      if (is_const_assign (next_info))
++	{
++	  if (dump_enabled_p ())
++	    {
++	      dump_printf_loc (MSG_NOTE, vect_location,
++			       "no need to vectorize, store is const "
++			       "assign: %G", next_info->stmt);
++	    }
++	  return false;
++	}
++      if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info)
++	  || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info))
++	{
++	  return false;
++	}
++      next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++      i++;
++    }
++  return true;
++}
++
++/* Check if grouped stores are of the same type.
++   input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt))
++   output: 0 if same, 1 or -1 otherwise.  */
++
++static int
++tree_type_cmp (const tree t1, const tree t2)
++{
++  gcc_checking_assert (t1 != NULL && t2 != NULL);
++  if (t1 != t2)
++    {
++      if (TREE_CODE (t1) != TREE_CODE (t2))
++	{
++	  return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1;
++	}
++      if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2))
++	{
++	  return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1;
++	}
++      if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2))
++	{
++	  return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1;
++	}
++    }
++  return 0;
++}
++
++/* Check if two grouped stores are of the same type, so that we can
++   analyze them in one transpose group.  */
++static int
++check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2)
++{
++  if (grp1 == grp2)
++    {
++      return 0;
++    }
++  if (grp1->size != grp2->size)
++    {
++      return grp1->size > grp2->size ? 1 : -1;
++    }
++  tree lhs1 = gimple_assign_lhs (grp1->stmt);
++  tree lhs2 = gimple_assign_lhs (grp2->stmt);
++  if (TREE_CODE (lhs1) != TREE_CODE (lhs2))
++    {
++      return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1;
++    }
++  tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt));
++  tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt));
++  int cmp = tree_type_cmp (grp_type1, grp_type2);
++  return cmp;
++}
++
++/* Sort grouped stores according to group_size and store_type.
++   output: 0 if same, 1 if grp1 > grp2, -1 otherwise.  */
++
++static int
++grouped_store_cmp (const void *grp1_, const void *grp2_)
++{
++  stmt_vec_info grp1 = *(stmt_vec_info *) const_cast<void *> (grp1_);
++  stmt_vec_info grp2 = *(stmt_vec_info *) const_cast<void *> (grp2_);
++  return check_same_store_type (grp1, grp2);
++}
++
++/* Transposing is based on permutation in registers.  Permutation requires
++   vector length being power of 2 and satisfying the vector mode.  */
++
++static inline bool
++check_filling_reg (stmt_vec_info current_element)
++{
++  if (current_element->size == 0)
++    {
++      return false;
++    }
++  /* If the gimple STMT was already vectorized in vect pass, it's unable to
++     conduct transpose analysis, skip it.  */
++  bool lhs_vectorized
++    = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt)))
++      == VECTOR_TYPE;
++  bool rhs_vectorized
++    = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt)))
++      == VECTOR_TYPE;
++  if (lhs_vectorized || rhs_vectorized)
++    {
++      return false;
++    }
++  unsigned int store_precision
++    = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt)));
++  auto_vector_modes vector_modes;
++  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
++  unsigned min_mode_size = -1u;
++  for (unsigned i = 0; i < vector_modes.length (); i++)
++    {
++      unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0];
++      min_mode_size = mode_bit_size < min_mode_size
++		      ? mode_bit_size : min_mode_size;
++    }
++  return store_precision != 0
++	 && pow2p_hwi (current_element->size)
++	 && (current_element->size * store_precision % min_mode_size == 0);
++}
++
++/* Check if previous groups are suitable to transpose, if not, set their
++   group number to -1, reduce grp_num and clear current_groups.
++   Otherwise, just clear current_groups.  */
++
++static void
++check_and_clear_groups (vec<stmt_vec_info> current_groups,
++			unsigned int &grp_num)
++{
++  stmt_vec_info first_element;
++  if (current_groups.length () == 1
++      || (current_groups.length () != 0
++	  && !pow2p_hwi (current_groups.length ())))
++    {
++      while (current_groups.length () != 0)
++	{
++	  first_element = current_groups.pop ();
++	  first_element->group_number = -1;
++	}
++      grp_num--;
++    }
++  else
++    {
++      while (current_groups.length ())
++	{
++	  current_groups.pop ();
++	}
++    }
++}
++
++
++/* Make sure that transpose slp vectorization is conducted only if grouped
++   stores are one-dimensional array refs.  */
++
++static bool
++is_store_one_dim_array (gimple *stmt)
++{
++  tree op = gimple_get_lhs (stmt);
++  if (TREE_CODE (op) != ARRAY_REF)
++    return false;
++  return TREE_OPERAND_LENGTH (op) > 0
++	 && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0;
++}
++
++/* Set grouped_stores with similar MEM_REF to the same group and mark their
++   grp_num.  Groups with the same grp_num form the minimum unit for
++   transpose analysis.  Return the number of such units.  */
++
++static unsigned
++vect_prepare_transpose (bb_vec_info bb_vinfo)
++{
++  stmt_vec_info current_element = NULL;
++  stmt_vec_info first_element = NULL;
++  unsigned int i = 0;
++  unsigned int grp_num = 0;
++  /* Use arrays to record MEM_REF data in different GROUPED_STORES.  */
++  auto_vec<tree> innermost_inits;
++  auto_vec<tree> innermost_offsets;
++
++  /* A set of stmt_vec_info with same store type.  Analyze them if their size
++     is suitable to transpose.  */
++  auto_vec<stmt_vec_info> current_groups;
++
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element)
++    {
++      /* Compare current grouped_store to the first one if first_element exists,
++	 push current_element to current_groups if they are similar on innermost
++	 behavior of MEM_REF.  */
++      if (first_element != NULL
++	  && !check_same_store_type (first_element, current_element)
++	  && compare_innermost (innermost_inits, innermost_offsets,
++				current_element))
++	{
++	  current_groups.safe_push (current_element);
++	  current_element->group_number = grp_num;
++	  /* If current_element is the last element in grouped_stores, continue
++	     will exit the loop and leave the last group unanalyzed.  */
++	  if (i == bb_vinfo->grouped_stores.length () - 1)
++	    {
++	      check_and_clear_groups (current_groups, grp_num);
++	    }
++	  continue;
++	}
++      check_and_clear_groups (current_groups, grp_num);
++      innermost_inits.release ();
++      innermost_offsets.release ();
++      /* Beginning of a new group to analyze whether they are able to form
++	 a unit for transpose analysis.  */
++      first_element = NULL;
++      if (is_store_one_dim_array (current_element->stmt)
++	  && check_filling_reg (current_element)
++	  && record_innermost (innermost_inits, innermost_offsets,
++			       current_element))
++	{
++	  first_element = current_element;
++	  current_groups.safe_push (current_element);
++	  current_element->group_number = ++grp_num;
++	  if (i == bb_vinfo->grouped_stores.length () - 1)
++	    {
++	      check_and_clear_groups (current_groups, grp_num);
++	    }
++	  continue;
++	}
++      current_element->group_number = -1;
++    }
++  return grp_num;
++}
++
++/* Return a flag to transpose grouped stores before building slp tree.
++   Add bool may_transpose in class vec_info.  */
++
++static bool
++vect_may_transpose (bb_vec_info bb_vinfo)
++{
++  if (targetm.vectorize.vec_perm_const == NULL)
++    {
++      return false;
++    }
++  if (bb_vinfo->grouped_stores.length () < 2)
++    {
++      return false;
++    }
++  DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp");
++  /* Sort grouped_stores according to size and type for function
++     vect_prepare_transpose ().  */
++  bb_vinfo->grouped_stores.qsort (grouped_store_cmp);
++
++  int groups = vect_prepare_transpose (bb_vinfo);
++  BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups;
++  if (dump_enabled_p ())
++    dump_printf_loc (MSG_NOTE, vect_location,
++		     "%d groups to analyze transposed slp.\n", groups);
++  return groups != 0;
++}
++
++/* Get the base address of STMT_INFO.  */
++
++static tree
++get_op_base_address (stmt_vec_info stmt_info)
++{
++  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
++  tree op = DR_BASE_ADDRESS (dr);
++  while (TREE_OPERAND_LENGTH (op) > 0)
++    {
++      op = TREE_OPERAND (op, 0);
++    }
++  return op;
++}
++
++/* Compare the UID of the two stmt_info STMTINFO_A and STMTINFO_B.
++   Sorting them in ascending order.  */
++
++static int
++dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_)
++{
++  stmt_vec_info stmtinfo_a
++    = *(stmt_vec_info *) const_cast<void *> (stmtinfo_a_);
++  stmt_vec_info stmtinfo_b
++    = *(stmt_vec_info *) const_cast<void *> (stmtinfo_b_);
++
++  /* Stabilize sort.  */
++  if (stmtinfo_a == stmtinfo_b)
++    {
++      return 0;
++    }
++  return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1;
++}
++
++/* Find the first elements of the grouped loads which are required to merge.  */
++
++static void
++vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited,
++			    vec<stmt_vec_info> &res)
++{
++  unsigned int i = 0;
++  stmt_vec_info merge_first_element = NULL;
++  stmt_vec_info first_element = NULL;
++  tree opa = NULL;
++  unsigned int grp_size_a = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element)
++    {
++      if (visited[i])
++	{
++	  continue;
++	}
++      if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++	  || !pow2p_hwi (DR_GROUP_SIZE (first_element)))
++	{
++	  /* Non-conforming grouped load should be grouped separately.  */
++	  if (merge_first_element == NULL)
++	    {
++	      visited[i] = true;
++	      res.safe_push (first_element);
++	      return;
++	    }
++	}
++      if (merge_first_element == NULL)
++	{
++	  merge_first_element = first_element;
++	  opa = get_op_base_address (first_element);
++	  grp_size_a = DR_GROUP_SIZE (first_element);
++	  res.safe_push (first_element);
++	  visited[i] = true;
++	  continue;
++	}
++
++      /* If the two first elements are of the same base address and group size,
++	 these two grouped loads need to be merged.  */
++      tree opb = get_op_base_address (first_element);
++      unsigned int grp_size_b = DR_GROUP_SIZE (first_element);
++      if (opa == opb && grp_size_a == grp_size_b)
++	{
++	  res.safe_push (first_element);
++	  visited[i] = true;
++	}
++    }
++}
++
++/* Merge the grouped loads that are found from
++   vect_slp_grouped_load_find ().  */
++
++static stmt_vec_info
++vect_slp_grouped_load_merge (vec<stmt_vec_info> res)
++{
++  stmt_vec_info stmt_info = res[0];
++  if (res.length () == 1)
++    {
++      return stmt_info;
++    }
++  unsigned int i = 0;
++  unsigned int size = DR_GROUP_SIZE (res[0]);
++  unsigned int new_group_size = size * res.length ();
++  stmt_vec_info first_element = NULL;
++  stmt_vec_info merge_first_element = NULL;
++  stmt_vec_info last_element = NULL;
++  FOR_EACH_VEC_ELT (res, i, first_element)
++    {
++      if (merge_first_element == NULL)
++	{
++	  merge_first_element = first_element;
++	  last_element = merge_first_element;
++	  size = DR_GROUP_SIZE (merge_first_element);
++	}
++
++      if (last_element != first_element
++	  && !DR_GROUP_NEXT_ELEMENT (last_element))
++	{
++	  DR_GROUP_NEXT_ELEMENT (last_element) = first_element;
++	  /* Store the gap from the previous member of the group.  If there is
++	     no gap in the access, DR_GROUP_GAP is always 1.  */
++	  DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element);
++	  DR_GROUP_GAP (first_element) = 1;
++	}
++      for (stmt_info = first_element; stmt_info;
++	   stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
++	{
++	  DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element;
++	  DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
++	  DR_GROUP_SIZE (stmt_info) = new_group_size;
++	  last_element = stmt_info;
++	}
++    }
++  DR_GROUP_SIZE (merge_first_element) = new_group_size;
++  DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true;
++  DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
++  return merge_first_element;
++}
++
++/* Merge the grouped loads which have the same base address and group size.
++   For example, for grouped loads (opa_1, opa_2, opb_1, opb_2):
++     opa_1: a0->a1->a2->a3
++     opa_2: a8->a9->a10->a11
++     opb_1: b0->b1
++     opb_2: b16->b17
++   we can probably get two merged grouped loads:
++     opa: a0->a1->a2->a3->a8->a9->a10->a11
++     opb: b0->b1->b16->b17.  */
++
++static bool
++vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo)
++{
++  if (bb_vinfo->grouped_loads.length () <= 0)
++    {
++      if (dump_enabled_p ())
++	{
++	  dump_printf_loc (MSG_NOTE, vect_location,
++			   "The number of grouped loads is 0.\n");
++	}
++      return false;
++    }
++  bb_vinfo->grouped_loads.qsort (dr_group_cmp);
++  auto_vec<bool> visited (bb_vinfo->grouped_loads.length ());
++  auto_vec<stmt_vec_info> grouped_loads_merge;
++  for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++)
++    {
++      visited.safe_push (false);
++    }
++  while (1)
++    {
++      /* Find grouped loads which are required to merge.  */
++      auto_vec<stmt_vec_info> res;
++      vect_slp_grouped_load_find (bb_vinfo, visited, res);
++      if (res.is_empty ())
++	{
++	  break;
++	}
++      /* Merge the required grouped loads into one group.  */
++      grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res));
++    }
++  if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ())
++    {
++      if (dump_enabled_p ())
++	{
++	  dump_printf_loc (MSG_NOTE, vect_location,
++			   "No grouped loads need to be merged.\n");
++	}
++      return false;
++    }
++  if (dump_enabled_p ())
++    {
++      dump_printf_loc (MSG_NOTE, vect_location,
++		       "Merging grouped loads successfully.\n");
++    }
++  BB_VINFO_GROUPED_LOADS (bb_vinfo).release ();
++  for (unsigned int i = 0; i < grouped_loads_merge.length (); i++)
++    {
++      BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]);
++    }
++  return true;
++}
++
++/* Find the first elements of the grouped stores
++   which are required to transpose and merge.  */
++
++static void
++vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec<bool> &visited,
++			     vec<stmt_vec_info> &res)
++{
++  stmt_vec_info first_element = NULL;
++  stmt_vec_info merge_first_element = NULL;
++  unsigned int k = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
++    {
++      if (visited[k])
++	{
++	  continue;
++	}
++      /* Non-conforming grouped store should be grouped separately.  */
++      if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++	  || first_element->group_number == -1)
++	{
++	  if (merge_first_element == NULL)
++	    {
++	      visited[k] = true;
++	      res.safe_push (first_element);
++	      return;
++	    }
++	}
++      if (first_element->group_number != -1
++	  && merge_first_element == NULL)
++	{
++	  merge_first_element = first_element;
++	}
++      if (merge_first_element->group_number == first_element->group_number)
++	{
++	  visited[k] = true;
++	  res.safe_push (first_element);
++	}
++    }
++}
++
++/* Transpose and merge the grouped stores that are found from
++   vect_slp_grouped_store_find ().  */
++
++static stmt_vec_info
++vect_slp_grouped_store_transform (vec<stmt_vec_info> res)
++{
++  stmt_vec_info stmt_info = res[0];
++  if (res.length () == 1)
++    {
++      return stmt_info;
++    }
++  stmt_vec_info rearrange_first_element = stmt_info;
++  stmt_vec_info last_element = rearrange_first_element;
++
++  unsigned int size = DR_GROUP_SIZE (rearrange_first_element);
++  unsigned int new_group_size = size * res.length ();
++  for (unsigned int i = 1; i < res.length (); i++)
++    {
++      /* Store the gap from the previous member of the group.  If there is no
++	 gap in the access, DR_GROUP_GAP is always 1.  */
++      DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]);
++      DR_GROUP_GAP (res[i]) = 1;
++    }
++  while (!res.is_empty ())
++    {
++      stmt_info = res[0];
++      res.ordered_remove (0);
++      if (DR_GROUP_NEXT_ELEMENT (stmt_info))
++	{
++	  res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info));
++	}
++      DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element;
++      DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info;
++      DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
++      DR_GROUP_SIZE (stmt_info) = new_group_size;
++      last_element = stmt_info;
++    }
++
++  DR_GROUP_SIZE (rearrange_first_element) = new_group_size;
++  DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true;
++  DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
++  return rearrange_first_element;
++}
++
++/* Save the STMT_INFO in the grouped stores to BB_VINFO_SCALAR_STORES for
++   transposing back grouped stores.  */
++
++static void
++get_scalar_stores (bb_vec_info bb_vinfo)
++{
++  unsigned int k = 0;
++  stmt_vec_info first_element = NULL;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
++    {
++      /* Filter the grouped store which is unnecessary for transposing.  */
++      if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++	  || first_element->group_number == -1)
++	{
++	  continue;
++	}
++      vec<stmt_vec_info> tmp_scalar_store;
++      tmp_scalar_store.create (DR_GROUP_SIZE (first_element));
++      for (stmt_vec_info stmt_info = first_element; stmt_info;
++	   stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
++	{
++	  tmp_scalar_store.safe_push (stmt_info);
++	}
++      BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store);
++    }
++}
++
++/* Transpose and merge the grouped stores which have the same group number.
++   For example, for grouped stores (opa_0, opa_1, opa_2, opa_3):
++     opa_0: a00->a01->a02->a03
++     opa_1: a10->a11->a12->a13
++     opa_2: a20->a21->a22->a23
++     opa_3: a30->a31->a32->a33
++   we can probably get the merged grouped store:
++     opa: a00->a10->a20->a30
++	  ->a01->a11->a21->a31
++	  ->a02->a12->a22->a32
++	  ->a03->a13->a23->a33.  */
++
++static bool
++vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo)
++{
++  if (bb_vinfo->grouped_stores.length () <= 0)
++    {
++      if (dump_enabled_p ())
++	{
++	  dump_printf_loc (MSG_NOTE, vect_location,
++			   "The number of grouped stores is 0.\n");
++	}
++      return false;
++    }
++
++  bb_vinfo->grouped_stores.qsort (dr_group_cmp);
++  auto_vec<stmt_vec_info> grouped_stores_merge;
++  auto_vec<bool> visited (bb_vinfo->grouped_stores.length ());
++  unsigned int i = 0;
++  for (i = 0; i < bb_vinfo->grouped_stores.length (); i++)
++    {
++      visited.safe_push (false);
++    }
++
++  /* Get scalar stores for the following transposition recovery.  */
++  get_scalar_stores (bb_vinfo);
++
++  while (1)
++    {
++      /* Find grouped stores which are required to transpose and merge.  */
++      auto_vec<stmt_vec_info> res;
++      vect_slp_grouped_store_find (bb_vinfo, visited, res);
++      if (res.is_empty ())
++	{
++	  break;
++	}
++      /* Transpose and merge the required grouped stores into one group.  */
++      grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res));
++    }
++
++  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
++  for (i = 0; i < grouped_stores_merge.length (); i++)
++    {
++      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]);
++    }
++
++  if (dump_enabled_p ())
++    {
++      dump_printf_loc (MSG_NOTE, vect_location,
++		       "Transposing grouped stores successfully.\n");
++    }
++  return true;
++}
++
++/* A helper function for vect_transform_back_slp_grouped_stores ().  */
++
++static auto_vec<stmt_vec_info>
++vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo,
++				       stmt_vec_info first_stmt_info)
++{
++  auto_vec<stmt_vec_info> grouped_stores_split;
++  for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++)
++    {
++      vec<stmt_vec_info> scalar_tmp = bb_vinfo->scalar_stores[i];
++      if (scalar_tmp.length () > 1
++	  && scalar_tmp[0]->group_number != first_stmt_info->group_number)
++	{
++	  continue;
++	}
++      stmt_vec_info cur_stmt_info = NULL;
++      stmt_vec_info cur_first_stmt_info = NULL;
++      stmt_vec_info last_stmt_info = NULL;
++      unsigned int k = 0;
++      FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info)
++	{
++	  if (k == 0)
++	    {
++	      cur_first_stmt_info = cur_stmt_info;
++	      last_stmt_info = cur_stmt_info;
++	    }
++	  DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info;
++	  DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info;
++	  last_stmt_info = cur_stmt_info;
++	}
++      DR_GROUP_SIZE (cur_first_stmt_info) = k;
++      DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL;
++      if (first_stmt_info != cur_first_stmt_info)
++	{
++	  DR_GROUP_GAP (cur_first_stmt_info)
++	    = DR_GROUP_GAP_TRANS (cur_first_stmt_info);
++	  DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false;
++	  DR_GROUP_NUMBER (cur_first_stmt_info) = -1;
++	}
++      grouped_stores_split.safe_push (cur_first_stmt_info);
++    }
++  return grouped_stores_split;
++}
++
++/* Transform the grouped store back.  */
++
++void
++vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo,
++					stmt_vec_info first_stmt_info)
++{
++  if (first_stmt_info->group_number == -1)
++    {
++      return;
++    }
++  /* Transform back.  */
++  auto_vec<stmt_vec_info> grouped_stores_split
++    = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info);
++
++  /* Add the remaining grouped stores to grouped_stores_split.  */
++  stmt_vec_info first_element = NULL;
++  unsigned int i = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
++    {
++      if (first_element->group_number != first_stmt_info->group_number)
++	{
++	  grouped_stores_split.safe_push (first_element);
++	}
++    }
++  DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false;
++  DR_GROUP_NUMBER (first_stmt_info) = -1;
++  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
++  for (i = 0; i < grouped_stores_split.length (); i++)
++    {
++      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]);
++    }
++}
++
++/* Function check_for_slp_vectype
++
++   Restriction for grouped stores by checking their vectype.
++   If the vectype of a grouped store is changed, it needs to be transformed
++   back.  If all grouped stores need to be transformed back, return FALSE.  */
++
++static bool
++check_for_slp_vectype (bb_vec_info bb_vinfo)
++{
++  stmt_vec_info first_element = NULL;
++  unsigned int i = 0;
++  int count = 0;
++  auto_vec<stmt_vec_info> grouped_stores_check;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
++    {
++      grouped_stores_check.safe_push (first_element);
++    }
++  FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element)
++    {
++      if (STMT_VINFO_GROUPED_ACCESS (first_element)
++	  && first_element->group_number != -1)
++	{
++	  unsigned int group_size_b
++	    = DR_GROUP_SIZE_TRANS (first_element);
++	  tree vectype = STMT_VINFO_VECTYPE (first_element);
++	  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
++	  if (nunits.to_constant () > group_size_b)
++	    {
++	      count++;
++	      /* If the vectype is changed, this grouped store needs
++		 to be transformed back.  */
++	      vect_transform_back_slp_grouped_stores (bb_vinfo, first_element);
++	      if (dump_enabled_p ())
++		{
++		  dump_printf_loc (MSG_NOTE, vect_location,
++				   "Not supported: only supported when"
++				   " group_size >= nunits.\n");
++		}
++	    }
++	}
++    }
++  if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo))
++    {
++      return false;
++    }
++  return true;
++}
++
++/* Function check_for_dr_alignment
++
++   Check the alignment of the slp instance loads.
++   Return FALSE if a load cannot be vectorized.  */
++
++static bool
++check_for_dr_alignment (slp_instance instance)
++{
++  slp_tree node = NULL;
++  unsigned int i = 0;
++  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
++    {
++      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
++      dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
++      enum dr_alignment_support supportable_dr_alignment
++	= vect_supportable_dr_alignment (first_dr_info, false);
++      if (supportable_dr_alignment == dr_explicit_realign_optimized
++	  || supportable_dr_alignment == dr_explicit_realign)
++	{
++	  return false;
++	}
++    }
++  return true;
++}
++
++/* Initialize slp_transpose flag before transposing.  */
++
++static void
++init_stmt_info_slp_transpose (bb_vec_info bb_vinfo)
++{
++  stmt_vec_info first_element = NULL;
++  unsigned int k = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
++    {
++      if (STMT_VINFO_GROUPED_ACCESS (first_element))
++	{
++	  DR_GROUP_SLP_TRANSPOSE (first_element) = false;
++	}
++    }
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element)
++    {
++      if (STMT_VINFO_GROUPED_ACCESS (first_element))
++	{
++	  DR_GROUP_SLP_TRANSPOSE (first_element) = false;
++	}
++    }
++}
++
++/* Analyze and transpose the stmts before building the SLP tree.
*/ ++ ++static bool ++vect_analyze_transpose (bb_vec_info bb_vinfo) ++{ ++ DUMP_VECT_SCOPE ("vect_analyze_transpose"); ++ ++ if (!vect_may_transpose (bb_vinfo)) ++ { ++ return false; ++ } ++ ++ /* For basic block SLP, try to merge the grouped stores and loads ++ into one group. */ ++ init_stmt_info_slp_transpose (bb_vinfo); ++ if (vect_transform_slp_grouped_stores (bb_vinfo) ++ && vect_merge_slp_grouped_loads (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis succeeded with SLP transposed.\n"); ++ } ++ return true; ++ } ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis failed with SLP transposed.\n"); ++ } ++ return false; ++} + + /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP + trees of packed scalar stmts if SLP is possible. */ +@@ -3124,7 +4018,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) + + vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; + +- if (dump_enabled_p ()) ++ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost; ++ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost; ++ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost; ++ ++ if (!unlimited_cost_model (NULL) && dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); + dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n", +@@ -3239,6 +4137,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + + vect_pattern_recog (bb_vinfo); + ++ /* Transpose grouped stores and loads for better vectorizable version. */ ++ if (bb_vinfo->transposed) ++ { ++ if (!vect_analyze_transpose (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: unhandled slp transposed in " ++ "basic block.\n"); ++ } ++ return false; ++ } ++ } ++ bb_vinfo->before_slp = true; ++ + /* Check the SLP opportunities in the basic block, analyze and build SLP + trees. */ + if (!vect_analyze_slp (bb_vinfo, n_stmts)) +@@ -3254,6 +4168,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + return false; + } + ++ /* Check if the vectype is suitable for SLP transposed. */ ++ if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "Failed to SLP transposed in the basic block.\n"); ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: vectype is not suitable for " ++ "SLP transposed in basic block.\n"); ++ } ++ return false; ++ } ++ + vect_record_base_alignments (bb_vinfo); + + /* Analyze and verify the alignment of data references and the +@@ -3286,6 +4214,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) + return false; + ++ /* Check if the alignment is suitable for SLP transposed. 
*/
++  if (bb_vinfo->transposed)
++    {
++      for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++)
++	{
++	  if (!check_for_dr_alignment (instance))
++	    {
++	      if (dump_enabled_p ())
++		{
++		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++				   "Failed to SLP transposed in the basic "
++				   "block.\n");
++		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++				   "not vectorized: alignment is not suitable "
++				   "for SLP transposed in basic block.\n");
++		}
++	      return false;
++	    }
++	}
++    }
++
+   if (!vect_slp_analyze_operations (bb_vinfo))
+     {
+       if (dump_enabled_p ())
+@@ -3311,6 +4260,83 @@
+   return true;
+ }
+ 
++static bool
++may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori)
++{
++  /* If the flag is false or the slp analysis is broken before
++     vect_analyze_slp, we don't try to analyze the transposed SLP version.  */
++  if (!flag_tree_slp_transpose_vectorize
++      || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori))
++    {
++      return false;
++    }
++
++  /* If the original bb_vinfo can't be vectorized, try to create a bb_vinfo
++     of the transposed version.  */
++  if (!res_ori)
++    {
++      return true;
++    }
++
++  /* Calculate the cost of the original bb_vinfo.  */
++  if (unlimited_cost_model (NULL))
++    {
++      vect_bb_vectorization_profitable_p (bb_vinfo_ori);
++    }
++  /* If the vec cost and scalar cost are not much different (here we set the
++     threshold to 4), we try to create a bb_vinfo of the transposed
++     version.  */
++  if (BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++      < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++	     + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori)))
++    {
++      return true;
++    }
++  return false;
++}
++
++static bool
++may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans,
++			      bb_vec_info bb_vinfo_ori, bool res_ori)
++{
++  /* The original bb_vinfo is chosen if the transposed bb_vinfo
++     can't be vectorized.  */
++  if (!res_trans)
++    {
++      return false;
++    }
++  /* Calculate the cost of the transposed bb_vinfo.  */
++  if (unlimited_cost_model (NULL))
++    {
++      vect_bb_vectorization_profitable_p (bb_vinfo_trans);
++    }
++  int diff_bb_cost = -1;
++  int diff_bb_cost_trans = -1;
++  if (res_ori)
++    {
++      diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++		     - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++		     - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori);
++    }
++  if (res_trans)
++    {
++      diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++			   - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++			   - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans);
++    }
++  /* The original bb_vinfo is chosen when one of the following conditions
++     is satisfied:
++     1) The cost of the original version is better than the transposed one.
++     2) The vec cost is similar to the scalar cost in the transposed
++	version.  */
++  if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans)
++      || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++		       <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++			   + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans))))
++    {
++      return false;
++    }
++  return true;
++}
++
+ /* Subroutine of vect_slp_bb.  Try to vectorize the statements between
+    REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
+    on success.  The region has N_STMTS statements and has the datarefs
+@@ -3323,6 +4349,7 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
+ 		    unsigned int n_stmts)
+ {
+   bb_vec_info bb_vinfo;
++  bb_vec_info bb_vinfo_trans = NULL;
+   auto_vector_modes vector_modes;
+ 
+   /* Autodetect first vector size we try.  */
*/ +@@ -3337,6 +4364,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + { + bool vectorized = false; + bool fatal = false; ++ bool res_bb_vinfo_ori = false; ++ bool res_bb_vinfo_trans = false; ++ ++ /* New a bb_vinfo of the original version. */ + bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared); + + bool first_time_p = shared.datarefs.is_empty (); +@@ -3346,8 +4377,57 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + else + bb_vinfo->shared->check_datarefs (); + bb_vinfo->vector_mode = next_vector_mode; ++ bb_vinfo->transposed = false; ++ bb_vinfo->before_slp = false; ++ ++ res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal); ++ /* Analyze and new a transposed bb_vinfo. */ ++ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori)) ++ { ++ bool fatal_trans = false; ++ bb_vinfo_trans ++ = new _bb_vec_info (region_begin, region_end, &shared); ++ bool first_time_p = shared.datarefs.is_empty (); ++ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs; ++ if (first_time_p) ++ { ++ bb_vinfo_trans->shared->save_datarefs (); ++ } ++ else ++ { ++ bb_vinfo_trans->shared->check_datarefs (); ++ } ++ bb_vinfo_trans->vector_mode = next_vector_mode; ++ bb_vinfo_trans->transposed = true; ++ bb_vinfo_trans->before_slp = false; ++ ++ res_bb_vinfo_trans ++ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans); ++ if (may_choose_transpose_bbvinfo (bb_vinfo_trans, ++ res_bb_vinfo_trans, ++ bb_vinfo, res_bb_vinfo_ori)) ++ { ++ bb_vinfo = bb_vinfo_trans; ++ fatal = fatal_trans; ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "using transposed version.\n"); ++ } ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "using original version.\n"); ++ } ++ } ++ } + +- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal) ++ if ((res_bb_vinfo_ori || res_bb_vinfo_trans) + && dbg_cnt (vect_slp)) + { + if (dump_enabled_p ()) +@@ -3400,6 +4480,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + } + + delete bb_vinfo; ++ if (bb_vinfo_trans) ++ { ++ bb_vinfo_trans = NULL; ++ } + + if (mode_i < vector_modes.length () + && VECTOR_MODE_P (autodetected_vector_mode) +diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 6418edb52..b872cfc8d 100644 +--- a/gcc/tree-vect-stmts.c ++++ b/gcc/tree-vect-stmts.c +@@ -7329,6 +7329,153 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + return true; + } + ++/* Function vect_permute_store_chains ++ ++ Call function vect_permute_store_chain (). ++ Given a chain of interleaved stores in DR_CHAIN, generate ++ interleave_high/low stmts to reorder the data correctly. ++ Return the final references for stores in RESULT_CHAIN. */ ++ ++static void ++vect_permute_store_chains (vec dr_chain, unsigned int num_each, ++ stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ++ vec *result_chain, unsigned int group) ++{ ++ unsigned int k = 0; ++ unsigned int t = 0; ++ ++ /* Divide vectors into GROUP parts. And permute every NUM_EACH vectors ++ together. 
*/
++  for (k = 0; k < group; k++)
++    {
++      auto_vec dr_chain_transposed (num_each);
++      auto_vec result_chain_transposed (num_each);
++      for (t = k; t < dr_chain.length (); t = t + group)
++	{
++	  dr_chain_transposed.quick_push (dr_chain[t]);
++	}
++      vect_permute_store_chain (dr_chain_transposed, num_each, stmt_info,
++				gsi, &result_chain_transposed);
++      for (t = 0; t < num_each; t++)
++	{
++	  result_chain->quick_push (result_chain_transposed[t]);
++	}
++    }
++}
++
++/* Function transpose_oprnd_store
++
++   Calculate the transposed results from VEC_OPRNDS (VEC_STMT)
++   for vectorizable_store.  */
++
++static void
++transpose_oprnd_store (vec vec_oprnds, vec *result_chain,
++		       unsigned int vec_num, unsigned int const_nunits,
++		       unsigned int array_num, stmt_vec_info first_stmt_info,
++		       gimple_stmt_iterator *gsi)
++{
++  unsigned int group_for_transform = 0;
++  unsigned int num_each = 0;
++
++  /* Transpose back for vec_oprnds.  */
++  /* vec = {vec1, vec2, ...}  */
++  if (array_num < const_nunits
++      && const_nunits % array_num == 0)
++    {
++      vect_transpose_store_chain (vec_oprnds,
++				  vec_num, array_num,
++				  first_stmt_info,
++				  gsi, result_chain);
++    }
++  /* vec1 = {vec_part1}, vec2 = {vec_part2}, ...  */
++  else if (array_num >= const_nunits
++	   && array_num % const_nunits == 0)
++    {
++      group_for_transform = array_num / const_nunits;
++      num_each = vec_oprnds.length () / group_for_transform;
++      vect_permute_store_chains (vec_oprnds,
++				 num_each, first_stmt_info,
++				 gsi, result_chain,
++				 group_for_transform);
++    }
++  else
++    {
++      gcc_unreachable ();
++    }
++}
++
++static dr_vec_info *
++get_dr_info (stmt_vec_info stmt_info)
++{
++  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
++  if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
++    {
++      SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
++    }
++  return dr_info;
++}
++
++static unsigned
++dr_align_vect_store (dr_vec_info *cur_first_dr_info,
++		     unsigned HOST_WIDE_INT &align)
++{
++  unsigned misalign = 0;
++  align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
++  if (aligned_access_p (cur_first_dr_info))
++    {
++      return misalign;
++    }
++  else if (DR_MISALIGNMENT (cur_first_dr_info) == -1)
++    {
++      align = dr_alignment (vect_dr_behavior (cur_first_dr_info));
++    }
++  else
++    {
++      misalign = DR_MISALIGNMENT (cur_first_dr_info);
++    }
++  return misalign;
++}
++
++static stmt_vec_info
++add_new_stmt_vect_store (tree vectype, tree dataref_ptr, tree dataref_offset,
++			 tree ref_type, dr_vec_info *cur_first_dr_info,
++			 tree vec_oprnd, gimple_stmt_iterator *gsi,
++			 stmt_vec_info stmt_info)
++{
++  /* Data align.  */
++  unsigned HOST_WIDE_INT align;
++  unsigned misalign = dr_align_vect_store (cur_first_dr_info, align);
++
++  if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
++    {
++      set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
++    }
++
++  /* Get data_ref.  */
++  tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
++  tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset);
++  if (aligned_access_p (cur_first_dr_info))
++    {
++      ;
++    }
++  else if (DR_MISALIGNMENT (cur_first_dr_info) == -1)
++    {
++      TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
++						 align * BITS_PER_UNIT);
++    }
++  else
++    {
++      tree elem_type = TREE_TYPE (vectype);
++      TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
++						 TYPE_ALIGN (elem_type));
++    }
++  /* Add new stmt.
*/
++  vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++  gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd);
++  stmt_vec_info new_stmt_info
++    = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
++  return new_stmt_info;
++}
+ 
+ /* Function vectorizable_store.
+ 
+@@ -8208,6 +8355,16 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+       else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ 	vect_get_gather_scatter_ops (loop, stmt_info, &gs_info,
+ 				     &dataref_ptr, &vec_offset);
++      /* If the stmt_info needs transposed recovery, dataref_ptr
++	 will be calculated later.  */
++      else if (memory_access_type == VMAT_CONTIGUOUS
++	       && is_a (vinfo)
++	       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	       && DR_GROUP_SLP_TRANSPOSE (
++		    DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  dataref_ptr = NULL_TREE;
++	}
+       else
+ 	dataref_ptr
+ 	  = vect_create_data_ref_ptr (first_stmt_info, aggr_type,
+@@ -8299,6 +8456,75 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+     }
+   else
+     {
++      /* group_size: the size of the group after transposing and merging.
++	 group_size_b: the size of the group before transposing and merging,
++	 and only group_size_b >= const_nunits is supported.
++	 array_num: the number of arrays.
++	 const_nunits: TYPE_VECTOR_SUBPARTS (vectype).
++	 ncontinues: group_size_b / const_nunits, i.e. the number of
++	 times an array is stored in memory.  */
++      if (slp && is_a (vinfo)
++	  && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	  && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  if (dump_enabled_p ())
++	    {
++	      dump_printf_loc (MSG_NOTE, vect_location,
++			       "vectorizable_store for slp transpose.\n");
++	    }
++	  /* Transpose back for grouped stores.  */
++	  vect_transform_back_slp_grouped_stores (bb_vinfo,
++						  first_stmt_info);
++
++	  result_chain.create (vec_oprnds.length ());
++	  unsigned int const_nunits = nunits.to_constant ();
++	  unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
++	  unsigned int array_num = group_size / group_size_b;
++	  transpose_oprnd_store (vec_oprnds, &result_chain, vec_num,
++				 const_nunits, array_num,
++				 first_stmt_info, gsi);
++
++	  /* For every store group, not for every vec, because transposing
++	     and merging have changed the data reference access.  */
++	  gcc_assert (group_size_b >= const_nunits);
++	  unsigned int ncontinues = group_size_b / const_nunits;
++
++	  unsigned int k = 0;
++	  for (i = 0; i < array_num; i++)
++	    {
++	      stmt_vec_info first_stmt_b;
++	      BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b);
++	      bool simd_lane_access_p
++		= STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0;
++	      tree ref_type = get_group_alias_ptr_type (first_stmt_b);
++	      dataref_ptr = vect_create_data_ref_ptr (
++			      first_stmt_b, aggr_type,
++			      simd_lane_access_p ? loop : NULL,
++			      offset, &dummy, gsi, &ptr_incr,
++			      simd_lane_access_p, NULL_TREE, bump);
++	      dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b);
++	      for (unsigned int t = 0; t < ncontinues; t++)
++		{
++		  vec_oprnd = result_chain[k];
++		  k++;
++		  if (t > 0)
++		    {
++		      /* Bump the vector pointer.
*/
++		      dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr,
++						     gsi, first_stmt_b,
++						     bump);
++		    }
++		  new_stmt_info = add_new_stmt_vect_store (
++				    vectype, dataref_ptr, dataref_offset,
++				    ref_type, cur_first_dr_info, vec_oprnd,
++				    gsi, first_stmt_b);
++		}
++	    }
++	  oprnds.release ();
++	  result_chain.release ();
++	  vec_oprnds.release ();
++	  return true;
++	}
+       new_stmt_info = NULL;
+       if (grouped_store)
+ 	{
+@@ -8557,6 +8783,447 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop)
+   return true;
+ }
+ 
++static tree
++calculate_new_type (tree vectype, unsigned int const_nunits,
++		    unsigned int group_size_b, unsigned int &nloads,
++		    unsigned int &ncontinues, tree &lvectype)
++{
++  tree ltype = TREE_TYPE (vectype);
++  /* nloads is the number of ARRAYs in a vector.
++     vectemp = {a[], b[], ...}  */
++  if (group_size_b < const_nunits)
++    {
++      tree ptype;
++      tree vtype
++	= vector_vector_composition_type (vectype,
++					  const_nunits / group_size_b,
++					  &ptype);
++      if (vtype != NULL_TREE)
++	{
++	  nloads = const_nunits / group_size_b;
++	  lvectype = vtype;
++	  ltype = ptype;
++	  ncontinues = 1;
++	}
++    }
++  /* ncontinues is the number of vectors from an ARRAY.
++     vectemp1 = {a[0], a[1], ...}
++     ...
++     vectempm = {a[k], a[k+1], ...}  */
++  else
++    {
++      nloads = 1;
++      ltype = vectype;
++      ncontinues = group_size_b / const_nunits;
++    }
++  ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
++  return ltype;
++}
++
++static void
++generate_old_load_permutations (slp_tree slp_node, unsigned int group_size,
++				vec &old_load_permutation)
++{
++  /* Generate the old load permutations from the slp_node.  */
++  unsigned i = 0;
++  unsigned k = 0;
++
++  /* If SLP_NODE has load_permutation, we copy it to old_load_permutation.
++     Otherwise, we generate a permutation sequentially.  */
++  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
++    {
++      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k)
++	{
++	  old_load_permutation.safe_push (k);
++	}
++    }
++  else
++    {
++      for (unsigned i = 0; i < group_size; i++)
++	{
++	  old_load_permutation.safe_push (i);
++	}
++    }
++}
++
++static void
++generate_new_load_permutation_mapping (unsigned slp_node_length,
++				       vec &group_idx,
++				       const vec &load_permutation,
++				       unsigned int group_size_b,
++				       unsigned &new_group_size,
++				       vec &group_from)
++{
++  /* group_num_vec: only stores the group_loads IDs which are calculated from
++     load_permutation.  */
++  auto_vec group_num_vec;
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++  unsigned i = 0;
++  unsigned k = 0;
++  FOR_EACH_VEC_ELT (load_permutation, i, k)
++    {
++      unsigned int t0 = k / group_size_b;
++      if (!group_num_vec.contains (t0))
++	{
++	  group_num_vec.safe_push (t0);
++	}
++      group_from.safe_push (t0);
++    }
++  group_num_vec.qsort (cmp_for_group_num);
++  /* n_groups: the number of group_loads.  */
++  unsigned int n_groups = group_num_vec.length ();
++  new_group_size = n_groups * group_size_b;
++  for (i = 0; i < n_groups; i++)
++    {
++      group_idx.safe_push (group_num_vec[i] * group_size_b);
++    }
++  /* A new mapping from group_ind_vec to group_from.
++     For example:
++     Origin:        group_from = {1,1,3,3,5,5,7,7};
++     After mapping: group_from = {0,0,1,1,2,2,3,3};  */
++  auto_vec group_ind_vec (n_groups);
++  for (k = 0; k < n_groups; k++)
++    {
++      group_ind_vec.safe_push (k);
++    }
++  for (i = 0; i < slp_node_length; i++)
++    {
++      for (k = 0; k < n_groups; k++)
++	{
++	  if (group_from[i] == group_num_vec[k])
++	    {
++	      group_from[i] = group_ind_vec[k];
++	      break;
++	    }
++	}
++    }
++}
++
++static void
++generate_new_load_permutation (vec &new_load_permutation,
++			       const vec &old_load_permutation,
++			       slp_tree slp_node, bool &this_load_permuted,
++			       const vec &group_from,
++			       unsigned int group_size_b)
++{
++  unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++  /* Generate the new load permutation from the new mapping.  */
++  new_load_permutation.create (slp_node_length);
++  unsigned i = 0;
++  unsigned k = 0;
++  FOR_EACH_VEC_ELT (old_load_permutation, i, k)
++    {
++      /* t1 is the new permutation of k in the old permutation.
++	 t1 = base_address + offset:
++	 base_address = group_from[i] * group_size_b;
++	 offset = k % group_size_b.  */
++      unsigned int t1
++	= group_from[i] * group_size_b + k % group_size_b;
++      new_load_permutation.safe_push (t1);
++      if (t1 != k)
++	{
++	  this_load_permuted = true;
++	}
++    }
++}
++
++static bool
++is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits,
++	     unsigned int group_size, stmt_vec_info first_stmt_info)
++{
++  /* Calculate the unrolling factor based on the smallest type.  */
++  poly_uint64 unrolling_factor
++    = exact_div (common_multiple (nunits, group_size), group_size);
++  /* The load requires permutation when unrolling exposes
++     a gap either because the group is larger than the SLP
++     group-size or because there is a gap between the groups.  */
++  if (!slp_perm && !this_load_permuted
++      && (known_eq (unrolling_factor, 1U)
++	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
++	      && DR_GROUP_GAP (first_stmt_info) == 0)))
++    {
++      return false;
++    }
++  else
++    {
++      return true;
++    }
++}
++
++static void
++generate_load_permutation (slp_tree slp_node, unsigned &new_group_size,
++			   unsigned int group_size, unsigned int group_size_b,
++			   bool &this_load_permuted, vec &group_idx,
++			   vec &new_load_permutation)
++{
++  /* Generate the old load permutations from SLP_NODE.  */
++  vec old_load_permutation;
++  old_load_permutation.create (group_size);
++  generate_old_load_permutations (slp_node, group_size, old_load_permutation);
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++  unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++  /* group_from: stores the group_loads ID for every stmt in SLP_NODE.  */
++  vec group_from;
++  group_from.create (slp_node_length);
++  generate_new_load_permutation_mapping (slp_node_length, group_idx,
++					 old_load_permutation,
++					 group_size_b, new_group_size,
++					 group_from);
++
++  /* Generate the new load permutation from the new mapping and calculate
++     the this_load_permuted flag.  If this_load_permuted is true, we need
++     to execute the SLP permutation using the new load permutation.
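++     As a concrete check of the scheme above (illustrative numbers): with
++     group_size_b = 4 and an old permutation {4,5,12,13}, the groups used
++     are {1,3}, which map to {0,1}; the new permutation then becomes
++     {0,1,4,5} and new_group_size = 2 * 4 = 8.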
*/ ++ generate_new_load_permutation (new_load_permutation, old_load_permutation, ++ slp_node, this_load_permuted, group_from, ++ group_size_b); ++ old_load_permutation.release (); ++ group_from.release (); ++} ++ ++static unsigned int ++dr_align_vect_load (dr_vec_info *cur_first_dr_info, ++ unsigned HOST_WIDE_INT &align, ++ enum dr_alignment_support alignment_support_scheme) ++{ ++ unsigned int misalign = 0; ++ ++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ gcc_assert (aligned_access_p (cur_first_dr_info)); ++ } ++ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) ++ { ++ align = dr_alignment (vect_dr_behavior (cur_first_dr_info)); ++ } ++ else ++ { ++ misalign = DR_MISALIGNMENT (cur_first_dr_info); ++ } ++ return misalign; ++} ++ ++static stmt_vec_info ++add_new_stmt_vect_load (tree vectype, tree dataref_ptr, tree dataref_offset, ++ tree ref_type, tree ltype, gassign *(&new_stmt), ++ dr_vec_info *cur_first_dr_info, ++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) ++{ ++ /* Data align. */ ++ enum dr_alignment_support alignment_support_scheme ++ = vect_supportable_dr_alignment (cur_first_dr_info, false); ++ unsigned HOST_WIDE_INT align; ++ unsigned int misalign = dr_align_vect_load (cur_first_dr_info, align, ++ alignment_support_scheme); ++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) ++ { ++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); ++ } ++ ++ /* Get data_ref. */ ++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); ++ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ ; ++ } ++ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) ++ { ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT); ++ } ++ else ++ { ++ tree elem_type = TREE_TYPE (vectype); ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type)); ++ } ++ ++ /* Add new stmt. 
*/
++  vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++  new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
++  stmt_vec_info new_stmt_info
++    = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
++  return new_stmt_info;
++}
++
++static void
++push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info,
++			   vec &dr_chain, slp_tree slp_node)
++{
++  if (slp_perm)
++    {
++      dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt));
++    }
++  else
++    {
++      SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
++    }
++}
++
++static stmt_vec_info
++get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info,
++				      unsigned int group_el,
++				      unsigned int group_size)
++{
++  stmt_vec_info last_stmt_info = first_stmt_info;
++  unsigned int count = 0;
++  gcc_assert (group_el < group_size);
++  while (count < group_el)
++    {
++      last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info);
++      count++;
++    }
++  return last_stmt_info;
++}
++
++static stmt_vec_info
++add_new_stmt_for_nloads_greater_than_one (tree lvectype, tree vectype,
++					  vec *v,
++					  stmt_vec_info stmt_info,
++					  gimple_stmt_iterator *gsi)
++{
++  tree vec_inv = build_constructor (lvectype, v);
++  tree new_temp = vect_init_vector (stmt_info, vec_inv, lvectype, gsi);
++  vec_info *vinfo = stmt_info->vinfo;
++  stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp);
++  if (lvectype != vectype)
++    {
++      gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype),
++					       VIEW_CONVERT_EXPR,
++					       build1 (VIEW_CONVERT_EXPR,
++						       vectype, new_temp));
++      new_stmt_info = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
++    }
++  return new_stmt_info;
++}
++
++/* Function new_vect_stmt_for_nloads.
++
++   Create a new VEC_STMT when NLOADS arrays are merged into a vector.
++
++   ncopies is the number of vectors that need to be loaded from memory.
++   nloads is the number of ARRAYs in a vector.
++   vectemp = {a[], b[], ...}  */
++
++static void
++new_vect_stmt_for_nloads (unsigned int ncopies, unsigned int nloads,
++			  vec group_idx, stmt_vec_info stmt_info,
++			  offset_info *offset_info, vectype_info *vectype_info,
++			  vect_memory_access_type memory_access_type,
++			  bool slp_perm, vec& dr_chain, slp_tree slp_node,
++			  gimple_stmt_iterator *gsi)
++{
++  vec *v = NULL;
++  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++  stmt_vec_info first_stmt_info_b = NULL;
++  stmt_vec_info new_stmt_info = NULL;
++  tree dataref_ptr = NULL_TREE;
++  tree dummy;
++  gimple *ptr_incr = NULL;
++  unsigned int n = 0;
++  for (unsigned int i = 0; i < ncopies; i++)
++    {
++      vec_alloc (v, nloads);
++      for (unsigned int t = 0; t < nloads; t++)
++	{
++	  first_stmt_info_b = get_first_stmt_info_before_transpose (
++				first_stmt_info, group_idx[n++], group_size);
++	  dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
++	  tree bump = vect_get_data_ptr_increment (cur_first_dr_info,
++						   vectype_info->ltype,
++						   memory_access_type);
++	  bool simd_lane_access_p
++	    = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++
++	  /* Create dataref_ptr, which points to init_address.
*/
++	  dataref_ptr = vect_create_data_ref_ptr (
++			  first_stmt_info_b, vectype_info->ltype, NULL,
++			  offset_info->offset, &dummy, gsi, &ptr_incr,
++			  simd_lane_access_p, offset_info->byte_offset, bump);
++
++	  gassign *new_stmt = NULL;
++	  new_stmt_info = add_new_stmt_vect_load (
++			    vectype_info->vectype, dataref_ptr,
++			    offset_info->dataref_offset, vectype_info->ref_type,
++			    vectype_info->ltype, new_stmt, cur_first_dr_info,
++			    gsi, first_stmt_info_b);
++
++	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt));
++	}
++      new_stmt_info = add_new_stmt_for_nloads_greater_than_one (
++			vectype_info->lvectype, vectype_info->vectype,
++			v, first_stmt_info_b, gsi);
++      push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
++				 dr_chain, slp_node);
++    }
++}
++
++/* Function new_vect_stmt_for_ncontinues.
++
++   Create new VEC_STMTs when an ARRAY is divided into several vectors.
++
++   n_groups is the number of ARRAYs.
++   ncontinues is the number of vectors from an ARRAY.
++   vectemp1 = {a[0], a[1], ...}
++   ...
++   vectempm = {a[k], a[k+1], ...}  */
++
++static void
++new_vect_stmt_for_ncontinues (unsigned int ncontinues, vec group_idx,
++			      stmt_vec_info stmt_info, offset_info* offset_info,
++			      vectype_info* vectype_info,
++			      vect_memory_access_type memory_access_type,
++			      bool slp_perm, vec& dr_chain,
++			      slp_tree slp_node,
++			      gimple_stmt_iterator *gsi)
++{
++  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++  stmt_vec_info new_stmt_info = NULL;
++  tree dataref_ptr = NULL_TREE;
++  tree dummy;
++  gimple *ptr_incr = NULL;
++  unsigned int n_groups = group_idx.length ();
++  for (unsigned int i = 0; i < n_groups; i++)
++    {
++      stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose (
++					  first_stmt_info, group_idx[i],
++					  group_size);
++      dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
++      tree bump = vect_get_data_ptr_increment (cur_first_dr_info,
++					       vectype_info->ltype,
++					       memory_access_type);
++      bool simd_lane_access_p
++	= STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++      for (unsigned int k = 0; k < ncontinues; k++)
++	{
++	  /* Create dataref_ptr, which points to init_address.  */
++	  if (k == 0)
++	    {
++	      dataref_ptr = vect_create_data_ref_ptr (
++			      first_stmt_info_b, vectype_info->ltype, NULL,
++			      offset_info->offset, &dummy, gsi, &ptr_incr,
++			      simd_lane_access_p, offset_info->byte_offset,
++			      bump);
++	    }
++	  else
++	    {
++	      dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr,
++					     gsi, first_stmt_info_b, bump);
++	    }
++	  gassign *new_stmt = NULL;
++	  new_stmt_info = add_new_stmt_vect_load (
++			    vectype_info->vectype, dataref_ptr,
++			    offset_info->dataref_offset, vectype_info->ref_type,
++			    vectype_info->ltype, new_stmt, cur_first_dr_info,
++			    gsi, first_stmt_info_b);
++	  push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
++				     dr_chain, slp_node);
++	}
++    }
++}
++
+ /* vectorizable_load.
+ 
+    Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
+@@ -9364,6 +10031,9 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+   tree vec_mask = NULL_TREE;
+   prev_stmt_info = NULL;
+   poly_uint64 group_elt = 0;
++  unsigned new_group_size = 0;
++  vec new_load_permutation;
++
+   for (j = 0; j < ncopies; j++)
+     {
+       stmt_vec_info new_stmt_info = NULL;
+@@ -9385,6 +10055,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+ 	  dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
+ 	  dataref_offset = build_int_cst (ref_type, 0);
+ 	}
++      /* If the stmt_info needs transposed recovery, dataref_ptr
++	 will be calculated later.  */
++      else if (slp && is_a (vinfo)
++	       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	       && DR_GROUP_SLP_TRANSPOSE (
++		    DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  dataref_ptr = NULL_TREE;
++	}
+       else if (diff_first_stmt_info)
+ 	{
+ 	  dataref_ptr
+@@ -9501,6 +10180,63 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+ 	      /* Record that VEC_ARRAY is now dead.  */
+ 	      vect_clobber_variable (stmt_info, gsi, vec_array);
+ 	    }
++	  else if (slp && is_a (vinfo)
++		   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++		   && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	    {
++	      if (dump_enabled_p ())
++		{
++		  dump_printf_loc (MSG_NOTE, vect_location,
++				   "vectorizable_load for slp transpose.\n");
++		}
++	      /* group_size: the size of the group after merging.
++		 group_size_b: the size of the group before merging.
++		 const_nunits: TYPE_VECTOR_SUBPARTS (vectype), it is the number
++		 of elements in a vector.
++		 nloads: const_nunits / group_size_b or 1, it means the number
++		 of ARRAYs in a vector.
++		 ncontinues: group_size_b / const_nunits or 1, it means the
++		 number of vectors from an ARRAY.  */
++	      unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
++	      unsigned int const_nunits = nunits.to_constant ();
++	      unsigned int nloads = const_nunits;
++	      unsigned int ncontinues = group_size_b;
++	      tree lvectype = vectype;
++	      tree ltype = calculate_new_type (vectype, const_nunits,
++					       group_size_b, nloads,
++					       ncontinues, lvectype);
++	      bool this_load_permuted = false;
++	      auto_vec group_idx;
++	      generate_load_permutation (slp_node, new_group_size, group_size,
++					 group_size_b, this_load_permuted,
++					 group_idx, new_load_permutation);
++	      slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits,
++				      group_size, first_stmt_info);
++
++	      /* ncopies: the number of vectors that need to be loaded from
++		 memory.  */
++	      unsigned int ncopies = new_group_size / const_nunits;
++	      offset_info offset_info = {offset, byte_offset, dataref_offset};
++	      vectype_info vectype_info = {vectype, ltype, lvectype, ref_type};
++	      if (slp_perm)
++		{
++		  dr_chain.create (ncopies);
++		}
++	      if (nloads > 1 && ncontinues == 1)
++		{
++		  new_vect_stmt_for_nloads (ncopies, nloads, group_idx, stmt_info,
++					    &offset_info, &vectype_info,
++					    memory_access_type, slp_perm, dr_chain,
++					    slp_node, gsi);
++		}
++	      else
++		{
++		  new_vect_stmt_for_ncontinues (ncontinues, group_idx, stmt_info,
++						&offset_info, &vectype_info,
++						memory_access_type, slp_perm,
++						dr_chain, slp_node, gsi);
++		}
++	    }
+ 	  else
+ 	    {
+ 	      for (i = 0; i < vec_num; i++)
+@@ -9840,7 +10576,32 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+       if (slp && !slp_perm)
+ 	continue;
+ 
+-      if (slp_perm)
++      /* Using the new load permutation to generate vector permute statements
++	 from a list of loads in DR_CHAIN.
*/ ++ if (slp && slp_perm && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ unsigned n_perms; ++ stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0]; ++ unsigned int old_size = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info_) = new_group_size; ++ vec old_load_permutation ++ = SLP_TREE_LOAD_PERMUTATION (slp_node); ++ SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation; ++ bool perm_load_success = vect_transform_slp_perm_load ( ++ slp_node, dr_chain, gsi, vf, ++ slp_node_instance, false, &n_perms); ++ DR_GROUP_SIZE (stmt_info_) = old_size; ++ SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation; ++ new_load_permutation.release (); ++ if (!perm_load_success) ++ { ++ dr_chain.release (); ++ return false; ++ } ++ } ++ else if (slp_perm) + { + unsigned n_perms; + if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, +diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +index f7becb34a..1c4a6c421 100644 +--- a/gcc/tree-vectorizer.h ++++ b/gcc/tree-vectorizer.h +@@ -297,6 +297,21 @@ public: + vec ddrs; + }; + ++/* Information about offset in vectorizable_load. */ ++struct offset_info { ++ tree offset; ++ tree byte_offset; ++ tree dataref_offset; ++}; ++ ++/* Information about vectype in vectorizable_load. */ ++struct vectype_info { ++ tree vectype; ++ tree ltype; ++ tree lvectype; ++ tree ref_type; ++}; ++ + /* Vectorizer state common between loop and basic-block vectorization. */ + class vec_info { + public: +@@ -335,6 +350,14 @@ public: + stmt in the chain. */ + auto_vec grouped_stores; + ++ /* All interleaving chains of loads, represented by the first ++ stmt in the chain. */ ++ auto_vec grouped_loads; ++ ++ /* All interleaving chains of stores (before transposed), represented by all ++ stmt in the chain. */ ++ auto_vec > scalar_stores; ++ + /* Cost data used by the target cost model. */ + void *target_cost_data; + +@@ -702,6 +725,8 @@ public: + #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero + #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds + #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores ++#define LOOP_VINFO_GROUPED_LOADS(L) (L)->grouped_loads ++#define LOOP_VINFO_SCALAR_STORES(L) (L)->scalar_stores + #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances + #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor + #define LOOP_VINFO_REDUCTIONS(L) (L)->reductions +@@ -764,6 +789,25 @@ public: + basic_block bb; + gimple_stmt_iterator region_begin; + gimple_stmt_iterator region_end; ++ ++ /* True, if bb_vinfo can goto vect_analyze_slp. */ ++ bool before_slp; ++ ++ /* True, if bb_vinfo is a transposed version. */ ++ bool transposed; ++ ++ /* The number of transposed groups. */ ++ int transposed_group; ++ ++ /* The cost of the scalar iterations. */ ++ int scalar_cost; ++ ++ /* The cost of the vector prologue and epilogue, including peeled ++ iterations and set-up code. */ ++ int vec_outside_cost; ++ ++ /* The cost of the vector loop body. 
*/ ++ int vec_inside_cost; + } *bb_vec_info; + + #define BB_VINFO_BB(B) (B)->bb +@@ -772,6 +816,14 @@ public: + #define BB_VINFO_DATAREFS(B) (B)->shared->datarefs + #define BB_VINFO_DDRS(B) (B)->shared->ddrs + #define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data ++#define BB_VINFO_GROUPED_LOADS(B) (B)->grouped_loads ++#define BB_VINFO_SCALAR_STORES(B) (B)->scalar_stores ++#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost ++#define BB_VINFO_VEC_INSIDE_COST(B) (B)->vec_inside_cost ++#define BB_VINFO_SCALAR_COST(B) (B)->scalar_cost ++#define BB_VINFO_SLP_TRANSPOSED(B) (B)->transposed ++#define BB_VINFO_BEFORE_SLP(B) (B)->before_slp ++#define BB_VINFO_TRANS_GROUPS(B) (B)->transposed_group + + static inline bb_vec_info + vec_info_for_bb (basic_block bb) +@@ -1012,6 +1064,17 @@ public: + stmt_vec_info next_element; + /* The size of the group. */ + unsigned int size; ++ ++ /* The size of the group before transposed. */ ++ unsigned int size_before_transpose; ++ ++ /* If true, the stmt_info is slp transposed. */ ++ bool slp_transpose; ++ ++ /* Mark the group store number for rebuild interleaving chain ++ during transpose phase. Value -1 represents unable to transpose. */ ++ int group_number; ++ + /* For stores, number of stores from this group seen. We vectorize the last + one. */ + unsigned int store_count; +@@ -1019,6 +1082,9 @@ public: + is 1. */ + unsigned int gap; + ++ /* The gap before transposed. */ ++ unsigned int gap_before_transpose; ++ + /* The minimum negative dependence distance this stmt participates in + or zero if none. */ + unsigned int min_neg_dist; +@@ -1217,6 +1283,12 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) + #define STMT_VINFO_REDUC_VECTYPE_IN(S) (S)->reduc_vectype_in + #define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p + ++#define DR_GROUP_SLP_TRANSPOSE(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose) ++#define DR_GROUP_SIZE_TRANS(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose) ++#define DR_GROUP_NUMBER(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number) + #define DR_GROUP_FIRST_ELEMENT(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element) + #define DR_GROUP_NEXT_ELEMENT(S) \ +@@ -1227,6 +1299,8 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) + (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count) + #define DR_GROUP_GAP(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap) ++#define DR_GROUP_GAP_TRANS(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose) + + #define REDUC_GROUP_FIRST_ELEMENT(S) \ + (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element) +@@ -1624,6 +1698,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info) + return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr)))); + } + ++/* Compare two unsigned int A and B. ++ Sorting them in ascending order. */ ++ ++static inline int ++cmp_for_group_num (const void *a_, const void *b_) ++{ ++ unsigned int a = *(unsigned int *)const_cast(a_); ++ unsigned int b = *(unsigned int *)const_cast(b_); ++ return a < b ? -1 : 1; ++} ++ + /* Return true if LOOP_VINFO requires a runtime check for whether the + vector loop is profitable. 
*/
+ 
+@@ -1787,6 +1872,9 @@ extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
+ extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+ extern void vect_permute_store_chain (vec ,unsigned int, stmt_vec_info,
+ 				      gimple_stmt_iterator *, vec *);
++extern void vect_transpose_store_chain (vec, unsigned int, unsigned int,
++					stmt_vec_info, gimple_stmt_iterator *,
++					vec *);
+ extern tree vect_setup_realignment (stmt_vec_info, gimple_stmt_iterator *,
+ 				    tree *, enum dr_alignment_support, tree,
+ 				    class loop **);
+@@ -1849,6 +1937,7 @@ extern void vect_free_slp_instance (slp_instance, bool);
+ extern bool vect_transform_slp_perm_load (slp_tree, vec ,
+ 					  gimple_stmt_iterator *, poly_uint64,
+ 					  slp_instance, bool, unsigned *);
++extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info);
+ extern bool vect_slp_analyze_operations (vec_info *);
+ extern void vect_schedule_slp (vec_info *);
+ extern opt_result vect_analyze_slp (vec_info *, unsigned);
+-- 
+2.27.0.windows.1
+ 
diff --git a/0046-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch b/0046-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch
new file mode 100644
index 0000000000000000000000000000000000000000..4a3da7836de06a97f866758891d1718eb81f8fe0
--- /dev/null
+++ b/0046-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch
@@ -0,0 +1,1982 @@
+From 8072fe107c04778de78db90bf6fdb7baf474e24a Mon Sep 17 00:00:00 2001
+From: dingguangya
+Date: Thu, 2 Jun 2022 12:48:17 +0800
+Subject: [PATCH 12/12] [ArrayWidenCompare] Add a new optimization for array
+ comparison scenarios
+
+Add option -farray-widen-compare.
+For an array pointer whose elements are a single-byte type, widening
+the pointer type to a multi-byte type lets several elements be loaded
+and compared at once.
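+
+A rough model of the transformation in plain C (illustrative only; the
+helper name is made up, and locating the first differing byte with
+__builtin_ctzll assumes little-endian byte order, which holds on the
+targeted aarch64 linux configurations):
+
+  #include <stdint.h>
+  #include <string.h>
+
+  static uint32_t
+  widened_compare (const uint8_t *pb, const uint8_t *cur,
+                   uint32_t len, uint32_t len_limit)
+  {
+    /* Main loop: compare 8 bytes per iteration.  */
+    for (; len + sizeof (uint64_t) <= len_limit; len += sizeof (uint64_t))
+      {
+        uint64_t a, b;
+        memcpy (&a, cur + len, sizeof a);  /* Unaligned-safe loads.  */
+        memcpy (&b, pb + len, sizeof b);
+        if (a != b)
+          /* The lowest set bit of a ^ b lies in the first differing
+             byte on a little-endian target; /8 turns bits into bytes.  */
+          return len + __builtin_ctzll (a ^ b) / 8;
+      }
+    /* Epilogue: finish the tail byte by byte.  */
+    while (len != len_limit && pb[len] == cur[len])
+      ++len;
+    return len;
+  }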
+---
+ gcc/Makefile.in | 1 +
+ gcc/common.opt | 5 +
+ gcc/doc/invoke.texi | 13 +-
+ gcc/passes.def | 1 +
+ .../gcc.dg/tree-ssa/awiden-compare-1.c | 19 +
+ .../gcc.dg/tree-ssa/awiden-compare-2.c | 90 +
+ .../gcc.dg/tree-ssa/awiden-compare-3.c | 22 +
+ .../gcc.dg/tree-ssa/awiden-compare-4.c | 22 +
+ .../gcc.dg/tree-ssa/awiden-compare-5.c | 19 +
+ .../gcc.dg/tree-ssa/awiden-compare-6.c | 19 +
+ .../gcc.dg/tree-ssa/awiden-compare-7.c | 22 +
+ .../gcc.dg/tree-ssa/awiden-compare-8.c | 24 +
+ gcc/timevar.def | 1 +
+ gcc/tree-pass.h | 1 +
+ gcc/tree-ssa-loop-array-widen-compare.c | 1555 +++++++++++++++++
+ 15 files changed, 1813 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c
+ create mode 100644 gcc/tree-ssa-loop-array-widen-compare.c
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 23394c64b..2b2bf474a 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1591,6 +1591,7 @@ OBJS = \
+ 	tree-ssa-loop-ivopts.o \
+ 	tree-ssa-loop-manip.o \
+ 	tree-ssa-loop-niter.o \
++	tree-ssa-loop-array-widen-compare.o \
+ 	tree-ssa-loop-prefetch.o \
+ 	tree-ssa-loop-split.o \
+ 	tree-ssa-loop-unswitch.o \
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 24834cf60..2985a5791 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1060,6 +1060,11 @@ fasynchronous-unwind-tables
+ Common Report Var(flag_asynchronous_unwind_tables) Optimization
+ Generate unwind tables that are exact at each instruction boundary.
+ 
++farray-widen-compare
++Common Report Var(flag_array_widen_compare) Optimization
++Extends types for pointers to arrays to improve array comparison performance.
++In some extreme situations this may result in unsafe behavior.
++
+ fauto-inc-dec
+ Common Report Var(flag_auto_inc_dec) Init(1) Optimization
+ Generate auto-inc/dec instructions.
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 4b0fd2ffb..44f1f8a2e 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -459,7 +459,7 @@ Objective-C and Objective-C++ Dialects}.
+ -falign-loops[=@var{n}[:@var{m}:[@var{n2}[:@var{m2}]]]] @gol
+ -fno-allocation-dce -fallow-store-data-races @gol
+ -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol
+--fauto-inc-dec -fbranch-probabilities @gol
++-farray-widen-compare -fauto-inc-dec -fbranch-probabilities @gol
+ -fcaller-saves @gol
+ -fcombine-stack-adjustments -fconserve-stack @gol
+ -fcompare-elim -fcprop-registers -fcrossjumping @gol
+@@ -9710,6 +9710,17 @@ This pass is always skipped on architectures that do not have
+ instructions to support this.  Enabled by default at @option{-O} and
+ higher on architectures that support this.
+ 
++@item -farray-widen-compare
++@opindex farray-widen-compare
++In the narrow-byte array comparison scenario, the types of pointers
++pointing to the array are widened so that multiple elements can be
++loaded at a time when the array is dereferenced through the wide type,
++thereby improving the performance of this comparison scenario.  In some
++extreme situations this may result in unsafe behavior.
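++
++For example, a byte-wise comparison loop of the following shape (as in
++the tests added by this patch) is a candidate for the transformation:
++
++@smallexample
++while (++len != len_limit)
++  if (pb[len] != cur[len])
++    break;
++@end smallexample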
++ ++This option may generate better or worse code; results are highly dependent ++on the structure of loops within the source code. ++ + @item -fdce + @opindex fdce + Perform dead code elimination (DCE) on RTL@. +diff --git a/gcc/passes.def b/gcc/passes.def +index e9c91d26e..797b803ca 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -91,6 +91,7 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_dse); + NEXT_PASS (pass_cd_dce); + NEXT_PASS (pass_phiopt, true /* early_p */); ++ NEXT_PASS (pass_array_widen_compare); + NEXT_PASS (pass_tail_recursion); + NEXT_PASS (pass_convert_switch); + NEXT_PASS (pass_cleanup_eh); +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c +new file mode 100644 +index 000000000..27b69b0e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (++len != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 1 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c +new file mode 100644 +index 000000000..d102364f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c +@@ -0,0 +1,90 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define EMPTY_HASH_VALUE 0 ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++#define true 1 ++ ++typedef struct { ++ uint32_t len; ++ uint32_t dist; ++} lzma_match; ++ ++ ++lzma_match * ++func ( ++ const uint32_t len_limit, ++ const uint32_t pos, ++ const uint8_t *const cur, ++ uint32_t cur_match, ++ uint32_t depth, ++ uint32_t *const son, ++ const uint32_t cyclic_pos, ++ const uint32_t cyclic_size, ++ lzma_match *matches, ++ uint32_t len_best) ++{ ++ uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; ++ uint32_t *ptr1 = son + (cyclic_pos << 1); ++ ++ uint32_t len0 = 0; ++ uint32_t len1 = 0; ++ ++ while (true) ++ { ++ const uint32_t delta = pos - cur_match; ++ if (depth-- == 0 || delta >= cyclic_size) ++ { ++ *ptr0 = EMPTY_HASH_VALUE; ++ *ptr1 = EMPTY_HASH_VALUE; ++ return matches; ++ } ++ ++ uint32_t *const pair = son + ((cyclic_pos - delta + (delta > cyclic_pos ? 
cyclic_size : 0)) << 1); ++ ++ const uint8_t *const pb = cur -delta; ++ uint32_t len = my_min(len0, len1); ++ ++ if (pb[len] == cur[len]) ++ { ++ while (++len != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ ++ if (len_best < len) ++ { ++ len_best = len; ++ matches->len = len; ++ matches->dist = delta - 1; ++ ++matches; ++ ++ if (len == len_limit) ++ { ++ *ptr1 = pair[0]; ++ *ptr0 = pair[1]; ++ return matches; ++ } ++ } ++ } ++ ++ if (pb[len] < cur[len]) ++ { ++ *ptr1 = cur_match; ++ ptr1 = pair + 1; ++ cur_match = *ptr1; ++ len1 = len; ++ } ++ else ++ { ++ *ptr0 = cur_match; ++ ptr0 = pair; ++ cur_match = *ptr0; ++ len0 = len; ++ } ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 1 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c +new file mode 100644 +index 000000000..52dd6b02b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len != len_limit) ++ { ++ if (pb[len] != cur[len]) ++ break; ++ len = len + 1; ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 1 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c +new file mode 100644 +index 000000000..d3185d326 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len != len_limit) ++ { ++ if (pb[len] != cur[len]) ++ break; ++ len = len + 2; ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c +new file mode 100644 +index 000000000..9743dc623 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? 
(x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (++len != len_limit) ++ if (pb[len] != cur[len-1]) ++ break; ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c +new file mode 100644 +index 000000000..2323d5bf7 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len++ != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c +new file mode 100644 +index 000000000..33db62fa4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len != len_limit) ++ { ++ len = len + 1; ++ if (pb[len] != cur[len]) ++ break; ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c +new file mode 100644 +index 000000000..8c96d24a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? 
(x) : (y))
++
++uint32_t
++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur)
++{
++  uint32_t len = my_min(len0, len1);
++  while (++len != len_limit)
++    {
++      if (pb[len] != cur[len])
++	{
++	  len = len - 1;
++	  break;
++	}
++    }
++  return len;
++}
++
++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index e873747a8..6d90bb6e1 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -215,6 +215,7 @@ DEFTIMEVAR (TV_TREE_NRV , "tree NRV optimization")
+ DEFTIMEVAR (TV_TREE_COPY_RENAME , "tree rename SSA copies")
+ DEFTIMEVAR (TV_TREE_SSA_VERIFY , "tree SSA verifier")
+ DEFTIMEVAR (TV_TREE_STMT_VERIFY , "tree STMT verifier")
++DEFTIMEVAR (TV_TREE_ARRAY_WIDEN_COMPARE, "tree array widen compare")
+ DEFTIMEVAR (TV_TREE_SWITCH_CONVERSION, "tree switch conversion")
+ DEFTIMEVAR (TV_TREE_SWITCH_LOWERING, "tree switch lowering")
+ DEFTIMEVAR (TV_TREE_RECIP , "gimple CSE reciprocals")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index be6387768..aca0b83f2 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -436,6 +436,7 @@ extern gimple_opt_pass *make_pass_cselim (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_phiopt (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_forwprop (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_phiprop (gcc::context *ctxt);
++extern gimple_opt_pass *make_pass_array_widen_compare (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_tree_ifcombine (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_dse (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_nrv (gcc::context *ctxt);
+diff --git a/gcc/tree-ssa-loop-array-widen-compare.c b/gcc/tree-ssa-loop-array-widen-compare.c
+new file mode 100644
+index 000000000..ba51d785d
+--- /dev/null
++++ b/gcc/tree-ssa-loop-array-widen-compare.c
+@@ -0,0 +1,1555 @@
++/* Array widen compare.
++   Copyright (C) 2022 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it
++under the terms of the GNU General Public License as published by the
++Free Software Foundation; either version 3, or (at your option) any
++later version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3.  If not see
++<http://www.gnu.org/licenses/>.
*/ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "tree.h" ++#include "gimple.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfganal.h" ++#include "cfgloop.h" ++#include "gimple-pretty-print.h" ++#include "tree-cfg.h" ++#include "cgraph.h" ++#include "print-tree.h" ++#include "cfghooks.h" ++#include "gimple-fold.h" ++ ++/* This pass handles scenarios similar to the following: ++ ++ uint32_t ++ func (uint32_t len0, uint32_t len1, const uint32_t len_limit, ++ const uint8_t *const pb, const uint8_t *const cur) ++ { ++ uint32_t len = my_min (len0, len1); ++ while (++len != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++ } ++ ++ Features of this type of loop: ++ 1) the loop has two exits; ++ 2) One of the exits comes from the comparison result of the array; ++ ++ From the source code point of view, the pass completes the conversion of the ++ above scenario into: ++ ++ uint32_t ++ func (uint32_t len0, uint32_t len1, const uint32_t len_limit, ++ const uint8_t *const pb, const uint8_t *const cur) ++ { ++ uint32_t len = my_min (len0, len1); ++ // align_loop ++ for(++len; len + sizeof(uint64_t) <= len_limit; len += sizeof (uint64_t)) ++ { ++ uint64_t a = *((uint64_t*)(cur+len)); ++ uint64_t b = *((uint64_t*)(pb+len)); ++ if (a != b) ++ { ++ int lz = __builtin_ctzll (a ^ b); ++ len += lz / 8; ++ return len; ++ } ++ } ++ // epilogue_loop ++ for (;len != len_limit; ++len) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++ } ++ ++ This pass is to complete the conversion of such scenarios from the internal ++ perspective of the compiler: ++ 1) determine_loop_form: The function completes the screening of such ++ scenarios; ++ 2) convert_to_new_loop: The function completes the conversion of ++ origin_loop to new loops, and removes origin_loop; ++ 3) origin_loop_info: The structure is used to record important information ++ of origin_loop: such as loop exit, growth step size ++ of loop induction variable, initial value ++ of induction variable, etc; ++ 4) create_new_loops: The function is used as the key content of the pass ++ to complete the creation of new loops. */ ++ ++/* The useful information of origin loop. */ ++ ++struct origin_loop_info ++{ ++ tree base; /* The initial index of the array in the old loop. */ ++ tree limit; /* The limit index of the array in the old loop. */ ++ tree arr1; /* Array 1 in the old loop. */ ++ tree arr2; /* Array 2 in the old loop. */ ++ edge entry_edge; /* The edge into the old loop. */ ++ basic_block exit_bb1; ++ basic_block exit_bb2; ++ edge exit_e1; ++ edge exit_e2; ++ gimple *cond_stmt1; ++ gimple *cond_stmt2; ++ gimple *update_stmt; ++ bool exist_prolog_assgin; ++ /* Whether the marker has an initial value assigned ++ to the array index. */ ++ unsigned HOST_WIDE_INT step; ++ /* The growth step of the loop induction variable. */ ++}; ++ ++typedef struct origin_loop_info origin_loop_info; ++ ++static origin_loop_info origin_loop; ++hash_map defs_map; ++ ++/* Dump the bb information in a loop. 
*/
++
++static void
++dump_loop_bb (struct loop *loop)
++{
++  basic_block *body = get_loop_body_in_dom_order (loop);
++  basic_block bb = NULL;
++
++  for (unsigned i = 0; i < loop->num_nodes; i++)
++    {
++      bb = body[i];
++      if (bb->loop_father != loop)
++	{
++	  continue;
++	}
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "===== the %dth bb of loop ==========:\n", i);
++	  gimple_dump_bb (dump_file, bb, 0, dump_flags);
++	  fprintf (dump_file, "\n");
++	}
++    }
++  free (body);
++}
++
++/* Return true if the loop has precisely one backedge.  */
++
++static bool
++loop_single_backedge_p (class loop *loop)
++{
++  basic_block latch = loop->latch;
++  if (!single_succ_p (latch))
++    return false;
++
++  edge e = single_succ_edge (latch);
++  edge backedge = find_edge (latch, loop->header);
++
++  if (e != backedge)
++    return false;
++
++  return true;
++}
++
++/* Return true if the loop has precisely one preheader BB.  */
++
++static bool
++loop_single_preheader_bb (class loop *loop)
++{
++  basic_block header = loop->header;
++  if (EDGE_COUNT (header->preds) != 2)
++    return false;
++
++  edge e1 = EDGE_PRED (header, 0);
++  edge e2 = EDGE_PRED (header, 1);
++
++  if ((e1->src == loop->latch && e2->src->loop_father != loop)
++      || (e2->src == loop->latch && e1->src->loop_father != loop))
++    return true;
++
++  return false;
++}
++
++/* Initialize the origin_loop structure.  */
++static void
++init_origin_loop_structure ()
++{
++  origin_loop.base = NULL;
++  origin_loop.limit = NULL;
++  origin_loop.arr1 = NULL;
++  origin_loop.arr2 = NULL;
++  origin_loop.exit_e1 = NULL;
++  origin_loop.exit_e2 = NULL;
++  origin_loop.exit_bb1 = NULL;
++  origin_loop.exit_bb2 = NULL;
++  origin_loop.entry_edge = NULL;
++  origin_loop.cond_stmt1 = NULL;
++  origin_loop.cond_stmt2 = NULL;
++  origin_loop.update_stmt = NULL;
++  origin_loop.exist_prolog_assgin = false;
++  origin_loop.step = 0;
++}
++
++/* Get the edge that first entered the loop.  */
++
++static edge
++get_loop_preheader_edge (class loop *loop)
++{
++  edge e;
++  edge_iterator ei;
++
++  FOR_EACH_EDGE (e, ei, loop->header->preds)
++    if (e->src != loop->latch)
++      break;
++
++  if (!e)
++    {
++      gcc_assert (!loop_outer (loop));
++      return single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
++    }
++
++  return e;
++}
++
++/* Make sure the exit condition stmt satisfies a specific form.  */
++
++static bool
++check_cond_stmt (gimple *stmt)
++{
++  if (!stmt)
++    return false;
++  if (gimple_code (stmt) != GIMPLE_COND)
++    return false;
++
++  if (gimple_cond_code (stmt) != NE_EXPR && gimple_cond_code (stmt) != EQ_EXPR)
++    return false;
++
++  tree lhs = gimple_cond_lhs (stmt);
++  tree rhs = gimple_cond_rhs (stmt);
++
++  /* Cond statements whose operands are not both SSA_NAMEs are not
++     supported, e.g.: if (len_1 != 100).  */
++  if (TREE_CODE (lhs) != SSA_NAME || TREE_CODE (rhs) != SSA_NAME)
++    return false;
++
++  return true;
++}
++
++/* Record the exit information in the original loop including exit edge,
++   exit bb block, exit condition stmt,
++   eg: exit_eX origin_exit_bbX cond_stmtX.
*/ ++ ++static bool ++record_origin_loop_exit_info (class loop *loop) ++{ ++ bool found = false; ++ edge e = NULL; ++ unsigned i = 0; ++ gimple *stmt; ++ ++ if (origin_loop.exit_e1 != NULL || origin_loop.exit_bb1 != NULL ++ || origin_loop.exit_e2 != NULL || origin_loop.exit_bb2 != NULL ++ || origin_loop.cond_stmt1 != NULL || origin_loop.cond_stmt2 != NULL) ++ return false; ++ ++ vec exit_edges = get_loop_exit_edges (loop); ++ if (exit_edges == vNULL) ++ return false; ++ ++ if (exit_edges.length () != 2) ++ goto fail; ++ ++ FOR_EACH_VEC_ELT (exit_edges, i, e) ++ { ++ if (e->src == loop->header) ++ { ++ origin_loop.exit_e1 = e; ++ origin_loop.exit_bb1 = e->dest; ++ stmt = gsi_stmt (gsi_last_bb (e->src)); ++ if (check_cond_stmt (stmt)) ++ origin_loop.cond_stmt1 = stmt; ++ } ++ else ++ { ++ origin_loop.exit_e2 = e; ++ origin_loop.exit_bb2 = e->dest; ++ stmt = gsi_stmt (gsi_last_bb (e->src)); ++ if (check_cond_stmt (stmt)) ++ origin_loop.cond_stmt2 = stmt; ++ } ++ } ++ ++ if (origin_loop.exit_e1 != NULL && origin_loop.exit_bb1 != NULL ++ && origin_loop.exit_e2 != NULL && origin_loop.exit_bb2 != NULL ++ && origin_loop.cond_stmt1 != NULL && origin_loop.cond_stmt2 != NULL) ++ found = true; ++ ++fail: ++ exit_edges.release (); ++ return found; ++} ++ ++/* Returns true if t is SSA_NAME and user variable exists. */ ++ ++static bool ++ssa_name_var_p (tree t) ++{ ++ if (!t || TREE_CODE (t) != SSA_NAME) ++ return false; ++ if (SSA_NAME_VAR (t)) ++ return true; ++ return false; ++} ++ ++/* Returns true if t1 and t2 are SSA_NAME and belong to the same variable. */ ++ ++static bool ++same_ssa_name_var_p (tree t1, tree t2) ++{ ++ if (!ssa_name_var_p (t1) || !ssa_name_var_p (t2)) ++ return false; ++ if (SSA_NAME_VAR (t1) == SSA_NAME_VAR (t2)) ++ return true; ++ return false; ++} ++ ++/* Get origin loop induction variable upper bound. */ ++ ++static bool ++get_iv_upper_bound (gimple *stmt) ++{ ++ if (origin_loop.limit != NULL) ++ return false; ++ ++ tree lhs = gimple_cond_lhs (stmt); ++ tree rhs = gimple_cond_rhs (stmt); ++ ++ if (TREE_CODE (TREE_TYPE (lhs)) != INTEGER_TYPE ++ || TREE_CODE (TREE_TYPE (rhs)) != INTEGER_TYPE) ++ return false; ++ ++ gimple *g = SSA_NAME_DEF_STMT (rhs); ++ ++ /* TODO: Currently, the input restrictions on lhs and rhs are implemented ++ through PARM_DECL. We may consider releasing the restrictions later, and ++ we need to consider the overall adaptation scenario and adding test ++ cases. */ ++ if (ssa_name_var_p (rhs) && TREE_CODE (SSA_NAME_VAR (rhs)) == PARM_DECL ++ && g && gimple_code (g) == GIMPLE_NOP ++ && (ssa_name_var_p (lhs) && TREE_CODE (SSA_NAME_VAR (lhs)) != PARM_DECL)) ++ { ++ origin_loop.limit = rhs; ++ } ++ else ++ return false; ++ ++ if (origin_loop.limit != NULL) ++ return true; ++ ++ return false; ++} ++ ++/* Returns true only when the expression on the rhs code of stmt is PLUS_EXPR, ++ rhs1 is SSA_NAME with the same var as origin_loop base, and rhs2 is ++ INTEGER_CST. */ ++ ++static bool ++check_update_stmt (gimple *stmt) ++{ ++ if (!stmt) ++ return false; ++ ++ if (gimple_assign_rhs_code (stmt) == PLUS_EXPR) ++ { ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ if (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == INTEGER_CST ++ && same_ssa_name_var_p (rhs1, origin_loop.base)) ++ { ++ origin_loop.step = tree_to_uhwi (rhs2); ++ if (origin_loop.step == 1) ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Get origin loop induction variable initial value. 
*/
++
++static bool
++get_iv_base (gimple *stmt)
++{
++  tree lhs = gimple_cond_lhs (stmt);
++  if (origin_loop.base != NULL || origin_loop.update_stmt != NULL)
++    return false;
++
++  basic_block header = gimple_bb (stmt);
++
++  gphi_iterator gsi;
++  edge e;
++  edge_iterator ei;
++  tree iv_after;
++
++  for (gsi = gsi_start_phis (header); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      gphi *phi = gsi.phi ();
++      tree res = gimple_phi_result (phi);
++      if (!same_ssa_name_var_p (res, lhs))
++        continue;
++      tree base = PHI_ARG_DEF_FROM_EDGE (phi, origin_loop.entry_edge);
++      if (!same_ssa_name_var_p (base, lhs))
++        return false;
++      origin_loop.base = base;
++      FOR_EACH_EDGE (e, ei, header->preds)
++        {
++          if (e != origin_loop.entry_edge)
++            {
++              iv_after = PHI_ARG_DEF_FROM_EDGE (phi, e);
++              gimple *update = SSA_NAME_DEF_STMT (iv_after);
++              if (!check_update_stmt (update))
++                return false;
++              origin_loop.update_stmt = update;
++              if (gimple_bb (update) == header && iv_after == lhs)
++                origin_loop.exist_prolog_assgin = true;
++            }
++        }
++    }
++
++  if (origin_loop.base != NULL && origin_loop.update_stmt != NULL)
++    return true;
++
++  return false;
++}
++
++/* Record the upper bound and initial value of the induction variable in the
++   original loop; when prolog_assign is present, make sure the loop header is
++   in simple form.  The interpretation of prolog_assign is as follows,
++   eg: while (++len != limit)
++         ......
++   For such a loop, ++len is processed before entering header_bb, and that
++   assign is regarded as the prolog_assign of the loop. */
++
++static bool
++record_origin_loop_header (class loop *loop)
++{
++  basic_block header = loop->header;
++
++  if (origin_loop.entry_edge != NULL || origin_loop.base != NULL
++      || origin_loop.update_stmt != NULL || origin_loop.limit != NULL)
++    return false;
++  origin_loop.entry_edge = get_loop_preheader_edge (loop);
++
++  gimple_stmt_iterator gsi;
++  gimple *stmt;
++
++  for (gsi = gsi_last_bb (header); !gsi_end_p (gsi); gsi_prev (&gsi))
++    {
++      stmt = gsi_stmt (gsi);
++      if (stmt && is_gimple_debug (stmt))
++        continue;
++      if (stmt && gimple_code (stmt) == GIMPLE_COND)
++        {
++          if (!get_iv_upper_bound (stmt))
++            return false;
++          if (!get_iv_base (stmt))
++            return false;
++        }
++      else if (stmt && gimple_code (stmt) == GIMPLE_ASSIGN)
++        {
++          if (stmt != origin_loop.update_stmt || !origin_loop.exist_prolog_assgin)
++            return false;
++        }
++      else
++        return false;
++    }
++
++  if (origin_loop.entry_edge != NULL && origin_loop.base != NULL
++      && origin_loop.update_stmt != NULL && origin_loop.limit != NULL)
++    return true;
++
++  return false;
++}
++
++/* When prolog_assign does not exist, make sure that update_stmt exists in
++   the loop latch and has the specific form, eg:
++     len_2 = len_1 + 1. */
++
++static bool
++record_origin_loop_latch (class loop *loop)
++{
++  basic_block latch = loop->latch;
++  gimple_stmt_iterator gsi;
++  gimple *stmt;
++
++  gsi = gsi_start_bb (latch);
++
++  if (origin_loop.exist_prolog_assgin)
++    {
++      if (gsi_end_p (gsi))
++        return true;
++    }
++  else
++    {
++      if (gsi_one_before_end_p (gsi))
++        {
++          stmt = gsi_stmt (gsi);
++          if (stmt == origin_loop.update_stmt)
++            return true;
++        }
++    }
++  return false;
++}
++
++/* Returns true when the DEF_STMT corresponding to arg0 of the mem_ref tree
++   is a POINTER_PLUS_EXPR. 
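++   For illustration only (hypothetical SSA names), in
++     _4 = pb_13 (D) + _3;
++     _5 = MEM[(const unsigned char *)_4];
++   arg0 of the MEM_REF is _4 with a zero offset, and its DEF_STMT is the
++   POINTER_PLUS_EXPR, so the check succeeds. 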
*/
++
++static bool
++check_body_mem_ref (tree mem_ref)
++{
++  tree arg0 = TREE_OPERAND (mem_ref, 0);
++  tree arg1 = TREE_OPERAND (mem_ref, 1);
++
++  if (TREE_CODE (TREE_TYPE (arg0)) == POINTER_TYPE
++      && TREE_CODE (arg1) == INTEGER_CST
++      && tree_to_uhwi (arg1) == 0)
++    {
++      gimple *tmp_g = SSA_NAME_DEF_STMT (arg0);
++      if (tmp_g && gimple_assign_rhs_code (tmp_g) == POINTER_PLUS_EXPR)
++        return true;
++    }
++  return false;
++}
++
++/* Returns true if the rhs2 of the current stmt comes from the base in the
++   original loop. */
++
++static bool
++check_body_pointer_plus (gimple *stmt, tree &tmp_index)
++{
++  tree rhs1 = gimple_assign_rhs1 (stmt);
++  tree rhs2 = gimple_assign_rhs2 (stmt);
++  if (TREE_CODE (TREE_TYPE (rhs1)) == POINTER_TYPE)
++    {
++      gimple *g = SSA_NAME_DEF_STMT (rhs2);
++      if (g && gimple_assign_rhs_code (g) == NOP_EXPR)
++        {
++          tree nop_rhs = gimple_assign_rhs1 (g);
++          if (same_ssa_name_var_p (nop_rhs, origin_loop.base))
++            {
++              if (!origin_loop.arr1)
++                {
++                  origin_loop.arr1 = rhs1;
++                  tmp_index = rhs2;
++                }
++              else if (!origin_loop.arr2)
++                {
++                  origin_loop.arr2 = rhs1;
++                  if (tmp_index != rhs2)
++                    return false;
++                }
++              else
++                return false;
++              return true;
++            }
++        }
++    }
++  return false;
++}
++
++/* Record the array comparison information in the original loop, while
++   ensuring that there are only statements related to cond_stmt in the loop
++   body. */
++
++static bool
++record_origin_loop_body (class loop *loop)
++{
++  basic_block body = gimple_bb (origin_loop.cond_stmt2);
++
++  if (origin_loop.arr1 != NULL || origin_loop.arr2 != NULL)
++    return false;
++
++  gimple_stmt_iterator gsi;
++  for (gsi = gsi_start_bb (body); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      gimple_set_visited (gsi_stmt (gsi), false);
++    }
++
++  tree cond_lhs = gimple_cond_lhs (origin_loop.cond_stmt2);
++  tree cond_rhs = gimple_cond_rhs (origin_loop.cond_stmt2);
++  if (TREE_CODE (TREE_TYPE (cond_lhs)) != INTEGER_TYPE
++      || TREE_CODE (TREE_TYPE (cond_rhs)) != INTEGER_TYPE)
++    return false;
++
++  auto_vec<tree> stack;
++  tree tmp_index = NULL;
++  stack.safe_push (cond_lhs);
++  stack.safe_push (cond_rhs);
++  gimple_set_visited (origin_loop.cond_stmt2, true);
++
++  while (!stack.is_empty ())
++    {
++      tree op = stack.pop ();
++      gimple *g = SSA_NAME_DEF_STMT (op);
++      if (!g || gimple_bb (g) != body || !is_gimple_assign (g))
++        continue;
++      gimple_set_visited (g, true);
++      if (gimple_assign_rhs_code (g) == MEM_REF)
++        {
++          tree mem_ref = gimple_assign_rhs1 (g);
++          if (!check_body_mem_ref (mem_ref))
++            return false;
++          stack.safe_push (TREE_OPERAND (mem_ref, 0));
++        }
++      else if (gimple_assign_rhs_code (g) == POINTER_PLUS_EXPR)
++        {
++          tree rhs2 = gimple_assign_rhs2 (g);
++          if (!check_body_pointer_plus (g, tmp_index))
++            return false;
++          stack.safe_push (rhs2);
++        }
++      else if (gimple_assign_rhs_code (g) == NOP_EXPR)
++        {
++          tree rhs = gimple_assign_rhs1 (g);
++          if (!same_ssa_name_var_p (rhs, origin_loop.base))
++            return false;
++          stack.safe_push (rhs);
++        }
++      else
++        return false;
++    }
++  bool allvisited = true;
++  for (gsi = gsi_start_bb (body); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      if (!gimple_visited_p (gsi_stmt (gsi))
++          && !is_gimple_debug (gsi_stmt (gsi)))
++        allvisited = false;
++    }
++  if (allvisited)
++    {
++      if (origin_loop.arr1 != NULL && origin_loop.arr2 != NULL)
++        return true;
++    }
++  return false;
++}
++
++/* Dump the original loop information to see if the origin loop
++   form matches. 
*/ ++ ++static void ++dump_origin_loop_info () ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nThe origin loop info:\n"); ++ fprintf (dump_file, "\n the origin_loop.limit is:\n"); ++ print_node (dump_file, "", origin_loop.limit, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.base is:\n"); ++ print_node (dump_file, "", origin_loop.base, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.arr1 is:\n"); ++ print_node (dump_file, "", origin_loop.arr1, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.arr2 is:\n"); ++ print_node (dump_file, "", origin_loop.arr2, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.cond_stmt1 is:\n"); ++ print_gimple_stmt (dump_file, origin_loop.cond_stmt1, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.cond_stmt2 is:\n"); ++ print_gimple_stmt (dump_file, origin_loop.cond_stmt2, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.update_stmt is:\n"); ++ print_gimple_stmt (dump_file, origin_loop.update_stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Returns true only if the exit bb of the original loop is unique and its phi ++ node parameter comes from the same variable. */ ++ ++static bool ++check_exit_bb (class loop *loop) ++{ ++ if (origin_loop.exit_bb1 != origin_loop.exit_bb2 ++ || flow_bb_inside_loop_p (loop, origin_loop.exit_bb1)) ++ return false; ++ ++ gphi_iterator gsi; ++ for (gsi = gsi_start_phis (origin_loop.exit_bb1); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ gphi *phi = gsi.phi (); ++ tree res = gimple_phi_result (phi); ++ if (!same_ssa_name_var_p (res, origin_loop.base)) ++ continue; ++ if (gimple_phi_num_args (phi) == 2) ++ { ++ tree arg0 = gimple_phi_arg_def (phi, 0); ++ tree arg1 = gimple_phi_arg_def (phi, 1); ++ if (arg0 == arg1) ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Make sure that the recorded origin_loop information meets the ++ relative requirements. */ ++ ++static bool ++check_origin_loop_info (class loop *loop) ++{ ++ dump_origin_loop_info (); ++ tree arr1_elem_size, arr2_elem_size; ++ ++ if (!check_exit_bb (loop)) ++ return false; ++ ++ if (TREE_CODE (origin_loop.base) != SSA_NAME) ++ return false; ++ ++ if (!TYPE_READONLY (TREE_TYPE (origin_loop.limit))) ++ return false; ++ ++ if (!TYPE_READONLY (TREE_TYPE (TREE_TYPE (origin_loop.arr1)))) ++ return false; ++ ++ if (!TYPE_READONLY (TREE_TYPE (TREE_TYPE (origin_loop.arr2)))) ++ return false; ++ ++ if (TREE_CODE (TREE_TYPE (origin_loop.arr1)) != POINTER_TYPE ++ || TREE_CODE (TREE_TYPE (origin_loop.arr2)) != POINTER_TYPE ++ || TREE_CODE (TREE_TYPE (TREE_TYPE (origin_loop.arr1))) != INTEGER_TYPE ++ || TREE_CODE (TREE_TYPE (TREE_TYPE (origin_loop.arr2))) != INTEGER_TYPE) ++ return false; ++ ++ arr1_elem_size = TYPE_SIZE (TREE_TYPE (TREE_TYPE (origin_loop.arr1))); ++ arr2_elem_size = TYPE_SIZE (TREE_TYPE (TREE_TYPE (origin_loop.arr2))); ++ ++ if (tree_to_uhwi (arr1_elem_size) != 8 || tree_to_uhwi (arr2_elem_size) != 8) ++ return false; ++ ++ return true; ++} ++ ++/* Record the useful information of the original loop and judge whether the ++ information meets the specified conditions. 
*/
++
++static bool
++check_record_loop_form (class loop *loop)
++{
++  if (!record_origin_loop_exit_info (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nFailed to record loop exit information.\n");
++        }
++      return false;
++    }
++
++  if (!record_origin_loop_header (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nFailed to record loop header information.\n");
++        }
++      return false;
++    }
++
++  if (!record_origin_loop_latch (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nFailed to record loop latch information.\n");
++        }
++      return false;
++    }
++
++  if (!record_origin_loop_body (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nFailed to record loop body information.\n");
++        }
++      return false;
++    }
++
++  if (!check_origin_loop_info (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nFailed to check origin loop information.\n");
++        }
++      return false;
++    }
++
++  return true;
++}
++
++/* The main entry for judging whether the loop meets the required
++   conditions. */
++
++static bool
++determine_loop_form (class loop *loop)
++{
++  /* Currently only standard loops are processed, that is, loops containing
++     exactly 3 bbs: loop_header, loop_body and loop_latch. */
++  if (loop->inner || loop->num_nodes != 3)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nWrong loop form, there is an inner loop or"
++                   " a redundant bb.\n");
++        }
++      return false;
++    }
++
++  if (single_exit (loop) || !loop->latch)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nWrong loop form, only one exit or loop_latch"
++                   " does not exist.\n");
++        }
++      return false;
++    }
++
++  /* Support loop with only one backedge. */
++  if (!loop_single_backedge_p (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nWrong loop form, the loop backedge is not"
++                   " unique.\n");
++        }
++      return false;
++    }
++
++  /* Support loop with only one preheader BB. */
++  if (!loop_single_preheader_bb (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nWrong loop form, the loop preheader bb is not"
++                   " unique.\n");
++        }
++      return false;
++    }
++
++  init_origin_loop_structure ();
++  if (!check_record_loop_form (loop))
++    return false;
++
++  return true;
++}
++
++/* Create prolog bb for the newly constructed loop; when prolog_assign exists
++   in the original loop, the corresponding assign needs to be added to
++   prolog_bb, eg:
++     len_16 = len_10 + 1
++   Create a simple copy statement when prolog_assign does not exist, eg:
++     len_16 = len_10
++
++   The IR of bb is as above. 
*/ ++ ++static void ++create_prolog_bb (basic_block &prolog_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer, edge entry_edge) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ tree lhs1; ++ ++ prolog_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (prolog_bb, outer); ++ redirect_edge_and_branch (entry_edge, prolog_bb); ++ set_immediate_dominator (CDI_DOMINATORS, prolog_bb, dominator_bb); ++ gsi = gsi_last_bb (prolog_bb); ++ lhs1 = copy_ssa_name (origin_loop.base); ++ ++ if (origin_loop.exist_prolog_assgin) ++ g = gimple_build_assign (lhs1, PLUS_EXPR, origin_loop.base, ++ build_int_cst (TREE_TYPE (origin_loop.base), origin_loop.step)); ++ else ++ g = gimple_build_assign (lhs1, NOP_EXPR, origin_loop.base); ++ gimple_seq_add_stmt (&stmts, g); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ set_current_def (origin_loop.base, lhs1); ++ defs_map.put (prolog_bb, lhs1); ++} ++ ++/* Create preheader bb for new loop; In order to ensure the standard form of ++ the loop, add a preheader_bb before loop_header. */ ++ ++static void ++create_loop_pred_bb (basic_block &loop_pred_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ ++ loop_pred_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (loop_pred_bb, outer); ++ set_immediate_dominator (CDI_DOMINATORS, loop_pred_bb, dominator_bb); ++ defs_map.put (loop_pred_bb, get_current_def (origin_loop.base)); ++} ++ ++/* Add phi_arg for bb with phi node. */ ++ ++static void ++rewrite_add_phi_arg (basic_block bb) ++{ ++ edge e; ++ edge_iterator ei; ++ gphi *phi; ++ gphi_iterator gsi; ++ tree res; ++ location_t loc; ++ ++ for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ phi = gsi.phi (); ++ res = gimple_phi_result (phi); ++ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ { ++ if (PHI_ARG_DEF_FROM_EDGE (phi, e)) ++ continue; ++ tree var = *(defs_map.get (e->src)); ++ if (!same_ssa_name_var_p (var, res)) ++ continue; ++ if (virtual_operand_p (var)) ++ loc = UNKNOWN_LOCATION; ++ else ++ loc = gimple_location (SSA_NAME_DEF_STMT (var)); ++ add_phi_arg (phi, var, e, loc); ++ } ++ } ++} ++ ++/* Create loop_header BB for align_loop. ++ eg: ++ _18 = (long unsigned int) len_17; ++ _19 = _18 + 8; ++ _20 = (long unsigned int) len_limit_12 (D); ++ if (_19 <= _20) ++ ++ The IR of bb is as above. 
*/ ++ ++static void ++create_align_loop_header (basic_block &align_loop_header, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gcond *cond_stmt; ++ gphi *phi; ++ tree res; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ align_loop_header = create_empty_bb (after_bb); ++ add_bb_to_loop (align_loop_header, outer); ++ make_single_succ_edge (after_bb, align_loop_header, EDGE_FALLTHRU); ++ set_immediate_dominator (CDI_DOMINATORS, align_loop_header, dominator_bb); ++ gsi = gsi_last_bb (align_loop_header); ++ phi = create_phi_node (NULL_TREE, align_loop_header); ++ create_new_def_for (entry_node, phi, gimple_phi_result_ptr (phi)); ++ res = gimple_phi_result (phi); ++ ++ tree lhs1 = gimple_build (&stmts, NOP_EXPR, long_unsigned_type_node, res); ++ tree lhs2 = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (lhs1), lhs1, ++ build_int_cst (TREE_TYPE (lhs1), 8)); ++ tree lhs3 = gimple_build (&stmts, NOP_EXPR, long_unsigned_type_node, ++ origin_loop.limit); ++ cond_stmt = gimple_build_cond (LE_EXPR, lhs2, lhs3, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ ++ set_current_def (origin_loop.base, res); ++ defs_map.put (align_loop_header, res); ++} ++ ++/* Create loop body BB for align_loop. ++ eg: ++ _21 = (sizetype) len_17; ++ _22 = cur_15 (D) + _21; ++ _23 = MEM[(long unsigned int *)_22]; ++ _24 = pb_13 (D) + _21; ++ _25 = MEM[(long unsigned int *)_24]; ++ if (_23 != _25) ++ ++ The IR of bb is as above. */ ++ ++static void ++create_align_loop_body_bb (basic_block &align_loop_body_bb, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ gcond *cond_stmt; ++ tree lhs1, lhs2; ++ ++ align_loop_body_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (align_loop_body_bb, outer); ++ make_edge (after_bb, align_loop_body_bb, EDGE_TRUE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, align_loop_body_bb, dominator_bb); ++ gsi = gsi_last_bb (align_loop_body_bb); ++ ++ tree var = gimple_build (&stmts, NOP_EXPR, sizetype, ++ get_current_def (origin_loop.base)); ++ lhs1 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr2), ++ origin_loop.arr2, var); ++ g = gimple_build_assign (make_ssa_name (long_unsigned_type_node), ++ fold_build2 (MEM_REF, long_unsigned_type_node, lhs1, ++ build_int_cst (build_pointer_type (long_unsigned_type_node), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs1 = gimple_assign_lhs (g); ++ lhs2 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr1), ++ origin_loop.arr1, var); ++ g = gimple_build_assign (make_ssa_name (long_unsigned_type_node), ++ fold_build2 (MEM_REF, long_unsigned_type_node, lhs2, ++ build_int_cst (build_pointer_type (long_unsigned_type_node), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs2 = gimple_assign_lhs (g); ++ cond_stmt = gimple_build_cond (gimple_cond_code (origin_loop.cond_stmt2), ++ lhs1, lhs2, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++} ++ ++/* Create loop_latch BB for align_loop. ++ eg: ++ len_26 = len_17 + 8; ++ ++ The IR of bb is as above. 
*/
++
++static void
++create_align_loop_latch (basic_block &align_loop_latch, basic_block after_bb,
++                         basic_block dominator_bb, class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple *g;
++  tree res;
++
++  tree entry_node = get_current_def (origin_loop.base);
++  align_loop_latch = create_empty_bb (after_bb);
++  add_bb_to_loop (align_loop_latch, outer);
++  make_edge (after_bb, align_loop_latch, EDGE_FALSE_VALUE);
++  set_immediate_dominator (CDI_DOMINATORS, align_loop_latch, dominator_bb);
++  gsi = gsi_last_bb (align_loop_latch);
++  res = copy_ssa_name (entry_node);
++  g = gimple_build_assign (res, PLUS_EXPR, entry_node,
++                           build_int_cst (TREE_TYPE (entry_node), 8));
++  gimple_seq_add_stmt (&stmts, g);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++  defs_map.put (align_loop_latch, res);
++}
++
++/* Create a new loop and add it to outer_loop and return. */
++
++static class loop *
++init_new_loop (class loop *outer_loop, basic_block header, basic_block latch)
++{
++  class loop *new_loop;
++  new_loop = alloc_loop ();
++  new_loop->header = header;
++  new_loop->latch = latch;
++  add_loop (new_loop, outer_loop);
++
++  return new_loop;
++}
++
++/* Create necessary exit BB for align_loop.
++   eg:
++     _27 = _23 ^ _25;
++     _28 = __builtin_ctzll (_27);
++     _29 = _28 >> 3;
++     len_30 = _29 + len_17;
++
++   The IR of bb is as above. */
++
++static void
++create_align_loop_exit_bb (basic_block &align_loop_exit_bb,
++                           basic_block after_bb, basic_block dominator_bb,
++                           class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple *g;
++  gimple *cond_stmt;
++  tree lhs1, lhs2;
++  tree cond_lhs, cond_rhs;
++  gcall *build_ctzll;
++
++  tree entry_node = get_current_def (origin_loop.base);
++  align_loop_exit_bb = create_empty_bb (after_bb);
++  add_bb_to_loop (align_loop_exit_bb, outer);
++  make_edge (after_bb, align_loop_exit_bb, EDGE_TRUE_VALUE);
++  set_immediate_dominator (CDI_DOMINATORS, align_loop_exit_bb, dominator_bb);
++  gsi = gsi_last_bb (align_loop_exit_bb);
++
++  cond_stmt = gsi_stmt (gsi_last_bb (after_bb));
++  cond_lhs = gimple_cond_lhs (cond_stmt);
++  cond_rhs = gimple_cond_rhs (cond_stmt);
++
++  lhs1 = gimple_build (&stmts, BIT_XOR_EXPR, TREE_TYPE (cond_lhs), cond_lhs,
++                       cond_rhs);
++  build_ctzll = gimple_build_call (builtin_decl_explicit (BUILT_IN_CTZLL), 1,
++                                   lhs1);
++  lhs1 = make_ssa_name (integer_type_node);
++  gimple_call_set_lhs (build_ctzll, lhs1);
++  gimple_seq_add_stmt (&stmts, build_ctzll);
++  lhs2 = copy_ssa_name (lhs1);
++  g = gimple_build_assign (lhs2, RSHIFT_EXPR, lhs1,
++                           build_int_cst (TREE_TYPE (lhs1), 3));
++  gimple_seq_add_stmt (&stmts, g);
++  lhs1 = gimple_build (&stmts, NOP_EXPR, TREE_TYPE (entry_node), lhs2);
++  lhs2 = copy_ssa_name (entry_node);
++  g = gimple_build_assign (lhs2, PLUS_EXPR, lhs1, entry_node);
++  gimple_seq_add_stmt (&stmts, g);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++  defs_map.put (align_loop_exit_bb, lhs2);
++}
++
++/* Create loop_header BB for epilogue_loop.
++   eg:
++     # len_31 = PHI <...>
++     if (len_31 != len_limit_12 (D))
++
++   The IR of bb is as above. 
*/ ++ ++static void ++create_epilogue_loop_header (basic_block &epilogue_loop_header, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gcond *cond_stmt; ++ tree res; ++ gphi *phi; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ epilogue_loop_header = create_empty_bb (after_bb); ++ add_bb_to_loop (epilogue_loop_header, outer); ++ make_single_succ_edge (after_bb, epilogue_loop_header, EDGE_FALLTHRU); ++ set_immediate_dominator (CDI_DOMINATORS, epilogue_loop_header, dominator_bb); ++ gsi = gsi_last_bb (epilogue_loop_header); ++ phi = create_phi_node (NULL_TREE, epilogue_loop_header); ++ create_new_def_for (entry_node, phi, gimple_phi_result_ptr (phi)); ++ res = gimple_phi_result (phi); ++ cond_stmt = gimple_build_cond (gimple_cond_code (origin_loop.cond_stmt1), res, ++ origin_loop.limit, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ ++ set_current_def (origin_loop.base, res); ++ defs_map.put (epilogue_loop_header, res); ++} ++ ++/* Create loop body BB for epilogue_loop. ++ eg: ++ _32 = (sizetype) len_31; ++ _33 = pb_13 (D) + _32; ++ _34 = *_33; ++ _35 = cur_15 (D) + _32; ++ _36 = *_35; ++ if (_34 != _36) ++ ++ The IR of bb is as above. */ ++ ++static void ++create_epilogue_loop_body_bb (basic_block &epilogue_loop_body_bb, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ gcond *cond_stmt; ++ tree lhs1, lhs2, lhs3; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ epilogue_loop_body_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (epilogue_loop_body_bb, outer); ++ make_edge (after_bb, epilogue_loop_body_bb, EDGE_TRUE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, epilogue_loop_body_bb, dominator_bb); ++ gsi = gsi_last_bb (epilogue_loop_body_bb); ++ lhs1 = gimple_build (&stmts, NOP_EXPR, sizetype, entry_node); ++ lhs2 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr1), ++ origin_loop.arr1, lhs1); ++ g = gimple_build_assign (make_ssa_name (unsigned_char_type_node), ++ fold_build2 (MEM_REF, unsigned_char_type_node, lhs2, ++ build_int_cst (TREE_TYPE (lhs2), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs2 = gimple_assign_lhs (g); ++ lhs3 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr2), ++ origin_loop.arr2, lhs1); ++ g = gimple_build_assign (make_ssa_name (unsigned_char_type_node), ++ fold_build2 (MEM_REF, unsigned_char_type_node, lhs3, ++ build_int_cst (TREE_TYPE (lhs3), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs3 = gimple_assign_lhs (g); ++ cond_stmt = gimple_build_cond (gimple_cond_code (origin_loop.cond_stmt2), lhs2, ++ lhs3, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ defs_map.put (epilogue_loop_body_bb, get_current_def (origin_loop.base)); ++} ++ ++/* Create loop_latch BB for epilogue_loop. ++ eg: ++ len_37 = len_31 + 1; ++ ++ The IR of bb is as above. 
*/
++
++static void
++create_epilogue_loop_latch (basic_block &epilogue_loop_latch,
++                            basic_block after_bb, basic_block dominator_bb,
++                            class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple *g;
++  tree res;
++
++  tree entry_node = get_current_def (origin_loop.base);
++  epilogue_loop_latch = create_empty_bb (after_bb);
++  add_bb_to_loop (epilogue_loop_latch, outer);
++  make_edge (after_bb, epilogue_loop_latch, EDGE_FALSE_VALUE);
++  set_immediate_dominator (CDI_DOMINATORS, epilogue_loop_latch, dominator_bb);
++  gsi = gsi_last_bb (epilogue_loop_latch);
++  res = copy_ssa_name (entry_node);
++  g = gimple_build_assign (res, PLUS_EXPR, entry_node,
++                           build_int_cst (TREE_TYPE (entry_node),
++                                          origin_loop.step));
++  gimple_seq_add_stmt (&stmts, g);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++  defs_map.put (epilogue_loop_latch, res);
++}
++
++/* convert_to_new_loop
++   | |
++   | |
++   | | entry_edge
++   | ______ |
++   | / V V
++   | | -----origin_loop_header---
++   | | | |
++   | | -------------------------\
++   | | | \
++   | | V \___ ___ ___ ___ ___ ___ ___
++   | | -----origin_loop_body----- |
++   | | | | |
++   | | -------------------------\ |
++   | | | \___ ___ ___ ___ |
++   | | V V V
++   | | -----origin_loop_latch---- -----exit_bb------
++   | | | | | |
++   | | /-------------------------- ------------------
++   | \ __ /
++   |
++   | |
++   | ====> |entry_edge
++   | V
++   | -------prolog_bb-----
++   | | |
++   | ---------------------
++   | |
++   | V
++   | -----align_loop_header----
++   | /-----------------> | |
++   |/ --------------------------
++   || / \
++   || V V
++   || ---align_loop_body--- ---epilogue_loop_header--
++   || | | -------| |<---|
++   || --------------------\ / ------------------------- |
++   || | \____ | | |
++   || V | | V |
++   || ---align_loop_latch--- | | ---epilogue_loop_body---- |
++   || | | | | ----| | |
++   || ---------------------- | | / ------------------------- |
++   || / __________/ | | | |
++   || / | | | V |
++   | \ __________/ | | | ---epilogue_loop_latch--- |
++   | | | | | | |
++   | | | | ------------------------- /
++   | V | | | /
++   | -align_loop_exit_bb- | | \______________/
++   | | | | |
++   | -------------------- | |
++   | | | |
++   | | V V
++   | | -----exit_bb------
++   | |---->| |
++   | ------------------
++
++   The origin_loop conversion process starts from entry_edge and ends at
++   exit_bb; the execution logic of origin_loop is completely replaced by
++   align_loop + epilogue_loop:
++   1) align_loop mainly implements the idea of using wide-type dereference
++      and comparison on array elements, so as to achieve the effect of
++      acceleration; for the corresponding source code understanding, please
++      refer to the description of the pass at the beginning;
++   2) epilogue_loop compares the remaining array elements that align_loop
++      has not processed. 
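++
++   As a rough, purely illustrative C sketch (not part of the patch; the
++   types and casts are simplified and alignment concerns are ignored), the
++   conversion rewrites
++     while (++len != len_limit)
++       if (pb[len] != cur[len])
++         break;
++   into approximately
++     for (++len; len + 8 <= len_limit; len += 8)
++       {
++         unsigned long long a = *(unsigned long long *) (pb + len);
++         unsigned long long b = *(unsigned long long *) (cur + len);
++         if (a != b)
++           {
++             len += __builtin_ctzll (a ^ b) >> 3;
++             goto done;
++           }
++       }
++     while (len != len_limit && pb[len] == cur[len])
++       ++len;
++     done:;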
*/ ++ ++static void ++create_new_loops (edge entry_edge) ++{ ++ basic_block prolog_bb; ++ basic_block align_loop_header, align_loop_latch, align_loop_body_bb; ++ basic_block align_pred_bb, align_loop_exit_bb; ++ basic_block epilogue_loop_header, epilogue_loop_latch, epilogue_loop_body_bb; ++ basic_block epilogue_loop_pred_bb; ++ class loop *align_loop; ++ class loop *epilogue_loop; ++ ++ class loop *outer = entry_edge->src->loop_father; ++ ++ create_prolog_bb (prolog_bb, entry_edge->src, entry_edge->src, outer, ++ entry_edge); ++ ++ create_loop_pred_bb (align_pred_bb, prolog_bb, prolog_bb, outer); ++ make_single_succ_edge (prolog_bb, align_pred_bb, EDGE_FALLTHRU); ++ ++ create_align_loop_header (align_loop_header, align_pred_bb, ++ align_pred_bb, outer); ++ ++ create_align_loop_body_bb (align_loop_body_bb, align_loop_header, ++ align_loop_header, outer); ++ ++ create_align_loop_latch (align_loop_latch, align_loop_body_bb, ++ align_loop_body_bb, outer); ++ make_edge (align_loop_latch, align_loop_header, EDGE_FALLTHRU); ++ rewrite_add_phi_arg (align_loop_header); ++ ++ align_loop = init_new_loop (outer, align_loop_header, align_loop_latch); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nPrint byte align loop %d:\n", align_loop->num); ++ flow_loop_dump (align_loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } ++ ++ create_align_loop_exit_bb (align_loop_exit_bb, align_loop_body_bb, ++ align_loop_body_bb, outer); ++ ++ create_loop_pred_bb (epilogue_loop_pred_bb, align_loop_header, ++ align_loop_header, outer); ++ make_edge (align_loop_header, epilogue_loop_pred_bb, EDGE_FALSE_VALUE); ++ ++ create_epilogue_loop_header (epilogue_loop_header, epilogue_loop_pred_bb, ++ epilogue_loop_pred_bb, outer); ++ ++ create_epilogue_loop_body_bb (epilogue_loop_body_bb, epilogue_loop_header, ++ epilogue_loop_header, outer); ++ ++ create_epilogue_loop_latch (epilogue_loop_latch, epilogue_loop_body_bb, ++ epilogue_loop_body_bb, outer); ++ make_single_succ_edge (epilogue_loop_latch, epilogue_loop_header, ++ EDGE_FALLTHRU); ++ rewrite_add_phi_arg (epilogue_loop_header); ++ ++ epilogue_loop = init_new_loop (outer, epilogue_loop_header, ++ epilogue_loop_latch); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nPrint epilogue loop %d:\n", epilogue_loop->num); ++ flow_loop_dump (epilogue_loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } ++ make_single_succ_edge (align_loop_exit_bb, origin_loop.exit_bb1, ++ EDGE_FALLTHRU); ++ set_immediate_dominator (CDI_DOMINATORS, origin_loop.exit_bb1, ++ entry_edge->src); ++ make_edge (epilogue_loop_body_bb, origin_loop.exit_bb1, EDGE_TRUE_VALUE); ++ ++ make_edge (epilogue_loop_header, origin_loop.exit_bb2, EDGE_FALSE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, origin_loop.exit_bb2, ++ entry_edge->src); ++ ++ rewrite_add_phi_arg (origin_loop.exit_bb1); ++ rewrite_add_phi_arg (origin_loop.exit_bb2); ++ ++ remove_edge (origin_loop.exit_e1); ++ remove_edge (origin_loop.exit_e2); ++} ++ ++/* Make sure that the dominance relationship of the newly inserted cfg ++ is not missing. 
*/
++
++static void
++update_loop_dominator (cdi_direction dir)
++{
++  gcc_assert (dom_info_available_p (dir));
++
++  basic_block bb;
++  FOR_EACH_BB_FN (bb, cfun)
++    {
++      basic_block imm_bb = get_immediate_dominator (dir, bb);
++      if (!imm_bb || bb == origin_loop.exit_bb1)
++        {
++          set_immediate_dominator (CDI_DOMINATORS, bb,
++                                   recompute_dominator (CDI_DOMINATORS, bb));
++          continue;
++        }
++    }
++}
++
++/* Clear information about the original loop. */
++
++static void
++remove_origin_loop (class loop *loop)
++{
++  basic_block *body;
++
++  body = get_loop_body_in_dom_order (loop);
++  unsigned n = loop->num_nodes;
++  for (unsigned i = 0; i < n; i++)
++    {
++      delete_basic_block (body[i]);
++    }
++  free (body);
++  delete_loop (loop);
++}
++
++/* Perform the conversion of origin_loop to new_loop. */
++
++static void
++convert_to_new_loop (class loop *loop)
++{
++  create_new_loops (origin_loop.entry_edge);
++  remove_origin_loop (loop);
++  update_loop_dominator (CDI_DOMINATORS);
++  update_ssa (TODO_update_ssa);
++}
++
++/* The main entry of the array-widen-compare optimization. */
++
++static unsigned int
++tree_ssa_array_widen_compare ()
++{
++  unsigned int todo = 0;
++  class loop *loop;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      flow_loops_dump (dump_file, NULL, 1);
++      fprintf (dump_file, "\nConfirm which loop can be optimized using"
++               " array-widen-compare\n");
++    }
++
++  enum li_flags LI = LI_FROM_INNERMOST;
++  FOR_EACH_LOOP (loop, LI)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "======================================\n");
++          fprintf (dump_file, "Processing loop %d:\n", loop->num);
++          fprintf (dump_file, "======================================\n");
++          flow_loop_dump (loop, dump_file, NULL, 1);
++          fprintf (dump_file, "\n\n");
++        }
++
++      if (determine_loop_form (loop))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "The %dth loop form is successfully matched,"
++                       " and the loop can be optimized.\n",
++                       loop->num);
++              dump_loop_bb (loop);
++            }
++
++          convert_to_new_loop (loop);
++        }
++    }
++
++  todo |= (TODO_update_ssa);
++  return todo;
++}
++
++/* Array widen compare. */
++
++namespace {
++
++const pass_data pass_data_tree_array_widen_compare =
++{
++  GIMPLE_PASS,
++  "awiden_compare",
++  OPTGROUP_LOOP,
++  TV_TREE_ARRAY_WIDEN_COMPARE,
++  (PROP_cfg | PROP_ssa),
++  0,
++  0,
++  0,
++  (TODO_update_ssa | TODO_verify_all)
++};
++
++class pass_array_widen_compare : public gimple_opt_pass
++{
++public:
++  pass_array_widen_compare (gcc::context *ctxt)
++    : gimple_opt_pass (pass_data_tree_array_widen_compare, ctxt)
++  {}
++
++  /* opt_pass methods: */
++  virtual bool gate (function *);
++  virtual unsigned int execute (function *);
++
++}; // class pass_array_widen_compare
++
++bool
++pass_array_widen_compare::gate (function *)
++{
++  return (flag_array_widen_compare > 0 && optimize >= 3);
++}
++
++unsigned int
++pass_array_widen_compare::execute (function *fun)
++{
++  if (number_of_loops (fun) <= 1)
++    return 0;
++
++  /* Only supports LP64 data mode. 
*/
++  if (TYPE_PRECISION (long_integer_type_node) != 64
++      || POINTER_SIZE != 64 || TYPE_PRECISION (integer_type_node) != 32)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "The current data mode is not supported,"
++                 " only the LP64 data mode is supported.\n");
++      return 0;
++    }
++
++  return tree_ssa_array_widen_compare ();
++}
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_array_widen_compare (gcc::context *ctxt)
++{
++  return new pass_array_widen_compare (ctxt);
++}
+\ No newline at end of file
+--
+2.27.0.windows.1
+
diff --git a/gcc.spec b/gcc.spec
index acc5506cf28222ffa28e9b5fc7442e9461cf4554..d5a0273e20b374eefb49e219dd6f9c26dc6e262f 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -61,7 +61,7 @@ Summary: Various compilers (C, C++, Objective-C, ...)
 Name: gcc
 Version: %{gcc_version}
-Release: 12
+Release: 13
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
 URL: https://gcc.gnu.org
@@ -150,6 +150,18 @@ Patch31: 0031-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch
 Patch32: 0032-Autoprefetch-Prune-invaild-loops-containing-edges-wh.patch
 Patch33: 0033-AutoFdo-Fix-memory-leaks-in-autofdo-and-autoprefetch.patch
 Patch34: 0034-Backport-sanitizer-Fix-asan-against-glibc-2.34-PR100.patch
+Patch35: 0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch
+Patch36: 0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch
+Patch37: 0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch
+Patch38: 0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch
+Patch39: 0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch
+Patch40: 0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch
+Patch41: 0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch
+Patch42: 0042-DFE-Fix-bugs.patch
+Patch43: 0043-Backport-Extend-special_memory_constraint.patch
+Patch44: 0044-Backport-ira-Fix-unnecessary-register-spill.patch
+Patch45: 0045-Transposed-SLP-Enable-Transposed-SLP.patch
+Patch46: 0046-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch
 
 %global gcc_target_platform %{_arch}-linux-gnu
 
@@ -640,6 +652,18 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch32 -p1
 %patch33 -p1
 %patch34 -p1
+%patch35 -p1
+%patch36 -p1
+%patch37 -p1
+%patch38 -p1
+%patch39 -p1
+%patch40 -p1
+%patch41 -p1
+%patch42 -p1
+%patch43 -p1
+%patch44 -p1
+%patch45 -p1
+%patch46 -p1
 
 %build
 
@@ -2660,6 +2684,12 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Mon Aug 8 2022 benniaobufeijiushiji - 10.3.1-13
+- Type:Sync
+- ID:NA
+- SUG:NA
+- DESC:Sync patch from openeuler/gcc
+
 * Fri Jul 08 2022 zhaomengmeng - 10.3.1-12
 - Type:SPEC
 - ID:NA