diff --git a/0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch b/0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch new file mode 100644 index 0000000000000000000000000000000000000000..91a29d7cf9dadee86fee3f547d64861c2cb8fed1 --- /dev/null +++ b/0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch @@ -0,0 +1,334 @@ +From cf0f086ec274d794a2a180047123920bf8a5224b Mon Sep 17 00:00:00 2001 +From: dingguangya +Date: Mon, 17 Jan 2022 21:03:47 +0800 +Subject: [PATCH] [ccmp] Add another optimization opportunity for ccmp + instruction + +Add flag -fccmp2. +Enables the use of the ccmp instruction by creating a new conflict +relationship for instances where temporary expressions replacement +cannot be effectively created. + +diff --git a/gcc/ccmp.c b/gcc/ccmp.c +index ca77375a9..8d2d73e52 100644 +--- a/gcc/ccmp.c ++++ b/gcc/ccmp.c +@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3. If not see + #include "cfgexpand.h" + #include "ccmp.h" + #include "predict.h" ++#include "gimple-iterator.h" + + /* Check whether T is a simple boolean variable or a SSA name + set by a comparison operator in the same basic block. */ +@@ -129,6 +130,38 @@ ccmp_candidate_p (gimple *g) + return false; + } + ++/* Check whether bb is a potential conditional compare candidate. */ ++bool ++check_ccmp_candidate (basic_block bb) ++{ ++ gimple_stmt_iterator gsi; ++ gimple *bb_last_stmt, *stmt; ++ tree op0, op1; ++ ++ gsi = gsi_last_bb (bb); ++ bb_last_stmt = gsi_stmt (gsi); ++ ++ if (bb_last_stmt && gimple_code (bb_last_stmt) == GIMPLE_COND) ++ { ++ op0 = gimple_cond_lhs (bb_last_stmt); ++ op1 = gimple_cond_rhs (bb_last_stmt); ++ ++ if (TREE_CODE (op0) == SSA_NAME ++ && TREE_CODE (TREE_TYPE (op0)) == BOOLEAN_TYPE ++ && TREE_CODE (op1) == INTEGER_CST ++ && ((gimple_cond_code (bb_last_stmt) == NE_EXPR) ++ || (gimple_cond_code (bb_last_stmt) == EQ_EXPR))) ++ { ++ stmt = SSA_NAME_DEF_STMT (op0); ++ if (stmt && gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ return ccmp_candidate_p (stmt); ++ } ++ } ++ } ++ return false; ++} ++ + /* Extract the comparison we want to do from the tree. */ + void + get_compare_parts (tree t, int *up, rtx_code *rcode, +diff --git a/gcc/ccmp.h b/gcc/ccmp.h +index 199dd581d..ac862f0f6 100644 +--- a/gcc/ccmp.h ++++ b/gcc/ccmp.h +@@ -21,5 +21,6 @@ along with GCC; see the file COPYING3. If not see + #define GCC_CCMP_H + + extern rtx expand_ccmp_expr (gimple *, machine_mode); ++extern bool check_ccmp_candidate (basic_block bb); + + #endif /* GCC_CCMP_H */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 24834cf60..4dd566def 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1942,6 +1942,10 @@ fira-verbose= + Common RejectNegative Joined UInteger Var(flag_ira_verbose) Init(5) + -fira-verbose= Control IRA's level of diagnostic messages. + ++fccmp2 ++Common Report Var(flag_ccmp2) Init(0) Optimization ++Optimize potential ccmp instruction in complex scenarios. ++ + fivopts + Common Report Var(flag_ivopts) Init(1) Optimization + Optimize induction variables on trees. 
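For illustration, the source shape this flag targets is the one exercised by the new ccmp_3.c test added below: a loop whose exit condition ORs two comparisons while the loop counter is updated in the same iteration. The sketch below is a rough mapping (assuming -O and -fccmp2 on an aarch64 target) onto the GIMPLE excerpt quoted later in the comment above determine_add_ccmp_conflict_graph; the SSA names shown in the comment are taken from that excerpt.

    int
    func (int a, int b, int c)
    {
      while (1)
        {
          /* By RTL expansion the two tests have been combined into
             roughly:
               _7 = b >= c;            (b_4 (D) >= c_5 (D))
               _8 = counter == 0;      (ivtmp.5_10 == 0, i.e. the a-- test)
               _9 = _7 | _8;
               counter' = counter - 1; (ivtmp.5_11 = ivtmp.5_10 - 1)
               if (_9 != 0) ...
             Keeping the old and new counter in separate partitions lets
             _9 be replaced by _7 | _8 at expand time and emitted as a
             ccmp sequence instead of two separate compare-and-branches.  */
          if (a-- == 0 || b >= c)
            return 1;
        }
    }
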
+diff --git a/gcc/testsuite/gcc.target/aarch64/ccmp_3.c b/gcc/testsuite/gcc.target/aarch64/ccmp_3.c +new file mode 100644 +index 000000000..b509ba810 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/ccmp_3.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O -fdump-rtl-expand-details -fccmp2" } */ ++ ++int func (int a, int b, int c) ++{ ++ while(1) ++ { ++ if(a-- == 0 || b >= c) ++ { ++ return 1; ++ } ++ } ++} ++ ++/* { dg-final { scan-assembler-times "\tccmp\t" 1} } */ +diff --git a/gcc/tree-ssa-coalesce.c b/gcc/tree-ssa-coalesce.c +index 0b0b1b18d..e0120a4a4 100644 +--- a/gcc/tree-ssa-coalesce.c ++++ b/gcc/tree-ssa-coalesce.c +@@ -38,6 +38,9 @@ along with GCC; see the file COPYING3. If not see + #include "explow.h" + #include "tree-dfa.h" + #include "stor-layout.h" ++#include "ccmp.h" ++#include "target.h" ++#include "tree-outof-ssa.h" + + /* This set of routines implements a coalesce_list. This is an object which + is used to track pairs of ssa_names which are desirable to coalesce +@@ -854,6 +857,198 @@ live_track_clear_base_vars (live_track *ptr) + bitmap_clear (&ptr->live_base_var); + } + ++/* Return true if gimple is a copy assignment. */ ++ ++static inline bool ++gimple_is_assign_copy_p (gimple *gs) ++{ ++ return (is_gimple_assign (gs) && gimple_assign_copy_p (gs) ++ && TREE_CODE (gimple_assign_lhs (gs)) == SSA_NAME ++ && TREE_CODE (gimple_assign_rhs1 (gs)) == SSA_NAME); ++} ++ ++#define MAX_CCMP_CONFLICT_NUM 5 ++ ++/* Clear high-cost conflict graphs. */ ++ ++static void ++remove_high_cost_graph_for_ccmp (ssa_conflicts *conflict_graph) ++{ ++ unsigned x = 0; ++ int add_conflict_num = 0; ++ bitmap b; ++ FOR_EACH_VEC_ELT (conflict_graph->conflicts, x, b) ++ { ++ if (b) ++ { ++ add_conflict_num++; ++ } ++ } ++ if (add_conflict_num >= MAX_CCMP_CONFLICT_NUM) ++ { ++ conflict_graph->conflicts.release (); ++ } ++} ++ ++/* Adding a new conflict graph to the original graph. */ ++ ++static void ++process_add_graph (live_track *live, basic_block bb, ++ ssa_conflicts *conflict_graph) ++{ ++ tree use, def; ++ ssa_op_iter iter; ++ gimple *first_visit_stmt = NULL; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ if (gimple_visited_p (gsi_stmt (gsi))) ++ { ++ first_visit_stmt = gsi_stmt (gsi); ++ break; ++ } ++ } ++ if (!first_visit_stmt) ++ return; ++ ++ for (gimple_stmt_iterator gsi = gsi_last_bb (bb); ++ gsi_stmt (gsi) != first_visit_stmt; gsi_prev (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ if (gimple_visited_p (gsi_stmt (gsi)) && is_gimple_debug (stmt)) ++ { ++ continue; ++ } ++ if (gimple_is_assign_copy_p (stmt)) ++ { ++ live_track_clear_var (live, gimple_assign_rhs1 (stmt)); ++ } ++ FOR_EACH_SSA_TREE_OPERAND (def, stmt, iter, SSA_OP_DEF) ++ { ++ live_track_process_def (live, def, conflict_graph); ++ } ++ FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE) ++ { ++ live_track_process_use (live, use); ++ } ++ } ++} ++ ++/* Build a conflict graph based on ccmp candidate. 
*/ ++ ++static void ++add_ccmp_conflict_graph (ssa_conflicts *conflict_graph, ++ tree_live_info_p liveinfo, var_map map, basic_block bb) ++{ ++ live_track *live; ++ tree use, def; ++ ssa_op_iter iter; ++ live = new_live_track (map); ++ live_track_init (live, live_on_exit (liveinfo, bb)); ++ ++ gimple *last_stmt = gsi_stmt (gsi_last_bb (bb)); ++ gcc_assert (gimple_cond_lhs (last_stmt)); ++ ++ auto_vec stack; ++ stack.safe_push (gimple_cond_lhs (last_stmt)); ++ while (!stack.is_empty ()) ++ { ++ tree op = stack.pop (); ++ gimple *op_stmt = SSA_NAME_DEF_STMT (op); ++ if (!op_stmt || gimple_bb (op_stmt) != bb ++ || !is_gimple_assign (op_stmt) ++ || !ssa_is_replaceable_p (op_stmt)) ++ { ++ continue; ++ } ++ if (gimple_is_assign_copy_p (op_stmt)) ++ { ++ live_track_clear_var (live, gimple_assign_rhs1 (op_stmt)); ++ } ++ gimple_set_visited (op_stmt, true); ++ FOR_EACH_SSA_TREE_OPERAND (def, op_stmt, iter, SSA_OP_DEF) ++ { ++ live_track_process_def (live, def, conflict_graph); ++ } ++ FOR_EACH_SSA_TREE_OPERAND (use, op_stmt, iter, SSA_OP_USE) ++ { ++ stack.safe_push (use); ++ live_track_process_use (live, use); ++ } ++ } ++ ++ process_add_graph (live, bb, conflict_graph); ++ delete_live_track (live); ++ remove_high_cost_graph_for_ccmp (conflict_graph); ++} ++ ++/* Determine whether the ccmp conflict graph can be added. ++ i.e, ++ ++ ;; basic block 3, loop depth 1 ++ ;; pred: 2 ++ ;; 3 ++ # ivtmp.5_10 = PHI ++ _7 = b_4 (D) >= c_5 (D); ++ _8 = ivtmp.5_10 == 0; ++ _9 = _7 | _8; ++ ivtmp.5_11 = ivtmp.5_10 - 1; ++ if (_9 != 0) ++ goto ; [10.70%] ++ else ++ goto ; [89.30%] ++ ++ In the above loop, the expression will be replaced: ++ ++ _7 replaced by b_4 (D) >= c_5 (D) ++ _8 replaced by ivtmp.5_10 == 0 ++ ++ If the current case want use the ccmp instruction, then ++ ++ _9 can replaced by _7 | _8 ++ ++ So this requires that ivtmp.5_11 and ivtmp.5_10 be divided into different ++ partitions. ++ ++ Now this function can achieve this ability. */ ++ ++static void ++determine_add_ccmp_conflict_graph (basic_block bb, tree_live_info_p liveinfo, ++ var_map map, ssa_conflicts *graph) ++{ ++ if (!flag_ccmp2 || !targetm.gen_ccmp_first || !check_ccmp_candidate (bb)) ++ return; ++ for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); ++ gsi_next (&bsi)) ++ { ++ gimple_set_visited (gsi_stmt (bsi), false); ++ } ++ ssa_conflicts *ccmp_conflict_graph; ++ ccmp_conflict_graph = ssa_conflicts_new (num_var_partitions (map)); ++ add_ccmp_conflict_graph (ccmp_conflict_graph, liveinfo, map, bb); ++ unsigned x; ++ bitmap b; ++ if (ccmp_conflict_graph) ++ { ++ FOR_EACH_VEC_ELT (ccmp_conflict_graph->conflicts, x, b) ++ { ++ if (!b) ++ continue; ++ unsigned y = bitmap_first_set_bit (b); ++ if (!graph->conflicts[x] || !bitmap_bit_p (graph->conflicts[x], y)) ++ { ++ ssa_conflicts_add (graph, x, y); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "potential ccmp: add additional " ++ "conflict-ssa : bb[%d] %d:%d\n", ++ bb->index, x, y); ++ } ++ } ++ } ++ } ++ ssa_conflicts_delete (ccmp_conflict_graph); ++} + + /* Build a conflict graph based on LIVEINFO. Any partitions which are in the + partition view of the var_map liveinfo is based on get entries in the +@@ -938,6 +1133,8 @@ build_ssa_conflict_graph (tree_live_info_p liveinfo) + live_track_process_use (live, var); + } + ++ determine_add_ccmp_conflict_graph (bb, liveinfo, map, graph); ++ + /* If result of a PHI is unused, looping over the statements will not + record any conflicts since the def was never live. 
Since the PHI node + is going to be translated out of SSA form, it will insert a copy. +-- +2.33.0.windows.2 + diff --git a/0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch b/0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch new file mode 100644 index 0000000000000000000000000000000000000000..da607f3c8bdd101091de02b1d7e27fda4f6bbf6f --- /dev/null +++ b/0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch @@ -0,0 +1,1078 @@ +From 3c06a2cda7220a48866ae2dbe3f365e300cbaeca Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Wed, 1 Jun 2022 17:22:12 +0800 +Subject: [PATCH] [StructReorg] Refactoring reorder fields to struct layout + optimization + +Refactor the reorder_fields optimization into struct layout optimization. Add +flag -fipa-struct-reorg=[0,1,2] to enable none, strcut reorg, reorder fields +optimizations. + +diff --git a/gcc/common.opt b/gcc/common.opt +index 4dd566def..7fc075d35 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1876,13 +1876,17 @@ Common Ignore + Does nothing. Preserved for backward compatibility. + + fipa-reorder-fields +-Common Report Var(flag_ipa_reorder_fields) Init(0) Optimization ++Common Report Var(flag_ipa_struct_layout) Init(0) Optimization + Perform structure fields reorder optimizations. + + fipa-struct-reorg + Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + ++fipa-struct-reorg= ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 2) ++-fipa-struct-reorg=[0,1,2] adding none, struct-reorg, reorder-fields optimizations. ++ + fipa-extend-auto-profile + Common Report Var(flag_ipa_extend_auto_profile) + Use sample profile information for source code. +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 2bf41e0d8..9214ee74a 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -235,7 +235,7 @@ enum srmode + { + NORMAL = 0, + COMPLETE_STRUCT_RELAYOUT, +- STRUCT_REORDER_FIELDS ++ STRUCT_LAYOUT_OPTIMIZE + }; + + static bool is_result_of_mult (tree arg, tree *num, tree struct_size); +@@ -552,7 +552,7 @@ void + srtype::simple_dump (FILE *f) + { + print_generic_expr (f, type); +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + fprintf (f, "(%d)", TYPE_UID (type)); + } +@@ -593,9 +593,9 @@ srfield::create_new_fields (tree newtype[max_split], + tree newfields[max_split], + tree newlast[max_split]) + { +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { +- create_new_reorder_fields (newtype, newfields, newlast); ++ create_new_optimized_fields (newtype, newfields, newlast); + return; + } + +@@ -689,15 +689,15 @@ srfield::reorder_fields (tree newfields[max_split], tree newlast[max_split], + } + } + +-/* Create the new reorder fields for this field. ++/* Create the new optimized fields for this field. + newtype[max_split]: srtype's member variable, + newfields[max_split]: created by create_new_type func, + newlast[max_split]: created by create_new_type func. */ + + void +-srfield::create_new_reorder_fields (tree newtype[max_split], +- tree newfields[max_split], +- tree newlast[max_split]) ++srfield::create_new_optimized_fields (tree newtype[max_split], ++ tree newfields[max_split], ++ tree newlast[max_split]) + { + /* newtype, corresponding to newtype[max_split] in srtype. 
*/ + tree nt = NULL_TREE; +@@ -794,7 +794,7 @@ srtype::create_new_type (void) + we are not splitting the struct into two clusters, + then just return false and don't change the type. */ + if (!createnewtype && maxclusters == 0 +- && current_mode != STRUCT_REORDER_FIELDS) ++ && current_mode != STRUCT_LAYOUT_OPTIMIZE) + { + newtype[0] = type; + return false; +@@ -822,8 +822,8 @@ srtype::create_new_type (void) + sprintf(id, "%d", i); + if (tname) + { +- name = concat (tname, current_mode == STRUCT_REORDER_FIELDS +- ? ".reorder." : ".reorg.", id, NULL); ++ name = concat (tname, current_mode == STRUCT_LAYOUT_OPTIMIZE ++ ? ".slo." : ".reorg.", id, NULL); + TYPE_NAME (newtype[i]) = build_decl (UNKNOWN_LOCATION, TYPE_DECL, + get_identifier (name), newtype[i]); + free (name); +@@ -969,8 +969,8 @@ srfunction::create_new_decls (void) + sprintf(id, "%d", j); + if (tname) + { +- name = concat (tname, current_mode == STRUCT_REORDER_FIELDS +- ? ".reorder." : ".reorg.", id, NULL); ++ name = concat (tname, current_mode == STRUCT_LAYOUT_OPTIMIZE ++ ? ".slo." : ".reorg.", id, NULL); + new_name = get_identifier (name); + free (name); + } +@@ -2718,7 +2718,7 @@ escape_type escape_type_volatile_array_or_ptrptr (tree type) + return escape_volatile; + if (isarraytype (type)) + return escape_array; +- if (isptrptr (type) && (current_mode != STRUCT_REORDER_FIELDS)) ++ if (isptrptr (type) && (current_mode != STRUCT_LAYOUT_OPTIMIZE)) + return escape_ptr_ptr; + return does_not_escape; + } +@@ -2740,13 +2740,13 @@ ipa_struct_reorg::record_field_type (tree field, srtype *base_srtype) + field_srtype->add_field_site (field_srfield); + } + if (field_srtype == base_srtype && current_mode != COMPLETE_STRUCT_RELAYOUT +- && current_mode != STRUCT_REORDER_FIELDS) ++ && current_mode != STRUCT_LAYOUT_OPTIMIZE) + { + base_srtype->mark_escape (escape_rescusive_type, NULL); + } + /* Types of non-pointer field are difficult to track the correctness + of the rewrite when it used by the escaped type. */ +- if (current_mode == STRUCT_REORDER_FIELDS ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && TREE_CODE (field_type) == RECORD_TYPE) + { + field_srtype->mark_escape (escape_instance_field, NULL); +@@ -2781,7 +2781,7 @@ ipa_struct_reorg::record_struct_field_types (tree base_type, + } + /* Types of non-pointer field are difficult to track the correctness + of the rewrite when it used by the escaped type. */ +- if (current_mode == STRUCT_REORDER_FIELDS ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && TREE_CODE (field_type) == RECORD_TYPE) + { + base_srtype->mark_escape (escape_instance_field, NULL); +@@ -2966,7 +2966,7 @@ ipa_struct_reorg::record_var (tree decl, escape_type escapes, int arg) + /* Separate instance is hard to trace in complete struct + relayout optimization. */ + if ((current_mode == COMPLETE_STRUCT_RELAYOUT +- || current_mode == STRUCT_REORDER_FIELDS) ++ || current_mode == STRUCT_LAYOUT_OPTIMIZE) + && TREE_CODE (TREE_TYPE (decl)) == RECORD_TYPE) + { + e = escape_separate_instance; +@@ -3071,7 +3071,7 @@ ipa_struct_reorg::find_vars (gimple *stmt) + /* Add a safe func mechanism. 
*/ + bool l_find = true; + bool r_find = true; +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + l_find = !(current_function->is_safe_func + && TREE_CODE (lhs) == SSA_NAME +@@ -3117,7 +3117,7 @@ ipa_struct_reorg::find_vars (gimple *stmt) + } + } + } +- else if ((current_mode == STRUCT_REORDER_FIELDS) ++ else if ((current_mode == STRUCT_LAYOUT_OPTIMIZE) + && (gimple_assign_rhs_code (stmt) == LE_EXPR + || gimple_assign_rhs_code (stmt) == LT_EXPR + || gimple_assign_rhs_code (stmt) == GE_EXPR +@@ -3128,7 +3128,7 @@ ipa_struct_reorg::find_vars (gimple *stmt) + find_var (gimple_assign_rhs2 (stmt), stmt); + } + /* find void ssa_name from stmt such as: _2 = _1 - old_arcs_1. */ +- else if ((current_mode == STRUCT_REORDER_FIELDS) ++ else if ((current_mode == STRUCT_LAYOUT_OPTIMIZE) + && gimple_assign_rhs_code (stmt) == POINTER_DIFF_EXPR + && types_compatible_p ( + TYPE_MAIN_VARIANT (TREE_TYPE (gimple_assign_rhs1 (stmt))), +@@ -3391,11 +3391,12 @@ is_result_of_mult (tree arg, tree *num, tree struct_size) + arg = gimple_assign_rhs1 (size_def_stmt); + size_def_stmt = SSA_NAME_DEF_STMT (arg); + } +- else if (rhs_code == NEGATE_EXPR && current_mode == STRUCT_REORDER_FIELDS) ++ else if (rhs_code == NEGATE_EXPR ++ && current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + return trace_calculate_negate (size_def_stmt, num, struct_size); + } +- else if (rhs_code == NOP_EXPR && current_mode == STRUCT_REORDER_FIELDS) ++ else if (rhs_code == NOP_EXPR && current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + return trace_calculate_diff (size_def_stmt, num); + } +@@ -3415,7 +3416,7 @@ is_result_of_mult (tree arg, tree *num, tree struct_size) + bool + ipa_struct_reorg::handled_allocation_stmt (gimple *stmt) + { +- if ((current_mode == STRUCT_REORDER_FIELDS) ++ if ((current_mode == STRUCT_LAYOUT_OPTIMIZE) + && (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC) + || gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) + || gimple_call_builtin_p (stmt, BUILT_IN_CALLOC))) +@@ -3548,7 +3549,7 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple + /* x_1 = y.x_nodes; void *x; + Directly mark the structure pointer type assigned + to the void* variable as escape. */ +- else if (current_mode == STRUCT_REORDER_FIELDS ++ else if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && TREE_CODE (side) == SSA_NAME + && VOID_POINTER_P (TREE_TYPE (side)) + && SSA_NAME_VAR (side) +@@ -3815,7 +3816,7 @@ ipa_struct_reorg::get_type_field (tree expr, tree &base, bool &indirect, + and doesn't mark escape follow.). */ + /* _1 = MEM[(struct arc_t * *)a_1]. + then base a_1: ssa_name - pointer_type - integer_type. */ +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + bool is_int_ptr = POINTER_TYPE_P (TREE_TYPE (base)) + && (TREE_CODE (inner_type (TREE_TYPE (base))) +@@ -4031,7 +4032,7 @@ ipa_struct_reorg::maybe_record_call (cgraph_node *node, gcall *stmt) + /* callee_func (_1, _2); + Check the callee func, instead of current func. 
*/ + if (!(free_or_realloc +- || (current_mode == STRUCT_REORDER_FIELDS ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE + && safe_functions.contains ( + node->get_edge (stmt)->callee))) + && VOID_POINTER_P (argtypet)) +@@ -4063,9 +4064,9 @@ ipa_struct_reorg::record_stmt_expr (tree expr, cgraph_node *node, gimple *stmt) + realpart, imagpart, address, escape_from_base)) + return; + +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { +- if (!opt_for_fn (current_function_decl, flag_ipa_reorder_fields)) ++ if (!opt_for_fn (current_function_decl, flag_ipa_struct_layout)) + { + type->mark_escape (escape_non_optimize, stmt); + } +@@ -4287,7 +4288,7 @@ ipa_struct_reorg::check_definition_call (srdecl *decl, vec &worklist) + check_type_and_push (gimple_call_arg (stmt, 0), decl, worklist, stmt); + } + +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + if (!handled_allocation_stmt (stmt)) + { +@@ -4341,7 +4342,7 @@ ipa_struct_reorg::check_definition (srdecl *decl, vec &worklist) + } + return; + } +- if (current_mode == STRUCT_REORDER_FIELDS && SSA_NAME_VAR (ssa_name) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE && SSA_NAME_VAR (ssa_name) + && VOID_POINTER_P (TREE_TYPE (SSA_NAME_VAR (ssa_name)))) + { + type->mark_escape (escape_cast_void, SSA_NAME_DEF_STMT (ssa_name)); +@@ -4442,7 +4443,7 @@ ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec< + if (!get_type_field (other, base, indirect, type1, field, + realpart, imagpart, address, escape_from_base)) + { +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + /* release INTEGER_TYPE cast to struct pointer. */ + bool cast_from_int_ptr = current_function->is_safe_func && base +@@ -4498,7 +4499,7 @@ get_base (tree &base, tree expr) + void + ipa_struct_reorg::check_ptr_layers (tree a_expr, tree b_expr, gimple* stmt) + { +- if (current_mode != STRUCT_REORDER_FIELDS || current_function->is_safe_func ++ if (current_mode != STRUCT_LAYOUT_OPTIMIZE || current_function->is_safe_func + || !(POINTER_TYPE_P (TREE_TYPE (a_expr))) + || !(POINTER_TYPE_P (TREE_TYPE (b_expr))) + || !handled_type (TREE_TYPE (a_expr)) +@@ -4579,7 +4580,7 @@ ipa_struct_reorg::check_use (srdecl *decl, gimple *stmt, vec &worklist) + && (code != EQ_EXPR && code != NE_EXPR + && code != LT_EXPR && code != LE_EXPR + && code != GT_EXPR && code != GE_EXPR)) +- || (current_mode == STRUCT_REORDER_FIELDS ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE + && (code != EQ_EXPR && code != NE_EXPR + && code != LT_EXPR && code != LE_EXPR + && code != GT_EXPR && code != GE_EXPR))) +@@ -4618,7 +4619,7 @@ ipa_struct_reorg::check_use (srdecl *decl, gimple *stmt, vec &worklist) + && (code != EQ_EXPR && code != NE_EXPR + && code != LT_EXPR && code != LE_EXPR + && code != GT_EXPR && code != GE_EXPR)) +- || (current_mode == STRUCT_REORDER_FIELDS ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE + && (code != EQ_EXPR && code != NE_EXPR + && code != LT_EXPR && code != LE_EXPR + && code != GT_EXPR && code != GE_EXPR))) +@@ -4740,11 +4741,11 @@ ipa_struct_reorg::record_function (cgraph_node *node) + escapes = escape_marked_as_used; + else if (!node->local) + { +- if (current_mode != STRUCT_REORDER_FIELDS) ++ if (current_mode != STRUCT_LAYOUT_OPTIMIZE) + { + escapes = escape_visible_function; + } +- if (current_mode == STRUCT_REORDER_FIELDS && node->externally_visible) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE && node->externally_visible) + { + escapes = 
escape_visible_function; + } +@@ -4754,9 +4755,9 @@ ipa_struct_reorg::record_function (cgraph_node *node) + else if (!tree_versionable_function_p (node->decl)) + escapes = escape_noclonable_function; + +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { +- if (!opt_for_fn (node->decl, flag_ipa_reorder_fields)) ++ if (!opt_for_fn (node->decl, flag_ipa_struct_layout)) + { + escapes = escape_non_optimize; + } +@@ -4773,7 +4774,7 @@ ipa_struct_reorg::record_function (cgraph_node *node) + gimple_stmt_iterator si; + + /* Add a safe func mechanism. */ +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + current_function->is_safe_func = safe_functions.contains (node); + if (dump_file) +@@ -4989,7 +4990,7 @@ ipa_struct_reorg::record_accesses (void) + } + + /* Add a safe func mechanism. */ +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + record_safe_func_with_void_ptr_parm (); + } +@@ -5188,7 +5189,7 @@ void + ipa_struct_reorg::prune_escaped_types (void) + { + if (current_mode != COMPLETE_STRUCT_RELAYOUT +- && current_mode != STRUCT_REORDER_FIELDS) ++ && current_mode != STRUCT_LAYOUT_OPTIMIZE) + { + /* Detect recusive types and mark them as escaping. */ + detect_cycles (); +@@ -5196,7 +5197,7 @@ ipa_struct_reorg::prune_escaped_types (void) + mark them as escaping. */ + propagate_escape (); + } +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + propagate_escape_via_original (); + propagate_escape_via_empty_with_no_original (); +@@ -5256,7 +5257,7 @@ ipa_struct_reorg::prune_escaped_types (void) + if (function->args.is_empty () + && function->decls.is_empty () + && function->globals.is_empty () +- && current_mode != STRUCT_REORDER_FIELDS) ++ && current_mode != STRUCT_LAYOUT_OPTIMIZE) + { + delete function; + functions.ordered_remove (i); +@@ -5281,10 +5282,10 @@ ipa_struct_reorg::prune_escaped_types (void) + + /* Prune types that escape, all references to those types + will have been removed in the above loops. */ +- /* The escape type is not deleted in STRUCT_REORDER_FIELDS, ++ /* The escape type is not deleted in STRUCT_LAYOUT_OPTIMIZE, + Then the type that contains the escaped type fields + can find complete information. */ +- if (current_mode != STRUCT_REORDER_FIELDS) ++ if (current_mode != STRUCT_LAYOUT_OPTIMIZE) + { + for (unsigned i = 0; i < types.length ();) + { +@@ -5334,7 +5335,7 @@ ipa_struct_reorg::create_new_types (void) + for (unsigned i = 0; i < types.length (); i++) + newtypes += types[i]->create_new_type (); + +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + for (unsigned i = 0; i < types.length (); i++) + { +@@ -5458,8 +5459,8 @@ ipa_struct_reorg::create_new_args (cgraph_node *new_node) + char *name = NULL; + if (tname) + { +- name = concat (tname, current_mode == STRUCT_REORDER_FIELDS +- ? ".reorder.0" : ".reorg.0", NULL); ++ name = concat (tname, current_mode == STRUCT_LAYOUT_OPTIMIZE ++ ? ".slo.0" : ".reorg.0", NULL); + new_name = get_identifier (name); + free (name); + } +@@ -5547,8 +5548,8 @@ ipa_struct_reorg::create_new_functions (void) + statistics_counter_event (NULL, "Create new function", 1); + new_node = node->create_version_clone_with_body ( + vNULL, NULL, NULL, NULL, NULL, +- current_mode == STRUCT_REORDER_FIELDS +- ? "struct_reorder" : "struct_reorg"); ++ current_mode == STRUCT_LAYOUT_OPTIMIZE ++ ? 
"slo" : "struct_reorg"); + new_node->can_change_signature = node->can_change_signature; + new_node->make_local (); + f->newnode = new_node; +@@ -5666,13 +5667,13 @@ ipa_struct_reorg::rewrite_expr (tree expr, tree newexpr[max_split], bool ignore_ + newbase1 = build_fold_addr_expr (newbase1); + if (indirect) + { +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + /* Supports the MEM_REF offset. + _1 = MEM[(struct arc *)ap_1 + 72B].flow; +- Old rewrite: _1 = ap.reorder.0_8->flow; ++ Old rewrite: _1 = ap.slo.0_8->flow; + New rewrite: _1 +- = MEM[(struct arc.reorder.0 *)ap.reorder.0_8 + 64B].flow; ++ = MEM[(struct arc.slo.0 *)ap.slo.0_8 + 64B].flow; + */ + HOST_WIDE_INT offset_tmp = 0; + HOST_WIDE_INT mem_offset = 0; +@@ -5738,10 +5739,10 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + return remove; + } + +- if ((current_mode != STRUCT_REORDER_FIELDS ++ if ((current_mode != STRUCT_LAYOUT_OPTIMIZE + && (gimple_assign_rhs_code (stmt) == EQ_EXPR + || gimple_assign_rhs_code (stmt) == NE_EXPR)) +- || (current_mode == STRUCT_REORDER_FIELDS ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE + && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) + == tcc_comparison))) + { +@@ -5751,7 +5752,7 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + tree newrhs2[max_split]; + tree_code rhs_code = gimple_assign_rhs_code (stmt); + tree_code code = rhs_code == EQ_EXPR ? BIT_AND_EXPR : BIT_IOR_EXPR; +- if (current_mode == STRUCT_REORDER_FIELDS ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && rhs_code != EQ_EXPR && rhs_code != NE_EXPR) + { + code = rhs_code; +@@ -5798,8 +5799,8 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + _6 = _4 + _5; + _5 = (long unsigned int) _3; + _3 = _1 - old_2. */ +- if (current_mode != STRUCT_REORDER_FIELDS +- || (current_mode == STRUCT_REORDER_FIELDS && (num != NULL))) ++ if (current_mode != STRUCT_LAYOUT_OPTIMIZE ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE && (num != NULL))) + { + num = gimplify_build1 (gsi, NOP_EXPR, sizetype, num); + } +@@ -5827,7 +5828,7 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + } + + /* Support POINTER_DIFF_EXPR rewriting. */ +- if (current_mode == STRUCT_REORDER_FIELDS ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && gimple_assign_rhs_code (stmt) == POINTER_DIFF_EXPR) + { + tree rhs1 = gimple_assign_rhs1 (stmt); +@@ -6014,7 +6015,7 @@ ipa_struct_reorg::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi) + srfunction *f = find_function (node); + + /* Add a safe func mechanism. */ +- if (current_mode == STRUCT_REORDER_FIELDS && f && f->is_safe_func) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE && f && f->is_safe_func) + { + tree expr = gimple_call_arg (stmt, 0); + tree newexpr[max_split]; +@@ -6141,9 +6142,9 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi) + tree_code rhs_code = gimple_cond_code (stmt); + + /* Handle only equals or not equals conditionals. 
*/ +- if ((current_mode != STRUCT_REORDER_FIELDS ++ if ((current_mode != STRUCT_LAYOUT_OPTIMIZE + && (rhs_code != EQ_EXPR && rhs_code != NE_EXPR)) +- || (current_mode == STRUCT_REORDER_FIELDS ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE + && TREE_CODE_CLASS (rhs_code) != tcc_comparison)) + return false; + tree lhs = gimple_cond_lhs (stmt); +@@ -6171,10 +6172,10 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi) + } + + /* Old rewrite: if (x_1 != 0B) +- -> _1 = x.reorder.0_1 != 0B; if (_1 != 1) ++ -> _1 = x.slo.0_1 != 0B; if (_1 != 1) + The logic is incorrect. + New rewrite: if (x_1 != 0B) +- -> if (x.reorder.0_1 != 0B);*/ ++ -> if (x.slo.0_1 != 0B);*/ + for (unsigned i = 0; i < max_split && (newlhs[i] || newrhs[i]); i++) + { + if (newlhs[i]) +@@ -6203,7 +6204,7 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi) + bool + ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *) + { +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + /* Delete debug gimple now. */ + return true; +@@ -6367,7 +6368,7 @@ ipa_struct_reorg::rewrite_functions (void) + then don't rewrite any accesses. */ + if (!create_new_types ()) + { +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + for (unsigned i = 0; i < functions.length (); i++) + { +@@ -6386,7 +6387,7 @@ ipa_struct_reorg::rewrite_functions (void) + return 0; + } + +- if (current_mode == STRUCT_REORDER_FIELDS && dump_file) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE && dump_file) + { + fprintf (dump_file, "=========== all created newtypes: ===========\n\n"); + dump_newtypes (dump_file); +@@ -6396,13 +6397,13 @@ ipa_struct_reorg::rewrite_functions (void) + { + retval = TODO_remove_functions; + create_new_functions (); +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + prune_escaped_types (); + } + } + +- if (current_mode == STRUCT_REORDER_FIELDS) ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + for (unsigned i = 0; i < functions.length (); i++) + { +@@ -6572,7 +6573,7 @@ ipa_struct_reorg::execute (enum srmode mode) + { + unsigned int ret = 0; + +- if (mode == NORMAL || mode == STRUCT_REORDER_FIELDS) ++ if (mode == NORMAL || mode == STRUCT_LAYOUT_OPTIMIZE) + { + current_mode = mode; + /* If there is a top-level inline-asm, +@@ -6660,12 +6661,12 @@ pass_ipa_struct_reorg::gate (function *) + && (in_lto_p || flag_whole_program)); + } + +-const pass_data pass_data_ipa_reorder_fields = ++const pass_data pass_data_ipa_struct_layout = + { + SIMPLE_IPA_PASS, // type +- "reorder_fields", // name ++ "struct_layout", // name + OPTGROUP_NONE, // optinfo_flags +- TV_IPA_REORDER_FIELDS, // tv_id ++ TV_IPA_STRUCT_LAYOUT, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed +@@ -6673,11 +6674,11 @@ const pass_data pass_data_ipa_reorder_fields = + 0, // todo_flags_finish + }; + +-class pass_ipa_reorder_fields : public simple_ipa_opt_pass ++class pass_ipa_struct_layout : public simple_ipa_opt_pass + { + public: +- pass_ipa_reorder_fields (gcc::context *ctxt) +- : simple_ipa_opt_pass (pass_data_ipa_reorder_fields, ctxt) ++ pass_ipa_struct_layout (gcc::context *ctxt) ++ : simple_ipa_opt_pass (pass_data_ipa_struct_layout, ctxt) + {} + + /* opt_pass methods: */ +@@ -6685,17 +6686,17 @@ public: + virtual unsigned int execute (function *) + { + unsigned int ret = 0; +- ret = ipa_struct_reorg ().execute (STRUCT_REORDER_FIELDS); ++ ret = 
ipa_struct_reorg ().execute (STRUCT_LAYOUT_OPTIMIZE); + return ret; + } + +-}; // class pass_ipa_reorder_fields ++}; // class pass_ipa_struct_layout + + bool +-pass_ipa_reorder_fields::gate (function *) ++pass_ipa_struct_layout::gate (function *) + { + return (optimize >= 3 +- && flag_ipa_reorder_fields ++ && flag_ipa_struct_layout + /* Don't bother doing anything if the program has errors. */ + && !seen_error () + && flag_lto_partition == LTO_PARTITION_ONE +@@ -6715,7 +6716,7 @@ make_pass_ipa_struct_reorg (gcc::context *ctxt) + } + + simple_ipa_opt_pass * +-make_pass_ipa_reorder_fields (gcc::context *ctxt) ++make_pass_ipa_struct_layout (gcc::context *ctxt) + { +- return new pass_ipa_reorder_fields (ctxt); ++ return new pass_ipa_struct_layout (ctxt); + } +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +index 8fb6ce9c4..54b0dc655 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +@@ -187,7 +187,7 @@ struct srfield + tree newlast[max_split]); + void reorder_fields (tree newfields[max_split], tree newlast[max_split], + tree &field); +- void create_new_reorder_fields (tree newtype[max_split], ++ void create_new_optimized_fields (tree newtype[max_split], + tree newfields[max_split], + tree newlast[max_split]); + }; +diff --git a/gcc/opts.c b/gcc/opts.c +index 479d726df..c3877c24e 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -2695,6 +2695,18 @@ common_handle_option (struct gcc_options *opts, + } + break; + ++ case OPT_fipa_struct_reorg_: ++ opts->x_struct_layout_optimize_level = value; ++ if (value > 1) ++ { ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_layout, value); ++ } ++ /* No break here - do -fipa-struct-reorg processing. */ ++ /* FALLTHRU. */ ++ case OPT_fipa_struct_reorg: ++ opts->x_flag_ipa_struct_reorg = value; ++ break; ++ + case OPT_fprofile_generate_: + opts->x_profile_data_prefix = xstrdup (arg); + value = true; +diff --git a/gcc/passes.def b/gcc/passes.def +index e9c91d26e..eea4d7808 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -174,7 +174,7 @@ along with GCC; see the file COPYING3. If not see + INSERT_PASSES_AFTER (all_late_ipa_passes) + NEXT_PASS (pass_materialize_all_clones); + NEXT_PASS (pass_ipa_pta); +- NEXT_PASS (pass_ipa_reorder_fields); ++ NEXT_PASS (pass_ipa_struct_layout); + /* FIXME: this should a normal IP pass */ + NEXT_PASS (pass_ipa_struct_reorg); + NEXT_PASS (pass_omp_simd_clone); +diff --git a/gcc/symbol-summary.h b/gcc/symbol-summary.h +index ddf5e3577..f62222a96 100644 +--- a/gcc/symbol-summary.h ++++ b/gcc/symbol-summary.h +@@ -61,7 +61,7 @@ protected: + { + /* In structure optimizatons, we call new to ensure that + the allocated memory is initialized to 0. */ +- if (flag_ipa_reorder_fields || flag_ipa_struct_reorg) ++ if (flag_ipa_struct_layout || flag_ipa_struct_reorg) + return is_ggc () ? 
new (ggc_internal_alloc (sizeof (T))) T () + : new T (); + /* Call gcc_internal_because we do not want to call finalizer for +@@ -77,7 +77,7 @@ protected: + ggc_delete (item); + else + { +- if (flag_ipa_reorder_fields || flag_ipa_struct_reorg) ++ if (flag_ipa_struct_layout || flag_ipa_struct_reorg) + delete item; + else + m_allocator.remove (item); +diff --git a/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c b/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c +index b95be2dab..882a695b0 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_DTE_struct_instance_field.c +@@ -72,4 +72,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c b/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c +index 3d243313b..20ecee545 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_DTE_verify.c +@@ -91,4 +91,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c b/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c +index a5477dcc9..ad879fc11 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_check_ptr_layers_bug.c +@@ -21,4 +21,4 @@ main() + { + g(); + } +-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "No structures to transform." 
"struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c b/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c +index 886706ae9..f0c9d8f39 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_create_fields_bug.c +@@ -79,4 +79,4 @@ main() + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c b/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c +index f3785f392..fa5e6c2d0 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_create_new_func_bug.c +@@ -53,4 +53,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c b/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c +index 1415d759a..2966869e7 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ele_minus_verify.c +@@ -57,4 +57,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c b/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c +index 003da0b57..b74b9e5e9 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_escape_by_base.c +@@ -80,4 +80,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c b/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c +index 10dcf098c..cf85c6109 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_int_cast_ptr.c +@@ -69,4 +69,4 @@ main() + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c b/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c +index 8d1a9a114..61fd9f755 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_mem_ref_offset.c +@@ -55,4 +55,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c b/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c +index 
23765fc56..2c115da02 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_mul_layer_ptr_record_bug.c +@@ -27,4 +27,4 @@ main() { + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c b/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c +index 54e737ee8..c7646d8b7 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_pass_conflict.c +@@ -106,4 +106,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c b/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c +index 2ae46fb31..01c000375 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr2void_lto.c +@@ -84,4 +84,4 @@ main () + return cnt; + } + +-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c +index 3a3c10b70..f962163fe 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_diff.c +@@ -68,4 +68,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c +index 7b7d110df..6558b1797 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_negate_expr.c +@@ -52,4 +52,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c +index 317aafa5f..6d528ed5b 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_offset.c +@@ -31,4 +31,4 @@ main () + printf (" Tree.\n"); + } + +-/* { dg-final { scan-ipa-dump "No structures to transform." "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "No structures to transform." 
"struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c +index 01a33f669..e95cf2e5d 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr.c +@@ -52,4 +52,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c +index a38556533..cb4054522 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_ptr_ptr_ptr.c +@@ -55,4 +55,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c b/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c +index 5c17ee528..38bddbae5 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_rescusive_type.c +@@ -54,4 +54,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c +index 710517ee9..86034f042 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_assign_more_cmp.c +@@ -62,4 +62,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c +index 6ed0a5d2d..aae7c4bc9 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_bug.c +@@ -69,4 +69,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c +index 5a2dd964f..8672e7552 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_cond_more_cmp.c +@@ -55,4 +55,4 @@ main() + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c b/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c +index faa90b42d..2d67434a0 100644 +--- 
a/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_rewrite_phi_bug.c +@@ -78,4 +78,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 3" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_visible_func.c b/gcc/testsuite/gcc.dg/struct/rf_visible_func.c +index 8f2da99cc..a8cf2b63c 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_visible_func.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_visible_func.c +@@ -89,4 +89,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c b/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c +index 723142c59..b6cba3c34 100644 +--- a/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c ++++ b/gcc/testsuite/gcc.dg/struct/rf_void_ptr_param_func.c +@@ -51,4 +51,4 @@ main() + return 0; + } + +-/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "reorder_fields" } } */ +\ No newline at end of file ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c b/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c +index 9a82da0d6..a0614a1ba 100644 +--- a/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c ++++ b/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c +@@ -30,4 +30,4 @@ main () + return 0; + } + +-/* { dg-final { scan-ipa-dump "struct node has escaped: \"Type escapes via a unhandled rewrite stmt\"" "struct_reorg" } } */ ++/* { dg-final { scan-ipa-dump "has escaped: \"Type escapes via a unhandled rewrite stmt\"" "struct_reorg" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +index c8db4675f..67b3ac2d5 100644 +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +@@ -47,6 +47,25 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout + gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/rf*.c]] \ + "" "-fipa-reorder-fields -fdump-ipa-all -flto-partition=one -fwhole-program" + ++# -fipa-struct-reorg=1 ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wo_prof_*.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_ratio_*.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_prof_*.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/struct_reorg*.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/sr_*.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/csr_*.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain 
$srcdir/$subdir/complete_struct_relayout.c]] \ ++ "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" ++ ++# -fipa-struct-reorg=2 ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" + # All done. + torture-finish + dg-finish +diff --git a/gcc/timevar.def b/gcc/timevar.def +index e873747a8..b179f62bb 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -80,7 +80,7 @@ DEFTIMEVAR (TV_IPA_CONSTANT_PROP , "ipa cp") + DEFTIMEVAR (TV_IPA_INLINING , "ipa inlining heuristics") + DEFTIMEVAR (TV_IPA_FNSPLIT , "ipa function splitting") + DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") +-DEFTIMEVAR (TV_IPA_REORDER_FIELDS , "ipa struct reorder fields optimization") ++DEFTIMEVAR (TV_IPA_STRUCT_LAYOUT , "ipa struct layout optimization") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") + DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index be6387768..187f1a85c 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -509,7 +509,7 @@ extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt); +-extern simple_ipa_opt_pass *make_pass_ipa_reorder_fields (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_struct_layout (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context + *ctxt); +diff --git a/gcc/tree.c b/gcc/tree.c +index 89fa469c3..c2075d735 100644 +--- a/gcc/tree.c ++++ b/gcc/tree.c +@@ -5219,7 +5219,7 @@ fld_simplified_type_name (tree type) + /* Simplify type will cause that struct A and struct A within + struct B are different type pointers, so skip it in structure + optimizations. */ +- if (flag_ipa_reorder_fields || flag_ipa_struct_reorg) ++ if (flag_ipa_struct_layout || flag_ipa_struct_reorg) + return TYPE_NAME (type); + + if (!TYPE_NAME (type) || TREE_CODE (TYPE_NAME (type)) != TYPE_DECL) +@@ -5463,7 +5463,7 @@ fld_simplified_type (tree t, class free_lang_data_d *fld) + /* Simplify type will cause that struct A and struct A within + struct B are different type pointers, so skip it in structure + optimizations. 
*/ +- if (flag_ipa_reorder_fields || flag_ipa_struct_reorg) ++ if (flag_ipa_struct_layout || flag_ipa_struct_reorg) + return t; + if (POINTER_TYPE_P (t)) + return fld_incomplete_type_of (t, fld); +-- +2.33.0.windows.2 + diff --git a/0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch b/0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch new file mode 100644 index 0000000000000000000000000000000000000000..df2bb5b6815a0aef5618c31430e4daa7c2f7550a --- /dev/null +++ b/0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch @@ -0,0 +1,78 @@ +From 897d637aec3b077eb9ef95b2f4a5f7656e36ebd6 Mon Sep 17 00:00:00 2001 +From: benniaobufeijiushiji +Date: Wed, 15 Jun 2022 11:33:03 +0800 +Subject: [PATCH] [Backport] loop-invariant: Don't move cold bb instructions to + preheader in RTL + +Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=dc1969dab392661cdac1170bbb8c9f83f388580d + +When inner loop is unlikely to execute, loop invariant motion would move +cold instrcutions to a hotter loop. This patch adds profile count checking +to fix the problem. + +diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c +index 37ae6549e..24b9bcb11 100644 +--- a/gcc/loop-invariant.c ++++ b/gcc/loop-invariant.c +@@ -1184,9 +1184,21 @@ find_invariants_insn (rtx_insn *insn, bool always_reached, bool always_executed) + call. */ + + static void +-find_invariants_bb (basic_block bb, bool always_reached, bool always_executed) ++find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, ++ bool always_executed) + { + rtx_insn *insn; ++ basic_block preheader = loop_preheader_edge (loop)->src; ++ ++ /* Don't move insn of cold BB out of loop to preheader to reduce calculations ++ and register live range in hot loop with cold BB. */ ++ if (!always_executed && preheader->count > bb->count) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Don't move invariant from bb: %d out of loop %d\n", ++ bb->index, loop->num); ++ return; ++ } + + FOR_BB_INSNS (bb, insn) + { +@@ -1215,8 +1227,7 @@ find_invariants_body (class loop *loop, basic_block *body, + unsigned i; + + for (i = 0; i < loop->num_nodes; i++) +- find_invariants_bb (body[i], +- bitmap_bit_p (always_reached, i), ++ find_invariants_bb (loop, body[i], bitmap_bit_p (always_reached, i), + bitmap_bit_p (always_executed, i)); + } + +diff --git a/gcc/testsuite/gcc.dg/loop-invariant-2.c b/gcc/testsuite/gcc.dg/loop-invariant-2.c +new file mode 100644 +index 000000000..df3d84585 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/loop-invariant-2.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-rtl-loop2_invariant" } */ ++ ++volatile int x; ++void ++bar (int, char *, char *); ++void ++foo (int *a, int n, int k) ++{ ++ int i; ++ ++ for (i = 0; i < n; i++) ++ { ++ if (__builtin_expect (x, 0)) ++ bar (k / 5, "one", "two"); ++ a[i] = k; ++ } ++} ++ ++/* { dg-final { scan-rtl-dump "Don't move invariant from bb: .*out of loop" "loop2_invariant" } } */ +-- +2.33.0.windows.2 + diff --git a/0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch b/0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch new file mode 100644 index 0000000000000000000000000000000000000000..dd0892c227062a4275dc9e1189b30fdead09ce3c --- /dev/null +++ b/0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch @@ -0,0 +1,882 @@ +From edd4200e2b3e94d5c124900657b91c22dfe9c557 Mon Sep 17 00:00:00 2001 +From: Mingchuan Wu +Date: Wed, 15 Jun 2022 16:00:25 +0800 +Subject: [PATCH] [DFE] Add Dead Field Elimination in Struct-Reorg. 
+ +We can transform gimple to eliminate fields that are never read +and remove their redundant stmts. +Also we adapted the partial escape_cast_another_ptr for struct relayout. +Add flag -fipa-struct-reorg=3 to enable dead field elimination. + +diff --git a/gcc/common.opt b/gcc/common.opt +index 7fc075d35..b5ea3c7a1 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1884,8 +1884,8 @@ Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 2) +--fipa-struct-reorg=[0,1,2] adding none, struct-reorg, reorder-fields optimizations. ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 3) ++-fipa-struct-reorg=[0,1,2,3] adding none, struct-reorg, reorder-fields, dfe optimizations. + + fipa-extend-auto-profile + Common Report Var(flag_ipa_extend_auto_profile) +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 9214ee74a..2fa560239 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see + #include "tree-pretty-print.h" + #include "gimple-pretty-print.h" + #include "gimple-iterator.h" ++#include "gimple-walk.h" + #include "cfg.h" + #include "ssa.h" + #include "tree-dfa.h" +@@ -238,11 +239,44 @@ enum srmode + STRUCT_LAYOUT_OPTIMIZE + }; + ++/* Enum the struct layout optimize level, ++ which should be the same as the option -fstruct-reorg=. */ ++ ++enum struct_layout_opt_level ++{ ++ NONE = 0, ++ STRUCT_REORG, ++ STRUCT_REORDER_FIELDS, ++ DEAD_FIELD_ELIMINATION ++}; ++ + static bool is_result_of_mult (tree arg, tree *num, tree struct_size); + bool isptrptr (tree type); + + srmode current_mode; + ++hash_map replace_type_map; ++ ++/* Return true if one of these types is created by struct-reorg. */ ++ ++static bool ++is_replace_type (tree type1, tree type2) ++{ ++ if (replace_type_map.is_empty ()) ++ return false; ++ if (type1 == NULL_TREE || type2 == NULL_TREE) ++ return false; ++ tree *type_value = replace_type_map.get (type1); ++ if (type_value) ++ if (types_compatible_p (*type_value, type2)) ++ return true; ++ type_value = replace_type_map.get (type2); ++ if (type_value) ++ if (types_compatible_p (*type_value, type1)) ++ return true; ++ return false; ++} ++ + } // anon namespace + + namespace struct_reorg { +@@ -318,12 +352,13 @@ srfunction::simple_dump (FILE *file) + /* Constructor of FIELD. */ + + srfield::srfield (tree field, srtype *base) +- : offset(int_byte_position (field)), ++ : offset (int_byte_position (field)), + fieldtype (TREE_TYPE (field)), + fielddecl (field), +- base(base), +- type(NULL), +- clusternum(0) ++ base (base), ++ type (NULL), ++ clusternum (0), ++ field_access (EMPTY_FIELD) + { + for(int i = 0;i < max_split; i++) + newfield[i] = NULL_TREE; +@@ -362,6 +397,25 @@ srtype::srtype (tree type) + } + } + ++/* Check it if all fields in the RECORD_TYPE are referenced. */ ++ ++bool ++srtype::has_dead_field (void) ++{ ++ bool may_dfe = false; ++ srfield *this_field; ++ unsigned i; ++ FOR_EACH_VEC_ELT (fields, i, this_field) ++ { ++ if (!(this_field->field_access & READ_FIELD)) ++ { ++ may_dfe = true; ++ break; ++ } ++ } ++ return may_dfe; ++} ++ + /* Mark the type as escaping type E at statement STMT. 
*/ + + void +@@ -833,6 +887,10 @@ srtype::create_new_type (void) + for (unsigned i = 0; i < fields.length (); i++) + { + srfield *f = fields[i]; ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION ++ && !(f->field_access & READ_FIELD)) ++ continue; + f->create_new_fields (newtype, newfields, newlast); + } + +@@ -854,6 +912,16 @@ srtype::create_new_type (void) + + warn_padded = save_warn_padded; + ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && replace_type_map.get (this->newtype[0]) == NULL) ++ replace_type_map.put (this->newtype[0], this->type); ++ if (dump_file) ++ { ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION ++ && has_dead_field ()) ++ fprintf (dump_file, "Dead field elimination.\n"); ++ } + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Created %d types:\n", maxclusters); +@@ -1128,12 +1196,12 @@ csrtype::init_type_info (void) + + /* Close enough to pad to improve performance. + 33~63 should pad to 64 but 33~48 (first half) are too far away, and +- 65~127 should pad to 128 but 65~96 (first half) are too far away. */ ++ 65~127 should pad to 128 but 65~80 (first half) are too far away. */ + if (old_size > 48 && old_size < 64) + { + new_size = 64; + } +- if (old_size > 96 && old_size < 128) ++ if (old_size > 80 && old_size < 128) + { + new_size = 128; + } +@@ -1272,6 +1340,7 @@ public: + bool has_rewritten_type (srfunction*); + void maybe_mark_or_record_other_side (tree side, tree other, gimple *stmt); + unsigned execute_struct_relayout (void); ++ bool remove_dead_field_stmt (tree lhs); + }; + + struct ipa_struct_relayout +@@ -3206,6 +3275,90 @@ ipa_struct_reorg::find_vars (gimple *stmt) + } + } + ++/* Update field_access in srfield. */ ++ ++static void ++update_field_access (tree record, tree field, unsigned access, void *data) ++{ ++ srtype *this_srtype = ((ipa_struct_reorg *)data)->find_type (record); ++ if (this_srtype == NULL) ++ return; ++ srfield *this_srfield = this_srtype->find_field (int_byte_position (field)); ++ if (this_srfield == NULL) ++ return; ++ ++ this_srfield->field_access |= access; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "record field access %d:", access); ++ print_generic_expr (dump_file, record); ++ fprintf (dump_file, " field:"); ++ print_generic_expr (dump_file, field); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++} ++ ++/* A callback for walk_stmt_load_store_ops to visit store. */ ++ ++static bool ++find_field_p_store (gimple *, tree node, tree op, void *data) ++{ ++ if (TREE_CODE (op) != COMPONENT_REF) ++ return false; ++ tree node_type = TREE_TYPE (node); ++ if (!handled_type (node_type)) ++ return false; ++ ++ update_field_access (node_type, TREE_OPERAND (op, 1), WRITE_FIELD, data); ++ ++ return false; ++} ++ ++/* A callback for walk_stmt_load_store_ops to visit load. */ ++ ++static bool ++find_field_p_load (gimple *, tree node, tree op, void *data) ++{ ++ if (TREE_CODE (op) != COMPONENT_REF) ++ return false; ++ tree node_type = TREE_TYPE (node); ++ if (!handled_type (node_type)) ++ return false; ++ ++ update_field_access (node_type, TREE_OPERAND (op, 1), READ_FIELD, data); ++ ++ return false; ++} ++ ++/* Determine whether the stmt should be deleted. 
*/ ++ ++bool ++ipa_struct_reorg::remove_dead_field_stmt (tree lhs) ++{ ++ tree base = NULL_TREE; ++ bool indirect = false; ++ srtype *t = NULL; ++ srfield *f = NULL; ++ bool realpart = false; ++ bool imagpart = false; ++ bool address = false; ++ bool escape_from_base = false; ++ if (!get_type_field (lhs, base, indirect, t, f, realpart, imagpart, ++ address, escape_from_base)) ++ return false; ++ if (t ==NULL) ++ return false; ++ if (t->newtype[0] == t->type) ++ return false; ++ if (f == NULL) ++ return false; ++ if (f->newfield[0] == NULL ++ && (f->field_access & WRITE_FIELD)) ++ return true; ++ return false; ++} ++ + /* Maybe record access of statement for further analaysis. */ + + void +@@ -3227,6 +3380,13 @@ ipa_struct_reorg::maybe_record_stmt (cgraph_node *node, gimple *stmt) + default: + break; + } ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION) ++ { ++ /* Look for loads and stores. */ ++ walk_stmt_load_store_ops (stmt, this, find_field_p_load, ++ find_field_p_store); ++ } + } + + /* Calculate the multiplier. */ +@@ -3543,8 +3703,11 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple + } + else if (type != d->type) + { +- type->mark_escape (escape_cast_another_ptr, stmt); +- d->type->mark_escape (escape_cast_another_ptr, stmt); ++ if (!is_replace_type (d->type->type, type->type)) ++ { ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ d->type->mark_escape (escape_cast_another_ptr, stmt); ++ } + } + /* x_1 = y.x_nodes; void *x; + Directly mark the structure pointer type assigned +@@ -4131,8 +4294,9 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, + } + /* If we have a non void* or a decl (which is hard to track), + then mark the type as escaping. */ +- if (!VOID_POINTER_P (TREE_TYPE (newdecl)) +- || DECL_P (newdecl)) ++ if (replace_type_map.get (type->type) == NULL ++ && (!VOID_POINTER_P (TREE_TYPE (newdecl)) ++ || DECL_P (newdecl))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -4142,7 +4306,7 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, + print_generic_expr (dump_file, TREE_TYPE (newdecl)); + fprintf (dump_file, "\n"); + } +- type->mark_escape (escape_cast_another_ptr, stmt); ++ type->mark_escape (escape_cast_another_ptr, stmt); + return; + } + /* At this point there should only be unkown void* ssa names. 
*/ +@@ -4465,11 +4629,13 @@ ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec< + + return; + } ++ if (!is_replace_type (t1->type, type->type)) ++ { ++ if (t1) ++ t1->mark_escape (escape_cast_another_ptr, stmt); + +- if (t1) +- t1->mark_escape (escape_cast_another_ptr, stmt); +- +- type->mark_escape (escape_cast_another_ptr, stmt); ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ } + } + + +@@ -5722,6 +5888,19 @@ bool + ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + { + bool remove = false; ++ ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= DEAD_FIELD_ELIMINATION ++ && remove_dead_field_stmt (gimple_assign_lhs (stmt))) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n rewriting statement (remove): \n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ return true; ++ } ++ + if (gimple_clobber_p (stmt)) + { + tree lhs = gimple_assign_lhs (stmt); +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +index 54b0dc655..936c0fa6f 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +@@ -142,6 +142,7 @@ public: + + bool create_new_type (void); + void analyze (void); ++ bool has_dead_field (void); + void mark_escape (escape_type, gimple *stmt); + bool has_escaped (void) + { +@@ -163,6 +164,12 @@ public: + } + }; + ++/* Bitflags used for determining if a field ++ is never accessed, read or written. */ ++const unsigned EMPTY_FIELD = 0x0u; ++const unsigned READ_FIELD = 0x01u; ++const unsigned WRITE_FIELD = 0x02u; ++ + struct srfield + { + unsigned HOST_WIDE_INT offset; +@@ -174,7 +181,7 @@ struct srfield + unsigned clusternum; + + tree newfield[max_split]; +- ++ unsigned field_access; /* FIELD_DECL -> bitflag (use for dfe). */ + // Constructors + srfield (tree field, srtype *base); + +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c b/gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c +new file mode 100644 +index 000000000..4261d2352 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_DTE_verify.c +@@ -0,0 +1,86 @@ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++ network_t* net_add; ++}; ++ ++ ++const int MAX = 100; ++ ++/* let it escape_array, "Type is used in an array [not handled yet]". 
*/ ++network_t* net[2]; ++arc_p stop_arcs = NULL; ++ ++int ++main () ++{ ++ net[0] = (network_t*) calloc (1, sizeof(network_t)); ++ net[0]->arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ stop_arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ ++ net[0]->arcs->id = 100; ++ ++ for (unsigned i = 0; i < 3; i++) ++ { ++ net[0]->arcs->id = net[0]->arcs->id + 2; ++ stop_arcs->cost = net[0]->arcs->id / 2; ++ stop_arcs->net_add = net[0]; ++ printf("stop_arcs->cost = %ld\n", stop_arcs->cost); ++ net[0]->arcs++; ++ stop_arcs++; ++ } ++ ++ if( net[1] != 0 && stop_arcs != 0) ++ { ++ return -1; ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c b/gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c +new file mode 100644 +index 000000000..42d38c63a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ele_minus_verify.c +@@ -0,0 +1,60 @@ ++// verify newarc[cmp-1].flow ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++const int MAX = 100; ++arc_p ap = NULL; ++ ++int ++main () ++{ ++ ap = (arc_p) calloc(MAX, sizeof(arc_t)); ++ printf("%d\n", ap[0].id); ++ for (int i = 1; i < MAX; i++) ++ { ++ ap[i-1].id = 500; ++ } ++ printf("%d\n", ap[0].id); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c b/gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c +new file mode 100644 +index 000000000..53583fe82 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_mem_ref_offset.c +@@ -0,0 +1,58 @@ ++/* Supports the MEM_REF offset. ++ _1 = MEM[(struct arc *)ap_4 + 72B].flow; ++ Old rewrite:_1 = ap.reorder.0_8->flow; ++ New rewrite:_1 = MEM[(struct arc.reorder.0 *)ap.reorder.0_8 + 64B].flow. */ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++int ++main () ++{ ++ const int MAX = 100; ++ /* A similar scenario can be reproduced only by using local variables. 
*/ ++ arc_p ap = NULL; ++ ap = (arc_p) calloc(MAX, sizeof(arc_t)); ++ printf("%d\n", ap[1].flow); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c b/gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c +new file mode 100644 +index 000000000..fd675ec2e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_mul_layer_ptr_record_bug.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct T_HASH_ENTRY ++{ ++ unsigned int hash; ++ unsigned int klen; ++ char *key; ++} iHashEntry; ++ ++typedef struct T_HASH ++{ ++ unsigned int size; ++ unsigned int fill; ++ unsigned int keys; ++ ++ iHashEntry **array; ++} uHash; ++ ++uHash *retval; ++ ++int ++main() { ++ retval->array = (iHashEntry **)calloc(sizeof(iHashEntry *), retval->size); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c b/gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c +new file mode 100644 +index 000000000..600e7908b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ptr_diff.c +@@ -0,0 +1,71 @@ ++// support POINTER_DIFF_EXPR & NOP_EXPR to avoid ++// escape_unhandled_rewrite, "Type escapes via a unhandled rewrite stmt" ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++int ++main () ++{ ++ arc_t *old_arcs; ++ node_t *node; ++ node_t *stop; ++ size_t off; ++ network_t* net; ++ ++ for( ; node->number < stop->number; node++ ) ++ { ++ off = node->basic_arc - old_arcs; ++ node->basic_arc = (arc_t *)(net->arcs + off); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 3 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c b/gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c +new file mode 100644 +index 000000000..f411364a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ptr_negate_expr.c +@@ -0,0 +1,55 @@ ++// support NEGATE_EXPR rewriting ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++int ++main () ++{ ++ int64_t susp = 0; ++ const int MAX = 100; ++ arc_p ap = (arc_p) calloc(MAX, sizeof(arc_t)); ++ ap -= 
susp; ++ printf("%d\n", ap[1].flow); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c b/gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c +new file mode 100644 +index 000000000..a4e723763 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_ptr_ptr.c +@@ -0,0 +1,55 @@ ++// release escape_ptr_ptr, "Type is used in a pointer to a pointer [not handled yet]"; ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++const int MAX = 100; ++arc_t **ap = NULL; ++ ++int ++main () ++{ ++ ap = (arc_t**) malloc(MAX * sizeof(arc_t*)); ++ (*ap)[0].id = 300; ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 2 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +index 67b3ac2d5..ac5585813 100644 +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +@@ -64,8 +64,27 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout + "" "-fipa-struct-reorg=1 -fdump-ipa-all -flto-partition=one -fwhole-program" + + # -fipa-struct-reorg=2 +-gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/rf*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wo_prof_*.c]] \ + "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_ratio_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/w_prof_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/struct_reorg*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/sr_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/csr_*.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout.c]] \ ++ "" "-fipa-struct-reorg=2 -fdump-ipa-all -flto-partition=one -fwhole-program" ++ ++# -fipa-struct-reorg=3 ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/dfe*.c]] \ ++ "" "-fipa-struct-reorg=3 -fdump-ipa-all -flto-partition=one -fwhole-program" ++ + # All done. 
+ torture-finish + dg-finish +-- +2.33.0.windows.2 + diff --git a/0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch b/0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch new file mode 100644 index 0000000000000000000000000000000000000000..33c735fec93651c139c47828af95bd5ffad7c2b2 --- /dev/null +++ b/0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch @@ -0,0 +1,138 @@ +From d8753de2129d230afc9a887d5804747c69824a68 Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Mon, 20 Jun 2022 11:24:45 +0800 +Subject: [PATCH] [Backport] ipa-sra: Fix thinko when overriding + safe_to_import_accesses (PR 101066) + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=5aa28c8cf15cd254cc5a3a12278133b93b8b017f + +ipa-sra: Fix thinko when overriding safe_to_import_accesses (PR 101066) + +The "new" IPA-SRA has a more difficult job than the previous +not-truly-IPA version when identifying situations in which a parameter +passed by reference can be passed into a third function and only thee +converted to one passed by value (and possibly "split" at the same +time). + +In order to allow this, two conditions must be fulfilled. First the +call to the third function must happen before any modifications of +memory, because it could change the value passed by reference. +Second, in order to make sure we do not introduce new (invalid) +dereferences, the call must postdominate the entry BB. + +The second condition is actually not necessary if the caller function +is also certain to dereference the pointer but the first one must +still hold. Unfortunately, the code making this overriding decision +also happen to trigger when the first condition is not fulfilled. +This is fixed in the following patch. + +gcc/ChangeLog: + +2021-06-16 Martin Jambor + +(cherry picked from commit 763121ccd908f52bc666f277ea2cf42110b3aad9) + +diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c +index b706fceff..1cb30afc3 100644 +--- a/gcc/ipa-sra.c ++++ b/gcc/ipa-sra.c +@@ -340,7 +340,7 @@ class isra_call_summary + public: + isra_call_summary () + : m_arg_flow (), m_return_ignored (false), m_return_returned (false), +- m_bit_aligned_arg (false) ++ m_bit_aligned_arg (false), m_before_any_store (false) + {} + + void init_inputs (unsigned arg_count); +@@ -359,6 +359,10 @@ public: + + /* Set when any of the call arguments are not byte-aligned. */ + unsigned m_bit_aligned_arg : 1; ++ ++ /* Set to true if the call happend before any (other) store to memory in the ++ caller. */ ++ unsigned m_before_any_store : 1; + }; + + /* Class to manage function summaries. 
*/ +@@ -472,6 +476,8 @@ isra_call_summary::dump (FILE *f) + fprintf (f, " return value ignored\n"); + if (m_return_returned) + fprintf (f, " return value used only to compute caller return value\n"); ++ if (m_before_any_store) ++ fprintf (f, " happens before any store to memory\n"); + for (unsigned i = 0; i < m_arg_flow.length (); i++) + { + fprintf (f, " Parameter %u:\n", i); +@@ -516,6 +522,7 @@ ipa_sra_call_summaries::duplicate (cgraph_edge *, cgraph_edge *, + new_sum->m_return_ignored = old_sum->m_return_ignored; + new_sum->m_return_returned = old_sum->m_return_returned; + new_sum->m_bit_aligned_arg = old_sum->m_bit_aligned_arg; ++ new_sum->m_before_any_store = old_sum->m_before_any_store; + } + + +@@ -2355,6 +2362,7 @@ process_scan_results (cgraph_node *node, struct function *fun, + unsigned count = gimple_call_num_args (call_stmt); + isra_call_summary *csum = call_sums->get_create (cs); + csum->init_inputs (count); ++ csum->m_before_any_store = uses_memory_as_obtained; + for (unsigned argidx = 0; argidx < count; argidx++) + { + if (!csum->m_arg_flow[argidx].pointer_pass_through) +@@ -2601,6 +2609,7 @@ isra_write_edge_summary (output_block *ob, cgraph_edge *e) + bp_pack_value (&bp, csum->m_return_ignored, 1); + bp_pack_value (&bp, csum->m_return_returned, 1); + bp_pack_value (&bp, csum->m_bit_aligned_arg, 1); ++ bp_pack_value (&bp, csum->m_before_any_store, 1); + streamer_write_bitpack (&bp); + } + +@@ -2719,6 +2728,7 @@ isra_read_edge_summary (struct lto_input_block *ib, cgraph_edge *cs) + csum->m_return_ignored = bp_unpack_value (&bp, 1); + csum->m_return_returned = bp_unpack_value (&bp, 1); + csum->m_bit_aligned_arg = bp_unpack_value (&bp, 1); ++ csum->m_before_any_store = bp_unpack_value (&bp, 1); + } + + /* Read intraprocedural analysis information about NODE and all of its outgoing +@@ -3475,7 +3485,8 @@ param_splitting_across_edge (cgraph_edge *cs) + } + else if (!ipf->safe_to_import_accesses) + { +- if (!all_callee_accesses_present_p (param_desc, arg_desc)) ++ if (!csum->m_before_any_store ++ || !all_callee_accesses_present_p (param_desc, arg_desc)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, " %u->%u: cannot import accesses.\n", +diff --git a/gcc/testsuite/gcc.dg/ipa/pr101066.c b/gcc/testsuite/gcc.dg/ipa/pr101066.c +new file mode 100644 +index 000000000..1ceb6e431 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ipa/pr101066.c +@@ -0,0 +1,20 @@ ++/* { dg-do run } */ ++/* { dg-options "-Os -fno-ipa-cp -fno-inline" } */ ++ ++int a = 1, c, d, e; ++int *b = &a; ++static int g(int *h) { ++ c = *h; ++ return d; ++} ++static void f(int *h) { ++ e = *h; ++ *b = 0; ++ g(h); ++} ++int main() { ++ f(b); ++ if (c) ++ __builtin_abort(); ++ return 0; ++} +-- +2.33.0.windows.2 + diff --git a/0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch b/0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch new file mode 100644 index 0000000000000000000000000000000000000000..be223fb8072dd74bb0ed43a3026c78f7650883a1 --- /dev/null +++ b/0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch @@ -0,0 +1,55 @@ +From ea059ab02ac79eba1c05d6e05cbb2590c47d7c1f Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Thu, 23 Jun 2022 10:16:08 +0800 +Subject: [PATCH] [Backport] ifcvt: Allow constants for noce_convert_multiple. + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=9b8eaa282250ad260e01d164093b597579db00d9 + +This lifts the restriction of not allowing constants for noce_convert_multiple. 
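As an illustrative sketch (not part of the patch; function and variable names are invented), the shape this now covers is a branch containing several simple sets whose sources are constants:

int
pick (int c, int a, int b)
{
  int x = a;
  int y = b;
  if (c > 0)
    {
      x = 1;   /* constant source, previously rejected */
      y = 25;  /* constant source, previously rejected */
    }
  return x + y;  /* on targets with conditional moves, both sets may be if-converted */
}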
+The code later checks if a valid sequence is produced anyway. +gcc/ChangeLog: + + * ifcvt.cc (noce_convert_multiple_sets): Allow constants. + (bb_ok_for_noce_convert_multiple_sets): Likewise. + +diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c +index 977dd1bd4..2452f231c 100644 +--- a/gcc/ifcvt.c ++++ b/gcc/ifcvt.c +@@ -3252,7 +3252,9 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) + we'll end up trying to emit r4:HI = cond ? (r1:SI) : (r3:HI). + Wrap the two cmove operands into subregs if appropriate to prevent + that. */ +- if (GET_MODE (new_val) != GET_MODE (temp)) ++ ++ if (!CONSTANT_P (new_val) ++ && GET_MODE (new_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (new_val); + machine_mode dst_mode = GET_MODE (temp); +@@ -3263,7 +3265,8 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) + } + new_val = lowpart_subreg (dst_mode, new_val, src_mode); + } +- if (GET_MODE (old_val) != GET_MODE (temp)) ++ if (!CONSTANT_P (old_val) ++ && GET_MODE (old_val) != GET_MODE (temp)) + { + machine_mode src_mode = GET_MODE (old_val); + machine_mode dst_mode = GET_MODE (temp); +@@ -3392,9 +3395,9 @@ bb_ok_for_noce_convert_multiple_sets (basic_block test_bb) + if (!REG_P (dest)) + return false; + +- if (!(REG_P (src) +- || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src)) +- && subreg_lowpart_p (src)))) ++ if (!((REG_P (src) || CONSTANT_P (src)) ++ || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src)) ++ && subreg_lowpart_p (src)))) + return false; + + /* Destination must be appropriate for a conditional write. */ +-- +2.33.0.windows.2 + diff --git a/0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch b/0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch new file mode 100644 index 0000000000000000000000000000000000000000..b7efb20918f35ce4005b66b93d89053087719d68 --- /dev/null +++ b/0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch @@ -0,0 +1,36 @@ +From beeb0fb50c7e40ee3d79044abc6408f760d6584a Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Thu, 23 Jun 2022 10:40:46 +0800 +Subject: [PATCH] [Backport] Register --sysroot in the driver switches table + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=48e2d9b7b88dabed336cd098cd212d0e53c5125f + +This change adjusts the processing of --sysroot to save the option in the internal "switches" +array, which lets self-specs test for it and provide a default value possibly dependent on +environment variables, as in + + --with-specs=%{!-sysroot*:--sysroot=%:getenv("WIND_BASE" /target)} + +2021-12-20 Olivier Hainque + + gcc/ + * gcc.c (driver_handle_option): do_save --sysroot. + +diff --git a/gcc/gcc.c b/gcc/gcc.c +index b55075b14..655beffcc 100644 +--- a/gcc/gcc.c ++++ b/gcc/gcc.c +@@ -4190,7 +4190,9 @@ driver_handle_option (struct gcc_options *opts, + case OPT__sysroot_: + target_system_root = arg; + target_system_root_changed = 1; +- do_save = false; ++ /* Saving this option is useful to let self-specs decide to ++ provide a default one. */ ++ do_save = true; + break; + + case OPT_time_: +-- +2.33.0.windows.2 + diff --git a/0042-DFE-Fix-bugs.patch b/0042-DFE-Fix-bugs.patch new file mode 100644 index 0000000000000000000000000000000000000000..daeeef0f3be761ac9a5e6d835bc271028ebac49f --- /dev/null +++ b/0042-DFE-Fix-bugs.patch @@ -0,0 +1,652 @@ +From f8308a2b440efe124cd6ff59924f135e85e53888 Mon Sep 17 00:00:00 2001 +From: Mingchuan Wu +Date: Sat, 18 Jun 2022 17:51:04 +0800 +Subject: [PATCH] [DFE] Fix bugs + +Fix bugs: +1. Fixed a bug in check replace type. +2. 
Use new to update field access for ref. +3. We now replace the dead fields in stmt by creating a new ssa. +4. The replaced type is no longer optimized in NORMAL mode. + +Also we added 5 dejaGNU test cases. + +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 2fa560239..00dc4bf1d 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -252,6 +252,7 @@ enum struct_layout_opt_level + + static bool is_result_of_mult (tree arg, tree *num, tree struct_size); + bool isptrptr (tree type); ++void get_base (tree &base, tree expr); + + srmode current_mode; + +@@ -631,7 +632,15 @@ srtype::analyze (void) + into 2 different structures. In future we intend to add profile + info and/or static heuristics to differentiate splitting process. */ + if (fields.length () == 2) +- fields[1]->clusternum = 1; ++ { ++ for (hash_map::iterator it = replace_type_map.begin (); ++ it != replace_type_map.end (); ++it) ++ { ++ if (types_compatible_p ((*it).second, this->type)) ++ return; ++ } ++ fields[1]->clusternum = 1; ++ } + + /* Otherwise we do nothing. */ + if (fields.length () >= 3) +@@ -3278,12 +3287,33 @@ ipa_struct_reorg::find_vars (gimple *stmt) + /* Update field_access in srfield. */ + + static void +-update_field_access (tree record, tree field, unsigned access, void *data) ++update_field_access (tree node, tree op, unsigned access, void *data) + { +- srtype *this_srtype = ((ipa_struct_reorg *)data)->find_type (record); ++ HOST_WIDE_INT offset = 0; ++ switch (TREE_CODE (op)) ++ { ++ case COMPONENT_REF: ++ { ++ offset = int_byte_position (TREE_OPERAND (op, 1)); ++ break; ++ } ++ case MEM_REF: ++ { ++ offset = tree_to_uhwi (TREE_OPERAND (op, 1)); ++ break; ++ } ++ default: ++ return; ++ } ++ tree base = node; ++ get_base (base, node); ++ srdecl *this_srdecl = ((ipa_struct_reorg *)data)->find_decl (base); ++ if (this_srdecl == NULL) ++ return; ++ srtype *this_srtype = this_srdecl->type; + if (this_srtype == NULL) + return; +- srfield *this_srfield = this_srtype->find_field (int_byte_position (field)); ++ srfield *this_srfield = this_srtype->find_field (offset); + if (this_srfield == NULL) + return; + +@@ -3291,9 +3321,9 @@ update_field_access (tree record, tree field, unsigned access, void *data) + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "record field access %d:", access); +- print_generic_expr (dump_file, record); ++ print_generic_expr (dump_file, this_srtype->type); + fprintf (dump_file, " field:"); +- print_generic_expr (dump_file, field); ++ print_generic_expr (dump_file, this_srfield->fielddecl); + fprintf (dump_file, "\n"); + } + return; +@@ -3302,15 +3332,10 @@ update_field_access (tree record, tree field, unsigned access, void *data) + /* A callback for walk_stmt_load_store_ops to visit store. */ + + static bool +-find_field_p_store (gimple *, tree node, tree op, void *data) ++find_field_p_store (gimple *stmt ATTRIBUTE_UNUSED, ++ tree node, tree op, void *data) + { +- if (TREE_CODE (op) != COMPONENT_REF) +- return false; +- tree node_type = TREE_TYPE (node); +- if (!handled_type (node_type)) +- return false; +- +- update_field_access (node_type, TREE_OPERAND (op, 1), WRITE_FIELD, data); ++ update_field_access (node, op, WRITE_FIELD, data); + + return false; + } +@@ -3318,15 +3343,10 @@ find_field_p_store (gimple *, tree node, tree op, void *data) + /* A callback for walk_stmt_load_store_ops to visit load. 
*/ + + static bool +-find_field_p_load (gimple *, tree node, tree op, void *data) ++find_field_p_load (gimple *stmt ATTRIBUTE_UNUSED, ++ tree node, tree op, void *data) + { +- if (TREE_CODE (op) != COMPONENT_REF) +- return false; +- tree node_type = TREE_TYPE (node); +- if (!handled_type (node_type)) +- return false; +- +- update_field_access (node_type, TREE_OPERAND (op, 1), READ_FIELD, data); ++ update_field_access (node, op, READ_FIELD, data); + + return false; + } +@@ -4629,7 +4649,7 @@ ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec< + + return; + } +- if (!is_replace_type (t1->type, type->type)) ++ if (!is_replace_type (inner_type (t), type->type)) + { + if (t1) + t1->mark_escape (escape_cast_another_ptr, stmt); +@@ -5898,7 +5918,16 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + fprintf (dump_file, "\n rewriting statement (remove): \n"); + print_gimple_stmt (dump_file, stmt, 0); + } +- return true; ++ /* Replace the dead field in stmt by creating a dummy ssa. */ ++ tree dummy_ssa = make_ssa_name (TREE_TYPE (gimple_assign_lhs (stmt))); ++ gimple_assign_set_lhs (stmt, dummy_ssa); ++ update_stmt (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "To: \n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ return false; + } + + if (gimple_clobber_p (stmt)) +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c +new file mode 100644 +index 000000000..13a226ee8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_dtrace.c +@@ -0,0 +1,56 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++typedef struct TYPE_4__ TYPE_2__; ++typedef struct TYPE_3__ TYPE_1__; ++ ++typedef int uint8_t; ++typedef int uint16_t; ++ ++struct TYPE_4__ ++{ ++ size_t cpu_id; ++}; ++ ++struct TYPE_3__ ++{ ++ int cpuc_dtrace_flags; ++}; ++ ++TYPE_2__ *CPU; ++volatile int CPU_DTRACE_FAULT; ++TYPE_1__ *cpu_core; ++scalar_t__ dtrace_load8 (uintptr_t); ++ ++__attribute__((used)) static int ++dtrace_bcmp (const void *s1, const void *s2, size_t len) ++{ ++ volatile uint16_t *flags; ++ flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; ++ if (s1 == s2) ++ return (0); ++ if (s1 == NULL || s2 == NULL) ++ return (1); ++ if (s1 != s2 && len != 0) ++ { ++ const uint8_t *ps1 = s1; ++ const uint8_t *ps2 = s2; ++ do ++ { ++ if (dtrace_load8 ((uintptr_t)ps1++) != *ps2++) ++ return (1); ++ } ++ while (--len != 0 && !(*flags & CPU_DTRACE_FAULT)); ++ } ++ return (0); ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c +new file mode 100644 +index 000000000..1fff2cb9d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_gc.c +@@ -0,0 +1,162 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++struct mrb_context ++{ ++ size_t stack; ++ size_t stbase; ++ size_t stend; ++ size_t eidx; ++ int *ci; ++ int *cibase; ++ int status; ++}; ++ ++struct RObject ++{ ++ int dummy; ++}; ++ ++struct RHash ++{ ++ int dummy; ++}; ++ ++struct RFiber ++{ ++ struct mrb_context 
*cxt; ++}; ++ ++struct RClass ++{ ++ int dummy; ++}; ++ ++struct RBasic ++{ ++ int tt; ++}; ++ ++struct RArray ++{ ++ int dummy; ++}; ++ ++typedef int mrb_state; ++typedef int mrb_gc; ++typedef int mrb_callinfo; ++size_t ARY_LEN (struct RArray *); ++size_t MRB_ENV_STACK_LEN (struct RBasic *); ++int MRB_FIBER_TERMINATED; ++ ++#define MRB_TT_ARRAY 140 ++#define MRB_TT_CLASS 139 ++#define MRB_TT_DATA 138 ++#define MRB_TT_ENV 137 ++#define MRB_TT_EXCEPTION 136 ++#define MRB_TT_FIBER 135 ++#define MRB_TT_HASH 134 ++#define MRB_TT_ICLASS 133 ++#define MRB_TT_MODULE 132 ++#define MRB_TT_OBJECT 131 ++#define MRB_TT_PROC 130 ++#define MRB_TT_RANGE 129 ++#define MRB_TT_SCLASS 128 ++ ++size_t ci_nregs (int *); ++int gc_mark_children (int *, int *, struct RBasic *); ++size_t mrb_gc_mark_hash_size (int *, struct RHash *); ++size_t mrb_gc_mark_iv_size (int *, struct RObject *); ++size_t mrb_gc_mark_mt_size (int *, struct RClass *); ++ ++__attribute__((used)) static size_t ++gc_gray_mark (mrb_state *mrb, mrb_gc *gc, struct RBasic *obj) ++{ ++ size_t children = 0; ++ gc_mark_children (mrb, gc, obj); ++ switch (obj->tt) ++ { ++ case MRB_TT_ICLASS: ++ children++; ++ break; ++ ++ case MRB_TT_CLASS: ++ case MRB_TT_SCLASS: ++ case MRB_TT_MODULE: ++ { ++ struct RClass *c = (struct RClass *)obj; ++ children += mrb_gc_mark_iv_size (mrb, (struct RObject *)obj); ++ children += mrb_gc_mark_mt_size (mrb, c); ++ children ++; ++ } ++ break; ++ ++ case MRB_TT_OBJECT: ++ case MRB_TT_DATA: ++ case MRB_TT_EXCEPTION: ++ children += mrb_gc_mark_iv_size (mrb, (struct RObject *)obj); ++ break; ++ ++ case MRB_TT_ENV: ++ children += MRB_ENV_STACK_LEN (obj); ++ break; ++ ++ case MRB_TT_FIBER: ++ { ++ struct mrb_context *c = ((struct RFiber *)obj)->cxt; ++ size_t i; ++ mrb_callinfo *ci; ++ if (!c || c->status == MRB_FIBER_TERMINATED) ++ break; ++ ++ i = c->stack - c->stbase; ++ if (c->ci) ++ { ++ i += ci_nregs (c->ci); ++ } ++ if (c->stbase + i > c->stend) ++ i = c->stend - c->stbase; ++ ++ children += i; ++ children += c->eidx; ++ if (c->cibase) ++ { ++ for (i = 0, ci = c->cibase; ci <= c->ci; i++, ci++) ++ ; ++ } ++ children += i; ++ } ++ break; ++ ++ case MRB_TT_ARRAY: ++ { ++ struct RArray *a = (struct RArray *)obj; ++ children += ARY_LEN (a); ++ } ++ break; ++ ++ case MRB_TT_HASH: ++ children += mrb_gc_mark_iv_size (mrb, (struct RObject *)obj); ++ children += mrb_gc_mark_hash_size (mrb, (struct RHash *)obj); ++ break; ++ ++ case MRB_TT_PROC: ++ case MRB_TT_RANGE: ++ children += 2; ++ break; ++ default: ++ break; ++ } ++ ++ return children; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c +new file mode 100644 +index 000000000..0f577667c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_hpsa.c +@@ -0,0 +1,126 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++typedef struct TYPE_6__ TYPE_3__; ++typedef struct TYPE_5__ TYPE_2__; ++typedef struct TYPE_4__ TYPE_1__; ++ ++struct io_accel2_cmd ++{ ++ int dummy; ++}; ++ ++struct hpsa_tmf_struct ++{ ++ int it_nexus; ++}; ++ ++struct hpsa_scsi_dev_t ++{ ++ int nphysical_disks; ++ int ioaccel_handle; ++ struct hpsa_scsi_dev_t **phys_disk; ++}; ++ ++struct ctlr_info ++{ ++ TYPE_3__ *pdev; ++ struct io_accel2_cmd *ioaccel2_cmd_pool; ++}; ++struct 
TYPE_4__ ++{ ++ int LunAddrBytes; ++}; ++ ++struct TYPE_5__ ++{ ++ TYPE_1__ LUN; ++}; ++ ++struct CommandList ++{ ++ size_t cmdindex; ++ int cmd_type; ++ struct hpsa_scsi_dev_t *phys_disk; ++ TYPE_2__ Header; ++}; ++ ++struct TYPE_6__ ++{ ++ int dev; ++}; ++ ++int BUG (); ++#define CMD_IOACCEL1 132 ++#define CMD_IOACCEL2 131 ++#define CMD_IOCTL_PEND 130 ++#define CMD_SCSI 129 ++#define IOACCEL2_TMF 128 ++int dev_err (int *, char *, int); ++scalar_t__ hpsa_is_cmd_idle (struct CommandList *); ++int le32_to_cpu (int); ++int test_memcmp (unsigned char *, int *, int); ++ ++__attribute__((used)) static bool ++hpsa_cmd_dev_match (struct ctlr_info *h, struct CommandList *c, ++ struct hpsa_scsi_dev_t *dev, unsigned char *scsi3addr) ++{ ++ int i; ++ bool match = false; ++ struct io_accel2_cmd * c2 = &h->ioaccel2_cmd_pool[c->cmdindex]; ++ struct hpsa_tmf_struct *ac = (struct hpsa_tmf_struct *)c2; ++ ++ if (hpsa_is_cmd_idle (c)) ++ return false; ++ ++ switch (c->cmd_type) ++ { ++ case CMD_SCSI: ++ case CMD_IOCTL_PEND: ++ match = !test_memcmp (scsi3addr, &c->Header.LUN.LunAddrBytes, ++ sizeof (c->Header.LUN.LunAddrBytes)); ++ break; ++ ++ case CMD_IOACCEL1: ++ case CMD_IOACCEL2: ++ if (c->phys_disk == dev) ++ { ++ match = true; ++ } ++ else ++ { ++ for (i = 0; i < dev->nphysical_disks && !match; i++) ++ { ++ match = dev->phys_disk[i] == c->phys_disk; ++ } ++ } ++ break; ++ ++ case IOACCEL2_TMF: ++ for (i = 0; i < dev->nphysical_disks && !match; i++) ++ { ++ match = dev->phys_disk[i]->ioaccel_handle == ++ le32_to_cpu (ac->it_nexus); ++ } ++ break; ++ ++ case 0: ++ match = false; ++ break; ++ default: ++ dev_err (&h->pdev->dev, "unexpected cmd_type: %d\n", c->cmd_type); ++ BUG (); ++ } ++ ++ return match; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c +new file mode 100644 +index 000000000..5570c762e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_tcp_usrreq.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int bool; ++#define false 0 ++#define true 1 ++ ++struct tcpcb ++{ ++ int t_state; ++}; ++ ++struct socket ++{ ++ int dummy; ++}; ++ ++struct proc ++{ ++ int dummy; ++}; ++ ++struct inpcb ++{ ++ scalar_t__ inp_lport; ++}; ++ ++int COMMON_END (int); ++int COMMON_START (); ++int PRU_LISTEN; ++int TCPS_LISTEN; ++int in_pcbbind (struct inpcb *, int *, struct proc *); ++struct inpcb* sotoinpcb (struct socket *); ++ ++__attribute__((used)) static void ++tcp_usr_listen (struct socket *so, struct proc *p) ++{ ++ int error = 0; ++ struct inpcb *inp = sotoinpcb (so); ++ struct tcpcb *tp; ++ ++ COMMON_START (); ++ if (inp->inp_lport == 0) ++ { ++ error = in_pcbbind (inp, NULL, p); ++ } ++ if (error == 0) ++ { ++ tp->t_state = TCPS_LISTEN; ++ } ++ COMMON_END (PRU_LISTEN); ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 1 "struct_layout" } } */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c b/gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c +new file mode 100644 +index 000000000..50ab9cc24 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_extr_ui_main.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile} */ ++ ++#define NULL ((void*)0) ++typedef unsigned long size_t; ++typedef long intptr_t; ++typedef unsigned long uintptr_t; ++typedef long scalar_t__; ++typedef int 
bool; ++#define false 0 ++#define true 1 ++ ++typedef struct TYPE_4__ TYPE_2__; ++typedef struct TYPE_3__ TYPE_1__; ++ ++struct TYPE_4__ ++{ ++ size_t modCount; ++ TYPE_1__ *modList; ++}; ++ ++struct TYPE_3__ ++{ ++ void *modDescr; ++ void *modName; ++}; ++ ++size_t MAX_MODS; ++void *String_Alloc (char *); ++int test_strlen (char *); ++int trap_FD_GetFileList (char *, char *, char *, int); ++TYPE_2__ uiInfo; ++ ++__attribute__((used)) static void ++UI_LoadMods () ++{ ++ int numdirs; ++ char dirlist[2048]; ++ char *dirptr; ++ char *descptr; ++ int i; ++ int dirlen; ++ ++ uiInfo.modCount = 0; ++ numdirs = trap_FD_GetFileList ("$modelist", "", dirlist, sizeof (dirlist)); ++ dirptr = dirlist; ++ for (i = 0; i < numdirs; i++) ++ { ++ dirlen = test_strlen (dirptr) + 1; ++ descptr = dirptr + dirlen; ++ uiInfo.modList[uiInfo.modCount].modName = String_Alloc (dirptr); ++ uiInfo.modList[uiInfo.modCount].modDescr = String_Alloc (descptr); ++ dirptr += dirlen + test_strlen (descptr) + 1; ++ uiInfo.modCount++; ++ if (uiInfo.modCount >= MAX_MODS) ++ { ++ break; ++ } ++ } ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 1 "struct_layout" } } */ +-- +2.33.0.windows.2 + diff --git a/0043-Backport-Extend-special_memory_constraint.patch b/0043-Backport-Extend-special_memory_constraint.patch new file mode 100644 index 0000000000000000000000000000000000000000..8ef635a29a5a24c06e281871e19812968ffd654c --- /dev/null +++ b/0043-Backport-Extend-special_memory_constraint.patch @@ -0,0 +1,158 @@ +From b4770dd95fa342671d53c9de2077d77ee07b68dd Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Sat, 25 Jun 2022 00:41:50 +0800 +Subject: [PATCH] [Backport] Extend special_memory_constraint. + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=4de7b010038933dd6ca96bf186ca49f243d0def6 + +For operand with special_memory_constraint, there could be a wrapper for memory_operand. +Extract mem for operand for conditional judgement like MEM_P, also for record_address_regs. + +diff --git a/gcc/ira-costs.c b/gcc/ira-costs.c +index 6891156b5..aeda6588b 100644 +--- a/gcc/ira-costs.c ++++ b/gcc/ira-costs.c +@@ -781,7 +781,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops, + + case CT_SPECIAL_MEMORY: + insn_allows_mem[i] = allows_mem[i] = 1; +- if (MEM_P (op) && constraint_satisfied_p (op, cn)) ++ if (MEM_P (extract_mem_from_operand (op)) ++ && constraint_satisfied_p (op, cn)) + win = 1; + break; + +@@ -1397,15 +1398,16 @@ record_operand_costs (rtx_insn *insn, enum reg_class *pref) + commutative. 
*/ + for (i = 0; i < recog_data.n_operands; i++) + { ++ rtx op_mem = extract_mem_from_operand (recog_data.operand[i]); + memcpy (op_costs[i], init_cost, struct_costs_size); + + if (GET_CODE (recog_data.operand[i]) == SUBREG) + recog_data.operand[i] = SUBREG_REG (recog_data.operand[i]); + +- if (MEM_P (recog_data.operand[i])) +- record_address_regs (GET_MODE (recog_data.operand[i]), +- MEM_ADDR_SPACE (recog_data.operand[i]), +- XEXP (recog_data.operand[i], 0), ++ if (MEM_P (op_mem)) ++ record_address_regs (GET_MODE (op_mem), ++ MEM_ADDR_SPACE (op_mem), ++ XEXP (op_mem, 0), + 0, MEM, SCRATCH, frequency * 2); + else if (constraints[i][0] == 'p' + || (insn_extra_address_constraint +diff --git a/gcc/ira.c b/gcc/ira.c +index 681ec2f46..c13650229 100644 +--- a/gcc/ira.c ++++ b/gcc/ira.c +@@ -1868,7 +1868,7 @@ ira_setup_alts (rtx_insn *insn) + + case CT_MEMORY: + case CT_SPECIAL_MEMORY: +- if (MEM_P (op)) ++ if (MEM_P (extract_mem_from_operand (op))) + goto op_success; + win_p = true; + break; +diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c +index 7cc479b30..df75c7b94 100644 +--- a/gcc/lra-constraints.c ++++ b/gcc/lra-constraints.c +@@ -409,14 +409,34 @@ valid_address_p (rtx op, struct address_info *ad, + return valid_address_p (ad->mode, *ad->outer, ad->as); + } + ++/* For special_memory_operand, it could be false for MEM_P (op), ++ i.e. bcst_mem_operand in i386 backend. ++ Extract and return real memory operand or op. */ ++rtx ++extract_mem_from_operand (rtx op) ++{ ++ for (rtx x = op;; x = XEXP (x, 0)) ++ { ++ if (MEM_P (x)) ++ return x; ++ if (GET_RTX_LENGTH (GET_CODE (x)) != 1 ++ || GET_RTX_FORMAT (GET_CODE (x))[0] != 'e') ++ break; ++ } ++ return op; ++} ++ + /* Return true if the eliminated form of memory reference OP satisfies + extra (special) memory constraint CONSTRAINT. */ + static bool + satisfies_memory_constraint_p (rtx op, enum constraint_num constraint) + { + struct address_info ad; ++ rtx mem = extract_mem_from_operand (op); ++ if (!MEM_P (mem)) ++ return false; + +- decompose_mem_address (&ad, op); ++ decompose_mem_address (&ad, mem); + address_eliminator eliminator (&ad); + return constraint_satisfied_p (op, constraint); + } +@@ -2344,8 +2364,7 @@ process_alt_operands (int only_alternative) + break; + + case CT_MEMORY: +- if (MEM_P (op) +- && satisfies_memory_constraint_p (op, cn)) ++ if (satisfies_memory_constraint_p (op, cn)) + win = true; + else if (spilled_pseudo_p (op)) + win = true; +@@ -2386,8 +2405,7 @@ process_alt_operands (int only_alternative) + break; + + case CT_SPECIAL_MEMORY: +- if (MEM_P (op) +- && satisfies_memory_constraint_p (op, cn)) ++ if (satisfies_memory_constraint_p (op, cn)) + win = true; + else if (spilled_pseudo_p (op)) + win = true; +diff --git a/gcc/recog.c b/gcc/recog.c +index 2720aaaac..8674054b9 100644 +--- a/gcc/recog.c ++++ b/gcc/recog.c +@@ -1798,7 +1798,8 @@ asm_operand_ok (rtx op, const char *constraint, const char **constraints) + case CT_MEMORY: + case CT_SPECIAL_MEMORY: + /* Every memory operand can be reloaded to fit. */ +- result = result || memory_operand (op, VOIDmode); ++ result = result || memory_operand (extract_mem_from_operand (op), ++ VOIDmode); + break; + + case CT_ADDRESS: +@@ -2584,7 +2585,9 @@ constrain_operands (int strict, alternative_mask alternatives) + + /* A unary operator may be accepted by the predicate, but it + is irrelevant for matching constraints. 
*/ +- if (UNARY_P (op)) ++ /* For special_memory_operand, there could be a memory operand inside, ++ and it would cause a mismatch for constraint_satisfied_p. */ ++ if (UNARY_P (op) && op == extract_mem_from_operand (op)) + op = XEXP (op, 0); + + if (GET_CODE (op) == SUBREG) +diff --git a/gcc/rtl.h b/gcc/rtl.h +index b29afca8d..35fb6ba73 100644 +--- a/gcc/rtl.h ++++ b/gcc/rtl.h +@@ -4323,6 +4323,7 @@ extern rtx gen_hard_reg_clobber (machine_mode, unsigned int); + extern rtx get_reg_known_value (unsigned int); + extern bool get_reg_known_equiv_p (unsigned int); + extern rtx get_reg_base_value (unsigned int); ++extern rtx extract_mem_from_operand (rtx); + + #ifdef STACK_REGS + extern int stack_regs_mentioned (const_rtx insn); +-- +2.33.0.windows.2 + diff --git a/0044-Backport-ira-Fix-unnecessary-register-spill.patch b/0044-Backport-ira-Fix-unnecessary-register-spill.patch new file mode 100644 index 0000000000000000000000000000000000000000..f080b99f285508079ad37429d03321283890aa4f --- /dev/null +++ b/0044-Backport-ira-Fix-unnecessary-register-spill.patch @@ -0,0 +1,68 @@ +From 95d8a6545bef39f5deff376c60c38e4e3c13c8f5 Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Sat, 25 Jun 2022 00:45:24 +0800 +Subject: [PATCH] [Backport] ira: Fix unnecessary register spill + +Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=edf95e51e53697f3050f076675c26a4cece17741 + +The variables first_moveable_pseudo and last_moveable_pseudo aren't reset after compiling a function, +which means they leak into the first scheduler pass of the following function. In some cases, this +can cause an extra spill during register location of the second function. + +diff --git a/gcc/ira.c b/gcc/ira.c +index 681ec2f46..77e4bb988 100644 +--- a/gcc/ira.c ++++ b/gcc/ira.c +@@ -5130,6 +5130,8 @@ move_unallocated_pseudos (void) + INSN_UID (newinsn), i); + SET_REG_N_REFS (i, 0); + } ++ ++ first_moveable_pseudo = last_moveable_pseudo = 0; + } + + /* If the backend knows where to allocate pseudos for hard +diff --git a/gcc/testsuite/gcc.target/aarch64/nospill.c b/gcc/testsuite/gcc.target/aarch64/nospill.c +new file mode 100644 +index 000000000..968a4267e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/nospill.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++/* The pseudo for P is marked as moveable in the IRA pass. */ ++float ++func_0 (float a, float b, float c) ++{ ++ float p = c / a; ++ ++ if (b > 1) ++ { ++ b /= p; ++ if (c > 2) ++ a /= 3; ++ } ++ ++ return b / c * a; ++} ++ ++/* If first_moveable_pseudo and last_moveable_pseudo are not reset correctly, ++ they will carry over and spill the pseudo for Q. */ ++float ++func_1 (float a, float b, float c) ++{ ++ float q = a + b; ++ ++ c *= a / (b + b); ++ if (a > 0) ++ c *= q; ++ ++ return a * b * c; ++} ++ ++/* We have plenty of spare registers, so check nothing has been spilled. 
*/ ++/* { dg-final { scan-assembler-not "\tstr\t" } } */ +-- +2.33.0.windows.2 + diff --git a/0045-Transposed-SLP-Enable-Transposed-SLP.patch b/0045-Transposed-SLP-Enable-Transposed-SLP.patch new file mode 100644 index 0000000000000000000000000000000000000000..47dd95e0b257ea9447609cbfec4a09bde655aa62 --- /dev/null +++ b/0045-Transposed-SLP-Enable-Transposed-SLP.patch @@ -0,0 +1,2970 @@ +From dc20ea35fb5b77a0f17c64c07e3fd423a2610589 Mon Sep 17 00:00:00 2001 +From: luohailing +Date: Fri, 17 Jun 2022 22:38:55 +0800 +Subject: [PATCH] [Transposed SLP] Enable Transposed SLP Enable Transposed + SLP when memory is uncontinual with -ftree-slp-transpose-vectorize. + + +diff --git a/gcc/common.opt b/gcc/common.opt +index 24834cf60..d38401b71 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -3049,6 +3049,10 @@ ftree-vect-analyze-slp-group + Common Report Var(flag_tree_slp_group) Init(0) + Disable SLP vectorization for reduction chain on tree. + ++ftree-slp-transpose-vectorize ++Common Report Var(flag_tree_slp_transpose_vectorize) Optimization Init(0) ++Enable basic block vectorization (SLP) for transposed stores and loads on trees. ++ + fvect-cost-model= + Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization + -fvect-cost-model=[unlimited|dynamic|cheap] Specifies the cost model for vectorization. +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c +new file mode 100644 +index 000000000..8237a8b9e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ c4[i] = pix1[4] - pix2[4]; ++ c5[i] = pix1[5] - pix2[5]; ++ c6[i] = pix1[6] - pix2[6]; ++ c7[i] = pix1[7] - pix2[7]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 16; ++ int i2 = 8; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 2; ++ input2[i] = i; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 1264) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c +new file mode 100644 +index 000000000..b01a0410e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize" } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 8 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 
+= i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 5; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 4; ++ input2[i] = i * 2; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 1440) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c +new file mode 100644 +index 000000000..529581c59 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c +@@ -0,0 +1,54 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize" } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++ ++int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ c4[i] = pix1[4] - pix2[4]; ++ c5[i] = pix1[5] - pix2[5]; ++ c6[i] = pix1[6] - pix2[6]; ++ c7[i] = pix1[7] - pix2[7]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned short input1[M]; ++ unsigned short input2[M]; ++ int i1 = 8; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 4; ++ input2[i] = i; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 1680) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c +new file mode 100644 +index 000000000..0b4adea9b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++ ++int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ c4[i] = pix1[4] - pix2[4]; ++ c5[i] = pix1[5] - pix2[5]; ++ c6[i] = pix1[6] - pix2[6]; ++ c7[i] = pix1[7] - pix2[7]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned input1[M]; ++ unsigned input2[M]; ++ int i1 = 12; ++ int i2 = 6; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 7; ++ input2[i] = i * 3; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 3616) ++ { ++ abort (); ++ } ++ return 0; 
++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c +new file mode 100644 +index 000000000..81a248840 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c +@@ -0,0 +1,73 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++#define eps 1e-8 ++ ++double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ unsigned a0[N]; ++ unsigned a1[N]; ++ unsigned a2[N]; ++ unsigned a3[N]; ++ ++ int b0[N]; ++ int b1[N]; ++ int b2[N]; ++ int b3[N]; ++ ++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16); ++ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16); ++ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16); ++ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16); ++ } ++ ++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]); ++ b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]); ++ b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]); ++ b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]); ++ } ++ ++ double sum = 0; ++ for (int i = 0; i < N; i++) ++ { ++ sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 8; ++ int i2 = 3; ++ unsigned char m = 2; ++ unsigned short n = 12; ++ float t = 3.0; ++ double k = 4.2; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 6; ++ input2[i] = i * 3; ++ } ++ double sum = foo (input1, i1, input2, i2); ++ if (fabs (sum - 78648144) > eps) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ ++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c +new file mode 100644 +index 000000000..3e134ac02 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c +@@ -0,0 +1,67 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++/* { dg-require-effective-target vect_float } */ ++#include ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++#define eps 1e-8 ++ ++float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ unsigned a0[N]; ++ unsigned a1[N]; ++ unsigned a2[N]; ++ unsigned a3[N]; ++ ++ float c0[N]; ++ float c1[N]; ++ float c2[N]; ++ float c3[N]; ++ ++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); ++ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); ++ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); ++ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); ++ ++ c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]); ++ c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]); ++ c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]); ++ c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]); ++ } ++ ++ float sum = 0; ++ for (int i = 0; i < N; i++) ++ { ++ sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i]; 
++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 18; ++ int i2 = 6; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 4; ++ input2[i] = i * 2; ++ } ++ float sum = foo (input1, i1, input2, i2); ++ if (fabs (sum - 106041168) > eps) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ ++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c +new file mode 100644 +index 000000000..2074d9aa8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize" } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 16 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned char c0[N], c1[N]; ++ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 6; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 5; ++ input2[i] = i * 2; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 3280) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c +new file mode 100644 +index 000000000..a154f012a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize" } */ ++/* { dg-require-effective-target vect_int } */ ++#include ++#include ++#include "tree-vect.h" ++ ++#define N 32 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned char c0[N], c1[N]; ++ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 6; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 5; ++ input2[i] = i * 2; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 7584) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp +index efe17ac6f..d92e1ba5b 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect.exp ++++ 
b/gcc/testsuite/gcc.dg/vect/vect.exp +@@ -114,6 +114,13 @@ et-dg-runtest dg-runtest [lsort \ + [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + ++# -ftree-slp-transpose-vectorize SLP tests ++set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS ++lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize" ++et-dg-runtest dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \ ++ "" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3" ++ + # -ffast-math tests + set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS + lappend DEFAULT_VECTCFLAGS "-ffast-math" +diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c +index fcc0726bd..d78b06455 100644 +--- a/gcc/tree-vect-data-refs.c ++++ b/gcc/tree-vect-data-refs.c +@@ -2647,6 +2647,9 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info) + DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; + + DR_GROUP_SIZE (stmt_info) = groupsize; ++ ++ DR_GROUP_SLP_TRANSPOSE (stmt_info) = false; ++ + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, +@@ -2676,6 +2679,20 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info) + DR_GROUP_GAP (stmt_info)); + } + ++ /* SLP: create an SLP data structure for every interleaving group of ++ loads for further analysis in vect_analyse_slp. */ ++ if (DR_IS_READ (dr) && !slp_impossible) ++ { ++ if (loop_vinfo) ++ { ++ LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info); ++ } ++ if (bb_vinfo) ++ { ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info); ++ } ++ } ++ + /* SLP: create an SLP data structure for every interleaving group of + stores for further analysis in vect_analyse_slp. */ + if (DR_IS_WRITE (dr) && !slp_impossible) +@@ -5413,6 +5430,225 @@ vect_permute_store_chain (vec dr_chain, + } + } + ++/* Encoding the PERM_MASK_FIRST. */ ++ ++static void ++vect_indices_encoding_first (tree vectype, unsigned int array_num, ++ tree &perm_mask_high_first, ++ tree &perm_mask_low_first) ++{ ++ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); ++ vec_perm_builder sel (nelt, nelt, 1); ++ sel.quick_grow (nelt); ++ unsigned int group_num = nelt / array_num; ++ unsigned int index = 0; ++ unsigned int array = 0; ++ unsigned int group = 0; ++ ++ /* The encoding has 1 pattern in the fisrt stage. */ ++ for (array = 0; array < array_num / 2; array++) ++ { ++ for (group = 0; group < group_num * 2; group++) ++ { ++ sel[index++] = array + array_num * group; ++ } ++ } ++ vec_perm_indices indices (sel, 2, nelt); ++ perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices); ++ ++ index = 0; ++ for (array = array_num / 2; array < array_num; array++) ++ { ++ for (group = 0; group < group_num * 2; group++) ++ { ++ sel[index++] = array + array_num * group; ++ } ++ } ++ indices.new_vector (sel, 2, nelt); ++ perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices); ++} ++ ++/* Encoding the PERM_MASK. */ ++ ++static void ++vect_indices_encoding (tree vectype, unsigned int array_num, ++ tree &perm_mask_high, tree &perm_mask_low) ++{ ++ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); ++ vec_perm_builder sel (nelt, nelt, 1); ++ sel.quick_grow (nelt); ++ unsigned int group_num = nelt / array_num; ++ unsigned int index = 0; ++ unsigned int array = 0; ++ unsigned int group = 0; ++ ++ /* The encoding has 2 patterns in the folllowing stages. 
*/ ++ for (array = 0; array < array_num / 2; array++) ++ { ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = group + group_num * array; ++ } ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = nelt + group + group_num * array; ++ } ++ } ++ vec_perm_indices indices (sel, 2, nelt); ++ perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); ++ ++ index = 0; ++ for (array = array_num / 2; array < array_num; array++) ++ { ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = group + group_num * array; ++ } ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = nelt + group + group_num * array; ++ } ++ } ++ indices.new_vector (sel, 2, nelt); ++ perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); ++} ++ ++/* Function vect_transpose_store_chain. ++ ++ Given a chain of interleaved stores in DR_CHAIN of LENGTH and ARRAY_NUM that ++ must be a power of 2. Generate interleave_high/low stmts to reorder ++ the data correctly for the stores. Return the final references for stores ++ in RESULT_CHAIN. This function is similar to vect_permute_store_chain (), ++ we interleave the contents of the vectors in their order. ++ ++ E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM ++ is 4. That is, the input is 4 vectors each containing 8 elements. ++ And 2 (VF / ARRAY_NUM) of 8 elements come from the same array. we interleave ++ the contents of the four vectors in their order. We assign a number to each ++ element, the input sequence is: ++ ++ 1st vec: 0 1 2 3 4 5 6 7 ++ 2nd vec: 8 9 10 11 12 13 14 15 ++ 3rd vec: 16 17 18 19 20 21 22 23 ++ 4th vec: 24 25 26 27 28 29 30 31 ++ ++ The output sequence should be: ++ ++ 1st vec: 0 4 8 12 16 20 24 28 ++ 2nd vec: 1 5 9 13 17 21 25 29 ++ 3rd vec: 2 6 10 14 18 22 26 30 ++ 4th vec: 3 7 11 15 19 23 27 31 ++ ++ In our example, ++ We get 2 (VF / ARRAY_NUM) elements together in every vector. ++ ++ I1: 0 4 1 5 2 6 3 7 ++ I2: 8 12 9 13 10 14 11 15 ++ I3: 16 20 17 21 18 22 19 23 ++ I4: 24 28 25 29 26 30 27 31 ++ ++ Then, we use interleave_high/low instructions to create such output. ++ Every 2 (VF / ARRAY_NUM) elements are regarded as a whole. The permutation ++ is done in log LENGTH stages. ++ ++ I1: interleave_high (1st vec, 3rd vec) ++ I2: interleave_low (1st vec, 3rd vec) ++ I3: interleave_high (2nd vec, 4th vec) ++ I4: interleave_low (2nd vec, 4th vec) ++ ++ The first stage of the sequence should be: ++ ++ I1: 0 4 16 20 1 5 17 21 ++ I2: 2 6 18 22 3 7 19 23 ++ I3: 8 12 24 28 9 13 25 29 ++ I4: 10 14 26 30 11 15 27 31 ++ ++ The following stage sequence should be, i.e. the final result is: ++ ++ I1: 0 4 8 12 16 20 24 28 ++ I2: 1 5 9 13 17 21 25 29 ++ I3: 2 6 10 14 18 22 26 30 ++ I4: 3 7 11 15 19 23 27 31. */ ++ ++void ++vect_transpose_store_chain (vec dr_chain, unsigned int length, ++ unsigned int array_num, stmt_vec_info stmt_info, ++ gimple_stmt_iterator *gsi, vec *result_chain) ++{ ++ gimple *perm_stmt = NULL; ++ tree vectype = STMT_VINFO_VECTYPE (stmt_info); ++ tree perm_mask_low_first = NULL; ++ tree perm_mask_high_first = NULL; ++ tree perm_mask_low = NULL; ++ tree perm_mask_high = NULL; ++ unsigned int log_length = exact_log2 (length); ++ ++ /* Only power of 2 is supported. */ ++ gcc_assert (pow2p_hwi (length)); ++ ++ /* The encoding has 2 types, one for the grouped pattern in the fisrt stage, ++ another for the interleaved patterns in the following stages. 
*/ ++ gcc_assert (array_num != 0); ++ ++ /* Create grouped stmt (in the first stage): ++ group = nelt / array_num; ++ high_first = VEC_PERM_EXPR ++ low_first = VEC_PERM_EXPR */ ++ vect_indices_encoding_first (vectype, array_num, perm_mask_high_first, ++ perm_mask_low_first); ++ ++ /* Create interleaving stmt (in the following stages): ++ high = VEC_PERM_EXPR ++ low = VEC_PERM_EXPR */ ++ vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low); ++ ++ for (unsigned int perm_time = 0; perm_time < log_length; perm_time++) ++ { ++ for (unsigned int index = 0; index < length / 2; index++) ++ { ++ tree vect1 = dr_chain[index]; ++ tree vect2 = dr_chain[index + length / 2]; ++ ++ tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); ++ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2, ++ perm_time == 0 ? perm_mask_high_first ++ : perm_mask_high); ++ vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); ++ (*result_chain)[2 * index] = high; ++ ++ tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); ++ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2, ++ perm_time == 0 ? perm_mask_low_first ++ : perm_mask_low); ++ vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); ++ (*result_chain)[2 * index+1] = low; ++ } ++ memcpy (dr_chain.address (), result_chain->address (), ++ length * sizeof (tree)); ++ } ++} ++ + /* Function vect_setup_realignment + + This function is called when vectorizing an unaligned load using +diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +index 476b32370..b616af7d2 100644 +--- a/gcc/tree-vect-slp.c ++++ b/gcc/tree-vect-slp.c +@@ -2414,11 +2414,13 @@ vect_analyze_slp_instance (vec_info *vinfo, + + /* For basic block SLP, try to break the group up into multiples of the + vector size. */ ++ bb_vec_info bb_vinfo = dyn_cast (vinfo); + unsigned HOST_WIDE_INT const_nunits; + if (is_a (vinfo) + && STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_FIRST_ELEMENT (stmt_info) +- && nunits.is_constant (&const_nunits)) ++ && nunits.is_constant (&const_nunits) ++ && !bb_vinfo->transposed) + { + /* We consider breaking the group only on VF boundaries from the existing + start. */ +@@ -2455,6 +2457,884 @@ vect_analyze_slp_instance (vec_info *vinfo, + return false; + } + ++static inline bool ++is_const_assign (stmt_vec_info store_elem) ++{ ++ if (store_elem == NULL) ++ { ++ gcc_unreachable (); ++ } ++ gimple *stmt = store_elem->stmt; ++ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt); ++ return rhs_class == GIMPLE_SINGLE_RHS ++ && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt)); ++} ++ ++/* Push inits to INNERMOST_INITS and check const assign. */ ++ ++static bool ++record_innermost (vec &innermost_inits, ++ vec &innermost_offsets, ++ stmt_vec_info stmt_vinfo) ++{ ++ if (!stmt_vinfo) ++ { ++ return false; ++ } ++ stmt_vec_info next_info = stmt_vinfo; ++ while (next_info) ++ { ++ /* No need to vectorize constant assign in a transposed version. */ ++ if (is_const_assign (next_info)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "no need to vectorize, store is const assign: %G", ++ next_info->stmt); ++ } ++ return false; ++ } ++ innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info)); ++ innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info)); ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ } ++ return true; ++} ++ ++/* Compare inits to INNERMOST_INITS, return FALSE if inits do not match ++ the first grouped_store. 
And check const assign meanwhile. */ ++ ++static bool ++compare_innermost (const vec &innermost_inits, ++ const vec &innermost_offsets, ++ stmt_vec_info stmt_vinfo) ++{ ++ if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size) ++ { ++ return false; ++ } ++ stmt_vec_info next_info = stmt_vinfo; ++ unsigned int i = 0; ++ while (next_info) ++ { ++ if (is_const_assign (next_info)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "no need to vectorize, store is const " ++ "assign: %G", next_info->stmt); ++ } ++ return false; ++ } ++ if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info) ++ || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info)) ++ { ++ return false; ++ } ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ i++; ++ } ++ return true; ++} ++ ++/* Check if grouped stores are of same type. ++ input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt)) ++ output: 0 if same, 1 or -1 else. */ ++ ++static int ++tree_type_cmp (const tree t1, const tree t2) ++{ ++ gcc_checking_assert (t1 != NULL && t2 != NULL); ++ if (t1 != t2) ++ { ++ if (TREE_CODE (t1) != TREE_CODE (t2)) ++ { ++ return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1; ++ } ++ if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2)) ++ { ++ return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1; ++ } ++ if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2)) ++ { ++ return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1; ++ } ++ } ++ return 0; ++} ++ ++/* Check it if 2 grouped stores are of same type that ++ we can analyze them in a transpose group. */ ++static int ++check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2) ++{ ++ if (grp1 == grp2) ++ { ++ return 0; ++ } ++ if (grp1->size != grp2->size) ++ { ++ return grp1->size > grp2->size ? 1 : -1; ++ } ++ tree lhs1 = gimple_assign_lhs (grp1->stmt); ++ tree lhs2 = gimple_assign_lhs (grp2->stmt); ++ if (TREE_CODE (lhs1) != TREE_CODE (lhs2)) ++ { ++ return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1; ++ } ++ tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt)); ++ tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt)); ++ int cmp = tree_type_cmp (grp_type1, grp_type2); ++ return cmp; ++} ++ ++/* Sort grouped stores according to group_size and store_type. ++ output: 0 if same, 1 if grp1 > grp2, -1 otherwise. */ ++ ++static int ++grouped_store_cmp (const void *grp1_, const void *grp2_) ++{ ++ stmt_vec_info grp1 = *(stmt_vec_info *)const_cast(grp1_); ++ stmt_vec_info grp2 = *(stmt_vec_info *)const_cast(grp2_); ++ return check_same_store_type (grp1, grp2); ++} ++ ++/* Transposing is based on permutation in registers. Permutation requires ++ vector length being power of 2 and satisfying the vector mode. */ ++ ++static inline bool ++check_filling_reg (stmt_vec_info current_element) ++{ ++ if (current_element->size == 0) ++ { ++ return false; ++ } ++ /* If the gimple STMT was already vectorized in vect pass, it's unable to ++ conduct transpose analysis, skip it. 
*/ ++ bool lhs_vectorized ++ = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt))) ++ == VECTOR_TYPE; ++ bool rhs_vectorized ++ = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt))) ++ == VECTOR_TYPE; ++ if (lhs_vectorized || rhs_vectorized) ++ { ++ return false; ++ } ++ unsigned int store_precision ++ = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt))); ++ auto_vector_modes vector_modes; ++ targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); ++ unsigned min_mode_size = -1u; ++ for (unsigned i = 0; i < vector_modes.length (); i++) ++ { ++ unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0]; ++ min_mode_size = mode_bit_size < min_mode_size ++ ? mode_bit_size : min_mode_size; ++ } ++ return store_precision != 0 ++ && pow2p_hwi (current_element->size) ++ && (current_element->size * store_precision % min_mode_size == 0); ++} ++ ++/* Check if previous groups are suitable to transpose, if not, set their ++ group number to -1, reduce grp_num and clear current_groups. ++ Otherwise, just clear current_groups. */ ++ ++static void ++check_and_clear_groups (vec current_groups, ++ unsigned int &grp_num) ++{ ++ stmt_vec_info first_element; ++ if (current_groups.length () == 1 ++ || (current_groups.length () != 0 ++ && !pow2p_hwi (current_groups.length ()))) ++ { ++ while (current_groups.length () != 0) ++ { ++ first_element = current_groups.pop (); ++ first_element->group_number = -1; ++ } ++ grp_num--; ++ } ++ else ++ { ++ while (current_groups.length ()) ++ { ++ current_groups.pop (); ++ } ++ } ++} ++ ++/* Set grouped_stores with similar MEM_REF to the same group and mark their ++ grp_num. Groups with same grp_num consist the minimum unit to analyze ++ transpose. Return num of such units. */ ++ ++static unsigned ++vect_prepare_transpose (bb_vec_info bb_vinfo) ++{ ++ stmt_vec_info current_element = NULL; ++ stmt_vec_info first_element = NULL; ++ unsigned int i = 0; ++ unsigned int grp_num = 0; ++ /* Use arrays to record MEM_REF data in different GROUPED_STORES. */ ++ auto_vec innermost_inits; ++ auto_vec innermost_offsets; ++ ++ /* A set of stmt_vec_info with same store type. Analyze them if their size ++ is suitable to transpose. */ ++ auto_vec current_groups; ++ ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element) ++ { ++ /* Compare current grouped_store to the first one if first_element exists, ++ push current_element to current_groups if they are similar on innermost ++ behavior of MEM_REF. */ ++ if (first_element != NULL ++ && !check_same_store_type (first_element, current_element) ++ && compare_innermost (innermost_inits, innermost_offsets, ++ current_element)) ++ { ++ current_groups.safe_push (current_element); ++ current_element->group_number = grp_num; ++ /* If current_element is the last element in grouped_stores, continue ++ will exit the loop and leave the last group unanalyzed. */ ++ if (i == bb_vinfo->grouped_stores.length () - 1) ++ { ++ check_and_clear_groups (current_groups, grp_num); ++ } ++ continue; ++ } ++ check_and_clear_groups (current_groups, grp_num); ++ innermost_inits.release (); ++ innermost_offsets.release (); ++ /* Beginning of a new group to analyze whether they are able to consist ++ a unit to conduct transpose analysis. 
*/ ++ first_element = NULL; ++ if (TREE_CODE (gimple_get_lhs (current_element->stmt)) == ARRAY_REF ++ && check_filling_reg (current_element) ++ && record_innermost (innermost_inits, innermost_offsets, ++ current_element)) ++ { ++ first_element = current_element; ++ current_groups.safe_push (current_element); ++ current_element->group_number = ++grp_num; ++ if (i == bb_vinfo->grouped_stores.length () - 1) ++ { ++ check_and_clear_groups (current_groups, grp_num); ++ } ++ continue; ++ } ++ current_element->group_number = -1; ++ } ++ return grp_num; ++} ++ ++/* Return a flag to transpose grouped stores before building slp tree. ++ Add bool may_transpose in class vec_info. */ ++ ++static bool ++vect_may_transpose (bb_vec_info bb_vinfo) ++{ ++ if (targetm.vectorize.vec_perm_const == NULL) ++ { ++ return false; ++ } ++ if (bb_vinfo->grouped_stores.length () < 2) ++ { ++ return false; ++ } ++ DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp"); ++ /* Sort grouped_stores according to size and type for function ++ vect_prepare_transpose (). */ ++ bb_vinfo->grouped_stores.qsort (grouped_store_cmp); ++ ++ int groups = vect_prepare_transpose (bb_vinfo); ++ BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "%d groups to analyze transposed slp.\n", groups); ++ return groups != 0; ++} ++ ++/* Get the base address of STMT_INFO. */ ++ ++static tree ++get_op_base_address (stmt_vec_info stmt_info) ++{ ++ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); ++ tree op = DR_BASE_ADDRESS (dr); ++ while (TREE_OPERAND_LENGTH (op) > 0) ++ { ++ op = TREE_OPERAND (op, 0); ++ } ++ return op; ++} ++ ++/* Compare the UID of the two stmt_info STMTINFO_A and STMTINFO_B. ++ Sorting them in ascending order. */ ++ ++static int ++dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_) ++{ ++ stmt_vec_info stmtinfo_a ++ = *(stmt_vec_info *) const_cast (stmtinfo_a_); ++ stmt_vec_info stmtinfo_b ++ = *(stmt_vec_info *) const_cast (stmtinfo_b_); ++ ++ /* Stabilize sort. */ ++ if (stmtinfo_a == stmtinfo_b) ++ { ++ return 0; ++ } ++ return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1; ++} ++ ++/* Find the first elements of the grouped loads which are required to merge. */ ++ ++static void ++vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec &visited, ++ vec &res) ++{ ++ unsigned int i = 0; ++ stmt_vec_info merge_first_element = NULL; ++ stmt_vec_info first_element = NULL; ++ tree opa = NULL; ++ unsigned int grp_size_a = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element) ++ { ++ if (visited[i]) ++ { ++ continue; ++ } ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || !pow2p_hwi (DR_GROUP_SIZE (first_element))) ++ { ++ /* Non-conforming grouped load should be grouped separately. */ ++ if (merge_first_element == NULL) ++ { ++ visited[i] = true; ++ res.safe_push (first_element); ++ return; ++ } ++ } ++ if (merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ opa = get_op_base_address (first_element); ++ grp_size_a = DR_GROUP_SIZE (first_element); ++ res.safe_push (first_element); ++ visited[i] = true; ++ continue; ++ } ++ ++ /* If the two first elements are of the same base address and group size, ++ these two grouped loads need to be merged. 
*/ ++ tree opb = get_op_base_address (first_element); ++ unsigned int grp_size_b = DR_GROUP_SIZE (first_element); ++ if (opa == opb && grp_size_a == grp_size_b) ++ { ++ res.safe_push (first_element); ++ visited[i] = true; ++ } ++ } ++} ++ ++/* Merge the grouped loads that are found from ++ vect_slp_grouped_load_find (). */ ++ ++static stmt_vec_info ++vect_slp_grouped_load_merge (vec res) ++{ ++ stmt_vec_info stmt_info = res[0]; ++ if (res.length () == 1) ++ { ++ return stmt_info; ++ } ++ unsigned int i = 0; ++ unsigned int size = DR_GROUP_SIZE (res[0]); ++ unsigned int new_group_size = size * res.length (); ++ stmt_vec_info first_element = NULL; ++ stmt_vec_info merge_first_element = NULL; ++ stmt_vec_info last_element = NULL; ++ FOR_EACH_VEC_ELT (res, i, first_element) ++ { ++ if (merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ last_element = merge_first_element; ++ size = DR_GROUP_SIZE (merge_first_element); ++ } ++ ++ if (last_element != first_element ++ && !DR_GROUP_NEXT_ELEMENT (last_element)) ++ { ++ DR_GROUP_NEXT_ELEMENT (last_element) = first_element; ++ /* Store the gap from the previous member of the group. If there is ++ no gap in the access, DR_GROUP_GAP is always 1. */ ++ DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element); ++ DR_GROUP_GAP (first_element) = 1; ++ } ++ for (stmt_info = first_element; stmt_info; ++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element; ++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info) = new_group_size; ++ last_element = stmt_info; ++ } ++ } ++ DR_GROUP_SIZE (merge_first_element) = new_group_size; ++ DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true; ++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL; ++ return merge_first_element; ++} ++ ++/* Merge the grouped loads which have the same base address and group size. ++ For example, for grouped loads (opa_1, opa_2, opb_1, opb_2): ++ opa_1: a0->a1->a2->a3 ++ opa_2: a8->a9->a10->a11 ++ opb_1: b0->b1 ++ opb_2: b16->b17 ++ we can probably get two merged grouped loads: ++ opa: a0->a1->a2->a3->a8->a9->a10->a11 ++ opb: b0->b1->b16->b17. */ ++ ++static bool ++vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo) ++{ ++ if (bb_vinfo->grouped_loads.length () <= 0) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "The number of grouped loads is 0.\n"); ++ } ++ return false; ++ } ++ bb_vinfo->grouped_loads.qsort (dr_group_cmp); ++ auto_vec visited (bb_vinfo->grouped_loads.length ()); ++ auto_vec grouped_loads_merge; ++ for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++) ++ { ++ visited.safe_push (false); ++ } ++ while (1) ++ { ++ /* Find grouped loads which are required to merge. */ ++ auto_vec res; ++ vect_slp_grouped_load_find (bb_vinfo, visited, res); ++ if (res.is_empty ()) ++ { ++ break; ++ } ++ /* Merge the required grouped loads into one group. 
*/ ++ grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res)); ++ } ++ if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ()) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "No grouped loads need to be merged.\n"); ++ } ++ return false; ++ } ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Merging grouped loads successfully.\n"); ++ } ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).release (); ++ for (unsigned int i = 0; i < grouped_loads_merge.length (); i++) ++ { ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]); ++ } ++ return true; ++} ++ ++/* Find the first elements of the grouped stores ++ which are required to transpose and merge. */ ++ ++static void ++vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec &visited, ++ vec &res) ++{ ++ stmt_vec_info first_element = NULL; ++ stmt_vec_info merge_first_element = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ if (visited[k]) ++ { ++ continue; ++ } ++ /* Non-conforming grouped store should be grouped separately. */ ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || first_element->group_number == -1) ++ { ++ if (merge_first_element == NULL) ++ { ++ visited[k] = true; ++ res.safe_push (first_element); ++ return; ++ } ++ } ++ if (first_element->group_number != -1 ++ && merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ } ++ if (merge_first_element->group_number == first_element->group_number) ++ { ++ visited[k] = true; ++ res.safe_push (first_element); ++ } ++ } ++} ++ ++/* Transpose and merge the grouped stores that are found from ++ vect_slp_grouped_store_find (). */ ++ ++static stmt_vec_info ++vect_slp_grouped_store_transform (vec res) ++{ ++ stmt_vec_info stmt_info = res[0]; ++ if (res.length () == 1) ++ { ++ return stmt_info; ++ } ++ stmt_vec_info rearrange_first_element = stmt_info; ++ stmt_vec_info last_element = rearrange_first_element; ++ ++ unsigned int size = DR_GROUP_SIZE (rearrange_first_element); ++ unsigned int new_group_size = size * res.length (); ++ for (unsigned int i = 1; i < res.length (); i++) ++ { ++ /* Store the gap from the previous member of the group. If there is no ++ gap in the access, DR_GROUP_GAP is always 1. */ ++ DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]); ++ DR_GROUP_GAP (res[i]) = 1; ++ } ++ while (!res.is_empty ()) ++ { ++ stmt_info = res[0]; ++ res.ordered_remove (0); ++ if (DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info)); ++ } ++ DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element; ++ DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info; ++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info) = new_group_size; ++ last_element = stmt_info; ++ } ++ ++ DR_GROUP_SIZE (rearrange_first_element) = new_group_size; ++ DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true; ++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL; ++ return rearrange_first_element; ++} ++ ++/* Save the STMT_INFO in the grouped stores to BB_VINFO_SCALAR_STORES for ++ transposing back grouped stores. */ ++ ++static void ++get_scalar_stores (bb_vec_info bb_vinfo) ++{ ++ unsigned int k = 0; ++ stmt_vec_info first_element = NULL; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ /* Filter the grouped store which is unnecessary for transposing. 
*/ ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || first_element->group_number == -1) ++ { ++ continue; ++ } ++ vec tmp_scalar_store; ++ tmp_scalar_store.create (DR_GROUP_SIZE (first_element)); ++ for (stmt_vec_info stmt_info = first_element; stmt_info; ++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ tmp_scalar_store.safe_push (stmt_info); ++ } ++ BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store); ++ } ++} ++ ++/* Transpose and merge the grouped stores which have the same group number. ++ For example, for grouped stores (opa_0, opa_1, opa_2, opa_3): ++ opa_0: a00->a01->a02->a03 ++ opa_1: a10->a11->a12->a13 ++ opa_2: a20->a21->a22->a23 ++ opa_2: a30->a31->a32->a33 ++ we can probably get the merged grouped store: ++ opa: a00->a10->a20->a30 ++ ->a01->a11->a21->a31 ++ ->a02->a12->a22->a32 ++ ->a03->a13->a23->a33. */ ++ ++static bool ++vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo) ++{ ++ if (bb_vinfo->grouped_stores.length () <= 0) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "The number of grouped stores is 0.\n"); ++ } ++ return false; ++ } ++ ++ bb_vinfo->grouped_stores.qsort (dr_group_cmp); ++ auto_vec grouped_stores_merge; ++ auto_vec visited (bb_vinfo->grouped_stores.length ()); ++ unsigned int i = 0; ++ for (i = 0; i < bb_vinfo->grouped_stores.length (); i++) ++ { ++ visited.safe_push (false); ++ } ++ ++ /* Get scalar stores for the following transposition recovery. */ ++ get_scalar_stores (bb_vinfo); ++ ++ while (1) ++ { ++ /* Find grouped stores which are required to transpose and merge. */ ++ auto_vec res; ++ vect_slp_grouped_store_find (bb_vinfo, visited, res); ++ if (res.is_empty ()) ++ { ++ break; ++ } ++ /* Transpose and merge the required grouped stores into one group. */ ++ grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res)); ++ } ++ ++ BB_VINFO_GROUPED_STORES (bb_vinfo).release (); ++ for (i = 0; i < grouped_stores_merge.length (); i++) ++ { ++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]); ++ } ++ ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Transposing grouped stores successfully.\n"); ++ } ++ return true; ++} ++ ++/* A helpful function of vect_transform_back_slp_grouped_stores (). 
*/ ++ ++static auto_vec ++vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo, ++ stmt_vec_info first_stmt_info) ++{ ++ auto_vec grouped_stores_split; ++ for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++) ++ { ++ vec scalar_tmp = bb_vinfo->scalar_stores[i]; ++ if (scalar_tmp.length () > 1 ++ && scalar_tmp[0]->group_number != first_stmt_info->group_number) ++ { ++ continue; ++ } ++ stmt_vec_info cur_stmt_info = NULL; ++ stmt_vec_info cur_first_stmt_info = NULL; ++ stmt_vec_info last_stmt_info = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info) ++ { ++ if (k == 0) ++ { ++ cur_first_stmt_info = cur_stmt_info; ++ last_stmt_info = cur_stmt_info; ++ } ++ DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info; ++ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info; ++ last_stmt_info = cur_stmt_info; ++ } ++ DR_GROUP_SIZE (cur_first_stmt_info) = k; ++ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL; ++ if (first_stmt_info != cur_first_stmt_info) ++ { ++ DR_GROUP_GAP (cur_first_stmt_info) ++ = DR_GROUP_GAP_TRANS (cur_first_stmt_info); ++ DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false; ++ DR_GROUP_NUMBER (cur_first_stmt_info) = -1; ++ } ++ grouped_stores_split.safe_push (cur_first_stmt_info); ++ } ++ return grouped_stores_split; ++} ++ ++/* Transform the grouped store back. */ ++ ++void ++vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo, ++ stmt_vec_info first_stmt_info) ++{ ++ if (first_stmt_info->group_number == -1) ++ { ++ return; ++ } ++ /* Transform back. */ ++ auto_vec grouped_stores_split ++ = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info); ++ ++ /* Add the remaining grouped stores to grouped_stores_split. */ ++ stmt_vec_info first_element = NULL; ++ unsigned int i = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element) ++ { ++ if (first_element->group_number != first_stmt_info->group_number) ++ { ++ grouped_stores_split.safe_push (first_element); ++ } ++ } ++ DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false; ++ DR_GROUP_NUMBER (first_stmt_info) = -1; ++ BB_VINFO_GROUPED_STORES (bb_vinfo).release (); ++ for (i = 0; i < grouped_stores_split.length (); i++) ++ { ++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]); ++ } ++} ++ ++/* Function check_for_slp_vectype ++ ++ Restriction for grouped stores by checking their vectype. ++ If the vectype of the grouped store is changed, it need transform back. ++ If all grouped stores need to be transformed back, return FALSE. */ ++ ++static bool ++check_for_slp_vectype (bb_vec_info bb_vinfo) ++{ ++ stmt_vec_info first_element = NULL; ++ unsigned int i = 0; ++ int count = 0; ++ auto_vec grouped_stores_check; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element) ++ { ++ grouped_stores_check.safe_push (first_element); ++ } ++ FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element) ++ { ++ if (STMT_VINFO_GROUPED_ACCESS (first_element) ++ && first_element->group_number != -1) ++ { ++ unsigned int group_size_b ++ = DR_GROUP_SIZE_TRANS (first_element); ++ tree vectype = STMT_VINFO_VECTYPE (first_element); ++ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); ++ if (nunits.to_constant () > group_size_b) ++ { ++ count++; ++ /* If the vectype is changed, this grouped store need ++ to be transformed back. 
*/ ++ vect_transform_back_slp_grouped_stores (bb_vinfo, first_element); ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "No supported: only supported for" ++ " group_size geq than nunits.\n"); ++ } ++ } ++ } ++ } ++ if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo)) ++ { ++ return false; ++ } ++ return true; ++} ++ ++/* Function check_for_dr_alignment ++ ++ Check the alignment of the slp instance loads. ++ Return FALSE if a load cannot be vectorized. */ ++ ++static bool ++check_for_dr_alignment (slp_instance instance) ++{ ++ slp_tree node = NULL; ++ unsigned int i = 0; ++ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) ++ { ++ stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; ++ dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); ++ enum dr_alignment_support supportable_dr_alignment ++ = vect_supportable_dr_alignment (first_dr_info, false); ++ if (supportable_dr_alignment == dr_explicit_realign_optimized ++ || supportable_dr_alignment == dr_explicit_realign) ++ { ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Initialize slp_transpose flag before transposing. */ ++ ++static void ++init_stmt_info_slp_transpose (bb_vec_info bb_vinfo) ++{ ++ stmt_vec_info first_element = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ if (STMT_VINFO_GROUPED_ACCESS (first_element)) ++ { ++ DR_GROUP_SLP_TRANSPOSE (first_element) = false; ++ } ++ } ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element) ++ { ++ if (STMT_VINFO_GROUPED_ACCESS (first_element)) ++ { ++ DR_GROUP_SLP_TRANSPOSE (first_element) = false; ++ } ++ } ++} ++ ++/* Analyze and transpose the stmts before building the SLP tree. */ ++ ++static bool ++vect_analyze_transpose (bb_vec_info bb_vinfo) ++{ ++ DUMP_VECT_SCOPE ("vect_analyze_transpose"); ++ ++ if (!vect_may_transpose (bb_vinfo)) ++ { ++ return false; ++ } ++ ++ /* For basic block SLP, try to merge the grouped stores and loads ++ into one group. */ ++ init_stmt_info_slp_transpose (bb_vinfo); ++ if (vect_transform_slp_grouped_stores (bb_vinfo) ++ && vect_merge_slp_grouped_loads (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis succeeded with SLP transposed.\n"); ++ } ++ return true; ++ } ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis failed with SLP transposed.\n"); ++ } ++ return false; ++} + + /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP + trees of packed scalar stmts if SLP is possible. */ +@@ -3124,7 +4004,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) + + vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; + +- if (dump_enabled_p ()) ++ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost; ++ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost; ++ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost; ++ ++ if (!unlimited_cost_model (NULL) && dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); + dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n", +@@ -3239,6 +4123,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + + vect_pattern_recog (bb_vinfo); + ++ /* Transpose grouped stores and loads for better vectorizable version. 
*/ ++ if (bb_vinfo->transposed) ++ { ++ if (!vect_analyze_transpose (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: unhandled slp transposed in " ++ "basic block.\n"); ++ } ++ return false; ++ } ++ } ++ bb_vinfo->before_slp = true; ++ + /* Check the SLP opportunities in the basic block, analyze and build SLP + trees. */ + if (!vect_analyze_slp (bb_vinfo, n_stmts)) +@@ -3254,6 +4154,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + return false; + } + ++ /* Check if the vectype is suitable for SLP transposed. */ ++ if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "Failed to SLP transposed in the basic block.\n"); ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: vectype is not suitable for " ++ "SLP transposed in basic block.\n"); ++ } ++ return false; ++ } ++ + vect_record_base_alignments (bb_vinfo); + + /* Analyze and verify the alignment of data references and the +@@ -3286,6 +4200,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) + return false; + ++ /* Check if the alignment is suitable for SLP transposed. */ ++ if (bb_vinfo->transposed) ++ { ++ for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++) ++ { ++ if (!check_for_dr_alignment (instance)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "Failed to SLP transposed in the basic " ++ "block.\n"); ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: alignment is not suitable " ++ "for SLP transposed in basic block.\n"); ++ } ++ return false; ++ } ++ } ++ } ++ + if (!vect_slp_analyze_operations (bb_vinfo)) + { + if (dump_enabled_p ()) +@@ -3311,6 +4246,83 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + return true; + } + ++static bool ++may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori) ++{ ++ /* If the flag is false or the slp analysis is broken before ++ vect_analyze_slp, we don't try to analyze the transposed SLP version. */ ++ if (!flag_tree_slp_transpose_vectorize ++ || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori)) ++ { ++ return false; ++ } ++ ++ /* If the original bb_vinfo can't be vectorized, try to new a bb_vinfo ++ of the transposed version. */ ++ if (!res_ori) ++ { ++ return true; ++ } ++ ++ /* Caculate the cost of the original bb_vinfo. */ ++ if (unlimited_cost_model (NULL)) ++ { ++ vect_bb_vectorization_profitable_p (bb_vinfo_ori); ++ } ++ /* If the vec cost and scalar cost are not much difference (here we set the ++ threshold to 4), we try to new a bb_vinfo of the transposed version. */ ++ if (BB_VINFO_SCALAR_COST (bb_vinfo_ori) ++ < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori) ++ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori))) ++ { ++ return true; ++ } ++ return false; ++} ++ ++static bool ++may_chose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans, ++ bb_vec_info bb_vinfo_ori, bool res_ori) ++{ ++ /* The original bb_vinfo is chosen if the transposed bb_vinfo ++ can't be vectorized. */ ++ if (!res_trans) ++ { ++ return false; ++ } ++ /* Caculate the cost of the transposed bb_vinfo. 
*/ ++ if (unlimited_cost_model (NULL)) ++ { ++ vect_bb_vectorization_profitable_p (bb_vinfo_trans); ++ } ++ int diff_bb_cost = -1; ++ int diff_bb_cost_trans = -1; ++ if (res_ori) ++ { ++ diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori) ++ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori) ++ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori); ++ } ++ if (res_trans) ++ { ++ diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans) ++ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans) ++ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans); ++ } ++ /* The original bb_vinfo is chosen when one of the following conditions ++ is satisfied as follows: ++ 1) The cost of original version is better transposed version. ++ 2) The vec cost is similar to scalar cost in the transposed version. */ ++ if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans) ++ || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans) ++ < (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans) ++ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans)))) ++ { ++ return false; ++ } ++ return true; ++} ++ + /* Subroutine of vect_slp_bb. Try to vectorize the statements between + REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true + on success. The region has N_STMTS statements and has the datarefs +@@ -3323,6 +4335,7 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + unsigned int n_stmts) + { + bb_vec_info bb_vinfo; ++ bb_vec_info bb_vinfo_trans = NULL; + auto_vector_modes vector_modes; + + /* Autodetect first vector size we try. */ +@@ -3337,6 +4350,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + { + bool vectorized = false; + bool fatal = false; ++ bool res_bb_vinfo_ori = false; ++ bool res_bb_vinfo_trans = false; ++ ++ /* New a bb_vinfo of the original version. */ + bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared); + + bool first_time_p = shared.datarefs.is_empty (); +@@ -3346,8 +4363,57 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + else + bb_vinfo->shared->check_datarefs (); + bb_vinfo->vector_mode = next_vector_mode; ++ bb_vinfo->transposed = false; ++ bb_vinfo->before_slp = false; + +- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal) ++ res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal); ++ /* Analyze and new a transposed bb_vinfo. 
*/ ++ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori)) ++ { ++ bool fatal_trans = false; ++ bb_vinfo_trans ++ = new _bb_vec_info (region_begin, region_end, &shared); ++ bool first_time_p = shared.datarefs.is_empty (); ++ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs; ++ if (first_time_p) ++ { ++ bb_vinfo_trans->shared->save_datarefs (); ++ } ++ else ++ { ++ bb_vinfo_trans->shared->check_datarefs (); ++ } ++ bb_vinfo_trans->vector_mode = next_vector_mode; ++ bb_vinfo_trans->transposed = true; ++ bb_vinfo_trans->before_slp = false; ++ ++ res_bb_vinfo_trans ++ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans); ++ if (may_chose_transpose_bbvinfo (bb_vinfo_trans, ++ res_bb_vinfo_trans, ++ bb_vinfo, res_bb_vinfo_ori)) ++ { ++ bb_vinfo = bb_vinfo_trans; ++ fatal = fatal_trans; ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "using transposed version.\n"); ++ } ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "using original version.\n"); ++ } ++ } ++ } ++ ++ if ((res_bb_vinfo_ori || res_bb_vinfo_trans) + && dbg_cnt (vect_slp)) + { + if (dump_enabled_p ()) +@@ -3400,6 +4466,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, + } + + delete bb_vinfo; ++ if (bb_vinfo_trans) ++ { ++ bb_vinfo_trans = NULL; ++ } + + if (mode_i < vector_modes.length () + && VECTOR_MODE_P (autodetected_vector_mode) +diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 6418edb52..b872cfc8d 100644 +--- a/gcc/tree-vect-stmts.c ++++ b/gcc/tree-vect-stmts.c +@@ -7329,6 +7329,153 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + return true; + } + ++/* Function vect_permute_store_chains ++ ++ Call function vect_permute_store_chain (). ++ Given a chain of interleaved stores in DR_CHAIN, generate ++ interleave_high/low stmts to reorder the data correctly. ++ Return the final references for stores in RESULT_CHAIN. */ ++ ++static void ++vect_permute_store_chains (vec dr_chain, unsigned int num_each, ++ stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ++ vec *result_chain, unsigned int group) ++{ ++ unsigned int k = 0; ++ unsigned int t = 0; ++ ++ /* Divide vectors into GROUP parts. And permute every NUM_EACH vectors ++ together. */ ++ for (k = 0; k < group; k++) ++ { ++ auto_vec dr_chain_transposed (num_each); ++ auto_vec result_chain_transposed (num_each); ++ for (t = k; t < dr_chain.length (); t = t + group) ++ { ++ dr_chain_transposed.quick_push (dr_chain[t]); ++ } ++ vect_permute_store_chain (dr_chain_transposed, num_each, stmt_info, ++ gsi, &result_chain_transposed); ++ for (t = 0; t < num_each; t++) ++ { ++ result_chain->quick_push (result_chain_transposed[t]); ++ } ++ } ++} ++ ++/* Function transpose_oprnd_store ++ ++ Calculate the transposed results from VEC_OPRNDS (VEC_STMT) ++ for vectorizable_store. */ ++ ++static void ++transpose_oprnd_store (vecvec_oprnds, vec *result_chain, ++ unsigned int vec_num, unsigned int const_nunits, ++ unsigned int array_num, stmt_vec_info first_stmt_info, ++ gimple_stmt_iterator *gsi) ++{ ++ unsigned int group_for_transform = 0; ++ unsigned int num_each = 0; ++ ++ /* Transpose back for vec_oprnds. */ ++ /* vec = {vec1, vec2, ...} */ ++ if (array_num < const_nunits ++ && const_nunits % array_num == 0) ++ { ++ vect_transpose_store_chain (vec_oprnds, ++ vec_num, array_num, ++ first_stmt_info, ++ gsi, result_chain); ++ } ++ /* vec1 = {vec_part1}, vec2 = {vec_part2}, ... 
*/ ++ else if (array_num >= const_nunits ++ && array_num % const_nunits == 0) ++ { ++ group_for_transform = array_num / const_nunits; ++ num_each = vec_oprnds.length () / group_for_transform; ++ vect_permute_store_chains (vec_oprnds, ++ num_each, first_stmt_info, ++ gsi, result_chain, ++ group_for_transform); ++ } ++ else ++ { ++ gcc_unreachable (); ++ } ++} ++ ++static dr_vec_info * ++get_dr_info (stmt_vec_info stmt_info) ++{ ++ dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); ++ if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED) ++ { ++ SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); ++ } ++ return dr_info; ++} ++ ++static unsigned ++dr_align_vect_store (dr_vec_info *cur_first_dr_info, ++ unsigned HOST_WIDE_INT &align) ++{ ++ unsigned misalign = 0; ++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); ++ if (aligned_access_p (cur_first_dr_info)) ++ { ++ return misalign; ++ } ++ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) ++ { ++ align = dr_alignment (vect_dr_behavior (cur_first_dr_info)); ++ } ++ else ++ { ++ misalign = DR_MISALIGNMENT (cur_first_dr_info); ++ } ++ return misalign; ++} ++ ++static stmt_vec_info ++add_new_stmt_vect_store (tree vectype, tree dataref_ptr, tree dataref_offset, ++ tree ref_type, dr_vec_info *cur_first_dr_info, ++ tree vec_oprnd, gimple_stmt_iterator *gsi, ++ stmt_vec_info stmt_info) ++{ ++ /* Data align. */ ++ unsigned HOST_WIDE_INT align; ++ unsigned misalign = dr_align_vect_store (cur_first_dr_info, align); ++ ++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) ++ { ++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); ++ } ++ ++ /* Get data_ref. */ ++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); ++ tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset); ++ if (aligned_access_p (cur_first_dr_info)) ++ { ++ ; ++ } ++ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) ++ { ++ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), ++ align * BITS_PER_UNIT); ++ } ++ else ++ { ++ tree elem_type = TREE_TYPE (vectype); ++ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), ++ TYPE_ALIGN (elem_type)); ++ } ++ /* Add new stmt. */ ++ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr)); ++ gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd); ++ stmt_vec_info new_stmt_info ++ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ return new_stmt_info; ++} + + /* Function vectorizable_store. + +@@ -8208,6 +8355,16 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + vect_get_gather_scatter_ops (loop, stmt_info, &gs_info, + &dataref_ptr, &vec_offset); ++ /* If the stmt_info need to be transposed recovery, dataref_ptr ++ will be caculated later. */ ++ else if (memory_access_type == VMAT_CONTIGUOUS ++ && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE ( ++ DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ dataref_ptr = NULL_TREE; ++ } + else + dataref_ptr + = vect_create_data_ref_ptr (first_stmt_info, aggr_type, +@@ -8299,6 +8456,75 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + else + { ++ /* group_size: the size of group after transposing and merging. ++ group_size_b: the size of group before transposing and merging, ++ and only group_size_b >= const_nunits is supported. ++ array_num: the number of arrays. 
++ const_nunits: TYPE_VECTOR_SUBPARTS (vectype). ++ ncontinues: group_size_b / const_nunits, it means the number of ++ times an array is stored in memory. */ ++ if (slp && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "vectorizable_store for slp transpose.\n"); ++ } ++ /* Transpose back for grouped stores. */ ++ vect_transform_back_slp_grouped_stores (bb_vinfo, ++ first_stmt_info); ++ ++ result_chain.create (vec_oprnds.length ()); ++ unsigned int const_nunits = nunits.to_constant (); ++ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info); ++ unsigned int array_num = group_size / group_size_b; ++ transpose_oprnd_store (vec_oprnds, &result_chain, vec_num, ++ const_nunits, array_num, ++ first_stmt_info, gsi); ++ ++ /* For every store group, not for every vec, because transposing ++ and merging have changed the data reference access. */ ++ gcc_assert (group_size_b >= const_nunits); ++ unsigned int ncontinues = group_size_b / const_nunits; ++ ++ unsigned int k = 0; ++ for (i = 0; i < array_num; i++) ++ { ++ stmt_vec_info first_stmt_b; ++ BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b); ++ bool simd_lane_access_p ++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0; ++ tree ref_type = get_group_alias_ptr_type (first_stmt_b); ++ dataref_ptr = vect_create_data_ref_ptr ( ++ first_stmt_b, aggr_type, ++ simd_lane_access_p ? loop : NULL, ++ offset, &dummy, gsi, &ptr_incr, ++ simd_lane_access_p, NULL_TREE, bump); ++ dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b); ++ for (unsigned int t = 0; t < ncontinues; t++) ++ { ++ vec_oprnd = result_chain[k]; ++ k++; ++ if (t > 0) ++ { ++ /* Bump the vector pointer. */ ++ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, ++ gsi, first_stmt_b, ++ bump); ++ } ++ new_stmt_info = add_new_stmt_vect_store ( ++ vectype, dataref_ptr, dataref_offset, ++ ref_type, cur_first_dr_info, vec_oprnd, ++ gsi, first_stmt_b); ++ } ++ } ++ oprnds.release (); ++ result_chain.release (); ++ vec_oprnds.release (); ++ return true; ++ } + new_stmt_info = NULL; + if (grouped_store) + { +@@ -8557,6 +8783,447 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop) + return true; + } + ++static tree ++calculate_new_type (tree vectype, unsigned int const_nunits, ++ unsigned int group_size_b, unsigned int &nloads, ++ unsigned int &ncontinues, tree &lvectype) ++{ ++ tree ltype = TREE_TYPE (vectype); ++ /* nloads is the number of ARRAYs in a vector. ++ vectemp = {a[], b[], ...} */ ++ if (group_size_b < const_nunits) ++ { ++ tree ptype; ++ tree vtype ++ = vector_vector_composition_type (vectype, ++ const_nunits / group_size_b, ++ &ptype); ++ if (vtype != NULL_TREE) ++ { ++ nloads = const_nunits / group_size_b; ++ lvectype = vtype; ++ ltype = ptype; ++ ncontinues = 1; ++ } ++ } ++ /* ncontinues is the number of vectors from an ARRAY. ++ vectemp1 = {a[0], a[1], ...} ++ ... ++ vectempm = {a[k], a[k+1], ...} */ ++ else ++ { ++ nloads = 1; ++ ltype = vectype; ++ ncontinues = group_size_b / const_nunits; ++ } ++ ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype))); ++ return ltype; ++} ++ ++static void ++generate_old_load_permutations (slp_tree slp_node, unsigned int group_size, ++ vec &old_load_permutation) ++{ ++ /* Generate the old load permutations from the slp_node. 
*/ ++ unsigned i = 0; ++ unsigned k = 0; ++ ++ /* If SLP_NODE has load_permutation, we copy it to old_load_permutation. ++ Otherwise, we generate a permutation sequentially. */ ++ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) ++ { ++ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k) ++ { ++ old_load_permutation.safe_push (k); ++ } ++ } ++ else ++ { ++ for (unsigned i = 0; i < group_size; i++) ++ { ++ old_load_permutation.safe_push (i); ++ } ++ } ++} ++ ++static void ++generate_new_load_permutation_mapping (unsigned slp_node_length, ++ vec &group_idx, ++ const vec &load_permutation, ++ unsigned int group_size_b, ++ unsigned &new_group_size, ++ vec &group_from) ++{ ++ /* group_num_vec: only stores the group_loads IDs which are caculated from ++ load_permutation. */ ++ auto_vec group_num_vec; ++ ++ /* Caculate which group_loads are the stmts in SLP_NODE from. */ ++ unsigned i = 0; ++ unsigned k = 0; ++ FOR_EACH_VEC_ELT (load_permutation, i, k) ++ { ++ unsigned int t0 = k / group_size_b; ++ if (!group_num_vec.contains (t0)) ++ { ++ group_num_vec.safe_push (t0); ++ } ++ group_from.safe_push (t0); ++ } ++ group_num_vec.qsort (cmp_for_group_num); ++ /* n_groups: the number of group_loads. */ ++ unsigned int n_groups = group_num_vec.length (); ++ new_group_size = n_groups * group_size_b; ++ for (i = 0; i < n_groups; i++) ++ { ++ group_idx.safe_push (group_num_vec[i] * group_size_b); ++ } ++ /* A new mapping from group_ind_vec to group_from. ++ For example: ++ Origin: group_from = {1,1,3,3,5,5,7,7}; ++ After mapping: group_from = {0,0,1,1,2,2,2,2}; */ ++ auto_vec group_ind_vec (n_groups); ++ for (k = 0; k < n_groups; k++) ++ { ++ group_ind_vec.safe_push (k); ++ } ++ for (i = 0; i < slp_node_length; i++) ++ { ++ for (k = 0; k < n_groups; k++) ++ { ++ if (group_from[i] == group_num_vec[k]) ++ { ++ group_from[i] = group_ind_vec[k]; ++ break; ++ } ++ } ++ } ++} ++ ++static void ++generate_new_load_permutation (vec &new_load_permutation, ++ const vec &old_load_permutation, ++ slp_tree slp_node, bool &this_load_permuted, ++ const vec &group_from, ++ unsigned int group_size_b) ++{ ++ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length (); ++ /* Generate the new load permutation from the new mapping. */ ++ new_load_permutation.create (slp_node_length); ++ unsigned i = 0; ++ unsigned k = 0; ++ FOR_EACH_VEC_ELT (old_load_permutation, i, k) ++ { ++ /* t1 is the new permutation of k in the old permutation. ++ t1 = base_address + offset: ++ base_address = group_from[i] * group_size_b; ++ offset = k % group_size_b. */ ++ unsigned int t1 ++ = group_from[i] * group_size_b + k % group_size_b; ++ new_load_permutation.safe_push (t1); ++ if (t1 != k) ++ { ++ this_load_permuted = true; ++ } ++ } ++} ++ ++static bool ++is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits, ++ unsigned int group_size, stmt_vec_info first_stmt_info) ++{ ++ /* Calculate the unrolling factor based on the smallest type. */ ++ poly_uint64 unrolling_factor ++ = exact_div (common_multiple (nunits, group_size), group_size); ++ /* The load requires permutation when unrolling exposes ++ a gap either because the group is larger than the SLP ++ group-size or because there is a gap between the groups. 
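Editorial sketch (not part of the patch): the remapping done by generate_new_load_permutation_mapping and generate_new_load_permutation above boils down to two steps, first map every old permutation index to a dense group id, then rebuild the index as t1 = group_from[i] * group_size_b + k % group_size_b. The inputs below are made up for illustration; the real code works on vec<unsigned> and stmt groups.

  /* Standalone model of the index arithmetic described above.
     Inputs are illustrative, not taken from any real loop.  */
  #include <stdio.h>

  #define N 6
  #define GROUP_SIZE_B 2

  int main (void)
  {
    /* Hypothetical old load permutation: elements 2,3 / 6,7 / 14,15.  */
    unsigned old_perm[N] = { 2, 3, 6, 7, 14, 15 };
    unsigned group_from[N];    /* dense group id for every element  */
    unsigned groups_seen[N];   /* distinct original group ids  */
    unsigned n_groups = 0;

    for (unsigned i = 0; i < N; i++)
      {
        unsigned g = old_perm[i] / GROUP_SIZE_B;  /* original group id  */
        unsigned j;
        for (j = 0; j < n_groups; j++)
          if (groups_seen[j] == g)
            break;
        if (j == n_groups)
          groups_seen[n_groups++] = g;  /* the patch qsorts these ids; here
                                           they already arrive ascending  */
        group_from[i] = j;              /* dense ids: 0,0,1,1,2,2  */
      }

    /* New permutation: base_address + offset.  */
    for (unsigned i = 0; i < N; i++)
      {
        unsigned t1 = group_from[i] * GROUP_SIZE_B
                      + old_perm[i] % GROUP_SIZE_B;
        printf ("%u -> %u\n", old_perm[i], t1);  /* 2->0 3->1 6->2 ... 15->5  */
      }
    return 0;
  }

With these inputs the remapped permutation is the identity, which is exactly the case where this_load_permuted stays false and the is_slp_perm check that follows can avoid an explicit permutation.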
*/ ++ if (!slp_perm && !this_load_permuted ++ && (known_eq (unrolling_factor, 1U) ++ || (group_size == DR_GROUP_SIZE (first_stmt_info) ++ && DR_GROUP_GAP (first_stmt_info) == 0))) ++ { ++ return false; ++ } ++ else ++ { ++ return true; ++ } ++} ++ ++static void ++generate_load_permutation (slp_tree slp_node, unsigned &new_group_size, ++ unsigned int group_size, unsigned int group_size_b, ++ bool &this_load_permuted, vec &group_idx, ++ vec &new_load_permutation) ++{ ++ /* Generate the old load permutations from SLP_NODE. */ ++ vec old_load_permutation; ++ old_load_permutation.create (group_size); ++ generate_old_load_permutations (slp_node, group_size, old_load_permutation); ++ ++ /* Caculate which group_loads are the stmts in SLP_NODE from. */ ++ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length (); ++ /* group_from: stores the group_loads ID for every stmt in SLP_NODE. */ ++ vec group_from; ++ group_from.create (slp_node_length); ++ generate_new_load_permutation_mapping (slp_node_length, group_idx, ++ old_load_permutation, ++ group_size_b, new_group_size, ++ group_from); ++ ++ /* Generate the new load permutation from the new mapping and caculate ++ this_load_permuted flag. If this_load_permuted is true, we need execute ++ slp permutation by using new load permutation. */ ++ generate_new_load_permutation (new_load_permutation, old_load_permutation, ++ slp_node, this_load_permuted, group_from, ++ group_size_b); ++ old_load_permutation.release (); ++ group_from.release (); ++} ++ ++static unsigned int ++dr_align_vect_load (dr_vec_info *cur_first_dr_info, ++ unsigned HOST_WIDE_INT &align, ++ enum dr_alignment_support alignment_support_scheme) ++{ ++ unsigned int misalign = 0; ++ ++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ gcc_assert (aligned_access_p (cur_first_dr_info)); ++ } ++ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) ++ { ++ align = dr_alignment (vect_dr_behavior (cur_first_dr_info)); ++ } ++ else ++ { ++ misalign = DR_MISALIGNMENT (cur_first_dr_info); ++ } ++ return misalign; ++} ++ ++static stmt_vec_info ++add_new_stmt_vect_load (tree vectype, tree dataref_ptr, tree dataref_offset, ++ tree ref_type, tree ltype, gassign *(&new_stmt), ++ dr_vec_info *cur_first_dr_info, ++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) ++{ ++ /* Data align. */ ++ enum dr_alignment_support alignment_support_scheme ++ = vect_supportable_dr_alignment (cur_first_dr_info, false); ++ unsigned HOST_WIDE_INT align; ++ unsigned int misalign = dr_align_vect_load (cur_first_dr_info, align, ++ alignment_support_scheme); ++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) ++ { ++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); ++ } ++ ++ /* Get data_ref. */ ++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); ++ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ ; ++ } ++ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) ++ { ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT); ++ } ++ else ++ { ++ tree elem_type = TREE_TYPE (vectype); ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type)); ++ } ++ ++ /* Add new stmt. 
*/ ++ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr)); ++ new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref); ++ stmt_vec_info new_stmt_info ++ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ return new_stmt_info; ++} ++ ++static void ++push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info, ++ vec &dr_chain, slp_tree slp_node) ++{ ++ if (slp_perm) ++ { ++ dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt)); ++ } ++ else ++ { ++ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); ++ } ++} ++ ++static stmt_vec_info ++get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info, ++ unsigned int group_el, ++ unsigned int group_size) ++{ ++ stmt_vec_info last_stmt_info = first_stmt_info; ++ unsigned int count = 0; ++ gcc_assert (group_el < group_size); ++ while (count < group_el) ++ { ++ last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info); ++ count++; ++ } ++ return last_stmt_info; ++} ++ ++static stmt_vec_info ++add_new_stmt_for_nloads_greater_than_one (tree lvectype, tree vectype, ++ vec *v, ++ stmt_vec_info stmt_info, ++ gimple_stmt_iterator *gsi) ++{ ++ tree vec_inv = build_constructor (lvectype, v); ++ tree new_temp = vect_init_vector (stmt_info, vec_inv, lvectype, gsi); ++ vec_info *vinfo = stmt_info->vinfo; ++ stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp); ++ if (lvectype != vectype) ++ { ++ gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype), ++ VIEW_CONVERT_EXPR, ++ build1 (VIEW_CONVERT_EXPR, ++ vectype, new_temp)); ++ new_stmt_info = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ } ++ return new_stmt_info; ++} ++ ++/* Function new_vect_stmt_for_nloads. ++ ++ New a VEC_STMT when nloads Arrays are merged into a vector. ++ ++ ncopies is the number of vectors that need to be loaded from memmory. ++ nloads is the number of ARRAYs in a vector. ++ vectemp = {a[], b[], ...} */ ++ ++static void ++new_vect_stmt_for_nloads (unsigned int ncopies, unsigned int nloads, ++ vec group_idx, stmt_vec_info stmt_info, ++ offset_info *offset_info, vectype_info *vectype_info, ++ vect_memory_access_type memory_access_type, ++ bool slp_perm, vec& dr_chain, slp_tree slp_node, ++ gimple_stmt_iterator *gsi) ++{ ++ vec *v = NULL; ++ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); ++ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info); ++ stmt_vec_info first_stmt_info_b = NULL; ++ stmt_vec_info new_stmt_info = NULL; ++ tree dataref_ptr = NULL_TREE; ++ tree dummy; ++ gimple *ptr_incr = NULL; ++ unsigned int n = 0; ++ for (unsigned int i = 0; i < ncopies; i++) ++ { ++ vec_alloc (v, nloads); ++ for (unsigned int t = 0; t < nloads; t++) ++ { ++ first_stmt_info_b = get_first_stmt_info_before_transpose ( ++ first_stmt_info, group_idx[n++], group_size); ++ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b); ++ tree bump = vect_get_data_ptr_increment (cur_first_dr_info, ++ vectype_info->ltype, ++ memory_access_type); ++ bool simd_lane_access_p ++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0; ++ ++ /* Create dataref_ptr which is point to init_address. 
*/ ++ dataref_ptr = vect_create_data_ref_ptr ( ++ first_stmt_info_b, vectype_info->ltype, NULL, ++ offset_info->offset, &dummy, gsi, &ptr_incr, ++ simd_lane_access_p, offset_info->byte_offset, bump); ++ ++ gassign *new_stmt = NULL; ++ new_stmt_info = add_new_stmt_vect_load ( ++ vectype_info->vectype, dataref_ptr, ++ offset_info->dataref_offset, vectype_info->ref_type, ++ vectype_info->ltype, new_stmt, cur_first_dr_info, ++ gsi, first_stmt_info_b); ++ ++ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt)); ++ } ++ new_stmt_info = add_new_stmt_for_nloads_greater_than_one ( ++ vectype_info->lvectype, vectype_info->vectype, ++ v, first_stmt_info_b, gsi); ++ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info, ++ dr_chain, slp_node); ++ } ++} ++ ++/* Function new_vect_stmt_for_ncontinues. ++ ++ New a VEC_STMTs when an Array is divided into several vectors. ++ ++ n_groups is the number of ARRAYs. ++ ncontinues is the number of vectors from an ARRAY. ++ vectemp1 = {a[0], a[1], ...} ++ ... ++ vectempm = {a[k], a[k+1], ...} */ ++ ++static void ++new_vect_stmt_for_ncontinues (unsigned int ncontinues, vec group_idx, ++ stmt_vec_info stmt_info, offset_info* offset_info, ++ vectype_info* vectype_info, ++ vect_memory_access_type memory_access_type, ++ bool slp_perm, vec& dr_chain, ++ slp_tree slp_node, ++ gimple_stmt_iterator *gsi) ++{ ++ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); ++ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info); ++ stmt_vec_info new_stmt_info = NULL; ++ tree dataref_ptr = NULL_TREE; ++ tree dummy; ++ gimple *ptr_incr = NULL; ++ unsigned int n_groups = group_idx.length (); ++ for (unsigned int i = 0; i < n_groups; i++) ++ { ++ stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose ( ++ first_stmt_info, group_idx[i], group_size); ++ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b); ++ tree bump = vect_get_data_ptr_increment (cur_first_dr_info, ++ vectype_info->ltype, memory_access_type); ++ bool simd_lane_access_p ++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0; ++ for (unsigned int k = 0; k < ncontinues; k++) ++ { ++ /* Create dataref_ptr which is point to init_address. */ ++ if (k == 0) ++ { ++ dataref_ptr = vect_create_data_ref_ptr ( ++ first_stmt_info_b, vectype_info->ltype, NULL, ++ offset_info->offset, &dummy, gsi, &ptr_incr, ++ simd_lane_access_p, offset_info->byte_offset, bump); ++ } ++ else ++ { ++ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, ++ gsi, first_stmt_info_b, bump); ++ } ++ gassign *new_stmt = NULL; ++ new_stmt_info = add_new_stmt_vect_load ( ++ vectype_info->vectype, dataref_ptr, ++ offset_info->dataref_offset, vectype_info->ref_type, ++ vectype_info->ltype, new_stmt, cur_first_dr_info, ++ gsi, first_stmt_info_b); ++ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info, ++ dr_chain, slp_node); ++ } ++ } ++} ++ + /* vectorizable_load. 
+ + Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure) +@@ -9364,6 +10031,9 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + tree vec_mask = NULL_TREE; + prev_stmt_info = NULL; + poly_uint64 group_elt = 0; ++ unsigned new_group_size = 0; ++ vec new_load_permutation; ++ + for (j = 0; j < ncopies; j++) + { + stmt_vec_info new_stmt_info = NULL; +@@ -9385,6 +10055,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr)); + dataref_offset = build_int_cst (ref_type, 0); + } ++ /* If the stmt_info need to be transposed recovery, dataref_ptr ++ will be caculated later. */ ++ else if (slp && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE ( ++ DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ dataref_ptr = NULL_TREE; ++ } + else if (diff_first_stmt_info) + { + dataref_ptr +@@ -9501,6 +10180,63 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + /* Record that VEC_ARRAY is now dead. */ + vect_clobber_variable (stmt_info, gsi, vec_array); + } ++ else if (slp && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "vectorizable_load for slp transpose.\n"); ++ } ++ /* group_size: the size of group after merging. ++ group_size_b: the size of group before merging. ++ const_nunits: TYPE_VECTOR_SUBPARTS (vectype), it is the number of ++ elements in a vector. ++ nloads: const_nunits / group_size_b or 1, it means the number ++ of ARRAYs in a vector. ++ ncontinues: group_size_b / const_nunits or 1, it means the number ++ of vectors from an ARRAY. */ ++ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info); ++ unsigned int const_nunits = nunits.to_constant (); ++ unsigned int nloads = const_nunits; ++ unsigned int ncontinues = group_size_b; ++ tree lvectype = vectype; ++ tree ltype = calculate_new_type (vectype, const_nunits, ++ group_size_b, nloads, ++ ncontinues, lvectype); ++ bool this_load_permuted = false; ++ auto_vec group_idx; ++ generate_load_permutation (slp_node, new_group_size, group_size, ++ group_size_b, this_load_permuted, ++ group_idx, new_load_permutation); ++ slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits, ++ group_size, first_stmt_info); ++ ++ /* ncopies: the number of vectors that need to be loaded from ++ memmory. */ ++ unsigned int ncopies = new_group_size / const_nunits; ++ offset_info offset_info = {offset, byte_offset, dataref_offset}; ++ vectype_info vectype_info = {vectype, ltype, lvectype, ref_type}; ++ if (slp_perm) ++ { ++ dr_chain.create (ncopies); ++ } ++ if (nloads > 1 && ncontinues == 1) ++ { ++ new_vect_stmt_for_nloads (ncopies, nloads, group_idx, stmt_info, ++ &offset_info, &vectype_info, ++ memory_access_type, slp_perm, dr_chain, ++ slp_node, gsi); ++ } ++ else ++ { ++ new_vect_stmt_for_ncontinues (ncontinues, group_idx, stmt_info, ++ &offset_info, &vectype_info, ++ memory_access_type, slp_perm, ++ dr_chain, slp_node, gsi); ++ } ++ } + else + { + for (i = 0; i < vec_num; i++) +@@ -9840,7 +10576,32 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (slp && !slp_perm) + continue; + +- if (slp_perm) ++ /* Using the new load permutation to generate vector permute statements ++ from a list of loads in DR_CHAIN. 
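Editorial sketch (not part of the patch): the nloads/ncontinues split chosen by calculate_new_type, and repeated in the comment above, is plain integer arithmetic. The values below are hypothetical; the patch additionally requires exact divisibility (transpose_oprnd_store hits gcc_unreachable otherwise).

  /* Two cases: several small arrays packed into one vector, or one large
     array split across several vectors.  Values are made up.  */
  #include <stdio.h>

  static void
  split (unsigned const_nunits, unsigned group_size_b)
  {
    unsigned nloads, ncontinues;
    if (group_size_b < const_nunits)   /* vectemp = {a[], b[], ...}  */
      {
        nloads = const_nunits / group_size_b;
        ncontinues = 1;
      }
    else                               /* vectemp1 = {a[0], a[1], ...} ...  */
      {
        nloads = 1;
        ncontinues = group_size_b / const_nunits;
      }
    printf ("nunits=%u group=%u -> nloads=%u ncontinues=%u\n",
            const_nunits, group_size_b, nloads, ncontinues);
  }

  int main (void)
  {
    split (4, 2);   /* two arrays per vector  */
    split (4, 8);   /* two vectors per array  */
    return 0;
  }

When a permutation is still needed after this split, the code that follows regenerates it from the new (post-transpose) load permutation rather than the original one.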
*/ ++ if (slp && slp_perm && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ unsigned n_perms; ++ stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0]; ++ unsigned int old_size = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info_) = new_group_size; ++ vec old_load_permutation ++ = SLP_TREE_LOAD_PERMUTATION (slp_node); ++ SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation; ++ bool perm_load_success = vect_transform_slp_perm_load ( ++ slp_node, dr_chain, gsi, vf, ++ slp_node_instance, false, &n_perms); ++ DR_GROUP_SIZE (stmt_info_) = old_size; ++ SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation; ++ new_load_permutation.release (); ++ if (!perm_load_success) ++ { ++ dr_chain.release (); ++ return false; ++ } ++ } ++ else if (slp_perm) + { + unsigned n_perms; + if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, +diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +index f7becb34a..1c4a6c421 100644 +--- a/gcc/tree-vectorizer.h ++++ b/gcc/tree-vectorizer.h +@@ -297,6 +297,21 @@ public: + vec ddrs; + }; + ++/* Information about offset in vectorizable_load. */ ++struct offset_info { ++ tree offset; ++ tree byte_offset; ++ tree dataref_offset; ++}; ++ ++/* Information about vectype in vectorizable_load. */ ++struct vectype_info { ++ tree vectype; ++ tree ltype; ++ tree lvectype; ++ tree ref_type; ++}; ++ + /* Vectorizer state common between loop and basic-block vectorization. */ + class vec_info { + public: +@@ -335,6 +350,14 @@ public: + stmt in the chain. */ + auto_vec grouped_stores; + ++ /* All interleaving chains of loads, represented by the first ++ stmt in the chain. */ ++ auto_vec grouped_loads; ++ ++ /* All interleaving chains of stores (before transposed), represented by all ++ stmt in the chain. */ ++ auto_vec > scalar_stores; ++ + /* Cost data used by the target cost model. */ + void *target_cost_data; + +@@ -702,6 +725,8 @@ public: + #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero + #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds + #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores ++#define LOOP_VINFO_GROUPED_LOADS(L) (L)->grouped_loads ++#define LOOP_VINFO_SCALAR_STORES(L) (L)->scalar_stores + #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances + #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor + #define LOOP_VINFO_REDUCTIONS(L) (L)->reductions +@@ -764,6 +789,25 @@ public: + basic_block bb; + gimple_stmt_iterator region_begin; + gimple_stmt_iterator region_end; ++ ++ /* True, if bb_vinfo can goto vect_analyze_slp. */ ++ bool before_slp; ++ ++ /* True, if bb_vinfo is a transposed version. */ ++ bool transposed; ++ ++ /* The number of transposed groups. */ ++ int transposed_group; ++ ++ /* The cost of the scalar iterations. */ ++ int scalar_cost; ++ ++ /* The cost of the vector prologue and epilogue, including peeled ++ iterations and set-up code. */ ++ int vec_outside_cost; ++ ++ /* The cost of the vector loop body. 
*/ ++ int vec_inside_cost; + } *bb_vec_info; + + #define BB_VINFO_BB(B) (B)->bb +@@ -772,6 +816,14 @@ public: + #define BB_VINFO_DATAREFS(B) (B)->shared->datarefs + #define BB_VINFO_DDRS(B) (B)->shared->ddrs + #define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data ++#define BB_VINFO_GROUPED_LOADS(B) (B)->grouped_loads ++#define BB_VINFO_SCALAR_STORES(B) (B)->scalar_stores ++#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost ++#define BB_VINFO_VEC_INSIDE_COST(B) (B)->vec_inside_cost ++#define BB_VINFO_SCALAR_COST(B) (B)->scalar_cost ++#define BB_VINFO_SLP_TRANSPOSED(B) (B)->transposed ++#define BB_VINFO_BEFORE_SLP(B) (B)->before_slp ++#define BB_VINFO_TRANS_GROUPS(B) (B)->transposed_group + + static inline bb_vec_info + vec_info_for_bb (basic_block bb) +@@ -1012,6 +1064,17 @@ public: + stmt_vec_info next_element; + /* The size of the group. */ + unsigned int size; ++ ++ /* The size of the group before transposed. */ ++ unsigned int size_before_transpose; ++ ++ /* If true, the stmt_info is slp transposed. */ ++ bool slp_transpose; ++ ++ /* Mark the group store number for rebuild interleaving chain ++ during transpose phase. Value -1 represents unable to transpose. */ ++ int group_number; ++ + /* For stores, number of stores from this group seen. We vectorize the last + one. */ + unsigned int store_count; +@@ -1019,6 +1082,9 @@ public: + is 1. */ + unsigned int gap; + ++ /* The gap before transposed. */ ++ unsigned int gap_before_transpose; ++ + /* The minimum negative dependence distance this stmt participates in + or zero if none. */ + unsigned int min_neg_dist; +@@ -1217,6 +1283,12 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) + #define STMT_VINFO_REDUC_VECTYPE_IN(S) (S)->reduc_vectype_in + #define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p + ++#define DR_GROUP_SLP_TRANSPOSE(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose) ++#define DR_GROUP_SIZE_TRANS(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose) ++#define DR_GROUP_NUMBER(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number) + #define DR_GROUP_FIRST_ELEMENT(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element) + #define DR_GROUP_NEXT_ELEMENT(S) \ +@@ -1227,6 +1299,8 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) + (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count) + #define DR_GROUP_GAP(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap) ++#define DR_GROUP_GAP_TRANS(S) \ ++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose) + + #define REDUC_GROUP_FIRST_ELEMENT(S) \ + (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element) +@@ -1624,6 +1698,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info) + return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr)))); + } + ++/* Compare two unsigned int A and B. ++ Sorting them in ascending order. */ ++ ++static inline int ++cmp_for_group_num (const void *a_, const void *b_) ++{ ++ unsigned int a = *(unsigned int *)const_cast(a_); ++ unsigned int b = *(unsigned int *)const_cast(b_); ++ return a < b ? -1 : 1; ++} ++ + /* Return true if LOOP_VINFO requires a runtime check for whether the + vector loop is profitable. 
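Editorial note and sketch (not part of the patch): cmp_for_group_num added just above never returns 0; that is harmless here because generate_new_load_permutation_mapping de-duplicates the group numbers before sorting, so qsort never compares equal keys. A minimal standalone demo of the same ascending sort, using the conventional comparator contract:

  #include <stdio.h>
  #include <stdlib.h>

  /* Conventional comparator: -1 / 0 / 1 for less / equal / greater.  */
  static int
  cmp_unsigned (const void *a_, const void *b_)
  {
    unsigned a = *(const unsigned *) a_;
    unsigned b = *(const unsigned *) b_;
    return (a > b) - (a < b);
  }

  int main (void)
  {
    unsigned group_nums[] = { 7, 1, 5, 3 };   /* hypothetical group ids  */
    qsort (group_nums, 4, sizeof (unsigned), cmp_unsigned);
    for (int i = 0; i < 4; i++)
      printf ("%u ", group_nums[i]);          /* prints: 1 3 5 7  */
    printf ("\n");
    return 0;
  }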
*/ + +@@ -1787,6 +1872,9 @@ extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT); + extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); + extern void vect_permute_store_chain (vec ,unsigned int, stmt_vec_info, + gimple_stmt_iterator *, vec *); ++extern void vect_transpose_store_chain (vec, unsigned int, unsigned int, ++ stmt_vec_info, gimple_stmt_iterator *, ++ vec *); + extern tree vect_setup_realignment (stmt_vec_info, gimple_stmt_iterator *, + tree *, enum dr_alignment_support, tree, + class loop **); +@@ -1849,6 +1937,7 @@ extern void vect_free_slp_instance (slp_instance, bool); + extern bool vect_transform_slp_perm_load (slp_tree, vec , + gimple_stmt_iterator *, poly_uint64, + slp_instance, bool, unsigned *); ++extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info); + extern bool vect_slp_analyze_operations (vec_info *); + extern void vect_schedule_slp (vec_info *); + extern opt_result vect_analyze_slp (vec_info *, unsigned); +-- +2.27.0 + diff --git a/0046-loop-distribution.patch b/0046-loop-distribution.patch new file mode 100644 index 0000000000000000000000000000000000000000..aa257398c2e89fa3db43f0e9a07c10fb85412966 --- /dev/null +++ b/0046-loop-distribution.patch @@ -0,0 +1,2603 @@ +From a398e311a9c7eead357e6c3338abafd2017e51be Mon Sep 17 00:00:00 2001 +From: benniaobufeijiushiji +Date: Wed, 29 Jun 2022 22:06:22 +0800 +Subject: [PATCH] loop distribution + +--- + gcc/cfgloop.h | 3 + + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c | 69 + + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c | 17 + + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c | 19 + + gcc/tree-loop-distribution.c | 1995 +++++++++++++++++-- + gcc/tree-vect-loop.c | 31 +- + gcc/tree-vect-stmts.c | 220 +- + 7 files changed, 2160 insertions(+), 194 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c + create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c + create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c + +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index 18b404e29..48426fdce 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -230,6 +230,9 @@ public: + flag_finite_loops or similar pragmas state. */ + unsigned finite_p : 1; + ++ /* True if we are processing this loop in pass ldist. */ ++ unsigned processing_ldist : 1; ++ + /* The number of times to unroll the loop. 0 means no information given, + just do what we always do. A value of 1 means do not unroll the loop. + A value of USHRT_MAX means unroll with no specific unrolling factor. 
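Editorial sketch (not part of the patch): before the new tests, a source-level view of the shape this loop-distribution change aims for. The function names and explicit tmp arrays below are hypothetical; the pass works on GIMPLE, inserts compiler temporaries where the isomorphic computations end, and then distributes the loop so the first half can be SLP-vectorized after unrolling.

  /* Before: grouped byte loads feed computations that are isomorphic up to
     the final, heterogeneous operation.  */
  void before (unsigned *a0, unsigned *a1, const unsigned char *x, int stride)
  {
    for (int i = 0; i < 4; i++, x += stride)
      {
        a0[i] = (x[0] + x[4]) + 1;
        a1[i] = (x[1] + x[5]) - 2;
      }
  }

  /* After (conceptually): temp arrays cut the loop at the end of the
     isomorphic part; the first loop becomes vectorizable, the second keeps
     the heterogeneous tails.  */
  void after (unsigned *a0, unsigned *a1, const unsigned char *x, int stride)
  {
    unsigned t0[4], t1[4];
    for (int i = 0; i < 4; i++, x += stride)
      {
        t0[i] = x[0] + x[4];
        t1[i] = x[1] + x[5];
      }
    for (int i = 0; i < 4; i++)
      {
        a0[i] = t0[i] + 1;
        a1[i] = t1[i] - 2;
      }
  }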
+diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c +new file mode 100644 +index 000000000..952438cec +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c +@@ -0,0 +1,69 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-do run { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details -save-temps" } */ ++ ++#include ++#include ++ ++static unsigned inline abs2 (unsigned a) ++{ ++ unsigned s = ((a>>15)&0x10001)*0xffff; ++ return (a+s)^s; ++} ++ ++int foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) ++{ ++ unsigned tmp[4][4]; ++ unsigned a0, a1, a2, a3; ++ int sum = 0; ++ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) ++ { ++ a0 = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); ++ a1 = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16); ++ a2 = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16); ++ a3 = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16); ++ int t0 = a0 + a1; ++ int t1 = a0 - a1; ++ int t2 = a2 + a3; ++ int t3 = a2 - a3; ++ tmp[i][0] = t0 + t2; ++ tmp[i][2] = t0 - t2; ++ tmp[i][1] = t1 + t3; ++ tmp[i][3] = t1 - t3; ++ } ++ for (int i = 0; i < 4; i++) ++ { ++ int t0 = tmp[0][i] + tmp[1][i]; ++ int t1 = tmp[0][i] - tmp[1][i]; ++ int t2 = tmp[2][i] + tmp[3][i]; ++ int t3 = tmp[2][i] - tmp[3][i]; ++ a0 = t0 + t2; ++ a2 = t0 - t2; ++ a1 = t1 + t3; ++ a3 = t1 - t3; ++ sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3); ++ } ++ return (((unsigned short) sum) + ((unsigned) sum >>16)) >> 1; ++} ++ ++int main () ++{ ++ unsigned char oxa[128] = {0}; ++ unsigned char oxb[128] = {0}; ++ for (int i = 0; i < 128; i++) ++ { ++ oxa[i] += i * 3; ++ oxb[i] = i * 2; ++ } ++ int sum = foo (oxa, 16, oxb, 32); ++ if (sum != 736) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */ ++/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */ ++/* { dg-final { scan-assembler-times {\tzip1\t} 8 } } */ ++/* { dg-final { scan-assembler-times {\tzip2\t} 8 } } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c +new file mode 100644 +index 000000000..1b50fd27d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */ ++ ++unsigned a0[4], a1[4], a2[4], a3[4]; ++ ++void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) ++{ ++ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) ++ { ++ a0[i] = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); ++ a1[i] = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16); ++ a2[i] = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16); ++ a3[i] = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16); ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 
1 "ldist" } } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c +new file mode 100644 +index 000000000..94b992b05 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */ ++ ++unsigned a0[4], a1[4], a2[4], a3[4]; ++ ++void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) ++{ ++ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) ++ { ++ a0[i] = ((oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16)) + 1; ++ a1[i] = ((oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16)) - 2; ++ a2[i] = ((oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16)) * 3; ++ a3[i] = ((oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16)) / 4; ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */ ++/* { dg-final { scan-tree-dump-times "Insertion removed" 1 "ldist" } } */ ++/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */ +\ No newline at end of file +diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c +index 888af4894..765d9ed3a 100644 +--- a/gcc/tree-loop-distribution.c ++++ b/gcc/tree-loop-distribution.c +@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see + | D(I) = A(I-1)*E + |ENDDO + ++ If an unvectorizable loop has grouped loads, and calculations from grouped ++ loads are isomorphic, build temp arrays using stmts where isomorphic ++ calculations end. Afer distribution, the partition built from temp ++ arrays can be vectorized in pass SLP after loop unrolling. For example, ++ ++ |DO I = 1, N ++ | A = FOO (ARG_1); ++ | B = FOO (ARG_2); ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ ++ is transformed to ++ ++ |DO I = 1, N ++ | J = FOO (ARG_1); ++ | K = FOO (ARG_2); ++ | X[I] = J; ++ | Y[I] = K; ++ | A = X[I]; ++ | B = Y[I]; ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ ++ and is then distributed to ++ ++ |DO I = 1, N ++ | J = FOO (ARG_1); ++ | K = FOO (ARG_2); ++ | X[I] = J; ++ | Y[I] = K; ++ |ENDDO ++ ++ |DO I = 1, N ++ | A = X[I]; ++ | B = Y[I]; ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ + Loop distribution is the dual of loop fusion. It separates statements + of a loop (or loop nest) into multiple loops (or loop nests) with the + same loop header. The major goal is to separate statements which may +@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see + + 1) Seed partitions with specific type statements. For now we support + two types seed statements: statement defining variable used outside +- of loop; statement storing to memory. ++ of loop; statement storing to memory. Moreover, for unvectorizable ++ loops, we try to find isomorphic stmts from grouped load and build ++ temp arrays as new seed statements. + 2) Build reduced dependence graph (RDG) for loop to be distributed. + The vertices (RDG:V) model all statements in the loop and the edges + (RDG:E) model flow and control dependencies between statements. +@@ -103,6 +146,7 @@ along with GCC; see the file COPYING3. If not see + #include "cfganal.h" + #include "gimple-iterator.h" + #include "gimplify-me.h" ++#include "gimplify.h" + #include "stor-layout.h" + #include "tree-cfg.h" + #include "tree-ssa-loop-manip.h" +@@ -115,6 +159,9 @@ along with GCC; see the file COPYING3. 
If not see + #include "tree-vectorizer.h" + #include "tree-eh.h" + #include "gimple-fold.h" ++#include "optabs-tree.h" ++#include ++#include + + + #define MAX_DATAREFS_NUM \ +@@ -183,6 +230,59 @@ struct rdg_vertex + #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) + #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) + ++/* Results of isomorphic group analysis. */ ++#define UNINITIALIZED (0) ++#define ISOMORPHIC (1) ++#define HETEROGENEOUS (1 << 1) ++#define UNCERTAIN (1 << 2) ++ ++/* Information of a stmt while analyzing isomorphic use in group. */ ++ ++typedef struct _group_info ++{ ++ gimple *stmt; ++ ++ /* True if stmt can be a cut point. */ ++ bool cut_point; ++ ++ /* For use_stmt with two rhses, one of which is the lhs of stmt. ++ If the other is unknown to be isomorphic, mark it uncertain. */ ++ bool uncertain; ++ ++ /* Searching of isomorphic stmt reaches heterogeneous groups or reaches ++ MEM stmts. */ ++ bool done; ++ ++ _group_info () ++ { ++ stmt = NULL; ++ cut_point = false; ++ uncertain = false; ++ done = false; ++ } ++} *group_info; ++ ++/* PAIR of cut points and corresponding profit. */ ++typedef std::pair *, int> stmts_profit; ++ ++/* MAP of vector factor VF and corresponding stmts_profit PAIR. */ ++typedef std::map vf_stmts_profit_map; ++ ++/* PAIR of group_num and iteration_num. We consider rhses from the same ++ group and interation are isomorphic. */ ++typedef std::pair group_iteration; ++ ++/* PAIR of lhs of use_stmt and group_iteration PAIR. */ ++typedef std::pair isomer_info; ++ ++/* An isomorphic stmt is detetmined by lhs of use_stmt, group_num and ++ the iteration_num when we insert this stmt to this map. */ ++typedef std::map isomer_stmt_lhs; ++ ++/* Describe an uncertain stmt as >>. ++ Rhs is the oprand that we use to find STMT. */ ++typedef std::map uncertain_stmts; ++ + /* Data dependence type. */ + + enum rdg_dep_type +@@ -594,7 +694,8 @@ class loop_distribution + /* Returns true when PARTITION1 and PARTITION2 access the same memory + object in RDG. */ + bool share_memory_accesses (struct graph *rdg, +- partition *partition1, partition *partition2); ++ partition *partition1, partition *partition2, ++ hash_set * tmp_array_vars); + + /* For each seed statement in STARTING_STMTS, this function builds + partition for it by adding depended statements according to RDG. +@@ -638,7 +739,40 @@ class loop_distribution + /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. + ALIAS_DDRS contains ddrs which need runtime alias check. */ + void finalize_partitions (class loop *loop, vec +- *partitions, vec *alias_ddrs); ++ *partitions, vec *alias_ddrs, ++ bitmap producers); ++ ++ /* Analyze loop form and if it's vectorizable to decide if we need to ++ insert temp arrays to distribute it. */ ++ bool may_insert_temp_arrays (loop_p loop, struct graph * &rdg, ++ control_dependences *cd); ++ ++ void reset_gimple_uid (loop_p loop); ++ ++ bool check_loop_vectorizable (loop_p loop); ++ ++ void remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition); ++ ++ /* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. 
*/ ++ bool insert_temp_arrays (loop_p loop, vec seed_stmts, ++ hash_set *tmp_array_vars, bitmap producers); ++ ++ inline void rebuild_rdg (loop_p loop, struct graph * &rdg, ++ control_dependences *cd); ++ ++ void build_producers (loop_p loop, bitmap producers, ++ vec &transformed); ++ ++ void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv, ++ bitmap cut_points, hash_set * tmp_array_vars, ++ bitmap producers); ++ ++ /* Fuse PARTITIONS built from inserted temp arrays into one partition, ++ fuse the rest into another. */ ++ void merge_remaining_partitions (vec *partitions, ++ bitmap producers); + + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the +@@ -1852,7 +1986,8 @@ loop_distribution::classify_partition (loop_p loop, + + bool + loop_distribution::share_memory_accesses (struct graph *rdg, +- partition *partition1, partition *partition2) ++ partition *partition1, partition *partition2, ++ hash_set * tmp_array_vars) + { + unsigned i, j; + bitmap_iterator bi, bj; +@@ -1886,8 +2021,13 @@ loop_distribution::share_memory_accesses (struct graph *rdg, + if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) + && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) + && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) +- && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) +- return true; ++ && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0) ++ /* An exception, if PARTITION1 and PARTITION2 contain the ++ temp array we inserted, do not merge them. */ ++ && !tmp_array_vars->contains (DR_REF (dr1))) ++ { ++ return true; ++ } + } + } + +@@ -2848,13 +2988,47 @@ fuse_memset_builtins (vec *partitions) + } + } + ++void ++loop_distribution::merge_remaining_partitions ++ (vec *partitions, ++ bitmap producers) ++{ ++ struct partition *partition = NULL; ++ struct partition *p1 = NULL, *p2 = NULL; ++ for (unsigned i = 0; partitions->iterate (i, &partition); i++) ++ { ++ if (bitmap_intersect_p (producers, partition->stmts)) ++ { ++ if (p1 == NULL) ++ { ++ p1 = partition; ++ continue; ++ } ++ partition_merge_into (NULL, p1, partition, FUSE_FINALIZE); ++ } ++ else ++ { ++ if (p2 == NULL) ++ { ++ p2 = partition; ++ continue; ++ } ++ partition_merge_into (NULL, p2, partition, FUSE_FINALIZE); ++ } ++ partitions->unordered_remove (i); ++ partition_free (partition); ++ i--; ++ } ++} ++ + void + loop_distribution::finalize_partitions (class loop *loop, + vec *partitions, +- vec *alias_ddrs) ++ vec *alias_ddrs, ++ bitmap producers) + { + unsigned i; +- struct partition *partition, *a; ++ struct partition *partition; + + if (partitions->length () == 1 + || alias_ddrs->length () > 0) +@@ -2886,13 +3060,7 @@ loop_distribution::finalize_partitions (class loop *loop, + || (loop->inner == NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) + { +- a = (*partitions)[0]; +- for (i = 1; partitions->iterate (i, &partition); ++i) +- { +- partition_merge_into (NULL, a, partition, FUSE_FINALIZE); +- partition_free (partition); +- } +- partitions->truncate (1); ++ merge_remaining_partitions (partitions, producers); + } + + /* Fuse memset builtins if possible. */ +@@ -2900,157 +3068,1640 @@ loop_distribution::finalize_partitions (class loop *loop, + fuse_memset_builtins (partitions); + } + +-/* Distributes the code from LOOP in such a way that producer statements +- are placed before consumer statements. Tries to separate only the +- statements from STMTS into separate loops. 
Returns the number of +- distributed loops. Set NB_CALLS to number of generated builtin calls. +- Set *DESTROY_P to whether LOOP needs to be destroyed. */ ++/* Gimple uids of GIMPLE_DEBUG and GIMPLE_LABEL were changed during function ++ vect_analyze_loop, reset them to -1. */ + +-int +-loop_distribution::distribute_loop (class loop *loop, vec stmts, +- control_dependences *cd, int *nb_calls, bool *destroy_p, +- bool only_patterns_p) ++void ++loop_distribution::reset_gimple_uid (loop_p loop) + { +- ddrs_table = new hash_table (389); +- struct graph *rdg; +- partition *partition; +- int i, nbp; +- +- *destroy_p = false; +- *nb_calls = 0; +- loop_nest.create (0); +- if (!find_loop_nest (loop, &loop_nest)) ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); ++ for (int i = 0; i < int (loop->num_nodes); i++) + { +- loop_nest.release (); +- delete ddrs_table; +- return 0; ++ basic_block bb = bbs[i]; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ if (is_gimple_debug (stmt) || gimple_code (stmt) == GIMPLE_LABEL) ++ { ++ gimple_set_uid (stmt, -1); ++ } ++ } + } ++ free (bbs); ++} + +- datarefs_vec.create (20); +- has_nonaddressable_dataref_p = false; +- rdg = build_rdg (loop, cd); +- if (!rdg) ++bool ++loop_distribution::check_loop_vectorizable (loop_p loop) ++{ ++ vec_info_shared shared; ++ loop->processing_ldist = true; ++ vect_analyze_loop (loop, &shared); ++ loop_vec_info vinfo = loop_vec_info_for_loop (loop); ++ loop->processing_ldist = false; ++ reset_gimple_uid (loop); ++ if (vinfo == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, +- "Loop %d not distributed: failed to build the RDG.\n", +- loop->num); ++ { ++ fprintf (dump_file, ++ "Loop %d no temp array insertion: bad data access pattern," ++ " unable to generate loop_vinfo.\n", loop->num); ++ } ++ return false; ++ } ++ if (vinfo->vectorizable) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Loop %d no temp array insertion: original loop" ++ " can be vectorized without distribution.\n", ++ loop->num); ++ } ++ delete vinfo; ++ loop->aux = NULL; ++ return false; ++ } ++ if (vinfo->grouped_loads.length () == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Loop %d no temp array insertion: original loop" ++ " has no grouped loads.\n" , loop->num); ++ } ++ delete vinfo; ++ loop->aux = NULL; ++ return false; ++ } ++ return true; ++} + +- loop_nest.release (); +- free_data_refs (datarefs_vec); +- delete ddrs_table; +- return 0; ++inline void ++loop_distribution::rebuild_rdg (loop_p loop, struct graph * &rdg, ++ control_dependences *cd) ++{ ++ free_rdg (rdg); ++ rdg = build_rdg (loop, cd); ++ gcc_checking_assert (rdg != NULL); ++} ++ ++bool ++loop_distribution::may_insert_temp_arrays (loop_p loop, struct graph * &rdg, ++ control_dependences *cd) ++{ ++ if (!flag_tree_slp_transpose_vectorize) ++ { ++ return false; + } + +- if (datarefs_vec.length () > MAX_DATAREFS_NUM) ++ /* Only loops with two basic blocks HEADER and LATCH are supported. HEADER ++ is the main body of a LOOP and LATCH is the basic block that controls the ++ LOOP execution. Size of temp array is determined by loop execution time, ++ so it must be a const. 
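Editorial sketch (not part of the patch): the loop-form restrictions listed above translate into simple source-level conditions. The pair below is hypothetical and only illustrates the constant-trip-count requirement; the real check also rejects nested loops, loops with more than two basic blocks, and oversized bodies.

  /* Typically qualifies: constant trip count, straight-line body.  */
  void ok (unsigned *out, const unsigned char *in, int stride)
  {
    for (int i = 0; i < 4; i++, in += stride)
      out[i] = in[0] + in[4];
  }

  /* Does not qualify: the trip count is only known at run time, so the
     size of an inserted temp array could not be a constant.  */
  void not_ok (unsigned *out, const unsigned char *in, int stride, int n)
  {
    for (int i = 0; i < n; i++, in += stride)
      out[i] = in[0] + in[4];
  }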
*/ ++ tree loop_extent = number_of_latch_executions (loop); ++ if (loop->inner != NULL || loop->num_nodes > 2 ++ || rdg->n_vertices > param_slp_max_insns_in_bb ++ || TREE_CODE (loop_extent) != INTEGER_CST) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, +- "Loop %d not distributed: too many memory references.\n", +- loop->num); ++ { ++ fprintf (dump_file, "Loop %d: no temp array insertion: bad loop" ++ " form.\n", loop->num); ++ } ++ return false; ++ } + +- free_rdg (rdg); +- loop_nest.release (); +- free_data_refs (datarefs_vec); +- delete ddrs_table; +- return 0; ++ if (loop->dont_vectorize) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Loop %d: no temp array insertion: this loop" ++ " should never be vectorized.\n", ++ loop->num); ++ } ++ return false; + } + +- data_reference_p dref; +- for (i = 0; datarefs_vec.iterate (i, &dref); ++i) +- dref->aux = (void *) (uintptr_t) i; ++ /* Do not distribute a LOOP that is able to be vectorized without ++ distribution. */ ++ if (!check_loop_vectorizable (loop)) ++ { ++ rebuild_rdg (loop, rdg, cd); ++ return false; ++ } + +- if (dump_file && (dump_flags & TDF_DETAILS)) +- dump_rdg (dump_file, rdg); ++ rebuild_rdg (loop, rdg, cd); ++ return true; ++} + +- auto_vec partitions; +- rdg_build_partitions (rdg, stmts, &partitions); ++/* Return max grouped loads' length if all groupes length satisfy len = 2 ^ n. ++ Otherwise, return 0. */ + +- auto_vec alias_ddrs; ++static unsigned ++get_max_vf (loop_vec_info vinfo) ++{ ++ unsigned size = 0; ++ unsigned max = 0; ++ stmt_vec_info stmt_info; ++ unsigned i = 0; ++ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info) ++ { ++ size = stmt_info->size; ++ if (!pow2p_hwi (size)) ++ { ++ return 0; ++ } ++ max = size > max ? size : max; ++ } ++ return max; ++} + +- auto_bitmap stmt_in_all_partitions; +- bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts); +- for (i = 1; partitions.iterate (i, &partition); ++i) +- bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); ++/* Convert grouped_loads from linked list to vector with length vf. ++ We will re-analyze a group if it is uncertain, so we regard WORKLISTS as ++ a circular queue. */ + +- bool any_builtin = false; +- bool reduction_in_all = false; +- FOR_EACH_VEC_ELT (partitions, i, partition) ++static unsigned ++build_queue (loop_vec_info vinfo, unsigned vf, ++ vec *> &worklists) ++{ ++ stmt_vec_info stmt_info; ++ unsigned i = 0; ++ group_info ginfo = NULL; ++ vec *worklist = NULL; ++ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info) + { +- reduction_in_all +- |= classify_partition (loop, rdg, partition, stmt_in_all_partitions); +- any_builtin |= partition_builtin_p (partition); ++ unsigned group_size = stmt_info->size; ++ stmt_vec_info c_stmt_info = stmt_info; ++ bool succ = true; ++ while (group_size >= vf) ++ { ++ vec_alloc (worklist, vf); ++ for (unsigned j = 0; j < vf; ++j) ++ { ++ if (c_stmt_info == NULL) ++ { ++ worklist->release (); ++ succ = false; ++ break; ++ } ++ ginfo = new _group_info (); ++ ginfo->stmt = c_stmt_info->stmt; ++ worklist->safe_push (ginfo); ++ c_stmt_info = c_stmt_info->next_element; ++ } ++ if (!succ) ++ { ++ break; ++ } ++ worklists.safe_push (worklist); ++ group_size -= vf; ++ } + } ++ return worklists.length (); ++} + +- /* If we are only distributing patterns but did not detect any, +- simply bail out. 
*/ +- if (only_patterns_p +- && !any_builtin) ++static bool ++check_same_oprand_type (tree op1, tree op2) ++{ ++ tree type1 = TREE_TYPE (op1); ++ tree type2 = TREE_TYPE (op2); ++ if (TREE_CODE (type1) != INTEGER_TYPE && TREE_CODE (type1) != REAL_TYPE) + { +- nbp = 0; +- goto ldist_done; ++ return false; + } ++ return (TREE_CODE (type1) == TREE_CODE (type2) ++ && TYPE_UNSIGNED (type1) == TYPE_UNSIGNED (type2) ++ && TYPE_PRECISION (type1) == TYPE_PRECISION (type2)); ++} + +- /* If we are only distributing patterns fuse all partitions that +- were not classified as builtins. This also avoids chopping +- a loop into pieces, separated by builtin calls. That is, we +- only want no or a single loop body remaining. */ +- struct partition *into; +- if (only_patterns_p) ++static bool ++bit_field_p (gimple *stmt) ++{ ++ unsigned i = 0; ++ auto_vec datarefs_vec; ++ data_reference_p dr; ++ if (!find_data_references_in_stmt (NULL, stmt, &datarefs_vec)) + { +- for (i = 0; partitions.iterate (i, &into); ++i) +- if (!partition_builtin_p (into)) +- break; +- for (++i; partitions.iterate (i, &partition); ++i) +- if (!partition_builtin_p (partition)) +- { +- partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN); +- partitions.unordered_remove (i); +- partition_free (partition); +- i--; +- } ++ return true; + } + +- /* Due to limitations in the transform phase we have to fuse all +- reduction partitions into the last partition so the existing +- loop will contain all loop-closed PHI nodes. */ +- for (i = 0; partitions.iterate (i, &into); ++i) +- if (partition_reduction_p (into)) +- break; +- for (i = i + 1; partitions.iterate (i, &partition); ++i) +- if (partition_reduction_p (partition)) +- { +- partition_merge_into (rdg, into, partition, FUSE_REDUCTION); +- partitions.unordered_remove (i); +- partition_free (partition); +- i--; +- } +- +- /* Apply our simple cost model - fuse partitions with similar +- memory accesses. */ +- for (i = 0; partitions.iterate (i, &into); ++i) ++ FOR_EACH_VEC_ELT (datarefs_vec, i, dr) + { +- bool changed = false; +- if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) +- continue; +- for (int j = i + 1; +- partitions.iterate (j, &partition); ++j) ++ if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF ++ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) + { +- if (share_memory_accesses (rdg, into, partition)) +- { +- partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); +- partitions.unordered_remove (j); +- partition_free (partition); +- j--; +- changed = true; +- } ++ return true; + } +- /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar +- accesses when 1 and 2 have similar accesses but not 0 and 1 +- then in the next iteration we will fail to consider merging +- 1 into 0,2. So try again if we did any merging into 0. */ +- if (changed) +- i--; ++ } ++ return false; ++} ++ ++static inline bool ++shift_operation (enum tree_code op) ++{ ++ return op == LSHIFT_EXPR || op == RSHIFT_EXPR || op == LROTATE_EXPR ++ || op == RROTATE_EXPR; ++} ++ ++/* Return relationship between USE_STMT and the first use_stmt of the group. ++ RHS1 is the lhs of stmt recorded in group_info. If another rhs of use_stmt ++ is not a constant, return UNCERTAIN and re-check it later. */ ++ ++static unsigned ++check_isomorphic (gimple* use_stmt, gimple* &first, ++ tree rhs1, vec &hetero_lhs) ++{ ++ /* Check same operation. 
*/ ++ enum tree_code rhs_code_first = gimple_assign_rhs_code (first); ++ enum tree_code rhs_code_current = gimple_assign_rhs_code (use_stmt); ++ if (rhs_code_first != rhs_code_current) ++ { ++ return HETEROGENEOUS; ++ } ++ /* For shift operations, oprands should be equal. */ ++ if (shift_operation (rhs_code_current)) ++ { ++ tree shift_op_first = gimple_assign_rhs2 (first); ++ tree shift_op_current = gimple_assign_rhs2 (use_stmt); ++ if (!operand_equal_p (shift_op_first, shift_op_current, 0) ++ || !TREE_CONSTANT (shift_op_first)) ++ { ++ return HETEROGENEOUS; ++ } ++ return ISOMORPHIC; ++ } ++ /* Type convertion expr or assignment. */ ++ if (gimple_num_ops (first) == 2) ++ { ++ return (rhs_code_first == NOP_EXPR || rhs_code_first == CONVERT_EXPR ++ || rhs_code_first == SSA_NAME) ? ISOMORPHIC : HETEROGENEOUS; ++ } ++ /* We find USE_STMT of LHS of current stmt, denote it as RHS1 of USE_STMT and ++ the other one as RHS2. Check if calculation of RHS2 is isomorphic with ++ RHS2 of the first USE_STMT. */ ++ tree rhs2_first = gimple_assign_rhs1 (use_stmt) == rhs1 ++ ? gimple_assign_rhs2 (first) : gimple_assign_rhs1 (first); ++ tree rhs2 = gimple_assign_rhs1 (use_stmt) == rhs1 ++ ? gimple_assign_rhs2 (use_stmt) : gimple_assign_rhs1 (use_stmt); ++ ++ if (check_same_oprand_type (rhs2_first, rhs2)) ++ { ++ if (TREE_CONSTANT (rhs2)) ++ { ++ return ISOMORPHIC; ++ } ++ else if (hetero_lhs.contains (rhs2)) ++ { ++ return HETEROGENEOUS; ++ } ++ return UNCERTAIN; ++ } ++ return HETEROGENEOUS; ++} ++ ++/* Check if the single use_stmt of STMT is isomorphic with the first one's ++ use_stmt in current group. */ ++ ++static unsigned ++check_use_stmt (group_info elmt, gimple* &first, ++ vec &tmp_stmts, vec &hetero_lhs) ++{ ++ if (gimple_code (elmt->stmt) != GIMPLE_ASSIGN) ++ { ++ return HETEROGENEOUS; ++ } ++ use_operand_p dummy; ++ tree lhs = gimple_assign_lhs (elmt->stmt); ++ gimple *use_stmt = NULL; ++ single_imm_use (lhs, &dummy, &use_stmt); ++ /* STMTs with three rhs are not supported, e.g., GIMPLE_COND. */ ++ if (use_stmt == NULL || gimple_num_ops (use_stmt) > 3 ++ || gimple_code (use_stmt) != GIMPLE_ASSIGN || bit_field_p (use_stmt)) ++ { ++ return HETEROGENEOUS; ++ } ++ tmp_stmts.safe_push (use_stmt); ++ if (first == NULL) ++ { ++ first = use_stmt; ++ return UNINITIALIZED; ++ } ++ /* Check if current use_stmt and the first menber's use_stmt in the group ++ are of the same type. */ ++ tree lhs_first = gimple_assign_lhs (first); ++ tree use_lhs = gimple_assign_lhs (use_stmt); ++ if (!check_same_oprand_type (lhs_first, use_lhs)) ++ { ++ return HETEROGENEOUS; ++ } ++ return check_isomorphic (use_stmt, first, lhs, hetero_lhs); ++} ++ ++/* Replace stmt field in group with stmts in TMP_STMTS, and insert their ++ lhs_info to ISOMER_LHS. */ ++ ++static void ++update_isomer_lhs (vec *group, unsigned group_num, ++ unsigned iteration, isomer_stmt_lhs &isomer_lhs, ++ vec tmp_stmts, int &profit) ++{ ++ group_info elmt = NULL; ++ /* Do not insert temp array if isomorphic stmts from grouped load have ++ only casting operations. Once isomorphic calculation has 3 oprands, ++ such as plus operation, this group can be regarded as cut point. */ ++ bool operated = (gimple_num_ops (tmp_stmts[0]) == 3); ++ /* Do not insert temp arrays if search of iosomophic stmts reaches ++ MEM stmts. */ ++ bool has_vdef = gimple_vdef (tmp_stmts[0]) != NULL; ++ for (unsigned i = 0; i < group->length (); i++) ++ { ++ elmt = (*group)[i]; ++ elmt->stmt = has_vdef ? NULL : tmp_stmts[i]; ++ elmt->cut_point = has_vdef ? 
false : (elmt->cut_point || operated); ++ elmt->uncertain = false; ++ elmt->done = has_vdef; ++ tree lhs = gimple_assign_lhs (tmp_stmts[i]); ++ isomer_lhs[lhs] = std::make_pair (group_num, iteration); ++ } ++ enum vect_cost_for_stmt kind = scalar_stmt; ++ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0); ++ profit = (tmp_stmts.length () - 1) * scalar_cost; ++} ++ ++/* Try to find rhs2 in ISOMER_LHS, if all rhs2 were found and their group_num ++ and iteration are same, GROUP is isomorphic. */ ++ ++static unsigned ++check_isomorphic_rhs (vec *group, vec &tmp_stmts, ++ isomer_stmt_lhs &isomer_lhs) ++{ ++ group_info elmt = NULL; ++ gimple *stmt = NULL; ++ unsigned j = 0; ++ unsigned group_num_tmp = -1u; ++ unsigned iteration_tmp = -1u; ++ tree rhs1 = NULL; ++ tree rhs2 = NULL; ++ unsigned status = UNINITIALIZED; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ rhs1 = gimple_assign_lhs (elmt->stmt); ++ stmt = tmp_stmts[j]; ++ rhs2 = (rhs1 == gimple_assign_rhs1 (stmt)) ++ ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt); ++ isomer_stmt_lhs::iterator iter = isomer_lhs.find (rhs2); ++ if (iter != isomer_lhs.end ()) ++ { ++ if (group_num_tmp == -1u) ++ { ++ group_num_tmp = iter->second.first; ++ iteration_tmp = iter->second.second; ++ status |= ISOMORPHIC; ++ continue; ++ } ++ unsigned group_num_rhs2 = iter->second.first; ++ unsigned iteration_rhs2 = iter->second.second; ++ if (group_num_rhs2 == group_num_tmp ++ && iteration_rhs2 == iteration_tmp) ++ { ++ status |= ISOMORPHIC; ++ continue; ++ } ++ return HETEROGENEOUS; ++ } ++ else ++ { ++ status |= UNCERTAIN; ++ } ++ } ++ return status; ++} ++ ++/* Insert >> into UNCERT_STMTS ++ and update group_info. */ ++ ++static void ++update_uncertain_stmts (vec *group, unsigned group_num, ++ unsigned iteration, vec &tmp_stmts, ++ uncertain_stmts &uncert_stmts) ++{ ++ unsigned j = 0; ++ group_info elmt = NULL; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ tree rhs1 = gimple_assign_lhs (elmt->stmt); ++ tree rhs2 = gimple_assign_rhs1 (tmp_stmts[j]) == rhs1 ++ ? gimple_assign_rhs2 (tmp_stmts[j]) ++ : gimple_assign_rhs1 (tmp_stmts[j]); ++ uncert_stmts[elmt->stmt] = std::make_pair (rhs2, ++ std::make_pair (group_num, iteration)); ++ elmt->uncertain = true; ++ elmt->done = false; ++ } ++} ++ ++/* If a group in UNCERT_STMTS is determined to be isomorphic, remove it ++ from UNCERT_STMTS and insert its lhs_info into ISOMER_LHS. */ ++ ++static void ++remove_uncertain_stmts (uncertain_stmts &uncert_stmts, ++ isomer_stmt_lhs &isomer_lhs, vec &tmp_stmts, ++ unsigned group_num, unsigned iteration) ++{ ++ unsigned i = 0; ++ gimple *stmt = NULL; ++ FOR_EACH_VEC_ELT (tmp_stmts, i, stmt) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ uncertain_stmts::iterator it = uncert_stmts.find (stmt); ++ if (it != uncert_stmts.end ()) ++ { ++ uncert_stmts.erase (it); ++ isomer_lhs[lhs] = std::make_pair (group_num, iteration); ++ } ++ } ++} ++ ++/* Try to find stmts in TMP_STMTS in UNCERT_STMTS. If all of them have same ++ group_num and iteration, GROUP is isomorphic and remove this group from ++ UNCERT_STMTS. If no stmt was found, GROUP remains uncertain. Otherwise, ++ GROUP is heterogeneous. 
*/ ++ ++static unsigned ++check_uncertain_stmts (vec *group, ++ unsigned group_num, unsigned iteration, bool &fatal, ++ vec &tmp_stmts, uncertain_stmts &uncert_stmts, ++ isomer_stmt_lhs &isomer_lhs) ++{ ++ group_info elmt = NULL; ++ unsigned j = 0; ++ unsigned group_num_tmp = -1u; ++ unsigned iteration_tmp = -1u; ++ unsigned status = UNINITIALIZED; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ tree rhs1 = gimple_assign_lhs (elmt->stmt); ++ gimple *stmt = tmp_stmts[j]; ++ uncertain_stmts::iterator iter = uncert_stmts.find (stmt); ++ if (iter != uncert_stmts.end ()) ++ { ++ tree rhs2 = iter->second.first; ++ if (rhs1 == rhs2) ++ { ++ status |= UNCERTAIN; ++ continue; ++ } ++ /* E.g., stmt1 and stmt2 are in the same group, lhs1 is lhs of stmt1, ++ lhs2 is lhs of stmt2. If use_stmt of lhs1 and lhs2 are same, ++ current vector factor is not suitable for this group. simply mark ++ the group heterogeneous and no longer analyze it in later ++ iterations. */ ++ if (iter->second.second.first == group_num) ++ { ++ fatal = true; ++ return HETEROGENEOUS; ++ } ++ /* First time we find STMT in UNCERT_STMTS. */ ++ if (group_num_tmp == -1u) ++ { ++ group_num_tmp = iter->second.second.first; ++ iteration_tmp = iter->second.second.second; ++ status |= ISOMORPHIC; ++ continue; ++ } ++ unsigned group_num_rhs2 = iter->second.second.first; ++ unsigned iteration_rhs2 = iter->second.second.second; ++ if (group_num_rhs2 == group_num_tmp ++ && iteration_rhs2 == iteration_tmp) ++ { ++ status |= ISOMORPHIC; ++ continue; ++ } ++ return HETEROGENEOUS; ++ } ++ } ++ if (status == ISOMORPHIC) ++ { ++ remove_uncertain_stmts (uncert_stmts, isomer_lhs, tmp_stmts, ++ group_num, iteration); ++ } ++ return status; ++} ++ ++/* Push stmts in TMP_STMTS into HETERO_LHS. */ ++ ++static void ++set_hetero (vec *group, vec &hetero_lhs, ++ vec &tmp_stmts) ++{ ++ group_info elmt = NULL; ++ unsigned i = 0; ++ for (i = 0; i < group->length (); i++) ++ { ++ elmt = (*group)[i]; ++ elmt->uncertain = false; ++ elmt->done = true; ++ } ++ gimple *stmt = NULL; ++ FOR_EACH_VEC_ELT (tmp_stmts, i, stmt) ++ { ++ if (stmt != NULL) ++ { ++ hetero_lhs.safe_push (gimple_assign_lhs (stmt)); ++ } ++ } ++} ++ ++/* Given an uncertain group, TMP_STMTS are use_stmts of stmts in GROUP. ++ Rhs1 is the lhs of stmt in GROUP, rhs2 is the other rhs of USE_STMT. ++ ++ First try to find rhs2 in ISOMER_LHS, if all found rhs2 have same group_num ++ and iteration, this uncertain group is isomorphic. ++ ++ If no rhs was found in ISOMER_LHS, try to find stmts of the group in ++ UNCERT_STMTS. If all stmts with rhs2 have same group_num and iteration, ++ this GROUP is isomorphic. ++ ++ If no stmt was found, this GROUP remains uncertain. ++ ++ Otherwise, this GROUP is heterogeneous. 
*/ ++ ++static bool ++check_uncertain (vec *group, unsigned group_num, ++ unsigned iteration, bool &fatal, int &profit, ++ vec &tmp_stmts, isomer_stmt_lhs &isomer_lhs, ++ vec &hetero_lhs, uncertain_stmts &uncert_stmts) ++{ ++ unsigned status = check_isomorphic_rhs (group, tmp_stmts, isomer_lhs); ++ if (status == UNINITIALIZED) ++ { ++ status = check_uncertain_stmts (group, group_num, iteration, fatal, ++ tmp_stmts, uncert_stmts, isomer_lhs); ++ } ++ ++ switch (status) ++ { ++ case UNINITIALIZED: ++ update_uncertain_stmts (group, group_num, iteration, ++ tmp_stmts, uncert_stmts); ++ return false; ++ case ISOMORPHIC: ++ update_isomer_lhs (group, group_num, iteration, ++ isomer_lhs, tmp_stmts, profit); ++ return false; ++ default: ++ set_hetero (group, hetero_lhs, tmp_stmts); ++ return true; ++ } ++} ++ ++/* Return false if analysis of this group is not finished, e.g., isomorphic or ++ uncertain. Calculate the profit if vectorized. */ ++ ++static bool ++check_group (vec *group, unsigned group_num, unsigned iteration, ++ bool &fatal, int &profit, vec &merged_groups, ++ isomer_stmt_lhs &isomer_lhs, vec &hetero_lhs, ++ uncertain_stmts uncert_stmts) ++{ ++ unsigned j = 0; ++ group_info elmt = NULL; ++ gimple *first = NULL; ++ unsigned res = 0; ++ /* Record single use stmts in TMP_STMTS and decide whether replace stmts in ++ ginfo in succeeding processes. */ ++ auto_vec tmp_stmts; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ if (merged_groups.contains (j)) ++ { ++ return true; ++ } ++ res |= check_use_stmt (elmt, first, tmp_stmts, hetero_lhs); ++ } ++ ++ /* Update each group member according to RES. */ ++ switch (res) ++ { ++ case ISOMORPHIC: ++ update_isomer_lhs (group, group_num, iteration, ++ isomer_lhs, tmp_stmts, profit); ++ return false; ++ case UNCERTAIN: ++ return check_uncertain (group, group_num, iteration, fatal, profit, ++ tmp_stmts, isomer_lhs, hetero_lhs, uncert_stmts); ++ default: ++ set_hetero (group, hetero_lhs, tmp_stmts); ++ return true; ++ } ++} ++ ++/* Return true if all analysises are done except uncertain groups. */ ++ ++static bool ++end_of_search (vec *> &circular_queue) ++{ ++ unsigned i = 0; ++ vec *group = NULL; ++ group_info elmt = NULL; ++ FOR_EACH_VEC_ELT (circular_queue, i, group) ++ { ++ elmt = (*group)[0]; ++ if (!elmt->done && !elmt->uncertain) ++ { ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Push valid stmts to STMTS as cutpoints. */ ++ ++static bool ++check_any_cutpoints (vec *> &circular_queue, ++ vec * &stmts, vec &merged_groups) ++{ ++ unsigned front = 0; ++ vec *group = NULL; ++ group_info elmt = NULL; ++ unsigned max = circular_queue.length () * circular_queue[0]->length (); ++ vec_alloc (stmts, max); ++ while (front < circular_queue.length ()) ++ { ++ unsigned i = 0; ++ if (merged_groups.contains (front)) ++ { ++ front++; ++ continue; ++ } ++ group = circular_queue[front++]; ++ FOR_EACH_VEC_ELT (*group, i, elmt) ++ { ++ if (elmt->stmt != NULL && elmt->done && elmt->cut_point) ++ { ++ stmts->safe_push (elmt->stmt); ++ } ++ } ++ } ++ return stmts->length () != 0; ++} ++ ++/* Grouped loads are isomorphic. Make pair for group number and iteration, ++ map load stmt to this pair. We set iteration 0 here. 
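++
++   In later iterations, when the other operand of a use_stmt is looked up in
++   ISOMER_LHS, this mapping tells which group and iteration the operand
++   originated from, so operands produced by the same group at the same
++   iteration can be recognized as isomorphic.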
*/
++
++static void
++init_isomer_lhs (vec<vec<group_info> *> &groups, isomer_stmt_lhs &isomer_lhs)
++{
++  vec<group_info> *group = NULL;
++  group_info elmt = NULL;
++  unsigned i = 0;
++  FOR_EACH_VEC_ELT (groups, i, group)
++    {
++      unsigned j = 0;
++      FOR_EACH_VEC_ELT (*group, j, elmt)
++	{
++	  isomer_lhs[gimple_assign_lhs (elmt->stmt)] = std::make_pair (i, 0);
++	}
++    }
++}
++
++/* Estimate the profit of accessing the grouped loads as vectors with factor
++   VF: LOADS scalar loads are replaced by LOADS / VF vector loads, while the
++   TMP temp-array elements cost an extra TMP / VF vector loads and TMP / VF
++   vector stores.  */
++
++static int
++load_store_profit (unsigned loads, unsigned vf, unsigned tmp)
++{
++  int profit = 0;
++  enum vect_cost_for_stmt kind = scalar_load;
++  int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++  profit += (loads - (loads / vf)) * scalar_cost;
++  profit -= tmp / vf * scalar_cost;
++  kind = scalar_store;
++  scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++  profit -= tmp / vf * scalar_cost;
++  return profit;
++}
++
++/* Breadth-first search the graph consisting of define-use chains starting
++   from the circular queue initialized by function BUILD_QUEUE.  Find the
++   single use of each stmt in a group and check whether the uses are
++   isomorphic.  Isomorphic is defined as same rhs type, same operator, and
++   isomorphic calculation of each rhs starting from the load.  If another rhs
++   is uncertain to be isomorphic, put it at the end of the circular queue and
++   re-analyze it during the next iteration.  If a group shares the same
++   use_stmt with another group, skip one of them in succeeding processes as
++   merged.  Iterate the circular queue until all remaining groups are
++   heterogeneous or the search reaches MEM stmts.  If all other groups have
++   finished the analysis and the remaining groups are uncertain, return false
++   to avoid an endless loop.  */
++
++bool
++bfs_find_isomer_stmts (vec<vec<group_info> *> &circular_queue,
++		       stmts_profit &profit_pair, unsigned vf,
++		       bool &reach_vdef)
++{
++  isomer_stmt_lhs isomer_lhs;
++  uncertain_stmts uncert_stmts;
++  auto_vec<tree> hetero_lhs;
++  auto_vec<unsigned> merged_groups;
++  vec<group_info> *group = NULL;
++  /* True if analysis finishes.  */
++  bool done = false;
++  int profit_sum = 0;
++  vec<gimple *> *stmts = NULL;
++  init_isomer_lhs (circular_queue, isomer_lhs);
++  for (unsigned i = 1; !done; ++i)
++    {
++      unsigned front = 0;
++      bool fatal = false;
++      /* Re-initialize DONE to TRUE when a new iteration begins.  */
++      done = true;
++      while (front < circular_queue.length ())
++	{
++	  int profit = 0;
++	  group = circular_queue[front];
++	  done &= check_group (group, front, i, fatal, profit, merged_groups,
++			       isomer_lhs, hetero_lhs, uncert_stmts);
++	  if (fatal)
++	    {
++	      return false;
++	    }
++	  profit_sum += profit;
++	  if (profit != 0 && (*group)[0]->stmt == NULL)
++	    {
++	      reach_vdef = true;
++	      return false;
++	    }
++	  ++front;
++	}
++      /* Uncertain result, return.  */
++      if (!done && end_of_search (circular_queue))
++	{
++	  return false;
++	}
++    }
++  if (check_any_cutpoints (circular_queue, stmts, merged_groups))
++    {
++      profit_pair.first = stmts;
++      unsigned loads = circular_queue.length () * circular_queue[0]->length ();
++      profit_pair.second = profit_sum + load_store_profit (loads, vf,
++							   stmts->length ());
++      if (profit_pair.second > 0)
++	{
++	  return true;
++	}
++    }
++  return false;
++}
++
++/* Free memory allocated by ginfo.
*/ ++ ++static void ++free_ginfos (vec *> &worklists) ++{ ++ vec *worklist; ++ unsigned i = 0; ++ while (i < worklists.length ()) ++ { ++ worklist = worklists[i++]; ++ group_info ginfo; ++ unsigned j = 0; ++ FOR_EACH_VEC_ELT (*worklist, j, ginfo) ++ { ++ delete ginfo; ++ } ++ } ++} ++ ++static void ++release_tmp_stmts (vf_stmts_profit_map &candi_stmts) ++{ ++ vf_stmts_profit_map::iterator iter; ++ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter) ++ { ++ iter->second.first->release (); ++ } ++} ++ ++/* Choose the group of stmt with maximun profit. */ ++ ++static bool ++decide_stmts_by_profit (vf_stmts_profit_map &candi_stmts, vec &stmts) ++{ ++ vf_stmts_profit_map::iterator iter; ++ int profit = 0; ++ int max = 0; ++ vec *tmp = NULL; ++ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter) ++ { ++ profit = iter->second.second; ++ if (profit > max) ++ { ++ tmp = iter->second.first; ++ max = profit; ++ } ++ else ++ { ++ iter->second.first->release (); ++ } ++ } ++ if (max == 0) ++ { ++ release_tmp_stmts (candi_stmts); ++ return false; ++ } ++ unsigned i = 0; ++ gimple *stmt = NULL; ++ FOR_EACH_VEC_ELT (*tmp, i, stmt) ++ { ++ stmts.safe_push (stmt); ++ } ++ release_tmp_stmts (candi_stmts); ++ return stmts.length () != 0; ++} ++ ++/* Find isomorphic stmts from grouped loads with vector factor VF. ++ ++ Given source code as follows and ignore casting. ++ ++ a0 = (a[0] + b[0]) + ((a[4] - b[4]) << 16); ++ a1 = (a[1] + b[1]) + ((a[5] - b[5]) << 16); ++ a2 = (a[2] + b[2]) + ((a[6] - b[6]) << 16); ++ a3 = (a[3] + b[3]) + ((a[7] - b[7]) << 16); ++ ++ We get grouped loads in VINFO as ++ ++ GROUP_1 GROUP_2 ++ _1 = *a _11 = *b ++ _2 = *(a + 1) _12 = *(b + 1) ++ _3 = *(a + 2) _13 = *(b + 2) ++ _4 = *(a + 3) _14 = *(b + 3) ++ _5 = *(a + 4) _15 = *(b + 4) ++ _6 = *(a + 5) _16 = *(b + 5) ++ _7 = *(a + 6) _17 = *(b + 6) ++ _8 = *(a + 7) _18 = *(b + 7) ++ ++ First we try VF = 8, we get two worklists ++ ++ WORKLIST_1 WORKLIST_2 ++ _1 = *a _11 = *b ++ _2 = *(a + 1) _12 = *(b + 1) ++ _3 = *(a + 2) _13 = *(b + 2) ++ _4 = *(a + 3) _14 = *(b + 3) ++ _5 = *(a + 4) _15 = *(b + 4) ++ _6 = *(a + 5) _16 = *(b + 5) ++ _7 = *(a + 6) _17 = *(b + 6) ++ _8 = *(a + 7) _18 = *(b + 7) ++ ++ We find _111 = _1 + _11 and _115 = _5 - _15 are not isomorphic, ++ so we try VF = VF / 2. ++ ++ GROUP_1 GROUP_2 ++ _1 = *a _5 = *(a + 4) ++ _2 = *(a + 1) _6 = *(a + 5) ++ _3 = *(a + 2) _7 = *(a + 6) ++ _4 = *(a + 3) _8 = *(a + 7) ++ ++ GROUP_3 GROUP_4 ++ _11 = *b _15 = *(b + 4) ++ _12 = *(b + 1) _16 = *(b + 5) ++ _13 = *(b + 2) _17 = *(b + 6) ++ _14 = *(b + 3) _18 = *(b + 7) ++ ++ We first analyze group_1, and find all operations are isomorphic, then ++ replace stmts in group_1 with their use_stmts. Group_2 as well. ++ ++ GROUP_1 GROUP_2 ++ _111 = _1 + _11 _115 = _5 - _15 ++ _112 = _2 + _12 _116 = _6 - _16 ++ _113 = _3 + _13 _117 = _7 - _17 ++ _114 = _4 + _14 _118 = _8 - _18 ++ ++ When analyzing group_3 and group_4, we find their use_stmts are the same ++ as group_1 and group_2. So group_3 is regarded as being merged to group_1 ++ and group_4 being merged to group_2. In future procedures, we will skip ++ group_3 and group_4. ++ ++ We repeat such processing until opreations are not isomorphic or searching ++ reaches MEM stmts. In our given case, searching end up at a0, a1, a2 and ++ a3. 
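++
++   For every VF that succeeds, the resulting cut points and the estimated
++   profit are recorded; decide_stmts_by_profit finally picks the cut points
++   of the VF with the largest profit.  If the search runs into a MEM stmt,
++   the whole attempt is abandoned.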
*/ ++ ++static bool ++find_isomorphic_stmts (loop_vec_info vinfo, vec &stmts) ++{ ++ unsigned vf = get_max_vf (vinfo); ++ if (vf == 0) ++ { ++ return false; ++ } ++ auto_vec *> circular_queue; ++ /* Map of vector factor and corresponding vectorizing profit. */ ++ stmts_profit profit_map; ++ /* Map of cut_points and vector factor. */ ++ vf_stmts_profit_map candi_stmts; ++ bool reach_vdef = false; ++ while (vf > 2) ++ { ++ if (build_queue (vinfo, vf, circular_queue) == 0) ++ { ++ return false; ++ } ++ if (!bfs_find_isomer_stmts (circular_queue, profit_map, vf, reach_vdef)) ++ { ++ if (reach_vdef) ++ { ++ release_tmp_stmts (candi_stmts); ++ circular_queue.release (); ++ return false; ++ } ++ vf /= 2; ++ circular_queue.release (); ++ continue; ++ } ++ candi_stmts[vf] = profit_map; ++ free_ginfos (circular_queue); ++ vf /= 2; ++ circular_queue.release (); ++ } ++ return decide_stmts_by_profit (candi_stmts, stmts); ++} ++ ++/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index ++ and all indices are the same. */ ++ ++static tree ++find_index (vec seed_stmts) ++{ ++ if (seed_stmts.length () == 0) ++ { ++ return NULL; ++ } ++ bool found_index = false; ++ tree index = NULL; ++ unsigned ui = 0; ++ for (ui = 0; ui < seed_stmts.length (); ui++) ++ { ++ if (!gimple_vdef (seed_stmts[ui])) ++ { ++ return NULL; ++ } ++ tree lhs = gimple_assign_lhs (seed_stmts[ui]); ++ unsigned num_index = 0; ++ while (TREE_CODE (lhs) == ARRAY_REF) ++ { ++ if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME) ++ { ++ num_index++; ++ if (num_index > 1) ++ return NULL; ++ if (index == NULL) ++ { ++ index = TREE_OPERAND (lhs, 1); ++ found_index = true; ++ } ++ else if (index != TREE_OPERAND (lhs, 1)) ++ return NULL; ++ } ++ lhs = TREE_OPERAND (lhs, 0); ++ } ++ if (!found_index) ++ return NULL; ++ } ++ return index; ++} ++ ++/* Check if expression of phi is an increament of a const. */ ++ ++static void ++check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc) ++{ ++ struct graph_edge *e_phi; ++ for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next) ++ { ++ struct vertex *v_inc = &(rdg->vertices[e_phi->dest]); ++ if (!is_gimple_assign (RDGV_STMT (v_inc)) ++ || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR) ++ { ++ continue; ++ } ++ tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc)); ++ tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc)); ++ if (!(integer_onep (rhs1) || integer_onep (rhs2))) ++ { ++ continue; ++ } ++ struct graph_edge *e_inc; ++ /* find cycle with only two vertices inc and phi: inc <--> phi. */ ++ bool found_cycle = false; ++ for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next) ++ { ++ if (e_inc->dest == e_phi->src) ++ { ++ found_cycle = true; ++ break; ++ } ++ } ++ if (!found_cycle) ++ { ++ continue; ++ } ++ found_inc = true; ++ } ++} ++ ++/* Check if phi satisfies form like PHI <0, i>. */ ++ ++static inline bool ++iv_check_phi_stmt (gimple *phi_stmt) ++{ ++ return gimple_phi_num_args (phi_stmt) == 2 ++ && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0)) ++ || integer_zerop (gimple_phi_arg_def (phi_stmt, 1))); ++} ++ ++/* Make sure the iteration varible is a phi. 
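++
++   The index returned by find_index must be defined by a two-argument PHI of
++   the form PHI <0, i>, and that PHI must form a two-node cycle with a
++   statement that increments it by one (checked by check_phi_inc).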
*/ ++ ++static tree ++get_iv_from_seed (struct graph *flow_only_rdg, vec seed_stmts) ++{ ++ tree index = find_index (seed_stmts); ++ if (index == NULL) ++ { ++ return NULL; ++ } ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[i]); ++ if (RDGV_STMT (v) != seed_stmts[0]) ++ { ++ continue; ++ } ++ struct graph_edge *e; ++ bool found_phi = false; ++ for (e = v->pred; e; e = e->pred_next) ++ { ++ struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]); ++ gimple *phi_stmt = RDGV_STMT (v_phi); ++ if (gimple_code (phi_stmt) != GIMPLE_PHI ++ || gimple_phi_result (phi_stmt) != index) ++ { ++ continue; ++ } ++ if (!iv_check_phi_stmt (phi_stmt)) ++ { ++ return NULL; ++ } ++ /* find inc expr in succ of phi. */ ++ bool found_inc = false; ++ check_phi_inc (v_phi, flow_only_rdg, found_inc); ++ if (!found_inc) ++ { ++ return NULL; ++ } ++ found_phi = true; ++ break; ++ } ++ if (!found_phi) ++ { ++ return NULL; ++ } ++ break; ++ } ++ return index; ++} ++ ++/* Do not distribute loop if vertexes in ROOT_MAP have antidependence with in ++ FLOW_ONLY_RDG. */ ++ ++static bool ++check_no_dependency (struct graph *flow_only_rdg, bitmap root_map) ++{ ++ bitmap_iterator bi; ++ unsigned ui; ++ auto_vec visited_nodes; ++ auto_bitmap visited_map; ++ EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi) ++ { ++ visited_nodes.safe_push (ui); ++ } ++ for (ui = 0; ui < visited_nodes.length (); ui++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]); ++ struct graph_edge *e; ++ for (e = v->succ; e; e = e->succ_next) ++ { ++ if (bitmap_bit_p (root_map, e->dest)) ++ { ++ return false; ++ } ++ if (bitmap_bit_p (visited_map, e->dest)) ++ { ++ continue; ++ } ++ visited_nodes.safe_push (e->dest); ++ bitmap_set_bit (visited_map, e->dest); ++ } ++ } ++ return true; ++} ++ ++/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure ++ there is no dependency among those STMT we found. */ ++ ++static unsigned ++get_cut_points (struct graph *flow_only_rdg, bitmap cut_points, ++ loop_vec_info vinfo) ++{ ++ unsigned n_stmts = 0; ++ ++ /* STMTS that may be CUT_POINTS. 
*/ ++ auto_vec stmts; ++ if (!find_isomorphic_stmts (vinfo, stmts)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No temp array insertion: no isomorphic stmts" ++ " were found.\n"); ++ } ++ return 0; ++ } ++ ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ if (stmts.contains (RDG_STMT (flow_only_rdg, i))) ++ { ++ bitmap_set_bit (cut_points, i); ++ } ++ } ++ n_stmts = bitmap_count_bits (cut_points); ++ ++ bool succ = check_no_dependency (flow_only_rdg, cut_points); ++ if (!succ) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No temp array inserted: data dependency" ++ " among isomorphic stmts.\n"); ++ } ++ return 0; ++ } ++ return n_stmts; ++} ++ ++static void ++build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi, ++ poly_uint64 array_extent, tree iv, ++ hash_set * tmp_array_vars, vec *transformed) ++{ ++ gimple *stmt = RDGV_STMT (v); ++ tree lhs = gimple_assign_lhs (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "original stmt:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ tree var_ssa = duplicate_ssa_name (lhs, stmt); ++ gimple_assign_set_lhs (stmt, var_ssa); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "changed to:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS); ++ } ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree vect_elt_type = TREE_TYPE (lhs); ++ tree array_type = build_array_type_nelts (vect_elt_type, array_extent); ++ tree array = create_tmp_var (array_type); ++ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa); ++ gimple *store = gimple_build_assign (array_ssa, var_ssa); ++ tree new_vdef = make_ssa_name (gimple_vop (cfun), store); ++ gsi_insert_after (&gsi, store, GSI_NEW_STMT); ++ gimple_set_vdef (store, new_vdef); ++ transformed->safe_push (store); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa2); ++ gimple *load = gimple_build_assign (lhs, array_ssa2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "insert stmt:\t"); ++ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS); ++ fprintf (dump_file, " and stmt:\t"); ++ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ gimple_set_vuse (load, new_vdef); ++ gsi_insert_after (&gsi, load, GSI_NEW_STMT); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++} ++ ++/* Set bitmap PRODUCERS based on vec TRANSFORMED. */ ++ ++void ++loop_distribution::build_producers (loop_p loop, bitmap producers, ++ vec &transformed) ++{ ++ auto_vec stmts; ++ stmts_from_loop (loop, &stmts); ++ int i = 0; ++ gimple *stmt = NULL; ++ ++ FOR_EACH_VEC_ELT (stmts, i, stmt) ++ { ++ gimple_set_uid (stmt, i); ++ } ++ i = 0; ++ FOR_EACH_VEC_ELT (transformed, i, stmt) ++ { ++ bitmap_set_bit (producers, stmt->uid); ++ } ++} ++ ++/* Transform stmt ++ ++ A = FOO (ARG_1); ++ ++ to ++ ++ STMT_1: A1 = FOO (ARG_1); ++ STMT_2: X[I] = A1; ++ STMT_3: A = X[I]; ++ ++ Producer is STMT_2 who defines the temp array and consumer is ++ STMT_3 who uses the temp array. 
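++
++   The extent of the temp array X is the number of latch executions of the
++   loop plus one, i.e. one element per iteration, so the producer and the
++   consumer can later be placed in different partitions.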
*/ ++ ++void ++loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg, ++ tree iv, bitmap cut_points, ++ hash_set * tmp_array_vars, ++ bitmap producers) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "=== do insertion ===\n"); ++ } ++ ++ auto_vec transformed; ++ ++ /* Execution times of loop. */ ++ poly_uint64 array_extent ++ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1; ++ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); ++ ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ ++ /* Find all cut points in bb and transform them. */ ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (cut_points, j)) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars, ++ &transformed); ++ } ++ } ++ } ++ build_producers (loop, producers, transformed); ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* After temp array insertion, given stmts ++ STMT_1: M = FOO (ARG_1); ++ STMT_2: X[I] = M; ++ STMT_3: A = X[I]; ++ STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next. ++ Replace M with A, and remove STMT_2 and STMT_3. */ ++ ++static void ++reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition, ++ gimple_stmt_iterator &gsi, int j) ++{ ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ gimple *stmt = RDGV_STMT (v); ++ gimple *prev = stmt->prev; ++ gimple *next = stmt->next; ++ tree n_lhs = gimple_assign_lhs (next); ++ gimple_assign_set_lhs (prev, n_lhs); ++ unlink_stmt_vdef (stmt); ++ if (partition) ++ { ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ } ++ gsi_remove (&gsi, true); ++ release_defs (stmt); ++ if (partition) ++ { ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ } ++ gsi_remove (&gsi, true); ++} ++ ++void ++loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition) ++{ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r); ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (producers, j)) ++ { ++ reset_gimple_assign (flow_only_rdg, partition, gsi, j); ++ } ++ } ++ } ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. 
*/ ++ ++bool ++loop_distribution::insert_temp_arrays (loop_p loop, vec seed_stmts, ++ hash_set *tmp_array_vars, bitmap producers) ++{ ++ struct graph *flow_only_rdg = build_rdg (loop, NULL); ++ gcc_checking_assert (flow_only_rdg != NULL); ++ tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts); ++ if (iv == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Loop %d no temp array insertion: failed to get" ++ " iteration variable.\n", loop->num); ++ } ++ free_rdg (flow_only_rdg); ++ return false; ++ } ++ auto_bitmap cut_points; ++ loop_vec_info vinfo = loop_vec_info_for_loop (loop); ++ unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo); ++ delete vinfo; ++ loop->aux = NULL; ++ if (n_cut_points == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Loop %d no temp array insertion: no cut points" ++ " found.\n", loop->num); ++ } ++ free_rdg (flow_only_rdg); ++ return false; ++ } ++ do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:" ++ " %d temp arrays inserted in Loop %d.\n", ++ n_cut_points, loop->num); ++ } ++ free_rdg (flow_only_rdg); ++ return true; ++} ++ ++/* Given LOOP, this function records seed statements for distribution in ++ WORK_LIST. Return false if there is nothing for distribution. */ ++ ++static bool ++find_seed_stmts_for_distribution (class loop *loop, vec *work_list) ++{ ++ basic_block *bbs = get_loop_body_in_dom_order (loop); ++ ++ /* Initialize the worklist with stmts we seed the partitions with. */ ++ for (unsigned i = 0; i < loop->num_nodes; ++i) ++ { ++ for (gphi_iterator gsi = gsi_start_phis (bbs[i]); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gphi *phi = gsi.phi (); ++ if (virtual_operand_p (gimple_phi_result (phi))) ++ continue; ++ /* Distribute stmts which have defs that are used outside of ++ the loop. */ ++ if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) ++ continue; ++ work_list->safe_push (phi); ++ } ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ ++ /* Ignore clobbers, they do not have true side effects. */ ++ if (gimple_clobber_p (stmt)) ++ continue; ++ ++ /* If there is a stmt with side-effects bail out - we ++ cannot and should not distribute this loop. */ ++ if (gimple_has_side_effects (stmt)) ++ { ++ free (bbs); ++ return false; ++ } ++ ++ /* Distribute stmts which have defs that are used outside of ++ the loop. */ ++ if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) ++ ; ++ /* Otherwise only distribute stores for now. */ ++ else if (!gimple_vdef (stmt)) ++ continue; ++ ++ work_list->safe_push (stmt); ++ } ++ } ++ free (bbs); ++ return work_list->length () > 0; ++} ++ ++/* Distributes the code from LOOP in such a way that producer statements ++ are placed before consumer statements. Tries to separate only the ++ statements from STMTS into separate loops. Returns the number of ++ distributed loops. Set NB_CALLS to number of generated builtin calls. ++ Set *DESTROY_P to whether LOOP needs to be destroyed. 
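++
++   When the loop is a candidate for temp-array insertion (see
++   may_insert_temp_arrays), temp arrays are inserted first, the seed STMTS
++   are recomputed and the RDG is rebuilt; should only a single partition
++   remain in the end, the inserted temp arrays are removed again.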
*/ ++ ++int ++loop_distribution::distribute_loop (class loop *loop, vec stmts, ++ control_dependences *cd, int *nb_calls, bool *destroy_p, ++ bool only_patterns_p) ++{ ++ ddrs_table = new hash_table (389); ++ struct graph *rdg; ++ partition *partition; ++ int i, nbp; ++ ++ *destroy_p = false; ++ *nb_calls = 0; ++ loop_nest.create (0); ++ if (!find_loop_nest (loop, &loop_nest)) ++ { ++ loop_nest.release (); ++ delete ddrs_table; ++ return 0; ++ } ++ ++ datarefs_vec.create (20); ++ has_nonaddressable_dataref_p = false; ++ rdg = build_rdg (loop, cd); ++ if (!rdg) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "Loop %d not distributed: failed to build the RDG.\n", ++ loop->num); ++ ++ loop_nest.release (); ++ free_data_refs (datarefs_vec); ++ delete ddrs_table; ++ return 0; ++ } ++ ++ if (datarefs_vec.length () > MAX_DATAREFS_NUM) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "Loop %d not distributed: too many memory references.\n", ++ loop->num); ++ ++ free_rdg (rdg); ++ loop_nest.release (); ++ free_data_refs (datarefs_vec); ++ delete ddrs_table; ++ return 0; ++ } ++ ++ /* Try to distribute LOOP if LOOP is simple enough and unable to vectorize. ++ If LOOP has grouped loads, recursively find isomorphic stmts and insert ++ temp arrays, rebuild RDG and call find_seed_stmts_for_distribution ++ to replace STMTS. */ ++ ++ hash_set tmp_array_vars; ++ ++ /* STMTs that define those inserted TMP_ARRAYs. */ ++ auto_bitmap producers; ++ ++ /* New SEED_STMTS after insertion. */ ++ auto_vec work_list; ++ bool insert_success = false; ++ if (may_insert_temp_arrays (loop, rdg, cd)) ++ { ++ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers)) ++ { ++ if (find_seed_stmts_for_distribution (loop, &work_list)) ++ { ++ insert_success = true; ++ stmts = work_list; ++ } ++ else ++ { ++ remove_insertion (loop, rdg, producers, NULL); ++ } ++ rebuild_rdg (loop, rdg, cd); ++ } ++ } ++ ++ data_reference_p dref; ++ for (i = 0; datarefs_vec.iterate (i, &dref); ++i) ++ dref->aux = (void *) (uintptr_t) i; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_rdg (dump_file, rdg); ++ ++ auto_vec partitions; ++ rdg_build_partitions (rdg, stmts, &partitions); ++ ++ auto_vec alias_ddrs; ++ ++ auto_bitmap stmt_in_all_partitions; ++ bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts); ++ for (i = 1; partitions.iterate (i, &partition); ++i) ++ bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); ++ ++ bool any_builtin = false; ++ bool reduction_in_all = false; ++ FOR_EACH_VEC_ELT (partitions, i, partition) ++ { ++ reduction_in_all ++ |= classify_partition (loop, rdg, partition, stmt_in_all_partitions); ++ any_builtin |= partition_builtin_p (partition); ++ } ++ ++ /* If we are only distributing patterns but did not detect any, ++ simply bail out. */ ++ if (only_patterns_p ++ && !any_builtin) ++ { ++ nbp = 0; ++ goto ldist_done; ++ } ++ ++ /* If we are only distributing patterns fuse all partitions that ++ were not classified as builtins. This also avoids chopping ++ a loop into pieces, separated by builtin calls. That is, we ++ only want no or a single loop body remaining. 
*/ ++ struct partition *into; ++ if (only_patterns_p) ++ { ++ for (i = 0; partitions.iterate (i, &into); ++i) ++ if (!partition_builtin_p (into)) ++ break; ++ for (++i; partitions.iterate (i, &partition); ++i) ++ if (!partition_builtin_p (partition)) ++ { ++ partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN); ++ partitions.unordered_remove (i); ++ partition_free (partition); ++ i--; ++ } ++ } ++ ++ /* Due to limitations in the transform phase we have to fuse all ++ reduction partitions into the last partition so the existing ++ loop will contain all loop-closed PHI nodes. */ ++ for (i = 0; partitions.iterate (i, &into); ++i) ++ if (partition_reduction_p (into)) ++ break; ++ for (i = i + 1; partitions.iterate (i, &partition); ++i) ++ if (partition_reduction_p (partition)) ++ { ++ partition_merge_into (rdg, into, partition, FUSE_REDUCTION); ++ partitions.unordered_remove (i); ++ partition_free (partition); ++ i--; ++ } ++ ++ /* Apply our simple cost model - fuse partitions with similar ++ memory accesses. */ ++ for (i = 0; partitions.iterate (i, &into); ++i) ++ { ++ bool changed = false; ++ if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) ++ continue; ++ for (int j = i + 1; ++ partitions.iterate (j, &partition); ++j) ++ { ++ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars)) ++ { ++ partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); ++ partitions.unordered_remove (j); ++ partition_free (partition); ++ j--; ++ changed = true; ++ } ++ } ++ /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar ++ accesses when 1 and 2 have similar accesses but not 0 and 1 ++ then in the next iteration we will fail to consider merging ++ 1 into 0,2. So try again if we did any merging into 0. */ ++ if (changed) ++ i--; + } + + /* Put a non-builtin partition last if we need to preserve a reduction. +@@ -3086,7 +4737,7 @@ loop_distribution::distribute_loop (class loop *loop, vec stmts, + } + } + +- finalize_partitions (loop, &partitions, &alias_ddrs); ++ finalize_partitions (loop, &partitions, &alias_ddrs, producers); + + /* If there is a reduction in all partitions make sure the last one + is not classified for builtin code generation. */ +@@ -3104,6 +4755,24 @@ loop_distribution::distribute_loop (class loop *loop, vec stmts, + } + + nbp = partitions.length (); ++ ++ /* If we have inserted TMP_ARRAYs but there is only one partition left in ++ the succeeding processes, remove those inserted TMP_ARRAYs back to the ++ original version. */ ++ ++ if (nbp == 1 && insert_success) ++ { ++ struct partition *partition = NULL; ++ partitions.iterate (0, &partition); ++ remove_insertion (loop, rdg, producers, partition); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:" ++ " unable to distribute loop %d.\n", loop->num); ++ } ++ } ++ + if (nbp == 0 + || (nbp == 1 && !partition_builtin_p (partitions[0])) + || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) +@@ -3169,62 +4838,6 @@ void loop_distribution::bb_top_order_destroy () + bb_top_order_index_size = 0; + } + +- +-/* Given LOOP, this function records seed statements for distribution in +- WORK_LIST. Return false if there is nothing for distribution. */ +- +-static bool +-find_seed_stmts_for_distribution (class loop *loop, vec *work_list) +-{ +- basic_block *bbs = get_loop_body_in_dom_order (loop); +- +- /* Initialize the worklist with stmts we seed the partitions with. 
*/ +- for (unsigned i = 0; i < loop->num_nodes; ++i) +- { +- for (gphi_iterator gsi = gsi_start_phis (bbs[i]); +- !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- gphi *phi = gsi.phi (); +- if (virtual_operand_p (gimple_phi_result (phi))) +- continue; +- /* Distribute stmts which have defs that are used outside of +- the loop. */ +- if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) +- continue; +- work_list->safe_push (phi); +- } +- for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); +- !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- gimple *stmt = gsi_stmt (gsi); +- +- /* Ignore clobbers, they do not have true side effects. */ +- if (gimple_clobber_p (stmt)) +- continue; +- +- /* If there is a stmt with side-effects bail out - we +- cannot and should not distribute this loop. */ +- if (gimple_has_side_effects (stmt)) +- { +- free (bbs); +- return false; +- } +- +- /* Distribute stmts which have defs that are used outside of +- the loop. */ +- if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) +- ; +- /* Otherwise only distribute stores for now. */ +- else if (!gimple_vdef (stmt)) +- continue; +- +- work_list->safe_push (stmt); +- } +- } +- free (bbs); +- return work_list->length () > 0; +-} +- + /* Given innermost LOOP, return the outermost enclosing loop that forms a + perfect loop nest. */ + +diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +index 899b56087..17bbf9e5d 100644 +--- a/gcc/tree-vect-loop.c ++++ b/gcc/tree-vect-loop.c +@@ -2545,6 +2545,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + unsigned n_stmts = 0; + machine_mode autodetected_vector_mode = VOIDmode; + opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); ++ /* Loop_vinfo for loop-distribution pass. */ ++ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL); + machine_mode next_vector_mode = VOIDmode; + poly_uint64 lowest_th = 0; + unsigned vectorized_loops = 0; +@@ -2633,6 +2635,13 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + if (res) + { + LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; ++ /* In loop-distribution pass, we only need to get loop_vinfo, do not ++ conduct further operations. */ ++ if (loop->processing_ldist) ++ { ++ loop->aux = (loop_vec_info) loop_vinfo; ++ return loop_vinfo; ++ } + vectorized_loops++; + + /* Once we hit the desired simdlen for the first time, +@@ -2724,7 +2733,19 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + } + else + { +- delete loop_vinfo; ++ /* If current analysis shows LOOP is unable to vectorize, loop_vinfo ++ will be deleted. If LOOP is under ldist analysis, backup it before ++ it is deleted and return it if all modes are analyzed and still ++ fail to vectorize. */ ++ if (loop->processing_ldist && (mode_i == vector_modes.length () ++ || autodetected_vector_mode == VOIDmode)) ++ { ++ fail_loop_vinfo = loop_vinfo; ++ } ++ else ++ { ++ delete loop_vinfo; ++ } + if (fatal) + { + gcc_checking_assert (first_loop_vinfo == NULL); +@@ -2773,6 +2794,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + return first_loop_vinfo; + } + ++ /* Return loop_vinfo for ldist if loop is unvectorizable. 
*/ ++ if (loop->processing_ldist && (mode_i == vector_modes.length () ++ || autodetected_vector_mode == VOIDmode)) ++ { ++ loop->aux = (loop_vec_info) fail_loop_vinfo; ++ return fail_loop_vinfo; ++ } ++ + return opt_loop_vec_info::propagate_failure (res); + } + +diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 088dd65cb..6b24c55d4 100644 +--- a/gcc/tree-vect-stmts.c ++++ b/gcc/tree-vect-stmts.c +@@ -2276,6 +2276,212 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype) + return NULL_TREE; + } + ++/* Find the loop with header containing STMT. */ ++ ++static loop_p ++find_in_loop_header (gimple *stmt) ++{ ++ loop_p loop = stmt->bb->loop_father; ++ if (loop == NULL) ++ return NULL; ++ ++ basic_block bb = loop->header; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ if (gsi_stmt (gsi) == stmt) ++ { ++ return loop; ++ } ++ } ++ return NULL; ++} ++ ++/* Check succeedor BB, BB without load is regarded as empty BB. Ignore empty ++ BB in DFS. */ ++ ++static unsigned ++get_bb_insns (basic_block bb, vec &stmts) ++{ ++ unsigned insns = 0; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ if (is_gimple_debug (stmt)) ++ { ++ continue; ++ } ++ if (is_gimple_assign (stmt) && gimple_has_mem_ops (stmt) ++ && !gimple_has_volatile_ops (stmt) ++ && TREE_CODE (gimple_assign_rhs1 (stmt)) == MEM_REF) ++ { ++ insns++; ++ stmts.safe_push (stmt); ++ } ++ } ++ return insns; ++} ++ ++static bool ++check_same_base (vec *datarefs, data_reference_p dr) ++{ ++ for (unsigned ui = 0; ui < datarefs->length (); ui++) ++ { ++ tree op1 = TREE_OPERAND (DR_BASE_OBJECT (dr), 0); ++ tree op2 = TREE_OPERAND (DR_BASE_OBJECT ((*datarefs)[ui]), 0); ++ if (TREE_CODE (op1) != TREE_CODE (op2)) ++ { ++ continue; ++ } ++ if (TREE_CODE (op1) == ADDR_EXPR) ++ { ++ op1 = TREE_OPERAND (op1, 0); ++ op2 = TREE_OPERAND (op2, 0); ++ } ++ enum tree_code code = TREE_CODE (op1); ++ switch (code) ++ { ++ case VAR_DECL: ++ if (DECL_NAME (op1) == DECL_NAME (op2) ++ && DR_IS_READ ((*datarefs)[ui])) ++ { ++ return true; ++ } ++ break; ++ case SSA_NAME: ++ if (SSA_NAME_VERSION (op1) == SSA_NAME_VERSION (op2) ++ && DR_IS_READ ((*datarefs)[ui])) ++ { ++ return true; ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ return false; ++} ++ ++/* Iterate all load STMTS, if staisfying same base vectorized stmt, then return, ++ Otherwise, set false to SUCCESS. */ ++ ++static void ++check_vec_use (loop_vec_info loop_vinfo, vec &stmts, ++ gimple *stmt, bool &success) ++{ ++ stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt); ++ if (stmt_vinfo == NULL) ++ { ++ success = false; ++ return; ++ } ++ if (DR_IS_READ (stmt_vinfo->dr_aux.dr)) ++ { ++ success = false; ++ return; ++ } ++ unsigned ui = 0; ++ gimple *candidate = NULL; ++ FOR_EACH_VEC_ELT (stmts, ui, candidate) ++ { ++ if (TREE_CODE (TREE_TYPE (gimple_get_lhs (candidate))) != VECTOR_TYPE) ++ { ++ continue; ++ } ++ loop_p cand_loop = find_in_loop_header (candidate); ++ if (cand_loop == NULL) ++ { ++ continue; ++ } ++ auto_vec datarefs; ++ tree res = find_data_references_in_bb ++ (cand_loop, cand_loop->header, &datarefs); ++ if (res == chrec_dont_know) ++ { ++ success = false; ++ return; ++ } ++ if (check_same_base (&datarefs, stmt_vinfo->dr_aux.dr)) ++ { ++ return; ++ } ++ } ++ success = false; ++} ++ ++/* Deep first search from present BB. If succeedor has load STMTS, ++ stop further searching. 
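++
++   Blocks already visited and the loop latch are skipped; a block without
++   any load is treated as empty and its successors are searched instead,
++   while reaching the exit block of the function clears SUCCESS.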
*/ ++ ++static void ++dfs_check_bb (loop_vec_info loop_vinfo, basic_block bb, gimple *stmt, ++ bool &success, vec &visited_bbs) ++{ ++ if (bb == cfun->cfg->x_exit_block_ptr) ++ { ++ success = false; ++ return; ++ } ++ if (!success || visited_bbs.contains (bb) || bb == loop_vinfo->loop->latch) ++ { ++ return; ++ } ++ visited_bbs.safe_push (bb); ++ auto_vec stmts; ++ unsigned insns = get_bb_insns (bb, stmts); ++ /* Empty BB. */ ++ if (insns == 0) ++ { ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ dfs_check_bb (loop_vinfo, e->dest, stmt, success, visited_bbs); ++ if (!success) ++ { ++ return; ++ } ++ } ++ return; ++ } ++ /* Non-empty BB. */ ++ check_vec_use (loop_vinfo, stmts, stmt, success); ++} ++ ++/* For grouped store, if all succeedors of present BB have vectorized load ++ from same base of store. If so, set memory_access_type using ++ VMAT_CONTIGUOUS_PERMUTE instead of VMAT_LOAD_STORE_LANES. */ ++ ++static bool ++conti_perm (gimple *stmt, loop_vec_info loop_vinfo) ++{ ++ if (gimple_code (stmt) != GIMPLE_ASSIGN) ++ { ++ return false; ++ } ++ stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt); ++ if (DR_IS_READ (stmt_vinfo->dr_aux.dr)) ++ { ++ return false; ++ } ++ tree lhs = gimple_get_lhs (stmt); ++ if (TREE_CODE (lhs) != ARRAY_REF) ++ { ++ return false; ++ } ++ basic_block bb = stmt->bb; ++ bool success = true; ++ auto_vec visited_bbs; ++ visited_bbs.safe_push (bb); ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ dfs_check_bb (loop_vinfo, e->dest, stmt, success, visited_bbs); ++ } ++ return success; ++} ++ + /* A subroutine of get_load_store_type, with a subset of the same + arguments. Handle the case where STMT_INFO is part of a grouped load + or store. +@@ -2426,10 +2632,20 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp, + } + + /* If that fails, try using permuting loads. */ +- else if (vls_type == VLS_LOAD ++ if ((*memory_access_type == VMAT_ELEMENTWISE ++ /* For vectorized loops, if iteration time is one, try to ++ analyze if we can use contiguous permute instead of ++ load/stroe lane. */ ++ || (!loop_vinfo->loop->processing_ldist ++ && *memory_access_type == VMAT_LOAD_STORE_LANES ++ && TREE_CODE (loop_vinfo->num_iters) == INTEGER_CST ++ && maybe_eq (tree_to_shwi (loop_vinfo->num_iters), ++ loop_vinfo->vectorization_factor) ++ && conti_perm (stmt_info->stmt, loop_vinfo))) ++ && (vls_type == VLS_LOAD + ? vect_grouped_load_supported (vectype, single_element_p, + group_size) +- : vect_grouped_store_supported (vectype, group_size)) ++ : vect_grouped_store_supported (vectype, group_size))) + { + *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; + overrun_p = would_overrun_p; +-- +2.27.0.windows.1 + diff --git a/0047-loop-invariant-Add-option-to-limit-count-check-in-lo.patch b/0047-loop-invariant-Add-option-to-limit-count-check-in-lo.patch new file mode 100644 index 0000000000000000000000000000000000000000..facfe42fb406318d4034f16e0925fa86494f349c --- /dev/null +++ b/0047-loop-invariant-Add-option-to-limit-count-check-in-lo.patch @@ -0,0 +1,51 @@ +From 3443ed7be91c267460d79845004008a27bcb7f39 Mon Sep 17 00:00:00 2001 +From: zhaowenyu <804544223@qq.com> +Date: Fri, 24 Jun 2022 01:50:55 +0800 +Subject: [PATCH] [loop-invariant] Add option to limit count check in + loop-invariant motion + +backport Optimize for loop invariant motion introduce performance recession +on 500.perlbench_r(-3.06%), 525.x264_r(-0.76%).So option -flim-count-check to +limits its impact. 
+ +diff --git a/gcc/common.opt b/gcc/common.opt +index b5ea3c7a1..738311fa2 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -3511,4 +3511,9 @@ fipa-ra + Common Report Var(flag_ipa_ra) Optimization + Use caller save register across calls if possible. + ++flim-count-check ++Common Report Var(flag_lim_count_check) Init(1) Optimization ++Limit count check in loop-invariant. ++ ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c +index 24b9bcb11..262683f01 100644 +--- a/gcc/loop-invariant.c ++++ b/gcc/loop-invariant.c +@@ -1192,7 +1192,7 @@ find_invariants_bb (class loop *loop, basic_block bb, bool always_reached, + + /* Don't move insn of cold BB out of loop to preheader to reduce calculations + and register live range in hot loop with cold BB. */ +- if (!always_executed && preheader->count > bb->count) ++ if (!always_executed && preheader->count > bb->count && flag_lim_count_check) + { + if (dump_file) + fprintf (dump_file, "Don't move invariant from bb: %d out of loop %d\n", +diff --git a/gcc/testsuite/gcc.dg/loop-invariant-2.c b/gcc/testsuite/gcc.dg/loop-invariant-2.c +index df3d84585..3bfaecc81 100644 +--- a/gcc/testsuite/gcc.dg/loop-invariant-2.c ++++ b/gcc/testsuite/gcc.dg/loop-invariant-2.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-rtl-loop2_invariant" } */ ++/* { dg-options "-O2 -fdump-rtl-loop2_invariant -flim-count-check" } */ + + volatile int x; + void +-- +2.27.0 + diff --git a/0048-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch b/0048-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..5efbba772836eeac6962c4b46b865e34f6076321 --- /dev/null +++ b/0048-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch @@ -0,0 +1,1927 @@ +From 04f1cc6e0f0d23000e2d92269b035329c73bb306 Mon Sep 17 00:00:00 2001 +From: dingguangya +Date: Thu, 2 Jun 2022 12:48:17 +0800 +Subject: [PATCH] [ArrayWidenCompare] Add a new optimization for array + comparison scenarios + +Add option farray-widen-compare. +For an array pointer whose element is a single-byte type, +by changing the pointer type to a long-byte type, the elements +can be combined and compared after loading. + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 23394c64b..2b2bf474a 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1591,6 +1591,7 @@ OBJS = \ + tree-ssa-loop-ivopts.o \ + tree-ssa-loop-manip.o \ + tree-ssa-loop-niter.o \ ++ tree-ssa-loop-array-widen-compare.o \ + tree-ssa-loop-prefetch.o \ + tree-ssa-loop-split.o \ + tree-ssa-loop-unswitch.o \ +diff --git a/gcc/common.opt b/gcc/common.opt +index 24834cf60..7e466bae7 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1060,6 +1060,10 @@ fasynchronous-unwind-tables + Common Report Var(flag_asynchronous_unwind_tables) Optimization + Generate unwind tables that are exact at each instruction boundary. + ++farray-widen-compare ++Common Report Var(flag_array_widen_compare) Optimization ++Extends types for pointers to arrays to improve array comparsion performance. ++ + fauto-inc-dec + Common Report Var(flag_auto_inc_dec) Init(1) Optimization + Generate auto-inc/dec instructions. +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 4b0fd2ffb..a73880e75 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -459,7 +459,7 @@ Objective-C and Objective-C++ Dialects}. 
+ -falign-loops[=@var{n}[:@var{m}:[@var{n2}[:@var{m2}]]]] @gol + -fno-allocation-dce -fallow-store-data-races @gol + -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol +--fauto-inc-dec -fbranch-probabilities @gol ++-farray-widen-compare -fauto-inc-dec -fbranch-probabilities @gol + -fcaller-saves @gol + -fcombine-stack-adjustments -fconserve-stack @gol + -fcompare-elim -fcprop-registers -fcrossjumping @gol +@@ -9710,6 +9710,16 @@ This pass is always skipped on architectures that do not have + instructions to support this. Enabled by default at @option{-O} and + higher on architectures that support this. + ++@item -farray-widen-compare ++@opindex farray-widen-compare ++In the narrow-byte array comparison scenario, the types of pointers ++pointing to array are extended so that elements of multiple bytes can ++be loaded at a time when a wide type is used to dereference an array, ++thereby improving the performance of this comparison scenario. ++ ++This option may generate better or worse code; results are highly dependent ++on the structure of loops within the source code. ++ + @item -fdce + @opindex fdce + Perform dead code elimination (DCE) on RTL@. +diff --git a/gcc/passes.def b/gcc/passes.def +index e9c91d26e..797b803ca 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -91,6 +91,7 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_dse); + NEXT_PASS (pass_cd_dce); + NEXT_PASS (pass_phiopt, true /* early_p */); ++ NEXT_PASS (pass_array_widen_compare); + NEXT_PASS (pass_tail_recursion); + NEXT_PASS (pass_convert_switch); + NEXT_PASS (pass_cleanup_eh); +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c +new file mode 100644 +index 000000000..27b69b0e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-1.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (++len != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 1 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c +new file mode 100644 +index 000000000..d102364f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-2.c +@@ -0,0 +1,90 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define EMPTY_HASH_VALUE 0 ++#define my_min(x, y) ((x) < (y) ? 
(x) : (y)) ++#define true 1 ++ ++typedef struct { ++ uint32_t len; ++ uint32_t dist; ++} lzma_match; ++ ++ ++lzma_match * ++func ( ++ const uint32_t len_limit, ++ const uint32_t pos, ++ const uint8_t *const cur, ++ uint32_t cur_match, ++ uint32_t depth, ++ uint32_t *const son, ++ const uint32_t cyclic_pos, ++ const uint32_t cyclic_size, ++ lzma_match *matches, ++ uint32_t len_best) ++{ ++ uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; ++ uint32_t *ptr1 = son + (cyclic_pos << 1); ++ ++ uint32_t len0 = 0; ++ uint32_t len1 = 0; ++ ++ while (true) ++ { ++ const uint32_t delta = pos - cur_match; ++ if (depth-- == 0 || delta >= cyclic_size) ++ { ++ *ptr0 = EMPTY_HASH_VALUE; ++ *ptr1 = EMPTY_HASH_VALUE; ++ return matches; ++ } ++ ++ uint32_t *const pair = son + ((cyclic_pos - delta + (delta > cyclic_pos ? cyclic_size : 0)) << 1); ++ ++ const uint8_t *const pb = cur -delta; ++ uint32_t len = my_min(len0, len1); ++ ++ if (pb[len] == cur[len]) ++ { ++ while (++len != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ ++ if (len_best < len) ++ { ++ len_best = len; ++ matches->len = len; ++ matches->dist = delta - 1; ++ ++matches; ++ ++ if (len == len_limit) ++ { ++ *ptr1 = pair[0]; ++ *ptr0 = pair[1]; ++ return matches; ++ } ++ } ++ } ++ ++ if (pb[len] < cur[len]) ++ { ++ *ptr1 = cur_match; ++ ptr1 = pair + 1; ++ cur_match = *ptr1; ++ len1 = len; ++ } ++ else ++ { ++ *ptr0 = cur_match; ++ ptr0 = pair; ++ cur_match = *ptr0; ++ len0 = len; ++ } ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 1 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c +new file mode 100644 +index 000000000..52dd6b02b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-3.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len != len_limit) ++ { ++ if (pb[len] != cur[len]) ++ break; ++ len = len + 1; ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 1 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c +new file mode 100644 +index 000000000..d3185d326 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-4.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? 
(x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len != len_limit) ++ { ++ if (pb[len] != cur[len]) ++ break; ++ len = len + 2; ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c +new file mode 100644 +index 000000000..9743dc623 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-5.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (++len != len_limit) ++ if (pb[len] != cur[len-1]) ++ break; ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c +new file mode 100644 +index 000000000..2323d5bf7 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-6.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len++ != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c +new file mode 100644 +index 000000000..33db62fa4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-7.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? (x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (len != len_limit) ++ { ++ len = len + 1; ++ if (pb[len] != cur[len]) ++ break; ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c +new file mode 100644 +index 000000000..8c96d24a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/awiden-compare-8.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -farray-widen-compare -fdump-tree-awiden_compare-details" } */ ++ ++#include ++#include ++ ++#define my_min(x, y) ((x) < (y) ? 
(x) : (y)) ++ ++uint32_t ++func (uint32_t len0, uint32_t len1, const uint32_t len_limit, const uint8_t *const pb, const uint8_t *const cur) ++{ ++ uint32_t len = my_min(len0, len1); ++ while (++len != len_limit) ++ { ++ if (pb[len] != cur[len]) ++ { ++ len = len - 1; ++ break; ++ } ++ } ++ return len; ++} ++ ++/* { dg-final { scan-tree-dump-times "loop form is success" 0 "awiden_compare"} } */ +diff --git a/gcc/timevar.def b/gcc/timevar.def +index e873747a8..6d90bb6e1 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -215,6 +215,7 @@ DEFTIMEVAR (TV_TREE_NRV , "tree NRV optimization") + DEFTIMEVAR (TV_TREE_COPY_RENAME , "tree rename SSA copies") + DEFTIMEVAR (TV_TREE_SSA_VERIFY , "tree SSA verifier") + DEFTIMEVAR (TV_TREE_STMT_VERIFY , "tree STMT verifier") ++DEFTIMEVAR (TV_TREE_ARRAY_WIDEN_COMPARE, "tree array widen compare") + DEFTIMEVAR (TV_TREE_SWITCH_CONVERSION, "tree switch conversion") + DEFTIMEVAR (TV_TREE_SWITCH_LOWERING, "tree switch lowering") + DEFTIMEVAR (TV_TREE_RECIP , "gimple CSE reciprocals") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index be6387768..aca0b83f2 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -436,6 +436,7 @@ extern gimple_opt_pass *make_pass_cselim (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_phiopt (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_forwprop (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_phiprop (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_array_widen_compare (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_ifcombine (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_dse (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_nrv (gcc::context *ctxt); +diff --git a/gcc/tree-ssa-loop-array-widen-compare.c b/gcc/tree-ssa-loop-array-widen-compare.c +new file mode 100644 +index 000000000..baa41a655 +--- /dev/null ++++ b/gcc/tree-ssa-loop-array-widen-compare.c +@@ -0,0 +1,1528 @@ ++/* Array widen compare. ++ Copyright (C) 2022-2022 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. 
*/ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "tree.h" ++#include "gimple.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfganal.h" ++#include "cfgloop.h" ++#include "gimple-pretty-print.h" ++#include "tree-cfg.h" ++#include "cgraph.h" ++#include "print-tree.h" ++#include "cfghooks.h" ++#include "gimple-fold.h" ++ ++/* This pass handles scenarios similar to the following: ++ ++ uint32_t ++ func (uint32_t len0, uint32_t len1, const uint32_t len_limit, ++ const uint8_t *const pb, const uint8_t *const cur) ++ { ++ uint32_t len = my_min (len0, len1); ++ while (++len != len_limit) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++ } ++ ++ Features of this type of loop: ++ 1) the loop has two exits; ++ 2) One of the exits comes from the comparison result of the array; ++ ++ From the source code point of view, the pass completes the conversion of the ++ above scenario into: ++ ++ uint32_t ++ func (uint32_t len0, uint32_t len1, const uint32_t len_limit, ++ const uint8_t *const pb, const uint8_t *const cur) ++ { ++ uint32_t len = my_min (len0, len1); ++ // align_loop ++ for(++len; len + sizeof(uint64_t) <= len_limit; len += sizeof (uint64_t)) ++ { ++ uint64_t a = *((uint64_t*)(cur+len)); ++ uint64_t b = *((uint64_t*)(pb+len)); ++ if (a != b) ++ { ++ int lz = __builtin_ctzll (a ^ b); ++ len += lz / 8; ++ return len; ++ } ++ } ++ // epilogue_loop ++ for (;len != len_limit; ++len) ++ if (pb[len] != cur[len]) ++ break; ++ return len; ++ } ++ ++ This pass is to complete the conversion of such scenarios from the internal ++ perspective of the compiler: ++ 1) determine_loop_form: The function completes the screening of such ++ scenarios; ++ 2) convert_to_new_loop: The function completes the conversion of ++ origin_loop to new loops, and removes origin_loop; ++ 3) origin_loop_info: The structure is used to record important information ++ of origin_loop: such as loop exit, growth step size ++ of loop induction variable, initial value ++ of induction variable, etc; ++ 4) create_new_loops: The function is used as the key content of the pass ++ to complete the creation of new loops. */ ++ ++/* The useful information of origin loop. */ ++ ++struct origin_loop_info ++{ ++ tree base; /* The initial index of the array in the old loop. */ ++ tree limit; /* The limit index of the array in the old loop. */ ++ tree arr1; /* Array 1 in the old loop. */ ++ tree arr2; /* Array 2 in the old loop. */ ++ edge entry_edge; /* The edge into the old loop. */ ++ basic_block exit_bb1; ++ basic_block exit_bb2; ++ edge exit_e1; ++ edge exit_e2; ++ gimple *cond_stmt1; ++ gimple *cond_stmt2; ++ gimple *update_stmt; ++ bool exist_prolog_assgin; ++ /* Whether the marker has an initial value assigned ++ to the array index. */ ++ unsigned HOST_WIDE_INT step; ++ /* The growth step of the loop induction variable. */ ++}; ++ ++typedef struct origin_loop_info origin_loop_info; ++ ++static origin_loop_info origin_loop; ++hash_map defs_map; ++ ++/* Dump the bb information in a loop. 
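++   The dump is only emitted when TDF_DETAILS is requested, i.e. with
++   -fdump-tree-awiden_compare-details as used by the tests above; each bb
++   of the candidate loop is printed in dominance order together with its
++   GIMPLE.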
*/ ++ ++static void ++dump_loop_bb (struct loop *loop) ++{ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ basic_block bb = NULL; ++ ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "===== the %dth bb of loop ==========:\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ free (body); ++} ++ ++/* Return true if the loop has precisely one backedge. */ ++ ++static bool ++loop_single_backedge_p (class loop *loop) ++{ ++ basic_block latch = loop->latch; ++ if (!single_succ_p (latch)) ++ return false; ++ ++ edge e = single_succ_edge (latch); ++ edge backedge = find_edge (latch, loop->header); ++ ++ if (e != backedge) ++ return false; ++ ++ return true; ++} ++ ++/* Return true if the loop has precisely one preheader BB. */ ++ ++static bool ++loop_single_preheader_bb (class loop *loop) ++{ ++ basic_block header = loop->header; ++ if (EDGE_COUNT (header->preds) != 2) ++ return false; ++ ++ edge e1 = EDGE_PRED (header, 0); ++ edge e2 = EDGE_PRED (header, 1); ++ ++ if ((e1->src == loop->latch && e2->src->loop_father != loop) ++ || (e2->src == loop->latch && e1->src->loop_father != loop)) ++ return true; ++ ++ return false; ++} ++ ++/* Initialize the origin_loop structure. */ ++static void ++init_origin_loop_structure () ++{ ++ origin_loop.base = NULL; ++ origin_loop.limit = NULL; ++ origin_loop.arr1 = NULL; ++ origin_loop.arr2 = NULL; ++ origin_loop.exit_e1 = NULL; ++ origin_loop.exit_e2 = NULL; ++ origin_loop.exit_bb1 = NULL; ++ origin_loop.exit_bb2 =NULL; ++ origin_loop.entry_edge = NULL; ++ origin_loop.cond_stmt1 = NULL; ++ origin_loop.cond_stmt2 = NULL; ++ origin_loop.update_stmt = NULL; ++ origin_loop.exist_prolog_assgin = false; ++ origin_loop.step = 0; ++} ++ ++/* Get the edge that first entered the loop. */ ++ ++static edge ++get_loop_preheader_edge (class loop *loop) ++{ ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, loop->header->preds) ++ if (e->src != loop->latch) ++ break; ++ ++ if (!e) ++ { ++ gcc_assert (!loop_outer (loop)); ++ return single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); ++ } ++ ++ return e; ++} ++ ++/* Make sure the exit condition stmt satisfies a specific form. */ ++ ++static bool ++check_cond_stmt (gimple *stmt) ++{ ++ if (!stmt) ++ return false; ++ if (gimple_code (stmt) != GIMPLE_COND) ++ return false; ++ ++ if (gimple_cond_code (stmt) != NE_EXPR && gimple_cond_code (stmt) != EQ_EXPR) ++ return false; ++ ++ tree lhs = gimple_cond_lhs (stmt); ++ tree rhs = gimple_cond_rhs (stmt); ++ ++ /* The parameter that does not support the cond statement is not SSA_NAME. ++ eg: if (len_1 != 100). */ ++ if (TREE_CODE (lhs) != SSA_NAME || TREE_CODE (rhs) != SSA_NAME) ++ return false; ++ ++ return true; ++} ++ ++/* Record the exit information in the original loop including exit edge, ++ exit bb block, exit condition stmt, ++ eg: exit_eX origin_exit_bbX cond_stmtX. 
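++   For the loop form shown at the top of this file, cond_stmt1 is the
++   header compare (len != len_limit), whose exit edge leaves the loop when
++   the limit is reached, and cond_stmt2 is the body compare
++   (pb[len] != cur[len]), whose exit edge leaves the loop on the first
++   mismatch; both exit edges must reach the same exit bb (see
++   check_exit_bb).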
*/ ++ ++static bool ++record_origin_loop_exit_info (class loop *loop) ++{ ++ bool found = false; ++ edge e = NULL; ++ unsigned i = 0; ++ gimple *stmt; ++ ++ if (origin_loop.exit_e1 != NULL || origin_loop.exit_bb1 != NULL ++ || origin_loop.exit_e2 != NULL || origin_loop.exit_bb2 != NULL ++ || origin_loop.cond_stmt1 != NULL || origin_loop.cond_stmt2 != NULL) ++ return false; ++ ++ vec exit_edges = get_loop_exit_edges (loop); ++ if (exit_edges == vNULL) ++ return false; ++ ++ if (exit_edges.length () != 2) ++ goto fail; ++ ++ FOR_EACH_VEC_ELT (exit_edges, i, e) ++ { ++ if (e->src == loop->header) ++ { ++ origin_loop.exit_e1 = e; ++ origin_loop.exit_bb1 = e->dest; ++ stmt = gsi_stmt (gsi_last_bb (e->src)); ++ if (check_cond_stmt (stmt)) ++ origin_loop.cond_stmt1 = stmt; ++ } ++ else ++ { ++ origin_loop.exit_e2 = e; ++ origin_loop.exit_bb2 = e->dest; ++ stmt = gsi_stmt (gsi_last_bb (e->src)); ++ if (check_cond_stmt (stmt)) ++ origin_loop.cond_stmt2 = stmt; ++ } ++ } ++ ++ if (origin_loop.exit_e1 != NULL && origin_loop.exit_bb1 != NULL ++ && origin_loop.exit_e2 != NULL && origin_loop.exit_bb2 != NULL ++ && origin_loop.cond_stmt1 != NULL && origin_loop.cond_stmt2 != NULL) ++ found = true; ++ ++fail: ++ exit_edges.release (); ++ return found; ++} ++ ++/* Get origin loop induction variable upper bound. */ ++ ++static bool ++get_iv_upper_bound (gimple *stmt) ++{ ++ if (origin_loop.limit != NULL) ++ return false; ++ ++ tree lhs = gimple_cond_lhs (stmt); ++ tree rhs = gimple_cond_rhs (stmt); ++ ++ if (TREE_CODE (TREE_TYPE (lhs)) != INTEGER_TYPE ++ || TREE_CODE (TREE_TYPE (rhs)) != INTEGER_TYPE) ++ return false; ++ ++ gimple *g = SSA_NAME_DEF_STMT (rhs); ++ ++ if (SSA_NAME_VAR (rhs) && TREE_CODE (SSA_NAME_VAR (rhs)) == PARM_DECL ++ && g && gimple_code (g) == GIMPLE_NOP ++ && (SSA_NAME_VAR (lhs) && TREE_CODE (SSA_NAME_VAR (lhs)) != PARM_DECL)) ++ { ++ origin_loop.limit = rhs; ++ } ++ else ++ return false; ++ ++ if (origin_loop.limit != NULL) ++ return true; ++ ++ return false; ++} ++ ++/* Returns true only when the expression on the rhs code of stmt is PLUS_EXPR, ++ rhs1 is SSA_NAME with the same var as origin_loop base, and rhs2 is ++ INTEGER_CST. */ ++ ++static bool ++check_update_stmt (gimple *stmt) ++{ ++ if (!stmt) ++ return false; ++ ++ if (gimple_assign_rhs_code (stmt) == PLUS_EXPR) ++ { ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ if (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == INTEGER_CST ++ && SSA_NAME_VAR (rhs1) == SSA_NAME_VAR (origin_loop.base)) ++ { ++ origin_loop.step = tree_to_uhwi (rhs2); ++ if (origin_loop.step == 1) ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Get origin loop induction variable initial value. 
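++   The base is taken from the PHI node in the loop header: the argument on
++   entry_edge gives the initial value, and the argument on the back edge
++   must be defined by the update statement.  With illustrative SSA names:
++     # len_1 = PHI <len_0 (preheader), len_2 (latch)>
++   Here origin_loop.base is len_0, and len_2 must come from
++   len_2 = len_1 + 1 (see check_update_stmt; only step 1 is accepted).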
*/ ++ ++static bool ++get_iv_base (gimple *stmt) ++{ ++ tree lhs = gimple_cond_lhs (stmt); ++ if (origin_loop.base != NULL || origin_loop.update_stmt != NULL) ++ return false; ++ ++ basic_block header = gimple_bb (stmt); ++ ++ gphi_iterator gsi; ++ edge e; ++ edge_iterator ei; ++ tree iv_after; ++ ++ for (gsi = gsi_start_phis (header); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gphi *phi = gsi.phi (); ++ tree res = gimple_phi_result (phi); ++ if (!SSA_NAME_VAR (res) || SSA_NAME_VAR (res) != SSA_NAME_VAR (lhs)) ++ continue; ++ tree base = PHI_ARG_DEF_FROM_EDGE (phi, origin_loop.entry_edge); ++ if (!base || !SSA_NAME_VAR (base) ++ || SSA_NAME_VAR (base) != SSA_NAME_VAR (lhs)) ++ return false; ++ origin_loop.base = base; ++ FOR_EACH_EDGE (e, ei, header->preds) ++ { ++ if (e != origin_loop.entry_edge) ++ { ++ iv_after = PHI_ARG_DEF_FROM_EDGE (phi, e); ++ gimple *update = SSA_NAME_DEF_STMT (iv_after); ++ if (!check_update_stmt (update)) ++ return false; ++ origin_loop.update_stmt = update; ++ if (gimple_bb (update) == header && iv_after == lhs) ++ origin_loop.exist_prolog_assgin = true; ++ } ++ } ++ } ++ ++ if (origin_loop.base != NULL && origin_loop.update_stmt != NULL) ++ return true; ++ ++ return false; ++} ++ ++/* Record the upper bound and initial value of the induction variable in the ++ original loop; When prolog_assign is present, make sure loop header is in ++ simple form; And the interpretation of prolog_assign is as follows: ++ eg: while (++len != limit) ++ ...... ++ For such a loop, ++len will be processed before entering header_bb, and the ++ assign is regarded as the prolog_assign of the loop. */ ++ ++static bool ++record_origin_loop_header (class loop *loop) ++{ ++ basic_block header = loop->header; ++ ++ if (origin_loop.entry_edge != NULL || origin_loop.base != NULL ++ || origin_loop.update_stmt != NULL || origin_loop.limit != NULL) ++ return false; ++ origin_loop.entry_edge = get_loop_preheader_edge (loop); ++ ++ gimple_stmt_iterator gsi; ++ gimple *stmt; ++ ++ for (gsi = gsi_last_bb (header); !gsi_end_p (gsi); gsi_prev (&gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt && is_gimple_debug (stmt)) ++ continue; ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ if (!get_iv_upper_bound (stmt)) ++ return false; ++ if (!get_iv_base (stmt)) ++ return false; ++ } ++ else if (stmt && gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ if (stmt != origin_loop.update_stmt || !origin_loop.exist_prolog_assgin) ++ return false; ++ } ++ else ++ return false; ++ } ++ ++ if (origin_loop.entry_edge != NULL && origin_loop.base != NULL ++ && origin_loop.update_stmt != NULL && origin_loop.limit != NULL) ++ return true; ++ ++ return false; ++} ++ ++/* When prolog_assign does not exist, make sure that update_stmt exists in the ++ loop latch, and its form is a specific form, eg: ++ len_2 = len_1 + 1. */ ++ ++static bool ++record_origin_loop_latch (class loop *loop) ++{ ++ basic_block latch = loop->latch; ++ gimple_stmt_iterator gsi; ++ gimple *stmt; ++ ++ gsi = gsi_start_bb (latch); ++ ++ if (origin_loop.exist_prolog_assgin) ++ { ++ if (gsi_end_p (gsi)) ++ return true; ++ } ++ else ++ { ++ if (gsi_one_before_end_p (gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt == origin_loop.update_stmt) ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Returns true when the DEF_STMT corresponding to arg0 of the mem_ref tree ++ satisfies the POINTER_PLUS_EXPR type. 
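++   Illustrative GIMPLE (SSA names made up):
++     _2 = pb_13 (D) + _1;
++     _3 = MEM[(const uint8_t *)_2];
++   Here arg0 of the MEM_REF is _2, whose defining statement is the
++   POINTER_PLUS_EXPR, and arg1 (the offset) must be the constant 0.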
*/ ++ ++static bool ++check_body_mem_ref (tree mem_ref) ++{ ++ tree arg0 = TREE_OPERAND (mem_ref , 0); ++ tree arg1 = TREE_OPERAND (mem_ref , 1); ++ ++ if (TREE_CODE (TREE_TYPE (arg0)) == POINTER_TYPE ++ && TREE_CODE (arg1) == INTEGER_CST ++ && tree_to_uhwi (arg1) == 0) ++ { ++ gimple *tmp_g = SSA_NAME_DEF_STMT (arg0); ++ if (tmp_g && gimple_assign_rhs_code (tmp_g) == POINTER_PLUS_EXPR) ++ return true; ++ } ++ return false; ++} ++ ++/* Returns true if the rh2 of the current stmt comes from the base in the ++ original loop. */ ++ ++static bool ++check_body_pointer_plus (gimple *stmt, tree &tmp_index) ++{ ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ if (TREE_CODE (TREE_TYPE (rhs1)) == POINTER_TYPE) ++ { ++ gimple *g = SSA_NAME_DEF_STMT (rhs2); ++ if (g && gimple_assign_rhs_code (g) == NOP_EXPR) ++ { ++ tree nop_rhs = gimple_assign_rhs1 (g); ++ if (SSA_NAME_VAR (nop_rhs) ++ && SSA_NAME_VAR (nop_rhs) == SSA_NAME_VAR (origin_loop.base)) ++ { ++ if (!origin_loop.arr1) ++ { ++ origin_loop.arr1 = rhs1; ++ tmp_index = rhs2; ++ } ++ else if (!origin_loop.arr2) ++ { ++ origin_loop.arr2 = rhs1; ++ if (tmp_index != rhs2) ++ return false; ++ } ++ else ++ return false; ++ return true; ++ } ++ } ++ } ++ return false; ++} ++ ++/* Record the array comparison information in the original loop, while ensuring ++ that there are only statements related to cont_stmt in the loop body. */ ++ ++static bool ++record_origin_loop_body (class loop *loop) ++{ ++ basic_block body = gimple_bb (origin_loop.cond_stmt2); ++ ++ if (origin_loop.arr1 != NULL || origin_loop.arr2 != NULL) ++ return false; ++ ++ gimple_stmt_iterator gsi; ++ for (gsi = gsi_start_bb (body); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple_set_visited (gsi_stmt (gsi), false); ++ } ++ ++ tree cond_lhs = gimple_cond_lhs (origin_loop.cond_stmt2); ++ tree cond_rhs = gimple_cond_rhs (origin_loop.cond_stmt2); ++ if (TREE_CODE (TREE_TYPE (cond_lhs)) != INTEGER_TYPE ++ || TREE_CODE (TREE_TYPE (cond_rhs)) != INTEGER_TYPE) ++ return false; ++ ++ auto_vec stack; ++ tree tmp_index = NULL; ++ stack.safe_push (cond_lhs); ++ stack.safe_push (cond_rhs); ++ gimple_set_visited (origin_loop.cond_stmt2, true); ++ ++ while (!stack.is_empty ()) ++ { ++ tree op = stack.pop (); ++ gimple *g = SSA_NAME_DEF_STMT (op); ++ if (!g || gimple_bb (g) != body || !is_gimple_assign (g)) ++ continue; ++ gimple_set_visited (g, true); ++ if (gimple_assign_rhs_code (g) == MEM_REF) ++ { ++ tree mem_ref = gimple_assign_rhs1 (g); ++ if (!check_body_mem_ref (mem_ref)) ++ return false; ++ stack.safe_push (TREE_OPERAND (mem_ref , 0)); ++ } ++ else if (gimple_assign_rhs_code (g) == POINTER_PLUS_EXPR) ++ { ++ tree rhs2 = gimple_assign_rhs2 (g); ++ if (!check_body_pointer_plus (g, tmp_index)) ++ return false; ++ stack.safe_push (rhs2); ++ } ++ else if (gimple_assign_rhs_code (g) == NOP_EXPR) ++ { ++ tree rhs = gimple_assign_rhs1 (g); ++ if (!SSA_NAME_VAR (rhs) ++ || SSA_NAME_VAR (rhs) != SSA_NAME_VAR (origin_loop.base)) ++ return false; ++ stack.safe_push (rhs); ++ } ++ else ++ return false; ++ } ++ bool allvisited = true; ++ for (gsi = gsi_start_bb (body); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ if (!gimple_visited_p (gsi_stmt (gsi)) ++ && !is_gimple_debug (gsi_stmt (gsi))) ++ allvisited = false; ++ } ++ if (allvisited) ++ { ++ if (origin_loop.arr1 != NULL && origin_loop.arr2 != NULL) ++ return true; ++ } ++ return false; ++} ++ ++/* Dump the original loop information to see if the origin loop ++ form matches. 
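++   The dump prints origin_loop.limit, base, arr1, arr2 and the recorded
++   cond/update statements; arr1 and arr2 are the two array bases collected
++   by record_origin_loop_body while walking backwards from the operands of
++   cond_stmt2, so a failed match can be diagnosed from the details dump.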
*/ ++ ++static void ++dump_origin_loop_info () ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nThe origin loop info:\n"); ++ fprintf (dump_file, "\n the origin_loop.limit is:\n"); ++ print_node (dump_file, "", origin_loop.limit, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.base is:\n"); ++ print_node (dump_file, "", origin_loop.base, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.arr1 is:\n"); ++ print_node (dump_file, "", origin_loop.arr1, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.arr2 is:\n"); ++ print_node (dump_file, "", origin_loop.arr2, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.cond_stmt1 is:\n"); ++ print_gimple_stmt (dump_file, origin_loop.cond_stmt1, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.cond_stmt2 is:\n"); ++ print_gimple_stmt (dump_file, origin_loop.cond_stmt2, 0); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "\n the origin_loop.update_stmt is:\n"); ++ print_gimple_stmt (dump_file, origin_loop.update_stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Returns true only if the exit bb of the original loop is unique and its phi ++ node parameter comes from the same variable. */ ++ ++static bool ++check_exit_bb (class loop *loop) ++{ ++ if (origin_loop.exit_bb1 != origin_loop.exit_bb2 ++ || flow_bb_inside_loop_p (loop, origin_loop.exit_bb1)) ++ return false; ++ ++ gphi_iterator gsi; ++ for (gsi = gsi_start_phis (origin_loop.exit_bb1); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ gphi *phi = gsi.phi (); ++ tree res = gimple_phi_result (phi); ++ if (!SSA_NAME_VAR (res) ++ || SSA_NAME_VAR (res) != SSA_NAME_VAR (origin_loop.base)) ++ continue; ++ if (gimple_phi_num_args (phi) == 2) ++ { ++ tree arg0 = gimple_phi_arg_def (phi, 0); ++ tree arg1 = gimple_phi_arg_def (phi, 1); ++ if (arg0 == arg1) ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Make sure that the recorded origin_loop information meets the ++ relative requirements. */ ++ ++static bool ++check_origin_loop_info (class loop *loop) ++{ ++ dump_origin_loop_info (); ++ tree arr1_elem_size, arr2_elem_size; ++ ++ if (!check_exit_bb (loop)) ++ return false; ++ ++ if (TREE_CODE (origin_loop.base) != SSA_NAME) ++ return false; ++ ++ if (!TYPE_READONLY (TREE_TYPE (origin_loop.limit))) ++ return false; ++ ++ if (!TYPE_READONLY (TREE_TYPE (TREE_TYPE (origin_loop.arr1)))) ++ return false; ++ ++ if (!TYPE_READONLY (TREE_TYPE (TREE_TYPE (origin_loop.arr2)))) ++ return false; ++ ++ if (TREE_CODE (TREE_TYPE (origin_loop.arr1)) != POINTER_TYPE ++ || TREE_CODE (TREE_TYPE (origin_loop.arr2)) != POINTER_TYPE ++ || TREE_CODE (TREE_TYPE (TREE_TYPE (origin_loop.arr1))) != INTEGER_TYPE ++ || TREE_CODE (TREE_TYPE (TREE_TYPE (origin_loop.arr2))) != INTEGER_TYPE) ++ return false; ++ ++ arr1_elem_size = TYPE_SIZE (TREE_TYPE (TREE_TYPE (origin_loop.arr1))); ++ arr2_elem_size = TYPE_SIZE (TREE_TYPE (TREE_TYPE (origin_loop.arr2))); ++ ++ if (tree_to_uhwi (arr1_elem_size) != 8 || tree_to_uhwi (arr2_elem_size) != 8) ++ return false; ++ ++ return true; ++} ++ ++/* Record the useful information of the original loop and judge whether the ++ information meets the specified conditions. 
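++   The checks run in order: record_origin_loop_exit_info,
++   record_origin_loop_header, record_origin_loop_latch,
++   record_origin_loop_body and finally check_origin_loop_info; the first
++   failure is reported in the details dump and rejects the loop.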
*/ ++ ++static bool ++check_record_loop_form (class loop *loop) ++{ ++ if (!record_origin_loop_exit_info (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nFailed to record loop exit information.\n"); ++ } ++ return false; ++ } ++ ++ if (!record_origin_loop_header (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nFailed to record loop header information.\n"); ++ } ++ return false; ++ } ++ ++ if (!record_origin_loop_latch (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nFailed to record loop latch information.\n"); ++ } ++ return false; ++ } ++ ++ if (!record_origin_loop_body (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nFailed to record loop body information.\n"); ++ } ++ return false; ++ } ++ ++ if (!check_origin_loop_info (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nFailed to check origin loop information.\n"); ++ } ++ return false; ++ } ++ ++ return true; ++} ++ ++/* The main entry for judging whether the loop meets some conditions. */ ++ ++static bool ++determine_loop_form (class loop *loop) ++{ ++ /* Currently only standard loops are processed, that is, only loop_header, ++ loop_latch, loop_body 3 bb blocks are included. */ ++ if (loop->inner || loop->num_nodes != 3) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nWrong loop form, there is inner loop or" ++ "redundant bb.\n"); ++ } ++ return false; ++ } ++ ++ if (single_exit (loop) || !loop->latch) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nWrong loop form, only one exit or loop_latch" ++ "does not exist.\n"); ++ } ++ return false; ++ } ++ ++ /* Support loop with only one backedge. */ ++ if (!loop_single_backedge_p (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nWrong loop form, loop back edges are not" ++ "unique.\n"); ++ } ++ return false; ++ } ++ ++ /* Support loop with only one preheader BB. */ ++ if (!loop_single_preheader_bb (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nWrong loop form, loop preheader bb are not" ++ "unique.\n"); ++ } ++ return false; ++ } ++ ++ init_origin_loop_structure (); ++ if (!check_record_loop_form (loop)) ++ return false; ++ ++ return true; ++} ++ ++/* Create prolog bb for newly constructed loop; When prolog_assign exists in ++ the original loop, the corresponding assign needs to be added to prolog_bb; ++ eg: ++ len_16 = len_10 + 1 ++ ++ The IR of bb is as above. 
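++   When the original loop has a prolog assign (the ++len form), the prolog
++   bb materializes that first increment before the align loop is entered;
++   otherwise a plain copy (NOP_EXPR) of the base value is emitted, so the
++   new loops always start from a fresh SSA name.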
*/ ++ ++static void ++create_prolog_bb (basic_block &prolog_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer, edge entry_edge) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ tree lhs1; ++ ++ prolog_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (prolog_bb, outer); ++ redirect_edge_and_branch (entry_edge, prolog_bb); ++ set_immediate_dominator (CDI_DOMINATORS, prolog_bb, dominator_bb); ++ gsi = gsi_last_bb (prolog_bb); ++ lhs1 = copy_ssa_name (origin_loop.base); ++ ++ if (origin_loop.exist_prolog_assgin) ++ g = gimple_build_assign (lhs1, PLUS_EXPR, origin_loop.base, ++ build_int_cst (TREE_TYPE (origin_loop.base), origin_loop.step)); ++ else ++ g = gimple_build_assign (lhs1, NOP_EXPR, origin_loop.base); ++ gimple_seq_add_stmt (&stmts, g); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ set_current_def (origin_loop.base, lhs1); ++ defs_map.put (prolog_bb, lhs1); ++} ++ ++/* Create preheader bb for new loop; In order to ensure the standard form of ++ the loop, add a preheader_bb before loop_header. */ ++ ++static void ++create_loop_pred_bb (basic_block &loop_pred_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ ++ loop_pred_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (loop_pred_bb, outer); ++ set_immediate_dominator (CDI_DOMINATORS, loop_pred_bb, dominator_bb); ++ defs_map.put (loop_pred_bb, get_current_def (origin_loop.base)); ++} ++ ++/* Add phi_arg for bb with phi node. */ ++ ++static void ++rewrite_add_phi_arg (basic_block bb) ++{ ++ edge e; ++ edge_iterator ei; ++ gphi *phi; ++ gphi_iterator gsi; ++ tree res; ++ location_t loc; ++ ++ for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ phi = gsi.phi (); ++ res = gimple_phi_result (phi); ++ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ { ++ if (PHI_ARG_DEF_FROM_EDGE (phi, e)) ++ continue; ++ tree var = *(defs_map.get (e->src)); ++ if (!SSA_NAME_VAR (var) || (SSA_NAME_VAR (var) != SSA_NAME_VAR (res))) ++ continue; ++ if (virtual_operand_p (var)) ++ loc = UNKNOWN_LOCATION; ++ else ++ loc = gimple_location (SSA_NAME_DEF_STMT (var)); ++ add_phi_arg (phi, var, e, loc); ++ } ++ } ++} ++ ++/* Create loop_header BB for align_loop. ++ eg: ++ _18 = (long unsigned int) len_17; ++ _19 = _18 + 8; ++ _20 = (long unsigned int) len_limit_12 (D); ++ if (_19 <= _20) ++ ++ The IR of bb is as above. 
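++   The condition len + 8 <= len_limit guarantees that a full
++   sizeof (uint64_t) chunk is still in range, matching the
++   "len + sizeof (uint64_t) <= len_limit" bound of align_loop in the
++   source-level sketch at the top of this file.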
*/ ++ ++static void ++create_align_loop_header (basic_block &align_loop_header, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gcond *cond_stmt; ++ gphi *phi; ++ tree res; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ align_loop_header = create_empty_bb (after_bb); ++ add_bb_to_loop (align_loop_header, outer); ++ make_single_succ_edge (after_bb, align_loop_header, EDGE_FALLTHRU); ++ set_immediate_dominator (CDI_DOMINATORS, align_loop_header, dominator_bb); ++ gsi = gsi_last_bb (align_loop_header); ++ phi = create_phi_node (NULL_TREE, align_loop_header); ++ create_new_def_for (entry_node, phi, gimple_phi_result_ptr (phi)); ++ res = gimple_phi_result (phi); ++ ++ tree lhs1 = gimple_build (&stmts, NOP_EXPR, long_unsigned_type_node, res); ++ tree lhs2 = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (lhs1), lhs1, ++ build_int_cst (TREE_TYPE (lhs1), 8)); ++ tree lhs3 = gimple_build (&stmts, NOP_EXPR, long_unsigned_type_node, ++ origin_loop.limit); ++ cond_stmt = gimple_build_cond (LE_EXPR, lhs2, lhs3, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ ++ set_current_def (origin_loop.base, res); ++ defs_map.put (align_loop_header, res); ++} ++ ++/* Create loop body BB for align_loop. ++ eg: ++ _21 = (sizetype) len_17; ++ _22 = cur_15 (D) + _21; ++ _23 = MEM[(long unsigned int *)_22]; ++ _24 = pb_13 (D) + _21; ++ _25 = MEM[(long unsigned int *)_24]; ++ if (_23 != _25) ++ ++ The IR of bb is as above. */ ++ ++static void ++create_align_loop_body_bb (basic_block &align_loop_body_bb, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ gcond *cond_stmt; ++ tree lhs1, lhs2; ++ ++ align_loop_body_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (align_loop_body_bb, outer); ++ make_edge (after_bb, align_loop_body_bb, EDGE_TRUE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, align_loop_body_bb, dominator_bb); ++ gsi = gsi_last_bb (align_loop_body_bb); ++ ++ tree var = gimple_build (&stmts, NOP_EXPR, sizetype, ++ get_current_def (origin_loop.base)); ++ lhs1 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr2), ++ origin_loop.arr2, var); ++ g = gimple_build_assign (make_ssa_name (long_unsigned_type_node), ++ fold_build2 (MEM_REF, long_unsigned_type_node, lhs1, ++ build_int_cst (build_pointer_type (long_unsigned_type_node), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs1 = gimple_assign_lhs (g); ++ lhs2 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr1), ++ origin_loop.arr1, var); ++ g = gimple_build_assign (make_ssa_name (long_unsigned_type_node), ++ fold_build2 (MEM_REF, long_unsigned_type_node, lhs2, ++ build_int_cst (build_pointer_type (long_unsigned_type_node), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs2 = gimple_assign_lhs (g); ++ cond_stmt = gimple_build_cond (gimple_cond_code (origin_loop.cond_stmt2), ++ lhs1, lhs2, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++} ++ ++/* Create loop_latch BB for align_loop. ++ eg: ++ len_26 = len_17 + 8; ++ ++ The IR of bb is as above. 
*/ ++ ++static void ++create_align_loop_latch (basic_block &align_loop_latch, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ tree res; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ align_loop_latch = create_empty_bb (after_bb); ++ add_bb_to_loop (align_loop_latch, outer); ++ make_edge (after_bb, align_loop_latch, EDGE_FALSE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, align_loop_latch, dominator_bb); ++ gsi = gsi_last_bb (align_loop_latch); ++ res = copy_ssa_name (entry_node); ++ g = gimple_build_assign (res, PLUS_EXPR, entry_node, ++ build_int_cst (TREE_TYPE (entry_node), 8)); ++ gimple_seq_add_stmt (&stmts, g); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ defs_map.put (align_loop_latch, res); ++} ++ ++/* Create a new loop and add it to outer_loop and return. */ ++ ++static class loop * ++init_new_loop (class loop *outer_loop, basic_block header, basic_block latch) ++{ ++ class loop *new_loop; ++ new_loop = alloc_loop (); ++ new_loop->header = header; ++ new_loop->latch = latch; ++ add_loop (new_loop, outer_loop); ++ ++ return new_loop; ++} ++ ++/* Create necessary exit BB for align_loop. ++ eg: ++ _27 = _23 ^ _25; ++ _28 = __builtin_ctzll (_27); ++ _29 = _28 >> 3; ++ len_30 = _29 + len_17; ++ ++ The IR of bb is as above. */ ++ ++static void ++create_align_loop_exit_bb (basic_block &align_loop_exit_bb, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ gimple *cond_stmt; ++ tree lhs1, lhs2; ++ tree cond_lhs, cond_rhs; ++ gcall *build_ctzll; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ align_loop_exit_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (align_loop_exit_bb, outer); ++ make_edge (after_bb, align_loop_exit_bb, EDGE_TRUE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, align_loop_exit_bb, dominator_bb); ++ gsi = gsi_last_bb (align_loop_exit_bb); ++ ++ cond_stmt = gsi_stmt (gsi_last_bb (after_bb)); ++ cond_lhs = gimple_cond_lhs (cond_stmt); ++ cond_rhs = gimple_cond_rhs (cond_stmt); ++ ++ lhs1 = gimple_build (&stmts, BIT_XOR_EXPR, TREE_TYPE (cond_lhs), cond_lhs, ++ cond_rhs); ++ build_ctzll = gimple_build_call (builtin_decl_explicit (BUILT_IN_CTZLL), 1, ++ lhs1); ++ lhs1 = make_ssa_name (integer_type_node); ++ gimple_call_set_lhs (build_ctzll, lhs1); ++ gimple_seq_add_stmt (&stmts, build_ctzll); ++ lhs2 = copy_ssa_name (lhs1); ++ g = gimple_build_assign (lhs2, RSHIFT_EXPR, lhs1, ++ build_int_cst (TREE_TYPE (lhs1), 3)); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs1 = gimple_build (&stmts, NOP_EXPR, TREE_TYPE (entry_node), lhs2); ++ lhs2 = copy_ssa_name (entry_node); ++ g = gimple_build_assign (lhs2, PLUS_EXPR, lhs1, entry_node); ++ gimple_seq_add_stmt (&stmts, g); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ defs_map.put (align_loop_exit_bb, lhs2); ++} ++ ++/* Create loop_header BB for epilogue_loop. ++ eg: ++ # len_31 = PHI ++ if (len_31 != len_limit_12 (D)) ++ ++ The IR of bb is as above. 
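++   The PHI in this header merges the index that falls out of the align
++   loop (when fewer than 8 bytes remain) with the value incremented by the
++   epilogue latch, and the condition re-applies the original exit test
++   recorded in cond_stmt1.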
*/ ++ ++static void ++create_epilogue_loop_header (basic_block &epilogue_loop_header, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gcond *cond_stmt; ++ tree res; ++ gphi *phi; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ epilogue_loop_header = create_empty_bb (after_bb); ++ add_bb_to_loop (epilogue_loop_header, outer); ++ make_single_succ_edge (after_bb, epilogue_loop_header, EDGE_FALLTHRU); ++ set_immediate_dominator (CDI_DOMINATORS, epilogue_loop_header, dominator_bb); ++ gsi = gsi_last_bb (epilogue_loop_header); ++ phi = create_phi_node (NULL_TREE, epilogue_loop_header); ++ create_new_def_for (entry_node, phi, gimple_phi_result_ptr (phi)); ++ res = gimple_phi_result (phi); ++ cond_stmt = gimple_build_cond (gimple_cond_code (origin_loop.cond_stmt1), res, ++ origin_loop.limit, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ ++ set_current_def (origin_loop.base, res); ++ defs_map.put (epilogue_loop_header, res); ++} ++ ++/* Create loop body BB for epilogue_loop. ++ eg: ++ _32 = (sizetype) len_31; ++ _33 = pb_13 (D) + _32; ++ _34 = *_33; ++ _35 = cur_15 (D) + _32; ++ _36 = *_35; ++ if (_34 != _36) ++ ++ The IR of bb is as above. */ ++ ++static void ++create_epilogue_loop_body_bb (basic_block &epilogue_loop_body_bb, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ gcond *cond_stmt; ++ tree lhs1, lhs2, lhs3; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ epilogue_loop_body_bb = create_empty_bb (after_bb); ++ add_bb_to_loop (epilogue_loop_body_bb, outer); ++ make_edge (after_bb, epilogue_loop_body_bb, EDGE_TRUE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, epilogue_loop_body_bb, dominator_bb); ++ gsi = gsi_last_bb (epilogue_loop_body_bb); ++ lhs1 = gimple_build (&stmts, NOP_EXPR, sizetype, entry_node); ++ lhs2 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr1), ++ origin_loop.arr1, lhs1); ++ g = gimple_build_assign (make_ssa_name (unsigned_char_type_node), ++ fold_build2 (MEM_REF, unsigned_char_type_node, lhs2, ++ build_int_cst (TREE_TYPE (lhs2), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs2 = gimple_assign_lhs (g); ++ lhs3 = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (origin_loop.arr2), ++ origin_loop.arr2, lhs1); ++ g = gimple_build_assign (make_ssa_name (unsigned_char_type_node), ++ fold_build2 (MEM_REF, unsigned_char_type_node, lhs3, ++ build_int_cst (TREE_TYPE (lhs3), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs3 = gimple_assign_lhs (g); ++ cond_stmt = gimple_build_cond (gimple_cond_code (origin_loop.cond_stmt2), lhs2, ++ lhs3, NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ defs_map.put (epilogue_loop_body_bb, get_current_def (origin_loop.base)); ++} ++ ++/* Create loop_latch BB for epilogue_loop. ++ eg: ++ len_37 = len_31 + 1; ++ ++ The IR of bb is as above. 
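++   The increment uses origin_loop.step (always 1 here, see
++   check_update_stmt), so the epilogue walks the remaining elements one by
++   one exactly as the original loop did.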
*/ ++ ++static void ++create_epilogue_loop_latch (basic_block &epilogue_loop_latch, ++ basic_block after_bb, basic_block dominator_bb, ++ class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple *g; ++ tree res; ++ ++ tree entry_node = get_current_def (origin_loop.base); ++ epilogue_loop_latch = create_empty_bb (after_bb); ++ add_bb_to_loop (epilogue_loop_latch, outer); ++ make_edge (after_bb, epilogue_loop_latch, EDGE_FALSE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, epilogue_loop_latch, dominator_bb); ++ gsi = gsi_last_bb (epilogue_loop_latch); ++ res = copy_ssa_name (entry_node); ++ g = gimple_build_assign (res, PLUS_EXPR, entry_node, ++ build_int_cst (TREE_TYPE (entry_node), origin_loop.step)); ++ gimple_seq_add_stmt (&stmts, g); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ defs_map.put (epilogue_loop_latch, res); ++} ++ ++/* convert_to_new_loop ++ | | ++ | | ++ | | entry_edge ++ | ______ | ++ | / V V ++ | | -----origin_loop_header--- ++ | | | | ++ | | -------------------------\ ++ | | | \ ++ | | V \___ ___ ___ ___ ___ ___ ___ ++ | | -----origin_loop_body----- | ++ | | | | | ++ | | -------------------------\ | ++ | | | \___ ___ ___ ___ | ++ | | V V V ++ | | -----origin_loop_latch---- -----exit_bb------ ++ | | | | | | ++ | | /-------------------------- ------------------ ++ | \ __ / ++ | ++ | | ++ | ====> |entry_edge ++ | V ++ | -------prolog_bb----- ++ | | | ++ | --------------------- ++ | | ++ | V ++ | -----align_loop_header---- ++ | /-----------------> | | ++ |/ -------------------------- ++ || / \ ++ || V V ++ || ---align_loop_body--- ---epilogue_loop_header-- ++ || | | -------| |<---| ++ || --------------------\ / ------------------------- | ++ || | \____ | | | ++ || V | | V | ++ || ---align_loop_latch--- | | ---epilogue_loop_body---- | ++ || | | | | ----| | | ++ || ---------------------- | | / ------------------------- | ++ || / __________/ | | | | ++ || / | | | V | ++ | \ __________/ | | | ---epilogue_loop_latch--- | ++ | | | | | | | ++ | | | | ------------------------- / ++ | V | | | / ++ | -align_loop_exit_bb- | | \______________/ ++ | | | | | ++ | -------------------- | | ++ | | | | ++ | | V V ++ | | -----exit_bb------ ++ | |---->| | ++ | ------------------ ++ ++ The origin_loop conversion process starts from entry_edge and ends at ++ exit_bb; The execution logic of origin_loop is completely replaced by ++ align_loop + epilogue_loop: ++ 1) align_loop mainly implements the idea of ​​using wide-type dereference ++ and comparison on array elements, so as to achieve the effect of ++ acceleration; For the corresponding source code understanding, please ++ refer to the description of the pass at the beginning; ++ 2) epilogue_loop processes the previous loop remaining array element ++ comparison. 
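++   As an illustration of 1) (byte values made up, little-endian byte order
++   assumed): if the two 8-byte words loaded in align_loop_body first
++   differ at byte 3, then a ^ b has its lowest set bit in bits 24..31,
++   __builtin_ctzll (a ^ b) lies in [24, 31], and the shift by 3 (division
++   by 8) yields 3, so align_loop_exit_bb advances len by exactly the
++   offset of the first mismatching byte before returning.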
*/ ++ ++static void ++create_new_loops (edge entry_edge) ++{ ++ basic_block prolog_bb; ++ basic_block align_loop_header, align_loop_latch, align_loop_body_bb; ++ basic_block align_pred_bb, align_loop_exit_bb; ++ basic_block epilogue_loop_header, epilogue_loop_latch, epilogue_loop_body_bb; ++ basic_block epilogue_loop_pred_bb; ++ class loop *align_loop; ++ class loop *epilogue_loop; ++ ++ class loop *outer = entry_edge->src->loop_father; ++ ++ create_prolog_bb (prolog_bb, entry_edge->src, entry_edge->src, outer, ++ entry_edge); ++ ++ create_loop_pred_bb (align_pred_bb, prolog_bb, prolog_bb, outer); ++ make_single_succ_edge (prolog_bb, align_pred_bb, EDGE_FALLTHRU); ++ ++ create_align_loop_header (align_loop_header, align_pred_bb, ++ align_pred_bb, outer); ++ ++ create_align_loop_body_bb (align_loop_body_bb, align_loop_header, ++ align_loop_header, outer); ++ ++ create_align_loop_latch (align_loop_latch, align_loop_body_bb, ++ align_loop_body_bb, outer); ++ make_edge (align_loop_latch, align_loop_header, EDGE_FALLTHRU); ++ rewrite_add_phi_arg (align_loop_header); ++ ++ align_loop = init_new_loop (outer, align_loop_header, align_loop_latch); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nPrint byte align loop %d:\n", align_loop->num); ++ flow_loop_dump (align_loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } ++ ++ create_align_loop_exit_bb (align_loop_exit_bb, align_loop_body_bb, ++ align_loop_body_bb, outer); ++ ++ create_loop_pred_bb (epilogue_loop_pred_bb, align_loop_header, ++ align_loop_header, outer); ++ make_edge (align_loop_header, epilogue_loop_pred_bb, EDGE_FALSE_VALUE); ++ ++ create_epilogue_loop_header (epilogue_loop_header, epilogue_loop_pred_bb, ++ epilogue_loop_pred_bb, outer); ++ ++ create_epilogue_loop_body_bb (epilogue_loop_body_bb, epilogue_loop_header, ++ epilogue_loop_header, outer); ++ ++ create_epilogue_loop_latch (epilogue_loop_latch, epilogue_loop_body_bb, ++ epilogue_loop_body_bb, outer); ++ make_single_succ_edge (epilogue_loop_latch, epilogue_loop_header, ++ EDGE_FALLTHRU); ++ rewrite_add_phi_arg (epilogue_loop_header); ++ ++ epilogue_loop = init_new_loop (outer, epilogue_loop_header, ++ epilogue_loop_latch); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nPrint epilogue loop %d:\n", epilogue_loop->num); ++ flow_loop_dump (epilogue_loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } ++ make_single_succ_edge (align_loop_exit_bb, origin_loop.exit_bb1, ++ EDGE_FALLTHRU); ++ set_immediate_dominator (CDI_DOMINATORS, origin_loop.exit_bb1, ++ entry_edge->src); ++ make_edge (epilogue_loop_body_bb, origin_loop.exit_bb1, EDGE_TRUE_VALUE); ++ ++ make_edge (epilogue_loop_header, origin_loop.exit_bb2, EDGE_FALSE_VALUE); ++ set_immediate_dominator (CDI_DOMINATORS, origin_loop.exit_bb2, ++ entry_edge->src); ++ ++ rewrite_add_phi_arg (origin_loop.exit_bb1); ++ rewrite_add_phi_arg (origin_loop.exit_bb2); ++ ++ remove_edge (origin_loop.exit_e1); ++ remove_edge (origin_loop.exit_e2); ++} ++ ++/* Make sure that the dominance relationship of the newly inserted cfg ++ is not missing. 
*/ ++ ++static void ++update_loop_dominator (cdi_direction dir) ++{ ++ gcc_assert (dom_info_available_p (dir)); ++ ++ basic_block bb; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ basic_block imm_bb = get_immediate_dominator (dir, bb); ++ if (!imm_bb || bb == origin_loop.exit_bb1) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, bb, ++ recompute_dominator (CDI_DOMINATORS, bb)); ++ continue; ++ } ++ } ++} ++ ++/* Clear information about the original loop. */ ++ ++static void ++remove_origin_loop (class loop *loop) ++{ ++ basic_block *body; ++ ++ body = get_loop_body_in_dom_order (loop); ++ unsigned n = loop->num_nodes; ++ for (unsigned i = 0; i < n; i++) ++ { ++ delete_basic_block (body[i]); ++ } ++ free (body); ++ delete_loop (loop); ++} ++ ++/* Perform the conversion of origin_loop to new_loop. */ ++ ++static void ++convert_to_new_loop (class loop *loop) ++{ ++ create_new_loops (origin_loop.entry_edge); ++ remove_origin_loop (loop); ++ update_loop_dominator (CDI_DOMINATORS); ++ update_ssa (TODO_update_ssa); ++} ++ ++/* The main entry of array-widen-compare optimizes. */ ++ ++static unsigned int ++tree_ssa_array_widen_compare () ++{ ++ unsigned int todo = 0; ++ class loop *loop; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ flow_loops_dump (dump_file, NULL, 1); ++ fprintf (dump_file, "\nConfirm which loop can be optimized using" ++ " array-widen-compare\n"); ++ } ++ ++ enum li_flags LI = LI_FROM_INNERMOST; ++ FOR_EACH_LOOP (loop, LI) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "======================================\n"); ++ fprintf (dump_file, "Processing loop %d:\n", loop->num); ++ fprintf (dump_file, "======================================\n"); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } ++ ++ if (determine_loop_form (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The %dth loop form is success matched," ++ "and the loop can be optimized.\n", ++ loop->num); ++ dump_loop_bb (loop); ++ } ++ ++ convert_to_new_loop (loop); ++ } ++ } ++ ++ todo |= (TODO_update_ssa); ++ return todo; ++} ++ ++/* Array widen compare. */ ++ ++namespace { ++ ++const pass_data pass_data_tree_array_widen_compare = ++{ ++ GIMPLE_PASS, ++ "awiden_compare", ++ OPTGROUP_LOOP, ++ TV_TREE_ARRAY_WIDEN_COMPARE, ++ (PROP_cfg | PROP_ssa), ++ 0, ++ 0, ++ 0, ++ (TODO_update_ssa | TODO_verify_all) ++}; ++ ++class pass_array_widen_compare : public gimple_opt_pass ++{ ++public: ++ pass_array_widen_compare (gcc::context *ctxt) ++ : gimple_opt_pass (pass_data_tree_array_widen_compare, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *); ++ virtual unsigned int execute (function *); ++ ++}; // class pass_array_widen_compare ++ ++bool ++pass_array_widen_compare::gate (function *) ++{ ++ return (flag_array_widen_compare > 0 && optimize >= 3); ++} ++ ++unsigned int ++pass_array_widen_compare::execute (function *fun) ++{ ++ if (number_of_loops (fun) <= 1) ++ return 0; ++ ++ /* Only supports LP64 data mode. 
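++     That is, long and pointers must be 64 bits wide and int 32 bits wide,
++     which is exactly what the precision checks below verify; other data
++     models are rejected up front.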
*/ ++ if (TYPE_PRECISION (long_integer_type_node) != 64 ++ || POINTER_SIZE != 64 || TYPE_PRECISION (integer_type_node) != 32) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "The current data mode is not supported," ++ "only the LP64 date mode is supported.\n"); ++ return 0; ++ } ++ ++ return tree_ssa_array_widen_compare (); ++} ++ ++} // anon namespace ++ ++gimple_opt_pass * ++make_pass_array_widen_compare (gcc::context *ctxt) ++{ ++ return new pass_array_widen_compare (ctxt); ++} +\ No newline at end of file +-- +2.27.0 + diff --git a/0049-PointerCompression-Structure-pointer-compression.patch b/0049-PointerCompression-Structure-pointer-compression.patch new file mode 100644 index 0000000000000000000000000000000000000000..d1eca963f89566d2bb0417baea53517925df2701 --- /dev/null +++ b/0049-PointerCompression-Structure-pointer-compression.patch @@ -0,0 +1,1884 @@ +From 9abcd6d117c3727dac30e72fc3ae374b3eb80c64 Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 28 Jun 2022 19:20:43 +0800 +Subject: [PATCH] [PointerCompression] Structure pointer compression + +Support compression of structure pointer saved in structure to reduce +the size of structure. +Add flag -fipa-struct-reorg=4 to enable pointer compression. +Add parameter param-pointer-compression-size=[1-3] to set the compressed +pointer size. + +diff --git a/gcc/common.opt b/gcc/common.opt +index b5ea3c7a1..6278ebb39 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1884,8 +1884,8 @@ Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 3) +--fipa-struct-reorg=[0,1,2,3] adding none, struct-reorg, reorder-fields, dfe optimizations. ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 4) ++-fipa-struct-reorg=[0,1,2,3,4] adding none, struct-reorg, reorder-fields, dfe, pointer-compression optimizations. + + fipa-extend-auto-profile + Common Report Var(flag_ipa_extend_auto_profile) +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 00dc4bf1d..2b799339a 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -83,6 +83,7 @@ along with GCC; see the file COPYING3. If not see + #include "gimple-iterator.h" + #include "gimple-walk.h" + #include "cfg.h" ++#include "cfghooks.h" + #include "ssa.h" + #include "tree-dfa.h" + #include "fold-const.h" +@@ -109,6 +110,30 @@ namespace { + using namespace struct_reorg; + using namespace struct_relayout; + ++static void ++set_var_attributes (tree var) ++{ ++ if (!var) ++ { ++ return; ++ } ++ gcc_assert (TREE_CODE (var) == VAR_DECL); ++ ++ DECL_ARTIFICIAL (var) = 1; ++ DECL_EXTERNAL (var) = 0; ++ TREE_STATIC (var) = 1; ++ TREE_PUBLIC (var) = 0; ++ TREE_USED (var) = 1; ++ DECL_CONTEXT (var) = NULL; ++ TREE_THIS_VOLATILE (var) = 0; ++ TREE_ADDRESSABLE (var) = 0; ++ TREE_READONLY (var) = 0; ++ if (is_global_var (var)) ++ { ++ set_decl_tls_model (var, TLS_MODEL_NONE); ++ } ++} ++ + /* Return true iff TYPE is stdarg va_list type. */ + + static inline bool +@@ -247,9 +272,21 @@ enum struct_layout_opt_level + NONE = 0, + STRUCT_REORG, + STRUCT_REORDER_FIELDS, +- DEAD_FIELD_ELIMINATION ++ DEAD_FIELD_ELIMINATION, ++ POINTER_COMPRESSION + }; + ++/* Defines the target pointer size for pointer compression, which should be 8, ++ 16, 32. 
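++   The default below is 32, i.e. a compressed field stores
++   (ptr - pchead) / sizeof (struct) in an unsigned 32-bit slot instead of
++   a 64-bit pointer, and the all-ones value of the chosen width is
++   reserved to represent a null pointer (see
++   POINTER_COMPRESSION_NULL_VAL).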
*/ ++ ++int POINTER_COMPRESSION_SIZE = 32; ++ ++/* Defines null pointers of different compressed sizes. It is the maximum ++ number of unsigned integers of each size, which is 0xff, 0xffff, ++ 0xffffffff. */ ++ ++unsigned long long POINTER_COMPRESSION_NULL_VAL = 0xffffffff; ++ + static bool is_result_of_mult (tree arg, tree *num, tree struct_size); + bool isptrptr (tree type); + void get_base (tree &base, tree expr); +@@ -371,7 +408,9 @@ srtype::srtype (tree type) + : type (type), + chain_type (false), + escapes (does_not_escape), ++ pc_gptr (NULL), + visited (false), ++ pc_candidate (false), + has_alloc_array (0) + { + for (int i = 0; i < max_split; i++) +@@ -452,6 +491,33 @@ srtype::mark_escape (escape_type e, gimple *stmt) + } + } + ++/* Create a global header for pointer compression. */ ++ ++void ++srtype::create_global_ptr_for_pointer_compression () ++{ ++ if (!pc_candidate || pc_gptr != NULL) ++ { ++ return; ++ } ++ const char *type_name = get_type_name (type); ++ gcc_assert (type_name != NULL); ++ ++ char *gptr_name = concat (type_name, "_pchead", NULL); ++ tree new_name = get_identifier (gptr_name); ++ tree new_type = build_pointer_type (newtype[0]); ++ tree new_var = build_decl (UNKNOWN_LOCATION, VAR_DECL, new_name, new_type); ++ set_var_attributes (new_var); ++ pc_gptr = new_var; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nType: %s has create global header for pointer" ++ " compression: %s\n", type_name, gptr_name); ++ } ++ free (gptr_name); ++} ++ + /* Add FIELD to the list of fields that use this type. */ + + void +@@ -794,21 +860,39 @@ srfield::create_new_optimized_fields (tree newtype[max_split], + = fields_to_finish.get_or_insert (inner_type (type->type)); + fields.safe_push (field); + } +- +- DECL_NAME (field) = DECL_NAME (fielddecl); ++ if (type != NULL && type->pc_candidate) ++ { ++ const char *old_name = IDENTIFIER_POINTER (DECL_NAME (fielddecl)); ++ char *new_name = concat (old_name, "_ptrcmps", NULL); ++ DECL_NAME (field) = get_identifier (new_name); ++ free (new_name); ++ } ++ else ++ { ++ DECL_NAME (field) = DECL_NAME (fielddecl); ++ } + if (type == NULL) + { + /* Common members do not need to reconstruct. + Otherwise, int* -> int** or void* -> void**. 
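+	 Fields whose type is a pointer-compression candidate are handled in
+	 the else branch instead: they get the "_ptrcmps" name suffix and an
+	 unsigned type of POINTER_COMPRESSION_SIZE bits in place of the
+	 pointer.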
*/ + TREE_TYPE (field) = nt; ++ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); + } + else + { +- TREE_TYPE (field) ++ if (type->pc_candidate) ++ { ++ TREE_TYPE (field) = make_unsigned_type (POINTER_COMPRESSION_SIZE); ++ SET_DECL_ALIGN (field, POINTER_COMPRESSION_SIZE); ++ } ++ else ++ { ++ TREE_TYPE (field) + = reconstruct_complex_type (TREE_TYPE (fielddecl), nt); ++ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); ++ } + } + DECL_SOURCE_LOCATION (field) = DECL_SOURCE_LOCATION (fielddecl); +- SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); + DECL_USER_ALIGN (field) = DECL_USER_ALIGN (fielddecl); + TREE_ADDRESSABLE (field) = TREE_ADDRESSABLE (fielddecl); + DECL_NONADDRESSABLE_P (field) = !TREE_ADDRESSABLE (fielddecl); +@@ -931,6 +1015,12 @@ srtype::create_new_type (void) + && has_dead_field ()) + fprintf (dump_file, "Dead field elimination.\n"); + } ++ ++ if (pc_candidate && pc_gptr == NULL) ++ { ++ create_global_ptr_for_pointer_compression (); ++ } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Created %d types:\n", maxclusters); +@@ -1350,6 +1440,34 @@ public: + void maybe_mark_or_record_other_side (tree side, tree other, gimple *stmt); + unsigned execute_struct_relayout (void); + bool remove_dead_field_stmt (tree lhs); ++ ++ // Pointer compression methods: ++ void check_and_prune_struct_for_pointer_compression (void); ++ void try_rewrite_with_pointer_compression (gassign *, gimple_stmt_iterator *, ++ tree, tree, tree &, tree &); ++ bool is_safe_void_cmp (tree, srtype *); ++ bool is_compression_candidate_type (tree); ++ bool is_compression_candidate (tree); ++ bool is_type_conversion_candidate_for_pc (tree); ++ bool is_direct_rewrite_chance_for_compression (tree, tree &); ++ bool is_simplify_chance_for_compress_candidate (gassign *, tree); ++ bool compress_candidate_without_check (gimple_stmt_iterator *, tree, tree &); ++ bool compress_candidate_with_check (gimple_stmt_iterator *, tree, tree &); ++ bool compress_candidate (gassign *, gimple_stmt_iterator *, tree, tree &); ++ bool decompress_candidate_without_check (gimple_stmt_iterator *, ++ tree, tree, tree &, tree &); ++ bool decompress_candidate_with_check (gimple_stmt_iterator *, tree, tree &); ++ bool decompress_candidate (gimple_stmt_iterator *, tree, tree, tree &, ++ tree &); ++ srtype *get_compression_candidate_type (tree); ++ tree compress_ptr_to_offset (tree, srtype *, gimple_stmt_iterator *); ++ tree decompress_offset_to_ptr (tree, srtype *, gimple_stmt_iterator *); ++ basic_block create_bb_for_compress_candidate (basic_block, tree, srtype *, ++ tree &); ++ basic_block create_bb_for_decompress_candidate (basic_block, tree, srtype *, ++ tree &); ++ basic_block create_bb_for_compress_nullptr (basic_block, tree &); ++ basic_block create_bb_for_decompress_nullptr (basic_block, tree, tree &); + }; + + struct ipa_struct_relayout +@@ -1400,30 +1518,6 @@ namespace { + + /* Methods for ipa_struct_relayout. 
*/ + +-static void +-set_var_attributes (tree var) +-{ +- if (!var) +- { +- return; +- } +- gcc_assert (TREE_CODE (var) == VAR_DECL); +- +- DECL_ARTIFICIAL (var) = 1; +- DECL_EXTERNAL (var) = 0; +- TREE_STATIC (var) = 1; +- TREE_PUBLIC (var) = 0; +- TREE_USED (var) = 1; +- DECL_CONTEXT (var) = NULL; +- TREE_THIS_VOLATILE (var) = 0; +- TREE_ADDRESSABLE (var) = 0; +- TREE_READONLY (var) = 0; +- if (is_global_var (var)) +- { +- set_decl_tls_model (var, TLS_MODEL_NONE); +- } +-} +- + tree + ipa_struct_relayout::create_new_vars (tree type, const char *name) + { +@@ -3146,6 +3240,21 @@ ipa_struct_reorg::find_vars (gimple *stmt) + records the right value _1 declaration. */ + find_var (gimple_assign_rhs1 (stmt), stmt); + ++ /* Handle _1->t = (struct *) 0x123456; ++ Pointer types from non-zero pointer need to be escaped in pointer ++ compression and complete relayout. */ ++ if (((current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ || current_mode == COMPLETE_STRUCT_RELAYOUT) ++ && TREE_CODE (lhs) == COMPONENT_REF ++ && TREE_CODE (TREE_TYPE (lhs)) == POINTER_TYPE ++ && TREE_CODE (rhs) == INTEGER_CST ++ && !integer_zerop (rhs)) ++ { ++ mark_type_as_escape (inner_type (TREE_TYPE (lhs)), ++ escape_cast_int, stmt); ++ } ++ + /* Add a safe func mechanism. */ + bool l_find = true; + bool r_find = true; +@@ -3730,15 +3839,20 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple + } + } + /* x_1 = y.x_nodes; void *x; +- Directly mark the structure pointer type assigned +- to the void* variable as escape. */ ++ Mark the structure pointer type assigned ++ to the void* variable as escape. Unless the void* is only used to compare ++ with variables of the same type. */ + else if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && TREE_CODE (side) == SSA_NAME + && VOID_POINTER_P (TREE_TYPE (side)) + && SSA_NAME_VAR (side) + && VOID_POINTER_P (TREE_TYPE (SSA_NAME_VAR (side)))) + { +- mark_type_as_escape (TREE_TYPE (other), escape_cast_void, stmt); ++ if (struct_layout_optimize_level < POINTER_COMPRESSION ++ || !is_safe_void_cmp (side, type)) ++ { ++ mark_type_as_escape (TREE_TYPE (other), escape_cast_void, stmt); ++ } + } + + check_ptr_layers (side, other, stmt); +@@ -4361,7 +4475,9 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, + void + ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type) + { +- if (current_mode == COMPLETE_STRUCT_RELAYOUT ++ if ((current_mode == COMPLETE_STRUCT_RELAYOUT ++ || (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION)) + && handled_allocation_stmt (stmt)) + { + tree arg0 = gimple_call_arg (stmt, 0); +@@ -4529,7 +4645,11 @@ ipa_struct_reorg::check_definition (srdecl *decl, vec &worklist) + if (current_mode == STRUCT_LAYOUT_OPTIMIZE && SSA_NAME_VAR (ssa_name) + && VOID_POINTER_P (TREE_TYPE (SSA_NAME_VAR (ssa_name)))) + { +- type->mark_escape (escape_cast_void, SSA_NAME_DEF_STMT (ssa_name)); ++ if (struct_layout_optimize_level < POINTER_COMPRESSION ++ || !is_safe_void_cmp (ssa_name, type)) ++ { ++ type->mark_escape (escape_cast_void, SSA_NAME_DEF_STMT (ssa_name)); ++ } + } + gimple *stmt = SSA_NAME_DEF_STMT (ssa_name); + +@@ -5521,6 +5641,8 @@ ipa_struct_reorg::create_new_types (void) + for (unsigned i = 0; i < types.length (); i++) + newtypes += types[i]->create_new_type (); + ++ /* Some new types may not have been created at create_new_type (), so ++ recreate new type for all struct fields. 
*/ + if (current_mode == STRUCT_LAYOUT_OPTIMIZE) + { + for (unsigned i = 0; i < types.length (); i++) +@@ -5531,9 +5653,18 @@ ipa_struct_reorg::create_new_types (void) + for (unsigned j = 0; j < fields->length (); j++) + { + tree field = (*fields)[j]; +- TREE_TYPE (field) +- = reconstruct_complex_type (TREE_TYPE (field), +- types[i]->newtype[0]); ++ if (types[i]->pc_candidate) ++ { ++ TREE_TYPE (field) ++ = make_unsigned_type (POINTER_COMPRESSION_SIZE); ++ SET_DECL_ALIGN (field, POINTER_COMPRESSION_SIZE); ++ } ++ else ++ { ++ TREE_TYPE (field) ++ = reconstruct_complex_type (TREE_TYPE (field), ++ types[i]->newtype[0]); ++ } + } + } + } +@@ -5904,6 +6035,703 @@ ipa_struct_reorg::rewrite_expr (tree expr, tree newexpr[max_split], bool ignore_ + return true; + } + ++/* Emit a series of gimples to compress the pointer to the index relative to ++ the global header. The basic blocks where gsi is located must have at least ++ one stmt. */ ++ ++tree ++ipa_struct_reorg::compress_ptr_to_offset (tree xhs, srtype *type, ++ gimple_stmt_iterator *gsi) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCompress candidate pointer:\n"); ++ print_generic_expr (dump_file, xhs); ++ fprintf (dump_file, "\nto offset:\n"); ++ } ++ ++ /* Emit gimple _X1 = ptr - gptr. */ ++ tree pointer_addr = fold_convert (long_unsigned_type_node, xhs); ++ tree gptr_addr = fold_convert (long_unsigned_type_node, type->pc_gptr); ++ tree step1 = gimplify_build2 (gsi, MINUS_EXPR, long_unsigned_type_node, ++ pointer_addr, gptr_addr); ++ ++ /* Emit gimple _X2 = _X1 / sizeof (struct). */ ++ tree step2 = gimplify_build2 (gsi, TRUNC_DIV_EXPR, long_unsigned_type_node, ++ step1, TYPE_SIZE_UNIT (type->newtype[0])); ++ ++ /* Emit _X3 = (TARGET_SIZE) _X2. */ ++ tree step3 = gimplify_build1 (gsi, NOP_EXPR, ++ make_unsigned_type (POINTER_COMPRESSION_SIZE), ++ step2); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, step3); ++ fprintf (dump_file, "\n"); ++ } ++ return step3; ++} ++ ++/* Emit a series of gimples to decompress the index into the original ++ pointer. The basic blocks where gsi is located must have at least ++ one stmt. */ ++ ++tree ++ipa_struct_reorg::decompress_offset_to_ptr (tree xhs, srtype *type, ++ gimple_stmt_iterator *gsi) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nDecompress candidate offset:\n"); ++ print_generic_expr (dump_file, xhs); ++ fprintf (dump_file, "\nto pointer:\n"); ++ } ++ ++ /* Emit _X1 = offset * sizeof (struct). */ ++ tree offset = fold_convert (long_unsigned_type_node, xhs); ++ tree step1 = gimplify_build2 (gsi, MULT_EXPR, long_unsigned_type_node, ++ offset, TYPE_SIZE_UNIT (type->newtype[0])); ++ ++ /* Emit _X2 = phead + _X1. */ ++ tree gptr_addr = fold_convert (long_unsigned_type_node, type->pc_gptr); ++ tree step2 = gimplify_build2 (gsi, PLUS_EXPR, long_unsigned_type_node, ++ gptr_addr, step1); ++ ++ /* Emit _X3 = (struct *) _X2. */ ++ tree step3 = gimplify_build1 (gsi, NOP_EXPR, TREE_TYPE (type->pc_gptr), ++ step2); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, step3); ++ fprintf (dump_file, "\n"); ++ } ++ return step3; ++} ++ ++/* Return the compression candidate srtype of SSA_NAME or COMPONENT_REF. 
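++   For example (illustrative names), for p_1->next or for an SSA_NAME p_1
++   of type struct node *, the srtype recorded for struct node is returned
++   when that type was marked as a pc_candidate; otherwise NULL is
++   returned.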
*/ ++ ++srtype * ++ipa_struct_reorg::get_compression_candidate_type (tree xhs) ++{ ++ if (xhs == NULL) ++ { ++ return NULL; ++ } ++ if (TREE_CODE (xhs) == SSA_NAME || TREE_CODE (xhs) == COMPONENT_REF) ++ { ++ srtype *access_type = find_type (inner_type (TREE_TYPE (xhs))); ++ if (access_type != NULL && access_type->pc_candidate) ++ { ++ return access_type; ++ } ++ } ++ return NULL; ++} ++ ++/* True if the input type is the candidate type for pointer compression. */ ++ ++bool ++ipa_struct_reorg::is_compression_candidate_type (tree type) ++{ ++ if (type == NULL) ++ { ++ return false; ++ } ++ if (TREE_CODE (type) == POINTER_TYPE) ++ { ++ if (TREE_CODE (TREE_TYPE (type)) == RECORD_TYPE) ++ { ++ srtype *access_type = find_type (TREE_TYPE (type)); ++ if (access_type != NULL && access_type->pc_candidate) ++ { ++ return true; ++ } ++ } ++ } ++ return false; ++} ++ ++/* True if the input xhs is a candidate for pointer compression. */ ++ ++bool ++ipa_struct_reorg::is_compression_candidate (tree xhs) ++{ ++ if (xhs == NULL) ++ { ++ return false; ++ } ++ if (TREE_CODE (xhs) == COMPONENT_REF) ++ { ++ srtype *base_type = find_type (TREE_TYPE (TREE_OPERAND (xhs, 0))); ++ if (base_type == NULL || base_type->has_escaped ()) ++ { ++ return false; ++ } ++ return is_compression_candidate_type (TREE_TYPE (xhs)); ++ } ++ return false; ++} ++ ++/* True if xhs is a component_ref that base has escaped but uses a compression ++ candidate type. */ ++ ++bool ++ipa_struct_reorg::is_type_conversion_candidate_for_pc (tree xhs) ++{ ++ if (xhs == NULL) ++ { ++ return false; ++ } ++ if (TREE_CODE (xhs) == COMPONENT_REF) ++ { ++ srtype *base_type = find_type (TREE_TYPE (TREE_OPERAND (xhs, 0))); ++ if (base_type != NULL && base_type->has_escaped ()) ++ { ++ return is_compression_candidate_type (TREE_TYPE (xhs)); ++ } ++ } ++ return false; ++} ++ ++/* Create a new basic block that uses magic numbers to represent compressed ++ null pointers. */ ++ ++basic_block ++ipa_struct_reorg::create_bb_for_compress_nullptr (basic_block last_bb, ++ tree &phi) ++{ ++ basic_block new_bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (new_bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ ++ /* Emit _X->t_ptrcmps = NULL_MAX_VALUE. */ ++ gimple_stmt_iterator gsi = gsi_last_bb (new_bb); ++ phi = make_ssa_name (make_unsigned_type (POINTER_COMPRESSION_SIZE)); ++ tree rhs = build_int_cst (make_unsigned_type (POINTER_COMPRESSION_SIZE), ++ POINTER_COMPRESSION_NULL_VAL); ++ gimple *new_stmt = gimple_build_assign (phi, rhs); ++ gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCreate bb %d for compress nullptr:\n", ++ new_bb->index); ++ gimple_dump_bb (dump_file, new_bb, 0, dump_flags); ++ } ++ return new_bb; ++} ++ ++/* Create a new basic block to compress the pointer to the index relative to ++ the allocated memory pool header. */ ++ ++basic_block ++ipa_struct_reorg::create_bb_for_compress_candidate (basic_block last_bb, ++ tree new_rhs, srtype *type, ++ tree &phi) ++{ ++ basic_block new_bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (new_bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ ++ gimple_stmt_iterator gsi = gsi_last_bb (new_bb); ++ /* compress_ptr_to_offset () needs at least one stmt in target bb. 
*/ ++ gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT); ++ phi = compress_ptr_to_offset (new_rhs, type, &gsi); ++ /* Remove the NOP created above. */ ++ gsi_remove (&gsi, true); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCreate bb %d for compress candidate:\n", ++ new_bb->index); ++ gimple_dump_bb (dump_file, new_bb, 0, dump_flags); ++ } ++ return new_bb; ++} ++ ++/* Compression can be simplified by these following cases: ++ 1. if rhs is NULL, use a magic fake zero represent it. ++ 2. if new_rhs has been converted into INTEGER_TYPE in the previous stmt, ++ just use it here. For example: ++ _1 = t->s ++ -> tt->s = _1. */ ++ ++bool ++ipa_struct_reorg::is_direct_rewrite_chance_for_compression (tree rhs, ++ tree &new_rhs) ++{ ++ if (integer_zerop (rhs)) ++ { ++ new_rhs ++ = build_int_cst (make_unsigned_type (POINTER_COMPRESSION_SIZE), ++ POINTER_COMPRESSION_NULL_VAL); ++ return true; ++ } ++ else if (new_rhs && TREE_CODE (TREE_TYPE (new_rhs)) == INTEGER_TYPE) ++ { ++ return true; ++ } ++ return false; ++} ++ ++/* The following cases can simplify the checking of null pointer: ++ 1. rhs used as COMPONENT_REF in this basic block. ++ 2. rhs defined from POINTER_PLUS_EXPR. */ ++ ++bool ++ipa_struct_reorg::is_simplify_chance_for_compress_candidate (gassign *stmt, ++ tree rhs) ++{ ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (rhs); ++ ++ if (def_stmt && is_gimple_assign (def_stmt) ++ && gimple_assign_rhs_code (def_stmt) == POINTER_PLUS_EXPR) ++ { ++ return true; ++ } ++ ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, rhs) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (use_stmt->bb != stmt->bb || !is_gimple_assign (use_stmt)) ++ { ++ continue; ++ } ++ tree use_rhs = gimple_assign_rhs1 (use_stmt); ++ if (TREE_CODE (use_rhs) == COMPONENT_REF ++ && TREE_OPERAND (TREE_OPERAND (use_rhs, 0), 0) == rhs) ++ { ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Perform compression directly without checking null pointer. */ ++ ++bool ++ipa_struct_reorg::compress_candidate_without_check (gimple_stmt_iterator *gsi, ++ tree rhs, ++ tree &new_rhs) ++{ ++ srtype *type = get_compression_candidate_type (rhs); ++ gcc_assert (type != NULL); ++ new_rhs = compress_ptr_to_offset (new_rhs, type, gsi); ++ return true; ++} ++ ++/* Perform pointer compression with check. The conversion will be as shown in ++ the following example: ++ Orig bb: ++ bb <1>: ++ _1->t = _2 ++ ++ will be transformed to: ++ bb <1>: ++ _3 = _2 ++ if (_2 == NULL) ++ goto bb <2> ++ else ++ goto bb <3> ++ ++ bb <2>: ++ _3 = NULL_MAX_VALUE ++ goto bb <4> ++ ++ bb <3>: ++ ... ++ _4 = compress (_2) ++ goto bb <4> ++ ++ bb <4>: ++ _5 = PHI (_3, _4) ++ _1->t = _5 ++ The gsi will move to the beginning of split dst bb <4>, _1->t = _5 will be ++ emitted by rewrite_assign (). */ ++ ++bool ++ipa_struct_reorg::compress_candidate_with_check (gimple_stmt_iterator *gsi, ++ tree rhs, tree &new_rhs) ++{ ++ tree cond_lhs = make_ssa_name (TREE_TYPE (new_rhs)); ++ gimple *assign_stmt = gimple_build_assign (cond_lhs, new_rhs); ++ gsi_insert_before (gsi, assign_stmt, GSI_SAME_STMT); ++ ++ /* Insert cond stmt. 
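Written as plain C, the diamond that compress_candidate_with_check builds is a NULL test that selects the sentinel value. A sketch with illustrative names; the sentinel follows init_pointer_size_for_pointer_compression further down (0xff, 0xffff or 0xffffffff depending on the pointer-compression size parameter):

  #include <stdint.h>

  #define PC_NULL 0xffffffffu                 /* POINTER_COMPRESSION_NULL_VAL, 32-bit case  */

  struct node { uint32_t next; long val; };   /* illustrative candidate type  */

  static struct node *pc_gptr;                /* pool header (type->pc_gptr)  */

  static uint32_t
  compress_checked (struct node *p)
  {
    if (p == 0)                               /* bb <2>: compressed NULL  */
      return PC_NULL;
    return (uint32_t) (((uintptr_t) p - (uintptr_t) pc_gptr)
                       / sizeof (struct node));   /* bb <3>: real compression  */
  }

The PHI in bb <4> of the comment above corresponds to the merged return value.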
*/ ++ tree rhs_pointer_type = build_pointer_type (TREE_TYPE (new_rhs)); ++ gcond *cond = gimple_build_cond (EQ_EXPR, cond_lhs, ++ build_int_cst (rhs_pointer_type, 0), ++ NULL_TREE, NULL_TREE); ++ gimple_set_location (cond, UNKNOWN_LOCATION); ++ gsi_insert_before (gsi, cond, GSI_SAME_STMT); ++ ++ gimple* cur_stmt = as_a (cond); ++ edge e = split_block (cur_stmt->bb, cur_stmt); ++ basic_block split_src_bb = e->src; ++ basic_block split_dst_bb = e->dest; ++ ++ /* Create bb for nullptr. */ ++ tree phi1 = NULL_TREE; ++ basic_block true_bb = create_bb_for_compress_nullptr (split_src_bb, phi1); ++ ++ /* Create bb for comprssion. */ ++ srtype *type = get_compression_candidate_type (rhs); ++ gcc_assert (type != NULL); ++ tree phi2 = NULL_TREE; ++ basic_block false_bb = create_bb_for_compress_candidate (true_bb, new_rhs, ++ type, phi2); ++ ++ /* Rebuild and reset cfg. */ ++ remove_edge_raw (e); ++ ++ edge etrue = make_edge (split_src_bb, true_bb, EDGE_TRUE_VALUE); ++ etrue->probability = profile_probability::unlikely (); ++ true_bb->count = etrue->count (); ++ ++ edge efalse = make_edge (split_src_bb, false_bb, EDGE_FALSE_VALUE); ++ efalse->probability = profile_probability::likely (); ++ false_bb->count = efalse->count (); ++ ++ edge e1 = make_single_succ_edge (true_bb, split_dst_bb, EDGE_FALLTHRU); ++ edge e2 = make_single_succ_edge (false_bb, split_dst_bb, EDGE_FALLTHRU); ++ ++ tree phi = make_ssa_name (make_unsigned_type (POINTER_COMPRESSION_SIZE)); ++ gphi *phi_node = create_phi_node (phi, split_dst_bb); ++ add_phi_arg (phi_node, phi1, e1, UNKNOWN_LOCATION); ++ add_phi_arg (phi_node, phi2, e2, UNKNOWN_LOCATION); ++ ++ if (dom_info_available_p (CDI_DOMINATORS)) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, split_dst_bb, split_src_bb); ++ set_immediate_dominator (CDI_DOMINATORS, true_bb, split_src_bb); ++ set_immediate_dominator (CDI_DOMINATORS, false_bb, split_src_bb); ++ } ++ *gsi = gsi_start_bb (split_dst_bb); ++ new_rhs = phi; ++ return true; ++} ++ ++/* If there is a direct rewrite chance or simplification opportunity, perform ++ the simplified compression rewrite. Otherwise, create a cond expression and ++ two basic blocks to implement pointer compression. */ ++ ++bool ++ipa_struct_reorg::compress_candidate (gassign *stmt, gimple_stmt_iterator *gsi, ++ tree rhs, tree &new_rhs) ++{ ++ if (is_direct_rewrite_chance_for_compression (rhs, new_rhs)) ++ { ++ return true; ++ } ++ else if (is_simplify_chance_for_compress_candidate (stmt, rhs)) ++ { ++ return compress_candidate_without_check (gsi, rhs, new_rhs); ++ } ++ else if (compress_candidate_with_check (gsi, rhs, new_rhs)) ++ { ++ return true; ++ } ++ return false; ++} ++ ++/* Create a new basic block to decompress the index to null pointer. 
*/ ++ ++basic_block ++ipa_struct_reorg::create_bb_for_decompress_nullptr (basic_block last_bb, ++ tree new_rhs, ++ tree &phi_node) ++{ ++ basic_block new_bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (new_bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ gimple_stmt_iterator gsi = gsi_last_bb (new_bb); ++ tree rhs_pointer_type = build_pointer_type (TREE_TYPE (new_rhs)); ++ phi_node = make_ssa_name (rhs_pointer_type); ++ gimple *new_stmt = gimple_build_assign (phi_node, ++ build_int_cst (rhs_pointer_type, 0)); ++ gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCreate bb %d for decompress nullptr:\n", ++ new_bb->index); ++ gimple_dump_bb (dump_file, new_bb, 0, dump_flags); ++ } ++ return new_bb; ++} ++ ++/* Create a new basic block to decompress the index into original pointer. */ ++ ++basic_block ++ipa_struct_reorg::create_bb_for_decompress_candidate (basic_block last_bb, ++ tree lhs, srtype *type, ++ tree &phi_node) ++{ ++ basic_block new_bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (new_bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ gimple_stmt_iterator gsi = gsi_last_bb (new_bb); ++ /* decompress_ptr_to_offset () needs at least one stmt in target bb. */ ++ gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT); ++ phi_node = decompress_offset_to_ptr (lhs, type, &gsi); ++ /* Remove the NOP created above. */ ++ gsi_remove (&gsi, true); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCreate bb %d for decompress candidate:\n", ++ new_bb->index); ++ gimple_dump_bb (dump_file, new_bb, 0, dump_flags); ++ } ++ return new_bb; ++} ++ ++/* Try decompress candidate without check. */ ++ ++bool ++ipa_struct_reorg::decompress_candidate_without_check (gimple_stmt_iterator *gsi, ++ tree lhs, tree rhs, ++ tree &new_lhs, ++ tree &new_rhs) ++{ ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ bool processed = false; ++ ++ if (!gsi_one_before_end_p (*gsi)) ++ { ++ gsi_next (gsi); ++ gimple *next_stmt = gsi_stmt (*gsi); ++ if (gimple_assign_rhs_class (next_stmt) == GIMPLE_SINGLE_RHS) ++ { ++ tree next_rhs = gimple_assign_rhs1 (next_stmt); ++ /* If current lhs is used as rhs in the next stmt: ++ -> _1 = t->s ++ tt->s = _1. */ ++ if (lhs == next_rhs) ++ { ++ /* Check whether the lhs is only used in the next stmt. */ ++ unsigned use_num = 0; ++ processed = true; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) ++ { ++ if (++use_num > 1) ++ { ++ processed = false; ++ break; ++ } ++ } ++ if (processed) ++ { ++ /* Only used in next stmt, direct reset new_rhs here. */ ++ TREE_TYPE (new_lhs) ++ = make_unsigned_type (POINTER_COMPRESSION_SIZE); ++ } ++ /* If the lhs used not only in next stmt return false here. */ ++ } ++ /* handle -> _1 = t->s ++ _2 = _1->s ++ In this case, _1 might not be nullptr. 
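The two simplification chances handled by decompress_candidate_without_check, shown at the source level. Illustrative types only; 'next' stands for a field already rewritten to the 32-bit index type, and the second function shows the rewritten form in which the index is turned back into a pointer (here indexing from the pool base) without a NULL check:

  #include <stdint.h>

  struct node { uint32_t next; long val; };

  void
  copy_link (struct node *t, struct node *tt)
  {
    /* Case 1: the loaded index is only stored into another compressed field,
       so it is copied verbatim, with no decompress/re-compress pair.  */
    tt->next = t->next;
  }

  long
  deref_link (struct node *t, struct node *pool)
  {
    /* Case 2: the loaded value is dereferenced right away, so it is
       decompressed, but the NULL check is omitted.  */
    return pool[t->next].val;
  }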
*/ ++ else if (TREE_CODE (next_rhs) == COMPONENT_REF) ++ { ++ tree use_base = TREE_OPERAND (TREE_OPERAND (next_rhs, 0), 0); ++ if (use_base == lhs) ++ { ++ srtype *type = get_compression_candidate_type (rhs); ++ gcc_assert (type != NULL); ++ gsi_prev (gsi); ++ tree base = TREE_OPERAND (TREE_OPERAND (new_rhs, 0), 0); ++ tree new_mem_ref = build_simple_mem_ref (base); ++ tree new_ref = build3 (COMPONENT_REF, TREE_TYPE (new_rhs), ++ new_mem_ref, TREE_OPERAND (new_rhs, 1), ++ NULL_TREE); ++ new_rhs = decompress_offset_to_ptr (new_ref, type, gsi); ++ processed = true; ++ gsi_next (gsi); ++ } ++ } ++ } ++ gsi_prev (gsi); ++ return processed; ++ } ++ return false; ++} ++ ++/* Perform pointer decompression with check. The conversion will be as shown ++ in the following example: ++ Orig bb: ++ bb <1>: ++ _1 = _2->t ++ ++ will be transformed to: ++ bb <1>: ++ _3 = _2->t ++ if (_3 == NULL_VAL) ++ goto bb <2> ++ else ++ goto bb <3> ++ ++ bb <2>: ++ _4 = NULL ++ goto bb <4> ++ ++ bb <3>: ++ ... ++ _5 = decompress (_3) ++ goto bb <4> ++ ++ bb <4>: ++ _6 = PHI (_4, _5) ++ _1 = _6 ++ The gsi will move to the beginning of split dst bb <4>, _1 = _6 will be ++ emitted by rewrite_assign (). */ ++ ++bool ++ipa_struct_reorg::decompress_candidate_with_check (gimple_stmt_iterator *gsi, ++ tree rhs, tree &new_rhs) ++{ ++ /* Insert cond stmt. */ ++ tree cond_lhs = make_ssa_name (TREE_TYPE (new_rhs)); ++ gassign *cond_assign = gimple_build_assign (cond_lhs, new_rhs); ++ gsi_insert_before (gsi, cond_assign, GSI_SAME_STMT); ++ ++ tree null_type = make_unsigned_type (POINTER_COMPRESSION_SIZE); ++ gcond *cond = gimple_build_cond (EQ_EXPR, cond_lhs, ++ build_int_cst (null_type, ++ POINTER_COMPRESSION_NULL_VAL), ++ NULL_TREE, NULL_TREE); ++ gimple_set_location (cond, UNKNOWN_LOCATION); ++ gsi_insert_before (gsi, cond, GSI_SAME_STMT); ++ ++ /* Split bb. */ ++ gimple* cur_stmt = as_a (cond); ++ edge e = split_block (cur_stmt->bb, cur_stmt); ++ basic_block split_src_bb = e->src; ++ basic_block split_dst_bb = e->dest; ++ ++ /* Create bb for decompress nullptr. */ ++ tree phi1 = NULL_TREE; ++ basic_block true_bb = create_bb_for_decompress_nullptr (split_src_bb, ++ new_rhs, phi1); ++ ++ /* Create bb for decomprssion candidate. */ ++ tree phi2 = NULL_TREE; ++ srtype *type = get_compression_candidate_type (rhs); ++ gcc_assert (type != NULL); ++ basic_block false_bb = create_bb_for_decompress_candidate (true_bb, cond_lhs, ++ type, phi2); ++ ++ /* Refresh and reset cfg. 
*/ ++ remove_edge_raw (e); ++ ++ edge etrue = make_edge (split_src_bb, true_bb, EDGE_TRUE_VALUE); ++ etrue->probability = profile_probability::unlikely (); ++ true_bb->count = etrue->count (); ++ ++ edge efalse = make_edge (split_src_bb, false_bb, EDGE_FALSE_VALUE); ++ efalse->probability = profile_probability::likely (); ++ false_bb->count = efalse->count (); ++ ++ edge e1 = make_single_succ_edge (true_bb, split_dst_bb, EDGE_FALLTHRU); ++ edge e2 = make_single_succ_edge (false_bb, split_dst_bb, EDGE_FALLTHRU); ++ ++ tree phi = make_ssa_name (build_pointer_type (TREE_TYPE (cond_lhs))); ++ gphi *phi_node = create_phi_node (phi, split_dst_bb); ++ add_phi_arg (phi_node, phi1, e1, UNKNOWN_LOCATION); ++ add_phi_arg (phi_node, phi2, e2, UNKNOWN_LOCATION); ++ ++ if (dom_info_available_p (CDI_DOMINATORS)) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, split_dst_bb, split_src_bb); ++ set_immediate_dominator (CDI_DOMINATORS, true_bb, split_src_bb); ++ set_immediate_dominator (CDI_DOMINATORS, false_bb, split_src_bb); ++ } ++ *gsi = gsi_start_bb (split_dst_bb); ++ new_rhs = phi; ++ return true; ++} ++ ++/* If there is a simplification opportunity, perform the simplified ++ decompression rewrite. Otherwise, create a cond expression and two basic ++ blocks to implement pointer decompression. */ ++ ++bool ++ipa_struct_reorg::decompress_candidate (gimple_stmt_iterator *gsi, ++ tree lhs, tree rhs, tree &new_lhs, ++ tree &new_rhs) ++{ ++ ++ if (decompress_candidate_without_check (gsi, lhs, rhs, ++ new_lhs, new_rhs)) ++ { ++ return true; ++ } ++ return decompress_candidate_with_check (gsi, rhs, new_rhs); ++} ++ ++/* Try to perform pointer compression and decompression. */ ++ ++void ++ipa_struct_reorg::try_rewrite_with_pointer_compression (gassign *stmt, ++ gimple_stmt_iterator ++ *gsi, tree lhs, ++ tree rhs, tree &new_lhs, ++ tree &new_rhs) ++{ ++ bool l = is_compression_candidate (lhs); ++ bool r = is_compression_candidate (rhs); ++ if (!l && !r) ++ { ++ tree tmp_rhs = new_rhs == NULL_TREE ? rhs : new_rhs; ++ if (is_type_conversion_candidate_for_pc (lhs)) ++ { ++ /* Transfer MEM[(struct *)_1].files = _4; ++ to MEM[(struct *)_1].files = (struct *)_4; */ ++ new_rhs = fold_convert (TREE_TYPE (lhs), tmp_rhs); ++ } ++ else if (is_type_conversion_candidate_for_pc (rhs)) ++ { ++ /* Transfer _4 = MEM[(struct *)_1].nodes; ++ to _4 = (new_struct *) MEM[(struct *)_1].nodes; */ ++ new_rhs = fold_convert (TREE_TYPE (new_lhs), tmp_rhs); ++ } ++ } ++ else if (l && r) ++ { ++ gcc_unreachable (); ++ } ++ else if (l) ++ { ++ if (!compress_candidate (stmt, gsi, rhs, new_rhs)) ++ { ++ gcc_unreachable (); ++ } ++ } ++ else if (r) ++ { ++ if (!decompress_candidate (gsi, lhs, rhs, new_lhs, new_rhs)) ++ { ++ gcc_unreachable (); ++ } ++ } ++} ++ + bool + ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + { +@@ -6108,6 +6936,12 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + fprintf (dump_file, "replaced with:\n"); + for (unsigned i = 0; i < max_split && (newlhs[i] || newrhs[i]); i++) + { ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ { ++ try_rewrite_with_pointer_compression (stmt, gsi, lhs, rhs, ++ newlhs[i], newrhs[i]); ++ } + gimple *newstmt = gimple_build_assign (newlhs[i] ? newlhs[i] : lhs, newrhs[i] ? 
newrhs[i] : rhs); + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -6182,6 +7016,15 @@ ipa_struct_reorg::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi) + gcc_assert (false); + gimple_call_set_lhs (g, decl->newdecl[i]); + gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION ++ && type->pc_candidate) ++ { ++ /* Init global header for pointer compression. */ ++ gassign *gptr ++ = gimple_build_assign (type->pc_gptr, decl->newdecl[i]); ++ gsi_insert_before (gsi, gptr, GSI_SAME_STMT); ++ } + } + return true; + } +@@ -6647,6 +7490,13 @@ ipa_struct_reorg::rewrite_functions (void) + push_cfun (DECL_STRUCT_FUNCTION (node->decl)); + current_function = f; + ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ { ++ calculate_dominance_info (CDI_DOMINATORS); ++ loop_optimizer_init (0); ++ } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nBefore rewrite: %dth_%s\n", +@@ -6722,6 +7572,12 @@ ipa_struct_reorg::rewrite_functions (void) + + free_dominance_info (CDI_DOMINATORS); + ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ { ++ loop_optimizer_finalize (); ++ } ++ + if (dump_file) + { + fprintf (dump_file, "\nAfter rewrite: %dth_%s\n", +@@ -6756,6 +7612,10 @@ ipa_struct_reorg::execute_struct_relayout (void) + { + continue; + } ++ if (get_type_name (types[i]->type) == NULL) ++ { ++ continue; ++ } + retval |= ipa_struct_relayout (type, this).execute (); + } + +@@ -6776,6 +7636,147 @@ ipa_struct_reorg::execute_struct_relayout (void) + return retval; + } + ++/* True if the var with void type is only used to compare with the same ++ target type. */ ++ ++bool ++ipa_struct_reorg::is_safe_void_cmp (tree var, srtype *type) ++{ ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, var) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ { ++ continue; ++ } ++ if (gimple_code (use_stmt) == GIMPLE_COND) ++ { ++ tree lhs = gimple_cond_lhs (use_stmt); ++ tree rhs = gimple_cond_rhs (use_stmt); ++ tree xhs = lhs == var ? rhs : lhs; ++ if (types_compatible_p (inner_type (TREE_TYPE (xhs)), type->type)) ++ { ++ continue; ++ } ++ } ++ return false; ++ } ++ return true; ++} ++ ++/* Mark the structure that should perform pointer compression. 
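A source-level sketch of the shape is_safe_void_cmp above is meant to tolerate: a void * temporary whose only uses are comparisons against pointers of the candidate type no longer forces the escape_cast_void escape. Illustrative code; the pc_ptr2void.c test added below exercises this pattern:

  struct node { long val; };

  void
  touch_all (struct node *first, struct node *stop_node)
  {
    void *stop = (void *) stop_node;      /* void * copy of a candidate pointer  */
    for (struct node *n = first; n < (struct node *) stop; n++)
      n->val = 0;                         /* 'stop' is only used in the compare  */
  }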
*/ ++ ++void ++ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void) ++{ ++ unsigned pc_transform_num = 0; ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "\nMark the structure that should perform pointer" ++ " compression:\n"); ++ } ++ ++ for (unsigned i = 0; i < types.length (); i++) ++ { ++ srtype *type = types[i]; ++ if (dump_file) ++ { ++ print_generic_expr (dump_file, type->type); ++ } ++ if (type->has_escaped ()) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has escaped by %s, skip compression.\n", ++ type->escape_reason ()); ++ } ++ continue; ++ } ++ if (TYPE_FIELDS (type->type) == NULL) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has zero field, skip compression.\n"); ++ } ++ continue; ++ } ++ if (type->chain_type) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " is chain_type, skip compression.\n"); ++ } ++ continue; ++ } ++ if (type->has_alloc_array != 1) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has alloc number: %d, skip compression.\n", ++ type->has_alloc_array); ++ } ++ continue; ++ } ++ if (get_type_name (type->type) == NULL) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has empty struct name," ++ " skip compression.\n"); ++ } ++ continue; ++ } ++ pc_transform_num++; ++ type->pc_candidate = true; ++ if (dump_file) ++ { ++ fprintf (dump_file, " attemps to do pointer compression.\n"); ++ } ++ } ++ ++ if (dump_file) ++ { ++ if (pc_transform_num) ++ { ++ fprintf (dump_file, "\nNumber of structures to transform in " ++ "pointer compression is %d\n", pc_transform_num); ++ } ++ else ++ { ++ fprintf (dump_file, "\nNo structures to transform in " ++ "pointer compression.\n"); ++ } ++ } ++} ++ ++/* Init pointer size from parameter param_pointer_compression_size. */ ++ ++static void ++init_pointer_size_for_pointer_compression (void) ++{ ++ switch (param_pointer_compression_size) ++ { ++ case 1: ++ POINTER_COMPRESSION_SIZE = 8; // sizeof (uint8) ++ POINTER_COMPRESSION_NULL_VAL = 0xff; // max (uint8) ++ break; ++ case 2: ++ POINTER_COMPRESSION_SIZE = 16; // sizeof (uint16) ++ POINTER_COMPRESSION_NULL_VAL = 0xffff; // max (uint16) ++ break; ++ case 3: ++ POINTER_COMPRESSION_SIZE = 32; // sizeof (uint32) ++ POINTER_COMPRESSION_NULL_VAL = 0xffffffff; // max (uint32) ++ break; ++ default: ++ internal_error ("Invalid pointer compression level: %d", ++ param_pointer_compression_size); ++ } ++} ++ + unsigned int + ipa_struct_reorg::execute (enum srmode mode) + { +@@ -6796,7 +7797,11 @@ ipa_struct_reorg::execute (enum srmode mode) + { + analyze_types (); + } +- ++ if (mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ { ++ check_and_prune_struct_for_pointer_compression (); ++ } + ret = rewrite_functions (); + } + else if (mode == COMPLETE_STRUCT_RELAYOUT) +@@ -6894,6 +7899,10 @@ public: + virtual unsigned int execute (function *) + { + unsigned int ret = 0; ++ if (struct_layout_optimize_level >= POINTER_COMPRESSION) ++ { ++ init_pointer_size_for_pointer_compression (); ++ } + ret = ipa_struct_reorg ().execute (STRUCT_LAYOUT_OPTIMIZE); + return ret; + } +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +index 936c0fa6f..357604d11 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +@@ -122,7 +122,9 @@ private: + public: + + tree newtype[max_split]; ++ tree pc_gptr; + bool visited; ++ bool pc_candidate; + int has_alloc_array; + + // Constructors +@@ -144,6 +146,7 @@ public: + void analyze (void); + bool 
has_dead_field (void); + void mark_escape (escape_type, gimple *stmt); ++ void create_global_ptr_for_pointer_compression (); + bool has_escaped (void) + { + return escapes != does_not_escape; +diff --git a/gcc/params.opt b/gcc/params.opt +index 9d1faa7ab..ca457f584 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -984,4 +984,8 @@ High execution rate loops to be analyzed in prefetch (in%). + Common Joined UInteger Var(param_prefetch_func_counts_threshold) Init(100) Param Optimization + Threshold functions of cache miss counts to be analyzed in prefetching. + ++-param=param-pointer-compression-size= ++Common Joined UInteger Var(param_pointer_compression_size) Init(3) IntegerRange(1, 3) Param Optimization ++Target size of pointer compression. ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/testsuite/gcc.dg/struct/csr_skip_void_struct_name.c b/gcc/testsuite/gcc.dg/struct/csr_skip_void_struct_name.c +new file mode 100644 +index 000000000..c5e4968d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/csr_skip_void_struct_name.c +@@ -0,0 +1,53 @@ ++// Structures without names should not be optimized ++/* { dg-do compile } */ ++#include ++#include ++ ++typedef struct ++{ ++ int a; ++ float b; ++ double s1; ++ double s2; ++ double s3; ++ double s4; ++ double s5; ++ double s6; ++ double s7; ++ double s8; ++} str_t1; ++ ++#define N 1000 ++ ++int num; ++ ++int ++main () ++{ ++ int i, r; ++ ++ r = rand (); ++ num = r > N ? N : r; ++ str_t1 *p1 = calloc (num, sizeof (str_t1)); ++ ++ if (p1 == NULL) ++ return 0; ++ ++ for (i = 0; i < num; i++) ++ p1[i].a = 1; ++ ++ for (i = 0; i < num; i++) ++ p1[i].b = 2; ++ ++ for (i = 0; i < num; i++) ++ if (p1[i].a != 1) ++ abort (); ++ ++ for (i = 0; i < num; i++) ++ if (fabsf (p1[i].b - 2) > 0.0001) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform in Complete Structure Relayout." 
"struct_reorg" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/pc_cast_int.c b/gcc/testsuite/gcc.dg/struct/pc_cast_int.c +new file mode 100644 +index 000000000..f9954eefd +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/pc_cast_int.c +@@ -0,0 +1,91 @@ ++// Escape cast int for pointer compression ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++ network_t* net_add; ++}; ++ ++ ++const int MAX = 100; ++network_t* net; ++node_p node; ++ ++int ++main () ++{ ++ net = (network_t*) calloc (1, sizeof(network_t)); ++ net->arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ net->sorted_arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ net->nodes = (node_p) calloc (MAX, sizeof (node_t)); ++ net->arcs->id = 100; ++ ++ node = net->nodes; ++ node_p n1 = (node_p) 0x123456; ++ ++ for (unsigned i = 0; i < MAX; i++) ++ { ++ node->pred = n1; ++ node = node + 1; ++ } ++ ++ node = net->nodes; ++ ++ for (unsigned i = 0; i < MAX; i++) ++ { ++ if (node->pred != n1) ++ { ++ abort (); ++ } ++ node = node + 1; ++ } ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform in pointer compression" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/pc_compress_and_decomress.c b/gcc/testsuite/gcc.dg/struct/pc_compress_and_decomress.c +new file mode 100644 +index 000000000..9cc7278b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/pc_compress_and_decomress.c +@@ -0,0 +1,90 @@ ++// Support basic pointer compression and decompression ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++ network_t* net_add; ++}; ++ ++ ++const int MAX = 100; ++network_t* net; ++node_p node; ++ ++int ++main () ++{ ++ net = (network_t*) calloc (1, sizeof(network_t)); ++ net->arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ net->sorted_arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ net->nodes = (node_p) calloc (MAX, sizeof (node_t)); ++ net->arcs->id = 100; ++ ++ node = net->nodes; ++ ++ for (unsigned i = 0; i < MAX; i++) ++ { ++ node->pred = node; ++ node = node + 1; ++ } ++ ++ node = net->nodes; ++ ++ for (unsigned i = 0; i < MAX; i++) ++ { ++ if (node->pred 
!= node) ++ { ++ abort (); ++ } ++ node = node + 1; ++ } ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform in pointer compression is 1" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/pc_ptr2void.c b/gcc/testsuite/gcc.dg/struct/pc_ptr2void.c +new file mode 100644 +index 000000000..6cda10bb9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/pc_ptr2void.c +@@ -0,0 +1,87 @@ ++// Partially support escape_cast_void for pointer compression. ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs, sorted_arcs; ++ int x; ++ node_p nodes, stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++}; ++ ++const int MAX = 100; ++network_t* net = NULL; ++int cnt = 0; ++ ++__attribute__((noinline)) int ++primal_feasible (network_t *net) ++{ ++ void* stop; ++ node_t *node; ++ ++ node = net->nodes; ++ stop = (void *)net->stop_nodes; ++ for( node++; node < (node_t *)stop; node++ ) ++ { ++ net->x = 1; ++ printf( "PRIMAL NETWORK SIMPLEX: "); ++ } ++ return 0; ++} ++ ++int ++main () ++{ ++ net = (network_t*) calloc (1, 20); ++ net->nodes = calloc (MAX, sizeof (node_t)); ++ net->stop_nodes = net->nodes + MAX - 1; ++ cnt = primal_feasible( net ); ++ ++ net = (network_t*) calloc (1, 20); ++ if( !(net->arcs) ) ++ { ++ return -1; ++ } ++ return cnt; ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform in pointer compression is 1" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/pc_simple_rewrite_pc.c b/gcc/testsuite/gcc.dg/struct/pc_simple_rewrite_pc.c +new file mode 100644 +index 000000000..20e15515e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/pc_simple_rewrite_pc.c +@@ -0,0 +1,112 @@ ++// Check simplify rewrite chance for pointer compression and decompression ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node *node_p; ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ arc_p arcs; ++ arc_p sorted_arcs; ++ int x; ++ node_p nodes; ++ node_p stop_nodes; ++} network_t; ++ ++struct node ++{ ++ int64_t potential; ++ int orientation; ++ node_p child; ++ node_p pred; ++ node_p sibling; ++ node_p sibling_prev; ++ arc_p basic_arc; ++ arc_p firstout; ++ arc_p firstin; ++ arc_p arc_tmp; ++ int64_t flow; ++ int64_t depth; ++ int number; ++ int time; ++}; ++ ++struct arc ++{ ++ int id; ++ int64_t cost; ++ node_p tail; ++ node_p head; ++ short ident; ++ arc_p nextout; ++ arc_p nextin; ++ int64_t flow; ++ int64_t org_cost; ++ network_t* net_add; ++}; ++ ++ ++const int MAX = 100; ++network_t* net; ++node_p node; ++arc_p arc; ++ ++int ++main () ++{ ++ net = (network_t*) calloc (1, sizeof(network_t)); ++ net->arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ net->sorted_arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ net->nodes = (node_p) calloc (MAX, sizeof (node_t)); ++ net->arcs->id = 100; ++ ++ node = 
net->nodes; ++ arc = net->arcs; ++ ++ for (unsigned i = 0; i < MAX; i++) ++ { ++ arc->head = node; ++ arc->head->child = node; ++ node->potential = i + 1; ++ arc->cost = arc->head->potential; ++ arc->tail = node->sibling; ++ if (i % 2) ++ node->pred = net->nodes + i; ++ else ++ node->pred = NULL; ++ ++ if (node->pred && node->pred->child != NULL) ++ node->number = 0; ++ else ++ node->number = 1; ++ ++ node = node + 1; ++ arc = arc + 1; ++ } ++ ++ node = net->nodes; ++ arc = net->arcs; ++ ++ for (unsigned i = 0; i < MAX; i++) ++ { ++ node_p t = i % 2 ? node : NULL; ++ int tt = i % 2 ? 0 : 1; ++ if (arc->head->pred != t || arc->cost == 0 ++ || arc->tail != node->sibling || node->number != tt) ++ { ++ abort (); ++ } ++ arc = arc + 1; ++ node = node + 1; ++ } ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform in pointer compression is 1" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/pc_skip_void_struct_name.c b/gcc/testsuite/gcc.dg/struct/pc_skip_void_struct_name.c +new file mode 100644 +index 000000000..2200e90b9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/pc_skip_void_struct_name.c +@@ -0,0 +1,53 @@ ++// Structures without names should not be optimized ++/* { dg-do compile } */ ++#include ++#include ++ ++typedef struct ++{ ++ int a; ++ float b; ++ double s1; ++ double s2; ++ double s3; ++ double s4; ++ double s5; ++ double s6; ++ double s7; ++ double s8; ++} str_t1; ++ ++#define N 1000 ++ ++int num; ++ ++int ++main () ++{ ++ int i, r; ++ ++ r = rand (); ++ num = r > N ? N : r; ++ str_t1 *p1 = calloc (num, sizeof (str_t1)); ++ ++ if (p1 == NULL) ++ return 0; ++ ++ for (i = 0; i < num; i++) ++ p1[i].a = 1; ++ ++ for (i = 0; i < num; i++) ++ p1[i].b = 2; ++ ++ for (i = 0; i < num; i++) ++ if (p1[i].a != 1) ++ abort (); ++ ++ for (i = 0; i < num; i++) ++ if (fabsf (p1[i].b - 2) > 0.0001) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform in pointer compression" "struct_layout" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +index ac5585813..852c37ff3 100644 +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +@@ -85,6 +85,10 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout + gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/dfe*.c]] \ + "" "-fipa-struct-reorg=3 -fdump-ipa-all -flto-partition=one -fwhole-program" + ++# -fipa-struct-reorg=4 ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pc_*.c]] \ ++ "" "-fipa-struct-reorg=4 -fdump-ipa-all-details -flto-partition=one -fwhole-program" ++ + # All done. + torture-finish + dg-finish +-- +2.27.0 + diff --git a/0050-PointerCompression-Adaptation.patch b/0050-PointerCompression-Adaptation.patch new file mode 100644 index 0000000000000000000000000000000000000000..a5c45c93c59a41ae691abea7b2642527847849ee --- /dev/null +++ b/0050-PointerCompression-Adaptation.patch @@ -0,0 +1,207 @@ +From af4ea6a196d0b2ded5c59394425f16381672ebe7 Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 28 Jun 2022 20:18:21 +0800 +Subject: [PATCH] [PointerCompression] Adaptation. + +We implemented adaptations for pointer compression and complete struct relayout. 
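The hunks below extend maybe_rewrite_cst so that constants derived from the original structure size are still recognized when pointer compression has turned the pointer arithmetic into integer PLUS_EXPR/cast chains or into an explicit division (pointer difference). A sketch of the source constructs whose GIMPLE carries such constants (illustrative, not from the patch):

  #include <stddef.h>

  struct arc { long cost; struct arc *next; };

  ptrdiff_t
  distance (struct arc *base)
  {
    struct arc *a = base + 100;   /* POINTER_PLUS_EXPR: base + 100 * sizeof (struct arc)  */
    return a - base;              /* pointer diff: (a - base) divided by sizeof (struct arc)  */
  }

Both constants are multiples of the old structure size and must be rescaled (is_result_of_mult / times) once the layout changes.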
+ +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 2b799339a..4a8a5e23f 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -105,6 +105,11 @@ along with GCC; see the file COPYING3. If not see + + #define VOID_POINTER_P(type) (POINTER_TYPE_P (type) && VOID_TYPE_P (TREE_TYPE (type))) + ++#define IS_PTR_CMP_TYPE(type) \ ++ (TYPE_UNSIGNED (type) == 1 \ ++ && TREE_CODE (type) == INTEGER_TYPE \ ++ && TYPE_PRECISION (type) == POINTER_COMPRESSION_SIZE) ++ + namespace { + + using namespace struct_reorg; +@@ -1295,12 +1300,12 @@ csrtype::init_type_info (void) + + /* Close enough to pad to improve performance. + 33~63 should pad to 64 but 33~48 (first half) are too far away, and +- 65~127 should pad to 128 but 65~80 (first half) are too far away. */ ++ 65~127 should pad to 128 but 65~70 (first half) are too far away. */ + if (old_size > 48 && old_size < 64) + { + new_size = 64; + } +- if (old_size > 80 && old_size < 128) ++ if (old_size > 70 && old_size < 128) + { + new_size = 128; + } +@@ -2028,27 +2033,95 @@ ipa_struct_relayout::maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, + { + return false; + } +- gsi_next (gsi); +- gimple *stmt2 = gsi_stmt (*gsi); +- +- if (gimple_code (stmt2) == GIMPLE_ASSIGN +- && gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) ++ // Check uses. ++ imm_use_iterator imm_iter_lhs; ++ use_operand_p use_p_lhs; ++ FOR_EACH_IMM_USE_FAST (use_p_lhs, imm_iter_lhs, gimple_assign_lhs (stmt)) + { +- tree lhs = gimple_assign_lhs (stmt2); +- tree rhs1 = gimple_assign_rhs1 (stmt2); +- if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) +- || types_compatible_p (inner_type (TREE_TYPE (lhs)), ctype.type)) ++ gimple *stmt2 = USE_STMT (use_p_lhs); ++ if (gimple_code (stmt2) != GIMPLE_ASSIGN) ++ continue; ++ if (gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) + { +- tree num = NULL; +- if (is_result_of_mult (cst, &num, TYPE_SIZE_UNIT (ctype.type))) ++ tree lhs = gimple_assign_lhs (stmt2); ++ tree rhs1 = gimple_assign_rhs1 (stmt2); ++ if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) ++ || types_compatible_p (inner_type (TREE_TYPE (lhs)), ++ ctype.type)) + { +- times = TREE_INT_CST_LOW (num); +- ret = true; ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, ++ TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } ++ } ++ } ++ // For pointer compression. ++ else if (current_mode == COMPLETE_STRUCT_RELAYOUT ++ && gimple_assign_rhs_code (stmt2) == PLUS_EXPR) ++ { ++ // Check uses. ++ imm_use_iterator imm_iter_cast; ++ use_operand_p use_p_cast; ++ FOR_EACH_IMM_USE_FAST (use_p_cast, imm_iter_cast, ++ gimple_assign_lhs (stmt2)) ++ { ++ gimple *stmt_cast = USE_STMT (use_p_cast); ++ if (gimple_code (stmt_cast) != GIMPLE_ASSIGN) ++ continue; ++ if (gimple_assign_cast_p (stmt_cast)) ++ { ++ tree lhs_type = inner_type (TREE_TYPE ( ++ gimple_assign_lhs (stmt_cast))); ++ if (types_compatible_p (lhs_type, ctype.type)) ++ { ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, ++ TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } ++ } ++ } ++ } ++ } ++ } ++ } ++ // For pointer compression. 
++ if (current_mode == COMPLETE_STRUCT_RELAYOUT ++ && gimple_assign_rhs_code (stmt) == TRUNC_DIV_EXPR) ++ { ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ tree lhs = gimple_assign_lhs (stmt); ++ if (lhs == NULL_TREE) ++ return false; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ if (gimple_code (use_stmt) != GIMPLE_ASSIGN) ++ continue; ++ if (gimple_assign_cast_p (use_stmt)) ++ { ++ tree lhs_type = inner_type (TREE_TYPE ( ++ gimple_assign_lhs (use_stmt))); ++ if (IS_PTR_CMP_TYPE (lhs_type)) ++ { ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, ++ TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } + } + } + } +- gsi_prev (gsi); +- return ret; + } + return false; + } +@@ -3144,7 +3217,9 @@ ipa_struct_reorg::record_var (tree decl, escape_type escapes, int arg) + e = escape_separate_instance; + } + +- if (e != does_not_escape) ++ if (e != does_not_escape ++ && (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || replace_type_map.get (type->type) == NULL)) + type->mark_escape (e, NULL); + } + +@@ -3810,7 +3885,9 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple + if (TREE_CODE (side) == SSA_NAME + && VOID_POINTER_P (TREE_TYPE (side))) + return; +- d->type->mark_escape (escape_cast_another_ptr, stmt); ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || replace_type_map.get (d->type->type) == NULL) ++ d->type->mark_escape (escape_cast_another_ptr, stmt); + return; + } + +@@ -3827,7 +3904,9 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple + else + { + /* *_1 = &MEM[(void *)&x + 8B]. */ +- type->mark_escape (escape_cast_another_ptr, stmt); ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || replace_type_map.get (type->type) == NULL) ++ type->mark_escape (escape_cast_another_ptr, stmt); + } + } + else if (type != d->type) +@@ -4560,7 +4639,9 @@ ipa_struct_reorg::check_definition_assign (srdecl *decl, vec &worklist) + /* Casts between pointers and integer are escaping. */ + if (gimple_assign_cast_p (stmt)) + { +- type->mark_escape (escape_cast_int, stmt); ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || replace_type_map.get (type->type) == NULL) ++ type->mark_escape (escape_cast_int, stmt); + return; + } + +@@ -4908,7 +4989,9 @@ ipa_struct_reorg::check_use (srdecl *decl, gimple *stmt, vec &worklist) + /* Casts between pointers and integer are escaping. 
*/ + if (gimple_assign_cast_p (stmt)) + { +- type->mark_escape (escape_cast_int, stmt); ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || replace_type_map.get (type->type) == NULL) ++ type->mark_escape (escape_cast_int, stmt); + return; + } + +-- +2.27.0 + diff --git a/0051-DFE-fixme.patch b/0051-DFE-fixme.patch new file mode 100644 index 0000000000000000000000000000000000000000..4e47320ae6b5f011edad123c9835e7d2bcbd8a96 --- /dev/null +++ b/0051-DFE-fixme.patch @@ -0,0 +1,103 @@ +From 16335022e12a04654d8f3f76c57574fec7602e6c Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 28 Jun 2022 20:59:53 +0800 +Subject: [PATCH] [DFE] fixme + +Bugfix by wmc 4cee589a0ee3258f0f57ded5e15784a2b0186328 + +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 4a8a5e23f..6ab25edf9 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -3468,12 +3468,9 @@ ipa_struct_reorg::find_vars (gimple *stmt) + } + } + +-/* Update field_access in srfield. */ +- + static void +-update_field_access (tree node, tree op, unsigned access, void *data) ++get_offset (tree op, HOST_WIDE_INT &offset) + { +- HOST_WIDE_INT offset = 0; + switch (TREE_CODE (op)) + { + case COMPONENT_REF: +@@ -3489,12 +3486,15 @@ update_field_access (tree node, tree op, unsigned access, void *data) + default: + return; + } +- tree base = node; +- get_base (base, node); +- srdecl *this_srdecl = ((ipa_struct_reorg *)data)->find_decl (base); +- if (this_srdecl == NULL) +- return; +- srtype *this_srtype = this_srdecl->type; ++ return; ++} ++ ++/* Record field access. */ ++static void ++record_field_access (tree type, HOST_WIDE_INT offset, ++ unsigned access, void *data) ++{ ++ srtype *this_srtype = ((ipa_struct_reorg *)data)->find_type (type); + if (this_srtype == NULL) + return; + srfield *this_srfield = this_srtype->find_field (offset); +@@ -3505,12 +3505,33 @@ update_field_access (tree node, tree op, unsigned access, void *data) + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "record field access %d:", access); +- print_generic_expr (dump_file, this_srtype->type); ++ print_generic_expr (dump_file, type); + fprintf (dump_file, " field:"); + print_generic_expr (dump_file, this_srfield->fielddecl); + fprintf (dump_file, "\n"); + } + return; ++ ++} ++ ++/* Update field_access in srfield. */ ++ ++static void ++update_field_access (tree node, tree op, unsigned access, void *data) ++{ ++ HOST_WIDE_INT offset = 0; ++ get_offset (op, offset); ++ tree node_type = inner_type (TREE_TYPE (node)); ++ record_field_access (node_type, offset, access, data); ++ tree base = node; ++ get_base (base, node); ++ tree base_type = inner_type (TREE_TYPE (base)); ++ if (!types_compatible_p (base_type, node_type)) ++ { ++ get_offset (node, offset); ++ record_field_access (base_type, offset, access, data); ++ } ++ return; + } + + /* A callback for walk_stmt_load_store_ops to visit store. 
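The DFE fix in this patch records field accesses for both the accessed type and its base type, and the hunk that follows relaxes remove_dead_field_stmt so that stores to fields which did not receive a new field are dropped. A minimal illustration of the kind of field this targets (illustrative, not a test from the patch):

  struct blob
  {
    long used;
    long dead;   /* never read, so dead-field elimination can drop it and the
                    store below along with it  */
  };

  long
  sum (struct blob *b, int n)
  {
    long s = 0;
    for (int i = 0; i < n; i++)
      {
        b[i].dead = i;      /* write-only access  */
        s += b[i].used;     /* the read keeps 'used' alive  */
      }
    return s;
  }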
*/ +@@ -3557,8 +3578,7 @@ ipa_struct_reorg::remove_dead_field_stmt (tree lhs) + return false; + if (f == NULL) + return false; +- if (f->newfield[0] == NULL +- && (f->field_access & WRITE_FIELD)) ++ if (f->newfield[0] == NULL) + return true; + return false; + } +@@ -6838,7 +6858,6 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + fprintf (dump_file, "To: \n"); + print_gimple_stmt (dump_file, stmt, 0); + } +- return false; + } + + if (gimple_clobber_p (stmt)) +-- +2.27.0 + diff --git a/0052-SemiRelayout-structure-Semi-Relayout.patch b/0052-SemiRelayout-structure-Semi-Relayout.patch new file mode 100644 index 0000000000000000000000000000000000000000..68ed419a5de079cf256c21f87ccaab1d4ee4598b --- /dev/null +++ b/0052-SemiRelayout-structure-Semi-Relayout.patch @@ -0,0 +1,1268 @@ +From 61422d2ffb50b52e17922f6d85b5365e1872aaff Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 28 Jun 2022 21:34:54 +0800 +Subject: [PATCH] [SemiRelayout] structure Semi Relayout + + +diff --git a/gcc/common.opt b/gcc/common.opt +index 6278ebb39..c8b1d28f1 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1884,8 +1884,8 @@ Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 4) +--fipa-struct-reorg=[0,1,2,3,4] adding none, struct-reorg, reorder-fields, dfe, pointer-compression optimizations. ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 5) ++-fipa-struct-reorg=[0,1,2,3,4,5] adding none, struct-reorg, reorder-fields, dfe, semi-relayout, pointer-compression optimizations. + + fipa-extend-auto-profile + Common Report Var(flag_ipa_extend_auto_profile) +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 6ab25edf9..9798aec96 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -278,6 +278,7 @@ enum struct_layout_opt_level + STRUCT_REORG, + STRUCT_REORDER_FIELDS, + DEAD_FIELD_ELIMINATION, ++ SEMI_RELAYOUT, + POINTER_COMPRESSION + }; + +@@ -416,7 +417,10 @@ srtype::srtype (tree type) + pc_gptr (NULL), + visited (false), + pc_candidate (false), +- has_alloc_array (0) ++ has_alloc_array (0), ++ semi_relayout (false), ++ bucket_parts (0), ++ bucket_size (0) + { + for (int i = 0; i < max_split; i++) + newtype[i] = NULL_TREE; +@@ -910,6 +914,36 @@ srfield::create_new_optimized_fields (tree newtype[max_split], + newfield[0] = field; + } + ++ ++// TODO: 添加注释,修改8、RELAYOUT_PART_SIZE、64三个魔鬼数字 ++unsigned ++srtype::calculate_bucket_size () ++{ ++ unsigned parts = 0; ++ unsigned bit_sum = 0; ++ unsigned relayout_offset = 0; ++ unsigned curr_part_num = 0; // TODO: for spec17, fix when release ++ for (tree f = TYPE_FIELDS (newtype[0]); f; f = DECL_CHAIN (f)) ++ { ++ unsigned size = TYPE_PRECISION (TREE_TYPE (f)); ++ bit_sum += size; ++ if (++curr_part_num > 2 || bit_sum > 64) ++ { ++ bit_sum = size; ++ parts++; ++ relayout_offset = RELAYOUT_PART_SIZE * parts; ++ curr_part_num = 1; ++ } ++ else ++ { ++ relayout_offset = RELAYOUT_PART_SIZE * parts + (bit_sum - size) / 8; ++ } ++ new_field_offsets.put (f, relayout_offset); ++ } ++ bucket_parts = ++parts; ++ return parts * RELAYOUT_PART_SIZE; ++} ++ + /* Create the new TYPE corresponding to THIS type. 
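Reading the loop in calculate_bucket_size above: a bucket part accepts at most two fields and at most 64 bits, and a field's recorded offset is parts_so_far * RELAYOUT_PART_SIZE plus its byte offset inside the current part. A worked example with new field widths of 64, 32, 32 and 16 bits, abbreviating RELAYOUT_PART_SIZE as P:

  field  bits  part  offset
  f1     64    0     0
  f2     32    1     P          (64 + 32 bits > 64 -> new part)
  f3     32    1     P + 4
  f4     16    2     2 * P      (third field in a part -> new part)

which gives bucket_parts == 3 and a bucket_size of 3 * P bytes.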
*/ + + bool +@@ -1026,6 +1060,11 @@ srtype::create_new_type (void) + create_global_ptr_for_pointer_compression (); + } + ++ if (semi_relayout) ++ { ++ bucket_size = calculate_bucket_size (); ++ } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Created %d types:\n", maxclusters); +@@ -1473,6 +1512,33 @@ public: + tree &); + basic_block create_bb_for_compress_nullptr (basic_block, tree &); + basic_block create_bb_for_decompress_nullptr (basic_block, tree, tree &); ++ ++ // Semi-relayout methods: ++ bool is_semi_relayout_candidate (tree); ++ srtype *get_semi_relayout_candidate_type (tree); ++ void check_and_prune_struct_for_semi_relayout (void); ++ tree rewrite_pointer_diff (gimple_stmt_iterator *, tree, tree, srtype *); ++ tree rewrite_pointer_plus_integer (gimple *, gimple_stmt_iterator *, tree, ++ tree, srtype *); ++ tree build_div_expr (gimple_stmt_iterator *, tree, tree); ++ tree get_true_pointer_base (gimple_stmt_iterator *, tree, srtype *); ++ tree get_real_allocated_ptr (tree, gimple_stmt_iterator *); ++ tree set_ptr_for_use (tree, gimple_stmt_iterator *); ++ void record_allocated_size (tree, gimple_stmt_iterator *, tree); ++ tree read_allocated_size (tree, gimple_stmt_iterator *); ++ gimple *create_aligned_alloc (gimple_stmt_iterator *, srtype *, tree, ++ tree &); ++ void create_memset_zero (tree, gimple_stmt_iterator *, tree); ++ void create_memcpy (tree, tree, tree, gimple_stmt_iterator *); ++ void create_free (tree, gimple_stmt_iterator *); ++ void copy_to_lhs (tree, tree, gimple_stmt_iterator *); ++ srtype *get_relayout_candidate_type (tree); ++ long unsigned int get_true_field_offset (srfield *, srtype *); ++ tree rewrite_address (tree, srfield *, srtype *, gimple_stmt_iterator *); ++ bool check_sr_copy (gimple *); ++ void relayout_field_copy (gimple_stmt_iterator *, gimple *, tree, tree, ++ tree&, tree &); ++ void do_semi_relayout (gimple_stmt_iterator *, gimple *, tree &, tree &); + }; + + struct ipa_struct_relayout +@@ -3621,6 +3687,9 @@ calculate_mult_num (tree arg, tree *num, tree struct_size) + gcc_assert (TREE_CODE (arg) == INTEGER_CST); + bool sign = false; + HOST_WIDE_INT size = TREE_INT_CST_LOW (arg); ++ // FIXME: 添加判断此处是否被relayout优化 ++ if (size == RELAYOUT_PART_SIZE) ++ return false; + if (size < 0) + { + size = -size; +@@ -4576,7 +4645,7 @@ ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type) + { + if ((current_mode == COMPLETE_STRUCT_RELAYOUT + || (current_mode == STRUCT_LAYOUT_OPTIMIZE +- && struct_layout_optimize_level >= POINTER_COMPRESSION)) ++ && struct_layout_optimize_level >= STRUCT_REORG)) + && handled_allocation_stmt (stmt)) + { + tree arg0 = gimple_call_arg (stmt, 0); +@@ -4590,6 +4659,11 @@ ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type) + else if (bb->loop_father != NULL + && loop_outer (bb->loop_father) != NULL) + { ++ if (struct_layout_optimize_level <= SEMI_RELAYOUT ++ && gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) ++ { ++ return; ++ } + /* The allocation is in a loop. 
*/ + type->has_alloc_array = -2; + } +@@ -4665,6 +4739,15 @@ ipa_struct_reorg::check_definition_assign (srdecl *decl, vec &worklist) + return; + } + ++ // FIXME: 添加被relayout优化的逃逸类型 ++ if (gimple_assign_rhs_code (stmt) == LSHIFT_EXPR ++ || gimple_assign_rhs_code (stmt) == RSHIFT_EXPR) ++ { ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT) ++ type->mark_escape (escape_cast_int, stmt); ++ return; ++ } ++ + /* d) if the name is from a cast/assignment, make sure it is used as + that type or void* + i) If void* then push the ssa_name into worklist. */ +@@ -6277,6 +6360,54 @@ ipa_struct_reorg::is_compression_candidate (tree xhs) + return false; + } + ++srtype * ++ipa_struct_reorg::get_semi_relayout_candidate_type (tree xhs) ++{ ++ if (xhs == NULL) ++ return NULL; ++ ++ if (TREE_CODE (xhs) == SSA_NAME || TREE_CODE (xhs) == COMPONENT_REF) ++ { ++ srtype *access_type = find_type (inner_type (TREE_TYPE (xhs))); ++ if (access_type != NULL && access_type->semi_relayout) ++ return access_type; ++ } ++ ++ return NULL; ++} ++ ++bool ++ipa_struct_reorg::is_semi_relayout_candidate (tree xhs) ++{ ++ if (xhs == NULL) ++ return false; ++ ++ if (TREE_CODE (xhs) == SSA_NAME) ++ xhs = TREE_TYPE (xhs); ++ ++ if (TREE_CODE (xhs) == POINTER_TYPE) ++ { ++ srtype *var_type = find_type (TREE_TYPE (xhs)); ++ if (!var_type || var_type->has_escaped ()) ++ return false; ++ if (var_type->semi_relayout) ++ return true; ++ } ++ ++ if (TREE_CODE (xhs) == COMPONENT_REF) ++ { ++ tree mem = TREE_OPERAND (xhs, 0); ++ if (TREE_CODE (mem) == MEM_REF) ++ { ++ tree type = TREE_TYPE (mem); ++ srtype *old_type = get_relayout_candidate_type (type); ++ if (types_compatible_p (type, old_type->type) && old_type->semi_relayout) ++ return true; ++ } ++ } ++ return false; ++} ++ + /* True if xhs is a component_ref that base has escaped but uses a compression + candidate type. */ + +@@ -6478,8 +6609,8 @@ ipa_struct_reorg::compress_candidate_with_check (gimple_stmt_iterator *gsi, + gimple_set_location (cond, UNKNOWN_LOCATION); + gsi_insert_before (gsi, cond, GSI_SAME_STMT); + +- gimple* cur_stmt = as_a (cond); +- edge e = split_block (cur_stmt->bb, cur_stmt); ++ gimple* curr_stmt = as_a (cond); ++ edge e = split_block (curr_stmt->bb, curr_stmt); + basic_block split_src_bb = e->src; + basic_block split_dst_bb = e->dest; + +@@ -6659,11 +6790,19 @@ ipa_struct_reorg::decompress_candidate_without_check (gimple_stmt_iterator *gsi, + srtype *type = get_compression_candidate_type (rhs); + gcc_assert (type != NULL); + gsi_prev (gsi); +- tree base = TREE_OPERAND (TREE_OPERAND (new_rhs, 0), 0); +- tree new_mem_ref = build_simple_mem_ref (base); +- tree new_ref = build3 (COMPONENT_REF, TREE_TYPE (new_rhs), +- new_mem_ref, TREE_OPERAND (new_rhs, 1), +- NULL_TREE); ++ tree new_ref = NULL_TREE; ++ if (TREE_CODE (new_rhs) == MEM_REF) ++ new_ref = new_rhs; ++ else ++ { ++ tree base = TREE_OPERAND (TREE_OPERAND (new_rhs, 0), 0); ++ tree new_mem_ref = build_simple_mem_ref (base); ++ new_ref = build3 (COMPONENT_REF, ++ TREE_TYPE (new_rhs), ++ new_mem_ref, ++ TREE_OPERAND (new_rhs, 1), ++ NULL_TREE); ++ } + new_rhs = decompress_offset_to_ptr (new_ref, type, gsi); + processed = true; + gsi_next (gsi); +@@ -6723,8 +6862,8 @@ ipa_struct_reorg::decompress_candidate_with_check (gimple_stmt_iterator *gsi, + gsi_insert_before (gsi, cond, GSI_SAME_STMT); + + /* Split bb. 
*/ +- gimple* cur_stmt = as_a (cond); +- edge e = split_block (cur_stmt->bb, cur_stmt); ++ gimple* curr_stmt = as_a (cond); ++ edge e = split_block (curr_stmt->bb, curr_stmt); + basic_block split_src_bb = e->src; + basic_block split_dst_bb = e->dest; + +@@ -6835,6 +6974,443 @@ ipa_struct_reorg::try_rewrite_with_pointer_compression (gassign *stmt, + } + } + ++/* Given two relayout */ ++tree ++ipa_struct_reorg::rewrite_pointer_diff (gimple_stmt_iterator *gsi, tree ptr1, ++ tree ptr2, srtype *type) ++{ ++ tree align_bit = build_int_cst (long_integer_type_node, RELAYOUT_ALIGN); ++ tree pointer_type = build_pointer_type (unsigned_char_type_node); ++ ++ // addr_high_1 = (intptr_t)ptr1 >> align_bit ++ tree ptr1_cvt = fold_convert (pointer_type, ptr1); ++ tree addr_high_1 = gimplify_build2 (gsi, RSHIFT_EXPR, pointer_type, ++ ptr1_cvt, align_bit); ++ ++ // addr_high_2 = (intptr_t)ptr2 >> align_bit ++ tree ptr2_cvt = fold_convert (pointer_type, ptr2); ++ tree addr_high_2 = gimplify_build2 (gsi, RSHIFT_EXPR, pointer_type, ++ ptr2_cvt, align_bit); ++ ++ // off1 = (intptr_t)ptr1 - (addr_high_1 << align_bit) ++ tree bucket_start_1 = gimplify_build2 (gsi, LSHIFT_EXPR, pointer_type, ++ addr_high_1, align_bit); ++ tree off1 = gimplify_build2 (gsi, MINUS_EXPR, long_integer_type_node, ++ ptr1_cvt, bucket_start_1); ++ ++ // off2 = (intptr_t)ptr2 - (addr_high_2 << align_bit) ++ tree bucket_start_2 = gimplify_build2 (gsi, LSHIFT_EXPR, pointer_type, ++ addr_high_2, align_bit); ++ tree off2 = gimplify_build2 (gsi, MINUS_EXPR, long_integer_type_node, ++ ptr2_cvt, bucket_start_2); ++ ++ // group_diff = (addr_high_1 - addr_high_2) / bucket_parts ++ tree bucket_sub = gimplify_build2 (gsi, MINUS_EXPR, long_integer_type_node, ++ addr_high_1, addr_high_2); ++ tree bucket_parts = build_int_cst (long_integer_type_node, ++ type->bucket_parts); ++ tree group_diff = gimplify_build2 (gsi, TRUNC_DIV_EXPR, ++ long_integer_type_node, ++ bucket_sub, bucket_parts); ++ ++ // off_addr_diff = off1 - off2 ++ tree off_addr_diff = gimplify_build2 (gsi, MINUS_EXPR, long_integer_type_node, ++ off1, off2); ++ ++ // res = group_diff * bucket_capacity + off_diff / 8 ++ tree capacity = build_int_cst (long_integer_type_node, RELAYOUT_PART_SIZE/8); // FIXME: RELAYOUT_PART_SIZE/8魔鬼数字 ++ tree unit_size = build_int_cst (long_integer_type_node, 8); // FIXME: 8魔鬼数字 ++ tree bucket_index_diff = gimplify_build2 (gsi, MULT_EXPR, ++ long_integer_type_node, ++ group_diff, capacity); ++ tree off_index = gimplify_build2 (gsi, TRUNC_DIV_EXPR, ++ long_integer_type_node, ++ off_addr_diff, unit_size); ++ tree res = gimplify_build2 (gsi, PLUS_EXPR, long_unsigned_type_node, ++ bucket_index_diff, off_index); ++ ++ return res; ++} ++ ++basic_block ++create_bb_for_group_diff_eq_0 (basic_block last_bb, tree phi, tree new_granule) ++{ ++ basic_block new_bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (new_bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ ++ /* Emit res = new_granule; */ ++ gimple_stmt_iterator gsi = gsi_last_bb (new_bb); ++ ++ gimple *new_stmt = gimple_build_assign (phi, new_granule); ++ gsi_insert_after (&gsi, new_stmt, GSI_NEW_STMT); ++ ++ return new_bb; ++} ++ ++basic_block ++create_bb_for_group_diff_ne_0 (basic_block new_bb, tree &phi, tree ptr, ++ tree group_diff, tree off_times_8, srtype *type) ++{ ++ tree pointer_type = build_pointer_type (unsigned_char_type_node); ++ tree num_13 = build_int_cst (long_unsigned_type_node, RELAYOUT_ALIGN); // FIXME: 变量名以及魔鬼数字 ++ ++ 
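The pointer-difference rewrite above, written as plain C. RELAYOUT_ALIGN and RELAYOUT_PART_SIZE are defined elsewhere in the pass; the placeholder values below (suggested by the num_13 name and by capacity = RELAYOUT_PART_SIZE / 8) exist only so the sketch compiles, and the sketch assumes one part covers 1 << RELAYOUT_ALIGN bytes addressed in 8-byte units:

  #include <stdint.h>

  #define RELAYOUT_ALIGN      13                      /* placeholder  */
  #define RELAYOUT_PART_SIZE  (1 << RELAYOUT_ALIGN)   /* placeholder  */

  static long
  semi_relayout_ptr_diff (intptr_t ptr1, intptr_t ptr2, long bucket_parts)
  {
    intptr_t addr_high_1 = ptr1 >> RELAYOUT_ALIGN;           /* part id of ptr1     */
    intptr_t addr_high_2 = ptr2 >> RELAYOUT_ALIGN;
    intptr_t off1 = ptr1 - (addr_high_1 << RELAYOUT_ALIGN);  /* offset inside part  */
    intptr_t off2 = ptr2 - (addr_high_2 << RELAYOUT_ALIGN);
    long group_diff = (addr_high_1 - addr_high_2) / bucket_parts;
    return group_diff * (RELAYOUT_PART_SIZE / 8)             /* bucket capacity     */
           + (off1 - off2) / 8;
  }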
gimple_stmt_iterator gsi = gsi_last_bb (new_bb); ++ gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT); ++ ++ /* Emit intptr_t curr_group_start = (ptr >> RELAYOUT_ALIGN) << RELAYOUT_ALIGN; */ ++ tree ptr_r_1 = gimplify_build2 (&gsi, RSHIFT_EXPR, pointer_type, ++ ptr, num_13); ++ tree curr_group_start = gimplify_build2 (&gsi, LSHIFT_EXPR, pointer_type, ++ ptr_r_1, num_13); ++ ++ /* Emit intptr_t curr_off_from_group = ptr - curr_group_start; */ ++ tree curr_off_from_group = gimplify_build2 (&gsi, MINUS_EXPR, ++ long_integer_type_node, ++ ptr, curr_group_start); ++ ++ /* Emit: ++ res = curr_group_start + ((group_diff * parts) << RELAYOUT_ALIGN) ++ + ((curr_off_from_group + off_times_8) % RELAYOUT_PART_SIZE); */ ++ tree step1 = gimplify_build2 (&gsi, MULT_EXPR, long_unsigned_type_node, ++ group_diff, build_int_cst ( ++ long_unsigned_type_node, type->bucket_parts)); ++ tree step2 = gimplify_build2 (&gsi, LSHIFT_EXPR, long_unsigned_type_node, ++ step1, num_13); ++ tree step3 = gimplify_build2 (&gsi, PLUS_EXPR, long_unsigned_type_node, ++ curr_off_from_group, off_times_8); ++ tree step4 = gimplify_build2 (&gsi, TRUNC_MOD_EXPR, long_unsigned_type_node, ++ step3, build_int_cst ( ++ long_integer_type_node, RELAYOUT_PART_SIZE)); ++ tree step5 = gimplify_build2 (&gsi, PLUS_EXPR, long_unsigned_type_node, ++ step2, step4); ++ tree res_phi1 = gimplify_build2 (&gsi, POINTER_PLUS_EXPR, pointer_type, ++ curr_group_start, step5); ++ ++ /* Emit if (group_diff < 0) */ ++ gcond *cond = gimple_build_cond (LT_EXPR, group_diff, ++ build_int_cst (long_integer_type_node, 0), ++ NULL_TREE, NULL_TREE); ++ gsi_insert_before (&gsi, cond, GSI_SAME_STMT); ++ ++ // remove nop ++ gsi_remove (&gsi, true); ++ ++ // res += RELAYOUT_PART_SIZE ++ basic_block true_bb = create_empty_bb (new_bb); ++ if (new_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (true_bb, new_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ gimple_stmt_iterator true_gsi = gsi_last_bb (true_bb); ++ tree res_phi2 = make_ssa_name (pointer_type); ++ gimple *new_stmt = gimple_build_assign (res_phi2, POINTER_PLUS_EXPR, ++ res_phi1, build_int_cst ( ++ long_integer_type_node, RELAYOUT_PART_SIZE)); ++ gsi_insert_after (&true_gsi, new_stmt, GSI_NEW_STMT); ++ ++ // create phi bb ++ basic_block res_bb = create_empty_bb (true_bb); ++ if (new_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (res_bb, new_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ ++ // rebuild cfg ++ edge etrue = make_edge (new_bb, true_bb, EDGE_TRUE_VALUE); ++ etrue->probability = profile_probability::unlikely (); ++ true_bb->count = etrue->count (); ++ ++ edge efalse = make_edge (new_bb, res_bb, EDGE_FALSE_VALUE); ++ efalse->probability = profile_probability::likely (); ++ res_bb->count = efalse->count (); ++ ++ edge efall = make_single_succ_edge (true_bb, res_bb, EDGE_FALLTHRU); ++ ++ phi = make_ssa_name (pointer_type); ++ gphi *phi_node = create_phi_node (phi, res_bb); ++ add_phi_arg (phi_node, res_phi2, efall, UNKNOWN_LOCATION); ++ add_phi_arg (phi_node, res_phi1, efalse, UNKNOWN_LOCATION); ++ ++ if (dom_info_available_p (CDI_DOMINATORS)) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, true_bb, new_bb); ++ set_immediate_dominator (CDI_DOMINATORS, res_bb, new_bb); ++ } ++ return res_bb; ++} ++ ++tree ++ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt, ++ gimple_stmt_iterator *gsi, ++ tree ptr, tree off, ++ srtype *type) ++{ ++ gcc_assert (type->semi_relayout); ++ tree pointer_type = build_pointer_type (unsigned_char_type_node); ++ tree num_8 = 
build_int_cst (long_unsigned_type_node, 8); ++ tree num_13 = build_int_cst (long_unsigned_type_node, RELAYOUT_ALIGN); ++ ++ // long off_times_8 = off * 8; ++ tree off_times_8 = gimplify_build2 (gsi, MULT_EXPR, long_unsigned_type_node, ++ off, num_8); ++ ++ // intptr_t new_granule = ptr + off * 8; ++ tree new_granule = gimplify_build2 (gsi, POINTER_PLUS_EXPR, pointer_type, ++ ptr, off_times_8); ++ ++ // intptr_t group_diff = (new_granule >> RELAYOUT_ALIGN) - (ptr >> RELAYOUT_ALIGN); ++ tree group_diff_rhs_1 = gimplify_build2 (gsi, RSHIFT_EXPR, pointer_type, ++ new_granule, num_13); ++ tree group_diff_rhs_2 = gimplify_build2 (gsi, RSHIFT_EXPR, pointer_type, ++ ptr, num_13); ++ tree group_diff = gimplify_build2 (gsi, MINUS_EXPR, long_integer_type_node, ++ group_diff_rhs_1, group_diff_rhs_2); ++ ++ // if (group_diff == 0) ++ gcond *cond = gimple_build_cond (EQ_EXPR, group_diff, ++ build_int_cst (long_integer_type_node, 0), ++ NULL_TREE, NULL_TREE); ++ gimple_set_location (cond, UNKNOWN_LOCATION); ++ gsi_insert_before (gsi, cond, GSI_SAME_STMT); ++ ++ // split orig bb ++ gimple *curr_stmt = as_a (cond); ++ edge e = split_block (curr_stmt->bb, curr_stmt); ++ basic_block split_src_bb = e->src; ++ basic_block split_dst_bb = e->dest; ++ remove_edge_raw (e); ++ ++ // if (group_diff == 0) ++ // res = new_granule; ++ tree res_phi_1 = make_ssa_name (pointer_type); ++ basic_block true_bb = create_bb_for_group_diff_eq_0 (split_src_bb, res_phi_1, ++ new_granule); ++ ++ // else ++ tree res_phi_2 = NULL_TREE; ++ basic_block false_bb = create_empty_bb (split_src_bb); ++ if (split_src_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (false_bb, split_src_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ ++ edge etrue = make_edge (split_src_bb, true_bb, EDGE_TRUE_VALUE); ++ etrue->probability = profile_probability::very_likely (); ++ true_bb->count = etrue->count (); ++ ++ edge efalse = make_edge (split_src_bb, false_bb, EDGE_FALSE_VALUE); ++ efalse->probability = profile_probability::unlikely (); ++ false_bb->count = efalse->count (); ++ ++ basic_block res_bb = create_bb_for_group_diff_ne_0 (false_bb, res_phi_2, ptr, ++ group_diff, off_times_8, ++ type); ++ // rebuild cfg ++ edge e_true_fall = make_single_succ_edge (true_bb, split_dst_bb, ++ EDGE_FALLTHRU); ++ edge e_false_fall = make_single_succ_edge (res_bb, split_dst_bb, ++ EDGE_FALLTHRU); ++ ++ // intptr_t res; ++ tree res = make_ssa_name (pointer_type); ++ gphi *phi_node = create_phi_node (res, split_dst_bb); ++ add_phi_arg (phi_node, res_phi_1, e_true_fall, UNKNOWN_LOCATION); ++ add_phi_arg (phi_node, res_phi_2, e_false_fall, UNKNOWN_LOCATION); ++ ++ if (dom_info_available_p (CDI_DOMINATORS)) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, split_dst_bb, split_src_bb); ++ set_immediate_dominator (CDI_DOMINATORS, true_bb, split_src_bb); ++ set_immediate_dominator (CDI_DOMINATORS, false_bb, split_src_bb); ++ } ++ *gsi = gsi_start_bb (split_dst_bb); ++ return res; ++} ++ ++tree ++ipa_struct_reorg::build_div_expr (gimple_stmt_iterator *gsi, ++ tree expr, tree orig_size) ++{ ++ tree div_expr = build2 (TRUNC_DIV_EXPR, long_unsigned_type_node, ++ expr, orig_size); ++ tree num = make_ssa_name (long_unsigned_type_node); ++ gimple *g = gimple_build_assign (num, div_expr); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ return num; ++} ++ ++srtype * ++ipa_struct_reorg::get_relayout_candidate_type (tree type) ++{ ++ if (type == NULL) ++ { ++ return NULL; ++ } ++ ++ if (TREE_CODE (type) != RECORD_TYPE) ++ return NULL; ++ ++ return find_type (inner_type 
(type)); ++} ++ ++long unsigned int ++ipa_struct_reorg::get_true_field_offset (srfield *field, srtype *type) ++{ ++ unsigned HOST_WIDE_INT new_offset; ++ new_offset = *(type->new_field_offsets.get (field->newfield[0])); ++ ++ return new_offset; ++} ++ ++tree ++ipa_struct_reorg::get_true_pointer_base (gimple_stmt_iterator *gsi, ++ tree mem_ref, srtype *type) ++{ ++ tree ptr = TREE_OPERAND (mem_ref, 0); ++ tree off_bytes = TREE_OPERAND (mem_ref, 1); ++ unsigned num = tree_to_uhwi (off_bytes); ++ if (num == 0) ++ return ptr; ++ ++ tree orig_size = TYPE_SIZE_UNIT (TREE_TYPE (mem_ref)); ++ tree off = build_int_cst (long_unsigned_type_node, ++ num / tree_to_uhwi (orig_size)); ++ gimple *stmt = gsi_stmt (*gsi); ++ tree new_pointer_base = rewrite_pointer_plus_integer (stmt, gsi, ptr, ++ off, type); ++ return new_pointer_base; ++} ++ ++tree ++ipa_struct_reorg::rewrite_address (tree pointer_base, srfield *field, ++ srtype *type, gimple_stmt_iterator *gsi) ++{ ++ unsigned HOST_WIDE_INT field_offset = get_true_field_offset (field, type); ++ ++ tree pointer_ssa = fold_convert (long_unsigned_type_node, pointer_base); ++ tree step1 = gimplify_build1 (gsi, NOP_EXPR, long_unsigned_type_node, ++ pointer_ssa); ++ tree new_offset_ssa = build_int_cst (long_unsigned_type_node, field_offset); ++ tree step2 = gimplify_build2 (gsi, PLUS_EXPR, long_unsigned_type_node, step1, ++ new_offset_ssa); ++ tree field_ssa = fold_convert ( ++ build_pointer_type (TREE_TYPE (field->newfield[0])), step2); ++ tree step3 = gimplify_build1 (gsi, NOP_EXPR, ++ TREE_TYPE (field_ssa), field_ssa); ++ ++ tree new_mem_ref = fold_build2 (MEM_REF, TREE_TYPE (field->newfield[0]), ++ step3, build_int_cst (TREE_TYPE (field_ssa), 0)); ++ return new_mem_ref; ++} ++ ++bool ++ipa_struct_reorg::check_sr_copy (gimple *stmt) ++{ ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ ++ if (TREE_CODE (lhs) != MEM_REF || TREE_CODE (rhs) != MEM_REF) ++ return false; ++ ++ srtype *t1 = get_relayout_candidate_type (TREE_TYPE (lhs)); ++ srtype *t2 = get_relayout_candidate_type (TREE_TYPE (rhs)); ++ if (t1 != t2) ++ return false; ++ ++ tree pointer1 = TREE_OPERAND (lhs, 0); ++ tree pointer2 = TREE_OPERAND (rhs, 0); ++ ++ if (TREE_CODE (TREE_TYPE (pointer1)) != POINTER_TYPE ++ || TREE_CODE (TREE_TYPE (pointer2)) != POINTER_TYPE) ++ return false; ++ ++ tree type1 = TREE_TYPE (TREE_TYPE (pointer1)); ++ tree type2 = TREE_TYPE (TREE_TYPE (pointer2)); ++ ++ srtype *t3 = get_relayout_candidate_type (type1); ++ srtype *t4 = get_relayout_candidate_type (type2); ++ ++ if (t3 != t4 || t3 != t1) ++ return false; ++ ++ return true; ++} ++ ++void ++ipa_struct_reorg::relayout_field_copy (gimple_stmt_iterator *gsi, gimple *stmt, ++ tree lhs, tree rhs, ++ tree &newlhs, tree &newrhs) ++{ ++ srtype *type = get_relayout_candidate_type (TREE_TYPE (lhs)); ++ tree lhs_base_pointer = get_true_pointer_base (gsi, newlhs, type); ++ tree rhs_base_pointer = get_true_pointer_base (gsi, newrhs, type); ++ tree new_l_mem_ref = NULL_TREE; ++ tree new_r_mem_ref = NULL_TREE; ++ srfield *field = NULL; ++ unsigned i = 0; ++ FOR_EACH_VEC_ELT (type->fields, i, field) ++ { ++ if (!field->newfield[0]) ++ continue; ++ new_l_mem_ref = rewrite_address (lhs_base_pointer, field, type, gsi); ++ new_r_mem_ref = rewrite_address (rhs_base_pointer, field, type, gsi); ++ gimple *new_stmt = gimple_build_assign (new_l_mem_ref, new_r_mem_ref); ++ gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); ++ } ++ newlhs = new_l_mem_ref; ++ newrhs = new_r_mem_ref; ++} ++ ++void 
++ipa_struct_reorg::do_semi_relayout (gimple_stmt_iterator *gsi, gimple *stmt, ++ tree &newlhs, tree &newrhs) ++{ ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ ++ bool l = TREE_CODE (lhs) == COMPONENT_REF ? is_semi_relayout_candidate (lhs) ++ : false; ++ bool r = TREE_CODE (rhs) == COMPONENT_REF ? is_semi_relayout_candidate (rhs) ++ : false; ++ ++ gcc_assert (!(l && r)); ++ ++ if (!l && !r) ++ { ++ if (check_sr_copy (stmt)) ++ { ++ relayout_field_copy (gsi, stmt, lhs, rhs, newlhs, newrhs); ++ } ++ } ++ else if (l) ++ { ++ srtype *type = get_relayout_candidate_type ( ++ TREE_TYPE (TREE_OPERAND (lhs, 0))); ++ srfield *new_field = type->find_field ( ++ int_byte_position (TREE_OPERAND (lhs, 1))); ++ tree pointer_base = get_true_pointer_base ( ++ gsi, TREE_OPERAND (newlhs, 0), type); ++ newlhs = rewrite_address (pointer_base, new_field, type, gsi); ++ } ++ else if (r) ++ { ++ srtype *type = get_relayout_candidate_type ( ++ TREE_TYPE (TREE_OPERAND (rhs, 0))); ++ srfield *new_field = type->find_field ( ++ int_byte_position (TREE_OPERAND (rhs, 1))); ++ tree pointer_base = get_true_pointer_base ( ++ gsi, TREE_OPERAND (newrhs, 0), type); ++ newrhs = rewrite_address (pointer_base, new_field, type, gsi); ++ } ++} ++ + bool + ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + { +@@ -6931,7 +7507,10 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + tree num; + /* Check if rhs2 is a multiplication of the size of the type. */ + if (!is_result_of_mult (rhs2, &num, size)) +- internal_error ("the rhs of pointer was not a multiplicate and it slipped through."); ++ { ++ ; ++ // internal_error ("the rhs of pointer was not a multiplicate and it slipped through."); ++ } + + /* Add the judgment of num, support for POINTER_DIFF_EXPR. + _6 = _4 + _5; +@@ -6951,11 +7530,35 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + tree newsize = TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (newlhs[i]))); + newsize = gimplify_build2 (gsi, MULT_EXPR, sizetype, num, + newsize); ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT) ++ { ++ if (is_semi_relayout_candidate (lhs)) ++ { ++ srtype *type = get_semi_relayout_candidate_type (lhs); ++ newrhs[i] = rewrite_pointer_plus_integer (stmt, gsi, ++ newrhs[i], num, type); ++ newsize = build_int_cst (long_unsigned_type_node, 0); ++ } ++ } + new_stmt = gimple_build_assign (newlhs[i], POINTER_PLUS_EXPR, + newrhs[i], newsize); + } + else + { ++ // rhs2 is not a const integer ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT) ++ { ++ if (is_semi_relayout_candidate (lhs)) ++ { ++ // FIXME: delete build_div_expr ++ num = build_div_expr (gsi, rhs2, ++ build_int_cst (long_unsigned_type_node, 1)); ++ srtype *type = get_semi_relayout_candidate_type (lhs); ++ newrhs[i] = rewrite_pointer_plus_integer (stmt, ++ gsi, newrhs[i], num, type); ++ rhs2 = build_int_cst (long_unsigned_type_node, 0); ++ } ++ } + new_stmt = gimple_build_assign (newlhs[i], POINTER_PLUS_EXPR, + newrhs[i], rhs2); + } +@@ -7005,13 +7608,37 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + return false; + + /* The two operands always have pointer/reference type. 
*/ +- for (unsigned i = 0; i < max_split && newrhs1[i] && newrhs2[i]; i++) ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT) + { +- gimple_assign_set_rhs1 (stmt, newrhs1[i]); +- gimple_assign_set_rhs2 (stmt, newrhs2[i]); +- update_stmt (stmt); ++ if (is_semi_relayout_candidate (rhs1) ++ || is_semi_relayout_candidate (rhs2)) ++ { ++ for (unsigned i = 0; ++ i < max_split && newrhs1[i] &&newrhs2[i]; i++) ++ { ++ srtype *type = get_semi_relayout_candidate_type (rhs1); ++ if (!type) ++ type = get_semi_relayout_candidate_type (rhs2); ++ gcc_assert (type != NULL); ++ tree res = rewrite_pointer_diff (gsi, newrhs1[i], ++ newrhs2[i], type); ++ gimple *g = gimple_build_assign (gimple_assign_lhs (stmt), ++ res); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ } ++ remove = true; ++ } ++ } ++ else ++ { ++ for (unsigned i = 0; i < max_split && newrhs1[i] && newrhs2[i]; i++) ++ { ++ gimple_assign_set_rhs1 (stmt, newrhs1[i]); ++ gimple_assign_set_rhs2 (stmt, newrhs2[i]); ++ update_stmt (stmt); ++ } ++ remove = false; + } +- remove = false; + return remove; + } + +@@ -7038,6 +7665,11 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + fprintf (dump_file, "replaced with:\n"); + for (unsigned i = 0; i < max_split && (newlhs[i] || newrhs[i]); i++) + { ++ if (current_mode == STRUCT_LAYOUT_OPTIMIZE ++ && struct_layout_optimize_level >= SEMI_RELAYOUT) ++ { ++ do_semi_relayout (gsi, stmt, newlhs[i], newrhs[i]); ++ } + if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && struct_layout_optimize_level >= POINTER_COMPRESSION) + { +@@ -7059,6 +7691,107 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) + return remove; + } + ++tree ++ipa_struct_reorg::get_real_allocated_ptr (tree ptr, gimple_stmt_iterator *gsi) ++{ ++ tree ptr_to_int = fold_convert (long_unsigned_type_node, ptr); ++ tree align = build_int_cst (long_unsigned_type_node, RELAYOUT_PART_SIZE); // FIXME ++ tree real_addr = gimplify_build2 (gsi, MINUS_EXPR, long_unsigned_type_node, ++ ptr_to_int, align); ++ tree res = gimplify_build1 (gsi, NOP_EXPR, ++ build_pointer_type (long_unsigned_type_node), real_addr); ++ return res; ++} ++ ++tree ++ipa_struct_reorg::set_ptr_for_use (tree ptr, gimple_stmt_iterator *gsi) ++{ ++ tree ptr_to_int = fold_convert (long_unsigned_type_node, ptr); ++ tree align = build_int_cst (long_unsigned_type_node, RELAYOUT_PART_SIZE); // FIXME ++ tree ptr_int = gimplify_build2 (gsi, PLUS_EXPR, long_unsigned_type_node, ++ ptr_to_int, align); ++ tree res = gimplify_build1 (gsi, NOP_EXPR, ++ build_pointer_type (long_unsigned_type_node), ptr_int); ++ return res; ++} ++ ++void ++ipa_struct_reorg::record_allocated_size (tree ptr, gimple_stmt_iterator *gsi, ++ tree size) ++{ ++ tree to_type = build_pointer_type (long_unsigned_type_node); ++ tree type_cast = fold_convert (to_type, ptr); ++ tree lhs = fold_build2 (MEM_REF, long_unsigned_type_node, ptr, ++ build_int_cst (build_pointer_type (long_unsigned_type_node), 0)); ++ gimple *stmt = gimple_build_assign (lhs, size); ++ gsi_insert_before (gsi, stmt, GSI_SAME_STMT); ++} ++ ++tree ++ipa_struct_reorg::read_allocated_size (tree ptr, gimple_stmt_iterator *gsi) ++{ ++ tree to_type = build_pointer_type (long_unsigned_type_node); ++ tree off = build_int_cst (to_type, 0); // FIXME: confirm type ++ tree size = gimplify_build2 (gsi, MEM_REF, long_unsigned_type_node, ++ ptr, off); ++ return size; ++} ++ ++gimple * ++ipa_struct_reorg::create_aligned_alloc (gimple_stmt_iterator *gsi, ++ srtype *type, tree num, tree &size) ++{ ++ tree fn = 
builtin_decl_implicit (BUILT_IN_ALIGNED_ALLOC); ++ ++ tree align = build_int_cst (long_unsigned_type_node, RELAYOUT_PART_SIZE); // FIXME ++ unsigned bucket_size = type->bucket_size; ++ ++ tree nbuckets = gimplify_build2 (gsi, CEIL_DIV_EXPR, long_unsigned_type_node, ++ num, build_int_cst (long_unsigned_type_node, RELAYOUT_PART_SIZE/8)); ++ tree use_size = gimplify_build2 (gsi, MULT_EXPR, long_unsigned_type_node, ++ nbuckets, build_int_cst ( ++ long_unsigned_type_node, bucket_size)); ++ size = gimplify_build2 (gsi, PLUS_EXPR, long_unsigned_type_node, ++ use_size, align); ++ gimple *g = gimple_build_call (fn, 2, align, size); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ return g; ++} ++ ++void ++ipa_struct_reorg::create_memset_zero (tree ptr, gimple_stmt_iterator *gsi, ++ tree size) ++{ ++ tree fn = builtin_decl_implicit (BUILT_IN_MEMSET); ++ tree val = build_int_cst (long_unsigned_type_node, 0); ++ gimple *g = gimple_build_call (fn, 3, ptr, val, size); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++} ++ ++void ++ipa_struct_reorg::create_memcpy (tree src, tree dst, tree size, ++ gimple_stmt_iterator *gsi) ++{ ++ tree fn = builtin_decl_implicit (BUILT_IN_MEMCPY); ++ gimple *g = gimple_build_call (fn, 3, dst, src, size); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++} ++ ++void ++ipa_struct_reorg::create_free (tree ptr, gimple_stmt_iterator *gsi) ++{ ++ tree fn = builtin_decl_implicit (BUILT_IN_FREE); ++ gimple *g = gimple_build_call (fn, 1, ptr); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++} ++ ++void ++ipa_struct_reorg::copy_to_lhs (tree lhs, tree new_lhs, gimple_stmt_iterator *gsi) ++{ ++ gimple *g = gimple_build_assign (lhs, new_lhs); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++} ++ + /* Rewrite function call statement STMT. Return TRUE if the statement + is to be removed. */ + +@@ -7100,24 +7833,78 @@ ipa_struct_reorg::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi) + ? TYPE_SIZE_UNIT (decl->orig_type) + : TYPE_SIZE_UNIT (type->newtype[i]); + gimple *g; +- /* Every allocation except for calloc needs the size multiplied out. 
*/ +- if (!gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) +- newsize = gimplify_build2 (gsi, MULT_EXPR, sizetype, num, newsize); +- +- if (gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) +- || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA)) +- g = gimple_build_call (gimple_call_fndecl (stmt), +- 1, newsize); +- else if (gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) +- g = gimple_build_call (gimple_call_fndecl (stmt), +- 2, num, newsize); +- else if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) +- g = gimple_build_call (gimple_call_fndecl (stmt), +- 2, newrhs1[i], newsize); +- else +- gcc_assert (false); +- gimple_call_set_lhs (g, decl->newdecl[i]); +- gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ bool rewrite = false; ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT ++ && type->semi_relayout) ++ { ++ if (gimple_call_builtin_p (stmt, BUILT_IN_MALLOC)) ++ { ++ ; // FIXME ++ } ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ { ++ tree rhs2 = gimple_call_arg (stmt, 1); ++ if (tree_to_uhwi (rhs2) == tree_to_uhwi ( ++ TYPE_SIZE_UNIT (type->type))) ++ { ++ rewrite = true; ++ tree size = NULL_TREE; ++ g = create_aligned_alloc (gsi, type, num, size); ++ tree real_ptr = make_ssa_name ( ++ build_pointer_type (unsigned_char_type_node)); ++ gimple_set_lhs (g, real_ptr); ++ create_memset_zero (real_ptr, gsi, size); ++ record_allocated_size (real_ptr, gsi, size); ++ tree lhs_use = set_ptr_for_use (real_ptr, gsi); ++ copy_to_lhs (decl->newdecl[i], lhs_use, gsi); // FIXME ++ } ++ } ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) ++ { ++ rewrite = true; ++ tree size = NULL_TREE; ++ g = create_aligned_alloc (gsi, type, num, size); ++ tree real_ptr = make_ssa_name ( ++ build_pointer_type (unsigned_char_type_node)); ++ gimple_set_lhs (g, real_ptr); ++ create_memset_zero (real_ptr, gsi, size); ++ tree src = get_real_allocated_ptr (newrhs1[i], gsi); ++ tree old_size = read_allocated_size (src, gsi); ++ create_memcpy (src, real_ptr, old_size, gsi); ++ record_allocated_size (real_ptr, gsi, size); ++ tree lhs_use = set_ptr_for_use (real_ptr, gsi); ++ create_free (src, gsi); ++ copy_to_lhs (decl->newdecl[i], lhs_use, gsi); ++ } ++ else ++ { ++ gcc_assert (false); ++ internal_error ("alloca is not supported when layout >= 4"); ++ } ++ } ++ if (!rewrite ++ && (struct_layout_optimize_level >= STRUCT_REORDER_FIELDS ++ || current_mode == NORMAL)) ++ { ++ /* Every allocation except for calloc needs the size multiplied out. 
*/ ++ if (!gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ newsize = gimplify_build2 (gsi, MULT_EXPR, sizetype, num, newsize); ++ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA)) ++ g = gimple_build_call (gimple_call_fndecl (stmt), ++ 1, newsize); ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ g = gimple_build_call (gimple_call_fndecl (stmt), ++ 2, num, newsize); ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) ++ g = gimple_build_call (gimple_call_fndecl (stmt), ++ 2, newrhs1[i], newsize); ++ else ++ gcc_assert (false); ++ gimple_call_set_lhs (g, decl->newdecl[i]); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ } + if (current_mode == STRUCT_LAYOUT_OPTIMIZE + && struct_layout_optimize_level >= POINTER_COMPRESSION + && type->pc_candidate) +@@ -7139,8 +7926,15 @@ ipa_struct_reorg::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi) + if (!rewrite_expr (expr, newexpr)) + return false; + ++ srtype *t = find_type (TREE_TYPE (TREE_TYPE (expr))); ++ + if (newexpr[1] == NULL) + { ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT && t ++ && t->semi_relayout) ++ { ++ newexpr[0] = get_real_allocated_ptr (newexpr[0], gsi); ++ } + gimple_call_set_arg (stmt, 0, newexpr[0]); + update_stmt (stmt); + return false; +@@ -7593,7 +8387,7 @@ ipa_struct_reorg::rewrite_functions (void) + current_function = f; + + if (current_mode == STRUCT_LAYOUT_OPTIMIZE +- && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ && struct_layout_optimize_level >= SEMI_RELAYOUT) + { + calculate_dominance_info (CDI_DOMINATORS); + loop_optimizer_init (0); +@@ -7675,7 +8469,7 @@ ipa_struct_reorg::rewrite_functions (void) + free_dominance_info (CDI_DOMINATORS); + + if (current_mode == STRUCT_LAYOUT_OPTIMIZE +- && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ && struct_layout_optimize_level >= SEMI_RELAYOUT) + { + loop_optimizer_finalize (); + } +@@ -7835,7 +8629,7 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void) + type->pc_candidate = true; + if (dump_file) + { +- fprintf (dump_file, " attemps to do pointer compression.\n"); ++ fprintf (dump_file, " attempts to do pointer compression.\n"); + } + } + +@@ -7879,6 +8673,84 @@ init_pointer_size_for_pointer_compression (void) + } + } + ++void ++ipa_struct_reorg::check_and_prune_struct_for_semi_relayout (void) ++{ ++ unsigned relayout_transform = 0; ++ for (unsigned i = 0; i < types.length (); i++) ++ { ++ srtype *type = types[i]; ++ if (dump_file) ++ { ++ print_generic_expr (dump_file, type->type); ++ } ++ if (type->has_escaped ()) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has escaped by %s, skip relayout.\n", ++ type->escape_reason ()); ++ } ++ continue; ++ } ++ if (TYPE_FIELDS (type->type) == NULL) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has zero field, skip relayout.\n"); ++ } ++ continue; ++ } ++ if (type->chain_type) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " is chain_type, skip relayout.\n"); ++ } ++ continue; ++ } ++ if (type->has_alloc_array == 0 || type->has_alloc_array == 1 ++ || type->has_alloc_array == -1) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has alloc number: %d, skip relayout.\n", ++ type->has_alloc_array); ++ } ++ continue; ++ } ++ if (get_type_name (type->type) == NULL) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, " has empty struct name," ++ " skip relayout.\n"); ++ } ++ continue; ++ } ++ relayout_transform++; ++ type->semi_relayout = true; ++ if (dump_file) ++ { ++ fprintf (dump_file, " 
attempts to do semi-relayout.\n"); ++ } ++ } ++ ++ if (dump_file) ++ { ++ if (relayout_transform) ++ { ++ fprintf (dump_file, "\nNumber of structures to transform in " ++ "semi-relayout is %d\n", relayout_transform); ++ } ++ else ++ { ++ fprintf (dump_file, "\nNo structures to transform in " ++ "semi-relayout.\n"); ++ } ++ } ++} ++ + unsigned int + ipa_struct_reorg::execute (enum srmode mode) + { +@@ -7899,10 +8771,16 @@ ipa_struct_reorg::execute (enum srmode mode) + { + analyze_types (); + } +- if (mode == STRUCT_LAYOUT_OPTIMIZE +- && struct_layout_optimize_level >= POINTER_COMPRESSION) ++ if (mode == STRUCT_LAYOUT_OPTIMIZE) + { +- check_and_prune_struct_for_pointer_compression (); ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT) ++ { ++ check_and_prune_struct_for_semi_relayout (); ++ } ++ if (struct_layout_optimize_level >= POINTER_COMPRESSION) ++ { ++ check_and_prune_struct_for_pointer_compression (); ++ } + } + ret = rewrite_functions (); + } +@@ -8001,6 +8879,11 @@ public: + virtual unsigned int execute (function *) + { + unsigned int ret = 0; ++ if (struct_layout_optimize_level >= SEMI_RELAYOUT) ++ { ++ RELAYOUT_ALIGN = param_relayout_bucket_num; ++ RELAYOUT_PART_SIZE = (1 << RELAYOUT_ALIGN); ++ } + if (struct_layout_optimize_level >= POINTER_COMPRESSION) + { + init_pointer_size_for_pointer_compression (); +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +index 357604d11..4382c8b5e 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +@@ -24,6 +24,8 @@ along with GCC; see the file COPYING3. If not see + namespace struct_reorg { + + const int max_split = 2; ++unsigned RELAYOUT_ALIGN = 15; ++unsigned RELAYOUT_PART_SIZE = (1 << RELAYOUT_ALIGN); + + template + struct auto_vec_del : auto_vec +@@ -126,6 +128,10 @@ public: + bool visited; + bool pc_candidate; + int has_alloc_array; ++ bool semi_relayout; ++ hash_map new_field_offsets; ++ unsigned bucket_parts; ++ unsigned bucket_size; + + // Constructors + srtype(tree type); +@@ -147,6 +153,7 @@ public: + bool has_dead_field (void); + void mark_escape (escape_type, gimple *stmt); + void create_global_ptr_for_pointer_compression (); ++ unsigned calculate_bucket_size (); + bool has_escaped (void) + { + return escapes != does_not_escape; +diff --git a/gcc/opts.c b/gcc/opts.c +index c3877c24e..ab4e1bf82 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -2695,6 +2695,11 @@ common_handle_option (struct gcc_options *opts, + } + break; + ++ case OPT_fipa_reorder_fields: ++ opts->x_struct_layout_optimize_level = 2; ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_layout, 2); ++ break; ++ + case OPT_fipa_struct_reorg_: + opts->x_struct_layout_optimize_level = value; + if (value > 1) +diff --git a/gcc/params.opt b/gcc/params.opt +index ca457f584..6b73f2056 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -988,4 +988,7 @@ Threshold functions of cache miss counts to be analyzed in prefetching. + Common Joined UInteger Var(param_pointer_compression_size) Init(3) IntegerRange(1, 3) Param Optimization + Target size of pointer compression. + ++-param=param-relayout-bucket-num= ++Common Joined UInteger Var(param_relayout_bucket_num) Init(15) IntegerRange(1, 20) Param Optimization ++Number of relayout buckets. + ; This comment is to ensure we retain the blank line above. 
+diff --git a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +index 852c37ff3..b7ff25d7e 100644 +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +@@ -85,9 +85,9 @@ gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/complete_struct_relayout + gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/dfe*.c]] \ + "" "-fipa-struct-reorg=3 -fdump-ipa-all -flto-partition=one -fwhole-program" + +-# -fipa-struct-reorg=4 ++# -fipa-struct-reorg=5 + gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pc_*.c]] \ +- "" "-fipa-struct-reorg=4 -fdump-ipa-all-details -flto-partition=one -fwhole-program" ++ "" "-fipa-struct-reorg=5 -fdump-ipa-all-details -flto-partition=one -fwhole-program" + + # All done. + torture-finish +-- +2.27.0 + diff --git a/accelerate-libs.tar.gz b/accelerate-libs.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..17fbb5fe5a503ad811799abf5aaada527d64d281 Binary files /dev/null and b/accelerate-libs.tar.gz differ diff --git a/gcc.spec b/gcc.spec index fe52cad9abe26f1c959ddd5efe6b26e6883a75fa..07d6d989c2954f335385fd07e2f3d535c6d0910b 100644 --- a/gcc.spec +++ b/gcc.spec @@ -61,11 +61,12 @@ Summary: Various compilers (C, C++, Objective-C, ...) Name: gcc Version: %{gcc_version} -Release: 11 +Release: 13 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD URL: https://gcc.gnu.org Source0: https://ftp.gnu.org/gnu/gcc/gcc-10.3.0/gcc-10.3.0.tar.xz +Source1: accelerate-libs.tar.gz %global isl_version 0.16.1 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) @@ -150,6 +151,24 @@ Patch31: 0031-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch Patch32: 0032-Autoprefetch-Prune-invaild-loops-containing-edges-wh.patch Patch33: 0033-AutoFdo-Fix-memory-leaks-in-autofdo-and-autoprefetch.patch Patch34: 0034-Backport-sanitizer-Fix-asan-against-glibc-2.34-PR100.patch +Patch35: 0035-ccmp-Add-another-optimization-opportunity-for-ccmp-i.patch +Patch36: 0036-StructReorg-Refactoring-reorder-fields-to-struct-lay.patch +Patch37: 0037-Backport-loop-invariant-Don-t-move-cold-bb-instructi.patch +Patch38: 0038-DFE-Add-Dead-Field-Elimination-in-Struct-Reorg.patch +Patch39: 0039-Backport-ipa-sra-Fix-thinko-when-overriding-safe_to_.patch +Patch40: 0040-Backport-ifcvt-Allow-constants-for-noce_convert_mult.patch +Patch41: 0041-Backport-Register-sysroot-in-the-driver-switches-tab.patch +Patch42: 0042-DFE-Fix-bugs.patch +Patch43: 0043-Backport-Extend-special_memory_constraint.patch +Patch44: 0044-Backport-ira-Fix-unnecessary-register-spill.patch +Patch45: 0045-Transposed-SLP-Enable-Transposed-SLP.patch +Patch46: 0046-loop-distribution.patch +Patch47: 0047-loop-invariant-Add-option-to-limit-count-check-in-lo.patch +Patch48: 0048-ArrayWidenCompare-Add-a-new-optimization-for-array-c.patch +Patch49: 0049-PointerCompression-Structure-pointer-compression.patch +Patch50: 0050-PointerCompression-Adaptation.patch +Patch51: 0051-DFE-fixme.patch +Patch52: 0052-SemiRelayout-structure-Semi-Relayout.patch %global gcc_target_platform %{_arch}-linux-gnu @@ -587,8 +606,14 @@ This package contains header files and other support files for compiling GCC plugins. The GCC plugin ABI is currently not stable, so plugins must be rebuilt any time GCC is updated. 
+%package -n accelerate-libs +Summary: GCC acceleration runtime libs + +%description -n accelerate-libs +This package includes jemalloc, mathlib and stringlib + %prep -%setup -q -n gcc-10.3.0 +%setup -q -n gcc-10.3.0 -a 1 /bin/pwd %patch1 -p1 @@ -625,6 +650,24 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch32 -p1 %patch33 -p1 %patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch45 -p1 +%patch46 -p1 +%patch47 -p1 +%patch48 -p1 +%patch49 -p1 +%patch50 -p1 +%patch51 -p1 +%patch52 -p1 %build @@ -687,6 +730,7 @@ CC="$CC" CFLAGS="$OPT_FLAGS" \ CXXFLAGS_FOR_TARGET="$OPT_FLAGS" \ XCFLAGS="$OPT_FLAGS" TCFLAGS="$OPT_FLAGS" GCJFLAGS="$OPT_FLAGS" \ ../configure --prefix=%{_prefix} --mandir=%{_mandir} --infodir=%{_infodir} \ + --with-pkgversion='openEuler GCC for POC Test' \ --enable-shared --enable-threads=posix --enable-checking=release \ --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions \ --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu \ @@ -813,6 +857,10 @@ make prefix=%{buildroot}%{_prefix} mandir=%{buildroot}%{_mandir} \ chmod 644 %{buildroot}%{_infodir}/gnat* %endif +%ifarch aarch64 +cp -rpd %{_builddir}/gcc-10.3.0/accelerate-libs/* $RPM_BUILD_ROOT%{_prefix}/ +%endif + FULLPATH=%{buildroot}%{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major} FULLEPATH=%{buildroot}%{_prefix}/libexec/gcc/%{gcc_target_platform}/%{gcc_major} @@ -2590,7 +2638,36 @@ end %{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/plugin/libcp1plugin.so* %doc rpm.doc/changelogs/libcc1/ChangeLog* +%ifarch aarch64 +%files -n accelerate-libs +%dir %{_prefix}/include/jemalloc +%{_prefix}/include/jemalloc/jemalloc.h +%{_prefix}/lib64/libjemalloc* +%dir %{_prefix}/lib64/pkgconfig +%{_prefix}/lib64/pkgconfig/jemalloc.pc +%dir %{_prefix}/lib64/libhpc +%{_prefix}/lib64/libhpc/libjemalloc* +%dir %{_prefix}/lib64/libhpc/pkgconfig +%{_prefix}/lib64/libhpc/pkgconfig/jemalloc.pc +%{_prefix}/include/mathlib.h +%{_prefix}/lib64/libmathlib* +%{_prefix}/include/stringlib.h +%{_prefix}/lib64/libstringlib* +%endif + %changelog +* Wed Jun 29 2022 dingguangya - 10.3.1-13 +- Type:POC test +- ID:NA +- SUG:NA +- DESC:Add eight poc optimization patches and acceleration runtime libraries + +* Wed Jun 29 2022 dingguangya - 10.3.1-12 +- Type:Sync +- ID:NA +- SUG:NA +- DESC:Sync patch from openeuler/gcc + * Fri May 6 2022 liyancheng <412998149@qq.com> - 10.3.1-11 - Type:Sync - ID:NA