From db8a90ff34c062bc0eb5337ee08833aebf8ef2c4 Mon Sep 17 00:00:00 2001 From: jdkboy Date: Sat, 29 Aug 2020 09:39:46 +0800 Subject: [PATCH] Add several enhancement patches - Add add-checks-to-avoid-spoiling-if-conversion.patch - Add add-option-fallow-store-data-races.patch - Add complete-struct-reorg.patch - Add cse-in-vectorization.patch - Add enable-simd-math.patch - Add fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch - Add fix-ICE-in-compute_live_loop_exits.patch - Add fix-ICE-in-copy_reference_ops_from_ref.patch - Add fix-ICE-in-declare-return-variable.patch - Add fix-ICE-in-exact_div.patch - Add fix-ICE-in-gimple_op.patch - Add fix-ICE-in-model_update_limit_points_in_group.patch - Add fix-ICE-in-reload.patch - Add fix-ICE-in-store_constructor.patch - Add fix-ICE-in-vec.patch - Add fix-ICE-in-vect_create_epilog_for_reduction.patch - Add fix-ICE-in-vect_create_epilog_for_reduction_2.patch - Add fix-ICE-in-vect_create_epilog_for_reduction_3.patch - Add fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch - Add fix-ICE-in-vect_slp_analyze_node_operations.patch - Add fix-ICE-in-vect_stmt_to_vectorize.patch - Add fix-ICE-in-vect_transform_stmt.patch - Add fix-ICE-in-vectorizable_condition.patch - Add fix-ICE-in-verify_ssa.patch - Add fix-ICE-statement-uses-released-SSA-name.patch - Add fix-ICE-when-vectorizing-nested-cycles.patch - Add fix-SSA-update-for-vectorizer-epilogue.patch - Add fix-do-not-build-op.patch - Add fix-load-eliding-in-SM.patch - Add fix-wrong-vectorizer-code.patch - Add generate-csel-for-arrayref.patch - Add ipa-const-prop-self-recursion-bugfix.patch - Add ipa-const-prop.patch - Add ipa-struct-reorg-bugfix.patch - Add ipa-struct-reorg.patch - Add medium-code-mode.patch - Add reduction-chain-slp-option.patch - Add reductions-slp-enhancement.patch - Add 
simplify-removing-subregs.patch - Add tighten-range-for-generating-csel.patch - Add vectorization-enhancement.patch --- ...ecks-to-avoid-spoiling-if-conversion.patch | 86 + add-option-fallow-store-data-races.patch | 298 + ...calculation-optimization-within-loop.patch | 10 +- change-gcc-BASE-VER.patch | 28 +- complete-struct-reorg.patch | 1814 ++ cse-in-vectorization.patch | 68 + delete-incorrect-smw.patch | 6 + enable-simd-math.patch | 34 + ...d-issueing-loads-in-SM-when-possible.patch | 123 + fix-ICE-during-pass-ccp.patch | 6 + fix-ICE-in-compute_live_loop_exits.patch | 76 + fix-ICE-in-copy_reference_ops_from_ref.patch | 70 + fix-ICE-in-declare-return-variable.patch | 31 + fix-ICE-in-exact_div.patch | 54 + fix-ICE-in-gimple_op.patch | 65 + ...n-model_update_limit_points_in_group.patch | 248 + fix-ICE-in-reload.patch | 369 + fix-ICE-in-store_constructor.patch | 356 + fix-ICE-in-vec.patch | 93 + ...-in-vect_create_epilog_for_reduction.patch | 81 + ...n-vect_create_epilog_for_reduction_2.patch | 33 + ...n-vect_create_epilog_for_reduction_3.patch | 87 + ...CE-in-vect_get_vec_def_for_stmt_copy.patch | 54 + ...-in-vect_slp_analyze_node_operations.patch | 381 + fix-ICE-in-vect_stmt_to_vectorize.patch | 41 + fix-ICE-in-vect_transform_stmt.patch | 96 + fix-ICE-in-vectorizable-load.patch | 6 + fix-ICE-in-vectorizable_condition.patch | 18 + fix-ICE-in-verify_ssa.patch | 41 + ...ICE-statement-uses-released-SSA-name.patch | 109 + fix-ICE-when-vectorizing-nested-cycles.patch | 145 + fix-SSA-update-for-vectorizer-epilogue.patch | 47 + fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch | 6 + fix-cost-of-plus.patch | 3 + fix-do-not-build-op.patch | 27 + fix-load-eliding-in-SM.patch | 55 + fix-regno-out-of-range.patch | 6 + fix-wrong-vectorizer-code.patch | 71 + gcc.spec | 170 +- generate-csel-for-arrayref.patch | 218 + generate-csel.patch | 6 + ipa-const-prop-self-recursion-bugfix.patch | 191 + ipa-const-prop.patch | 11040 +++++++++ ipa-struct-reorg-bugfix.patch | 613 + ipa-struct-reorg.patch | 5846 +++++ ivopts-1.patch | 3 + ivopts-2.patch | 3 + loop-finite-bugfix.patch | 6 + loop-finite.patch | 6 + loop-split.patch | 6 + medium-code-mode.patch | 426 + reduction-chain-slp-option.patch | 52 + reductions-slp-enhancement.patch | 59 + remove-array-index-inliner-hint.patch | 6 + simplify-removing-subregs.patch | 117 + ...ug-insns-when-computing-inline-costs.patch | 6 + tighten-range-for-generating-csel.patch | 132 + vectorization-enhancement.patch | 20239 ++++++++++++++++ 58 files changed, 44250 insertions(+), 37 deletions(-) create mode 100644 add-checks-to-avoid-spoiling-if-conversion.patch create mode 100644 add-option-fallow-store-data-races.patch create mode 100644 complete-struct-reorg.patch create mode 100644 cse-in-vectorization.patch create mode 100644 enable-simd-math.patch create mode 100644 fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch create mode 100644 fix-ICE-in-compute_live_loop_exits.patch create mode 100644 fix-ICE-in-copy_reference_ops_from_ref.patch create mode 100644 fix-ICE-in-declare-return-variable.patch create mode 100644 fix-ICE-in-exact_div.patch create mode 100644 fix-ICE-in-gimple_op.patch create mode 100644 fix-ICE-in-model_update_limit_points_in_group.patch create mode 100644 fix-ICE-in-reload.patch create mode 100644 fix-ICE-in-store_constructor.patch create mode 100644 fix-ICE-in-vec.patch create mode 100644 fix-ICE-in-vect_create_epilog_for_reduction.patch create mode 100644 fix-ICE-in-vect_create_epilog_for_reduction_2.patch create mode 100644 
fix-ICE-in-vect_create_epilog_for_reduction_3.patch create mode 100644 fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch create mode 100644 fix-ICE-in-vect_slp_analyze_node_operations.patch create mode 100644 fix-ICE-in-vect_stmt_to_vectorize.patch create mode 100644 fix-ICE-in-vect_transform_stmt.patch create mode 100644 fix-ICE-in-vectorizable_condition.patch create mode 100644 fix-ICE-in-verify_ssa.patch create mode 100644 fix-ICE-statement-uses-released-SSA-name.patch create mode 100644 fix-ICE-when-vectorizing-nested-cycles.patch create mode 100644 fix-SSA-update-for-vectorizer-epilogue.patch create mode 100644 fix-do-not-build-op.patch create mode 100644 fix-load-eliding-in-SM.patch create mode 100644 fix-wrong-vectorizer-code.patch create mode 100644 generate-csel-for-arrayref.patch create mode 100644 ipa-const-prop-self-recursion-bugfix.patch create mode 100644 ipa-const-prop.patch create mode 100644 ipa-struct-reorg-bugfix.patch create mode 100644 ipa-struct-reorg.patch create mode 100644 medium-code-mode.patch create mode 100644 reduction-chain-slp-option.patch create mode 100644 reductions-slp-enhancement.patch create mode 100644 simplify-removing-subregs.patch create mode 100644 tighten-range-for-generating-csel.patch create mode 100644 vectorization-enhancement.patch diff --git a/add-checks-to-avoid-spoiling-if-conversion.patch b/add-checks-to-avoid-spoiling-if-conversion.patch new file mode 100644 index 0000000..34d7505 --- /dev/null +++ b/add-checks-to-avoid-spoiling-if-conversion.patch @@ -0,0 +1,86 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-optimization-95855-Add-checks-to-avoid-spoiling.patch +33d114f570b4a3583421c700396fd5945acebc28 + +diff -uprN a/gcc/gimple-ssa-split-paths.c b/gcc/gimple-ssa-split-paths.c +--- a/gcc/gimple-ssa-split-paths.c ++++ b/gcc/gimple-ssa-split-paths.c +@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3. If not see + #include "gimple-ssa.h" + #include "tree-phinodes.h" + #include "ssa-iterators.h" ++#include "fold-const.h" + + /* Given LATCH, the latch block in a loop, see if the shape of the + path reaching LATCH is suitable for being split by duplication. +@@ -254,6 +255,44 @@ is_feasible_trace (basic_block bb) + } + } + ++ /* Canonicalize the form. */ ++ if (single_pred_p (pred1) && single_pred (pred1) == pred2 ++ && num_stmts_in_pred1 == 0) ++ std::swap (pred1, pred2); ++ ++ /* This is meant to catch another kind of cases that are likely opportunities ++ for if-conversion. After canonicalizing, PRED2 must be an empty block and ++ PRED1 must be the only predecessor of PRED2. Moreover, PRED1 is supposed ++ to end with a cond_stmt which has the same args with the PHI in BB. 
*/ ++ if (single_pred_p (pred2) && single_pred (pred2) == pred1 ++ && num_stmts_in_pred2 == 0) ++ { ++ gimple *cond_stmt = last_stmt (pred1); ++ if (cond_stmt && gimple_code (cond_stmt) == GIMPLE_COND) ++ { ++ tree lhs = gimple_cond_lhs (cond_stmt); ++ tree rhs = gimple_cond_rhs (cond_stmt); ++ ++ gimple_stmt_iterator gsi; ++ for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple *phi = gsi_stmt (gsi); ++ if ((operand_equal_p (gimple_phi_arg_def (phi, 0), lhs) ++ && operand_equal_p (gimple_phi_arg_def (phi, 1), rhs)) ++ || (operand_equal_p (gimple_phi_arg_def (phi, 0), rhs) ++ && (operand_equal_p (gimple_phi_arg_def (phi, 1), lhs)))) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "Block %d appears to be optimized to a join " ++ "point for if-convertable half-diamond.\n", ++ bb->index); ++ return false; ++ } ++ } ++ } ++ } ++ + /* If the joiner has no PHIs with useful uses there is zero chance + of CSE/DCE/jump-threading possibilities exposed by duplicating it. */ + bool found_useful_phi = false; +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/split-path-12.c b/gcc/testsuite/gcc.dg/tree-ssa/split-path-12.c +new file mode 100644 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/split-path-12.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fsplit-paths -fdump-tree-split-paths-details " } */ ++ ++double ++foo(double *d1, double *d2, double *d3, int num, double *ip) ++{ ++ double dmax[3]; ++ ++ for (int i = 0; i < num; i++) { ++ dmax[0] = d1[i] < dmax[0] ? dmax[0] : d1[i]; ++ dmax[1] = d2[i] < dmax[1] ? dmax[1] : d2[i]; ++ dmax[2] = d3[i] < dmax[2] ? dmax[2] : d3[i]; ++ ip[i] = dmax[2]; ++ } ++ ++ return dmax[0] + dmax[1] + dmax[2]; ++} ++ ++/* { dg-final { scan-tree-dump "appears to be optimized to a join point for if-convertable half-diamond" "split-paths" } } */ diff --git a/add-option-fallow-store-data-races.patch b/add-option-fallow-store-data-races.patch new file mode 100644 index 0000000..8ecb581 --- /dev/null +++ b/add-option-fallow-store-data-races.patch @@ -0,0 +1,298 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-middle-end-92046-Command-line-options-that-are.patch +e622a32db78300821fc1327637ec6413febc2c66 + +diff -uprN a/gcc/common.opt b/gcc/common.opt +--- a/gcc/common.opt 2020-05-28 16:12:58.815511599 +0800 ++++ b/gcc/common.opt 2020-05-28 15:54:33.797511589 +0800 +@@ -993,6 +993,10 @@ Align the start of loops. + falign-loops= + Common RejectNegative Joined Var(str_align_loops) Optimization + ++fallow-store-data-races ++Common Report Var(flag_store_data_races) Optimization ++Allow the compiler to introduce new data races on stores. ++ + fargument-alias + Common Ignore + Does nothing. Preserved for backward compatibility. +diff -uprN a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +--- a/gcc/doc/invoke.texi 2020-05-28 16:12:56.875511599 +0800 ++++ b/gcc/doc/invoke.texi 2020-05-28 15:54:33.757511589 +0800 +@@ -400,6 +400,7 @@ Objective-C and Objective-C++ Dialects}. 
+ -falign-jumps[=@var{n}[:@var{m}:[@var{n2}[:@var{m2}]]]] @gol + -falign-labels[=@var{n}[:@var{m}:[@var{n2}[:@var{m2}]]]] @gol + -falign-loops[=@var{n}[:@var{m}:[@var{n2}[:@var{m2}]]]] @gol ++-fallow-store-data-races @gol + -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol + -fauto-inc-dec -fbranch-probabilities @gol + -fbranch-target-load-optimize -fbranch-target-load-optimize2 @gol +@@ -8365,9 +8366,9 @@ designed to reduce code size. + Disregard strict standards compliance. @option{-Ofast} enables all + @option{-O3} optimizations. It also enables optimizations that are not + valid for all standard-compliant programs. +-It turns on @option{-ffast-math} and the Fortran-specific +-@option{-fstack-arrays}, unless @option{-fmax-stack-var-size} is +-specified, and @option{-fno-protect-parens}. ++It turns on @option{-ffast-math}, @option{-fallow-store-data-races} ++and the Fortran-specific @option{-fstack-arrays}, unless ++@option{-fmax-stack-var-size} is specified, and @option{-fno-protect-parens}. + + @item -Og + @opindex Og +@@ -10120,6 +10121,12 @@ The maximum allowed @var{n} option value + + Enabled at levels @option{-O2}, @option{-O3}. + ++@item -fallow-store-data-races ++@opindex fallow-store-data-races ++Allow the compiler to introduce new data races on stores. ++ ++Enabled at level @option{-Ofast}. ++ + @item -funit-at-a-time + @opindex funit-at-a-time + This option is left for compatibility reasons. @option{-funit-at-a-time} +@@ -11902,10 +11909,6 @@ The maximum number of conditional store + if either vectorization (@option{-ftree-vectorize}) or if-conversion + (@option{-ftree-loop-if-convert}) is disabled. + +-@item allow-store-data-races +-Allow optimizers to introduce new data races on stores. +-Set to 1 to allow, otherwise to 0. +- + @item case-values-threshold + The smallest number of different values for which it is best to use a + jump-table instead of a tree of conditional branches. If the value is +diff -uprN a/gcc/opts.c b/gcc/opts.c +--- a/gcc/opts.c 2020-05-28 16:12:58.847511599 +0800 ++++ b/gcc/opts.c 2020-05-28 15:54:35.713511589 +0800 +@@ -560,6 +560,7 @@ static const struct default_options defa + + /* -Ofast adds optimizations to -O3. */ + { OPT_LEVELS_FAST, OPT_ffast_math, NULL, 1 }, ++ { OPT_LEVELS_FAST, OPT_fallow_store_data_races, NULL, 1 }, + + { OPT_LEVELS_NONE, 0, NULL, 0 } + }; +@@ -682,13 +683,6 @@ default_options_optimization (struct gcc + : default_param_value (PARAM_MAX_DSE_ACTIVE_LOCAL_STORES) / 10, + opts->x_param_values, opts_set->x_param_values); + +- /* At -Ofast, allow store motion to introduce potential race conditions. */ +- maybe_set_param_value +- (PARAM_ALLOW_STORE_DATA_RACES, +- opts->x_optimize_fast ? 1 +- : default_param_value (PARAM_ALLOW_STORE_DATA_RACES), +- opts->x_param_values, opts_set->x_param_values); +- + if (opts->x_optimize_size) + /* We want to crossjump as much as possible. */ + maybe_set_param_value (PARAM_MIN_CROSSJUMP_INSNS, 1, +diff -uprN a/gcc/params.def b/gcc/params.def +--- a/gcc/params.def 2020-05-28 16:12:58.831511599 +0800 ++++ b/gcc/params.def 2020-05-28 15:54:35.725511589 +0800 +@@ -1199,12 +1199,6 @@ DEFPARAM (PARAM_CASE_VALUES_THRESHOLD, + "if 0, use the default for the machine.", + 0, 0, 0) + +-/* Data race flags for C++0x memory model compliance. */ +-DEFPARAM (PARAM_ALLOW_STORE_DATA_RACES, +- "allow-store-data-races", +- "Allow new data races on stores to be introduced.", +- 0, 0, 1) +- + /* Reassociation width to be used by tree reassoc optimization. 
*/ + DEFPARAM (PARAM_TREE_REASSOC_WIDTH, + "tree-reassoc-width", +diff -uprN a/gcc/params.h b/gcc/params.h +--- a/gcc/params.h 2020-05-28 16:12:58.843511599 +0800 ++++ b/gcc/params.h 2020-05-28 15:54:35.725511589 +0800 +@@ -228,8 +228,6 @@ extern void init_param_values (int *para + PARAM_VALUE (PARAM_MAX_STORES_TO_SINK) + #define ALLOW_LOAD_DATA_RACES \ + PARAM_VALUE (PARAM_ALLOW_LOAD_DATA_RACES) +-#define ALLOW_STORE_DATA_RACES \ +- PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES) + #define ALLOW_PACKED_LOAD_DATA_RACES \ + PARAM_VALUE (PARAM_ALLOW_PACKED_LOAD_DATA_RACES) + #define ALLOW_PACKED_STORE_DATA_RACES \ +diff -uprN a/gcc/testsuite/c-c++-common/cxxbitfields-3.c b/gcc/testsuite/c-c++-common/cxxbitfields-3.c +--- a/gcc/testsuite/c-c++-common/cxxbitfields-3.c 2020-05-28 16:12:56.959511599 +0800 ++++ b/gcc/testsuite/c-c++-common/cxxbitfields-3.c 2020-05-28 15:54:33.853511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ +-/* { dg-options "-O2 --param allow-store-data-races=0" } */ ++/* { dg-options "-O2 -fno-allow-store-data-races" } */ + + /* Make sure we don't narrow down to a QI or HI to store into VAR.J, + but instead use an SI. */ +diff -uprN a/gcc/testsuite/c-c++-common/cxxbitfields-6.c b/gcc/testsuite/c-c++-common/cxxbitfields-6.c +--- a/gcc/testsuite/c-c++-common/cxxbitfields-6.c 2020-05-28 16:12:56.935511599 +0800 ++++ b/gcc/testsuite/c-c++-common/cxxbitfields-6.c 2020-05-28 15:54:33.845511589 +0800 +@@ -1,6 +1,6 @@ + /* PR middle-end/50141 */ + /* { dg-do compile } */ +-/* { dg-options "-O2 --param allow-store-data-races=0" } */ ++/* { dg-options "-O2 -fno-allow-store-data-races" } */ + + struct S + { +diff -uprN a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-1.c b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-1.c +--- a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-1.c 2020-05-28 16:12:56.939511599 +0800 ++++ b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-1.c 2020-05-28 15:54:33.821511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-2.c b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-2.c +--- a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-2.c 2020-05-28 16:12:56.939511599 +0800 ++++ b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-2.c 2020-05-28 15:54:33.821511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link { target { ! 
int16 } } } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-3.c b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-3.c +--- a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-3.c 2020-05-28 16:12:56.939511599 +0800 ++++ b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-3.c 2020-05-28 15:54:33.821511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-4.c b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-4.c +--- a/gcc/testsuite/c-c++-common/simulate-thread/bitfields-4.c 2020-05-28 16:12:56.939511599 +0800 ++++ b/gcc/testsuite/c-c++-common/simulate-thread/bitfields-4.c 2020-05-28 15:54:33.821511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN a/gcc/testsuite/gcc.dg/lto/pr52097_0.c b/gcc/testsuite/gcc.dg/lto/pr52097_0.c +--- a/gcc/testsuite/gcc.dg/lto/pr52097_0.c 2020-05-28 16:12:57.803511599 +0800 ++++ b/gcc/testsuite/gcc.dg/lto/pr52097_0.c 2020-05-28 15:54:34.777511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-lto-do link } */ +-/* { dg-lto-options { { -O -flto -fexceptions -fnon-call-exceptions --param allow-store-data-races=0 } } } */ ++/* { dg-lto-options { { -O -flto -fexceptions -fnon-call-exceptions -fno-allow-store-data-races } } } */ + /* { dg-require-effective-target exceptions } */ + + typedef struct { unsigned int e0 : 16; } s1; +diff -uprN a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-2.c b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-2.c +--- a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-2.c 2020-05-28 16:12:57.815511599 +0800 ++++ b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-2.c 2020-05-28 15:54:34.781511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0 -O2" } */ ++/* { dg-options "-fno-allow-store-data-races -O2" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-3.c b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-3.c +--- a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-3.c 2020-05-28 16:12:57.815511599 +0800 ++++ b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-3.c 2020-05-28 15:54:34.781511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0 -O2" } */ ++/* { dg-options "-fno-allow-store-data-races -O2" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-4.c b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-4.c +--- a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-4.c 2020-05-28 16:12:57.815511599 +0800 ++++ b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store-4.c 2020-05-28 15:54:34.781511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + #include +diff -uprN 
a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store.c b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store.c +--- a/gcc/testsuite/gcc.dg/simulate-thread/speculative-store.c 2020-05-28 16:12:57.815511599 +0800 ++++ b/gcc/testsuite/gcc.dg/simulate-thread/speculative-store.c 2020-05-28 15:54:34.781511589 +0800 +@@ -1,12 +1,12 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + #include + #include "simulate-thread.h" + + /* This file tests that speculative store movement out of a loop doesn't +- happen. This is disallowed when --param allow-store-data-races is 0. */ ++ happen. This is disallowed when -fno-allow-store-data-races. */ + + int global = 100; + +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/20050314-1.c b/gcc/testsuite/gcc.dg/tree-ssa/20050314-1.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/20050314-1.c 2020-05-28 16:12:58.027511599 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/20050314-1.c 2020-05-28 15:54:34.997511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O1 -fdump-tree-lim2-details --param allow-store-data-races=1" } */ ++/* { dg-options "-O1 -fdump-tree-lim2-details -fallow-store-data-races" } */ + + float a[100]; + +diff -uprN a/gcc/testsuite/g++.dg/simulate-thread/bitfields-2.C b/gcc/testsuite/g++.dg/simulate-thread/bitfields-2.C +--- a/gcc/testsuite/g++.dg/simulate-thread/bitfields-2.C 2020-05-28 16:12:57.015511599 +0800 ++++ b/gcc/testsuite/g++.dg/simulate-thread/bitfields-2.C 2020-05-28 15:54:33.885511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + /* Test that setting does not touch either or . +diff -uprN a/gcc/testsuite/g++.dg/simulate-thread/bitfields.C b/gcc/testsuite/g++.dg/simulate-thread/bitfields.C +--- a/gcc/testsuite/g++.dg/simulate-thread/bitfields.C 2020-05-28 16:12:57.015511599 +0800 ++++ b/gcc/testsuite/g++.dg/simulate-thread/bitfields.C 2020-05-28 15:54:33.885511589 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do link } */ +-/* { dg-options "--param allow-store-data-races=0" } */ ++/* { dg-options "-fno-allow-store-data-races" } */ + /* { dg-final { simulate-thread } } */ + + /* Test that setting does not touch either or . +diff -uprN a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c +--- a/gcc/tree-if-conv.c 2020-05-28 16:12:58.831511599 +0800 ++++ b/gcc/tree-if-conv.c 2020-05-28 15:54:35.641511589 +0800 +@@ -913,10 +913,10 @@ ifcvt_memrefs_wont_trap (gimple *stmt, v + to unconditionally. */ + if (base_master_dr + && DR_BASE_W_UNCONDITIONALLY (*base_master_dr)) +- return PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES); ++ return flag_store_data_races; + /* or the base is known to be not readonly. */ + else if (base_object_writable (DR_REF (a))) +- return PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES); ++ return flag_store_data_races; + } + + return false; +diff -uprN a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c +--- a/gcc/tree-ssa-loop-im.c 2020-05-28 16:12:58.779511599 +0800 ++++ b/gcc/tree-ssa-loop-im.c 2020-05-28 15:54:35.729511589 +0800 +@@ -2088,7 +2088,7 @@ execute_sm (struct loop *loop, vec + for_each_index (&ref->mem.ref, force_move_till, &fmt_data); + + if (bb_in_transaction (loop_preheader_edge (loop)->src) +- || (! PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES) ++ || (! flag_store_data_races + && ! 
ref_always_accessed_p (loop, ref, true))) + multi_threaded_model_p = true; + diff --git a/address-calculation-optimization-within-loop.patch b/address-calculation-optimization-within-loop.patch index be36f80..9bfa0b6 100644 --- a/address-calculation-optimization-within-loop.patch +++ b/address-calculation-optimization-within-loop.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-widening_mul-restrict-ops-to-be-defined-in-the-same-.patch: +d21dff5b4fee51ae432143065bededfc763dc344 + diff -Nurp a/gcc/testsuite/gcc.dg/pr94269.c b/gcc/testsuite/gcc.dg/pr94269.c --- a/gcc/testsuite/gcc.dg/pr94269.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.dg/pr94269.c 2020-04-17 17:04:50.608000000 +0800 @@ -56,8 +62,8 @@ diff -Nurp a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c &type2, &mult_rhs2)) return false; diff -Nurp a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c ---- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c 2020-03-31 09:51:36.000000000 +0800 -+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c 2020-04-29 10:55:44.937471475 +0800 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c 2020-03-31 09:51:36.000000000 +0800 ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c 2020-04-29 10:55:44.937471475 +0800 @@ -17,7 +17,6 @@ f (TYPE *x, TYPE *y, unsigned short n, l /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ /* Should multiply by (VF-1)*4 rather than (257-1)*4. */ diff --git a/change-gcc-BASE-VER.patch b/change-gcc-BASE-VER.patch index 92ae397..79dd167 100644 --- a/change-gcc-BASE-VER.patch +++ b/change-gcc-BASE-VER.patch @@ -1,17 +1,19 @@ -diff -uprN a/gcc/BASE-VER b/gcc/BASE-VER ---- a/gcc/BASE-VER 2020-03-31 09:51:52.000000000 +0800 -+++ b/gcc/BASE-VER 2020-05-14 16:45:36.416688565 +0800 +diff -Nurp a/gcc/BASE-VER b/gcc/BASE-VER +--- a/gcc/BASE-VER 2020-08-19 10:47:14.100000000 +0800 ++++ b/gcc/BASE-VER 2020-08-19 10:32:30.380000000 +0800 @@ -1 +1 @@ -9.3.0 +9.3.1 -diff -uprN a/gcc/ChangeLog b/gcc/ChangeLog ---- a/gcc/ChangeLog 2020-03-31 09:51:30.000000000 +0800 -+++ b/gcc/ChangeLog 2020-05-14 16:45:36.420688565 +0800 -@@ -1,3 +1,7 @@ -+2020-03-12 openEuler -+ -+ * BASE-VER: Set to 9.3.1. -+ - 2020-03-12 Release Manager +diff -Nurp a/gcc/Makefile.in b/gcc/Makefile.in +--- a/gcc/Makefile.in 2020-08-19 10:32:45.528000000 +0800 ++++ b/gcc/Makefile.in 2020-08-19 10:34:24.968000000 +0800 +@@ -885,8 +885,7 @@ PATCHLEVEL_c := \ + # significant - do not remove it. + BASEVER_s := "\"$(BASEVER_c)\"" + DEVPHASE_s := "\"$(if $(DEVPHASE_c), ($(DEVPHASE_c)))\"" +-DATESTAMP_s := \ +- "\"$(if $(DEVPHASE_c)$(filter-out 0,$(PATCHLEVEL_c)), $(DATESTAMP_c))\"" ++DATESTAMP_s := "\"\"" + PKGVERSION_s:= "\"@PKGVERSION@\"" + BUGURL_s := "\"@REPORT_BUGS_TO@\"" - * GCC 9.3.0 released. 
diff --git a/complete-struct-reorg.patch b/complete-struct-reorg.patch new file mode 100644 index 0000000..60c8cf2 --- /dev/null +++ b/complete-struct-reorg.patch @@ -0,0 +1,1814 @@ +diff -Nurp a/gcc/ipa-struct-reorg/escapes.def b/gcc/ipa-struct-reorg/escapes.def +--- a/gcc/ipa-struct-reorg/escapes.def 2020-07-18 05:11:11.548000000 -0400 ++++ b/gcc/ipa-struct-reorg/escapes.def 2020-07-18 05:16:25.928000000 -0400 +@@ -56,5 +56,7 @@ DEF_ESCAPE (escape_non_optimize, "Type u + DEF_ESCAPE (escape_array, "Type is used in an array [not handled yet]") + DEF_ESCAPE (escape_ptr_ptr, "Type is used in a pointer to a pointer [not handled yet]") + DEF_ESCAPE (escape_return, "Type escapes via a return [not handled yet]") ++DEF_ESCAPE (escape_separate_instance, "Type escapes via a separate instance") ++DEF_ESCAPE (escape_unhandled_rewrite, "Type escapes via a unhandled rewrite stmt") + + #undef DEF_ESCAPE +diff -Nurp a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c 2020-07-18 05:11:17.664000000 -0400 ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c 2020-07-20 09:08:08.912000000 -0400 +@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3. + #include "ipa-struct-reorg.h" + #include "tree-eh.h" + #include "bitmap.h" ++#include "cfgloop.h" + #include "ipa-param-manipulation.h" + #include "tree-ssa-live.h" /* For remove_unused_locals. */ + +@@ -103,6 +104,7 @@ along with GCC; see the file COPYING3. + namespace { + + using namespace struct_reorg; ++using namespace struct_relayout; + + /* Return true iff TYPE is stdarg va_list type. */ + +@@ -152,6 +154,14 @@ handled_type (tree type) + return false; + } + ++enum srmode ++{ ++ NORMAL = 0, ++ COMPLETE_STRUCT_RELAYOUT ++}; ++ ++static bool is_result_of_mult (tree arg, tree *num, tree struct_size); ++ + } // anon namespace + + namespace struct_reorg { +@@ -241,7 +251,8 @@ srtype::srtype (tree type) + : type (type), + chain_type (false), + escapes (does_not_escape), +- visited (false) ++ visited (false), ++ has_alloc_array (0) + { + for (int i = 0; i < max_split; i++) + newtype[i] = NULL_TREE; +@@ -441,13 +452,6 @@ srtype::dump (FILE *f) + fn->simple_dump (f); + } + fprintf (f, "\n }\n"); +- fprintf (f, "\n field_sites = {"); +- FOR_EACH_VEC_ELT (field_sites, i, field) +- { +- fprintf (f, " \n"); +- field->simple_dump (f); +- } +- fprintf (f, "\n }\n"); + fprintf (f, "}\n"); + } + +@@ -798,12 +802,6 @@ srfield::dump (FILE *f) + fprintf (f, ", offset = " HOST_WIDE_INT_PRINT_DEC, offset); + fprintf (f, ", type = "); + print_generic_expr (f, fieldtype); +- if (type) +- { +- fprintf (f, "( srtype = "); +- type->simple_dump (f); +- fprintf (f, ")"); +- } + fprintf (f, "\n}\n"); + } + +@@ -813,7 +811,10 @@ srfield::dump (FILE *f) + void + srfield::simple_dump (FILE *f) + { +- fprintf (f, "field (%d)", DECL_UID (fielddecl)); ++ if (fielddecl) ++ { ++ fprintf (f, "field (%d)", DECL_UID (fielddecl)); ++ } + } + + /* Dump out the access structure to FILE. */ +@@ -857,21 +858,113 @@ srdecl::dump (FILE *file) + + } // namespace struct_reorg + ++namespace struct_relayout { ++ ++/* Complete Structure Relayout Optimization. ++ It reorganizes all structure members, and puts same member together. ++ struct s { ++ long a; ++ int b; ++ struct s* c; ++ }; ++ Array looks like ++ abcabcabcabc... ++ will be transformed to ++ aaaa...bbbb...cccc... 
++*/ ++ ++#define GPTR_SIZE(i) \ ++ TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (gptr[i]))) ++ ++unsigned transformed = 0; ++ ++unsigned ++csrtype::calculate_field_num (tree field_offset) ++{ ++ HOST_WIDE_INT off = int_byte_position (field_offset); ++ unsigned i = 1; ++ for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (off == int_byte_position (field)) ++ { ++ return i; ++ } ++ i++; ++ } ++ return 0; ++} ++ ++void ++csrtype::init_type_info (void) ++{ ++ if (!type) ++ { ++ return; ++ } ++ new_size = old_size = tree_to_uhwi (TYPE_SIZE_UNIT (type)); ++ ++ /* Close enough to pad to improve performance. */ ++ if (old_size > 48 && old_size < 64) ++ { ++ new_size = 64; ++ } ++ if (old_size > 96 && old_size < 128) ++ { ++ new_size = 128; ++ } ++ ++ /* For performance reasons, only allow structure size ++ that is a power of 2 and not too big. */ ++ if (new_size != 1 && new_size != 2 ++ && new_size != 4 && new_size != 8 ++ && new_size != 16 && new_size != 32 ++ && new_size != 64 && new_size != 128) ++ { ++ new_size = 0; ++ field_count = 0; ++ return; ++ } ++ ++ unsigned i = 0; ++ for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ i++; ++ } ++ field_count = i; ++ ++ struct_size = build_int_cstu (TREE_TYPE (TYPE_SIZE_UNIT (type)), ++ new_size); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Type: "); ++ print_generic_expr (dump_file, type); ++ fprintf (dump_file, " has %d members.\n", field_count); ++ fprintf (dump_file, "Modify struct size from %ld to %ld.\n", ++ old_size, new_size); ++ } ++} ++ ++} // namespace struct_relayout ++ + namespace { + ++/* Structure definition for ipa_struct_reorg and ipa_struct_relayout. */ ++ + struct ipa_struct_reorg + { ++public: + // Constructors + ipa_struct_reorg(void) + : current_function (NULL), +- done_recording(false) ++ done_recording (false), ++ current_mode (NORMAL) + { + } + +- // public methods +- unsigned execute(void); ++ unsigned execute (enum srmode mode); + void mark_type_as_escape (tree type, escape_type, gimple *stmt = NULL); +-private: ++ + // fields + auto_vec_del types; + auto_vec_del functions; +@@ -879,8 +972,8 @@ private: + srfunction *current_function; + + bool done_recording; ++ srmode current_mode; + +- // private methods + void dump_types (FILE *f); + void dump_types_escaped (FILE *f); + void dump_functions (FILE *f); +@@ -910,6 +1003,7 @@ private: + void maybe_record_allocation_site (cgraph_node *, gimple *); + void record_stmt_expr (tree expr, cgraph_node *node, gimple *stmt); + void mark_expr_escape(tree, escape_type, gimple *stmt); ++ bool handled_allocation_stmt (gimple *stmt); + tree allocate_size (srtype *t, gimple *stmt); + + void mark_decls_in_as_not_needed (tree fn); +@@ -925,6 +1019,7 @@ private: + bool get_type_field (tree expr, tree &base, bool &indirect, srtype *&type, srfield *&field, bool &realpart, bool &imagpart, bool &address, bool should_create = false, bool can_escape = false); + bool wholeaccess (tree expr, tree base, tree accesstype, srtype *t); + ++ void check_alloc_num (gimple *stmt, srtype *type); + void check_definition (srdecl *decl, vec&); + void check_uses (srdecl *decl, vec&); + void check_use (srdecl *decl, gimple *stmt, vec&); +@@ -937,8 +1032,631 @@ private: + + bool has_rewritten_type (srfunction*); + void maybe_mark_or_record_other_side (tree side, tree other, gimple *stmt); ++ unsigned execute_struct_relayout (void); ++}; ++ ++struct ipa_struct_relayout ++{ ++public: ++ // fields ++ tree gptr[max_relayout_split + 
1]; ++ csrtype ctype; ++ ipa_struct_reorg* sr; ++ cgraph_node* current_node; ++ ++ // Constructors ++ ipa_struct_relayout (tree type, ipa_struct_reorg* sr_) ++ { ++ ctype.type = type; ++ sr = sr_; ++ current_node = NULL; ++ for (int i = 0; i < max_relayout_split + 1; i++) ++ { ++ gptr[i] = NULL; ++ } ++ } ++ ++ // Methods ++ tree create_new_vars (tree type, const char *name); ++ void create_global_ptrs (void); ++ unsigned int rewrite (void); ++ void rewrite_stmt_in_function (void); ++ bool rewrite_debug (gimple *stmt, gimple_stmt_iterator *gsi); ++ bool rewrite_stmt (gimple *stmt, gimple_stmt_iterator *gsi); ++ bool handled_allocation_stmt (gcall *stmt); ++ void init_global_ptrs (gcall *stmt, gimple_stmt_iterator *gsi); ++ bool check_call_uses (gcall *stmt); ++ bool rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi); ++ tree create_ssa (tree node, gimple_stmt_iterator *gsi); ++ bool is_candidate (tree xhs); ++ tree rewrite_address (tree xhs, gimple_stmt_iterator *gsi); ++ tree rewrite_offset (tree offset, HOST_WIDE_INT num); ++ bool rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi); ++ bool maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, ++ HOST_WIDE_INT ×); ++ unsigned int execute (void); + }; + ++} // anon namespace ++ ++namespace { ++ ++/* Methods for ipa_struct_relayout. */ ++ ++static void ++set_var_attributes (tree var) ++{ ++ if (!var) ++ { ++ return; ++ } ++ gcc_assert (TREE_CODE (var) == VAR_DECL); ++ ++ DECL_ARTIFICIAL (var) = 1; ++ DECL_EXTERNAL (var) = 0; ++ TREE_STATIC (var) = 1; ++ TREE_PUBLIC (var) = 0; ++ TREE_USED (var) = 1; ++ DECL_CONTEXT (var) = NULL; ++ TREE_THIS_VOLATILE (var) = 0; ++ TREE_ADDRESSABLE (var) = 0; ++ TREE_READONLY (var) = 0; ++ if (is_global_var (var)) ++ { ++ set_decl_tls_model (var, TLS_MODEL_NONE); ++ } ++} ++ ++tree ++ipa_struct_relayout::create_new_vars (tree type, const char *name) ++{ ++ gcc_assert (type); ++ tree new_type = build_pointer_type (type); ++ ++ tree new_name = NULL; ++ if (name) ++ { ++ new_name = get_identifier (name); ++ } ++ ++ tree new_var = build_decl (UNKNOWN_LOCATION, VAR_DECL, new_name, new_type); ++ ++ /* set new_var's attributes. */ ++ set_var_attributes (new_var); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Created new var: "); ++ print_generic_expr (dump_file, new_var); ++ fprintf (dump_file, "\n"); ++ } ++ return new_var; ++} ++ ++void ++ipa_struct_relayout::create_global_ptrs (void) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Create global gptrs: {\n"); ++ } ++ ++ const char *type_name = get_type_name (ctype.type); ++ char *gptr0_name = concat (type_name, "_gptr0", NULL); ++ tree var_gptr0 = create_new_vars (ctype.type, gptr0_name); ++ gptr[0] = var_gptr0; ++ varpool_node::add (var_gptr0); ++ ++ unsigned i = 1; ++ for (tree field = TYPE_FIELDS (ctype.type); field; ++ field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ tree type = TREE_TYPE (field); ++ ++ char *name = NULL; ++ char id[10] = {0}; ++ sprintf (id, "%d", i); ++ const char *decl_name = IDENTIFIER_POINTER (DECL_NAME (field)); ++ ++ name = concat (type_name, "_", decl_name, "_gptr", id, NULL); ++ ++ tree var = create_new_vars (type, name); ++ ++ gptr[i] = var; ++ varpool_node::add (var); ++ i++; ++ } ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nTotally create %d gptrs. 
}\n\n", i); ++ } ++ gcc_assert (ctype.field_count == i - 1); ++} ++ ++void ++ipa_struct_relayout::rewrite_stmt_in_function (void) ++{ ++ gcc_assert (cfun); ++ ++ basic_block bb = NULL; ++ gimple_stmt_iterator si; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (si = gsi_start_bb (bb); !gsi_end_p (si);) ++ { ++ gimple *stmt = gsi_stmt (si); ++ if (rewrite_stmt (stmt, &si)) ++ { ++ gsi_remove (&si, true); ++ } ++ else ++ { ++ gsi_next (&si); ++ } ++ } ++ } ++ ++ /* Debug statements need to happen after all other statements ++ have changed. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (si = gsi_start_bb (bb); !gsi_end_p (si);) ++ { ++ gimple *stmt = gsi_stmt (si); ++ if (gimple_code (stmt) == GIMPLE_DEBUG ++ && rewrite_debug (stmt, &si)) ++ { ++ gsi_remove (&si, true); ++ } ++ else ++ { ++ gsi_next (&si); ++ } ++ } ++ } ++} ++ ++unsigned int ++ipa_struct_relayout::rewrite (void) ++{ ++ cgraph_node *cnode = NULL; ++ function *fn = NULL; ++ FOR_EACH_FUNCTION (cnode) ++ { ++ if (!cnode->real_symbol_p () || !cnode->has_gimple_body_p ()) ++ { ++ continue; ++ } ++ if (cnode->definition) ++ { ++ fn = DECL_STRUCT_FUNCTION (cnode->decl); ++ current_node = cnode; ++ push_cfun (fn); ++ ++ rewrite_stmt_in_function (); ++ ++ update_ssa (TODO_update_ssa_only_virtuals); ++ ++ if (flag_tree_pta) ++ { ++ compute_may_aliases (); ++ } ++ ++ remove_unused_locals (); ++ ++ cgraph_edge::rebuild_edges (); ++ ++ free_dominance_info (CDI_DOMINATORS); ++ ++ pop_cfun (); ++ current_node = NULL; ++ } ++ } ++ return TODO_verify_all; ++} ++ ++bool ++ipa_struct_relayout::rewrite_debug (gimple *stmt, gimple_stmt_iterator *gsi) ++{ ++ /* TODO: For future implement. */ ++ return true; ++} ++ ++bool ++ipa_struct_relayout::rewrite_stmt (gimple *stmt, gimple_stmt_iterator *gsi) ++{ ++ switch (gimple_code (stmt)) ++ { ++ case GIMPLE_ASSIGN: ++ return rewrite_assign (as_a (stmt), gsi); ++ case GIMPLE_CALL: ++ return rewrite_call (as_a (stmt), gsi); ++ default: ++ break; ++ } ++ return false; ++} ++ ++bool ++ipa_struct_relayout::handled_allocation_stmt (gcall *stmt) ++{ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ { ++ return true; ++ } ++ return false; ++} ++ ++void ++ipa_struct_relayout::init_global_ptrs (gcall *stmt, gimple_stmt_iterator *gsi) ++{ ++ gcc_assert (handled_allocation_stmt (stmt)); ++ ++ tree lhs = gimple_call_lhs (stmt); ++ ++ /* Case that gimple is at the end of bb. */ ++ if (gsi_one_before_end_p (*gsi)) ++ { ++ gassign* gptr0 = gimple_build_assign (gptr[0], lhs); ++ gsi_insert_after (gsi, gptr0, GSI_SAME_STMT); ++ } ++ gsi_next (gsi); ++ ++ /* Emit gimple gptr0 = _X and gptr1 = _X. */ ++ gassign* gptr0 = gimple_build_assign (gptr[0], lhs); ++ gsi_insert_before (gsi, gptr0, GSI_SAME_STMT); ++ gassign* gptr1 = gimple_build_assign (gptr[1], lhs); ++ gsi_insert_before (gsi, gptr1, GSI_SAME_STMT); ++ ++ /* Emit gimple gptr_[i] = gptr_[i-1] + _Y[gap]. */ ++ for (unsigned i = 2; i <= ctype.field_count; i++) ++ { ++ gimple *new_stmt = NULL; ++ tree gptr_i_prev_ssa = create_ssa (gptr[i-1], gsi); ++ tree gptr_i_ssa = make_ssa_name (TREE_TYPE (gptr[i-1])); ++ ++ /* Emit gimple _Y[gap] = N * sizeof (member). 
*/ ++ tree member_gap = gimplify_build2 (gsi, MULT_EXPR, ++ long_unsigned_type_node, ++ gimple_call_arg (stmt, 0), ++ GPTR_SIZE (i-1)); ++ ++ new_stmt = gimple_build_assign (gptr_i_ssa, POINTER_PLUS_EXPR, ++ gptr_i_prev_ssa, member_gap); ++ gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); ++ ++ gassign *gptr_i = gimple_build_assign (gptr[i], gptr_i_ssa); ++ gsi_insert_before (gsi, gptr_i, GSI_SAME_STMT); ++ } ++ gsi_prev (gsi); ++} ++ ++bool ++ipa_struct_relayout::check_call_uses (gcall *stmt) ++{ ++ gcc_assert (current_node); ++ srfunction *fn = sr->find_function (current_node); ++ tree lhs = gimple_call_lhs (stmt); ++ ++ if (fn == NULL) ++ { ++ return false; ++ } ++ ++ srdecl *d = fn->find_decl (lhs); ++ if (d == NULL) ++ { ++ return false; ++ } ++ if (types_compatible_p (d->type->type, ctype.type)) ++ { ++ return true; ++ } ++ ++ return false; ++} ++ ++bool ++ipa_struct_relayout::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi) ++{ ++ if (handled_allocation_stmt (stmt)) ++ { ++ /* Rewrite stmt _X = calloc (N, sizeof (struct)). */ ++ tree size = gimple_call_arg (stmt, 1); ++ if (TREE_CODE (size) != INTEGER_CST) ++ { ++ return false; ++ } ++ if (tree_to_uhwi (size) != ctype.old_size) ++ { ++ return false; ++ } ++ if (!check_call_uses (stmt)) ++ { ++ return false; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Rewrite allocation call:\n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "to\n"); ++ } ++ ++ /* Modify sizeof (struct). */ ++ gimple_call_set_arg (stmt, 1, ctype.struct_size); ++ update_stmt (stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ ++ init_global_ptrs (stmt, gsi); ++ } ++ return false; ++} ++ ++tree ++ipa_struct_relayout::create_ssa (tree node, gimple_stmt_iterator *gsi) ++{ ++ tree node_ssa = make_ssa_name (TREE_TYPE (node)); ++ gassign *stmt = gimple_build_assign (node_ssa, node); ++ gsi_insert_before (gsi, stmt, GSI_SAME_STMT); ++ return node_ssa; ++} ++ ++bool ++ipa_struct_relayout::is_candidate (tree xhs) ++{ ++ if (TREE_CODE (xhs) != COMPONENT_REF) ++ { ++ return false; ++ } ++ tree mem = TREE_OPERAND (xhs, 0); ++ if (TREE_CODE (mem) == MEM_REF) ++ { ++ tree type = TREE_TYPE (mem); ++ if (types_compatible_p (type, ctype.type)) ++ { ++ return true; ++ } ++ } ++ return false; ++} ++ ++tree ++ipa_struct_relayout::rewrite_address (tree xhs, gimple_stmt_iterator *gsi) ++{ ++ tree mem_ref = TREE_OPERAND (xhs, 0); ++ tree pointer = TREE_OPERAND (mem_ref, 0); ++ tree pointer_offset = TREE_OPERAND (mem_ref, 1); ++ tree field = TREE_OPERAND (xhs, 1); ++ ++ tree pointer_ssa = fold_convert (long_unsigned_type_node, pointer); ++ tree gptr0_ssa = fold_convert (long_unsigned_type_node, gptr[0]); ++ ++ /* Emit gimple _X1 = ptr - gptr0. */ ++ tree step1 = gimplify_build2 (gsi, MINUS_EXPR, long_unsigned_type_node, ++ pointer_ssa, gptr0_ssa); ++ ++ /* Emit gimple _X2 = _X1 / sizeof (struct). */ ++ tree step2 = gimplify_build2 (gsi, TRUNC_DIV_EXPR, long_unsigned_type_node, ++ step1, ctype.struct_size); ++ ++ unsigned field_num = ctype.calculate_field_num (field); ++ gcc_assert (field_num > 0 && field_num <= ctype.field_count); ++ ++ /* Emit gimple _X3 = _X2 * sizeof (member). */ ++ tree step3 = gimplify_build2 (gsi, MULT_EXPR, long_unsigned_type_node, ++ step2, GPTR_SIZE (field_num)); ++ ++ /* Emit gimple _X4 = gptr[I]. 
*/ ++ tree gptr_field_ssa = create_ssa (gptr[field_num], gsi); ++ tree new_address = make_ssa_name (TREE_TYPE (gptr[field_num])); ++ gassign *new_stmt = gimple_build_assign (new_address, POINTER_PLUS_EXPR, ++ gptr_field_ssa, step3); ++ gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); ++ ++ /* MEM_REF with nonzero offset like ++ MEM[ptr + sizeof (struct)] = 0B ++ should be transformed to ++ MEM[gptr + sizeof (member)] = 0B ++ */ ++ HOST_WIDE_INT size ++ = tree_to_shwi (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (new_address)))); ++ tree new_size = rewrite_offset (pointer_offset, size); ++ if (new_size) ++ { ++ TREE_OPERAND (mem_ref, 1) = new_size; ++ } ++ ++ /* Update mem_ref pointer. */ ++ TREE_OPERAND (mem_ref, 0) = new_address; ++ ++ /* Update mem_ref TREE_TYPE. */ ++ TREE_TYPE (mem_ref) = TREE_TYPE (TREE_TYPE (new_address)); ++ ++ return mem_ref; ++} ++ ++tree ++ipa_struct_relayout::rewrite_offset (tree offset, HOST_WIDE_INT num) ++{ ++ if (TREE_CODE (offset) == INTEGER_CST) ++ { ++ bool sign = false; ++ HOST_WIDE_INT off = TREE_INT_CST_LOW (offset); ++ if (off == 0) ++ { ++ return NULL; ++ } ++ if (off < 0) ++ { ++ off = -off; ++ sign = true; ++ } ++ if (off % ctype.old_size == 0) ++ { ++ HOST_WIDE_INT times = off / ctype.old_size; ++ times = sign ? -times : times; ++ return build_int_cst (TREE_TYPE (offset), num * times); ++ } ++ } ++ return NULL; ++} ++ ++#define REWRITE_ASSIGN_TREE_IN_STMT(node) \ ++do \ ++{ \ ++ tree node = gimple_assign_##node (stmt); \ ++ if (node && is_candidate (node)) \ ++ { \ ++ tree mem_ref = rewrite_address (node, gsi); \ ++ gimple_assign_set_##node (stmt, mem_ref); \ ++ update_stmt (stmt); \ ++ } \ ++} while (0) ++ ++/* COMPONENT_REF = exp => MEM_REF = exp ++ / \ / \ ++ MEM_REF field gptr offset ++ / \ ++ pointer offset ++*/ ++bool ++ipa_struct_relayout::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Maybe rewrite assign:\n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "to\n"); ++ } ++ ++ switch (gimple_num_ops (stmt)) ++ { ++ case 4: REWRITE_ASSIGN_TREE_IN_STMT (rhs3); // FALLTHRU ++ case 3: ++ { ++ REWRITE_ASSIGN_TREE_IN_STMT (rhs2); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ if (rhs2 && TREE_CODE (rhs2) == INTEGER_CST) ++ { ++ /* Handle pointer++ and pointer-- or ++ factor is euqal to struct size. 
*/ ++ HOST_WIDE_INT times = 1; ++ if (maybe_rewrite_cst (rhs2, gsi, times)) ++ { ++ tree tmp = build_int_cst ( ++ TREE_TYPE (TYPE_SIZE_UNIT (ctype.type)), ++ ctype.new_size * times); ++ gimple_assign_set_rhs2 (stmt, tmp); ++ update_stmt (stmt); ++ } ++ } ++ } // FALLTHRU ++ case 2: REWRITE_ASSIGN_TREE_IN_STMT (rhs1); // FALLTHRU ++ case 1: REWRITE_ASSIGN_TREE_IN_STMT (lhs); // FALLTHRU ++ case 0: break; ++ default: gcc_unreachable (); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ return false; ++} ++ ++bool ++ipa_struct_relayout::maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, ++ HOST_WIDE_INT ×) ++{ ++ bool ret = false; ++ gcc_assert (TREE_CODE (cst) == INTEGER_CST); ++ ++ gimple *stmt = gsi_stmt (*gsi); ++ if (gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) ++ || types_compatible_p (inner_type (TREE_TYPE (lhs)), ctype.type)) ++ { ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } ++ } ++ } ++ ++ if (gimple_assign_rhs_code (stmt) == MULT_EXPR) ++ { ++ if (gsi_one_before_end_p (*gsi)) ++ { ++ return false; ++ } ++ gsi_next (gsi); ++ gimple *stmt2 = gsi_stmt (*gsi); ++ ++ if (gimple_code (stmt2) == GIMPLE_ASSIGN ++ && gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) ++ { ++ tree lhs = gimple_assign_lhs (stmt2); ++ tree rhs1 = gimple_assign_rhs1 (stmt2); ++ if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) ++ || types_compatible_p (inner_type (TREE_TYPE (lhs)), ctype.type)) ++ { ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ ret = true; ++ } ++ } ++ } ++ gsi_prev (gsi); ++ return ret; ++ } ++ return false; ++} ++ ++unsigned int ++ipa_struct_relayout::execute (void) ++{ ++ ctype.init_type_info (); ++ if (ctype.field_count < min_relayout_split ++ || ctype.field_count > max_relayout_split) ++ { ++ return 0; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Complete Struct Relayout Type: "); ++ print_generic_expr (dump_file, ctype.type); ++ fprintf (dump_file, "\n"); ++ } ++ transformed++; ++ ++ create_global_ptrs (); ++ return rewrite (); ++} ++ ++} // anon namespace ++ ++namespace { ++ ++/* Methods for ipa_struct_reorg. */ ++ + /* Dump all of the recorded types to file F. */ + + void +@@ -1134,8 +1852,10 @@ ipa_struct_reorg::record_type (tree type + f->type = t1; + t1->add_field_site (f); + } +- if (t1 == type1) +- type1->mark_escape (escape_rescusive_type, NULL); ++ if (t1 == type1 && current_mode != COMPLETE_STRUCT_RELAYOUT) ++ { ++ type1->mark_escape (escape_rescusive_type, NULL); ++ } + } + } + } +@@ -1272,6 +1992,14 @@ ipa_struct_reorg::record_var (tree decl, + else + e = escape_type_volatile_array_or_ptrptr (TREE_TYPE (decl)); + ++ /* Separate instance is hard to trace in complete struct ++ relayout optimization. 
*/ ++ if (current_mode == COMPLETE_STRUCT_RELAYOUT ++ && TREE_CODE (TREE_TYPE (decl)) == RECORD_TYPE) ++ { ++ e = escape_separate_instance; ++ } ++ + if (e != does_not_escape) + type->mark_escape (e, NULL); + } +@@ -1347,7 +2075,8 @@ ipa_struct_reorg::find_vars (gimple *stm + { + case GIMPLE_ASSIGN: + if (gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS +- || gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ || gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR ++ || gimple_assign_rhs_code (stmt) == NOP_EXPR) + { + tree lhs = gimple_assign_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); +@@ -1372,6 +2101,32 @@ ipa_struct_reorg::find_vars (gimple *stm + current_function->record_decl (t, rhs, -1); + } + } ++ else ++ { ++ /* Because we won't handle these stmts in rewrite phase, ++ just mark these types as escaped. */ ++ switch (gimple_num_ops (stmt)) ++ { ++ case 4: mark_type_as_escape ( ++ TREE_TYPE (gimple_assign_rhs3 (stmt)), ++ escape_unhandled_rewrite, stmt); ++ // FALLTHRU ++ case 3: mark_type_as_escape ( ++ TREE_TYPE (gimple_assign_rhs2 (stmt)), ++ escape_unhandled_rewrite, stmt); ++ // FALLTHRU ++ case 2: mark_type_as_escape ( ++ TREE_TYPE (gimple_assign_rhs1 (stmt)), ++ escape_unhandled_rewrite, stmt); ++ // FALLTHRU ++ case 1: mark_type_as_escape ( ++ TREE_TYPE (gimple_assign_lhs (stmt)), ++ escape_unhandled_rewrite, stmt); ++ // FALLTHRU ++ case 0: break; ++ default: gcc_unreachable (); ++ } ++ } + break; + + case GIMPLE_CALL: +@@ -1453,9 +2208,23 @@ is_result_of_mult (tree arg, tree *num, + /* If we have a integer, just check if it is a multiply of STRUCT_SIZE. */ + if (TREE_CODE (arg) == INTEGER_CST) + { +- if (integer_zerop (size_binop (FLOOR_MOD_EXPR, arg, struct_size))) ++ bool sign = false; ++ HOST_WIDE_INT size = TREE_INT_CST_LOW (arg); ++ if (size < 0) + { +- *num = size_binop (FLOOR_DIV_EXPR, arg, struct_size); ++ size = -size; ++ sign = true; ++ } ++ tree arg2 = build_int_cst (TREE_TYPE (arg), size); ++ if (integer_zerop (size_binop (FLOOR_MOD_EXPR, arg2, struct_size))) ++ { ++ tree number = size_binop (FLOOR_DIV_EXPR, arg2, struct_size); ++ if (sign) ++ { ++ number = build_int_cst (TREE_TYPE (number), ++ -tree_to_shwi (number)); ++ } ++ *num = number; + return true; + } + return false; +@@ -1525,15 +2294,19 @@ is_result_of_mult (tree arg, tree *num, + + /* Return TRUE if STMT is an allocation statement that is handled. */ + +-static bool +-handled_allocation_stmt (gimple *stmt) ++bool ++ipa_struct_reorg::handled_allocation_stmt (gimple *stmt) + { +- if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC) +- || gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) +- || gimple_call_builtin_p (stmt, BUILT_IN_CALLOC) +- || gimple_call_builtin_p (stmt, BUILT_IN_ALIGNED_ALLOC) +- || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA) +- || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA_WITH_ALIGN)) ++ if (current_mode == COMPLETE_STRUCT_RELAYOUT ++ && gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ return true; ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT ++ && (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_CALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALIGNED_ALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA_WITH_ALIGN))) + return true; + return false; + } +@@ -1575,7 +2348,8 @@ ipa_struct_reorg::allocate_size (srtype + /* Check that second argument is a constant equal to the size of structure. 
*/ + if (operand_equal_p (arg1, struct_size, 0)) + return size; +- /* Check that first argument is a constant equal to the size of structure. */ ++ /* ??? Check that first argument is a constant ++ equal to the size of structure. */ + if (operand_equal_p (size, struct_size, 0)) + return arg1; + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -1692,6 +2466,29 @@ ipa_struct_reorg::maybe_record_assign (c + } + } + ++bool ++check_mem_ref_offset (tree expr) ++{ ++ tree num = NULL; ++ bool ret = false; ++ ++ if (TREE_CODE (expr) != MEM_REF) ++ { ++ return false; ++ } ++ ++ /* Try to find the structure size. */ ++ tree field_off = TREE_OPERAND (expr, 1); ++ tree tmp = TREE_OPERAND (expr, 0); ++ if (TREE_CODE (tmp) == ADDR_EXPR) ++ { ++ tmp = TREE_OPERAND (tmp, 0); ++ } ++ tree size = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (tmp))); ++ ret = is_result_of_mult (field_off, &num, size); ++ return ret; ++} ++ + tree + get_ref_base_and_offset (tree &e, HOST_WIDE_INT &offset, bool &realpart, bool &imagpart, tree &accesstype) + { +@@ -1731,7 +2528,10 @@ get_ref_base_and_offset (tree &e, HOST_W + gcc_assert (TREE_CODE (field_off) == INTEGER_CST); + /* So we can mark the types as escaping if different. */ + accesstype = TREE_TYPE (field_off); +- offset += tree_to_uhwi (field_off); ++ if (!check_mem_ref_offset (expr)) ++ { ++ offset += tree_to_uhwi (field_off); ++ } + return TREE_OPERAND (expr, 0); + } + default: +@@ -2108,6 +2908,39 @@ ipa_struct_reorg::check_type_and_push (t + + } + ++void ++ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type) ++{ ++ if (current_mode == COMPLETE_STRUCT_RELAYOUT ++ && handled_allocation_stmt (stmt)) ++ { ++ tree arg0 = gimple_call_arg (stmt, 0); ++ basic_block bb = gimple_bb (stmt); ++ cgraph_node *node = current_function->node; ++ if (integer_onep (arg0)) ++ { ++ /* Actually NOT an array, but may ruin other array. */ ++ type->has_alloc_array = -1; ++ } ++ else if (bb->loop_father != NULL ++ && loop_outer (bb->loop_father) != NULL) ++ { ++ /* The allocation is in a loop. */ ++ type->has_alloc_array = -2; ++ } ++ else if (node->callers != NULL) ++ { ++ type->has_alloc_array = -3; ++ } ++ else ++ { ++ type->has_alloc_array = type->has_alloc_array < 0 ++ ? type->has_alloc_array ++ : type->has_alloc_array + 1; ++ } ++ } ++} ++ + /* + 2) Check SSA_NAMEs for non type usages (source or use) (worlist of srdecl) + a) if the SSA_NAME is sourced from a pointer plus, record the pointer and +@@ -2151,6 +2984,7 @@ ipa_struct_reorg::check_definition (srde + if (!handled_allocation_stmt (stmt) + || !allocate_size (type, stmt)) + type->mark_escape (escape_return, stmt); ++ check_alloc_num (stmt, type); + return; + } + /* If the SSA_NAME is sourced from an inline-asm, just mark the type as escaping. */ +@@ -2189,6 +3023,21 @@ ipa_struct_reorg::check_definition (srde + return; + } + ++ if (gimple_assign_rhs_code (stmt) == MAX_EXPR ++ || gimple_assign_rhs_code (stmt) == MIN_EXPR) ++ { ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ if (TREE_CODE (rhs) == SSA_NAME) ++ { ++ check_type_and_push (rhs, type, worklist, stmt); ++ } ++ if (TREE_CODE (rhs2) == SSA_NAME) ++ { ++ check_type_and_push (rhs2, type, worklist, stmt); ++ } ++ return; ++ } ++ + /* Casts between pointers and integer are escaping. */ + if (gimple_assign_cast_p (stmt)) + { +@@ -2251,6 +3100,13 @@ ipa_struct_reorg::check_other_side (srde + srtype *t1 = find_type (inner_type (t)); + if (t1 == type) + { ++ /* In Complete Struct Relayout opti, if lhs type is the same ++ as rhs type, we could return without any harm. 
*/ ++ if (current_mode == COMPLETE_STRUCT_RELAYOUT) ++ { ++ return; ++ } ++ + tree base; + bool indirect; + srtype *type1; +@@ -2298,8 +3154,11 @@ ipa_struct_reorg::check_use (srdecl *dec + tree rhs1 = gimple_cond_lhs (stmt); + tree rhs2 = gimple_cond_rhs (stmt); + tree orhs = rhs1; +- if (gimple_cond_code (stmt) != EQ_EXPR +- && gimple_cond_code (stmt) != NE_EXPR) ++ enum tree_code code = gimple_cond_code (stmt); ++ if (code != EQ_EXPR && code != NE_EXPR ++ && (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || (code != LT_EXPR && code != LE_EXPR ++ && code != GT_EXPR && code != GE_EXPR))) + { + mark_expr_escape (rhs1, escape_non_eq, stmt); + mark_expr_escape (rhs2, escape_non_eq, stmt); +@@ -2329,8 +3188,11 @@ ipa_struct_reorg::check_use (srdecl *dec + tree rhs1 = gimple_assign_rhs1 (stmt); + tree rhs2 = gimple_assign_rhs2 (stmt); + tree orhs = rhs1; +- if (gimple_assign_rhs_code (stmt) != EQ_EXPR +- && gimple_assign_rhs_code (stmt) != NE_EXPR) ++ enum tree_code code = gimple_assign_rhs_code (stmt); ++ if (code != EQ_EXPR && code != NE_EXPR ++ && (current_mode != COMPLETE_STRUCT_RELAYOUT ++ || (code != LT_EXPR && code != LE_EXPR ++ && code != GT_EXPR && code != GE_EXPR))) + { + mark_expr_escape (rhs1, escape_non_eq, stmt); + mark_expr_escape (rhs2, escape_non_eq, stmt); +@@ -2727,8 +3589,11 @@ ipa_struct_reorg::propagate_escape (void + void + ipa_struct_reorg::prune_escaped_types (void) + { +- detect_cycles (); +- propagate_escape (); ++ if (current_mode != COMPLETE_STRUCT_RELAYOUT) ++ { ++ detect_cycles (); ++ propagate_escape (); ++ } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -3850,16 +4715,82 @@ ipa_struct_reorg::rewrite_functions (voi + } + + unsigned int +-ipa_struct_reorg::execute (void) ++ipa_struct_reorg::execute_struct_relayout (void) + { +- /* FIXME: If there is a top-level inline-asm, the pass immediately returns. */ +- if (symtab->first_asm_symbol ()) +- return 0; +- record_accesses (); +- prune_escaped_types (); +- analyze_types (); ++ unsigned retval = 0; ++ for (unsigned i = 0; i < types.length (); i++) ++ { ++ tree type = types[i]->type; ++ if (TYPE_FIELDS (type) == NULL) ++ { ++ continue; ++ } ++ if (types[i]->has_alloc_array != 1) ++ { ++ continue; ++ } ++ if (types[i]->chain_type) ++ { ++ continue; ++ } ++ retval |= ipa_struct_relayout (type, this).execute (); ++ } ++ ++ if (dump_file) ++ { ++ if (transformed) ++ { ++ fprintf (dump_file, "\nNumber of structures to transform in " ++ "Complete Structure Relayout is %d\n", transformed); ++ } ++ else ++ { ++ fprintf (dump_file, "\nNo structures to transform in " ++ "Complete Structure Relayout.\n"); ++ } ++ } + +- return rewrite_functions (); ++ return retval; ++} ++ ++unsigned int ++ipa_struct_reorg::execute (enum srmode mode) ++{ ++ unsigned int ret = 0; ++ ++ if (mode == NORMAL) ++ { ++ current_mode = NORMAL; ++ /* FIXME: If there is a top-level inline-asm, ++ the pass immediately returns. 
*/ ++ if (symtab->first_asm_symbol ()) ++ { ++ return 0; ++ } ++ record_accesses (); ++ prune_escaped_types (); ++ analyze_types (); ++ ++ ret = rewrite_functions (); ++ } ++ else if (mode == COMPLETE_STRUCT_RELAYOUT) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n\nTry Complete Struct Relayout:\n"); ++ } ++ current_mode = COMPLETE_STRUCT_RELAYOUT; ++ if (symtab->first_asm_symbol ()) ++ { ++ return 0; ++ } ++ record_accesses (); ++ prune_escaped_types (); ++ ++ ret = execute_struct_relayout (); ++ } ++ ++ return ret; + } + + const pass_data pass_data_ipa_struct_reorg = +@@ -3884,17 +4815,27 @@ public: + + /* opt_pass methods: */ + virtual bool gate (function *); +- virtual unsigned int execute (function *) { return ipa_struct_reorg ().execute(); } ++ virtual unsigned int execute (function *) ++ { ++ unsigned int ret = 0; ++ ret = ipa_struct_reorg ().execute (NORMAL); ++ if (!ret) ++ { ++ ret = ipa_struct_reorg ().execute (COMPLETE_STRUCT_RELAYOUT); ++ } ++ return ret; ++ } + + }; // class pass_ipa_struct_reorg + + bool + pass_ipa_struct_reorg::gate (function *) + { +- return (optimize ++ return (optimize >= 3 + && flag_ipa_struct_reorg + /* Don't bother doing anything if the program has errors. */ +- && !seen_error ()); ++ && !seen_error () ++ && flag_lto_partition == LTO_PARTITION_ONE); + } + + } // anon namespace +diff -Nurp a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h 2020-07-18 05:11:11.548000000 -0400 ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h 2020-07-18 05:16:25.928000000 -0400 +@@ -121,6 +121,7 @@ public: + + tree newtype[max_split]; + bool visited; ++ int has_alloc_array; + + // Constructors + srtype(tree type); +@@ -232,4 +233,34 @@ struct srdecl + + } // namespace struct_reorg + ++ ++namespace struct_relayout { ++ ++const int min_relayout_split = 8; ++const int max_relayout_split = 16; ++ ++struct csrtype ++{ ++ tree type; ++ unsigned HOST_WIDE_INT old_size; ++ unsigned HOST_WIDE_INT new_size; ++ unsigned field_count; ++ tree struct_size; ++ ++ // Constructors ++ csrtype () ++ : type (NULL), ++ old_size (0), ++ new_size (0), ++ field_count (0), ++ struct_size (NULL) ++ {} ++ ++ // Methods ++ unsigned calculate_field_num (tree field_offset); ++ void init_type_info (void); ++}; ++ ++} // namespace struct_relayout ++ + #endif +diff -Nurp a/gcc/testsuite/gcc.dg/struct/complete_struct_relayout.c b/gcc/testsuite/gcc.dg/struct/complete_struct_relayout.c +--- a/gcc/testsuite/gcc.dg/struct/complete_struct_relayout.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/complete_struct_relayout.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,60 @@ ++// { dg-do run } ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++ node_p c; ++ node_p d; ++ long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; ++ long k; ++ long l; ++ int m; ++ int n; ++}; ++ ++const int MAX = 10000; ++node_p n; ++ ++int ++main () ++{ ++ n = (node_p) calloc (MAX, sizeof (node_t)); ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].a = 100; ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ if (n[i].a != 100) ++ { ++ abort (); ++ } ++ } ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].l = n[i].a; ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ if (n[i].l != 100) ++ { ++ abort (); ++ } ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform in Complete Structure Relayout is 1" 
"struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/csr_allocation-1.c b/gcc/testsuite/gcc.dg/struct/csr_allocation-1.c +--- a/gcc/testsuite/gcc.dg/struct/csr_allocation-1.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/csr_allocation-1.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,46 @@ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++ node_p c; ++ node_p d; ++ long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; ++ long k; ++ long l; ++ int m; ++ int n; ++}; ++ ++const int MAX = 1; ++node_p n; ++ ++int ++main () ++{ ++ n = (node_p) calloc (MAX, sizeof (node_t)); ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].a = 100; ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ if (n[i].a != 100) ++ { ++ abort (); ++ } ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform in Complete Structure Relayout." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/csr_allocation-2.c b/gcc/testsuite/gcc.dg/struct/csr_allocation-2.c +--- a/gcc/testsuite/gcc.dg/struct/csr_allocation-2.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/csr_allocation-2.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,59 @@ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++ node_p c; ++ node_p d; ++ long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; ++ long k; ++ long l; ++ int m; ++ int n; ++}; ++ ++const int MAX = 10; ++node_p n; ++node_p m; ++ ++int main() ++{ ++ int i; ++ for (i = 0; i < MAX / 5; i++) ++ { ++ n = (node_p) calloc(MAX, sizeof(node_t)); ++ if (i == 0) ++ { ++ m = n; ++ } ++ } ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].a = 100; ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ m[i].a = 50; ++ } ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ if (n[i].a != 100) ++ { ++ abort (); ++ } ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform in Complete Structure Relayout." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/csr_allocation-3.c b/gcc/testsuite/gcc.dg/struct/csr_allocation-3.c +--- a/gcc/testsuite/gcc.dg/struct/csr_allocation-3.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/csr_allocation-3.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,77 @@ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++ node_p c; ++ node_p d; ++ long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; ++ long k; ++ long l; ++ int m; ++ int n; ++}; ++ ++const int MAX = 10; ++node_p n; ++node_p m; ++ ++void test (int, int) __attribute__((noinline)); ++ ++void ++test (int num, int flag) ++{ ++ if (num <= 0) ++ { ++ return; ++ } ++ n = (node_p) calloc (num, sizeof (node_t)); ++ if (flag) ++ { ++ m = n; ++ } ++ return; ++} ++ ++int ++main () ++{ ++ test (MAX, 1); ++ test (MAX, 0); ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].a = 100; ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ m[i].a = 50; ++ } ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ if (n[i].a != 100) ++ { ++ abort (); ++ } ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ if (m[i].a != 50) ++ { ++ abort (); ++ } ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform in Complete Structure Relayout." 
"struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/csr_cast_int.c b/gcc/testsuite/gcc.dg/struct/csr_cast_int.c +--- a/gcc/testsuite/gcc.dg/struct/csr_cast_int.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/csr_cast_int.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,52 @@ ++// { dg-do run } ++ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++ node_p c; ++ node_p d; ++ long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; ++ long k; ++ long l; ++ int m; ++ int n; ++}; ++ ++const int MAX = 100; ++node_p n; ++unsigned long y; ++ ++int ++main () ++{ ++ n = (node_p) calloc (MAX, sizeof (node_t)); ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].b = 50; ++ } ++ ++ node_p x = &n[5]; ++ y = (unsigned long) x; ++ y += 8; ++ ++ if (*((unsigned long*) y) != 50) ++ { ++ abort (); ++ } ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "struct node has escaped: \"Type escapes a cast from/to intergral type\"" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/csr_separate_instance.c b/gcc/testsuite/gcc.dg/struct/csr_separate_instance.c +--- a/gcc/testsuite/gcc.dg/struct/csr_separate_instance.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/csr_separate_instance.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,48 @@ ++#include ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++ node_p c; ++ node_p d; ++ long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; ++ long k; ++ long l; ++ int m; ++ int n; ++}; ++ ++const int MAX = 10000; ++node_p n; ++node_t t; ++ ++int ++main () ++{ ++ n = (node_p) calloc (MAX, sizeof (node_t)); ++ t.a = 100; ++ ++ for (int i = 0; i < MAX; i++) ++ { ++ n[i].a = t.a; ++ } ++ for (int i = 0; i < MAX; i++) ++ { ++ if (n[i].a != 100) ++ { ++ abort (); ++ } ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "struct node has escaped: \"Type escapes via a separate instance\"" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/sr_address_of_field.c b/gcc/testsuite/gcc.dg/struct/sr_address_of_field.c +--- a/gcc/testsuite/gcc.dg/struct/sr_address_of_field.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/sr_address_of_field.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,37 @@ ++/* { dg-do run } */ ++ ++static struct S { ++ int *p1; ++ int *p2; ++} s; ++ ++typedef __UINTPTR_TYPE__ uintptr_t; ++ ++int ++foo () ++{ ++ int i = 1; ++ int j = 2; ++ struct S s; ++ int **p; ++ s.p1 = &i; ++ s.p2 = &j; ++ p = &s.p1; ++ uintptr_t pi = (uintptr_t) p; ++ pi = pi + sizeof (int *); ++ p = (int **)pi; ++ **p = 3; ++ return j; ++} ++ ++int ++main () ++{ ++ if (foo () != 3) ++ { ++ __builtin_abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "struct S has escaped: \"Type escapes via taking the address of field\"" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/sr_maxmin_expr.c b/gcc/testsuite/gcc.dg/struct/sr_maxmin_expr.c +--- a/gcc/testsuite/gcc.dg/struct/sr_maxmin_expr.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/sr_maxmin_expr.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,25 @@ ++// { dg-do compile } ++ ++#include ++ ++struct S { ++ unsigned long a; ++ unsigned long b; ++}; ++ ++struct S* s; ++struct S* t = (struct S*) 1000; ++ ++int ++main () ++{ ++ s = (struct S*) calloc (1000, sizeof (struct S)); ++ s = s > t ? 
s : t; ++ if (s == 0) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c b/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c +--- a/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/sr_pointer_minus.c 2020-07-18 05:16:25.928000000 -0400 +@@ -0,0 +1,33 @@ ++// { dg-do compile } ++ ++#include ++ ++typedef struct node node_t; ++typedef struct node* node_p; ++ ++struct node { ++ unsigned long a; ++ unsigned long b; ++}; ++ ++int max; ++int x; ++ ++node_p n; ++node_p z; ++ ++int ++main () ++{ ++ n = (node_p) calloc (max, sizeof (node_t)); ++ ++ node_p xp = &n[x]; ++ ++ if (xp - z == 10) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "struct node has escaped: \"Type escapes via a unhandled rewrite stmt\"" "struct_reorg" } } */ diff --git a/cse-in-vectorization.patch b/cse-in-vectorization.patch new file mode 100644 index 0000000..ac8e7a2 --- /dev/null +++ b/cse-in-vectorization.patch @@ -0,0 +1,68 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-vect-CSE-for-bump-and-offset-in-strided-load-store-o.patch +4a31a8add56d49867c187d90b3a89e97634543c2 + +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr95199.c b/gcc/testsuite/gcc.target/aarch64/sve/pr95199.c +new file mode 100644 +index 00000000000..adcd5124a7c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pr95199.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -fdump-tree-vect" } */ ++ ++void ++foo (double *a, double *b, double m, int inc_x, int inc_y) ++{ ++ int ix = 0, iy = 0; ++ for (int i = 0; i < 1000; ++i) ++ { ++ a[ix] += m * b[iy]; ++ ix += inc_x; ++ iy += inc_y; ++ } ++ return ; ++} ++ ++/* { dg-final { scan-tree-dump-times "VEC_SERIES_EXPR" 2 "vect" } } */ +diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 4a0a907fcb4..c9174395fca 100644 +--- a/gcc/tree-vect-stmts.c ++++ b/gcc/tree-vect-stmts.c +@@ -2846,16 +2846,12 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info, + tree *dataref_bump, tree *vec_offset) + { + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); +- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); +- gimple_seq stmts; + + tree bump = size_binop (MULT_EXPR, + fold_convert (sizetype, DR_STEP (dr)), + size_int (TYPE_VECTOR_SUBPARTS (vectype))); +- *dataref_bump = force_gimple_operand (bump, &stmts, true, NULL_TREE); +- if (stmts) +- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); ++ *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump); + + /* The offset given in GS_INFO can have pointer type, so use the element + type of the vector instead. */ +@@ -2866,13 +2862,11 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info, + tree step = size_binop (EXACT_DIV_EXPR, DR_STEP (dr), + ssize_int (gs_info->scale)); + step = fold_convert (offset_type, step); +- step = force_gimple_operand (step, &stmts, true, NULL_TREE); + + /* Create {0, X, X*2, X*3, ...}. 
*/ +- *vec_offset = gimple_build (&stmts, VEC_SERIES_EXPR, offset_vectype, +- build_zero_cst (offset_type), step); +- if (stmts) +- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); ++ tree offset = fold_build2 (VEC_SERIES_EXPR, offset_vectype, ++ build_zero_cst (offset_type), step); ++ *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset); + } + + /* Return the amount that should be added to a vector pointer to move diff --git a/delete-incorrect-smw.patch b/delete-incorrect-smw.patch index 087166f..44ed526 100644 --- a/delete-incorrect-smw.patch +++ b/delete-incorrect-smw.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-middle-end-91195-incorrect-may-be-used-uniniti.patch +06e8db10cd80d88fb3a6afedf2c35da6c1fa6d85 + diff -uprN a/gcc/testsuite/gcc.dg/pr91195.c b/gcc/testsuite/gcc.dg/pr91195.c new file mode 100644 --- /dev/null diff --git a/enable-simd-math.patch b/enable-simd-math.patch new file mode 100644 index 0000000..46f7d3d --- /dev/null +++ b/enable-simd-math.patch @@ -0,0 +1,34 @@ +diff -Nurp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +--- a/gcc/config/aarch64/aarch64.c 2020-07-06 17:20:30.368000000 +0800 ++++ b/gcc/config/aarch64/aarch64.c 2020-07-06 20:02:39.480000000 +0800 +@@ -18860,8 +18860,12 @@ aarch64_simd_clone_compute_vecsize_and_s + elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); + if (clonei->simdlen == 0) + { +- count = 2; +- vec_bits = (num == 0 ? 64 : 128); ++ /* Currently mathlib or sleef hasn't provide function for V2SF mode ++ simdclone of single precision functions. (e.g._ZCVnN2v_expf) ++ Therefore this mode is disabled by default to avoid link error. ++ Use -msimdmath-64 option to enable this mode. */ ++ count = flag_simdmath_64 ? 2 : 1; ++ vec_bits = ((num == 0 && flag_simdmath_64) ? 64 : 128); + clonei->simdlen = vec_bits / elt_bits; + } + else +diff -Nurp a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt +--- a/gcc/config/aarch64/aarch64.opt 2020-07-06 17:20:30.364000000 +0800 ++++ b/gcc/config/aarch64/aarch64.opt 2020-07-06 20:02:39.480000000 +0800 +@@ -186,6 +186,12 @@ precision of square root results to abou + single precision and to 32 bits for double precision. + If enabled, it implies -mlow-precision-recip-sqrt. + ++msimdmath-64 ++Target Var(flag_simdmath_64) Optimization ++Allow compiler to generate V2SF 64 bits simdclone of math functions, ++which is not currently supported in mathlib or sleef. ++Therefore this option is disabled by default. ++ + mlow-precision-div + Target Var(flag_mlow_precision_div) Optimization + Enable the division approximation. Enabling this reduces diff --git a/fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch b/fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch new file mode 100644 index 0000000..01a33e3 --- /dev/null +++ b/fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch @@ -0,0 +1,123 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-tree-optimization-39612-avoid-issueing-loads-in-SM-w.patch +f9e1ea10e657af9fb02fafecf1a600740fd34409 + +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr39612.c b/gcc/testsuite/gcc.dg/tree-ssa/pr39612.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr39612.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr39612.c 2020-08-17 11:14:08.000000000 +0800 +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-lim2-details -Wuninitialized" } */ ++ ++void foo(int *); ++void f2(int dst[3], int R) ++{ ++ int i, inter[2]; ++ ++ for (i = 1; i < R; i++) { ++ if (i & 8) ++ { ++ inter[0] = 1; ++ inter[1] = 1; ++ } ++ } ++ ++ foo(inter); ++} ++ ++/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } */ ++/* { dg-final { scan-tree-dump-not " = inter\\\[\[0-1\]\\\];" "lim2" } } */ +diff -Nurp a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c +--- a/gcc/tree-ssa-loop-im.c 2020-08-17 11:13:58.436000000 +0800 ++++ b/gcc/tree-ssa-loop-im.c 2020-08-17 11:14:08.000000000 +0800 +@@ -127,6 +127,8 @@ struct im_mem_ref + + bitmap stored; /* The set of loops in that this memory location + is stored to. */ ++ bitmap loaded; /* The set of loops in that this memory location ++ is loaded from. */ + vec accesses_in_loop; + /* The locations of the accesses. Vector + indexed by the loop number. */ +@@ -1394,6 +1396,7 @@ mem_ref_alloc (ao_ref *mem, unsigned has + ref->ref_decomposed = false; + ref->hash = hash; + ref->stored = NULL; ++ ref->loaded = NULL; + bitmap_initialize (&ref->indep_loop, &lim_bitmap_obstack); + bitmap_initialize (&ref->dep_loop, &lim_bitmap_obstack); + ref->accesses_in_loop.create (1); +@@ -1434,6 +1437,27 @@ mark_ref_stored (im_mem_ref *ref, struct + loop = loop_outer (loop); + } + ++/* Set the LOOP bit in REF loaded bitmap and allocate that if ++ necessary. Return whether a bit was changed. */ ++ ++static bool ++set_ref_loaded_in_loop (im_mem_ref *ref, class loop *loop) ++{ ++ if (!ref->loaded) ++ ref->loaded = BITMAP_ALLOC (&lim_bitmap_obstack); ++ return bitmap_set_bit (ref->loaded, loop->num); ++} ++ ++/* Marks reference REF as loaded in LOOP. */ ++ ++static void ++mark_ref_loaded (im_mem_ref *ref, class loop *loop) ++{ ++ while (loop != current_loops->tree_root ++ && set_ref_loaded_in_loop (ref, loop)) ++ loop = loop_outer (loop); ++} ++ + /* Gathers memory references in statement STMT in LOOP, storing the + information about them in the memory_accesses structure. Marks + the vops accessed through unrecognized statements there as +@@ -1569,6 +1593,8 @@ gather_mem_refs_stmt (struct loop *loop, + bitmap_set_bit (&memory_accesses.refs_stored_in_loop[loop->num], ref->id); + mark_ref_stored (ref, loop); + } ++ else ++ mark_ref_loaded (ref, loop); + init_lim_data (stmt)->ref = ref->id; + return; + } +@@ -1956,6 +1982,8 @@ execute_sm_if_changed (edge ex, tree mem + gsi = gsi_start_bb (then_bb); + /* Insert actual store. */ + stmt = gimple_build_assign (unshare_expr (mem), tmp_var); ++ /* Make sure to not warn about maybe-uninit uses of tmp_var here. */ ++ gimple_set_no_warning (stmt, true); + gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING); + + edge e1 = single_succ_edge (new_bb); +@@ -2102,14 +2130,17 @@ execute_sm (struct loop *loop, vec + by move_computations after all dependencies. */ + gsi = gsi_for_stmt (first_mem_ref_loc (loop, ref)->stmt); + +- /* FIXME/TODO: For the multi-threaded variant, we could avoid this +- load altogether, since the store is predicated by a flag. 
We +- could, do the load only if it was originally in the loop. */ +- load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); +- lim_data = init_lim_data (load); +- lim_data->max_loop = loop; +- lim_data->tgt_loop = loop; +- gsi_insert_before (&gsi, load, GSI_SAME_STMT); ++ /* Avoid doing a load if there was no load of the ref in the loop. ++ Esp. when the ref is not always stored we cannot optimize it ++ away later. */ ++ if (ref->loaded && bitmap_bit_p (ref->loaded, loop->num)) ++ { ++ load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); ++ lim_data = init_lim_data (load); ++ lim_data->max_loop = loop; ++ lim_data->tgt_loop = loop; ++ gsi_insert_before (&gsi, load, GSI_SAME_STMT); ++ } + + if (multi_threaded_model_p) + { diff --git a/fix-ICE-during-pass-ccp.patch b/fix-ICE-during-pass-ccp.patch index 67d332d..15cb1df 100644 --- a/fix-ICE-during-pass-ccp.patch +++ b/fix-ICE-during-pass-ccp.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-PR-tree-optimization-94574-aarch64-ICE-during-GIMPLE.patch +f65cecabc32fe12b024253502af953e657e1a878 + diff -uprN a/gcc/testsuite/gcc.dg/pr94574.c b/gcc/testsuite/gcc.dg/pr94574.c --- a/gcc/testsuite/gcc.dg/pr94574.c 1970-01-01 00:00:00.000000000 +0000 +++ b/gcc/testsuite/gcc.dg/pr94574.c 2020-04-15 21:08:48.972000000 +0000 diff --git a/fix-ICE-in-compute_live_loop_exits.patch b/fix-ICE-in-compute_live_loop_exits.patch new file mode 100644 index 0000000..013ec83 --- /dev/null +++ b/fix-ICE-in-compute_live_loop_exits.patch @@ -0,0 +1,76 @@ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c 2020-07-09 11:05:23.136000000 +0800 +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O1 -fexceptions -fnon-call-exceptions -ftree-loop-vectorize -fno-tree-sink --param dse-max-alias-queries-per-store=2 -w" } */ ++ ++void ++di (int y9, int qw) ++{ ++ if ((int) &y9 != 0) ++ { ++ int py; ++ int **fq = &py; ++ ++ while (qw < 1) ++ { ++ if ((0 < (**fq ? 
**fq : (**fq = 1))) / (**fq = y9)) ++ ; ++ ++ ++qw; ++ } ++ } ++} +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr92085-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92085-2.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr92085-2.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92085-2.c 2020-07-09 11:05:23.136000000 +0800 +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O1 -ftree-loop-vectorize -fno-tree-dce -fno-tree-sink -w" } */ ++ ++int a8; ++ ++void ++c1 (int oz, int dk, int ub) ++{ ++ int *hd = 0; ++ long int *th = &dk; ++ ++ while (ub < 1) ++ { ++ oz || dk; ++ ++ub; ++ } ++ ++ while (oz < 2) ++ { ++ long int *lq = &oz; ++ ++ (*hd < (*lq = *th)) < oz; ++ ++ if (oz == 0) ++ *th = a8 = oz; ++ ++ *lq = 0; ++ } ++} +diff -Nurp a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c +--- a/gcc/tree-if-conv.c 2020-07-09 11:04:58.832000000 +0800 ++++ b/gcc/tree-if-conv.c 2020-07-09 11:05:23.136000000 +0800 +@@ -2984,10 +2984,11 @@ ifcvt_local_dce (class loop *loop) + ao_ref write; + ao_ref_init (&write, lhs); + +- if (dse_classify_store (&write, stmt, false, NULL, NULL, latch_vdef) +- == DSE_STORE_DEAD) +- delete_dead_or_redundant_assignment (&gsi, "dead"); +- gsi_next (&gsi); ++ if (dse_classify_store (&write, stmt, false, NULL, NULL, latch_vdef) ++ == DSE_STORE_DEAD) ++ delete_dead_or_redundant_assignment (&gsi, "dead"); ++ else ++ gsi_next (&gsi); + continue; + } + diff --git a/fix-ICE-in-copy_reference_ops_from_ref.patch b/fix-ICE-in-copy_reference_ops_from_ref.patch new file mode 100644 index 0000000..52c660d --- /dev/null +++ b/fix-ICE-in-copy_reference_ops_from_ref.patch @@ -0,0 +1,70 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-ssa-sccvn.c-copy_reference_ops_from_ref-Adjust-.patch +2f215d2176608467aeee73b245beedfc60836b71 + +diff -Nurp gcc-9.3.0_org/gcc/tree-ssa-sccvn.c gcc-9.3.0/gcc/tree-ssa-sccvn.c +--- gcc-9.3.0_org/gcc/tree-ssa-sccvn.c 2020-08-18 15:31:39.308000000 +0800 ++++ gcc-9.3.0/gcc/tree-ssa-sccvn.c 2020-08-18 15:32:03.456000000 +0800 +@@ -797,39 +797,6 @@ vn_reference_eq (const_vn_reference_t co + static void + copy_reference_ops_from_ref (tree ref, vec *result) + { +- if (TREE_CODE (ref) == TARGET_MEM_REF) +- { +- vn_reference_op_s temp; +- +- result->reserve (3); +- +- memset (&temp, 0, sizeof (temp)); +- temp.type = TREE_TYPE (ref); +- temp.opcode = TREE_CODE (ref); +- temp.op0 = TMR_INDEX (ref); +- temp.op1 = TMR_STEP (ref); +- temp.op2 = TMR_OFFSET (ref); +- temp.off = -1; +- temp.clique = MR_DEPENDENCE_CLIQUE (ref); +- temp.base = MR_DEPENDENCE_BASE (ref); +- result->quick_push (temp); +- +- memset (&temp, 0, sizeof (temp)); +- temp.type = NULL_TREE; +- temp.opcode = ERROR_MARK; +- temp.op0 = TMR_INDEX2 (ref); +- temp.off = -1; +- result->quick_push (temp); +- +- memset (&temp, 0, sizeof (temp)); +- temp.type = NULL_TREE; +- temp.opcode = TREE_CODE (TMR_BASE (ref)); +- temp.op0 = TMR_BASE (ref); +- temp.off = -1; +- result->quick_push (temp); +- return; +- } +- + /* For non-calls, store the information that makes up the address. */ + tree orig = ref; + while (ref) +@@ -859,6 +826,20 @@ copy_reference_ops_from_ref (tree ref, v + temp.base = MR_DEPENDENCE_BASE (ref); + temp.reverse = REF_REVERSE_STORAGE_ORDER (ref); + break; ++ case TARGET_MEM_REF: ++ /* The base address gets its own vn_reference_op_s structure. 
*/ ++ temp.op0 = TMR_INDEX (ref); ++ temp.op1 = TMR_STEP (ref); ++ temp.op2 = TMR_OFFSET (ref); ++ temp.clique = MR_DEPENDENCE_CLIQUE (ref); ++ temp.base = MR_DEPENDENCE_BASE (ref); ++ result->safe_push (temp); ++ memset (&temp, 0, sizeof (temp)); ++ temp.type = NULL_TREE; ++ temp.opcode = ERROR_MARK; ++ temp.op0 = TMR_INDEX2 (ref); ++ temp.off = -1; ++ break; + case BIT_FIELD_REF: + /* Record bits, position and storage order. */ + temp.op0 = TREE_OPERAND (ref, 1); diff --git a/fix-ICE-in-declare-return-variable.patch b/fix-ICE-in-declare-return-variable.patch new file mode 100644 index 0000000..4faa0cb --- /dev/null +++ b/fix-ICE-in-declare-return-variable.patch @@ -0,0 +1,31 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-ipa-92409-r277920-causes-ICE-in-gcc.dg-cast-fu.patch +e7399b548c866ee2e408e0855b3be794c056fb1d + +diff -uprN a/gcc/tree-inline.c b/gcc/tree-inline.c +--- a/gcc/tree-inline.c ++++ b/gcc/tree-inline.c +@@ -3593,7 +3593,9 @@ declare_return_variable (copy_body_data *id, tree return_slot, tree modify_dest, + vs. the call expression. */ + if (modify_dest) + caller_type = TREE_TYPE (modify_dest); +- else ++ else if (return_slot) ++ caller_type = TREE_TYPE (return_slot); ++ else /* No LHS on the call. */ + caller_type = TREE_TYPE (TREE_TYPE (callee)); + + /* We don't need to do anything for functions that don't return anything. */ +@@ -3634,6 +3636,10 @@ declare_return_variable (copy_body_data *id, tree return_slot, tree modify_dest, + && !DECL_GIMPLE_REG_P (result) + && DECL_P (var)) + DECL_GIMPLE_REG_P (var) = 0; ++ ++ if (!useless_type_conversion_p (callee_type, caller_type)) ++ var = build1 (VIEW_CONVERT_EXPR, callee_type, var); ++ + use = NULL; + goto done; + } diff --git a/fix-ICE-in-exact_div.patch b/fix-ICE-in-exact_div.patch new file mode 100644 index 0000000..a606157 --- /dev/null +++ b/fix-ICE-in-exact_div.patch @@ -0,0 +1,54 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92555-ICE-in-exact_div-at-po.patch +f1e0c7e0eb3eafb122fc3d00242828c82a9286a2 + +diff -Nurp a/gcc/testsuite/gcc.dg/vect/pr92555.c b/gcc/testsuite/gcc.dg/vect/pr92555.c +--- a/gcc/testsuite/gcc.dg/vect/pr92555.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/pr92555.c 2020-08-11 09:36:18.060000000 +0800 +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-fwrapv" } */ ++ ++signed char rq; ++ ++signed char ++pu (int tr, int al) ++{ ++ signed char x8; ++ ++ while (tr != 0) ++ { ++ for (x8 = 0; x8 >= 0; x8 += 2) ++ ; ++ ++ rq ^= al ^ 1; ++ ++x8; ++ ++tr; ++ } ++ ++ return x8; ++} +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-11 09:35:10.952000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-11 09:36:18.064000000 +0800 +@@ -1415,6 +1415,18 @@ vect_update_vf_for_slp (loop_vec_info lo + for (i = 0; i < nbbs; i++) + { + basic_block bb = bbs[i]; ++ for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); ++ gsi_next (&si)) ++ { ++ stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ()); ++ if (!stmt_info) ++ continue; ++ if ((STMT_VINFO_RELEVANT_P (stmt_info) ++ || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) ++ && !PURE_SLP_STMT (stmt_info)) ++ /* STMT needs both SLP and loop-based vectorization. 
*/ ++ only_slp_in_loop = false; ++ } + for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); + gsi_next (&si)) + { diff --git a/fix-ICE-in-gimple_op.patch b/fix-ICE-in-gimple_op.patch new file mode 100644 index 0000000..e6949dc --- /dev/null +++ b/fix-ICE-in-gimple_op.patch @@ -0,0 +1,65 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92512-ICE-in-gimple_op-at-gi.patch +b9f71c51cd578c6ab6ad2986edb80ba48aa477bc + +diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr92512.c b/gcc/testsuite/gcc.dg/torture/pr92512.c +--- a/gcc/testsuite/gcc.dg/torture/pr92512.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/torture/pr92512.c 2020-08-10 20:53:50.404000000 +0800 +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++long int ++nl (long int fy, int k3, int zr) ++{ ++ while (k3 < 1) ++ { ++ if (zr == 0) ++ fy = 0; ++ ++ fy *= fy < zr; ++ ++k3; ++ } ++ ++ return fy; ++} +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-10 20:53:42.636000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-10 20:53:50.404000000 +0800 +@@ -2931,9 +2931,11 @@ pop: + /* The following make sure we can compute the operand index + easily plus it mostly disallows chaining via COND_EXPR condition + operands. */ +- || (gimple_assign_rhs1 (use_stmt) != op +- && gimple_assign_rhs2 (use_stmt) != op +- && gimple_assign_rhs3 (use_stmt) != op)) ++ || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use ++ && (gimple_num_ops (use_stmt) <= 2 ++ || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use) ++ && (gimple_num_ops (use_stmt) <= 3 ++ || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use))) + { + fail = true; + break; +@@ -2946,7 +2948,18 @@ pop: + FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) + if (!is_gimple_debug (op_use_stmt) + && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) +- cnt++; ++ { ++ /* We want to allow x + x but not x < 1 ? x : 2. */ ++ if (is_gimple_assign (op_use_stmt) ++ && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR) ++ { ++ use_operand_p use_p; ++ FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) ++ cnt++; ++ } ++ else ++ cnt++; ++ } + if (cnt != 1) + { + fail = true; diff --git a/fix-ICE-in-model_update_limit_points_in_group.patch b/fix-ICE-in-model_update_limit_points_in_group.patch new file mode 100644 index 0000000..2692196 --- /dev/null +++ b/fix-ICE-in-model_update_limit_points_in_group.patch @@ -0,0 +1,248 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-re-PR-tree-optimization-88828-Inefficient-update-of-.patch +3bc104bdb4b5aa99ff6dceb246beaa65b012c5ac + +diff -Nurp a/gcc/testsuite/gcc.target/i386/pr88828-0.c b/gcc/testsuite/gcc.target/i386/pr88828-0.c +--- a/gcc/testsuite/gcc.target/i386/pr88828-0.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.target/i386/pr88828-0.c 2020-08-24 21:08:23.028000000 +0800 +@@ -0,0 +1,27 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse4.2" } */ ++ ++typedef int v4si __attribute__((vector_size(16))); ++typedef float v4sf __attribute__((vector_size(16))); ++ ++v4si foo (v4si x) ++{ ++ return (v4si){ x[0], 1, x[2], 3 }; ++} ++ ++/* { dg-final { scan-assembler "pblendw" } } */ ++ ++v4si bar (v4sf x) ++{ ++ return (v4si){ 1, x[1], x[2], 3 }; ++} ++ ++/* { dg-final { scan-assembler "cvttps2dq" } } */ ++/* { dg-final { scan-assembler "pblendw" } } */ ++ ++v4si baz (v4si x) ++{ ++ return (v4si) { x[1], x[2], x[3], 0 }; ++} ++ ++/* { dg-final { scan-assembler "psrldq" } } */ +diff -Nurp a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c +--- a/gcc/tree-ssa-forwprop.c 2020-08-24 21:07:59.800000000 +0800 ++++ b/gcc/tree-ssa-forwprop.c 2020-08-24 21:08:23.028000000 +0800 +@@ -1997,17 +1997,54 @@ simplify_permutation (gimple_stmt_iterat + return 0; + } + ++/* Get the BIT_FIELD_REF definition of VAL, if any, looking through ++ conversions with code CONV_CODE or update it if still ERROR_MARK. ++ Return NULL_TREE if no such matching def was found. */ ++ ++static tree ++get_bit_field_ref_def (tree val, enum tree_code &conv_code) ++{ ++ if (TREE_CODE (val) != SSA_NAME) ++ return NULL_TREE ; ++ gimple *def_stmt = get_prop_source_stmt (val, false, NULL); ++ if (!def_stmt) ++ return NULL_TREE; ++ enum tree_code code = gimple_assign_rhs_code (def_stmt); ++ if (code == FLOAT_EXPR ++ || code == FIX_TRUNC_EXPR) ++ { ++ tree op1 = gimple_assign_rhs1 (def_stmt); ++ if (conv_code == ERROR_MARK) ++ { ++ if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))), ++ GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) ++ return NULL_TREE; ++ conv_code = code; ++ } ++ else if (conv_code != code) ++ return NULL_TREE; ++ if (TREE_CODE (op1) != SSA_NAME) ++ return NULL_TREE; ++ def_stmt = SSA_NAME_DEF_STMT (op1); ++ if (! is_gimple_assign (def_stmt)) ++ return NULL_TREE; ++ code = gimple_assign_rhs_code (def_stmt); ++ } ++ if (code != BIT_FIELD_REF) ++ return NULL_TREE; ++ return gimple_assign_rhs1 (def_stmt); ++} ++ + /* Recognize a VEC_PERM_EXPR. Returns true if there were any changes. 
*/ + + static bool + simplify_vector_constructor (gimple_stmt_iterator *gsi) + { + gimple *stmt = gsi_stmt (*gsi); +- gimple *def_stmt; + tree op, op2, orig[2], type, elem_type; + unsigned elem_size, i; + unsigned HOST_WIDE_INT nelts; +- enum tree_code code, conv_code; ++ enum tree_code conv_code; + constructor_elt *elt; + bool maybe_ident; + +@@ -2027,6 +2064,9 @@ simplify_vector_constructor (gimple_stmt + orig[1] = NULL; + conv_code = ERROR_MARK; + maybe_ident = true; ++ tree one_constant = NULL_TREE; ++ auto_vec constants; ++ constants.safe_grow_cleared (nelts); + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) + { + tree ref, op1; +@@ -2034,68 +2074,57 @@ simplify_vector_constructor (gimple_stmt + if (i >= nelts) + return false; + +- if (TREE_CODE (elt->value) != SSA_NAME) +- return false; +- def_stmt = get_prop_source_stmt (elt->value, false, NULL); +- if (!def_stmt) +- return false; +- code = gimple_assign_rhs_code (def_stmt); +- if (code == FLOAT_EXPR +- || code == FIX_TRUNC_EXPR) ++ op1 = get_bit_field_ref_def (elt->value, conv_code); ++ if (op1) + { +- op1 = gimple_assign_rhs1 (def_stmt); +- if (conv_code == ERROR_MARK) ++ ref = TREE_OPERAND (op1, 0); ++ unsigned int j; ++ for (j = 0; j < 2; ++j) + { +- if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (elt->value))), +- GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) +- return false; +- conv_code = code; ++ if (!orig[j]) ++ { ++ if (TREE_CODE (ref) != SSA_NAME) ++ return false; ++ if (! VECTOR_TYPE_P (TREE_TYPE (ref)) ++ || ! useless_type_conversion_p (TREE_TYPE (op1), ++ TREE_TYPE (TREE_TYPE (ref)))) ++ return false; ++ if (j && !useless_type_conversion_p (TREE_TYPE (orig[0]), ++ TREE_TYPE (ref))) ++ return false; ++ orig[j] = ref; ++ break; ++ } ++ else if (ref == orig[j]) ++ break; + } +- else if (conv_code != code) ++ if (j == 2) + return false; +- if (TREE_CODE (op1) != SSA_NAME) +- return false; +- def_stmt = SSA_NAME_DEF_STMT (op1); +- if (! is_gimple_assign (def_stmt)) ++ ++ unsigned int elt; ++ if (maybe_ne (bit_field_size (op1), elem_size) ++ || !constant_multiple_p (bit_field_offset (op1), elem_size, &elt)) + return false; +- code = gimple_assign_rhs_code (def_stmt); ++ if (j) ++ elt += nelts; ++ if (elt != i) ++ maybe_ident = false; ++ sel.quick_push (elt); + } +- if (code != BIT_FIELD_REF) +- return false; +- op1 = gimple_assign_rhs1 (def_stmt); +- ref = TREE_OPERAND (op1, 0); +- unsigned int j; +- for (j = 0; j < 2; ++j) ++ else if (CONSTANT_CLASS_P (elt->value)) + { +- if (!orig[j]) +- { +- if (TREE_CODE (ref) != SSA_NAME) +- return false; +- if (! VECTOR_TYPE_P (TREE_TYPE (ref)) +- || ! 
useless_type_conversion_p (TREE_TYPE (op1), +- TREE_TYPE (TREE_TYPE (ref)))) +- return false; +- if (j && !useless_type_conversion_p (TREE_TYPE (orig[0]), +- TREE_TYPE (ref))) +- return false; +- orig[j] = ref; +- break; +- } +- else if (ref == orig[j]) +- break; ++ if (orig[1] ++ && orig[1] != error_mark_node) ++ return false; ++ orig[1] = error_mark_node; ++ if (!one_constant) ++ one_constant = elt->value; ++ constants[i] = elt->value; ++ sel.quick_push (i + nelts); ++ maybe_ident = false; + } +- if (j == 2) +- return false; +- +- unsigned int elt; +- if (maybe_ne (bit_field_size (op1), elem_size) +- || !constant_multiple_p (bit_field_offset (op1), elem_size, &elt)) ++ else + return false; +- if (j) +- elt += nelts; +- if (elt != i) +- maybe_ident = false; +- sel.quick_push (elt); + } + if (i < nelts) + return false; +@@ -2138,9 +2167,29 @@ simplify_vector_constructor (gimple_stmt + op2 = vec_perm_indices_to_tree (mask_type, indices); + if (!orig[1]) + orig[1] = orig[0]; ++ if (orig[1] == error_mark_node) ++ { ++ tree_vector_builder vec (type, nelts, 1); ++ for (unsigned i = 0; i < nelts; ++i) ++ if (constants[i]) ++ vec.quick_push (constants[i]); ++ else ++ /* ??? Push a don't-care value. */ ++ vec.quick_push (one_constant); ++ orig[1] = vec.build (); ++ } + if (conv_code == ERROR_MARK) + gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], + orig[1], op2); ++ else if (TREE_CODE (orig[1]) == VECTOR_CST) ++ { ++ gimple *conv ++ = gimple_build_assign (make_ssa_name (type), conv_code, orig[0]); ++ orig[0] = gimple_assign_lhs (conv); ++ gsi_insert_before (gsi, conv, GSI_SAME_STMT); ++ gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, ++ orig[0], orig[1], op2); ++ } + else + { + gimple *perm diff --git a/fix-ICE-in-reload.patch b/fix-ICE-in-reload.patch new file mode 100644 index 0000000..e1cd079 --- /dev/null +++ b/fix-ICE-in-reload.patch @@ -0,0 +1,369 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-Remove-gimple_call_types_likely_match_p-PR-70929.patch +7313607478c11e9455a32fb0dbfd7867e04ea96a + +diff -uprN a/gcc/auto-profile.c b/gcc/auto-profile.c +--- a/gcc/auto-profile.c 2020-03-31 09:51:52.000000000 +0800 ++++ b/gcc/auto-profile.c 2020-07-28 11:15:31.469393370 +0800 +@@ -605,8 +605,6 @@ function_instance::find_icall_target_map + get_identifier (afdo_string_table->get_name (callee))); + if (node == NULL) + continue; +- if (!check_ic_target (stmt, node)) +- continue; + (*map)[callee] = iter->second->total_count (); + ret += iter->second->total_count (); + } +@@ -1033,7 +1031,7 @@ afdo_indirect_call (gimple_stmt_iterator + print_generic_expr (dump_file, direct_call->decl, TDF_SLIM); + } + +- if (direct_call == NULL || !check_ic_target (stmt, direct_call)) ++ if (direct_call == NULL) + { + if (dump_file) + fprintf (dump_file, " not transforming\n"); +diff -uprN a/gcc/cgraph.c b/gcc/cgraph.c +--- a/gcc/cgraph.c 2020-07-28 11:18:05.385393370 +0800 ++++ b/gcc/cgraph.c 2020-07-28 11:15:31.469393370 +0800 +@@ -876,19 +876,8 @@ symbol_table::create_edge (cgraph_node * + edge->can_throw_external + = call_stmt ? 
stmt_can_throw_external (DECL_STRUCT_FUNCTION (caller->decl), + call_stmt) : false; +- if (call_stmt +- && callee && callee->decl +- && !gimple_check_call_matching_types (call_stmt, callee->decl, +- false)) +- { +- edge->inline_failed = CIF_MISMATCHED_ARGUMENTS; +- edge->call_stmt_cannot_inline_p = true; +- } +- else +- { +- edge->inline_failed = CIF_FUNCTION_NOT_CONSIDERED; +- edge->call_stmt_cannot_inline_p = false; +- } ++ edge->inline_failed = CIF_FUNCTION_NOT_CONSIDERED; ++ edge->call_stmt_cannot_inline_p = false; + + edge->indirect_info = NULL; + edge->indirect_inlining_edge = 0; +@@ -1253,13 +1242,6 @@ cgraph_edge::make_direct (cgraph_node *c + /* Insert to callers list of the new callee. */ + edge->set_callee (callee); + +- if (call_stmt +- && !gimple_check_call_matching_types (call_stmt, callee->decl, false)) +- { +- call_stmt_cannot_inline_p = true; +- inline_failed = CIF_MISMATCHED_ARGUMENTS; +- } +- + /* We need to re-determine the inlining status of the edge. */ + initialize_inline_failed (edge); + return edge; +@@ -1288,28 +1270,9 @@ cgraph_edge::redirect_call_stmt_to_calle + substitution), forget about speculating. */ + if (decl) + e = e->resolve_speculation (decl); +- /* If types do not match, speculation was likely wrong. +- The direct edge was possibly redirected to the clone with a different +- signature. We did not update the call statement yet, so compare it +- with the reference that still points to the proper type. */ +- else if (!gimple_check_call_matching_types (e->call_stmt, +- ref->referred->decl, +- true)) +- { +- if (dump_file) +- fprintf (dump_file, "Not expanding speculative call of %s -> %s\n" +- "Type mismatch.\n", +- e->caller->dump_name (), +- e->callee->dump_name ()); +- e = e->resolve_speculation (); +- /* We are producing the final function body and will throw away the +- callgraph edges really soon. Reset the counts/frequencies to +- keep verifier happy in the case of roundoff errors. */ +- e->count = gimple_bb (e->call_stmt)->count; +- } +- /* Expand speculation into GIMPLE code. */ + else + { ++ /* Expand speculation into GIMPLE code. */ + if (dump_file) + { + fprintf (dump_file, +@@ -3664,102 +3627,6 @@ cgraph_node::get_fun (void) + return fun; + } + +-/* Verify if the type of the argument matches that of the function +- declaration. If we cannot verify this or there is a mismatch, +- return false. */ +- +-static bool +-gimple_check_call_args (gimple *stmt, tree fndecl, bool args_count_match) +-{ +- tree parms, p; +- unsigned int i, nargs; +- +- /* Calls to internal functions always match their signature. */ +- if (gimple_call_internal_p (stmt)) +- return true; +- +- nargs = gimple_call_num_args (stmt); +- +- /* Get argument types for verification. */ +- if (fndecl) +- parms = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); +- else +- parms = TYPE_ARG_TYPES (gimple_call_fntype (stmt)); +- +- /* Verify if the type of the argument matches that of the function +- declaration. If we cannot verify this or there is a mismatch, +- return false. */ +- if (fndecl && DECL_ARGUMENTS (fndecl)) +- { +- for (i = 0, p = DECL_ARGUMENTS (fndecl); +- i < nargs; +- i++, p = DECL_CHAIN (p)) +- { +- tree arg; +- /* We cannot distinguish a varargs function from the case +- of excess parameters, still deferring the inlining decision +- to the callee is possible. 
*/ +- if (!p) +- break; +- arg = gimple_call_arg (stmt, i); +- if (p == error_mark_node +- || DECL_ARG_TYPE (p) == error_mark_node +- || arg == error_mark_node +- || (!types_compatible_p (DECL_ARG_TYPE (p), TREE_TYPE (arg)) +- && !fold_convertible_p (DECL_ARG_TYPE (p), arg))) +- return false; +- } +- if (args_count_match && p) +- return false; +- } +- else if (parms) +- { +- for (i = 0, p = parms; i < nargs; i++, p = TREE_CHAIN (p)) +- { +- tree arg; +- /* If this is a varargs function defer inlining decision +- to callee. */ +- if (!p) +- break; +- arg = gimple_call_arg (stmt, i); +- if (TREE_VALUE (p) == error_mark_node +- || arg == error_mark_node +- || TREE_CODE (TREE_VALUE (p)) == VOID_TYPE +- || (!types_compatible_p (TREE_VALUE (p), TREE_TYPE (arg)) +- && !fold_convertible_p (TREE_VALUE (p), arg))) +- return false; +- } +- } +- else +- { +- if (nargs != 0) +- return false; +- } +- return true; +-} +- +-/* Verify if the type of the argument and lhs of CALL_STMT matches +- that of the function declaration CALLEE. If ARGS_COUNT_MATCH is +- true, the arg count needs to be the same. +- If we cannot verify this or there is a mismatch, return false. */ +- +-bool +-gimple_check_call_matching_types (gimple *call_stmt, tree callee, +- bool args_count_match) +-{ +- tree lhs; +- +- if ((DECL_RESULT (callee) +- && !DECL_BY_REFERENCE (DECL_RESULT (callee)) +- && (lhs = gimple_call_lhs (call_stmt)) != NULL_TREE +- && !useless_type_conversion_p (TREE_TYPE (DECL_RESULT (callee)), +- TREE_TYPE (lhs)) +- && !fold_convertible_p (TREE_TYPE (DECL_RESULT (callee)), lhs)) +- || !gimple_check_call_args (call_stmt, callee, args_count_match)) +- return false; +- return true; +-} +- + /* Reset all state within cgraph.c so that we can rerun the compiler + within the same process. For use by toplev::finalize. */ + +diff -uprN a/gcc/cgraph.h b/gcc/cgraph.h +--- a/gcc/cgraph.h 2020-07-28 11:18:04.361393370 +0800 ++++ b/gcc/cgraph.h 2020-07-28 11:15:31.469393370 +0800 +@@ -2412,8 +2412,6 @@ bool cgraph_function_possibly_inlined_p + const char* cgraph_inline_failed_string (cgraph_inline_failed_t); + cgraph_inline_failed_type_t cgraph_inline_failed_type (cgraph_inline_failed_t); + +-extern bool gimple_check_call_matching_types (gimple *, tree, bool); +- + /* In cgraphunit.c */ + void cgraphunit_c_finalize (void); + +diff -uprN a/gcc/cif-code.def b/gcc/cif-code.def +--- a/gcc/cif-code.def 2020-03-31 09:51:52.000000000 +0800 ++++ b/gcc/cif-code.def 2020-07-28 11:15:31.469393370 +0800 +@@ -88,10 +88,6 @@ DEFCIFCODE(NOT_DECLARED_INLINED, CIF_FIN + N_("function not declared inline and code size would grow")) + + /* Caller and callee disagree on the arguments. */ +-DEFCIFCODE(MISMATCHED_ARGUMENTS, CIF_FINAL_ERROR, +- N_("mismatched arguments")) +- +-/* Caller and callee disagree on the arguments. 
*/ + DEFCIFCODE(LTO_MISMATCHED_DECLARATIONS, CIF_FINAL_ERROR, + N_("mismatched declarations during linktime optimization")) + +diff -uprN a/gcc/ipa-inline.c b/gcc/ipa-inline.c +--- a/gcc/ipa-inline.c 2020-07-28 11:18:04.377393370 +0800 ++++ b/gcc/ipa-inline.c 2020-07-28 11:15:31.469393370 +0800 +@@ -2844,14 +2844,6 @@ early_inliner (function *fun) + = estimate_num_insns (edge->call_stmt, &eni_size_weights); + es->call_stmt_time + = estimate_num_insns (edge->call_stmt, &eni_time_weights); +- +- if (edge->callee->decl +- && !gimple_check_call_matching_types ( +- edge->call_stmt, edge->callee->decl, false)) +- { +- edge->inline_failed = CIF_MISMATCHED_ARGUMENTS; +- edge->call_stmt_cannot_inline_p = true; +- } + } + if (iterations < PARAM_VALUE (PARAM_EARLY_INLINER_MAX_ITERATIONS) - 1) + ipa_update_overall_fn_summary (node); +diff -uprN a/gcc/ipa-prop.c b/gcc/ipa-prop.c +--- a/gcc/ipa-prop.c 2020-07-28 11:18:04.377393370 +0800 ++++ b/gcc/ipa-prop.c 2020-07-28 11:15:31.469393370 +0800 +@@ -3841,11 +3841,6 @@ update_indirect_edges_after_inlining (st + else if (new_direct_edge) + { + new_direct_edge->indirect_inlining_edge = 1; +- if (new_direct_edge->call_stmt) +- new_direct_edge->call_stmt_cannot_inline_p +- = !gimple_check_call_matching_types ( +- new_direct_edge->call_stmt, +- new_direct_edge->callee->decl, false); + if (new_edges) + { + new_edges->safe_push (new_direct_edge); +diff -uprN a/gcc/testsuite/gcc.dg/winline-10.c b/gcc/testsuite/gcc.dg/winline-10.c +--- a/gcc/testsuite/gcc.dg/winline-10.c 2020-03-31 09:51:43.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/winline-10.c 2020-07-28 11:15:31.473393370 +0800 +@@ -1,9 +1,9 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -Winline" } */ ++/* { dg-options "-O2 -Winline -fopt-info-optimized-inline=stderr" } */ + + struct s { int a; }; + +-inline void f (x) /* { dg-warning "inlining .* mismatched arg" } */ ++inline void f (x) + int x; + { + asm (""); +@@ -11,7 +11,7 @@ inline void f (x) /* { dg-warning "inlin + + void g (struct s x) + { +- f (x); /* { dg-message "called from here" } */ ++ f (x); /* { dg-optimized "Inlining f.* into g" } */ + } + + void f (int x); /* { dg-warning "follows non-prototype definition" } */ +diff -uprN a/gcc/testsuite/g++.dg/lto/pr70929_0.C b/gcc/testsuite/g++.dg/lto/pr70929_0.C +--- a/gcc/testsuite/g++.dg/lto/pr70929_0.C 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/g++.dg/lto/pr70929_0.C 2020-07-28 11:15:31.469393370 +0800 +@@ -0,0 +1,18 @@ ++// { dg-lto-do run } ++// { dg-lto-options { "-O3 -flto" } } ++ ++struct s ++{ ++ int a; ++ s() {a=1;} ++ ~s() {} ++}; ++int t(struct s s); ++int main() ++{ ++ s s; ++ int v=t(s); ++ if (!__builtin_constant_p (v)) ++ __builtin_abort (); ++ return 0; ++} +diff -uprN a/gcc/testsuite/g++.dg/lto/pr70929_1.C b/gcc/testsuite/g++.dg/lto/pr70929_1.C +--- a/gcc/testsuite/g++.dg/lto/pr70929_1.C 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/g++.dg/lto/pr70929_1.C 2020-07-28 11:15:31.473393370 +0800 +@@ -0,0 +1,10 @@ ++struct s ++{ ++ int a; ++ s() {a=1;} ++ ~s() {} ++}; ++int t(struct s s) ++{ ++ return s.a; ++} +diff -uprN a/gcc/value-prof.c b/gcc/value-prof.c +--- a/gcc/value-prof.c 2020-03-31 09:51:30.000000000 +0800 ++++ b/gcc/value-prof.c 2020-07-28 11:17:08.281393370 +0800 +@@ -1249,25 +1249,6 @@ find_func_by_profile_id (int profile_id) + return NULL; + } + +-/* Perform sanity check on the indirect call target. Due to race conditions, +- false function target may be attributed to an indirect call site. 
If the +- call expression type mismatches with the target function's type, expand_call +- may ICE. Here we only do very minimal sanity check just to make compiler happy. +- Returns true if TARGET is considered ok for call CALL_STMT. */ +- +-bool +-check_ic_target (gcall *call_stmt, struct cgraph_node *target) +-{ +- if (gimple_check_call_matching_types (call_stmt, target->decl, true)) +- return true; +- +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, call_stmt, +- "Skipping target %s with mismatching types for icall\n", +- target->name ()); +- return false; +-} +- + /* Do transformation + + if (actual_callee_address == address_of_most_common_function/method) +@@ -1473,21 +1454,6 @@ gimple_ic_transform (gimple_stmt_iterato + return false; + } + +- if (!check_ic_target (stmt, direct_call)) +- { +- if (dump_file) +- { +- fprintf (dump_file, "Indirect call -> direct call "); +- print_generic_expr (dump_file, gimple_call_fn (stmt), TDF_SLIM); +- fprintf (dump_file, "=> "); +- print_generic_expr (dump_file, direct_call->decl, TDF_SLIM); +- fprintf (dump_file, " transformation skipped because of type mismatch"); +- print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM); +- } +- gimple_remove_histogram_value (cfun, stmt, histogram); +- return false; +- } +- + if (dump_file) + { + fprintf (dump_file, "Indirect call -> direct call "); diff --git a/fix-ICE-in-store_constructor.patch b/fix-ICE-in-store_constructor.patch new file mode 100644 index 0000000..98cae50 --- /dev/null +++ b/fix-ICE-in-store_constructor.patch @@ -0,0 +1,356 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-SLP-SLP-vectorization-vectorize-vector-constructors.patch +818b3293f4545d899148810f4f7d676b81e989dd + +diff -N -urp a/gcc/expr.c b/gcc/expr.c +--- a/gcc/expr.c 2020-07-24 11:19:53.840000000 +0800 ++++ b/gcc/expr.c 2020-07-24 11:56:50.128000000 +0800 +@@ -6788,6 +6788,7 @@ store_constructor (tree exp, rtx target, + && n_elts.is_constant (&const_n_elts)) + { + machine_mode emode = eltmode; ++ bool vector_typed_elts_p = false; + + if (CONSTRUCTOR_NELTS (exp) + && (TREE_CODE (TREE_TYPE (CONSTRUCTOR_ELT (exp, 0)->value)) +@@ -6798,13 +6799,14 @@ store_constructor (tree exp, rtx target, + * TYPE_VECTOR_SUBPARTS (etype), + n_elts)); + emode = TYPE_MODE (etype); ++ vector_typed_elts_p = true; + } + icode = convert_optab_handler (vec_init_optab, mode, emode); + if (icode != CODE_FOR_nothing) + { + unsigned int i, n = const_n_elts; + +- if (emode != eltmode) ++ if (vector_typed_elts_p) + { + n = CONSTRUCTOR_NELTS (exp); + vec_vec_init_p = true; +diff -N -urp a/gcc/testsuite/gcc.dg/vect/bb-slp-40.c b/gcc/testsuite/gcc.dg/vect/bb-slp-40.c +--- a/gcc/testsuite/gcc.dg/vect/bb-slp-40.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-40.c 2020-07-24 11:56:50.128000000 +0800 +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -fdump-tree-slp-all" } */ ++/* { dg-require-effective-target vect_int } */ ++ ++char g_d[1024], g_s1[1024], g_s2[1024]; ++void foo(void) ++{ ++ char *d = g_d, *s1 = g_s1, *s2 = g_s2; ++ ++ for ( int y = 0; y < 128; y++ ) ++ { ++ d[0 ] = s1[0 ] + s2[0 ]; ++ d[1 ] = s1[1 ] + s2[1 ]; ++ d[2 ] = s1[2 ] + s2[2 ]; ++ d[3 ] = s1[3 ] + s2[3 ]; ++ d[4 ] = s1[4 ] + s2[4 ]; ++ d[5 ] = s1[5 ] + s2[5 ]; ++ d[6 ] = s1[6 ] + s2[6 ]; ++ d[7 ] = s1[7 ] + s2[7 ]; ++ d[8 ] = s1[8 ] + s2[8 ]; ++ d[9 ] = s1[9 ] + s2[9 ]; ++ d[10] = s1[10] + s2[10]; ++ d[11] = s1[11] + s2[11]; ++ d[12] = 
s1[12] + s2[12]; ++ d[13] = s1[13] + s2[13]; ++ d[14] = s1[14] + s2[14]; ++ d[15] = s1[15] + s2[15]; ++ d += 16; ++ } ++} ++ ++/* See that we vectorize an SLP instance. */ ++/* { dg-final { scan-tree-dump-times "Found vectorizable constructor" 1 "slp1" } } */ ++/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "slp1" } } */ +diff -N -urp a/gcc/testsuite/gcc.dg/vect/bb-slp-41.c b/gcc/testsuite/gcc.dg/vect/bb-slp-41.c +--- a/gcc/testsuite/gcc.dg/vect/bb-slp-41.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-41.c 2020-07-24 11:56:50.128000000 +0800 +@@ -0,0 +1,61 @@ ++/* { dg-do run } */ ++/* { dg-options "-O3 -fdump-tree-slp-all -fno-vect-cost-model" } */ ++/* { dg-require-effective-target vect_int } */ ++ ++#define ARR_SIZE 1000 ++ ++void foo (int *a, int *b) ++{ ++ int i; ++ for (i = 0; i < (ARR_SIZE - 2); ++i) ++ a[i] = b[0] + b[1] + b[i+1] + b[i+2]; ++} ++ ++void bar (int *a, int *b) ++{ ++ int i; ++ for (i = 0; i < (ARR_SIZE - 2); ++i) ++ { ++ a[i] = b[0]; ++ } ++ for (i = 0; i < (ARR_SIZE - 2); ++i) ++ { ++ a[i] = a[i] + b[1]; ++ } ++ for (i = 0; i < (ARR_SIZE - 2); ++i) ++ { ++ a[i] = a[i] + b[i+1]; ++ } ++ for (i = 0; i < (ARR_SIZE - 2); ++i) ++ { ++ a[i] = a[i] + b[i+2]; ++ } ++} ++ ++int main () ++{ ++ int a1[ARR_SIZE]; ++ int a2[ARR_SIZE]; ++ int b[ARR_SIZE]; ++ int i; ++ ++ for (i = 0; i < ARR_SIZE; i++) ++ { ++ a1[i] = 0; ++ a2[i] = 0; ++ b[i] = i; ++ } ++ ++ foo (a1, b); ++ bar (a2, b); ++ ++ for (i = 0; i < ARR_SIZE; i++) ++ if (a1[i] != a2[i]) ++ return 1; ++ ++ return 0; ++ ++} ++/* See that we vectorize an SLP instance. */ ++/* { dg-final { scan-tree-dump-times "Found vectorizable constructor" 12 "slp1" } } */ ++/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "slp1" } } */ +diff -N -urp a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +--- a/gcc/tree-vectorizer.h 2020-07-24 11:19:51.976000000 +0800 ++++ b/gcc/tree-vectorizer.h 2020-07-24 11:56:50.132000000 +0800 +@@ -151,6 +151,10 @@ typedef struct _slp_instance { + /* The root of SLP tree. */ + slp_tree root; + ++ /* For vector constructors, the constructor stmt that the SLP tree is built ++ from, NULL otherwise. */ ++ stmt_vec_info root_stmt; ++ + /* Size of groups of scalar stmts that will be replaced by SIMD stmt/s. 
*/ + unsigned int group_size; + +@@ -170,6 +174,7 @@ typedef struct _slp_instance { + #define SLP_INSTANCE_GROUP_SIZE(S) (S)->group_size + #define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor + #define SLP_INSTANCE_LOADS(S) (S)->loads ++#define SLP_INSTANCE_ROOT_STMT(S) (S)->root_stmt + + #define SLP_TREE_CHILDREN(S) (S)->children + #define SLP_TREE_SCALAR_STMTS(S) (S)->stmts +diff -N -urp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-07-24 11:19:51.980000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-07-24 11:56:50.132000000 +0800 +@@ -2019,6 +2019,7 @@ vect_analyze_slp_instance (vec_info *vin + unsigned int i; + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + vec scalar_stmts; ++ bool constructor = false; + + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { +@@ -2032,6 +2033,13 @@ vect_analyze_slp_instance (vec_info *vin + vectype = STMT_VINFO_VECTYPE (stmt_info); + group_size = REDUC_GROUP_SIZE (stmt_info); + } ++ else if (is_gimple_assign (stmt_info->stmt) ++ && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR) ++ { ++ vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)); ++ group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt)); ++ constructor = true; ++ } + else + { + gcc_assert (is_a (vinfo)); +@@ -2079,6 +2087,25 @@ vect_analyze_slp_instance (vec_info *vin + STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) + = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); + } ++ else if (constructor) ++ { ++ tree rhs = gimple_assign_rhs1 (stmt_info->stmt); ++ tree val; ++ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val) ++ { ++ if (TREE_CODE (val) == SSA_NAME) ++ { ++ gimple* def = SSA_NAME_DEF_STMT (val); ++ stmt_vec_info def_info = vinfo->lookup_stmt (def); ++ /* Value is defined in another basic block. */ ++ if (!def_info) ++ return false; ++ scalar_stmts.safe_push (def_info); ++ } ++ else ++ return false; ++ } ++ } + else + { + /* Collect reduction statements. */ +@@ -2164,6 +2191,8 @@ vect_analyze_slp_instance (vec_info *vin + SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_LOADS (new_instance) = vNULL; ++ SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL; ++ + vect_gather_slp_loads (new_instance, node); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +@@ -3032,6 +3061,43 @@ vect_bb_vectorization_profitable_p (bb_v + return true; + } + ++/* Find any vectorizable constructors and add them to the grouped_store ++ array. 
*/ ++ ++static void ++vect_slp_check_for_constructors (bb_vec_info bb_vinfo) ++{ ++ gimple_stmt_iterator gsi; ++ ++ for (gsi = bb_vinfo->region_begin; ++ gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ ++ if (is_gimple_assign (stmt) ++ && gimple_assign_rhs_code (stmt) == CONSTRUCTOR ++ && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME ++ && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE) ++ { ++ tree rhs = gimple_assign_rhs1 (stmt); ++ ++ if (CONSTRUCTOR_NELTS (rhs) == 0) ++ continue; ++ ++ poly_uint64 subparts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)); ++ ++ if (maybe_ne (subparts, CONSTRUCTOR_NELTS (rhs))) ++ continue; ++ ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Found vectorizable constructor: %G\n", stmt); ++ stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt); ++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info); ++ } ++ } ++} ++ + /* Check if the region described by BB_VINFO can be vectorized, returning + true if so. When returning false, set FATAL to true if the same failure + would prevent vectorization at other vector sizes, false if it is still +@@ -3079,6 +3145,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vi + return false; + } + ++ vect_slp_check_for_constructors (bb_vinfo); ++ + /* If there are no grouped stores in the region there is no need + to continue with pattern recog as vect_analyze_slp will fail + anyway. */ +@@ -3135,6 +3203,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vi + relevant. */ + vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance)); + vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); ++ if (SLP_INSTANCE_ROOT_STMT (instance)) ++ STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp; + + i++; + } +@@ -4175,6 +4245,49 @@ vect_remove_slp_scalar_calls (slp_tree n + vect_remove_slp_scalar_calls (node, visited); + } + ++/* Vectorize the instance root. */ ++ ++void ++vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance) ++{ ++ gassign *rstmt; ++ ++ if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1) ++ { ++ stmt_vec_info child_stmt_info; ++ int j; ++ ++ FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info) ++ { ++ tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt); ++ tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt); ++ rstmt = gimple_build_assign (root_lhs, vect_lhs); ++ break; ++ } ++ } ++ else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1) ++ { ++ int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++ stmt_vec_info child_stmt_info; ++ int j; ++ vec *v; ++ vec_alloc (v, nelts); ++ ++ FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info) ++ { ++ CONSTRUCTOR_APPEND_ELT (v, ++ NULL_TREE, ++ gimple_get_lhs (child_stmt_info->stmt)); ++ } ++ tree lhs = gimple_get_lhs (instance->root_stmt->stmt); ++ tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt)); ++ tree r_constructor = build_constructor (rtype, v); ++ rstmt = gimple_build_assign (lhs, r_constructor); ++ } ++ gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt); ++ gsi_replace (&rgsi, rstmt, true); ++} ++ + /* Generate vector code for all SLP instances in the loop/basic block. */ + + void +@@ -4189,9 +4302,13 @@ vect_schedule_slp (vec_info *vinfo) + slp_instances = vinfo->slp_instances; + FOR_EACH_VEC_ELT (slp_instances, i, instance) + { ++ slp_tree node = SLP_INSTANCE_TREE (instance); + /* Schedule the tree of INSTANCE. 
*/ +- vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), +- instance, bst_map); ++ vect_schedule_slp_instance (node, instance, bst_map); ++ ++ if (SLP_INSTANCE_ROOT_STMT (instance)) ++ vectorize_slp_instance_root_stmt (node, instance); ++ + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vectorizing stmts using SLP.\n"); +@@ -4220,6 +4337,9 @@ vect_schedule_slp (vec_info *vinfo) + if (!STMT_VINFO_DATA_REF (store_info)) + break; + ++ if (SLP_INSTANCE_ROOT_STMT (instance)) ++ continue; ++ + store_info = vect_orig_stmt (store_info); + /* Free the attached stmt_vec_info and remove the stmt. */ + vinfo->remove_stmt (store_info); diff --git a/fix-ICE-in-vec.patch b/fix-ICE-in-vec.patch new file mode 100644 index 0000000..30d1c7e --- /dev/null +++ b/fix-ICE-in-vec.patch @@ -0,0 +1,93 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92345-ICE-in-vec-_stmt_vec_i.patch +a6ba623777513e31721030092e4d786f461a0f06 + +diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr92345.c b/gcc/testsuite/gcc.dg/torture/pr92345.c +--- a/gcc/testsuite/gcc.dg/torture/pr92345.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/torture/pr92345.c 2020-08-10 15:08:19.992000000 +0800 +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++long int x1; ++int fr; ++ ++int ++us (int sk, int jx) ++{ ++ while (sk < 1) ++ { ++ jx *= 2; ++ fr += x1 + 1; ++ ++sk; ++ } ++ ++ return jx; ++} +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-10 15:07:44.456000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-10 15:08:19.992000000 +0800 +@@ -155,7 +155,7 @@ along with GCC; see the file COPYING3. + + static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); + static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, +- bool *); ++ bool *, bool *); + + /* Subroutine of vect_determine_vf_for_stmt that handles only one + statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE +@@ -489,7 +489,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_i + tree init, step; + auto_vec worklist; + gphi_iterator gsi; +- bool double_reduc; ++ bool double_reduc, reduc_chain; + + DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles"); + +@@ -561,7 +561,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_i + && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); + + stmt_vec_info reduc_stmt_info +- = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc); ++ = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc, ++ &reduc_chain); + if (reduc_stmt_info) + { + STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; +@@ -596,7 +597,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_i + /* Store the reduction cycles for possible vectorization in + loop-aware SLP if it was not detected as reduction + chain. */ +- if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) ++ if (! 
reduc_chain) + LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push + (reduc_stmt_info); + } +@@ -3032,7 +3033,7 @@ check_reduction_path (dump_user_location + + static stmt_vec_info + vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, +- bool *double_reduc) ++ bool *double_reduc, bool *reduc_chain_p) + { + gphi *phi = as_a (phi_info->stmt); + gimple *phi_use_stmt = NULL; +@@ -3040,6 +3041,7 @@ vect_is_simple_reduction (loop_vec_info + use_operand_p use_p; + + *double_reduc = false; ++ *reduc_chain_p = false; + STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; + + tree phi_name = PHI_RESULT (phi); +@@ -3214,6 +3216,7 @@ vect_is_simple_reduction (loop_vec_info + LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); + REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); + ++ *reduc_chain_p = true; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "reduction: detected reduction chain\n"); diff --git a/fix-ICE-in-vect_create_epilog_for_reduction.patch b/fix-ICE-in-vect_create_epilog_for_reduction.patch new file mode 100644 index 0000000..fef451b --- /dev/null +++ b/fix-ICE-in-vect_create_epilog_for_reduction.patch @@ -0,0 +1,81 @@ +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-07-09 10:42:35.824000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-07-09 10:43:23.920000000 +0800 +@@ -1143,7 +1143,9 @@ vect_compute_single_scalar_iteration_cos + else + kind = scalar_store; + } +- else ++ else if (vect_nop_conversion_p (stmt_info)) ++ continue; ++ else + kind = scalar_stmt; + + record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), +diff -Nurp a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +--- a/gcc/tree-vectorizer.h 2020-07-09 10:42:35.824000000 +0800 ++++ b/gcc/tree-vectorizer.h 2020-07-09 10:43:23.920000000 +0800 +@@ -1645,6 +1645,7 @@ extern tree vect_get_vec_def_for_stmt_co + extern bool vect_transform_stmt (stmt_vec_info, gimple_stmt_iterator *, + slp_tree, slp_instance); + extern void vect_remove_stores (stmt_vec_info); ++extern bool vect_nop_conversion_p (stmt_vec_info); + extern opt_result vect_analyze_stmt (stmt_vec_info, bool *, slp_tree, + slp_instance, stmt_vector_for_cost *); + extern void vect_get_load_cost (stmt_vec_info, int, bool, +diff -Nurp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-07-09 10:42:35.736000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-07-09 10:43:23.920000000 +0800 +@@ -2940,6 +2940,8 @@ vect_bb_slp_scalar_cost (basic_block bb, + else + kind = scalar_store; + } ++ else if (vect_nop_conversion_p (stmt_info)) ++ continue; + else + kind = scalar_stmt; + record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body); +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-07-09 10:42:35.732000000 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-07-09 10:43:23.920000000 +0800 +@@ -5283,6 +5283,29 @@ vectorizable_conversion (stmt_vec_info s + return true; + } + ++/* Return true if we can assume from the scalar form of STMT_INFO that ++ neither the scalar nor the vector forms will generate code. STMT_INFO ++ is known not to involve a data reference. 
*/ ++ ++bool ++vect_nop_conversion_p (stmt_vec_info stmt_info) ++{ ++ gassign *stmt = dyn_cast (stmt_info->stmt); ++ if (!stmt) ++ return false; ++ ++ tree lhs = gimple_assign_lhs (stmt); ++ tree_code code = gimple_assign_rhs_code (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ ++ if (code == SSA_NAME || code == VIEW_CONVERT_EXPR) ++ return true; ++ ++ if (CONVERT_EXPR_CODE_P (code)) ++ return tree_nop_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs)); ++ ++ return false; ++} + + /* Function vectorizable_assignment. + +@@ -5398,7 +5421,9 @@ vectorizable_assignment (stmt_vec_info s + { + STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type; + DUMP_VECT_SCOPE ("vectorizable_assignment"); +- vect_model_simple_cost (stmt_info, ncopies, dt, ndts, slp_node, cost_vec); ++ if (!vect_nop_conversion_p (stmt_info)) ++ vect_model_simple_cost (stmt_info, ncopies, dt, ndts, slp_node, ++ cost_vec); + return true; + } + diff --git a/fix-ICE-in-vect_create_epilog_for_reduction_2.patch b/fix-ICE-in-vect_create_epilog_for_reduction_2.patch new file mode 100644 index 0000000..1130c05 --- /dev/null +++ b/fix-ICE-in-vect_create_epilog_for_reduction_2.patch @@ -0,0 +1,33 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92162-ICE-in-vect_create_epi.patch +53b15ca96116544a7a3ca8bc5f4e1649b74f3d45 + +diff -Nurp gcc-9.3.0_org/gcc/tree-vect-loop.c gcc-9.3.0/gcc/tree-vect-loop.c +--- gcc-9.3.0_org/gcc/tree-vect-loop.c 2020-08-17 10:23:55.768000000 +0800 ++++ gcc-9.3.0/gcc/tree-vect-loop.c 2020-08-17 10:27:15.848000000 +0800 +@@ -4574,9 +4574,9 @@ vect_create_epilog_for_reduction (stmt_v + (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR: + the reduction phi corresponds to NEW_PHI_TREE and the new values + correspond to INDEX_BEFORE_INCR. */ +- gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1); ++ gcc_assert (STMT_VINFO_REDUC_IDX (reduc_info) >= 1); + tree index_cond_expr; +- if (STMT_VINFO_REDUC_IDX (stmt_info) == 2) ++ if (STMT_VINFO_REDUC_IDX (reduc_info) == 2) + index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, + ccompare, indx_before_incr, new_phi_tree); + else +diff -Nurp gcc-9.3.0_org/gcc/tree-vect-stmts.c gcc-9.3.0/gcc/tree-vect-stmts.c +--- gcc-9.3.0_org/gcc/tree-vect-stmts.c 2020-08-17 10:23:53.960000000 +0800 ++++ gcc-9.3.0/gcc/tree-vect-stmts.c 2020-08-17 10:27:15.848000000 +0800 +@@ -9077,7 +9077,7 @@ vectorizable_condition (stmt_vec_info st + return false; + reduc_info = info_for_reduction (stmt_info); + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); +- reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); ++ reduc_index = STMT_VINFO_REDUC_IDX (reduc_info); + gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION + || reduc_index != -1); + } diff --git a/fix-ICE-in-vect_create_epilog_for_reduction_3.patch b/fix-ICE-in-vect_create_epilog_for_reduction_3.patch new file mode 100644 index 0000000..f74be1e --- /dev/null +++ b/fix-ICE-in-vect_create_epilog_for_reduction_3.patch @@ -0,0 +1,87 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-re-PR-tree-optimization-92554-ICE-in-vect_create_epi.patch +04c4599d30b1eb7c21d39b15a685aa1d9b8bf968 + +diff -Nurp a/gcc/testsuite/gcc.dg/vect/pr92554.c b/gcc/testsuite/gcc.dg/vect/pr92554.c +--- a/gcc/testsuite/gcc.dg/vect/pr92554.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/pr92554.c 2020-08-17 11:08:28.424000000 +0800 +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++ ++short int w9; ++ ++void __attribute__ ((simd)) ++zc (int in) ++{ ++ int va = 1; ++ ++ w9 *= va != 0 ? in < 0 : 0; ++} +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-17 10:41:56.756000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-17 11:09:36.474259880 +0800 +@@ -4515,12 +4515,21 @@ vect_create_epilog_for_reduction (stmt_v + zeroes. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) + { +- tree indx_before_incr, indx_after_incr; +- poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); +- +- gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt; ++ stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); ++ cond_info = vect_stmt_to_vectorize (cond_info); ++ while (gimple_assign_rhs_code (cond_info->stmt) != COND_EXPR) ++ { ++ cond_info ++ = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, ++ 1 + STMT_VINFO_REDUC_IDX ++ (cond_info))); ++ cond_info = vect_stmt_to_vectorize (cond_info); ++ } ++ gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt; + gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); + ++ tree indx_before_incr, indx_after_incr; ++ poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); + int scalar_precision + = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); + tree cr_index_scalar_type = make_unsigned_type (scalar_precision); +@@ -4574,9 +4583,9 @@ vect_create_epilog_for_reduction (stmt_v + (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR: + the reduction phi corresponds to NEW_PHI_TREE and the new values + correspond to INDEX_BEFORE_INCR. */ +- gcc_assert (STMT_VINFO_REDUC_IDX (reduc_info) >= 1); ++ gcc_assert (STMT_VINFO_REDUC_IDX (cond_info) >= 1); + tree index_cond_expr; +- if (STMT_VINFO_REDUC_IDX (reduc_info) == 2) ++ if (STMT_VINFO_REDUC_IDX (cond_info) == 2) + index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, + ccompare, indx_before_incr, new_phi_tree); + else +@@ -4772,10 +4781,11 @@ vect_create_epilog_for_reduction (stmt_v + be zero. */ + + /* Vector of {0, 0, 0,...}. */ +- tree zero_vec = make_ssa_name (vectype); +- tree zero_vec_rhs = build_zero_cst (vectype); +- gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); +- gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); ++ tree zero_vec = build_zero_cst (vectype); ++ ++ gimple_seq stmts = NULL; ++ new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); ++ gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + + /* Find maximum value from the vector of found indexes. */ + tree max_index = make_ssa_name (index_scalar_type); +@@ -4843,7 +4853,7 @@ vect_create_epilog_for_reduction (stmt_v + + /* Convert the reduced value back to the result type and set as the + result. 
*/ +- gimple_seq stmts = NULL; ++ stmts = NULL; + new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, + data_reduc); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); diff --git a/fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch b/fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch new file mode 100644 index 0000000..b40d5e3 --- /dev/null +++ b/fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch @@ -0,0 +1,54 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92161-ICE-in-vect_get_vec_de.patch +ae7f3143a3876378d051e64c8e68718f27c41075 + +diff -Nurp a/gcc/testsuite/gfortran.dg/pr92161.f b/gcc/testsuite/gfortran.dg/pr92161.f +--- a/gcc/testsuite/gfortran.dg/pr92161.f 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gfortran.dg/pr92161.f 2020-08-17 10:18:05.996000000 +0800 +@@ -0,0 +1,23 @@ ++! { dg-do compile } ++! { dg-options "-O1 -ftree-loop-vectorize -fno-signed-zeros -fno-trapping-math" } ++! { dg-additional-options "-mvsx" { target { powerpc*-*-* } } } ++ COMPLEX FUNCTION R1 (ZR, CC, EA, U6) ++ ++ INTEGER ZR, U6, FZ, J2 ++ COMPLEX EA(*), CC ++ DOUBLE PRECISION OS, GA, YU, XT ++ ++ OS = DBLE(REAL(CC)) ++ GA = DBLE(AIMAG(CC)) ++ J2 = 1 ++ ++ DO 5 FZ = 1, ZR ++ YU = DBLE(REAL(EA(J2))) ++ XT = DBLE(AIMAG(EA(J2))) ++ OS = OS + (YU * 2) - (XT * 2) ++ GA = GA + (YU * 3) + (XT * 3) ++ J2 = J2 + U6 ++ 5 CONTINUE ++ R1 = CMPLX(REAL(OS), REAL(GA)) ++ RETURN ++ END +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-17 10:17:08.288000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-17 10:18:05.996000000 +0800 +@@ -2339,6 +2339,17 @@ again: + { + stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); + STMT_SLP_TYPE (stmt_info) = loop_vect; ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def ++ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) ++ { ++ /* vectorizable_reduction adjusts reduction stmt def-types, ++ restore them to that of the PHI. */ ++ STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info)) ++ = STMT_VINFO_DEF_TYPE (stmt_info); ++ STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize ++ (STMT_VINFO_REDUC_DEF (stmt_info))) ++ = STMT_VINFO_DEF_TYPE (stmt_info); ++ } + } + for (gimple_stmt_iterator si = gsi_start_bb (bb); + !gsi_end_p (si); gsi_next (&si)) diff --git a/fix-ICE-in-vect_slp_analyze_node_operations.patch b/fix-ICE-in-vect_slp_analyze_node_operations.patch new file mode 100644 index 0000000..5f5d336 --- /dev/null +++ b/fix-ICE-in-vect_slp_analyze_node_operations.patch @@ -0,0 +1,381 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-re-PR-tree-optimization-92516-ICE-in-vect_schedule_s.patch +10a73df76280e12886cb20b028727436d73724c5 + +diff -Nurp a/gcc/testsuite/gcc.dg/vect/vect-ctor-1.c b/gcc/testsuite/gcc.dg/vect/vect-ctor-1.c +--- a/gcc/testsuite/gcc.dg/vect/vect-ctor-1.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/vect-ctor-1.c 2020-08-17 10:33:56.052000000 +0800 +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-O3" } */ ++/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */ ++ ++typedef struct { ++ unsigned short mprr_2[5][16][16]; ++} ImageParameters; ++int s[16][2]; ++void intrapred_luma_16x16(ImageParameters *img, int s0) ++{ ++ for (int j=0; j < 16; j++) ++ for (int i=0; i < 16; i++) ++ { ++ img->mprr_2[1 ][j][i]=s[j][1]; ++ img->mprr_2[2 ][j][i]=s0; ++ } ++} +diff -Nurp a/gcc/testsuite/g++.dg/vect/slp-pr92516.cc b/gcc/testsuite/g++.dg/vect/slp-pr92516.cc +--- a/gcc/testsuite/g++.dg/vect/slp-pr92516.cc 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/g++.dg/vect/slp-pr92516.cc 2020-08-17 10:33:56.052000000 +0800 +@@ -0,0 +1,43 @@ ++// { dg-do compile } ++// { dg-require-effective-target c++14 } ++ ++class a { ++public: ++ typedef int b; ++ operator b(); ++}; ++class c { ++public: ++ constexpr int m_fn1() const; ++ constexpr int d() const; ++ int e; ++ int f; ++}; ++constexpr int c::m_fn1() const { return e; } ++constexpr int c::d() const { return f; } ++class g { ++public: ++ g(); ++ constexpr void i(const c &) noexcept; ++ int j; ++ int k; ++ int l; ++ int m; ++}; ++constexpr void g::i(const c &n) noexcept { ++ int v = l - j, h = m - k; ++ j = n.m_fn1() - v / 2; ++ k = n.d() - h / 2; ++ l = j + v; ++ m = k + h; ++} ++class o { ++ void m_fn4() const; ++ a p; ++} r; ++void o::m_fn4() const { ++ g q; ++ c t; ++ q.i(t); ++ r.p || 0; ++} +diff -Nurp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-08-17 10:31:58.236000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-08-17 10:36:40.976796520 +0800 +@@ -2010,6 +2010,7 @@ calculate_unrolling_factor (poly_uint64 + + static bool + vect_analyze_slp_instance (vec_info *vinfo, ++ scalar_stmts_to_slp_tree_map_t *bst_map, + stmt_vec_info stmt_info, unsigned max_tree_size) + { + slp_instance new_instance; +@@ -2117,19 +2118,11 @@ vect_analyze_slp_instance (vec_info *vin + /* Build the tree for the SLP instance. */ + bool *matches = XALLOCAVEC (bool, group_size); + unsigned npermutes = 0; +- scalar_stmts_to_slp_tree_map_t *bst_map +- = new scalar_stmts_to_slp_tree_map_t (); + poly_uint64 max_nunits = nunits; + unsigned tree_size = 0; + node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + &max_nunits, matches, &npermutes, + &tree_size, bst_map); +- /* The map keeps a reference on SLP nodes built, release that. */ +- for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); +- it != bst_map->end (); ++it) +- if ((*it).second) +- vect_free_slp_tree ((*it).second, false); +- delete bst_map; + if (node != NULL) + { + /* If this is a reduction chain with a conversion in front +@@ -2183,6 +2176,18 @@ vect_analyze_slp_instance (vec_info *vin + matches[group_size / const_max_nunits * const_max_nunits] = false; + vect_free_slp_tree (node, false); + } ++ else if (constructor ++ && SLP_TREE_DEF_TYPE (node) != vect_internal_def) ++ { ++ /* CONSTRUCTOR vectorization relies on a vector stmt being ++ generated, that doesn't work for fully external ones. 
*/ ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "Build SLP failed: CONSTRUCTOR of external " ++ "or constant elements\n"); ++ vect_free_slp_tree (node, false); ++ return false; ++ } + else + { + /* Create a new SLP instance. */ +@@ -2317,7 +2322,7 @@ vect_analyze_slp_instance (vec_info *vin + + stmt_vec_info rest = vect_split_slp_store_group (stmt_info, + group1_size); +- bool res = vect_analyze_slp_instance (vinfo, stmt_info, ++ bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info, + max_tree_size); + /* If the first non-match was in the middle of a vector, + skip the rest of that vector. */ +@@ -2328,7 +2333,8 @@ vect_analyze_slp_instance (vec_info *vin + rest = vect_split_slp_store_group (rest, const_nunits); + } + if (i < group_size) +- res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size); ++ res |= vect_analyze_slp_instance (vinfo, bst_map, ++ rest, max_tree_size); + return res; + } + /* Even though the first vector did not all match, we might be able to SLP +@@ -2350,9 +2356,12 @@ vect_analyze_slp (vec_info *vinfo, unsig + + DUMP_VECT_SCOPE ("vect_analyze_slp"); + ++ scalar_stmts_to_slp_tree_map_t *bst_map ++ = new scalar_stmts_to_slp_tree_map_t (); ++ + /* Find SLP sequences starting from groups of grouped stores. */ + FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element) +- vect_analyze_slp_instance (vinfo, first_element, max_tree_size); ++ vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size); + + if (loop_vec_info loop_vinfo = dyn_cast (vinfo)) + { +@@ -2361,7 +2370,7 @@ vect_analyze_slp (vec_info *vinfo, unsig + { + /* Find SLP sequences starting from reduction chains. */ + FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) +- if (! vect_analyze_slp_instance (vinfo, first_element, ++ if (! vect_analyze_slp_instance (vinfo, bst_map, first_element, + max_tree_size)) + { + /* Dissolve reduction chain group. */ +@@ -2383,10 +2392,17 @@ vect_analyze_slp (vec_info *vinfo, unsig + + /* Find SLP sequences starting from groups of reductions. */ + if (loop_vinfo->reductions.length () > 1) +- vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0], ++ vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0], + max_tree_size); + } + ++ /* The map keeps a reference on SLP nodes built, release that. */ ++ for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); ++ it != bst_map->end (); ++it) ++ if ((*it).second) ++ vect_free_slp_tree ((*it).second, false); ++ delete bst_map; ++ + return opt_result::success (); + } + +@@ -2513,13 +2529,6 @@ vect_detect_hybrid_slp_stmts (slp_tree n + vect_detect_hybrid_slp_stmts (child, i, stype, visited); + } + +-static void +-vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype) +-{ +- hash_map visited; +- vect_detect_hybrid_slp_stmts (node, i, stype, visited); +-} +- + /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */ + + static tree +@@ -2602,11 +2611,12 @@ vect_detect_hybrid_slp (loop_vec_info lo + /* Then walk the SLP instance trees marking stmts with uses in + non-SLP stmts as hybrid, also propagating hybrid down the + SLP tree, collecting the above info on-the-fly. 
*/ ++ hash_map visited; + FOR_EACH_VEC_ELT (slp_instances, i, instance) + { + for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i) + vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance), +- i, pure_slp); ++ i, pure_slp, visited); + } + } + +@@ -2763,8 +2773,8 @@ vect_slp_convert_to_external (vec_info * + static bool + vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, + slp_instance node_instance, +- scalar_stmts_to_slp_tree_map_t *visited, +- scalar_stmts_to_slp_tree_map_t *lvisited, ++ hash_set &visited, ++ hash_set &lvisited, + stmt_vector_for_cost *cost_vec) + { + int i, j; +@@ -2774,27 +2784,13 @@ vect_slp_analyze_node_operations (vec_in + return true; + + /* If we already analyzed the exact same set of scalar stmts we're done. +- We share the generated vector stmts for those. */ +- slp_tree *leader; +- if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node))) +- || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node)))) +- { +- SLP_TREE_NUMBER_OF_VEC_STMTS (node) +- = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader); +- /* Cope with cases in which we made a late decision to build the +- node from scalars. */ +- if (SLP_TREE_DEF_TYPE (*leader) == vect_external_def +- && vect_slp_convert_to_external (vinfo, node, node_instance)) +- ; +- else +- gcc_assert (SLP_TREE_DEF_TYPE (node) == SLP_TREE_DEF_TYPE (*leader)); +- return true; +- } +- +- /* The SLP graph is acyclic so not caching whether we failed or succeeded ++ We share the generated vector stmts for those. ++ The SLP graph is acyclic so not caching whether we failed or succeeded + doesn't result in any issue since we throw away the lvisited set + when we fail. */ +- lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node); ++ if (visited.contains (node) ++ || lvisited.add (node)) ++ return true; + + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + if (!vect_slp_analyze_node_operations (vinfo, child, node_instance, +@@ -2867,16 +2863,15 @@ vect_slp_analyze_operations (vec_info *v + + DUMP_VECT_SCOPE ("vect_slp_analyze_operations"); + +- scalar_stmts_to_slp_tree_map_t *visited +- = new scalar_stmts_to_slp_tree_map_t (); ++ hash_set visited; + for (i = 0; vinfo->slp_instances.iterate (i, &instance); ) + { +- scalar_stmts_to_slp_tree_map_t lvisited; ++ hash_set lvisited; + stmt_vector_for_cost cost_vec; + cost_vec.create (2); + if (!vect_slp_analyze_node_operations (vinfo, + SLP_INSTANCE_TREE (instance), +- instance, visited, &lvisited, ++ instance, visited, lvisited, + &cost_vec)) + { + slp_tree node = SLP_INSTANCE_TREE (instance); +@@ -2891,16 +2886,15 @@ vect_slp_analyze_operations (vec_info *v + } + else + { +- for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin(); ++ for (hash_set::iterator x = lvisited.begin(); + x != lvisited.end(); ++x) +- visited->put ((*x).first.copy (), (*x).second); ++ visited.add (*x); + i++; + + add_stmt_costs (vinfo->target_cost_data, &cost_vec); + cost_vec.release (); + } + } +- delete visited; + + return !vinfo->slp_instances.is_empty (); + } +@@ -2991,15 +2985,6 @@ vect_bb_slp_scalar_cost (basic_block bb, + } + } + +-static void +-vect_bb_slp_scalar_cost (basic_block bb, +- slp_tree node, vec *life, +- stmt_vector_for_cost *cost_vec) +-{ +- hash_set visited; +- vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited); +-} +- + /* Check if vectorization of the basic block is profitable. */ + + static bool +@@ -3014,13 +2999,14 @@ vect_bb_vectorization_profitable_p (bb_v + /* Calculate scalar cost. 
*/ + stmt_vector_for_cost scalar_costs; + scalar_costs.create (0); ++ hash_set visited; + FOR_EACH_VEC_ELT (slp_instances, i, instance) + { + auto_vec life; + life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance)); + vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo), + SLP_INSTANCE_TREE (instance), +- &life, &scalar_costs); ++ &life, &scalar_costs, visited); + } + void *target_cost_data = init_cost (NULL); + add_stmt_costs (target_cost_data, &scalar_costs); +@@ -4052,8 +4038,7 @@ vect_transform_slp_perm_load (slp_tree n + /* Vectorize SLP instance tree in postorder. */ + + static void +-vect_schedule_slp_instance (slp_tree node, slp_instance instance, +- scalar_stmts_to_slp_tree_map_t *bst_map) ++vect_schedule_slp_instance (slp_tree node, slp_instance instance) + { + gimple_stmt_iterator si; + stmt_vec_info stmt_info; +@@ -4070,17 +4055,8 @@ vect_schedule_slp_instance (slp_tree nod + if (SLP_TREE_VEC_STMTS (node).exists ()) + return; + +- /* See if we have already vectorized the same set of stmts and reuse their +- vectorized stmts across instances. */ +- if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node))) +- { +- SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader)); +- return; +- } +- +- bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node); + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) +- vect_schedule_slp_instance (child, instance, bst_map); ++ vect_schedule_slp_instance (child, instance); + + /* Push SLP node def-type to stmts. */ + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) +@@ -4297,14 +4273,12 @@ vect_schedule_slp (vec_info *vinfo) + slp_instance instance; + unsigned int i; + +- scalar_stmts_to_slp_tree_map_t *bst_map +- = new scalar_stmts_to_slp_tree_map_t (); + slp_instances = vinfo->slp_instances; + FOR_EACH_VEC_ELT (slp_instances, i, instance) + { + slp_tree node = SLP_INSTANCE_TREE (instance); + /* Schedule the tree of INSTANCE. 
*/ +- vect_schedule_slp_instance (node, instance, bst_map); ++ vect_schedule_slp_instance (node, instance); + + if (SLP_INSTANCE_ROOT_STMT (instance)) + vectorize_slp_instance_root_stmt (node, instance); +@@ -4313,7 +4287,6 @@ vect_schedule_slp (vec_info *vinfo) + dump_printf_loc (MSG_NOTE, vect_location, + "vectorizing stmts using SLP.\n"); + } +- delete bst_map; + + FOR_EACH_VEC_ELT (slp_instances, i, instance) + { diff --git a/fix-ICE-in-vect_stmt_to_vectorize.patch b/fix-ICE-in-vect_stmt_to_vectorize.patch new file mode 100644 index 0000000..67c9818 --- /dev/null +++ b/fix-ICE-in-vect_stmt_to_vectorize.patch @@ -0,0 +1,41 @@ +diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr92252.c b/gcc/testsuite/gcc.dg/torture/pr92252.c +--- a/gcc/testsuite/gcc.dg/torture/pr92252.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/torture/pr92252.c 2020-07-03 10:39:44.808000000 +0800 +@@ -0,0 +1,23 @@ ++/* { do-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++long int ar; ++int dt; ++ ++long int ++pc (unsigned long int q3, int zw) ++{ ++ long int em = 0; ++ ++ while (zw < 1) ++ { ++ q3 = zw * 2ul; ++ if (q3 != 0) ++ for (ar = 0; ar < 2; ++ar) ++ em = dt; ++ ++ ++zw; ++ } ++ ++ return em; ++} +diff -Nurp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-07-03 10:35:59.876000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-07-03 10:39:44.808000000 +0800 +@@ -581,6 +581,10 @@ again: + { + swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt), + gimple_assign_rhs3_ptr (stmt)); ++ if (STMT_VINFO_REDUC_IDX (stmt_info) == 1) ++ STMT_VINFO_REDUC_IDX (stmt_info) = 2; ++ else if (STMT_VINFO_REDUC_IDX (stmt_info) == 2) ++ STMT_VINFO_REDUC_IDX (stmt_info) = 1; + bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0)); + code = invert_tree_comparison (TREE_CODE (cond), honor_nans); + gcc_assert (code != ERROR_MARK); diff --git a/fix-ICE-in-vect_transform_stmt.patch b/fix-ICE-in-vect_transform_stmt.patch new file mode 100644 index 0000000..9433155 --- /dev/null +++ b/fix-ICE-in-vect_transform_stmt.patch @@ -0,0 +1,96 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-fortran-92094-ice-in-vect_transform_stmt-at-tr.patch +c30587c0200f52f8845a5aea21bd7bef6cbe0bf4 + +diff -Nurp a/gcc/testsuite/gfortran.dg/pr92094.f90 b/gcc/testsuite/gfortran.dg/pr92094.f90 +--- a/gcc/testsuite/gfortran.dg/pr92094.f90 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gfortran.dg/pr92094.f90 2020-08-18 14:25:12.392000000 +0800 +@@ -0,0 +1,28 @@ ++! { dg-do compile } ++! { dg-options "-O3" } ++ subroutine hesfcn(n, x, h, ldh) ++ integer n,ldh ++ double precision x(n), h(ldh) ++ ++ integer i,j,k,kj ++ double precision th,u1,u2,v2 ++ ++ kj = 0 ++ do 770 j = 1, n ++ kj = kj - j ++ do 760 k = 1, j ++ kj = kj + 1 ++ v2 = 2 * x(k) - 1 ++ u1 = 0 ++ u2 = 2 ++ do 750 i = 1, n ++ h(kj) = h(kj) + u2 ++ th = 4 * v2 + u2 - u1 ++ u1 = u2 ++ u2 = th ++ th = v2 - 1 ++ 750 continue ++ 760 continue ++ 770 continue ++ ++ end +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-18 14:19:43.784000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-18 14:25:12.396000000 +0800 +@@ -5891,20 +5891,9 @@ vectorizable_reduction (stmt_vec_info st + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) + { + if (is_a (stmt_info->stmt)) +- { +- /* Analysis for double-reduction is done on the outer +- loop PHI, nested cycles have no further restrictions. 
*/ +- STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; +- /* For nested cycles we want to let regular vectorizable_* +- routines handle code-generation. */ +- if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_double_reduction_def) +- { +- stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); +- STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def; +- STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (stmt_info)) +- = vect_internal_def; +- } +- } ++ /* Analysis for double-reduction is done on the outer ++ loop PHI, nested cycles have no further restrictions. */ ++ STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; + else + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-08-18 14:19:45.556000000 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-08-18 14:25:12.396000000 +0800 +@@ -10224,13 +10224,16 @@ vect_transform_stmt (stmt_vec_info stmt_ + && STMT_VINFO_REDUC_TYPE (reduc_info) != EXTRACT_LAST_REDUCTION) + { + gphi *phi; ++ edge e; + if (!slp_node + && (phi = dyn_cast + (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt)) + && dominated_by_p (CDI_DOMINATORS, +- gimple_bb (orig_stmt_info->stmt), gimple_bb (phi))) ++ gimple_bb (orig_stmt_info->stmt), gimple_bb (phi)) ++ && (e = loop_latch_edge (gimple_bb (phi)->loop_father)) ++ && (PHI_ARG_DEF_FROM_EDGE (phi, e) ++ == gimple_get_lhs (orig_stmt_info->stmt))) + { +- edge e = loop_latch_edge (gimple_bb (phi)->loop_father); + stmt_vec_info phi_info + = STMT_VINFO_VEC_STMT (STMT_VINFO_REDUC_DEF (orig_stmt_info)); + stmt_vec_info vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); +@@ -10250,7 +10253,7 @@ vect_transform_stmt (stmt_vec_info stmt_ + { + slp_tree phi_node = slp_node_instance->reduc_phis; + gphi *phi = as_a (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); +- edge e = loop_latch_edge (gimple_bb (phi)->loop_father); ++ e = loop_latch_edge (gimple_bb (phi)->loop_father); + gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () + == SLP_TREE_VEC_STMTS (slp_node).length ()); + for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i) diff --git a/fix-ICE-in-vectorizable-load.patch b/fix-ICE-in-vectorizable-load.patch index 690ce6c..bb31637 100644 --- a/fix-ICE-in-vectorizable-load.patch +++ b/fix-ICE-in-vectorizable-load.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-vect-ICE-in-vectorizable_load-at-tree-vect-stmts.c-9.patch: +f14b41d27124601284347a10d496362c8b4b8e1c + diff -Nurp a/gcc/testsuite/gcc.target/aarch64/pr94398.c b/gcc/testsuite/gcc.target/aarch64/pr94398.c --- a/gcc/testsuite/gcc.target/aarch64/pr94398.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.target/aarch64/pr94398.c 2020-04-17 17:15:58.176000000 +0800 diff --git a/fix-ICE-in-vectorizable_condition.patch b/fix-ICE-in-vectorizable_condition.patch new file mode 100644 index 0000000..98f367e --- /dev/null +++ b/fix-ICE-in-vectorizable_condition.patch @@ -0,0 +1,18 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-Fix-reduc_index-calculation-in-vectorizable_conditio.patch +1d149b7260bcc4c0c6367b3aea47a8b91a1cf345 + +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-08-18 19:35:06.352000000 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-08-18 19:35:20.792000000 +0800 +@@ -9077,7 +9077,7 @@ vectorizable_condition (stmt_vec_info st + return false; + reduc_info = info_for_reduction (stmt_info); + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); +- reduc_index = STMT_VINFO_REDUC_IDX (reduc_info); ++ reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); + gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION + || reduc_index != -1); + } diff --git a/fix-ICE-in-verify_ssa.patch b/fix-ICE-in-verify_ssa.patch new file mode 100644 index 0000000..056c276 --- /dev/null +++ b/fix-ICE-in-verify_ssa.patch @@ -0,0 +1,41 @@ +diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr92461.c b/gcc/testsuite/gcc.dg/torture/pr92461.c +--- a/gcc/testsuite/gcc.dg/torture/pr92461.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/torture/pr92461.c 2020-07-28 19:48:09.324000000 +0800 +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++short int zb; ++ ++void ++gs (void) ++{ ++ while (zb < 1) ++ { ++ int at; ++ ++ zb %= 1; ++ ++ for (at = 0; at < 56; ++at) ++ zb += zb; ++ ++ ++zb; ++ } ++} +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-07-28 19:47:53.896000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-07-28 19:48:09.324000000 +0800 +@@ -5459,8 +5459,11 @@ vect_create_epilog_for_reduction (stmt_v + orig_name = PHI_RESULT (exit_phi); + scalar_result = scalar_results[k]; + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) +- FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) +- SET_USE (use_p, scalar_result); ++ { ++ FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) ++ SET_USE (use_p, scalar_result); ++ update_stmt (use_stmt); ++ } + } + + phis.release (); diff --git a/fix-ICE-statement-uses-released-SSA-name.patch b/fix-ICE-statement-uses-released-SSA-name.patch new file mode 100644 index 0000000..06107b4 --- /dev/null +++ b/fix-ICE-statement-uses-released-SSA-name.patch @@ -0,0 +1,109 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-ssa-sccvn.c-class-pass_fre-Add-may_iterate-pass.patch +744fd446c321f78f9a1ce4ef5f83df8dcfa44a9e + +diff -Nurp a/gcc/passes.def b/gcc/passes.def +--- a/gcc/passes.def 2020-08-17 09:46:40.340000000 +0800 ++++ b/gcc/passes.def 2020-08-17 10:09:10.808000000 +0800 +@@ -83,7 +83,7 @@ along with GCC; see the file COPYING3. + /* pass_build_ealias is a dummy pass that ensures that we + execute TODO_rebuild_alias at this point. */ + NEXT_PASS (pass_build_ealias); +- NEXT_PASS (pass_fre); ++ NEXT_PASS (pass_fre, true /* may_iterate */); + NEXT_PASS (pass_early_vrp); + NEXT_PASS (pass_merge_phi); + NEXT_PASS (pass_dse); +@@ -117,7 +117,7 @@ along with GCC; see the file COPYING3. + NEXT_PASS (pass_oacc_kernels); + PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels) + NEXT_PASS (pass_ch); +- NEXT_PASS (pass_fre); ++ NEXT_PASS (pass_fre, true /* may_iterate */); + /* We use pass_lim to rewrite in-memory iteration and reduction + variable accesses in loops into local variables accesses. */ + NEXT_PASS (pass_lim); +@@ -201,7 +201,7 @@ along with GCC; see the file COPYING3. + execute TODO_rebuild_alias at this point. 
*/ + NEXT_PASS (pass_build_alias); + NEXT_PASS (pass_return_slot); +- NEXT_PASS (pass_fre); ++ NEXT_PASS (pass_fre, true /* may_iterate */); + NEXT_PASS (pass_merge_phi); + NEXT_PASS (pass_thread_jumps); + NEXT_PASS (pass_vrp, true /* warn_array_bounds_p */); +@@ -312,6 +312,7 @@ along with GCC; see the file COPYING3. + NEXT_PASS (pass_strength_reduction); + NEXT_PASS (pass_split_paths); + NEXT_PASS (pass_tracer); ++ NEXT_PASS (pass_fre, false /* may_iterate */); + NEXT_PASS (pass_thread_jumps); + NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */); + NEXT_PASS (pass_strlen); +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr77445-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr77445-2.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr77445-2.c 2020-08-17 09:46:41.332000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr77445-2.c 2020-08-17 10:09:10.808000000 +0800 +@@ -125,7 +125,7 @@ enum STATES FMS( u8 **in , u32 *transiti + jump threading opportunities. Skip the later tests on aarch64. */ + /* { dg-final { scan-tree-dump "Jumps threaded: 1\[1-9\]" "thread1" } } */ + /* { dg-final { scan-tree-dump-times "Invalid sum" 3 "thread1" } } */ +-/* { dg-final { scan-tree-dump-not "not considered" "thread1" } } */ +-/* { dg-final { scan-tree-dump-not "not considered" "thread2" } } */ +-/* { dg-final { scan-tree-dump-not "not considered" "thread3" { target { ! aarch64*-*-* } } } } */ +-/* { dg-final { scan-tree-dump-not "not considered" "thread4" { target { ! aarch64*-*-* } } } } */ ++/* { dg-final { scan-tree-dump-not "optimizing for size" "thread1" } } */ ++/* { dg-final { scan-tree-dump-not "optimizing for size" "thread2" } } */ ++/* { dg-final { scan-tree-dump-not "optimizing for size" "thread3" { target { ! aarch64*-*-* } } } } */ ++/* { dg-final { scan-tree-dump-not "optimizing for size" "thread4" { target { ! aarch64*-*-* } } } } */ +diff -Nurp a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c +--- a/gcc/tree-ssa-sccvn.c 2020-08-17 09:46:42.212000000 +0800 ++++ b/gcc/tree-ssa-sccvn.c 2020-08-17 10:09:10.808000000 +0800 +@@ -7232,14 +7232,24 @@ class pass_fre : public gimple_opt_pass + { + public: + pass_fre (gcc::context *ctxt) +- : gimple_opt_pass (pass_data_fre, ctxt) ++ : gimple_opt_pass (pass_data_fre, ctxt), may_iterate (true) + {} + + /* opt_pass methods: */ + opt_pass * clone () { return new pass_fre (m_ctxt); } +- virtual bool gate (function *) { return flag_tree_fre != 0; } ++ void set_pass_param (unsigned int n, bool param) ++ { ++ gcc_assert (n == 0); ++ may_iterate = param; ++ } ++ virtual bool gate (function *) ++ { ++ return flag_tree_fre != 0 && (may_iterate || optimize > 1); ++ } + virtual unsigned int execute (function *); + ++private: ++ bool may_iterate; + }; // class pass_fre + + unsigned int +@@ -7248,15 +7258,16 @@ pass_fre::execute (function *fun) + unsigned todo = 0; + + /* At -O[1g] use the cheap non-iterating mode. 
*/ ++ bool iterate_p = may_iterate && (optimize > 1); + calculate_dominance_info (CDI_DOMINATORS); +- if (optimize > 1) ++ if (iterate_p) + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + + default_vn_walk_kind = VN_WALKREWRITE; +- todo = do_rpo_vn (fun, NULL, NULL, optimize > 1, true); ++ todo = do_rpo_vn (fun, NULL, NULL, iterate_p, true); + free_rpo_vn (); + +- if (optimize > 1) ++ if (iterate_p) + loop_optimizer_finalize (); + + return todo; diff --git a/fix-ICE-when-vectorizing-nested-cycles.patch b/fix-ICE-when-vectorizing-nested-cycles.patch new file mode 100644 index 0000000..d8a5b69 --- /dev/null +++ b/fix-ICE-when-vectorizing-nested-cycles.patch @@ -0,0 +1,145 @@ +diff -uprN a/gcc/testsuite/gcc.dg/vect/pr96698.c b/gcc/testsuite/gcc.dg/vect/pr96698.c +--- a/gcc/testsuite/gcc.dg/vect/pr96698.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/pr96698.c 2020-08-27 17:53:24.396000000 +0800 +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++ ++void test(int a, int* i) ++{ ++ for (; a < 5; ++a) ++ { ++ int b = 0; ++ int c = 0; ++ for (; b != -11; b--) ++ for (int d = 0; d ==0; d++) ++ { ++ *i += c & a; ++ c = b; ++ } ++ } ++} ++ ++/* We should be able to vectorize the inner cycle. */ ++/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target vect_int } } } */ +diff -uprN a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-08-27 09:25:58.000000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-08-27 18:41:41.016000000 +0800 +@@ -4325,7 +4325,8 @@ info_for_reduction (stmt_vec_info stmt_i + { + stmt_info = vect_orig_stmt (stmt_info); + gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); +- if (!is_a (stmt_info->stmt)) ++ if (!is_a (stmt_info->stmt) ++ || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) + stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); + gphi *phi = as_a (stmt_info->stmt); + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) +@@ -8622,6 +8623,43 @@ vect_transform_loop (loop_vec_info loop_ + } + } + ++ /* Fill in backedge defs of reductions. 
*/ ++ for (unsigned i = 0; i < loop_vinfo->reduc_latch_defs.length (); ++i) ++ { ++ stmt_vec_info stmt_info = loop_vinfo->reduc_latch_defs[i]; ++ stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); ++ stmt_vec_info phi_info ++ = STMT_VINFO_VEC_STMT (STMT_VINFO_REDUC_DEF (orig_stmt_info)); ++ stmt_vec_info vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); ++ gphi *phi ++ = dyn_cast (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt); ++ edge e = loop_latch_edge (gimple_bb (phi_info->stmt)->loop_father); ++ do ++ { ++ add_phi_arg (as_a (phi_info->stmt), ++ gimple_get_lhs (vec_stmt->stmt), e, ++ gimple_phi_arg_location (phi, e->dest_idx)); ++ phi_info = STMT_VINFO_RELATED_STMT (phi_info); ++ vec_stmt = STMT_VINFO_RELATED_STMT (vec_stmt); ++ } ++ while (phi_info); ++ gcc_assert (!vec_stmt); ++ } ++ for (unsigned i = 0; i < loop_vinfo->reduc_latch_slp_defs.length (); ++i) ++ { ++ slp_tree slp_node = loop_vinfo->reduc_latch_slp_defs[i].first; ++ slp_tree phi_node = loop_vinfo->reduc_latch_slp_defs[i].second; ++ gphi *phi = as_a (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); ++ e = loop_latch_edge (gimple_bb (phi)->loop_father); ++ gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () ++ == SLP_TREE_VEC_STMTS (slp_node).length ()); ++ for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j) ++ add_phi_arg (as_a (SLP_TREE_VEC_STMTS (phi_node)[j]->stmt), ++ gimple_get_lhs ++ (SLP_TREE_VEC_STMTS (slp_node)[j]->stmt), ++ e, gimple_phi_arg_location (phi, e->dest_idx)); ++ } ++ + /* Stub out scalar statements that must not survive vectorization. + Doing this here helps with grouped statements, or statements that + are involved in patterns. */ +diff -uprN a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +--- a/gcc/tree-vectorizer.h 2020-08-27 09:25:57.000000000 +0800 ++++ b/gcc/tree-vectorizer.h 2020-08-27 17:53:24.400000000 +0800 +@@ -575,6 +575,11 @@ typedef struct _loop_vec_info : public v + stmt in the chain. */ + auto_vec reduction_chains; + ++ /* The vectorized stmts defining the latch values of the reduction ++ they are involved with. */ ++ auto_vec reduc_latch_defs; ++ auto_vec > reduc_latch_slp_defs; ++ + /* Cost vector for a single scalar iteration. */ + auto_vec scalar_cost_vec; + +diff -uprN a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-08-27 09:25:58.000000000 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-08-27 17:53:24.400000000 +0800 +@@ -10213,8 +10213,8 @@ vect_transform_stmt (stmt_vec_info stmt_ + if (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) + return is_store; + +- /* If this stmt defines a value used on a backedge, update the +- vectorized PHIs. */ ++ /* If this stmt defines a value used on a backedge, record it so ++ we can update the vectorized PHIs later. 
*/ + stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); + stmt_vec_info reduc_info; + if (STMT_VINFO_REDUC_DEF (orig_stmt_info) +@@ -10234,32 +10234,13 @@ vect_transform_stmt (stmt_vec_info stmt_ + && (PHI_ARG_DEF_FROM_EDGE (phi, e) + == gimple_get_lhs (orig_stmt_info->stmt))) + { +- stmt_vec_info phi_info +- = STMT_VINFO_VEC_STMT (STMT_VINFO_REDUC_DEF (orig_stmt_info)); +- stmt_vec_info vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); +- do +- { +- add_phi_arg (as_a (phi_info->stmt), +- gimple_get_lhs (vec_stmt->stmt), e, +- gimple_phi_arg_location (phi, e->dest_idx)); +- phi_info = STMT_VINFO_RELATED_STMT (phi_info); +- vec_stmt = STMT_VINFO_RELATED_STMT (vec_stmt); +- } +- while (phi_info); +- gcc_assert (!vec_stmt); ++ as_a (vinfo)->reduc_latch_defs.safe_push (stmt_info); + } + else if (slp_node + && slp_node != slp_node_instance->reduc_phis) + { +- slp_tree phi_node = slp_node_instance->reduc_phis; +- gphi *phi = as_a (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); +- e = loop_latch_edge (gimple_bb (phi)->loop_father); +- gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () +- == SLP_TREE_VEC_STMTS (slp_node).length ()); +- for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i) +- add_phi_arg (as_a (SLP_TREE_VEC_STMTS (phi_node)[i]->stmt), +- gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt), +- e, gimple_phi_arg_location (phi, e->dest_idx)); ++ as_a (vinfo)->reduc_latch_slp_defs.safe_push ++ (std::make_pair (slp_node, slp_node_instance->reduc_phis)); + } + } + diff --git a/fix-SSA-update-for-vectorizer-epilogue.patch b/fix-SSA-update-for-vectorizer-epilogue.patch new file mode 100644 index 0000000..96469b6 --- /dev/null +++ b/fix-SSA-update-for-vectorizer-epilogue.patch @@ -0,0 +1,47 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-optimization-95717-fix-SSA-update-for-vectorize.patch +d0909f5858ad81e6d8b73fa6193be19cb5e6ed7b + +diff -Nurp a/gcc/testsuite/g++.dg/torture/pr95717.C b/gcc/testsuite/g++.dg/torture/pr95717.C +--- a/gcc/testsuite/g++.dg/torture/pr95717.C 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/g++.dg/torture/pr95717.C 2020-08-24 21:45:48.436000000 +0800 +@@ -0,0 +1,12 @@ ++// { dg-do compile } ++ ++bool a; ++extern bool b[]; ++long c, d; ++int *f; ++void g(bool h) ++{ ++ for (short e = 0; e < c; e = 4) ++ for (; d; d++) ++ b[d] = a = f[d] ? c ? h : 0 : h; ++} +diff -Nurp a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c +--- a/gcc/tree-vect-loop-manip.c 2020-08-24 21:45:23.620000000 +0800 ++++ b/gcc/tree-vect-loop-manip.c 2020-08-24 21:45:48.436000000 +0800 +@@ -1073,6 +1073,10 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( + + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); + ++ /* Skip new preheader since it's deleted if copy loop is added at entry. */ ++ for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++) ++ rename_variables_in_bb (new_bbs[i], duplicate_outer_loop); ++ + if (scalar_loop != loop) + { + /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from +@@ -1150,10 +1154,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( + loop_preheader_edge (new_loop)->src); + } + +- /* Skip new preheader since it's deleted if copy loop is added at entry. */ +- for (unsigned i = (at_exit ? 
0 : 1); i < scalar_loop->num_nodes + 1; i++) +- rename_variables_in_bb (new_bbs[i], duplicate_outer_loop); +- + if (scalar_loop != loop) + { + /* Update new_loop->header PHIs, so that on the preheader diff --git a/fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch b/fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch index fccdea5..9f596ca 100644 --- a/fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch +++ b/fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-aarch64-Fix-SYMBOL_TINY_GOT-handling-for-ILP32-PR942.patch: +d91480dee934478063fe5945b73ff3c108e40a91 + diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index b0cbb6e2d55..58d38f74bde 100644 --- a/gcc/config/aarch64/aarch64.c diff --git a/fix-cost-of-plus.patch b/fix-cost-of-plus.patch index 7a34072..7edb1b1 100644 --- a/fix-cost-of-plus.patch +++ b/fix-cost-of-plus.patch @@ -1,3 +1,6 @@ +AArch64-Fix-cost-of-plus-.-const_int-C.patch: +commit 835d50c66aa5bde2f354a6e63a2afa7d2f76a05a + diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 56a4a47db73..71d44de1d0a 100644 --- a/gcc/config/aarch64/aarch64.c diff --git a/fix-do-not-build-op.patch b/fix-do-not-build-op.patch new file mode 100644 index 0000000..d3a59d4 --- /dev/null +++ b/fix-do-not-build-op.patch @@ -0,0 +1,27 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-vect-slp.c-vect_build_slp_tree_2-Do-not-build-o.patch +f99d62629933adf91e7e0bc1b1ff344ffb68e1a2 + +diff -Nurp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-08-24 21:31:24.780000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-08-24 21:31:53.516000000 +0800 +@@ -1326,7 +1326,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, + slp_tree grandchild; + + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) +- if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def) ++ if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def) + break; + if (!grandchild) + { +@@ -1486,7 +1486,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, + slp_tree grandchild; + + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) +- if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def) ++ if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def) + break; + if (!grandchild) + { diff --git a/fix-load-eliding-in-SM.patch b/fix-load-eliding-in-SM.patch new file mode 100644 index 0000000..5e25a3d --- /dev/null +++ b/fix-load-eliding-in-SM.patch @@ -0,0 +1,55 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-tree-optimization-94949-fix-load-eliding-in-SM.patch +0424a5ece5307cc22bbc0fe97edf4707d7a798ed + +diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr94949.c b/gcc/testsuite/gcc.dg/torture/pr94949.c +--- a/gcc/testsuite/gcc.dg/torture/pr94949.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/torture/pr94949.c 2020-08-24 21:40:32.208000000 +0800 +@@ -0,0 +1,17 @@ ++/* { dg-do run } */ ++/* { dg-additional-options "-fallow-store-data-races" } */ ++ ++static int x = 1; ++static volatile int y = -1; ++int ++main() ++{ ++ for (int i = 0; i < 128; ++i) ++ { ++ if (i == y) ++ x = i; ++ } ++ if (x != 1) ++ __builtin_abort (); ++ return 0; ++} +diff -Nurp a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c +--- a/gcc/tree-ssa-loop-im.c 2020-08-24 21:40:14.164000000 +0800 ++++ b/gcc/tree-ssa-loop-im.c 2020-08-24 21:40:32.208000000 +0800 +@@ -2115,9 +2115,9 @@ execute_sm (struct loop *loop, vec + fmt_data.orig_loop = loop; + for_each_index (&ref->mem.ref, force_move_till, &fmt_data); + ++ bool always_stored = ref_always_accessed_p (loop, ref, true); + if (bb_in_transaction (loop_preheader_edge (loop)->src) +- || (! flag_store_data_races +- && ! ref_always_accessed_p (loop, ref, true))) ++ || (! flag_store_data_races && ! always_stored)) + multi_threaded_model_p = true; + + if (multi_threaded_model_p) +@@ -2132,8 +2132,10 @@ execute_sm (struct loop *loop, vec + + /* Avoid doing a load if there was no load of the ref in the loop. + Esp. when the ref is not always stored we cannot optimize it +- away later. */ +- if (ref->loaded && bitmap_bit_p (ref->loaded, loop->num)) ++ away later. But when it is not always stored we must use a conditional ++ store then. */ ++ if ((!always_stored && !multi_threaded_model_p) ++ || (ref->loaded && bitmap_bit_p (ref->loaded, loop->num))) + { + load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); + lim_data = init_lim_data (load); diff --git a/fix-regno-out-of-range.patch b/fix-regno-out-of-range.patch index aa8aaa5..cf2746b 100644 --- a/fix-regno-out-of-range.patch +++ b/fix-regno-out-of-range.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-PR93561-bounds-checking-memory-overflow-for-spill_fo.patch: +d26f37a16e3ed3d75a93ffb1da10c44c36a8a36d + diff -Nurp a/gcc/lra-assigns.c b/gcc/lra-assigns.c --- a/gcc/lra-assigns.c 2020-04-17 16:27:46.192000000 +0800 +++ b/gcc/lra-assigns.c 2020-04-17 16:29:37.125688580 +0800 diff --git a/fix-wrong-vectorizer-code.patch b/fix-wrong-vectorizer-code.patch new file mode 100644 index 0000000..e3387bc --- /dev/null +++ b/fix-wrong-vectorizer-code.patch @@ -0,0 +1,71 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-re-PR-bootstrap-92301-Wrong-vectorizer-code-since-r2.patch +b76f4e6c06bd494d2383c4c16d1e1a034da74641 + +diff -Nurp a/gcc/testsuite/gcc.dg/pr92301.c b/gcc/testsuite/gcc.dg/pr92301.c +--- a/gcc/testsuite/gcc.dg/pr92301.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/pr92301.c 2020-08-24 21:36:23.556000000 +0800 +@@ -0,0 +1,35 @@ ++/* { dg-do run } */ ++/* { dg-options "-O3" } */ ++ ++unsigned int m; ++ ++#define N 128 ++unsigned int a[N]; ++ ++unsigned int ++__attribute__((noipa)) ++df_count_refs (_Bool include_defs) ++{ ++ int size = 0; ++ ++ for (unsigned int regno = 0; regno < m; regno++) ++ if (include_defs) ++ size += a[regno]; ++ return size; ++} ++ ++int main(int argc, char **argv) ++{ ++ for (unsigned i = 0; i < N; i++) ++ a[i] = i; ++ ++ if (argc == 1) ++ m = 17; ++ ++ unsigned int r = df_count_refs(1); ++ __builtin_printf ("r: %d\n", r); ++ if (r != 136) ++ __builtin_abort (); ++ ++ return 0; ++} +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-08-24 21:35:23.664000000 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-08-24 21:36:23.556000000 +0800 +@@ -474,6 +474,22 @@ process_use (stmt_vec_info stmt_vinfo, t + basic_block def_bb = gimple_bb (dstmt_vinfo->stmt); + basic_block bb = gimple_bb (stmt_vinfo->stmt); + ++ /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO). ++ We have to force the stmt live since the epilogue loop needs it to ++ continue computing the reduction. */ ++ if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI ++ && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def ++ && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI ++ && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def ++ && bb->loop_father == def_bb->loop_father) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reduc-stmt defining reduc-phi in the same nest.\n"); ++ vect_mark_relevant (worklist, dstmt_vinfo, relevant, true); ++ return opt_result::success (); ++ } ++ + /* case 3a: outer-loop stmt defining an inner-loop stmt: + outer-loop-header-bb: + d = dstmt_vinfo diff --git a/gcc.spec b/gcc.spec index 306aba1..a0e5fc2 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,4 +1,4 @@ -%global DATE 20200629 +%global DATE 20200828 %global gcc_version 9.3.1 %global gcc_major 9.3.1 @@ -112,27 +112,69 @@ Provides: bundled(libiberty) Provides: gcc(major) = %{gcc_major} Patch0: enable-aarch64-libquadmath.patch -Patch1: generate-csel.patch -Patch2: delete-incorrect-smw.patch -Patch3: remove-array-index-inliner-hint.patch -Patch4: ivopts-1.patch -Patch5: ivopts-2.patch -Patch6: dont-generate-IF_THEN_ELSE.patch -Patch7: fix-cost-of-plus.patch -Patch8: div-opti.patch -Patch9: fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch -Patch10: fix-ICE-during-pass-ccp.patch -Patch11: loop-split.patch -Patch12: loop-finite.patch -Patch13: loop-finite-bugfix.patch -Patch14: fix-regno-out-of-range.patch -Patch15: fix-ICE-in-vectorizable-load.patch -Patch16: address-calculation-optimization-within-loop.patch -Patch17: skip-debug-insns-when-computing-inline-costs.patch -Patch18: change-gcc-BASE-VER.patch -Patch19: PR92303-Try-to-simplify-memory-subreg.patch -Patch20: Fix-PR94185.patch -Patch21: testsuite-Fix-pr94185.patch +Patch1: medium-code-mode.patch +Patch2: generate-csel.patch +Patch3: delete-incorrect-smw.patch +Patch4: remove-array-index-inliner-hint.patch +Patch5: ivopts-1.patch +Patch6: ivopts-2.patch +Patch7: dont-generate-IF_THEN_ELSE.patch +Patch8: fix-cost-of-plus.patch +Patch9: div-opti.patch +Patch10: 
fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch +Patch11: fix-ICE-during-pass-ccp.patch +Patch12: loop-split.patch +Patch13: loop-finite.patch +Patch14: loop-finite-bugfix.patch +Patch15: fix-regno-out-of-range.patch +Patch16: fix-ICE-in-vectorizable-load.patch +Patch17: address-calculation-optimization-within-loop.patch +Patch18: skip-debug-insns-when-computing-inline-costs.patch +Patch19: ipa-const-prop.patch +Patch20: ipa-const-prop-self-recursion-bugfix.patch +Patch21: change-gcc-BASE-VER.patch +Patch22: add-option-fallow-store-data-races.patch +Patch23: tighten-range-for-generating-csel.patch +Patch24: generate-csel-for-arrayref.patch +Patch25: vectorization-enhancement.patch +Patch26: ipa-struct-reorg.patch +Patch27: ipa-struct-reorg-bugfix.patch +Patch28: enable-simd-math.patch +Patch29: complete-struct-reorg.patch +Patch30: reductions-slp-enhancement.patch +Patch31: cse-in-vectorization.patch +Patch32: PR92303-Try-to-simplify-memory-subreg.patch +Patch33: Fix-PR94185.patch +Patch34: testsuite-Fix-pr94185.patch +Patch35: fix-ICE-in-vect_stmt_to_vectorize.patch +Patch36: add-checks-to-avoid-spoiling-if-conversion.patch +Patch37: fix-ICE-in-vect_create_epilog_for_reduction.patch +Patch38: fix-ICE-in-compute_live_loop_exits.patch +Patch39: fix-ICE-in-store_constructor.patch +Patch40: fix-ICE-in-verify_ssa.patch +Patch41: fix-ICE-in-reload.patch +Patch42: fix-ICE-in-declare-return-variable.patch +Patch43: simplify-removing-subregs.patch +Patch44: fix-ICE-in-vec.patch +Patch45: fix-ICE-in-gimple_op.patch +Patch46: fix-ICE-in-exact_div.patch +Patch47: fix-ICE-statement-uses-released-SSA-name.patch +Patch48: fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch +Patch49: fix-ICE-in-vect_create_epilog_for_reduction_2.patch +Patch50: fix-ICE-in-vect_slp_analyze_node_operations.patch +Patch51: fix-ICE-in-vect_create_epilog_for_reduction_3.patch +Patch52: fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch +Patch53: fix-ICE-in-vect_transform_stmt.patch +Patch54: fix-ICE-in-copy_reference_ops_from_ref.patch +Patch55: fix-ICE-in-vectorizable_condition.patch +Patch56: reduction-chain-slp-option.patch +Patch57: fix-ICE-in-model_update_limit_points_in_group.patch +Patch58: fix-do-not-build-op.patch +Patch59: fix-wrong-vectorizer-code.patch +Patch60: fix-load-eliding-in-SM.patch +Patch61: fix-SSA-update-for-vectorizer-epilogue.patch +Patch62: fix-ICE-when-vectorizing-nested-cycles.patch + %global gcc_target_platform %{_arch}-linux-gnu @@ -596,6 +638,47 @@ not stable, so plugins must be rebuilt any time GCC is updated. 
%patch19 -p1 %patch20 -p1 %patch21 -p1 +%patch22 -p1 +%patch23 -p1 +%patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch45 -p1 +%patch46 -p1 +%patch47 -p1 +%patch48 -p1 +%patch49 -p1 +%patch50 -p1 +%patch51 -p1 +%patch52 -p1 +%patch53 -p1 +%patch54 -p1 +%patch55 -p1 +%patch56 -p1 +%patch57 -p1 +%patch58 -p1 +%patch59 -p1 +%patch60 -p1 +%patch61 -p1 +%patch62 -p1 %build @@ -2524,6 +2607,49 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Mon Aug 28 2020 eastb233 - 9.3.1-20200828.4 + - Add add-checks-to-avoid-spoiling-if-conversion.patch + - Add add-option-fallow-store-data-races.patch + - Add complete-struct-reorg.patch + - Add cse-in-vectorization.patch + - Add enable-simd-math.patch + - Add fix-ICE-avoid-issueing-loads-in-SM-when-possible.patch + - Add fix-ICE-in-compute_live_loop_exits.patch + - Add fix-ICE-in-copy_reference_ops_from_ref.patch + - Add fix-ICE-in-declare-return-variable.patch + - Add fix-ICE-in-exact_div.patch + - Add fix-ICE-in-gimple_op.patch + - Add fix-ICE-in-model_update_limit_points_in_group.patch + - Add fix-ICE-in-reload.patch + - Add fix-ICE-in-store_constructor.patch + - Add fix-ICE-in-vec.patch + - Add fix-ICE-in-vect_create_epilog_for_reduction.patch + - Add fix-ICE-in-vect_create_epilog_for_reduction_2.patch + - Add fix-ICE-in-vect_create_epilog_for_reduction_3.patch + - Add fix-ICE-in-vect_get_vec_def_for_stmt_copy.patch + - Add fix-ICE-in-vect_slp_analyze_node_operations.patch + - Add fix-ICE-in-vect_stmt_to_vectorize.patch + - Add fix-ICE-in-vect_transform_stmt.patch + - Add fix-ICE-in-vectorizable_condition.patch + - Add fix-ICE-in-verify_ssa.patch + - Add fix-ICE-statement-uses-released-SSA-name.patch + - Add fix-ICE-when-vectorizing-nested-cycles.patch + - Add fix-SSA-update-for-vectorizer-epilogue.patch + - Add fix-do-not-build-op.patch + - Add fix-load-eliding-in-SM.patch + - Add fix-wrong-vectorizer-code.patch + - Add generate-csel-for-arrayref.patch + - Add ipa-const-prop-self-recursion-bugfix.patch + - Add ipa-const-prop.patch + - Add ipa-struct-reorg-bugfix.patch + - Add ipa-struct-reorg.patch + - Add medium-code-mode.patch + - Add reduction-chain-slp-option.patch + - Add reductions-slp-enhancement.patch + - Add simplify-removing-subregs.patch + - Add tighten-range-for-generating-csel.patch + - Add vectorization-enhancement.patch + * Mon Jun 29 2020 eastb233 - 9.3.1-20200629.3 - gcc.spec: Change release version diff --git a/generate-csel-for-arrayref.patch b/generate-csel-for-arrayref.patch new file mode 100644 index 0000000..c94311e --- /dev/null +++ b/generate-csel-for-arrayref.patch @@ -0,0 +1,218 @@ +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c 2020-05-26 21:03:43.132721856 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c 2020-05-19 20:12:32.655794652 +0800 +@@ -9,4 +9,4 @@ unsigned test(unsigned k, unsigned b) { + return a[0]+a[1]; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c 2020-05-26 
21:03:43.132721856 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c 2020-05-19 20:12:32.667794652 +0800 +@@ -11,4 +11,4 @@ unsigned test(unsigned k, unsigned b) { + return a[0]+a[1]; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c 2020-05-26 21:03:43.132721856 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c 2020-05-19 20:12:32.667794652 +0800 +@@ -13,4 +13,4 @@ int test(int b, int k) { + return a.data[0] + a.data[1]; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ +diff -uprN a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c +--- a/gcc/tree-ssa-phiopt.c 2020-05-26 21:03:43.132721856 +0800 ++++ b/gcc/tree-ssa-phiopt.c 2020-05-26 21:02:02.872006469 +0800 +@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. + #include "params.h" + #include "case-cfn-macros.h" + #include "tree-eh.h" ++#include "inchash.h" + + static unsigned int tree_ssa_phiopt_worker (bool, bool, bool); + static bool two_value_replacement (basic_block, basic_block, edge, gphi *, +@@ -1984,6 +1985,18 @@ struct name_to_bb + basic_block bb; + }; + ++/* A hash-table of ARRAY_REF with a base of VAR_DECL and an offset of ++ SSA_NAME, and in which basic block it was seen, which would constitute ++ a no-trap region for same accessed. */ ++struct array_ref_to_bb ++{ ++ unsigned int ssa_name_ver; ++ unsigned int phase; ++ HOST_WIDE_INT size; ++ tree var_decl; ++ basic_block bb; ++}; ++ + /* Hashtable helpers. */ + + struct ssa_names_hasher : free_ptr_hash +@@ -1992,6 +2005,12 @@ struct ssa_names_hasher : free_ptr_hash + static inline bool equal (const name_to_bb *, const name_to_bb *); + }; + ++struct array_refs_hasher : free_ptr_hash ++{ ++ static inline hashval_t hash (const array_ref_to_bb *); ++ static inline bool equal (const array_ref_to_bb *, const array_ref_to_bb *); ++}; ++ + /* Used for quick clearing of the hash-table when we see calls. + Hash entries with phase < nt_call_phase are invalid. */ + static unsigned int nt_call_phase; +@@ -2005,6 +2024,16 @@ ssa_names_hasher::hash (const name_to_bb + ^ (n->offset << 6) ^ (n->size << 3); + } + ++inline hashval_t ++array_refs_hasher::hash (const array_ref_to_bb *n) ++{ ++ inchash::hash hstate (0); ++ hstate.add_int (n->ssa_name_ver); ++ hstate.add_hwi (n->size); ++ hstate.add_ptr (n->var_decl); ++ return hstate.end (); ++} ++ + /* The equality function of *P1 and *P2. 
*/ + + inline bool +@@ -2016,11 +2045,21 @@ ssa_names_hasher::equal (const name_to_b + && n1->size == n2->size; + } + ++inline bool ++array_refs_hasher::equal (const array_ref_to_bb *n1, const array_ref_to_bb *n2) ++{ ++ return n1->ssa_name_ver == n2->ssa_name_ver ++ && n1->size == n2->size ++ && n1->var_decl == n2->var_decl; ++} ++ + class nontrapping_dom_walker : public dom_walker + { + public: + nontrapping_dom_walker (cdi_direction direction, hash_set *ps) +- : dom_walker (direction), m_nontrapping (ps), m_seen_ssa_names (128) {} ++ : dom_walker (direction), m_nontrapping (ps), ++ m_seen_ssa_names (128), m_seen_array_refs (128) ++ {} + + virtual edge before_dom_children (basic_block); + virtual void after_dom_children (basic_block); +@@ -2028,16 +2067,18 @@ public: + private: + + /* We see the expression EXP in basic block BB. If it's an interesting +- expression (an MEM_REF through an SSA_NAME) possibly insert the +- expression into the set NONTRAP or the hash table of seen expressions. +- STORE is true if this expression is on the LHS, otherwise it's on +- the RHS. */ ++ expression (an MEM_REF through an SSA_NAME or an ARRAY_REF with a base ++ of VAR_DECL and an offset of SSA_NAME) possibly insert the expression ++ into the set NONTRAP or the hash table of seen expressions. STORE ++ is true if this expression is on the LHS, otherwise it's on the RHS. */ + void add_or_mark_expr (basic_block, tree, bool); ++ void add_or_mark_array_ref (basic_block, tree); + + hash_set *m_nontrapping; + + /* The hash table for remembering what we've seen. */ + hash_table m_seen_ssa_names; ++ hash_table m_seen_array_refs; + }; + + /* Called by walk_dominator_tree, when entering the block BB. */ +@@ -2071,7 +2112,9 @@ nontrapping_dom_walker::before_dom_child + else if (gimple_assign_single_p (stmt) && !gimple_has_volatile_ops (stmt)) + { + add_or_mark_expr (bb, gimple_assign_lhs (stmt), true); ++ add_or_mark_array_ref (bb, gimple_assign_lhs (stmt)); + add_or_mark_expr (bb, gimple_assign_rhs1 (stmt), false); ++ add_or_mark_array_ref (bb, gimple_assign_rhs1 (stmt)); + } + } + return NULL; +@@ -2148,6 +2191,74 @@ nontrapping_dom_walker::add_or_mark_expr + } + } + } ++} ++ ++/* We see the expression EXP in basic block BB. If it's an interesting ++ expression (an ARRAY_REF with a base of VAR_DECL and an offset of ++ SSA_NAME) possibly insert the expression into the set NONTRAP or the ++ hash table of seen expressions. */ ++void ++nontrapping_dom_walker::add_or_mark_array_ref (basic_block bb, tree exp) ++{ ++ if (TREE_CODE (exp) == ARRAY_REF ++ && TREE_CODE (TREE_OPERAND (exp, 1)) == SSA_NAME ++ && int_size_in_bytes (TREE_TYPE (exp)) > 0) ++ { ++ HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); ++ tree base = get_base_address (exp); ++ /* if BASE is a local variable without address-taken, which can't be ++ read-only, a dominating load can constitute a no-trap region for ++ a store as well. */ ++ if (TREE_CODE (base) == VAR_DECL ++ && auto_var_p (base) && !TREE_ADDRESSABLE (base)) ++ { ++ struct array_ref_to_bb array_map; ++ basic_block found_array_bb = 0; ++ ++ /* Try to find the last seen ARRAY_REF with the same base and ++ offset, which can trap. 
*/ ++ array_map.ssa_name_ver = SSA_NAME_VERSION (TREE_OPERAND (exp, 1)); ++ array_map.phase = 0; ++ array_map.bb = 0; ++ array_map.size = size; ++ array_map.var_decl = base; ++ ++ array_ref_to_bb **slot ++ = m_seen_array_refs.find_slot (&array_map, INSERT); ++ struct array_ref_to_bb *a2bb = *slot; ++ if (a2bb != NULL && a2bb->phase >= nt_call_phase) ++ { ++ found_array_bb = a2bb->bb; ++ } ++ ++ /* If we've found a trapping MEM_REF, _and_ it dominates EXP ++ (it's in a basic block on the path from us to the dominator root) ++ then we can't trap. */ ++ if (found_array_bb && (((size_t)found_array_bb->aux) & 1) == 1) ++ { ++ m_nontrapping->add (exp); ++ } ++ else ++ { ++ /* EXP might trap, so insert it into the hash table. */ ++ if (a2bb != NULL) ++ { ++ a2bb->phase = nt_call_phase; ++ a2bb->bb = bb; ++ } ++ else ++ { ++ a2bb = XNEW (struct array_ref_to_bb); ++ a2bb->ssa_name_ver = SSA_NAME_VERSION (TREE_OPERAND (exp, 1)); ++ a2bb->phase = nt_call_phase; ++ a2bb->bb = bb; ++ a2bb->size = size; ++ a2bb->var_decl = base; ++ *slot = a2bb; ++ } ++ } ++ } ++ } + } + + /* This is the entry point of gathering non trapping memory accesses. diff --git a/generate-csel.patch b/generate-csel.patch index 41fb032..3aaf261 100644 --- a/generate-csel.patch +++ b/generate-csel.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-89430-A-missing-ifcvt-optimi.patch +b9ef6a2e04bfd01329902781818ef80c52cd8b97 + diff -uprN a/gcc/testsuite/gcc.dg/graphite/scop-21.c b/gcc/testsuite/gcc.dg/graphite/scop-21.c --- a/gcc/testsuite/gcc.dg/graphite/scop-21.c +++ b/gcc/testsuite/gcc.dg/graphite/scop-21.c diff --git a/ipa-const-prop-self-recursion-bugfix.patch b/ipa-const-prop-self-recursion-bugfix.patch new file mode 100644 index 0000000..9e878a3 --- /dev/null +++ b/ipa-const-prop-self-recursion-bugfix.patch @@ -0,0 +1,191 @@ +This patch is backport from gcc-trunk. It is a combined patch from + +Find matched aggregate lattice for self-recursive CP (PR ipa/93084) +https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=42d73fa9d575e3c8c21e88bd7f65922e17b052f1 + +and + +Do not propagate self-dependent value (PR ipa/93763) +https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=47772af10c00f7e1e95cd52557fc893dc602a420 + +adapted the using of parameter to gcc9 style. + +diff -Nurp a/gcc/ipa-cp.c b/gcc/ipa-cp.c +--- a/gcc/ipa-cp.c 2020-05-23 16:16:58.032000000 +0800 ++++ b/gcc/ipa-cp.c 2020-05-22 18:03:41.980000000 +0800 +@@ -1766,8 +1766,8 @@ ipcp_lattice::add_value (valtyp + } + + /* Return true, if a ipcp_value VAL is orginated from parameter value of +- self-feeding recursive function by applying non-passthrough arithmetic +- transformation. */ ++ self-feeding recursive function via some kind of pass-through jump ++ function. */ + + static bool + self_recursively_generated_p (ipcp_value *val) +@@ -1778,19 +1778,36 @@ self_recursively_generated_p (ipcp_value + { + cgraph_edge *cs = src->cs; + +- if (!src->val || cs->caller != cs->callee->function_symbol () +- || src->val == val) ++ if (!src->val || cs->caller != cs->callee->function_symbol ()) + return false; + ++ if (src->val == val) ++ continue; ++ + if (!info) + info = IPA_NODE_REF (cs->caller); + + class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, + src->index); +- ipcp_lattice *src_lat = src->offset == -1 ? 
&plats->itself +- : plats->aggs; ++ ipcp_lattice *src_lat; + ipcp_value *src_val; + ++ if (src->offset == -1) ++ src_lat = &plats->itself; ++ else ++ { ++ struct ipcp_agg_lattice *src_aglat; ++ ++ for (src_aglat = plats->aggs; src_aglat; src_aglat = src_aglat->next) ++ if (src_aglat->offset == src->offset) ++ break; ++ ++ if (!src_aglat) ++ return false; ++ ++ src_lat = src_aglat; ++ } ++ + for (src_val = src_lat->values; src_val; src_val = src_val->next) + if (src_val == val) + break; +@@ -1887,6 +1904,8 @@ propagate_vals_across_arith_jfunc (cgrap + val_seeds.safe_push (src_val); + } + ++ gcc_assert ((int) val_seeds.length () ++ <= PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE)); + /* Recursively generate lattice values with a limited count. */ + FOR_EACH_VEC_ELT (val_seeds, i, src_val) + { +diff -Nurp a/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c +--- a/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c 2020-05-22 17:55:24.036000000 +0800 +@@ -0,0 +1,42 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining --param ipa-cp-max-recursive-depth=8 --param ipa-cp-eval-threshold=1" } */ ++ ++struct V { ++ int f0; ++ int f1; ++}; ++ ++int data[100]; ++ ++int fn (); ++ ++int recur_fn (struct V * __restrict v) ++{ ++ int i = v->f0; ++ int j = v->f1; ++ struct V t; ++ ++ if (j > 100) ++ { ++ fn (); ++ return 1; ++ } ++ ++ data[i] = i; ++ ++ t.f0 = i - 2; ++ t.f1 = j + 1; ++ ++ recur_fn (&t); ++ ++ return i * j; ++} ++ ++int main () ++{ ++ struct V v = {1, 3}; ++ ++ return recur_fn (&v); ++} ++ ++/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 8 "cp" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/ipa/pr93763.c b/gcc/testsuite/gcc.dg/ipa/pr93763.c +--- a/gcc/testsuite/gcc.dg/ipa/pr93763.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/ipa/pr93763.c 2020-05-22 17:57:10.532000000 +0800 +@@ -0,0 +1,46 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++typedef struct a a; ++struct a { ++ a *b ++} d; ++e, k, ah, al; ++f(aa) { ++ if (aa & 1) ++ goto g; ++ f(aa | 2); ++g: ++ h(); ++} ++l() { ++ { ++ f(072); ++ i(e, d, 92); ++ } ++} ++ag() { ++ { i(e, d, 36); } ++} ++ai(a *m, a *n, unsigned aa) { ++ f(aa); ++ j(k, l, ah, 1); ++} ++j(int c, a m, int aj, int aa) { ++ int ak = aa; ++ { i(e, d, ak); } ++} ++i(int c, a *m, unsigned aa) { ++ { ++ { i(c, (*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( ++*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( ++*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*m).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) ++.b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) ++.b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) 
++.b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) ++.b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b, 0); ++ } ++ } ++ int am = aa; ++ ai(ag, al, am); ++} +diff -Nurp a/gcc/testsuite/g++.dg/ipa/pr93763.C b/gcc/testsuite/g++.dg/ipa/pr93763.C +--- a/gcc/testsuite/g++.dg/ipa/pr93763.C 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/g++.dg/ipa/pr93763.C 2020-05-22 17:57:10.532000000 +0800 +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++struct search_param { ++ int total; ++}; ++void search_trivial(search_param error_left) { ++ search_trivial(error_left); ++ search_param error_left2{error_left}; ++ error_left2.total--; ++ search_trivial(error_left2); ++} ++void search_algo_uni(search_param error_left) { search_trivial(error_left); } ++void search_algo(search_param error_left) { search_algo_uni(error_left); } ++int main() { search_algo({}); return 0; } diff --git a/ipa-const-prop.patch b/ipa-const-prop.patch new file mode 100644 index 0000000..7cad13f --- /dev/null +++ b/ipa-const-prop.patch @@ -0,0 +1,11040 @@ +This backport contains 50 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +ipa-const-prop-2019-06-10-add-ignore-edge-func.patch: +commit 97e59627567757759b047479c75be2f238ea45c3 + +ipa-const-prop-2019-06-14-prop-by-ref-to-callee.patch: +commit 46771da57463c62f66af32e9189f1b6fb8bbe8c7 + +ipa-const-prop-2019-07-05-add-tbaa-para.patch: +ipa-const-prop-2019-07-05-add-tbaa-para-conflict-fix.patch +commit fb4697e30bd0cd4bda66932e21c183273a5d1e63 + +ipa-const-prop-2019-07-08-bugfix-drop-useless-instr.patch: +ipa-const-prop-2019-07-08-bugfix-drop-useless-instr-conflict-fix.patch +commit 38988cbf9ebaa96fb1e891a46aa063f0c298a2e2 + +ipa-const-prop-2019-07-09-ipa-cp-class-change.patch +ipa-const-prop-2019-07-09-ipa-fnsummary-class-change.patch +ipa-const-prop-2019-07-09-ipa-inline-analysis-class-change.patch +ipa-const-prop-2019-07-09-ipa-prop-class-change.patch +ipa-const-prop-2019-07-09-ipa-prop-class-change-conflic-fix.patch +ipa-const-prop-2019-07-09-ipa-predicate-class-change.patch +commit 99b1c316ec974a39bdd949f8559bb28861b69592 + +ipa-const-prop-2019-08-07-change-to-poly_64.patch: +commit 8600364582f24d2a3f227111c6a87b7d98561c69 + +ipa-const-prop-2019-08-12-bugfix-add-condition-fix.patch: +commit 52c9b7face987062527c612e0a65f084e43c85fd + +ipa-const-prop-2019-09-17-new-para-ipa-max-switch.patch: +commit 351e7c3b5fbd45bde3efb601f7fee9a31c4f2063 + +ipa-const-prop-2019-09-19-auto-switch-predicate.patch: +commit efe126563bb8d28cb3958423a735d0021e75702f + +ipa-const-prop-2019-10-03-generate-ipa-on-para-ref.patch: +commit 4307a485c39fd1c317d6cead2707a903052c4753 + +ipa-const-prop-2019-10-05-inline-size-para-change.patch: +commit 6c291ad828fcb5f01a1d2cb23f6078e9a6f958b9 + +ipa-const-prop-2019-10-10-bugfix-20040708-split-splay-tree.patch: +commit 6488759f404f3aff6642b005242a9c82a1c2cee2 + +ipa-const-prop-2019-10-23-bugfix-20040708-fix-uid-func.patch: +commit b5b6485f1cc54f21713b5b03c5d63d56839ca458 + +ipa-const-prop-2019-10-23-bugfix-20040708-fix-uid-func-2nd.patch: +commit 45012be1f5c7e6039e594bab41ebb94d89a9aca0 + +ipa-const-prop-2019-10-24-toggle-static-write.patch: +commit abebffc609506176f8ba3f64533e15ece49446c0 + +ipa-const-prop-2019-10-25-bugfix-empty-edge-ICE.patch: +commit 
5a0236f8ca9d239bb62ef54c9273e6ca3f068f87 + +ipa-const-prop-2019-10-25-call-size-summary.patch: +ipa-const-prop-2019-10-25-call-size-summary-confict-fix.patch +commit f658ad3002a0afc8aa86d5646ee704921d969ebe + +ipa-const-prop-2019-10-27-bugfix-solve-LTO-ICE.patch: +commit b1e655646f5b0be3d146825c130690078a8601c3 + +ipa-const-prop-2019-10-27-do-not-move-jump.patch: +commit 051d8a5faa3b37b0dda84c8382174ee70d5b7992 + +ipa-const-prop-2019-10-27-drop-if-no-arg.patch: +commit a33c028eb38268b5084ebc4cc17a1cb64b3a838b + +ipa-const-prop-2019-10-27-update-sum-after-expand.patch: +commit a088d7b10f296dbd57bccbac1bfcf8abb207b034 + +ipa-const-prop-2019-10-30-remove-global.patch: +commit a62bfab5d2a332925fcf10c45b4c5d8ca499439d + +ipa-const-prop-2019-11-03-add-deplicate-form.patch: +commit ac6f2e594886e2209446114023ecdff96b0bd7c4 + +ipa-const-prop-2019-11-03-ipa-inline-analysis-conflict-fix.patch: +ipa-const-prop-2019-11-03-improve-efficiency-of-ipa-poly.patch: +commit 40a777e840f74dd5c19ea26c55d1248a335fd11b + +ipa-const-prop-2019-11-03-ipa-fnsummary-add-call-context.patch: +commit 1532500ecbe8dbf59bef498e46b447b3a6b0fa65 + +ipa-const-prop-2019-11-03-size-ahead-time.patch: +commit 360386c7ef1c3fa30de216b1d68ed6a27296fd80 + +ipa-const-prop-2019-11-04-ipa-inline-includes-ipa-utils.patch: +commit 2bc2379be5c98d34ecbb347b2abf059aa6d94499 + +ipa-const-prop-2019-11-09-add-ipacp-clone.patch: +commit 6cf67b62c8cda035dccaca2ae6ff94d560b37a6f + +ipa-const-prop-2019-11-09-call-nodeRef-on-func-sym.patch: +commit 2ee6e04aaecc856bced29711f9765660e0888994 + +ipa-const-prop-2019-11-13-bugfix-inline-check-before-flatten.patch: +commit 2895b172d56c355373b64517a3298a01a2f10ec0 + +ipa-const-prop-2019-11-13-bugfix-inline-empty-edge.patch: +commit 367c959f0303e11e0a6d875abba7d03c72686668 + +ipa-const-prop-2019-11-13-bugfix-inline-small-function.patch: +commit b914768c1968d924d77bbe3f4e707c6105f3682c + +ipa-const-prop-2019-11-13-bugfix-lto-ICE.patch: +commit d200a49f5c83fa0f2e7332aecf69b6ab4a51b052 + +ipa-const-prop-2019-11-13-fix-ipa-profile-indirect-call.patch: +commit 7b34a284cab5d533552c1df995a88f7167d243bd + +ipa-const-prop-2019-11-14-by-ref-const-prop.patch: +ipa-const-prop-2019-11-14-by-ref-const-prop-conflict-fix.patch +commit eb270950acbae6f70e3487a6e63a26c1294656b3 + +ipa-const-prop-2019-11-15-bugfix-segfault-with-null-top.patch: +commit 1c3c3f455021130c429f57b09ef39bc218bd7fff + +ipa-const-prop-2019-11-18-bugfix-ICE-null-edge.patch: +commit 8d890d37e0183735586c18f1f056deb5848617ca + +ipa-const-prop-2019-11-18-bug-fix-ICE.patch: +commit 8d890d37e0183735586c18f1f056deb5848617ca + +ipa-const-prop-2019-12-02-recusion-versioning.patch: +ipa-const-prop-2019-12-02-param-conflict-fix.patch +commit 9b14fc3326e087975653b1af8ac54114041cde51 + +The original of these commit can be found on + https://github.com/gcc-mirror/gcc + +Not all these commits are applied directly. If the commit node contains +code that affact other modules that unrelated to ipa constant propgation +optimization, the part that the optimization need is regrouped into +a small new patch, which usually named conflict-fix. 
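(For reference, and not part of any of the backported commits: the user-visible
effect of this series is that IPA-CP can now specialize self-recursive
functions, bounded by the new parameters ipa-cp-max-recursive-depth and
ipa-cp-min-recursive-probability documented in the invoke.texi hunk below.
The sketch that follows is illustrative only; the identifiers are made up for
this note, and it is modelled on the new test gcc.dg/ipa/ipa-clone-3.c added
above by the self-recursion bugfix patch.)

/* A self-recursive chain whose argument advances by a known constant step;
   the recursive cloning backported here may emit one specialized clone of
   recur per level, assuming the evaluation threshold is lowered as in the
   ipa-clone-3.c test.  */

int data[128];

__attribute__((noipa)) int leaf (int v)   /* opaque leaf, keeps a real call */
{
  return v;
}

static int recur (int i)
{
  if (i >= 8)                /* constant recursion depth */
    return leaf (i);
  data[i] = i;               /* every clone stores a known constant index */
  return recur (i + 1);      /* arithmetic pass-through jump function: i + 1 */
}

int main (void)
{
  return recur (0) - 8;      /* recur (0) evaluates to 8, so exit status 0 */
}

Compiled with -O3 -fdump-ipa-cp-details --param ipa-cp-max-recursive-depth=8
--param ipa-cp-eval-threshold=1, the cp dump can be expected to contain one
"Creating a specialized node of recur/..." line per recursion level, which is
the behaviour the new ipa-clone-3.c test checks for its recur_fn.
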
+ +diff -Nurp a/gcc/cgraphbuild.c b/gcc/cgraphbuild.c +--- a/gcc/cgraphbuild.c 2020-04-30 15:14:04.580000000 +0800 ++++ b/gcc/cgraphbuild.c 2020-04-30 15:14:56.584000000 +0800 +@@ -428,7 +428,7 @@ cgraph_edge::rebuild_edges (void) + node->record_stmt_references (gsi_stmt (gsi)); + } + record_eh_tables (node, cfun); +- gcc_assert (!node->global.inlined_to); ++ gcc_assert (!node->inlined_to); + return 0; + } + +diff -Nurp a/gcc/cgraph.c b/gcc/cgraph.c +--- a/gcc/cgraph.c 2020-04-30 15:14:04.576000000 +0800 ++++ b/gcc/cgraph.c 2020-04-30 15:14:56.584000000 +0800 +@@ -539,7 +539,7 @@ cgraph_node::get_create (tree decl) + { + cgraph_node *first_clone = cgraph_node::get (decl); + +- if (first_clone && !first_clone->global.inlined_to) ++ if (first_clone && !first_clone->inlined_to) + return first_clone; + + cgraph_node *node = cgraph_node::create (decl); +@@ -659,7 +659,7 @@ cgraph_node::get_for_asmname (tree asmna + node = node->next_sharing_asm_name) + { + cgraph_node *cn = dyn_cast (node); +- if (cn && !cn->global.inlined_to) ++ if (cn && !cn->inlined_to) + return cn; + } + return NULL; +@@ -1857,7 +1857,7 @@ cgraph_node::remove (void) + { + cgraph_node *n = cgraph_node::get (decl); + if (!n +- || (!n->clones && !n->clone_of && !n->global.inlined_to ++ || (!n->clones && !n->clone_of && !n->inlined_to + && ((symtab->global_info_ready || in_lto_p) + && (TREE_ASM_WRITTEN (n->decl) + || DECL_EXTERNAL (n->decl) +@@ -1888,7 +1888,7 @@ cgraph_node::mark_address_taken (void) + { + /* Indirect inlining can figure out that all uses of the address are + inlined. */ +- if (global.inlined_to) ++ if (inlined_to) + { + gcc_assert (cfun->after_inlining); + gcc_assert (callers->indirect_inlining_edge); +@@ -2012,10 +2012,10 @@ cgraph_node::dump (FILE *f) + + dump_base (f); + +- if (global.inlined_to) ++ if (inlined_to) + fprintf (f, " Function %s is inline copy in %s\n", + dump_name (), +- global.inlined_to->dump_name ()); ++ inlined_to->dump_name ()); + if (clone_of) + fprintf (f, " Clone of %s\n", clone_of->dump_asm_name ()); + if (symtab->function_flags_ready) +@@ -2159,7 +2159,7 @@ cgraph_node::dump (FILE *f) + if (dyn_cast (ref->referring)->count.initialized_p ()) + sum += dyn_cast (ref->referring)->count.ipa (); + +- if (global.inlined_to ++ if (inlined_to + || (symtab->state < EXPANSION + && ultimate_alias_target () == this && only_called_directly_p ())) + ok = !count.ipa ().differs_from_p (sum); +@@ -2259,14 +2259,14 @@ cgraph_node::get_availability (symtab_no + { + cgraph_node *cref = dyn_cast (ref); + if (cref) +- ref = cref->global.inlined_to; ++ ref = cref->inlined_to; + } + enum availability avail; + if (!analyzed) + avail = AVAIL_NOT_AVAILABLE; + else if (local.local) + avail = AVAIL_LOCAL; +- else if (global.inlined_to) ++ else if (inlined_to) + avail = AVAIL_AVAILABLE; + else if (transparent_alias) + ultimate_alias_target (&avail, ref); +@@ -2878,7 +2878,7 @@ bool + cgraph_node::will_be_removed_from_program_if_no_direct_calls_p + (bool will_inline) + { +- gcc_assert (!global.inlined_to); ++ gcc_assert (!inlined_to); + if (DECL_EXTERNAL (decl)) + return true; + +@@ -3065,7 +3065,7 @@ cgraph_edge::verify_corresponds_to_fndec + { + cgraph_node *node; + +- if (!decl || callee->global.inlined_to) ++ if (!decl || callee->inlined_to) + return false; + if (symtab->state == LTO_STREAMING) + return false; +@@ -3126,7 +3126,7 @@ cgraph_node::verify_node (void) + error ("cgraph count invalid"); + error_found = true; + } +- if (global.inlined_to && same_comdat_group) ++ if (inlined_to && same_comdat_group) 
+ { + error ("inline clone in same comdat group list"); + error_found = true; +@@ -3136,17 +3136,17 @@ cgraph_node::verify_node (void) + error ("local symbols must be defined"); + error_found = true; + } +- if (global.inlined_to && externally_visible) ++ if (inlined_to && externally_visible) + { + error ("externally visible inline clone"); + error_found = true; + } +- if (global.inlined_to && address_taken) ++ if (inlined_to && address_taken) + { + error ("inline clone with address taken"); + error_found = true; + } +- if (global.inlined_to && force_output) ++ if (inlined_to && force_output) + { + error ("inline clone is forced to output"); + error_found = true; +@@ -3183,9 +3183,9 @@ cgraph_node::verify_node (void) + } + if (!e->inline_failed) + { +- if (global.inlined_to +- != (e->caller->global.inlined_to +- ? e->caller->global.inlined_to : e->caller)) ++ if (inlined_to ++ != (e->caller->inlined_to ++ ? e->caller->inlined_to : e->caller)) + { + error ("inlined_to pointer is wrong"); + error_found = true; +@@ -3197,7 +3197,7 @@ cgraph_node::verify_node (void) + } + } + else +- if (global.inlined_to) ++ if (inlined_to) + { + error ("inlined_to pointer set for noninline callers"); + error_found = true; +@@ -3208,7 +3208,7 @@ cgraph_node::verify_node (void) + if (e->verify_count ()) + error_found = true; + if (gimple_has_body_p (e->caller->decl) +- && !e->caller->global.inlined_to ++ && !e->caller->inlined_to + && !e->speculative + /* Optimized out calls are redirected to __builtin_unreachable. */ + && (e->count.nonzero_p () +@@ -3233,7 +3233,7 @@ cgraph_node::verify_node (void) + if (e->verify_count ()) + error_found = true; + if (gimple_has_body_p (e->caller->decl) +- && !e->caller->global.inlined_to ++ && !e->caller->inlined_to + && !e->speculative + && e->count.ipa_p () + && count +@@ -3250,12 +3250,12 @@ cgraph_node::verify_node (void) + error_found = true; + } + } +- if (!callers && global.inlined_to) ++ if (!callers && inlined_to) + { + error ("inlined_to pointer is set but no predecessors found"); + error_found = true; + } +- if (global.inlined_to == this) ++ if (inlined_to == this) + { + error ("inlined_to pointer refers to itself"); + error_found = true; +@@ -3344,7 +3344,7 @@ cgraph_node::verify_node (void) + error ("More than one edge out of thunk node"); + error_found = true; + } +- if (gimple_has_body_p (decl) && !global.inlined_to) ++ if (gimple_has_body_p (decl) && !inlined_to) + { + error ("Thunk is not supposed to have body"); + error_found = true; +@@ -3352,7 +3352,7 @@ cgraph_node::verify_node (void) + } + else if (analyzed && gimple_has_body_p (decl) + && !TREE_ASM_WRITTEN (decl) +- && (!DECL_EXTERNAL (decl) || global.inlined_to) ++ && (!DECL_EXTERNAL (decl) || inlined_to) + && !flag_wpa) + { + if (this_cfun->cfg) +@@ -3623,7 +3623,7 @@ cgraph_node::get_body (void) + early. + TODO: Materializing clones here will likely lead to smaller LTRANS + footprint. */ +- gcc_assert (!global.inlined_to && !clone_of); ++ gcc_assert (!inlined_to && !clone_of); + if (ipa_transforms_to_apply.exists ()) + { + opt_pass *saved_current_pass = current_pass; +@@ -3813,8 +3813,8 @@ cgraph_node::has_thunk_p (cgraph_node *n + sreal + cgraph_edge::sreal_frequency () + { +- return count.to_sreal_scale (caller->global.inlined_to +- ? caller->global.inlined_to->count ++ return count.to_sreal_scale (caller->inlined_to ++ ? 
caller->inlined_to->count + : caller->count); + } + +diff -Nurp a/gcc/cgraphclones.c b/gcc/cgraphclones.c +--- a/gcc/cgraphclones.c 2020-04-30 15:14:04.644000000 +0800 ++++ b/gcc/cgraphclones.c 2020-04-30 15:14:56.628000000 +0800 +@@ -458,8 +458,7 @@ cgraph_node::create_clone (tree new_decl + new_node->externally_visible = false; + new_node->no_reorder = no_reorder; + new_node->local.local = true; +- new_node->global = global; +- new_node->global.inlined_to = new_inlined_to; ++ new_node->inlined_to = new_inlined_to; + new_node->rtl = rtl; + new_node->frequency = frequency; + new_node->tp_first_run = tp_first_run; +@@ -671,6 +670,7 @@ cgraph_node::create_virtual_clone (vecipcp_clone = ipcp_clone; + new_node->clone.tree_map = tree_map; + if (!implicit_section) + new_node->set_section (get_section ()); +@@ -965,7 +965,7 @@ cgraph_node::create_version_clone (tree + new_version->externally_visible = false; + new_version->no_reorder = no_reorder; + new_version->local.local = new_version->definition; +- new_version->global = global; ++ new_version->inlined_to = inlined_to; + new_version->rtl = rtl; + new_version->count = count; + +diff -Nurp a/gcc/cgraph.h b/gcc/cgraph.h +--- a/gcc/cgraph.h 2020-04-30 15:14:04.624000000 +0800 ++++ b/gcc/cgraph.h 2020-04-30 15:14:56.628000000 +0800 +@@ -718,15 +718,6 @@ struct GTY(()) cgraph_local_info { + unsigned tm_may_enter_irr : 1; + }; + +-/* Information about the function that needs to be computed globally +- once compilation is finished. Available only with -funit-at-a-time. */ +- +-struct GTY(()) cgraph_global_info { +- /* For inline clones this points to the function they will be +- inlined into. */ +- cgraph_node *inlined_to; +-}; +- + /* Represent which DECL tree (or reference to such tree) + will be replaced by another tree while versioning. */ + struct GTY(()) ipa_replace_map +@@ -959,7 +950,7 @@ public: + + If the new node is being inlined into another one, NEW_INLINED_TO should be + the outline function the new one is (even indirectly) inlined to. +- All hooks will see this in node's global.inlined_to, when invoked. ++ All hooks will see this in node's inlined_to, when invoked. + Can be NULL if the node is not inlined. SUFFIX is string that is appended + to the original name. */ + cgraph_node *create_clone (tree decl, profile_count count, +@@ -1420,7 +1411,11 @@ public: + vec GTY((skip)) ipa_transforms_to_apply; + + cgraph_local_info local; +- cgraph_global_info global; ++ ++ /* For inline clones this points to the function they will be ++ inlined into. */ ++ cgraph_node *inlined_to; ++ + struct cgraph_rtl_info *rtl; + cgraph_clone_info clone; + cgraph_thunk_info thunk; +@@ -1474,6 +1469,8 @@ public: + unsigned split_part : 1; + /* True if the function appears as possible target of indirect call. */ + unsigned indirect_call_target : 1; ++ /* True if this was a clone created by ipa-cp. */ ++ unsigned ipcp_clone : 1; + + private: + /* Unique id of the node. 
*/ +@@ -2474,7 +2471,7 @@ symtab_node::real_symbol_p (void) + if (!is_a (this)) + return true; + cnode = dyn_cast (this); +- if (cnode->global.inlined_to) ++ if (cnode->inlined_to) + return false; + return true; + } +@@ -2497,13 +2494,13 @@ symtab_node::in_same_comdat_group_p (sym + + if (cgraph_node *cn = dyn_cast (target)) + { +- if (cn->global.inlined_to) +- source = cn->global.inlined_to; ++ if (cn->inlined_to) ++ source = cn->inlined_to; + } + if (cgraph_node *cn = dyn_cast (target)) + { +- if (cn->global.inlined_to) +- target = cn->global.inlined_to; ++ if (cn->inlined_to) ++ target = cn->inlined_to; + } + + return source->get_comdat_group () == target->get_comdat_group (); +@@ -2964,7 +2961,7 @@ struct GTY((for_user)) constant_descript + inline bool + cgraph_node::only_called_directly_or_aliased_p (void) + { +- gcc_assert (!global.inlined_to); ++ gcc_assert (!inlined_to); + return (!force_output && !address_taken + && !ifunc_resolver + && !used_from_other_partition +@@ -2981,7 +2978,7 @@ cgraph_node::only_called_directly_or_ali + inline bool + cgraph_node::can_remove_if_no_direct_calls_and_refs_p (void) + { +- gcc_checking_assert (!global.inlined_to); ++ gcc_checking_assert (!inlined_to); + /* Extern inlines can always go, we will use the external definition. */ + if (DECL_EXTERNAL (decl)) + return true; +@@ -3152,8 +3149,8 @@ inline bool + cgraph_edge::recursive_p (void) + { + cgraph_node *c = callee->ultimate_alias_target (); +- if (caller->global.inlined_to) +- return caller->global.inlined_to->decl == c->decl; ++ if (caller->inlined_to) ++ return caller->inlined_to->decl == c->decl; + else + return caller->decl == c->decl; + } +@@ -3190,8 +3187,8 @@ cgraph_edge::binds_to_current_def_p () + inline int + cgraph_edge::frequency () + { +- return count.to_cgraph_frequency (caller->global.inlined_to +- ? caller->global.inlined_to->count ++ return count.to_cgraph_frequency (caller->inlined_to ++ ? caller->inlined_to->count + : caller->count); + } + +@@ -3213,7 +3210,7 @@ inline void + cgraph_node::mark_force_output (void) + { + force_output = 1; +- gcc_checking_assert (!global.inlined_to); ++ gcc_checking_assert (!inlined_to); + } + + /* Return true if function should be optimized for size. */ +diff -Nurp a/gcc/cgraphunit.c b/gcc/cgraphunit.c +--- a/gcc/cgraphunit.c 2020-04-30 15:14:04.592000000 +0800 ++++ b/gcc/cgraphunit.c 2020-04-30 15:14:56.584000000 +0800 +@@ -340,7 +340,10 @@ symbol_table::process_new_functions (voi + and splitting. This is redundant for functions added late. + Just throw away whatever it did. */ + if (!summaried_computed) +- ipa_free_fn_summary (); ++ { ++ ipa_free_fn_summary (); ++ ipa_free_size_summary (); ++ } + } + else if (ipa_fn_summaries != NULL) + compute_fn_summary (node, true); +@@ -389,7 +392,7 @@ cgraph_node::reset (void) + + /* Reset our data structures so we can analyze the function again. */ + memset (&local, 0, sizeof (local)); +- memset (&global, 0, sizeof (global)); ++ inlined_to = NULL; + memset (&rtl, 0, sizeof (rtl)); + analyzed = false; + definition = false; +@@ -1504,7 +1507,7 @@ mark_functions_to_output (void) + if (node->analyzed + && !node->thunk.thunk_p + && !node->alias +- && !node->global.inlined_to ++ && !node->inlined_to + && !TREE_ASM_WRITTEN (decl) + && !DECL_EXTERNAL (decl)) + { +@@ -1529,7 +1532,7 @@ mark_functions_to_output (void) + { + /* We should've reclaimed all functions that are not needed. 
*/ + if (flag_checking +- && !node->global.inlined_to ++ && !node->inlined_to + && gimple_has_body_p (decl) + /* FIXME: in ltrans unit when offline copy is outside partition but inline copies + are inside partition, we can end up not removing the body since we no longer +@@ -1542,7 +1545,7 @@ mark_functions_to_output (void) + node->debug (); + internal_error ("failed to reclaim unneeded function"); + } +- gcc_assert (node->global.inlined_to ++ gcc_assert (node->inlined_to + || !gimple_has_body_p (decl) + || node->in_other_partition + || node->clones +@@ -1557,7 +1560,7 @@ mark_functions_to_output (void) + if (node->same_comdat_group && !node->process) + { + tree decl = node->decl; +- if (!node->global.inlined_to ++ if (!node->inlined_to + && gimple_has_body_p (decl) + /* FIXME: in an ltrans unit when the offline copy is outside a + partition but inline copies are inside a partition, we can +@@ -2118,7 +2121,7 @@ cgraph_node::assemble_thunks_and_aliases + + for (e = callers; e;) + if (e->caller->thunk.thunk_p +- && !e->caller->global.inlined_to) ++ && !e->caller->inlined_to) + { + cgraph_node *thunk = e->caller; + +@@ -2155,7 +2158,7 @@ cgraph_node::expand (void) + location_t saved_loc; + + /* We ought to not compile any inline clones. */ +- gcc_assert (!global.inlined_to); ++ gcc_assert (!inlined_to); + + /* __RTL functions are compiled as soon as they are parsed, so don't + do it again. */ +@@ -2707,7 +2710,7 @@ symbol_table::compile (void) + bool error_found = false; + + FOR_EACH_DEFINED_FUNCTION (node) +- if (node->global.inlined_to ++ if (node->inlined_to + || gimple_has_body_p (node->decl)) + { + error_found = true; +diff -Nurp a/gcc/data-streamer.h b/gcc/data-streamer.h +--- a/gcc/data-streamer.h 2020-04-30 15:14:04.648000000 +0800 ++++ b/gcc/data-streamer.h 2020-04-30 15:14:56.504000000 +0800 +@@ -53,6 +53,7 @@ HOST_WIDE_INT bp_unpack_var_len_int (str + void streamer_write_zero (struct output_block *); + void streamer_write_uhwi (struct output_block *, unsigned HOST_WIDE_INT); + void streamer_write_hwi (struct output_block *, HOST_WIDE_INT); ++void streamer_write_poly_uint64 (struct output_block *, poly_uint64); + void streamer_write_gcov_count (struct output_block *, gcov_type); + void streamer_write_string (struct output_block *, struct lto_output_stream *, + const char *, bool); +@@ -82,6 +83,7 @@ const char *bp_unpack_indexed_string (st + const char *bp_unpack_string (struct data_in *, struct bitpack_d *); + unsigned HOST_WIDE_INT streamer_read_uhwi (struct lto_input_block *); + HOST_WIDE_INT streamer_read_hwi (struct lto_input_block *); ++poly_uint64 streamer_read_poly_uint64 (struct lto_input_block *); + gcov_type streamer_read_gcov_count (struct lto_input_block *); + wide_int streamer_read_wide_int (struct lto_input_block *); + widest_int streamer_read_widest_int (struct lto_input_block *); +diff -Nurp a/gcc/data-streamer-in.c b/gcc/data-streamer-in.c +--- a/gcc/data-streamer-in.c 2020-04-30 15:14:04.628000000 +0800 ++++ b/gcc/data-streamer-in.c 2020-04-30 15:14:56.504000000 +0800 +@@ -175,6 +175,17 @@ streamer_read_hwi (struct lto_input_bloc + } + } + ++/* Read a poly_uint64 from IB. */ ++ ++poly_uint64 ++streamer_read_poly_uint64 (class lto_input_block *ib) ++{ ++ poly_uint64 res; ++ for (unsigned int i = 0; i < NUM_POLY_INT_COEFFS; ++i) ++ res.coeffs[i] = streamer_read_uhwi (ib); ++ return res; ++} ++ + /* Read gcov_type value from IB. 
*/ + + gcov_type +diff -Nurp a/gcc/data-streamer-out.c b/gcc/data-streamer-out.c +--- a/gcc/data-streamer-out.c 2020-04-30 15:14:04.600000000 +0800 ++++ b/gcc/data-streamer-out.c 2020-04-30 15:14:56.504000000 +0800 +@@ -220,6 +220,15 @@ streamer_write_hwi (struct output_block + streamer_write_hwi_stream (ob->main_stream, work); + } + ++/* Write a poly_uint64 value WORK to OB->main_stream. */ ++ ++void ++streamer_write_poly_uint64 (struct output_block *ob, poly_uint64 work) ++{ ++ for (int i = 0; i < NUM_POLY_INT_COEFFS; ++i) ++ streamer_write_uhwi_stream (ob->main_stream, work.coeffs[i]); ++} ++ + /* Write a gcov counter value WORK to OB->main_stream. */ + + void +diff -Nurp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +--- a/gcc/doc/invoke.texi 2020-04-30 15:14:04.664000000 +0800 ++++ b/gcc/doc/invoke.texi 2020-04-30 15:14:56.692000000 +0800 +@@ -11836,6 +11836,13 @@ IPA-CP calculates its own score of cloni + and performs those cloning opportunities with scores that exceed + @option{ipa-cp-eval-threshold}. + ++@item ipa-cp-max-recursive-depth ++Maximum depth of recursive cloning for self-recursive function. ++ ++@item ipa-cp-min-recursive-probability ++Recursive cloning only when the probability of call being executed exceeds ++the parameter. ++ + @item ipa-cp-recursion-penalty + Percentage penalty the recursive functions will receive when they + are evaluated for cloning. +diff -Nurp a/gcc/gimple-fold.c b/gcc/gimple-fold.c +--- a/gcc/gimple-fold.c 2020-04-30 15:14:04.632000000 +0800 ++++ b/gcc/gimple-fold.c 2020-04-30 15:14:56.584000000 +0800 +@@ -135,7 +135,7 @@ can_refer_decl_in_current_unit_p (tree d + if (!snode || !snode->definition) + return false; + node = dyn_cast (snode); +- return !node || !node->global.inlined_to; ++ return !node || !node->inlined_to; + } + + /* We will later output the initializer, so we can refer to it. +@@ -184,7 +184,7 @@ can_refer_decl_in_current_unit_p (tree d + || (!snode->forced_by_abi && !snode->force_output)))) + return false; + node = dyn_cast (snode); +- return !node || !node->global.inlined_to; ++ return !node || !node->inlined_to; + } + + /* Create a temporary for TYPE for a statement STMT. If the current function +diff -Nurp a/gcc/ipa.c b/gcc/ipa.c +--- a/gcc/ipa.c 2020-04-30 15:14:04.636000000 +0800 ++++ b/gcc/ipa.c 2020-04-30 15:14:56.588000000 +0800 +@@ -71,9 +71,9 @@ update_inlined_to_pointer (struct cgraph + { + struct cgraph_edge *e; + for (e = node->callees; e; e = e->next_callee) +- if (e->callee->global.inlined_to) ++ if (e->callee->inlined_to) + { +- e->callee->global.inlined_to = inlined_to; ++ e->callee->inlined_to = inlined_to; + update_inlined_to_pointer (e->callee, inlined_to); + } + } +@@ -335,11 +335,11 @@ symbol_table::remove_unreachable_nodes ( + node->used_as_abstract_origin = false; + node->indirect_call_target = false; + if (node->definition +- && !node->global.inlined_to ++ && !node->inlined_to + && !node->in_other_partition + && !node->can_remove_if_no_direct_calls_and_refs_p ()) + { +- gcc_assert (!node->global.inlined_to); ++ gcc_assert (!node->inlined_to); + reachable.add (node); + enqueue_node (node, &first, &reachable); + } +@@ -451,7 +451,7 @@ symbol_table::remove_unreachable_nodes ( + + /* When inline clone exists, mark body to be preserved so when removing + offline copy of the function we don't kill it. 
*/ +- if (cnode->global.inlined_to) ++ if (cnode->inlined_to) + body_needed_for_clonning.add (cnode->decl); + + /* For non-inline clones, force their origins to the boundary and ensure +@@ -560,11 +560,11 @@ symbol_table::remove_unreachable_nodes ( + to turn it into normal cone. */ + FOR_EACH_FUNCTION (node) + { +- if (node->global.inlined_to ++ if (node->inlined_to + && !node->callers) + { + gcc_assert (node->clones); +- node->global.inlined_to = NULL; ++ node->inlined_to = NULL; + update_inlined_to_pointer (node, node); + } + node->aux = NULL; +@@ -1207,8 +1207,8 @@ propagate_single_user (varpool_node *vno + struct cgraph_node *cnode = dyn_cast (ref->referring); + if (cnode) + { +- if (cnode->global.inlined_to) +- cnode = cnode->global.inlined_to; ++ if (cnode->inlined_to) ++ cnode = cnode->inlined_to; + if (!function) + function = cnode; + else if (function != cnode) +diff -Nurp a/gcc/ipa-comdats.c b/gcc/ipa-comdats.c +--- a/gcc/ipa-comdats.c 2020-04-30 15:14:04.612000000 +0800 ++++ b/gcc/ipa-comdats.c 2020-04-30 15:14:56.584000000 +0800 +@@ -98,8 +98,8 @@ propagate_comdat_group (struct symtab_no + + if (cgraph_node * cn = dyn_cast (symbol2)) + { +- if (cn->global.inlined_to) +- symbol2 = cn->global.inlined_to; ++ if (cn->inlined_to) ++ symbol2 = cn->inlined_to; + } + + /* The actual merge operation. */ +@@ -133,8 +133,8 @@ propagate_comdat_group (struct symtab_no + /* If we see inline clone, its comdat group actually + corresponds to the comdat group of the function it + is inlined to. */ +- if (cn->global.inlined_to) +- symbol2 = cn->global.inlined_to; ++ if (cn->inlined_to) ++ symbol2 = cn->inlined_to; + } + + /* The actual merge operation. */ +diff -Nurp a/gcc/ipa-cp.c b/gcc/ipa-cp.c +--- a/gcc/ipa-cp.c 2020-04-30 15:14:04.592000000 +0800 ++++ b/gcc/ipa-cp.c 2020-04-30 15:14:56.700000000 +0800 +@@ -229,7 +229,9 @@ public: + inline bool set_contains_variable (); + bool add_value (valtype newval, cgraph_edge *cs, + ipcp_value *src_val = NULL, +- int src_idx = 0, HOST_WIDE_INT offset = -1); ++ int src_idx = 0, HOST_WIDE_INT offset = -1, ++ ipcp_value **val_p = NULL, ++ bool unlimited = false); + void print (FILE * f, bool dump_sources, bool dump_benefits); + }; + +@@ -381,8 +383,8 @@ static hash_map + + /* Return the param lattices structure corresponding to the Ith formal + parameter of the function described by INFO. */ +-static inline struct ipcp_param_lattices * +-ipa_get_parm_lattices (struct ipa_node_params *info, int i) ++static inline class ipcp_param_lattices * ++ipa_get_parm_lattices (class ipa_node_params *info, int i) + { + gcc_assert (i >= 0 && i < ipa_get_param_count (info)); + gcc_checking_assert (!info->ipcp_orig_node); +@@ -393,18 +395,18 @@ ipa_get_parm_lattices (struct ipa_node_p + /* Return the lattice corresponding to the scalar value of the Ith formal + parameter of the function described by INFO. */ + static inline ipcp_lattice * +-ipa_get_scalar_lat (struct ipa_node_params *info, int i) ++ipa_get_scalar_lat (class ipa_node_params *info, int i) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + return &plats->itself; + } + + /* Return the lattice corresponding to the scalar value of the Ith formal + parameter of the function described by INFO. 
*/ + static inline ipcp_lattice * +-ipa_get_poly_ctx_lat (struct ipa_node_params *info, int i) ++ipa_get_poly_ctx_lat (class ipa_node_params *info, int i) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + return &plats->ctxlat; + } + +@@ -539,7 +541,7 @@ print_all_lattices (FILE * f, bool dump_ + fprintf (f, "\nLattices:\n"); + FOR_EACH_FUNCTION_WITH_GIMPLE_BODY (node) + { +- struct ipa_node_params *info; ++ class ipa_node_params *info; + + info = IPA_NODE_REF (node); + /* Skip constprop clones since we don't make lattices for them. */ +@@ -550,7 +552,7 @@ print_all_lattices (FILE * f, bool dump_ + for (i = 0; i < count; i++) + { + struct ipcp_agg_lattice *aglat; +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + fprintf (f, " param [%d]: ", i); + plats->itself.print (f, dump_sources, dump_benefits); + fprintf (f, " ctxs: "); +@@ -585,7 +587,7 @@ print_all_lattices (FILE * f, bool dump_ + + static void + determine_versionability (struct cgraph_node *node, +- struct ipa_node_params *info) ++ class ipa_node_params *info) + { + const char *reason = NULL; + +@@ -656,7 +658,7 @@ determine_versionability (struct cgraph_ + static bool + ipcp_versionable_function_p (struct cgraph_node *node) + { +- return IPA_NODE_REF (node)->versionable; ++ return IPA_NODE_REF (node) && IPA_NODE_REF (node)->versionable; + } + + /* Structure holding accumulated information about callers of a node. */ +@@ -731,7 +733,7 @@ ipcp_cloning_candidate_p (struct cgraph_ + init_caller_stats (&stats); + node->call_for_symbol_thunks_and_aliases (gather_caller_stats, &stats, false); + +- if (ipa_fn_summaries->get (node)->self_size < stats.n_calls) ++ if (ipa_size_summaries->get (node)->self_size < stats.n_calls) + { + if (dump_file) + fprintf (dump_file, "Considering %s for cloning; code might shrink.\n", +@@ -806,23 +808,39 @@ public: + {} + }; + ++/* Skip edges from and to nodes without ipa_cp enabled. ++ Ignore not available symbols. */ ++ ++static bool ++ignore_edge_p (cgraph_edge *e) ++{ ++ enum availability avail; ++ cgraph_node *ultimate_target ++ = e->callee->function_or_virtual_thunk_symbol (&avail, e->caller); ++ ++ return (avail <= AVAIL_INTERPOSABLE ++ || !opt_for_fn (e->caller->decl, flag_ipa_cp) ++ || !opt_for_fn (ultimate_target->decl, flag_ipa_cp)); ++} ++ + /* Allocate the arrays in TOPO and topologically sort the nodes into order. */ + + static void +-build_toporder_info (struct ipa_topo_info *topo) ++build_toporder_info (class ipa_topo_info *topo) + { + topo->order = XCNEWVEC (struct cgraph_node *, symtab->cgraph_count); + topo->stack = XCNEWVEC (struct cgraph_node *, symtab->cgraph_count); + + gcc_checking_assert (topo->stack_top == 0); +- topo->nnodes = ipa_reduced_postorder (topo->order, true, NULL); ++ topo->nnodes = ipa_reduced_postorder (topo->order, true, ++ ignore_edge_p); + } + + /* Free information about strongly connected components and the arrays in + TOPO. */ + + static void +-free_toporder_info (struct ipa_topo_info *topo) ++free_toporder_info (class ipa_topo_info *topo) + { + ipa_free_postorder_info (); + free (topo->order); +@@ -832,9 +850,9 @@ free_toporder_info (struct ipa_topo_info + /* Add NODE to the stack in TOPO, unless it is already there. 
*/ + + static inline void +-push_node_to_stack (struct ipa_topo_info *topo, struct cgraph_node *node) ++push_node_to_stack (class ipa_topo_info *topo, struct cgraph_node *node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + if (info->node_enqueued) + return; + info->node_enqueued = 1; +@@ -845,7 +863,7 @@ push_node_to_stack (struct ipa_topo_info + is empty. */ + + static struct cgraph_node * +-pop_node_from_stack (struct ipa_topo_info *topo) ++pop_node_from_stack (class ipa_topo_info *topo) + { + if (topo->stack_top) + { +@@ -887,7 +905,7 @@ ipcp_lattice::set_contains_vari + not previously set as such. */ + + static inline bool +-set_agg_lats_to_bottom (struct ipcp_param_lattices *plats) ++set_agg_lats_to_bottom (class ipcp_param_lattices *plats) + { + bool ret = !plats->aggs_bottom; + plats->aggs_bottom = true; +@@ -898,7 +916,7 @@ set_agg_lats_to_bottom (struct ipcp_para + return true if they were not previously marked as such. */ + + static inline bool +-set_agg_lats_contain_variable (struct ipcp_param_lattices *plats) ++set_agg_lats_contain_variable (class ipcp_param_lattices *plats) + { + bool ret = !plats->aggs_contain_variable; + plats->aggs_contain_variable = true; +@@ -1108,7 +1126,7 @@ ipcp_bits_lattice::meet_with (ipcp_bits_ + return true is any of them has not been marked as such so far. */ + + static inline bool +-set_all_contains_variable (struct ipcp_param_lattices *plats) ++set_all_contains_variable (class ipcp_param_lattices *plats) + { + bool ret; + ret = plats->itself.set_contains_variable (); +@@ -1158,7 +1176,7 @@ set_single_call_flag (cgraph_node *node, + static void + initialize_node_lattices (struct cgraph_node *node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + struct cgraph_edge *ie; + bool disable = false, variable = false; + int i; +@@ -1188,7 +1206,7 @@ initialize_node_lattices (struct cgraph_ + + for (i = 0; i < ipa_get_param_count (info); i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + plats->m_value_range.init (); + } + +@@ -1196,7 +1214,7 @@ initialize_node_lattices (struct cgraph_ + { + for (i = 0; i < ipa_get_param_count (info); i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + if (disable) + { + plats->itself.set_to_bottom (); +@@ -1224,23 +1242,23 @@ initialize_node_lattices (struct cgraph_ + } + } + +-/* Return the result of a (possibly arithmetic) pass through jump function +- JFUNC on the constant value INPUT. RES_TYPE is the type of the parameter +- to which the result is passed. Return NULL_TREE if that cannot be +- determined or be considered an interprocedural invariant. */ ++/* Return the result of a (possibly arithmetic) operation on the constant ++ value INPUT. OPERAND is 2nd operand for binary operation. RES_TYPE is ++ the type of the parameter to which the result is passed. Return ++ NULL_TREE if that cannot be determined or be considered an ++ interprocedural invariant. 
*/ + + static tree +-ipa_get_jf_pass_through_result (struct ipa_jump_func *jfunc, tree input, +- tree res_type) ++ipa_get_jf_arith_result (enum tree_code opcode, tree input, tree operand, ++ tree res_type) + { + tree res; + +- if (ipa_get_jf_pass_through_operation (jfunc) == NOP_EXPR) ++ if (opcode == NOP_EXPR) + return input; + if (!is_gimple_ip_invariant (input)) + return NULL_TREE; + +- tree_code opcode = ipa_get_jf_pass_through_operation (jfunc); + if (!res_type) + { + if (TREE_CODE_CLASS (opcode) == tcc_comparison) +@@ -1254,8 +1272,7 @@ ipa_get_jf_pass_through_result (struct i + if (TREE_CODE_CLASS (opcode) == tcc_unary) + res = fold_unary (opcode, res_type, input); + else +- res = fold_binary (opcode, res_type, input, +- ipa_get_jf_pass_through_operand (jfunc)); ++ res = fold_binary (opcode, res_type, input, operand); + + if (res && !is_gimple_ip_invariant (res)) + return NULL_TREE; +@@ -1263,6 +1280,21 @@ ipa_get_jf_pass_through_result (struct i + return res; + } + ++/* Return the result of a (possibly arithmetic) pass through jump function ++ JFUNC on the constant value INPUT. RES_TYPE is the type of the parameter ++ to which the result is passed. Return NULL_TREE if that cannot be ++ determined or be considered an interprocedural invariant. */ ++ ++static tree ++ipa_get_jf_pass_through_result (struct ipa_jump_func *jfunc, tree input, ++ tree res_type) ++{ ++ return ipa_get_jf_arith_result (ipa_get_jf_pass_through_operation (jfunc), ++ input, ++ ipa_get_jf_pass_through_operand (jfunc), ++ res_type); ++} ++ + /* Return the result of an ancestor jump function JFUNC on the constant value + INPUT. Return NULL_TREE if that cannot be determined. */ + +@@ -1289,7 +1321,7 @@ ipa_get_jf_ancestor_result (struct ipa_j + passed. */ + + tree +-ipa_value_from_jfunc (struct ipa_node_params *info, struct ipa_jump_func *jfunc, ++ipa_value_from_jfunc (class ipa_node_params *info, struct ipa_jump_func *jfunc, + tree parm_type) + { + if (jfunc->type == IPA_JF_CONST) +@@ -1396,6 +1428,146 @@ ipa_context_from_jfunc (ipa_node_params + return ctx; + } + ++/* See if NODE is a clone with a known aggregate value at a given OFFSET of a ++ parameter with the given INDEX. */ ++ ++static tree ++get_clone_agg_value (struct cgraph_node *node, HOST_WIDE_INT offset, ++ int index) ++{ ++ struct ipa_agg_replacement_value *aggval; ++ ++ aggval = ipa_get_agg_replacements_for_node (node); ++ while (aggval) ++ { ++ if (aggval->offset == offset ++ && aggval->index == index) ++ return aggval->value; ++ aggval = aggval->next; ++ } ++ return NULL_TREE; ++} ++ ++/* Determine whether ITEM, jump function for an aggregate part, evaluates to a ++ single known constant value and if so, return it. Otherwise return NULL. ++ NODE and INFO describes the caller node or the one it is inlined to, and ++ its related info. 
*/ ++ ++static tree ++ipa_agg_value_from_node (class ipa_node_params *info, ++ struct cgraph_node *node, ++ struct ipa_agg_jf_item *item) ++{ ++ tree value = NULL_TREE; ++ int src_idx; ++ ++ if (item->offset < 0 || item->jftype == IPA_JF_UNKNOWN) ++ return NULL_TREE; ++ ++ if (item->jftype == IPA_JF_CONST) ++ return item->value.constant; ++ ++ gcc_checking_assert (item->jftype == IPA_JF_PASS_THROUGH ++ || item->jftype == IPA_JF_LOAD_AGG); ++ ++ src_idx = item->value.pass_through.formal_id; ++ ++ if (info->ipcp_orig_node) ++ { ++ if (item->jftype == IPA_JF_PASS_THROUGH) ++ value = info->known_csts[src_idx]; ++ else ++ value = get_clone_agg_value (node, item->value.load_agg.offset, ++ src_idx); ++ } ++ else if (info->lattices) ++ { ++ class ipcp_param_lattices *src_plats ++ = ipa_get_parm_lattices (info, src_idx); ++ ++ if (item->jftype == IPA_JF_PASS_THROUGH) ++ { ++ struct ipcp_lattice *lat = &src_plats->itself; ++ ++ if (!lat->is_single_const ()) ++ return NULL_TREE; ++ ++ value = lat->values->value; ++ } ++ else if (src_plats->aggs ++ && !src_plats->aggs_bottom ++ && !src_plats->aggs_contain_variable ++ && src_plats->aggs_by_ref == item->value.load_agg.by_ref) ++ { ++ struct ipcp_agg_lattice *aglat; ++ ++ for (aglat = src_plats->aggs; aglat; aglat = aglat->next) ++ { ++ if (aglat->offset > item->value.load_agg.offset) ++ break; ++ ++ if (aglat->offset == item->value.load_agg.offset) ++ { ++ if (aglat->is_single_const ()) ++ value = aglat->values->value; ++ break; ++ } ++ } ++ } ++ } ++ ++ if (!value) ++ return NULL_TREE; ++ ++ if (item->jftype == IPA_JF_LOAD_AGG) ++ { ++ tree load_type = item->value.load_agg.type; ++ tree value_type = TREE_TYPE (value); ++ ++ /* Ensure value type is compatible with load type. */ ++ if (!useless_type_conversion_p (load_type, value_type)) ++ return NULL_TREE; ++ } ++ ++ return ipa_get_jf_arith_result (item->value.pass_through.operation, ++ value, ++ item->value.pass_through.operand, ++ item->type); ++} ++ ++/* Determine whether AGG_JFUNC evaluates to a set of known constant value for ++ an aggregate and if so, return it. Otherwise return an empty set. NODE ++ and INFO describes the caller node or the one it is inlined to, and its ++ related info. */ ++ ++struct ipa_agg_value_set ++ipa_agg_value_set_from_jfunc (class ipa_node_params *info, cgraph_node *node, ++ struct ipa_agg_jump_function *agg_jfunc) ++{ ++ struct ipa_agg_value_set agg; ++ struct ipa_agg_jf_item *item; ++ int i; ++ ++ agg.items = vNULL; ++ agg.by_ref = agg_jfunc->by_ref; ++ ++ FOR_EACH_VEC_SAFE_ELT (agg_jfunc->items, i, item) ++ { ++ tree value = ipa_agg_value_from_node (info, node, item); ++ ++ if (value) ++ { ++ struct ipa_agg_value value_item; ++ ++ value_item.offset = item->offset; ++ value_item.value = value; ++ ++ agg.items.safe_push (value_item); ++ } ++ } ++ return agg; ++} ++ + /* If checking is enabled, verify that no lattice is in the TOP state, i.e. not + bottom, not containing a variable component and without any known value at + the same time. */ +@@ -1407,7 +1579,9 @@ ipcp_verify_propagated_values (void) + + FOR_EACH_FUNCTION_WITH_GIMPLE_BODY (node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); ++ if (!opt_for_fn (node->decl, flag_ipa_cp)) ++ continue; + int i, count = ipa_get_param_count (info); + + for (i = 0; i < count; i++) +@@ -1516,22 +1690,32 @@ allocate_and_init_ipcp_value (ipa_polymo + /* Try to add NEWVAL to LAT, potentially creating a new ipcp_value for it. 
CS, + SRC_VAL SRC_INDEX and OFFSET are meant for add_source and have the same + meaning. OFFSET -1 means the source is scalar and not a part of an +- aggregate. */ ++ aggregate. If non-NULL, VAL_P records address of existing or newly added ++ ipcp_value. UNLIMITED means whether value count should not exceed the limit ++ given by PARAM_IPA_CP_VALUE_LIST_SIZE. */ + + template + bool + ipcp_lattice::add_value (valtype newval, cgraph_edge *cs, + ipcp_value *src_val, +- int src_idx, HOST_WIDE_INT offset) ++ int src_idx, HOST_WIDE_INT offset, ++ ipcp_value **val_p, ++ bool unlimited) + { +- ipcp_value *val; ++ ipcp_value *val, *last_val = NULL; ++ ++ if (val_p) ++ *val_p = NULL; + + if (bottom) + return false; + +- for (val = values; val; val = val->next) ++ for (val = values; val; last_val = val, val = val->next) + if (values_equal_for_ipcp_p (val->value, newval)) + { ++ if (val_p) ++ *val_p = val; ++ + if (ipa_edge_within_scc (cs)) + { + ipcp_value_source *s; +@@ -1546,7 +1730,7 @@ ipcp_lattice::add_value (valtyp + return false; + } + +- if (values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE)) ++ if (!unlimited && values_count == PARAM_VALUE (PARAM_IPA_CP_VALUE_LIST_SIZE)) + { + /* We can only free sources, not the values themselves, because sources + of other values in this SCC might point to them. */ +@@ -1559,7 +1743,6 @@ ipcp_lattice::add_value (valtyp + ipcp_sources_pool.remove ((ipcp_value_source*)src); + } + } +- + values = NULL; + return set_to_bottom (); + } +@@ -1567,41 +1750,177 @@ ipcp_lattice::add_value (valtyp + values_count++; + val = allocate_and_init_ipcp_value (newval); + val->add_source (cs, src_val, src_idx, offset); +- val->next = values; +- values = val; ++ val->next = NULL; ++ ++ /* Add the new value to end of value list, which can reduce iterations ++ of propagation stage for recursive function. */ ++ if (last_val) ++ last_val->next = val; ++ else ++ values = val; ++ ++ if (val_p) ++ *val_p = val; ++ + return true; + } + +-/* Propagate values through a pass-through jump function JFUNC associated with +- edge CS, taking values from SRC_LAT and putting them into DEST_LAT. SRC_IDX +- is the index of the source parameter. PARM_TYPE is the type of the +- parameter to which the result is passed. */ ++/* Return true, if a ipcp_value VAL is orginated from parameter value of ++ self-feeding recursive function by applying non-passthrough arithmetic ++ transformation. */ + + static bool +-propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc, +- ipcp_lattice *src_lat, +- ipcp_lattice *dest_lat, int src_idx, +- tree parm_type) ++self_recursively_generated_p (ipcp_value *val) ++{ ++ class ipa_node_params *info = NULL; ++ ++ for (ipcp_value_source *src = val->sources; src; src = src->next) ++ { ++ cgraph_edge *cs = src->cs; ++ ++ if (!src->val || cs->caller != cs->callee->function_symbol () ++ || src->val == val) ++ return false; ++ ++ if (!info) ++ info = IPA_NODE_REF (cs->caller); ++ ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, ++ src->index); ++ ipcp_lattice *src_lat = src->offset == -1 ? &plats->itself ++ : plats->aggs; ++ ipcp_value *src_val; ++ ++ for (src_val = src_lat->values; src_val; src_val = src_val->next) ++ if (src_val == val) ++ break; ++ ++ if (!src_val) ++ return false; ++ } ++ ++ return true; ++} ++ ++/* A helper function that returns result of operation specified by OPCODE on ++ the value of SRC_VAL. If non-NULL, OPND1_TYPE is expected type for the ++ value of SRC_VAL. 
If the operation is binary, OPND2 is a constant value ++ acting as its second operand. If non-NULL, RES_TYPE is expected type of ++ the result. */ ++ ++static tree ++get_val_across_arith_op (enum tree_code opcode, ++ tree opnd1_type, ++ tree opnd2, ++ ipcp_value *src_val, ++ tree res_type) ++{ ++ tree opnd1 = src_val->value; ++ ++ /* Skip source values that is incompatible with specified type. */ ++ if (opnd1_type ++ && !useless_type_conversion_p (opnd1_type, TREE_TYPE (opnd1))) ++ return NULL_TREE; ++ ++ return ipa_get_jf_arith_result (opcode, opnd1, opnd2, res_type); ++} ++ ++/* Propagate values through an arithmetic transformation described by a jump ++ function associated with edge CS, taking values from SRC_LAT and putting ++ them into DEST_LAT. OPND1_TYPE is expected type for the values in SRC_LAT. ++ OPND2 is a constant value if transformation is a binary operation. ++ SRC_OFFSET specifies offset in an aggregate if SRC_LAT describes lattice of ++ a part of the aggregate. SRC_IDX is the index of the source parameter. ++ RES_TYPE is the value type of result being propagated into. Return true if ++ DEST_LAT changed. */ ++ ++static bool ++propagate_vals_across_arith_jfunc (cgraph_edge *cs, ++ enum tree_code opcode, ++ tree opnd1_type, ++ tree opnd2, ++ ipcp_lattice *src_lat, ++ ipcp_lattice *dest_lat, ++ HOST_WIDE_INT src_offset, ++ int src_idx, ++ tree res_type) + { + ipcp_value *src_val; + bool ret = false; + +- /* Do not create new values when propagating within an SCC because if there +- are arithmetic functions with circular dependencies, there is infinite +- number of them and we would just make lattices bottom. If this condition +- is ever relaxed we have to detect self-feeding recursive calls in +- cgraph_edge_brings_value_p in a smarter way. */ +- if ((ipa_get_jf_pass_through_operation (jfunc) != NOP_EXPR) +- && ipa_edge_within_scc (cs)) +- ret = dest_lat->set_contains_variable (); ++ /* Due to circular dependencies, propagating within an SCC through arithmetic ++ transformation would create infinite number of values. But for ++ self-feeding recursive function, we could allow propagation in a limited ++ count, and this can enable a simple kind of recursive function versioning. ++ For other scenario, we would just make lattices bottom. */ ++ if (opcode != NOP_EXPR && ipa_edge_within_scc (cs)) ++ { ++ int i; ++ ++ if (src_lat != dest_lat || PARAM_VALUE(PARAM_IPA_CP_MAX_RECURSIVE_DEPTH) < 1) ++ return dest_lat->set_contains_variable (); ++ ++ /* No benefit if recursive execution is in low probability. */ ++ if (cs->sreal_frequency () * 100 ++ <= ((sreal) 1) * PARAM_VALUE(PARAM_IPA_CP_MIN_RECURSIVE_PROBABILITY)) ++ return dest_lat->set_contains_variable (); ++ ++ auto_vec *, 8> val_seeds; ++ ++ for (src_val = src_lat->values; src_val; src_val = src_val->next) ++ { ++ /* Now we do not use self-recursively generated value as propagation ++ source, this is absolutely conservative, but could avoid explosion ++ of lattice's value space, especially when one recursive function ++ calls another recursive. */ ++ if (self_recursively_generated_p (src_val)) ++ { ++ ipcp_value_source *s; ++ ++ /* If the lattice has already been propagated for the call site, ++ no need to do that again. */ ++ for (s = src_val->sources; s; s = s->next) ++ if (s->cs == cs) ++ return dest_lat->set_contains_variable (); ++ } ++ else ++ val_seeds.safe_push (src_val); ++ } ++ ++ /* Recursively generate lattice values with a limited count. 
*/ ++ FOR_EACH_VEC_ELT (val_seeds, i, src_val) ++ { ++ for (int j = 1; j < PARAM_VALUE(PARAM_IPA_CP_MAX_RECURSIVE_DEPTH); j++) ++ { ++ tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2, ++ src_val, res_type); ++ if (!cstval) ++ break; ++ ++ ret |= dest_lat->add_value (cstval, cs, src_val, src_idx, ++ src_offset, &src_val, true); ++ gcc_checking_assert (src_val); ++ } ++ } ++ ret |= dest_lat->set_contains_variable (); ++ } + else + for (src_val = src_lat->values; src_val; src_val = src_val->next) + { +- tree cstval = ipa_get_jf_pass_through_result (jfunc, src_val->value, +- parm_type); ++ /* Now we do not use self-recursively generated value as propagation ++ source, otherwise it is easy to make value space of normal lattice ++ overflow. */ ++ if (self_recursively_generated_p (src_val)) ++ { ++ ret |= dest_lat->set_contains_variable (); ++ continue; ++ } + ++ tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2, ++ src_val, res_type); + if (cstval) +- ret |= dest_lat->add_value (cstval, cs, src_val, src_idx); ++ ret |= dest_lat->add_value (cstval, cs, src_val, src_idx, ++ src_offset); + else + ret |= dest_lat->set_contains_variable (); + } +@@ -1609,6 +1928,24 @@ propagate_vals_across_pass_through (cgra + return ret; + } + ++/* Propagate values through a pass-through jump function JFUNC associated with ++ edge CS, taking values from SRC_LAT and putting them into DEST_LAT. SRC_IDX ++ is the index of the source parameter. PARM_TYPE is the type of the ++ parameter to which the result is passed. */ ++ ++static bool ++propagate_vals_across_pass_through (cgraph_edge *cs, ipa_jump_func *jfunc, ++ ipcp_lattice *src_lat, ++ ipcp_lattice *dest_lat, int src_idx, ++ tree parm_type) ++{ ++ return propagate_vals_across_arith_jfunc (cs, ++ ipa_get_jf_pass_through_operation (jfunc), ++ NULL_TREE, ++ ipa_get_jf_pass_through_operand (jfunc), ++ src_lat, dest_lat, -1, src_idx, parm_type); ++} ++ + /* Propagate values through an ancestor jump function JFUNC associated with + edge CS, taking values from SRC_LAT and putting them into DEST_LAT. SRC_IDX + is the index of the source parameter. */ +@@ -1659,7 +1996,7 @@ propagate_scalar_across_jump_function (s + else if (jfunc->type == IPA_JF_PASS_THROUGH + || jfunc->type == IPA_JF_ANCESTOR) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + ipcp_lattice *src_lat; + int src_idx; + bool ret; +@@ -1721,7 +2058,7 @@ propagate_context_across_jump_function ( + if (jfunc->type == IPA_JF_PASS_THROUGH + || jfunc->type == IPA_JF_ANCESTOR) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + int src_idx; + ipcp_lattice *src_lat; + +@@ -1769,7 +2106,6 @@ propagate_context_across_jump_function ( + added_sth = true; + } + } +- + } + + prop_fail: +@@ -1797,7 +2133,7 @@ propagate_bits_across_jump_function (cgr + + enum availability availability; + cgraph_node *callee = cs->callee->function_symbol (&availability); +- struct ipa_node_params *callee_info = IPA_NODE_REF (callee); ++ class ipa_node_params *callee_info = IPA_NODE_REF (callee); + tree parm_type = ipa_get_type (callee_info, idx); + + /* For K&R C programs, ipa_get_type() could return NULL_TREE. 
Avoid the +@@ -1820,7 +2156,7 @@ propagate_bits_across_jump_function (cgr + if (jfunc->type == IPA_JF_PASS_THROUGH + || jfunc->type == IPA_JF_ANCESTOR) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + tree operand = NULL_TREE; + enum tree_code code; + unsigned src_idx; +@@ -1840,7 +2176,7 @@ propagate_bits_across_jump_function (cgr + operand = build_int_cstu (size_type_node, offset); + } + +- struct ipcp_param_lattices *src_lats ++ class ipcp_param_lattices *src_lats + = ipa_get_parm_lattices (caller_info, src_idx); + + /* Try to propagate bits if src_lattice is bottom, but jfunc is known. +@@ -1894,7 +2230,7 @@ ipa_vr_operation_and_type_effects (value + + static bool + propagate_vr_across_jump_function (cgraph_edge *cs, ipa_jump_func *jfunc, +- struct ipcp_param_lattices *dest_plats, ++ class ipcp_param_lattices *dest_plats, + tree param_type) + { + ipcp_vr_lattice *dest_lat = &dest_plats->m_value_range; +@@ -1913,10 +2249,10 @@ propagate_vr_across_jump_function (cgrap + + if (TREE_CODE_CLASS (operation) == tcc_unary) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + int src_idx = ipa_get_jf_pass_through_formal_id (jfunc); + tree operand_type = ipa_get_type (caller_info, src_idx); +- struct ipcp_param_lattices *src_lats ++ class ipcp_param_lattices *src_lats + = ipa_get_parm_lattices (caller_info, src_idx); + + if (src_lats->m_value_range.bottom_p ()) +@@ -1959,7 +2295,7 @@ propagate_vr_across_jump_function (cgrap + aggs_by_ref to NEW_AGGS_BY_REF. */ + + static bool +-set_check_aggs_by_ref (struct ipcp_param_lattices *dest_plats, ++set_check_aggs_by_ref (class ipcp_param_lattices *dest_plats, + bool new_aggs_by_ref) + { + if (dest_plats->aggs) +@@ -1986,7 +2322,7 @@ set_check_aggs_by_ref (struct ipcp_param + true. */ + + static bool +-merge_agg_lats_step (struct ipcp_param_lattices *dest_plats, ++merge_agg_lats_step (class ipcp_param_lattices *dest_plats, + HOST_WIDE_INT offset, HOST_WIDE_INT val_size, + struct ipcp_agg_lattice ***aglat, + bool pre_existing, bool *change) +@@ -2064,8 +2400,8 @@ set_chain_of_aglats_contains_variable (s + + static bool + merge_aggregate_lattices (struct cgraph_edge *cs, +- struct ipcp_param_lattices *dest_plats, +- struct ipcp_param_lattices *src_plats, ++ class ipcp_param_lattices *dest_plats, ++ class ipcp_param_lattices *src_plats, + int src_idx, HOST_WIDE_INT offset_delta) + { + bool pre_existing = dest_plats->aggs != NULL; +@@ -2119,7 +2455,7 @@ merge_aggregate_lattices (struct cgraph_ + rules about propagating values passed by reference. */ + + static bool +-agg_pass_through_permissible_p (struct ipcp_param_lattices *src_plats, ++agg_pass_through_permissible_p (class ipcp_param_lattices *src_plats, + struct ipa_jump_func *jfunc) + { + return src_plats->aggs +@@ -2127,13 +2463,92 @@ agg_pass_through_permissible_p (struct i + || ipa_get_jf_pass_through_agg_preserved (jfunc)); + } + ++/* Propagate values through ITEM, jump function for a part of an aggregate, ++ into corresponding aggregate lattice AGLAT. CS is the call graph edge ++ associated with the jump function. Return true if AGLAT changed in any ++ way. 
*/ ++ ++static bool ++propagate_aggregate_lattice (struct cgraph_edge *cs, ++ struct ipa_agg_jf_item *item, ++ struct ipcp_agg_lattice *aglat) ++{ ++ class ipa_node_params *caller_info; ++ class ipcp_param_lattices *src_plats; ++ struct ipcp_lattice *src_lat; ++ HOST_WIDE_INT src_offset; ++ int src_idx; ++ tree load_type; ++ bool ret; ++ ++ if (item->jftype == IPA_JF_CONST) ++ { ++ tree value = item->value.constant; ++ ++ gcc_checking_assert (is_gimple_ip_invariant (value)); ++ return aglat->add_value (value, cs, NULL, 0); ++ } ++ ++ gcc_checking_assert (item->jftype == IPA_JF_PASS_THROUGH ++ || item->jftype == IPA_JF_LOAD_AGG); ++ ++ caller_info = IPA_NODE_REF (cs->caller); ++ src_idx = item->value.pass_through.formal_id; ++ src_plats = ipa_get_parm_lattices (caller_info, src_idx); ++ ++ if (item->jftype == IPA_JF_PASS_THROUGH) ++ { ++ load_type = NULL_TREE; ++ src_lat = &src_plats->itself; ++ src_offset = -1; ++ } ++ else ++ { ++ HOST_WIDE_INT load_offset = item->value.load_agg.offset; ++ struct ipcp_agg_lattice *src_aglat; ++ ++ for (src_aglat = src_plats->aggs; src_aglat; src_aglat = src_aglat->next) ++ if (src_aglat->offset >= load_offset) ++ break; ++ ++ load_type = item->value.load_agg.type; ++ if (!src_aglat ++ || src_aglat->offset > load_offset ++ || src_aglat->size != tree_to_shwi (TYPE_SIZE (load_type)) ++ || src_plats->aggs_by_ref != item->value.load_agg.by_ref) ++ return aglat->set_contains_variable (); ++ ++ src_lat = src_aglat; ++ src_offset = load_offset; ++ } ++ ++ if (src_lat->bottom ++ || (!ipcp_versionable_function_p (cs->caller) ++ && !src_lat->is_single_const ())) ++ return aglat->set_contains_variable (); ++ ++ ret = propagate_vals_across_arith_jfunc (cs, ++ item->value.pass_through.operation, ++ load_type, ++ item->value.pass_through.operand, ++ src_lat, aglat, ++ src_offset, ++ src_idx, ++ item->type); ++ ++ if (src_lat->contains_variable) ++ ret |= aglat->set_contains_variable (); ++ ++ return ret; ++} ++ + /* Propagate scalar values across jump function JFUNC that is associated with + edge CS and put the values into DEST_LAT. 
*/ + + static bool + propagate_aggs_across_jump_function (struct cgraph_edge *cs, + struct ipa_jump_func *jfunc, +- struct ipcp_param_lattices *dest_plats) ++ class ipcp_param_lattices *dest_plats) + { + bool ret = false; + +@@ -2143,9 +2558,9 @@ propagate_aggs_across_jump_function (str + if (jfunc->type == IPA_JF_PASS_THROUGH + && ipa_get_jf_pass_through_operation (jfunc) == NOP_EXPR) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + int src_idx = ipa_get_jf_pass_through_formal_id (jfunc); +- struct ipcp_param_lattices *src_plats; ++ class ipcp_param_lattices *src_plats; + + src_plats = ipa_get_parm_lattices (caller_info, src_idx); + if (agg_pass_through_permissible_p (src_plats, jfunc)) +@@ -2162,9 +2577,9 @@ propagate_aggs_across_jump_function (str + else if (jfunc->type == IPA_JF_ANCESTOR + && ipa_get_jf_ancestor_agg_preserved (jfunc)) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + int src_idx = ipa_get_jf_ancestor_formal_id (jfunc); +- struct ipcp_param_lattices *src_plats; ++ class ipcp_param_lattices *src_plats; + + src_plats = ipa_get_parm_lattices (caller_info, src_idx); + if (src_plats->aggs && src_plats->aggs_by_ref) +@@ -2194,15 +2609,14 @@ propagate_aggs_across_jump_function (str + { + HOST_WIDE_INT val_size; + +- if (item->offset < 0) ++ if (item->offset < 0 || item->jftype == IPA_JF_UNKNOWN) + continue; +- gcc_checking_assert (is_gimple_ip_invariant (item->value)); +- val_size = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (item->value))); ++ val_size = tree_to_shwi (TYPE_SIZE (item->type)); + + if (merge_agg_lats_step (dest_plats, item->offset, val_size, + &aglat, pre_existing, &ret)) + { +- ret |= (*aglat)->add_value (item->value, cs, NULL, 0, 0); ++ ret |= propagate_aggregate_lattice (cs, item, *aglat); + aglat = &(*aglat)->next; + } + else if (dest_plats->aggs_bottom) +@@ -2235,10 +2649,10 @@ call_passes_through_thunk_p (cgraph_edge + static bool + propagate_constants_across_call (struct cgraph_edge *cs) + { +- struct ipa_node_params *callee_info; ++ class ipa_node_params *callee_info; + enum availability availability; + cgraph_node *callee; +- struct ipa_edge_args *args; ++ class ipa_edge_args *args; + bool ret = false; + int i, args_count, parms_count; + +@@ -2247,12 +2661,21 @@ propagate_constants_across_call (struct + return false; + gcc_checking_assert (callee->has_gimple_body_p ()); + callee_info = IPA_NODE_REF (callee); ++ if (!callee_info) ++ return false; + + args = IPA_EDGE_REF (cs); +- args_count = ipa_get_cs_argument_count (args); + parms_count = ipa_get_param_count (callee_info); + if (parms_count == 0) + return false; ++ if (!args) ++ { ++ for (i = 0; i < parms_count; i++) ++ ret |= set_all_contains_variable (ipa_get_parm_lattices (callee_info, ++ i)); ++ return ret; ++ } ++ args_count = ipa_get_cs_argument_count (args); + + /* If this call goes through a thunk we must not propagate to the first (0th) + parameter. 
However, we might need to uncover a thunk from below a series +@@ -2269,7 +2692,7 @@ propagate_constants_across_call (struct + for (; (i < args_count) && (i < parms_count); i++) + { + struct ipa_jump_func *jump_func = ipa_get_ith_jump_func (args, i); +- struct ipcp_param_lattices *dest_plats; ++ class ipcp_param_lattices *dest_plats; + tree param_type = ipa_get_type (callee_info, i); + + dest_plats = ipa_get_parm_lattices (callee_info, i); +@@ -2308,7 +2731,7 @@ static tree + ipa_get_indirect_edge_target_1 (struct cgraph_edge *ie, + vec known_csts, + vec known_contexts, +- vec known_aggs, ++ vec known_aggs, + struct ipa_agg_replacement_value *agg_reps, + bool *speculative) + { +@@ -2346,9 +2769,9 @@ ipa_get_indirect_edge_target_1 (struct c + } + if (!t) + { +- struct ipa_agg_jump_function *agg; ++ struct ipa_agg_value_set *agg; + if (known_aggs.length () > (unsigned int) param_index) +- agg = known_aggs[param_index]; ++ agg = &known_aggs[param_index]; + else + agg = NULL; + bool from_global_constant; +@@ -2402,8 +2825,7 @@ ipa_get_indirect_edge_target_1 (struct c + if (!t && known_aggs.length () > (unsigned int) param_index + && !ie->indirect_info->by_ref) + { +- struct ipa_agg_jump_function *agg; +- agg = known_aggs[param_index]; ++ struct ipa_agg_value_set *agg = &known_aggs[param_index]; + t = ipa_find_agg_cst_for_param (agg, known_csts[param_index], + ie->indirect_info->offset, true); + } +@@ -2526,7 +2948,7 @@ tree + ipa_get_indirect_edge_target (struct cgraph_edge *ie, + vec known_csts, + vec known_contexts, +- vec known_aggs, ++ vec known_aggs, + bool *speculative) + { + return ipa_get_indirect_edge_target_1 (ie, known_csts, known_contexts, +@@ -2540,7 +2962,7 @@ static int + devirtualization_time_bonus (struct cgraph_node *node, + vec known_csts, + vec known_contexts, +- vec known_aggs) ++ vec known_aggs) + { + struct cgraph_edge *ie; + int res = 0; +@@ -2548,7 +2970,7 @@ devirtualization_time_bonus (struct cgra + for (ie = node->indirect_calls; ie; ie = ie->next_callee) + { + struct cgraph_node *callee; +- struct ipa_fn_summary *isummary; ++ class ipa_fn_summary *isummary; + enum availability avail; + tree target; + bool speculative; +@@ -2570,13 +2992,14 @@ devirtualization_time_bonus (struct cgra + if (!isummary || !isummary->inlinable) + continue; + ++ int size = ipa_size_summaries->get (callee)->size; + /* FIXME: The values below need re-considering and perhaps also + integrating into the cost metrics, at lest in some very basic way. 
*/ +- if (isummary->size <= MAX_INLINE_INSNS_AUTO / 4) ++ if (size <= MAX_INLINE_INSNS_AUTO / 4) + res += 31 / ((int)speculative + 1); +- else if (isummary->size <= MAX_INLINE_INSNS_AUTO / 2) ++ else if (size <= MAX_INLINE_INSNS_AUTO / 2) + res += 15 / ((int)speculative + 1); +- else if (isummary->size <= MAX_INLINE_INSNS_AUTO ++ else if (size <= MAX_INLINE_INSNS_AUTO + || DECL_DECLARED_INLINE_P (callee->decl)) + res += 7 / ((int)speculative + 1); + } +@@ -2601,7 +3024,7 @@ hint_time_bonus (ipa_hints hints) + static inline int64_t + incorporate_penalties (ipa_node_params *info, int64_t evaluation) + { +- if (info->node_within_scc) ++ if (info->node_within_scc && !info->node_is_self_scc) + evaluation = (evaluation + * (100 - PARAM_VALUE (PARAM_IPA_CP_RECURSION_PENALTY))) / 100; + +@@ -2628,7 +3051,7 @@ good_cloning_opportunity_p (struct cgrap + + gcc_assert (size_cost > 0); + +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + if (max_count > profile_count::zero ()) + { + int factor = RDIV (count_sum.probability_in +@@ -2645,7 +3068,8 @@ good_cloning_opportunity_p (struct cgrap + count_sum.dump (dump_file); + fprintf (dump_file, "%s%s) -> evaluation: " "%" PRId64 + ", threshold: %i\n", +- info->node_within_scc ? ", scc" : "", ++ info->node_within_scc ++ ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "", + info->node_calling_single_call ? ", single_call" : "", + evaluation, PARAM_VALUE (PARAM_IPA_CP_EVAL_THRESHOLD)); + } +@@ -2663,7 +3087,8 @@ good_cloning_opportunity_p (struct cgrap + "size: %i, freq_sum: %i%s%s) -> evaluation: " + "%" PRId64 ", threshold: %i\n", + time_benefit, size_cost, freq_sum, +- info->node_within_scc ? ", scc" : "", ++ info->node_within_scc ++ ? (info->node_is_self_scc ? ", self_scc" : ", scc") : "", + info->node_calling_single_call ? ", single_call" : "", + evaluation, PARAM_VALUE (PARAM_IPA_CP_EVAL_THRESHOLD)); + +@@ -2674,25 +3099,25 @@ good_cloning_opportunity_p (struct cgrap + /* Return all context independent values from aggregate lattices in PLATS in a + vector. Return NULL if there are none. */ + +-static vec * +-context_independent_aggregate_values (struct ipcp_param_lattices *plats) ++static vec ++context_independent_aggregate_values (class ipcp_param_lattices *plats) + { +- vec *res = NULL; ++ vec res = vNULL; + + if (plats->aggs_bottom + || plats->aggs_contain_variable + || plats->aggs_count == 0) +- return NULL; ++ return vNULL; + + for (struct ipcp_agg_lattice *aglat = plats->aggs; + aglat; + aglat = aglat->next) + if (aglat->is_single_const ()) + { +- struct ipa_agg_jf_item item; ++ struct ipa_agg_value item; + item.offset = aglat->offset; + item.value = aglat->values->value; +- vec_safe_push (res, item); ++ res.safe_push (item); + } + return res; + } +@@ -2704,11 +3129,11 @@ context_independent_aggregate_values (st + it. 
*/ + + static bool +-gather_context_independent_values (struct ipa_node_params *info, ++gather_context_independent_values (class ipa_node_params *info, + vec *known_csts, + vec + *known_contexts, +- vec *known_aggs, ++ vec *known_aggs, + int *removable_params_cost) + { + int i, count = ipa_get_param_count (info); +@@ -2729,7 +3154,7 @@ gather_context_independent_values (struc + + for (i = 0; i < count; i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + ipcp_lattice *lat = &plats->itself; + + if (lat->is_single_const ()) +@@ -2758,40 +3183,20 @@ gather_context_independent_values (struc + + if (known_aggs) + { +- vec *agg_items; +- struct ipa_agg_jump_function *ajf; ++ vec agg_items; ++ struct ipa_agg_value_set *agg; + + agg_items = context_independent_aggregate_values (plats); +- ajf = &(*known_aggs)[i]; +- ajf->items = agg_items; +- ajf->by_ref = plats->aggs_by_ref; +- ret |= agg_items != NULL; ++ agg = &(*known_aggs)[i]; ++ agg->items = agg_items; ++ agg->by_ref = plats->aggs_by_ref; ++ ret |= !agg_items.is_empty (); + } + } + + return ret; + } + +-/* The current interface in ipa-inline-analysis requires a pointer vector. +- Create it. +- +- FIXME: That interface should be re-worked, this is slightly silly. Still, +- I'd like to discuss how to change it first and this demonstrates the +- issue. */ +- +-static vec +-agg_jmp_p_vec_for_t_vec (vec known_aggs) +-{ +- vec ret; +- struct ipa_agg_jump_function *ajf; +- int i; +- +- ret.create (known_aggs.length ()); +- FOR_EACH_VEC_ELT (known_aggs, i, ajf) +- ret.quick_push (ajf); +- return ret; +-} +- + /* Perform time and size measurement of NODE with the context given in + KNOWN_CSTS, KNOWN_CONTEXTS and KNOWN_AGGS, calculate the benefit and cost + given BASE_TIME of the node without specialization, REMOVABLE_PARAMS_COST of +@@ -2801,7 +3206,7 @@ agg_jmp_p_vec_for_t_vec (vec known_csts, + vec known_contexts, +- vec known_aggs_ptrs, ++ vec known_aggs, + int removable_params_cost, + int est_move_cost, ipcp_value_base *val) + { +@@ -2810,7 +3215,7 @@ perform_estimation_of_a_value (cgraph_no + ipa_hints hints; + + estimate_ipcp_clone_size_and_time (node, known_csts, known_contexts, +- known_aggs_ptrs, &size, &time, ++ known_aggs, &size, &time, + &base_time, &hints); + base_time -= time; + if (base_time > 65535) +@@ -2824,7 +3229,7 @@ perform_estimation_of_a_value (cgraph_no + else + time_benefit = base_time.to_int () + + devirtualization_time_bonus (node, known_csts, known_contexts, +- known_aggs_ptrs) ++ known_aggs) + + hint_time_bonus (hints) + + removable_params_cost + est_move_cost; + +@@ -2846,12 +3251,11 @@ perform_estimation_of_a_value (cgraph_no + static void + estimate_local_effects (struct cgraph_node *node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + int i, count = ipa_get_param_count (info); + vec known_csts; + vec known_contexts; +- vec known_aggs; +- vec known_aggs_ptrs; ++ vec known_aggs; + bool always_const; + int removable_params_cost; + +@@ -2864,9 +3268,8 @@ estimate_local_effects (struct cgraph_no + always_const = gather_context_independent_values (info, &known_csts, + &known_contexts, &known_aggs, + &removable_params_cost); +- known_aggs_ptrs = agg_jmp_p_vec_for_t_vec (known_aggs); + int devirt_bonus = devirtualization_time_bonus (node, known_csts, +- known_contexts, known_aggs_ptrs); ++ known_contexts, known_aggs); + if (always_const || devirt_bonus + || 
(removable_params_cost && node->local.can_change_signature)) + { +@@ -2879,7 +3282,7 @@ estimate_local_effects (struct cgraph_no + node->call_for_symbol_thunks_and_aliases (gather_caller_stats, &stats, + false); + estimate_ipcp_clone_size_and_time (node, known_csts, known_contexts, +- known_aggs_ptrs, &size, &time, ++ known_aggs, &size, &time, + &base_time, &hints); + time -= devirt_bonus; + time -= hint_time_bonus (hints); +@@ -2926,7 +3329,7 @@ estimate_local_effects (struct cgraph_no + + for (i = 0; i < count; i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + ipcp_lattice *lat = &plats->itself; + ipcp_value *val; + +@@ -2942,7 +3345,7 @@ estimate_local_effects (struct cgraph_no + + int emc = estimate_move_cost (TREE_TYPE (val->value), true); + perform_estimation_of_a_value (node, known_csts, known_contexts, +- known_aggs_ptrs, ++ known_aggs, + removable_params_cost, emc, val); + + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -2960,7 +3363,7 @@ estimate_local_effects (struct cgraph_no + + for (i = 0; i < count; i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + + if (!plats->virt_call) + continue; +@@ -2977,7 +3380,7 @@ estimate_local_effects (struct cgraph_no + { + known_contexts[i] = val->value; + perform_estimation_of_a_value (node, known_csts, known_contexts, +- known_aggs_ptrs, ++ known_aggs, + removable_params_cost, 0, val); + + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -2995,14 +3398,14 @@ estimate_local_effects (struct cgraph_no + + for (i = 0; i < count; i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); +- struct ipa_agg_jump_function *ajf; ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ struct ipa_agg_value_set *agg; + struct ipcp_agg_lattice *aglat; + + if (plats->aggs_bottom || !plats->aggs) + continue; + +- ajf = &known_aggs[i]; ++ agg = &known_aggs[i]; + for (aglat = plats->aggs; aglat; aglat = aglat->next) + { + ipcp_value *val; +@@ -3014,14 +3417,14 @@ estimate_local_effects (struct cgraph_no + + for (val = aglat->values; val; val = val->next) + { +- struct ipa_agg_jf_item item; ++ struct ipa_agg_value item; + + item.offset = aglat->offset; + item.value = val->value; +- vec_safe_push (ajf->items, item); ++ agg->items.safe_push (item); + + perform_estimation_of_a_value (node, known_csts, known_contexts, +- known_aggs_ptrs, ++ known_aggs, + removable_params_cost, 0, val); + + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -3037,18 +3440,14 @@ estimate_local_effects (struct cgraph_no + val->local_time_benefit, val->local_size_cost); + } + +- ajf->items->pop (); ++ agg->items.pop (); + } + } + } + +- for (i = 0; i < count; i++) +- vec_free (known_aggs[i].items); +- + known_csts.release (); + known_contexts.release (); +- known_aggs.release (); +- known_aggs_ptrs.release (); ++ ipa_release_agg_values (known_aggs); + } + + +@@ -3112,12 +3511,12 @@ value_topo_info::add_val (ipcp_ + static void + add_all_node_vals_to_toposort (cgraph_node *node, ipa_topo_info *topo) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + int i, count = ipa_get_param_count (info); + + for (i = 0; i < count; i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + 
ipcp_lattice *lat = &plats->itself; + struct ipcp_agg_lattice *aglat; + +@@ -3152,7 +3551,7 @@ add_all_node_vals_to_toposort (cgraph_no + connected components. */ + + static void +-propagate_constants_topo (struct ipa_topo_info *topo) ++propagate_constants_topo (class ipa_topo_info *topo) + { + int i; + +@@ -3166,20 +3565,46 @@ propagate_constants_topo (struct ipa_top + until all lattices stabilize. */ + FOR_EACH_VEC_ELT (cycle_nodes, j, v) + if (v->has_gimple_body_p ()) +- push_node_to_stack (topo, v); ++ { ++ if (opt_for_fn (v->decl, flag_ipa_cp)) ++ push_node_to_stack (topo, v); ++ /* When V is not optimized, we can not push it to stac, but ++ still we need to set all its callees lattices to bottom. */ ++ else ++ { ++ for (cgraph_edge *cs = v->callees; cs; cs = cs->next_callee) ++ propagate_constants_across_call (cs); ++ } ++ } + + v = pop_node_from_stack (topo); + while (v) + { + struct cgraph_edge *cs; ++ class ipa_node_params *info = NULL; ++ bool self_scc = true; + + for (cs = v->callees; cs; cs = cs->next_callee) + if (ipa_edge_within_scc (cs)) + { +- IPA_NODE_REF (v)->node_within_scc = true; ++ cgraph_node *callee = cs->callee->function_symbol (); ++ ++ if (v != callee) ++ self_scc = false; ++ ++ if (!info) ++ { ++ info = IPA_NODE_REF (v); ++ info->node_within_scc = true; ++ } ++ + if (propagate_constants_across_call (cs)) +- push_node_to_stack (topo, cs->callee->function_symbol ()); ++ push_node_to_stack (topo, callee); + } ++ ++ if (info) ++ info->node_is_self_scc = self_scc; ++ + v = pop_node_from_stack (topo); + } + +@@ -3187,7 +3612,8 @@ propagate_constants_topo (struct ipa_top + the local effects of the discovered constants and all valid values to + their topological sort. */ + FOR_EACH_VEC_ELT (cycle_nodes, j, v) +- if (v->has_gimple_body_p ()) ++ if (v->has_gimple_body_p () ++ && opt_for_fn (v->decl, flag_ipa_cp)) + { + struct cgraph_edge *cs; + +@@ -3255,7 +3681,7 @@ value_topo_info::propagate_effe + summaries interprocedurally. */ + + static void +-ipcp_propagate_stage (struct ipa_topo_info *topo) ++ipcp_propagate_stage (class ipa_topo_info *topo) + { + struct cgraph_node *node; + +@@ -3266,16 +3692,15 @@ ipcp_propagate_stage (struct ipa_topo_in + + FOR_EACH_DEFINED_FUNCTION (node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); +- +- determine_versionability (node, info); +- if (node->has_gimple_body_p ()) ++ if (node->has_gimple_body_p () && opt_for_fn (node->decl, flag_ipa_cp)) + { +- info->lattices = XCNEWVEC (struct ipcp_param_lattices, ++ class ipa_node_params *info = IPA_NODE_REF (node); ++ determine_versionability (node, info); ++ info->lattices = XCNEWVEC (class ipcp_param_lattices, + ipa_get_param_count (info)); + initialize_node_lattices (node); + } +- ipa_fn_summary *s = ipa_fn_summaries->get (node); ++ ipa_size_summary *s = ipa_size_summaries->get (node); + if (node->definition && !node->alias && s != NULL) + overall_size += s->self_size; + max_count = max_count.max (node->count.ipa ()); +@@ -3335,7 +3760,7 @@ ipcp_discover_new_direct_edges (struct c + + if (cs && !agg_contents && !polymorphic) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + int c = ipa_get_controlled_uses (info, param_index); + if (c != IPA_UNDESCRIBED_USE) + { +@@ -3415,26 +3840,6 @@ edge_clone_summary_t::duplicate (cgraph_ + src_data->next_clone = dst_edge; + } + +-/* See if NODE is a clone with a known aggregate value at a given OFFSET of a +- parameter with the given INDEX. 
*/ +- +-static tree +-get_clone_agg_value (struct cgraph_node *node, HOST_WIDE_INT offset, +- int index) +-{ +- struct ipa_agg_replacement_value *aggval; +- +- aggval = ipa_get_agg_replacements_for_node (node); +- while (aggval) +- { +- if (aggval->offset == offset +- && aggval->index == index) +- return aggval->value; +- aggval = aggval->next; +- } +- return NULL_TREE; +-} +- + /* Return true is NODE is DEST or its clone for all contexts. */ + + static bool +@@ -3443,7 +3848,7 @@ same_node_or_its_all_contexts_clone_p (c + if (node == dest) + return true; + +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + return info->is_all_contexts_clone && info->ipcp_orig_node == dest; + } + +@@ -3454,12 +3859,12 @@ static bool + cgraph_edge_brings_value_p (cgraph_edge *cs, ipcp_value_source *src, + cgraph_node *dest, ipcp_value *dest_val) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + enum availability availability; + cgraph_node *real_dest = cs->callee->function_symbol (&availability); + +- if (!same_node_or_its_all_contexts_clone_p (real_dest, dest) +- || availability <= AVAIL_INTERPOSABLE ++ if (availability <= AVAIL_INTERPOSABLE ++ || !same_node_or_its_all_contexts_clone_p (real_dest, dest) + || caller_info->node_dead) + return false; + +@@ -3485,7 +3890,7 @@ cgraph_edge_brings_value_p (cgraph_edge + return true; + + struct ipcp_agg_lattice *aglat; +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (caller_info, ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (caller_info, + src->index); + if (src->offset == -1) + return (plats->itself.is_single_const () +@@ -3514,10 +3919,12 @@ cgraph_edge_brings_value_p (cgraph_edge + cgraph_node *dest, + ipcp_value *) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); +- cgraph_node *real_dest = cs->callee->function_symbol (); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ enum availability avail; ++ cgraph_node *real_dest = cs->callee->function_symbol (&avail); + +- if (!same_node_or_its_all_contexts_clone_p (real_dest, dest) ++ if (avail <= AVAIL_INTERPOSABLE ++ || !same_node_or_its_all_contexts_clone_p (real_dest, dest) + || caller_info->node_dead) + return false; + if (!src->val) +@@ -3528,7 +3935,7 @@ cgraph_edge_brings_value_p (cgraph_edge + && values_equal_for_ipcp_p (src->val->value, + caller_info->known_contexts[src->index]); + +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (caller_info, ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (caller_info, + src->index); + return plats->ctxlat.is_single_const () + && values_equal_for_ipcp_p (src->val->value, +@@ -3575,6 +3982,9 @@ get_info_about_necessary_edges (ipcp_val + hot |= cs->maybe_hot_p (); + if (cs->caller != dest) + non_self_recursive = true; ++ else if (src->val) ++ gcc_assert (values_equal_for_ipcp_p (src->val->value, ++ val->value)); + } + cs = get_next_cgraph_edge_clone (cs); + } +@@ -3588,6 +3998,19 @@ get_info_about_necessary_edges (ipcp_val + *freq_sum = freq; + *count_sum = cnt; + *caller_count = count; ++ ++ if (!hot && IPA_NODE_REF (dest)->node_within_scc) ++ { ++ struct cgraph_edge *cs; ++ ++ /* Cold non-SCC source edge could trigger hot recursive execution of ++ function. Consider the case as hot and rely on following cost model ++ computation to further select right one. 
*/ ++ for (cs = dest->callers; cs; cs = cs->next_caller) ++ if (cs->caller == dest && cs->maybe_hot_p ()) ++ return true; ++ } ++ + return hot; + } + +@@ -3621,7 +4044,7 @@ gather_edges_for_value (ipcp_value callers) + { +- struct ipa_node_params *new_info, *info = IPA_NODE_REF (node); ++ class ipa_node_params *new_info, *info = IPA_NODE_REF (node); + vec *replace_trees = NULL; + struct ipa_agg_replacement_value *av; + struct cgraph_node *new_node; +@@ -3891,6 +4314,7 @@ create_specialized_node (struct cgraph_n + update_profiling_info (node, new_node); + new_info = IPA_NODE_REF (new_node); + new_info->ipcp_orig_node = node; ++ new_node->ipcp_clone = true; + new_info->known_csts = known_csts; + new_info->known_contexts = known_contexts; + +@@ -3924,7 +4348,7 @@ find_more_scalar_values_for_callers_subs + vec known_csts, + vec callers) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + int i, count = ipa_get_param_count (info); + + for (i = 0; i < count; i++) +@@ -3946,7 +4370,8 @@ find_more_scalar_values_for_callers_subs + if (IPA_NODE_REF (cs->caller)->node_dead) + continue; + +- if (i >= ipa_get_cs_argument_count (IPA_EDGE_REF (cs)) ++ if (!IPA_EDGE_REF (cs) ++ || i >= ipa_get_cs_argument_count (IPA_EDGE_REF (cs)) + || (i == 0 + && call_passes_through_thunk_p (cs))) + { +@@ -4015,7 +4440,8 @@ find_more_contexts_for_caller_subset (cg + + FOR_EACH_VEC_ELT (callers, j, cs) + { +- if (i >= ipa_get_cs_argument_count (IPA_EDGE_REF (cs))) ++ if (!IPA_EDGE_REF (cs) ++ || i >= ipa_get_cs_argument_count (IPA_EDGE_REF (cs))) + return; + ipa_jump_func *jfunc = ipa_get_ith_jump_func (IPA_EDGE_REF (cs), + i); +@@ -4056,10 +4482,10 @@ find_more_contexts_for_caller_subset (cg + /* Go through PLATS and create a vector of values consisting of values and + offsets (minus OFFSET) of lattices that contain only a single value. */ + +-static vec +-copy_plats_to_inter (struct ipcp_param_lattices *plats, HOST_WIDE_INT offset) ++static vec ++copy_plats_to_inter (class ipcp_param_lattices *plats, HOST_WIDE_INT offset) + { +- vec res = vNULL; ++ vec res = vNULL; + + if (!plats->aggs || plats->aggs_contain_variable || plats->aggs_bottom) + return vNULL; +@@ -4067,7 +4493,7 @@ copy_plats_to_inter (struct ipcp_param_l + for (struct ipcp_agg_lattice *aglat = plats->aggs; aglat; aglat = aglat->next) + if (aglat->is_single_const ()) + { +- struct ipa_agg_jf_item ti; ++ struct ipa_agg_value ti; + ti.offset = aglat->offset - offset; + ti.value = aglat->values->value; + res.safe_push (ti); +@@ -4079,12 +4505,12 @@ copy_plats_to_inter (struct ipcp_param_l + subtracting OFFSET). */ + + static void +-intersect_with_plats (struct ipcp_param_lattices *plats, +- vec *inter, ++intersect_with_plats (class ipcp_param_lattices *plats, ++ vec *inter, + HOST_WIDE_INT offset) + { + struct ipcp_agg_lattice *aglat; +- struct ipa_agg_jf_item *item; ++ struct ipa_agg_value *item; + int k; + + if (!plats->aggs || plats->aggs_contain_variable || plats->aggs_bottom) +@@ -4122,18 +4548,18 @@ intersect_with_plats (struct ipcp_param_ + /* Copy aggregate replacement values of NODE (which is an IPA-CP clone) to the + vector result while subtracting OFFSET from the individual value offsets. 
*/ + +-static vec ++static vec + agg_replacements_to_vector (struct cgraph_node *node, int index, + HOST_WIDE_INT offset) + { + struct ipa_agg_replacement_value *av; +- vec res = vNULL; ++ vec res = vNULL; + + for (av = ipa_get_agg_replacements_for_node (node); av; av = av->next) + if (av->index == index + && (av->offset - offset) >= 0) + { +- struct ipa_agg_jf_item item; ++ struct ipa_agg_value item; + gcc_checking_assert (av->value); + item.offset = av->offset - offset; + item.value = av->value; +@@ -4149,11 +4575,11 @@ agg_replacements_to_vector (struct cgrap + + static void + intersect_with_agg_replacements (struct cgraph_node *node, int index, +- vec *inter, ++ vec *inter, + HOST_WIDE_INT offset) + { + struct ipa_agg_replacement_value *srcvals; +- struct ipa_agg_jf_item *item; ++ struct ipa_agg_value *item; + int i; + + srcvals = ipa_get_agg_replacements_for_node (node); +@@ -4190,22 +4616,22 @@ intersect_with_agg_replacements (struct + copy all incoming values to it. If we determine we ended up with no values + whatsoever, return a released vector. */ + +-static vec ++static vec + intersect_aggregates_with_edge (struct cgraph_edge *cs, int index, +- vec inter) ++ vec inter) + { + struct ipa_jump_func *jfunc; + jfunc = ipa_get_ith_jump_func (IPA_EDGE_REF (cs), index); + if (jfunc->type == IPA_JF_PASS_THROUGH + && ipa_get_jf_pass_through_operation (jfunc) == NOP_EXPR) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + int src_idx = ipa_get_jf_pass_through_formal_id (jfunc); + + if (caller_info->ipcp_orig_node) + { + struct cgraph_node *orig_node = caller_info->ipcp_orig_node; +- struct ipcp_param_lattices *orig_plats; ++ class ipcp_param_lattices *orig_plats; + orig_plats = ipa_get_parm_lattices (IPA_NODE_REF (orig_node), + src_idx); + if (agg_pass_through_permissible_p (orig_plats, jfunc)) +@@ -4224,7 +4650,7 @@ intersect_aggregates_with_edge (struct c + } + else + { +- struct ipcp_param_lattices *src_plats; ++ class ipcp_param_lattices *src_plats; + src_plats = ipa_get_parm_lattices (caller_info, src_idx); + if (agg_pass_through_permissible_p (src_plats, jfunc)) + { +@@ -4246,9 +4672,9 @@ intersect_aggregates_with_edge (struct c + else if (jfunc->type == IPA_JF_ANCESTOR + && ipa_get_jf_ancestor_agg_preserved (jfunc)) + { +- struct ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); + int src_idx = ipa_get_jf_ancestor_formal_id (jfunc); +- struct ipcp_param_lattices *src_plats; ++ class ipcp_param_lattices *src_plats; + HOST_WIDE_INT delta = ipa_get_jf_ancestor_offset (jfunc); + + if (caller_info->ipcp_orig_node) +@@ -4273,12 +4699,26 @@ intersect_aggregates_with_edge (struct c + } + else if (jfunc->agg.items) + { +- struct ipa_agg_jf_item *item; ++ class ipa_node_params *caller_info = IPA_NODE_REF (cs->caller); ++ struct ipa_agg_value *item; + int k; + + if (!inter.exists ()) + for (unsigned i = 0; i < jfunc->agg.items->length (); i++) +- inter.safe_push ((*jfunc->agg.items)[i]); ++ { ++ struct ipa_agg_jf_item *agg_item = &(*jfunc->agg.items)[i]; ++ tree value = ipa_agg_value_from_node (caller_info, cs->caller, ++ agg_item); ++ if (value) ++ { ++ struct ipa_agg_value agg_value; ++ ++ agg_value.offset = agg_item->offset; ++ agg_value.value = value; ++ ++ inter.safe_push (agg_value); ++ } ++ } + else + FOR_EACH_VEC_ELT (inter, k, item) + { +@@ -4296,9 +4736,10 @@ intersect_aggregates_with_edge (struct c + break; + if 
(ti->offset == item->offset) + { +- gcc_checking_assert (ti->value); +- if (values_equal_for_ipcp_p (item->value, +- ti->value)) ++ tree value = ipa_agg_value_from_node (caller_info, ++ cs->caller, ti); ++ if (value ++ && values_equal_for_ipcp_p (item->value, value)) + found = true; + break; + } +@@ -4311,7 +4752,7 @@ intersect_aggregates_with_edge (struct c + else + { + inter.release (); +- return vec(); ++ return vNULL; + } + return inter; + } +@@ -4323,7 +4764,7 @@ static struct ipa_agg_replacement_value + find_aggregate_values_for_callers_subset (struct cgraph_node *node, + vec callers) + { +- struct ipa_node_params *dest_info = IPA_NODE_REF (node); ++ class ipa_node_params *dest_info = IPA_NODE_REF (node); + struct ipa_agg_replacement_value *res; + struct ipa_agg_replacement_value **tail = &res; + struct cgraph_edge *cs; +@@ -4331,6 +4772,11 @@ find_aggregate_values_for_callers_subset + + FOR_EACH_VEC_ELT (callers, j, cs) + { ++ if (!IPA_EDGE_REF (cs)) ++ { ++ count = 0; ++ break; ++ } + int c = ipa_get_cs_argument_count (IPA_EDGE_REF (cs)); + if (c < count) + count = c; +@@ -4339,9 +4785,9 @@ find_aggregate_values_for_callers_subset + for (i = 0; i < count; i++) + { + struct cgraph_edge *cs; +- vec inter = vNULL; +- struct ipa_agg_jf_item *item; +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (dest_info, i); ++ vec inter = vNULL; ++ struct ipa_agg_value *item; ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (dest_info, i); + int j; + + /* Among other things, the following check should deal with all by_ref +@@ -4394,10 +4840,10 @@ static bool + cgraph_edge_brings_all_scalars_for_node (struct cgraph_edge *cs, + struct cgraph_node *node) + { +- struct ipa_node_params *dest_info = IPA_NODE_REF (node); ++ class ipa_node_params *dest_info = IPA_NODE_REF (node); + int count = ipa_get_param_count (dest_info); +- struct ipa_node_params *caller_info; +- struct ipa_edge_args *args; ++ class ipa_node_params *caller_info; ++ class ipa_edge_args *args; + int i; + + caller_info = IPA_NODE_REF (cs->caller); +@@ -4428,8 +4874,7 @@ static bool + cgraph_edge_brings_all_agg_vals_for_node (struct cgraph_edge *cs, + struct cgraph_node *node) + { +- struct ipa_node_params *orig_caller_info = IPA_NODE_REF (cs->caller); +- struct ipa_node_params *orig_node_info; ++ class ipa_node_params *orig_node_info; + struct ipa_agg_replacement_value *aggval; + int i, ec, count; + +@@ -4445,12 +4890,10 @@ cgraph_edge_brings_all_agg_vals_for_node + return false; + + orig_node_info = IPA_NODE_REF (IPA_NODE_REF (node)->ipcp_orig_node); +- if (orig_caller_info->ipcp_orig_node) +- orig_caller_info = IPA_NODE_REF (orig_caller_info->ipcp_orig_node); + + for (i = 0; i < count; i++) + { +- struct ipcp_param_lattices *plats; ++ class ipcp_param_lattices *plats; + bool interesting = false; + for (struct ipa_agg_replacement_value *av = aggval; av; av = av->next) + if (aggval->index == i) +@@ -4465,15 +4908,14 @@ cgraph_edge_brings_all_agg_vals_for_node + if (plats->aggs_bottom) + return false; + +- vec values +- = intersect_aggregates_with_edge (cs, i, vNULL); ++ vec values = intersect_aggregates_with_edge (cs, i, vNULL); + if (!values.exists ()) + return false; + + for (struct ipa_agg_replacement_value *av = aggval; av; av = av->next) + if (aggval->index == i) + { +- struct ipa_agg_jf_item *item; ++ struct ipa_agg_value *item; + int j; + bool found = false; + FOR_EACH_VEC_ELT (values, j, item) +@@ -4708,11 +5150,10 @@ decide_about_value (struct cgraph_node * + static bool + decide_whether_version_node 
(struct cgraph_node *node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + int i, count = ipa_get_param_count (info); + vec known_csts; + vec known_contexts; +- vec known_aggs = vNULL; + bool ret = false; + + if (count == 0) +@@ -4723,12 +5164,11 @@ decide_whether_version_node (struct cgra + node->dump_name ()); + + gather_context_independent_values (info, &known_csts, &known_contexts, +- info->do_clone_for_all_contexts ? &known_aggs +- : NULL, NULL); ++ NULL, NULL); + + for (i = 0; i < count;i++) + { +- struct ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); ++ class ipcp_param_lattices *plats = ipa_get_parm_lattices (info, i); + ipcp_lattice *lat = &plats->itself; + ipcp_lattice *ctxlat = &plats->ctxlat; + +@@ -4793,9 +5233,6 @@ decide_whether_version_node (struct cgra + info = IPA_NODE_REF (node); + info->do_clone_for_all_contexts = false; + IPA_NODE_REF (clone)->is_all_contexts_clone = true; +- for (i = 0; i < count; i++) +- vec_free (known_aggs[i].items); +- known_aggs.release (); + ret = true; + } + else +@@ -4818,7 +5255,7 @@ spread_undeadness (struct cgraph_node *n + if (ipa_edge_within_scc (cs)) + { + struct cgraph_node *callee; +- struct ipa_node_params *info; ++ class ipa_node_params *info; + + callee = cs->callee->function_symbol (NULL); + info = IPA_NODE_REF (callee); +@@ -4881,7 +5318,7 @@ identify_dead_nodes (struct cgraph_node + TOPO and make specialized clones if deemed beneficial. */ + + static void +-ipcp_decision_stage (struct ipa_topo_info *topo) ++ipcp_decision_stage (class ipa_topo_info *topo) + { + int i; + +@@ -4923,7 +5360,7 @@ ipcp_store_bits_results (void) + bool dumped_sth = false; + bool found_useful_result = false; + +- if (!opt_for_fn (node->decl, flag_ipa_bit_cp)) ++ if (!opt_for_fn (node->decl, flag_ipa_bit_cp) || !info) + { + if (dump_file) + fprintf (dump_file, "Not considering %s for ipa bitwise propagation " +@@ -5055,7 +5492,7 @@ ipcp_store_vr_results (void) + static unsigned int + ipcp_driver (void) + { +- struct ipa_topo_info topo; ++ class ipa_topo_info topo; + + if (edge_clone_summaries == NULL) + edge_clone_summaries = new edge_clone_summary_t (symtab); +diff -Nurp a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +--- a/gcc/ipa-devirt.c 2020-04-30 15:14:04.624000000 +0800 ++++ b/gcc/ipa-devirt.c 2020-04-30 15:14:56.624000000 +0800 +@@ -172,6 +172,11 @@ struct default_hash_traits + } + }; + ++/* HACK alert: this is used to communicate with ipa-inline-transform that ++ thunk is being expanded and there is no need to clear the polymorphic ++ call target cache. */ ++bool thunk_expansion; ++ + static bool odr_types_equivalent_p (tree, tree, bool, bool *, + hash_set *, + location_t, location_t); +@@ -2557,7 +2562,7 @@ maybe_record_node (vec & + || target_node->definition) + && target_node->real_symbol_p ()) + { +- gcc_assert (!target_node->global.inlined_to); ++ gcc_assert (!target_node->inlined_to); + gcc_assert (target_node->real_symbol_p ()); + /* When sanitizing, do not assume that __cxa_pure_virtual is not called + by valid program. 
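A rough sketch of how the new thunk_expansion flag is meant to be used (inferred from the HACK comment above; the corresponding ipa-inline-transform.c hunk is not part of this excerpt):

    thunk_expansion = true;
    /* ... expand the thunk; this may remove or create cgraph nodes ... */
    thunk_expansion = false;

With the flag set, devirt_node_removal_hook in the next hunk skips freeing the polymorphic call target cache for node removals triggered by that expansion.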
*/ +@@ -2892,6 +2897,7 @@ static void + devirt_node_removal_hook (struct cgraph_node *n, void *d ATTRIBUTE_UNUSED) + { + if (cached_polymorphic_call_targets ++ && !thunk_expansion + && cached_polymorphic_call_targets->contains (n)) + free_polymorphic_call_targets_hash (); + } +diff -Nurp a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c +--- a/gcc/ipa-fnsummary.c 2020-04-30 15:14:04.568000000 +0800 ++++ b/gcc/ipa-fnsummary.c 2020-04-30 15:14:56.664000000 +0800 +@@ -86,6 +86,7 @@ along with GCC; see the file COPYING3. + + /* Summaries. */ + fast_function_summary *ipa_fn_summaries; ++fast_function_summary *ipa_size_summaries; + fast_call_summary *ipa_call_summaries; + + /* Edge predicates goes here. */ +@@ -207,7 +208,7 @@ ipa_fn_summary::account_size_time (int s + } + if (!found) + { +- struct size_time_entry new_entry; ++ class size_time_entry new_entry; + new_entry.size = size; + new_entry.time = time; + new_entry.exec_predicate = exec_pred; +@@ -236,7 +237,7 @@ redirect_to_unreachable (struct cgraph_e + e->make_direct (target); + else + e->redirect_callee (target); +- struct ipa_call_summary *es = ipa_call_summaries->get (e); ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + e->inline_failed = CIF_UNREACHABLE; + e->count = profile_count::zero (); + es->call_stmt_size = 0; +@@ -261,7 +262,7 @@ edge_set_predicate (struct cgraph_edge * + && (!e->speculative || e->callee)) + e = redirect_to_unreachable (e); + +- struct ipa_call_summary *es = ipa_call_summaries->get (e); ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + if (predicate && *predicate != true) + { + if (!es->predicate) +@@ -306,9 +307,9 @@ set_hint_predicate (predicate **p, predi + the fact that parameter is indeed a constant. + + KNOWN_VALS is partial mapping of parameters of NODE to constant values. +- KNOWN_AGGS is a vector of aggreggate jump functions for each parameter. +- Return clause of possible truths. When INLINE_P is true, assume that we are +- inlining. ++ KNOWN_AGGS is a vector of aggreggate known offset/value set for each ++ parameter. Return clause of possible truths. When INLINE_P is true, assume ++ that we are inlining. + + ERROR_MARK means compile time invariant. */ + +@@ -316,14 +317,13 @@ static void + evaluate_conditions_for_known_args (struct cgraph_node *node, + bool inline_p, + vec known_vals, +- vec +- known_aggs, ++ vec known_aggs, + clause_t *ret_clause, + clause_t *ret_nonspec_clause) + { + clause_t clause = inline_p ? 0 : 1 << predicate::not_inlined_condition; + clause_t nonspec_clause = 1 << predicate::not_inlined_condition; +- struct ipa_fn_summary *info = ipa_fn_summaries->get (node); ++ class ipa_fn_summary *info = ipa_fn_summaries->get (node); + int i; + struct condition *c; + +@@ -331,6 +331,8 @@ evaluate_conditions_for_known_args (stru + { + tree val; + tree res; ++ int j; ++ struct expr_eval_op *op; + + /* We allow call stmt to have fewer arguments than the callee function + (especially for K&R style programs). 
So bound check here (we assume +@@ -347,7 +349,7 @@ evaluate_conditions_for_known_args (stru + + if (c->agg_contents) + { +- struct ipa_agg_jump_function *agg; ++ struct ipa_agg_value_set *agg; + + if (c->code == predicate::changed + && !c->by_ref +@@ -356,7 +358,7 @@ evaluate_conditions_for_known_args (stru + + if (known_aggs.exists ()) + { +- agg = known_aggs[c->operand_num]; ++ agg = &known_aggs[c->operand_num]; + val = ipa_find_agg_cst_for_param (agg, known_vals[c->operand_num], + c->offset, c->by_ref); + } +@@ -382,7 +384,7 @@ evaluate_conditions_for_known_args (stru + continue; + } + +- if (tree_to_shwi (TYPE_SIZE (TREE_TYPE (val))) != c->size) ++ if (TYPE_SIZE (c->type) != TYPE_SIZE (TREE_TYPE (val))) + { + clause |= 1 << (i + predicate::first_dynamic_condition); + nonspec_clause |= 1 << (i + predicate::first_dynamic_condition); +@@ -394,7 +396,30 @@ evaluate_conditions_for_known_args (stru + continue; + } + +- val = fold_unary (VIEW_CONVERT_EXPR, TREE_TYPE (c->val), val); ++ val = fold_unary (VIEW_CONVERT_EXPR, c->type, val); ++ for (j = 0; vec_safe_iterate (c->param_ops, j, &op); j++) ++ { ++ if (!val) ++ break; ++ if (!op->val[0]) ++ val = fold_unary (op->code, op->type, val); ++ else if (!op->val[1]) ++ val = fold_binary (op->code, op->type, ++ op->index ? op->val[0] : val, ++ op->index ? val : op->val[0]); ++ else if (op->index == 0) ++ val = fold_ternary (op->code, op->type, ++ val, op->val[0], op->val[1]); ++ else if (op->index == 1) ++ val = fold_ternary (op->code, op->type, ++ op->val[0], val, op->val[1]); ++ else if (op->index == 2) ++ val = fold_ternary (op->code, op->type, ++ op->val[0], op->val[1], val); ++ else ++ val = NULL_TREE; ++ } ++ + res = val + ? fold_binary_to_constant (c->code, boolean_type_node, val, c->val) + : NULL; +@@ -420,12 +445,13 @@ evaluate_properties_for_edge (struct cgr + vec *known_vals_ptr, + vec + *known_contexts_ptr, +- vec *known_aggs_ptr) ++ vec *known_aggs_ptr) + { + struct cgraph_node *callee = e->callee->ultimate_alias_target (); +- struct ipa_fn_summary *info = ipa_fn_summaries->get (callee); ++ class ipa_fn_summary *info = ipa_fn_summaries->get (callee); + vec known_vals = vNULL; +- vec known_aggs = vNULL; ++ vec known_aggs = vNULL; ++ class ipa_edge_args *args; + + if (clause_ptr) + *clause_ptr = inline_p ? 
0 : 1 << predicate::not_inlined_condition; +@@ -436,18 +462,20 @@ evaluate_properties_for_edge (struct cgr + + if (ipa_node_params_sum + && !e->call_stmt_cannot_inline_p +- && ((clause_ptr && info->conds) || known_vals_ptr || known_contexts_ptr)) ++ && ((clause_ptr && info->conds) || known_vals_ptr || known_contexts_ptr) ++ && (args = IPA_EDGE_REF (e)) != NULL) + { +- struct ipa_node_params *caller_parms_info, *callee_pi; +- struct ipa_edge_args *args = IPA_EDGE_REF (e); +- struct ipa_call_summary *es = ipa_call_summaries->get (e); ++ struct cgraph_node *caller; ++ class ipa_node_params *caller_parms_info, *callee_pi; ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + int i, count = ipa_get_cs_argument_count (args); + +- if (e->caller->global.inlined_to) +- caller_parms_info = IPA_NODE_REF (e->caller->global.inlined_to); ++ if (e->caller->inlined_to) ++ caller = e->caller->inlined_to; + else +- caller_parms_info = IPA_NODE_REF (e->caller); +- callee_pi = IPA_NODE_REF (e->callee); ++ caller = e->caller; ++ caller_parms_info = IPA_NODE_REF (caller); ++ callee_pi = IPA_NODE_REF (callee); + + if (count && (info->conds || known_vals_ptr)) + known_vals.safe_grow_cleared (count); +@@ -456,36 +484,38 @@ evaluate_properties_for_edge (struct cgr + if (count && known_contexts_ptr) + known_contexts_ptr->safe_grow_cleared (count); + +- for (i = 0; i < count; i++) +- { +- struct ipa_jump_func *jf = ipa_get_ith_jump_func (args, i); +- tree cst = ipa_value_from_jfunc (caller_parms_info, jf, +- ipa_get_type (callee_pi, i)); +- +- if (!cst && e->call_stmt +- && i < (int)gimple_call_num_args (e->call_stmt)) +- { +- cst = gimple_call_arg (e->call_stmt, i); +- if (!is_gimple_min_invariant (cst)) +- cst = NULL; +- } +- if (cst) +- { +- gcc_checking_assert (TREE_CODE (cst) != TREE_BINFO); +- if (known_vals.exists ()) +- known_vals[i] = cst; +- } +- else if (inline_p && !es->param[i].change_prob) +- known_vals[i] = error_mark_node; +- +- if (known_contexts_ptr) +- (*known_contexts_ptr)[i] +- = ipa_context_from_jfunc (caller_parms_info, e, i, jf); +- /* TODO: When IPA-CP starts propagating and merging aggregate jump +- functions, use its knowledge of the caller too, just like the +- scalar case above. 
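As an illustration of the switch from aggregate jump functions to concrete value sets (example only, not from the patch): for a call such as

    struct S s = { 4, 8 };
    foo (&s);

the jump function merely describes the stores into *&s, while ipa_agg_value_set_from_jfunc, used just below, evaluates them in the caller's context and yields plain (offset, value) pairs, e.g. {0, 4} and {32, 8} for 32-bit ints, which is what the conditions in evaluate_conditions_for_known_args now consume.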
*/ +- known_aggs[i] = &jf->agg; +- } ++ if (callee_pi) ++ for (i = 0; i < count; i++) ++ { ++ struct ipa_jump_func *jf = ipa_get_ith_jump_func (args, i); ++ tree cst = ipa_value_from_jfunc (caller_parms_info, jf, ++ ipa_get_type (callee_pi, i)); ++ ++ if (!cst && e->call_stmt ++ && i < (int)gimple_call_num_args (e->call_stmt)) ++ { ++ cst = gimple_call_arg (e->call_stmt, i); ++ if (!is_gimple_min_invariant (cst)) ++ cst = NULL; ++ } ++ if (cst) ++ { ++ gcc_checking_assert (TREE_CODE (cst) != TREE_BINFO); ++ if (known_vals.exists ()) ++ known_vals[i] = cst; ++ } ++ else if (inline_p && !es->param[i].change_prob) ++ known_vals[i] = error_mark_node; ++ ++ if (known_contexts_ptr) ++ (*known_contexts_ptr)[i] ++ = ipa_context_from_jfunc (caller_parms_info, e, i, jf); ++ ++ known_aggs[i] = ipa_agg_value_set_from_jfunc (caller_parms_info, ++ caller, &jf->agg); ++ } ++ else ++ gcc_assert (callee->thunk.thunk_p); + } + else if (e->call_stmt && !e->call_stmt_cannot_inline_p + && ((clause_ptr && info->conds) || known_vals_ptr)) +@@ -516,7 +546,7 @@ evaluate_properties_for_edge (struct cgr + if (known_aggs_ptr) + *known_aggs_ptr = known_aggs; + else +- known_aggs.release (); ++ ipa_release_agg_values (known_aggs); + } + + +@@ -527,6 +557,8 @@ ipa_fn_summary_alloc (void) + { + gcc_checking_assert (!ipa_fn_summaries); + ipa_fn_summaries = ipa_fn_summary_t::create_ggc (symtab); ++ ipa_size_summaries = new fast_function_summary ++ (symtab); + ipa_call_summaries = new ipa_call_summary_t (symtab); + } + +@@ -597,7 +629,7 @@ ipa_fn_summary_t::duplicate (cgraph_node + { + vec *entry = info->size_time_table; + /* Use SRC parm info since it may not be copied yet. */ +- struct ipa_node_params *parms_info = IPA_NODE_REF (src); ++ class ipa_node_params *parms_info = IPA_NODE_REF (src); + vec known_vals = vNULL; + int count = ipa_get_param_count (parms_info); + int i, j; +@@ -661,7 +693,7 @@ ipa_fn_summary_t::duplicate (cgraph_node + for (edge = dst->callees; edge; edge = next) + { + predicate new_predicate; +- struct ipa_call_summary *es = ipa_call_summaries->get_create (edge); ++ class ipa_call_summary *es = ipa_call_summaries->get_create (edge); + next = edge->next_callee; + + if (!edge->inline_failed) +@@ -680,7 +712,7 @@ ipa_fn_summary_t::duplicate (cgraph_node + for (edge = dst->indirect_calls; edge; edge = next) + { + predicate new_predicate; +- struct ipa_call_summary *es = ipa_call_summaries->get_create (edge); ++ class ipa_call_summary *es = ipa_call_summaries->get_create (edge); + next = edge->next_callee; + + gcc_checking_assert (edge->inline_failed); +@@ -719,7 +751,7 @@ ipa_fn_summary_t::duplicate (cgraph_node + set_hint_predicate (&info->loop_stride, p); + } + } +- if (!dst->global.inlined_to) ++ if (!dst->inlined_to) + ipa_update_overall_fn_summary (dst); + } + +@@ -729,8 +761,8 @@ ipa_fn_summary_t::duplicate (cgraph_node + void + ipa_call_summary_t::duplicate (struct cgraph_edge *src, + struct cgraph_edge *dst, +- struct ipa_call_summary *srcinfo, +- struct ipa_call_summary *info) ++ class ipa_call_summary *srcinfo, ++ class ipa_call_summary *info) + { + new (info) ipa_call_summary (*srcinfo); + info->predicate = NULL; +@@ -750,12 +782,12 @@ ipa_call_summary_t::duplicate (struct cg + + static void + dump_ipa_call_summary (FILE *f, int indent, struct cgraph_node *node, +- struct ipa_fn_summary *info) ++ class ipa_fn_summary *info) + { + struct cgraph_edge *edge; + for (edge = node->callees; edge; edge = edge->next_callee) + { +- struct ipa_call_summary *es = ipa_call_summaries->get (edge); ++ class 
ipa_call_summary *es = ipa_call_summaries->get (edge); + struct cgraph_node *callee = edge->callee->ultimate_alias_target (); + int i; + +@@ -768,9 +800,10 @@ dump_ipa_call_summary (FILE *f, int inde + es->call_stmt_size, es->call_stmt_time); + + ipa_fn_summary *s = ipa_fn_summaries->get (callee); ++ ipa_size_summary *ss = ipa_size_summaries->get (callee); + if (s != NULL) +- fprintf (f, "callee size:%2i stack:%2i", +- (int) (s->size / ipa_fn_summary::size_scale), ++ fprintf (f, " callee size:%2i stack:%2i", ++ (int) (ss->size / ipa_fn_summary::size_scale), + (int) s->estimated_stack_size); + + if (es->predicate) +@@ -794,19 +827,17 @@ dump_ipa_call_summary (FILE *f, int inde + } + if (!edge->inline_failed) + { +- ipa_fn_summary *s = ipa_fn_summaries->get (callee); +- fprintf (f, "%*sStack frame offset %i, callee self size %i," +- " callee size %i\n", ++ ipa_size_summary *ss = ipa_size_summaries->get (callee); ++ fprintf (f, "%*sStack frame offset %i, callee self size %i\n", + indent + 2, "", +- (int) s->stack_frame_offset, +- (int) s->estimated_self_stack_size, +- (int) s->estimated_stack_size); ++ (int) ipa_get_stack_frame_offset (callee), ++ (int) ss->estimated_self_stack_size); + dump_ipa_call_summary (f, indent + 2, callee, info); + } + } + for (edge = node->indirect_calls; edge; edge = edge->next_callee) + { +- struct ipa_call_summary *es = ipa_call_summaries->get (edge); ++ class ipa_call_summary *es = ipa_call_summaries->get (edge); + fprintf (f, "%*sindirect call loop depth:%2i freq:%4.2f size:%2i" + " time: %2i", + indent, "", +@@ -829,7 +860,8 @@ ipa_dump_fn_summary (FILE *f, struct cgr + { + if (node->definition) + { +- struct ipa_fn_summary *s = ipa_fn_summaries->get (node); ++ class ipa_fn_summary *s = ipa_fn_summaries->get (node); ++ class ipa_size_summary *ss = ipa_size_summaries->get (node); + if (s != NULL) + { + size_time_entry *e; +@@ -842,11 +874,11 @@ ipa_dump_fn_summary (FILE *f, struct cgr + if (s->fp_expressions) + fprintf (f, " fp_expression"); + fprintf (f, "\n global time: %f\n", s->time.to_double ()); +- fprintf (f, " self size: %i\n", s->self_size); +- fprintf (f, " global size: %i\n", s->size); ++ fprintf (f, " self size: %i\n", ss->self_size); ++ fprintf (f, " global size: %i\n", ss->size); + fprintf (f, " min size: %i\n", s->min_size); + fprintf (f, " self stack: %i\n", +- (int) s->estimated_self_stack_size); ++ (int) ss->estimated_self_stack_size); + fprintf (f, " global stack: %i\n", (int) s->estimated_stack_size); + if (s->growth) + fprintf (f, " estimated growth:%i\n", (int) s->growth); +@@ -900,7 +932,7 @@ ipa_dump_fn_summaries (FILE *f) + struct cgraph_node *node; + + FOR_EACH_DEFINED_FUNCTION (node) +- if (!node->global.inlined_to) ++ if (!node->inlined_to) + ipa_dump_fn_summary (f, node); + } + +@@ -922,7 +954,7 @@ mark_modified (ao_ref *ao ATTRIBUTE_UNUS + + static tree + unmodified_parm_1 (ipa_func_body_info *fbi, gimple *stmt, tree op, +- HOST_WIDE_INT *size_p) ++ poly_int64 *size_p) + { + /* SSA_NAME referring to parm default def? */ + if (TREE_CODE (op) == SSA_NAME +@@ -930,7 +962,7 @@ unmodified_parm_1 (ipa_func_body_info *f + && TREE_CODE (SSA_NAME_VAR (op)) == PARM_DECL) + { + if (size_p) +- *size_p = tree_to_shwi (TYPE_SIZE (TREE_TYPE (op))); ++ *size_p = tree_to_poly_int64 (TYPE_SIZE (TREE_TYPE (op))); + return SSA_NAME_VAR (op); + } + /* Non-SSA parm reference? 
*/ +@@ -951,7 +983,7 @@ unmodified_parm_1 (ipa_func_body_info *f + if (!modified) + { + if (size_p) +- *size_p = tree_to_shwi (TYPE_SIZE (TREE_TYPE (op))); ++ *size_p = tree_to_poly_int64 (TYPE_SIZE (TREE_TYPE (op))); + return op; + } + } +@@ -965,7 +997,7 @@ unmodified_parm_1 (ipa_func_body_info *f + + static tree + unmodified_parm (ipa_func_body_info *fbi, gimple *stmt, tree op, +- HOST_WIDE_INT *size_p) ++ poly_int64 *size_p) + { + tree res = unmodified_parm_1 (fbi, stmt, op, size_p); + if (res) +@@ -990,7 +1022,7 @@ unmodified_parm (ipa_func_body_info *fbi + static bool + unmodified_parm_or_parm_agg_item (struct ipa_func_body_info *fbi, + gimple *stmt, tree op, int *index_p, +- HOST_WIDE_INT *size_p, ++ poly_int64 *size_p, + struct agg_position_info *aggpos) + { + tree res = unmodified_parm_1 (fbi, stmt, op, size_p); +@@ -1157,25 +1189,147 @@ eliminated_by_inlining_prob (ipa_func_bo + } + } + ++/* Analyze EXPR if it represents a series of simple operations performed on ++ a function parameter and return true if so. FBI, STMT, EXPR, INDEX_P and ++ AGGPOS have the same meaning like in unmodified_parm_or_parm_agg_item. ++ Type of the parameter or load from an aggregate via the parameter is ++ stored in *TYPE_P. Operations on the parameter are recorded to ++ PARAM_OPS_P if it is not NULL. */ ++ ++static bool ++decompose_param_expr (struct ipa_func_body_info *fbi, ++ gimple *stmt, tree expr, ++ int *index_p, tree *type_p, ++ struct agg_position_info *aggpos, ++ expr_eval_ops *param_ops_p = NULL) ++{ ++ int op_limit = PARAM_VALUE (PARAM_IPA_MAX_PARAM_EXPR_OPS); ++ int op_count = 0; ++ ++ if (param_ops_p) ++ *param_ops_p = NULL; ++ ++ while (true) ++ { ++ expr_eval_op eval_op; ++ unsigned rhs_count; ++ unsigned cst_count = 0; ++ ++ if (unmodified_parm_or_parm_agg_item (fbi, stmt, expr, index_p, NULL, ++ aggpos)) ++ { ++ tree type = TREE_TYPE (expr); ++ ++ if (aggpos->agg_contents) ++ { ++ /* Stop if containing bit-field. */ ++ if (TREE_CODE (expr) == BIT_FIELD_REF ++ || contains_bitfld_component_ref_p (expr)) ++ break; ++ } ++ ++ *type_p = type; ++ return true; ++ } ++ ++ if (TREE_CODE (expr) != SSA_NAME || SSA_NAME_IS_DEFAULT_DEF (expr)) ++ break; ++ ++ if (!is_gimple_assign (stmt = SSA_NAME_DEF_STMT (expr))) ++ break; ++ ++ switch (gimple_assign_rhs_class (stmt)) ++ { ++ case GIMPLE_SINGLE_RHS: ++ expr = gimple_assign_rhs1 (stmt); ++ continue; ++ ++ case GIMPLE_UNARY_RHS: ++ rhs_count = 1; ++ break; ++ ++ case GIMPLE_BINARY_RHS: ++ rhs_count = 2; ++ break; ++ ++ case GIMPLE_TERNARY_RHS: ++ rhs_count = 3; ++ break; ++ ++ default: ++ goto fail; ++ } ++ ++ /* Stop if expression is too complex. */ ++ if (op_count++ == op_limit) ++ break; ++ ++ if (param_ops_p) ++ { ++ eval_op.code = gimple_assign_rhs_code (stmt); ++ eval_op.type = TREE_TYPE (gimple_assign_lhs (stmt)); ++ eval_op.val[0] = NULL_TREE; ++ eval_op.val[1] = NULL_TREE; ++ } ++ ++ expr = NULL_TREE; ++ for (unsigned i = 0; i < rhs_count; i++) ++ { ++ tree op = gimple_op (stmt, i + 1); ++ ++ gcc_assert (op && !TYPE_P (op)); ++ if (is_gimple_ip_invariant (op)) ++ { ++ if (++cst_count == rhs_count) ++ goto fail; ++ ++ eval_op.val[cst_count - 1] = op; ++ } ++ else if (!expr) ++ { ++ /* Found a non-constant operand, and record its index in rhs ++ operands. */ ++ eval_op.index = i; ++ expr = op; ++ } ++ else ++ { ++ /* Found more than one non-constant operands. */ ++ goto fail; ++ } ++ } ++ ++ if (param_ops_p) ++ vec_safe_insert (*param_ops_p, 0, eval_op); ++ } ++ ++ /* Failed to decompose, free resource and return. 
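A minimal example of the kind of expression decompose_param_expr handles (illustration only):

    int foo (int p)
    {
      int t = (p & 0xff) + 1;
      if (t == 5)
        ...
    }

Starting from the SSA name feeding the condition, the loop above walks back through the defining statements, records BIT_AND_EXPR 0xff and PLUS_EXPR 1 into *PARAM_OPS_P (at most PARAM_IPA_MAX_PARAM_EXPR_OPS of them), and succeeds once it reaches the unmodified parameter P; it bails out if a statement has more than one non-constant operand or the access involves a bit-field.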
*/ ++fail: ++ if (param_ops_p) ++ vec_free (*param_ops_p); ++ ++ return false; ++} + + /* If BB ends by a conditional we can turn into predicates, attach corresponding + predicates to the CFG edges. */ + + static void + set_cond_stmt_execution_predicate (struct ipa_func_body_info *fbi, +- struct ipa_fn_summary *summary, ++ class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, + basic_block bb) + { + gimple *last; +- tree op; ++ tree op, op2; + int index; +- HOST_WIDE_INT size; + struct agg_position_info aggpos; + enum tree_code code, inverted_code; + edge e; + edge_iterator ei; + gimple *set_stmt; +- tree op2; ++ tree param_type; ++ expr_eval_ops param_ops; + + last = last_stmt (bb); + if (!last || gimple_code (last) != GIMPLE_COND) +@@ -1183,10 +1337,9 @@ set_cond_stmt_execution_predicate (struc + if (!is_gimple_ip_invariant (gimple_cond_rhs (last))) + return; + op = gimple_cond_lhs (last); +- /* TODO: handle conditionals like +- var = op0 < 4; +- if (var != 0). */ +- if (unmodified_parm_or_parm_agg_item (fbi, last, op, &index, &size, &aggpos)) ++ ++ if (decompose_param_expr (fbi, last, op, &index, ¶m_type, &aggpos, ++ ¶m_ops)) + { + code = gimple_cond_code (last); + inverted_code = invert_tree_comparison (code, HONOR_NANS (op)); +@@ -1197,17 +1350,24 @@ set_cond_stmt_execution_predicate (struc + ? code : inverted_code); + /* invert_tree_comparison will return ERROR_MARK on FP + comparsions that are not EQ/NE instead of returning proper +- unordered one. Be sure it is not confused with NON_CONSTANT. */ +- if (this_code != ERROR_MARK) ++ unordered one. Be sure it is not confused with NON_CONSTANT. ++ ++ And if the edge's target is the final block of diamond CFG graph ++ of this conditional statement, we do not need to compute ++ predicate for the edge because the final block's predicate must ++ be at least as that of the first block of the statement. 
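A tiny example of the diamond-CFG shortcut described above (not from the patch):

    if (p > 2)     /* bb2: condition on tracked parameter P    */
      x = f ();    /* bb3: true edge carries predicate  P > 2  */
    use (x);       /* bb4: join block, post-dominates bb2      */

The false edge of bb2 goes straight to bb4; since bb4 post-dominates bb2, dominated_by_p (CDI_POST_DOMINATORS, bb, e->dest) holds for that edge and no edge predicate is allocated for it -- bb4 simply inherits bb2's own predicate.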
*/ ++ if (this_code != ERROR_MARK ++ && !dominated_by_p (CDI_POST_DOMINATORS, bb, e->dest)) + { + predicate p +- = add_condition (summary, index, size, &aggpos, this_code, +- unshare_expr_without_location +- (gimple_cond_rhs (last))); ++ = add_condition (summary, params_summary, index, ++ param_type, &aggpos, ++ this_code, gimple_cond_rhs (last), param_ops); + e->aux = edge_predicate_pool.allocate (); + *(predicate *) e->aux = p; + } + } ++ vec_free (param_ops); + } + + if (TREE_CODE (op) != SSA_NAME) +@@ -1230,12 +1390,12 @@ set_cond_stmt_execution_predicate (struc + || gimple_call_num_args (set_stmt) != 1) + return; + op2 = gimple_call_arg (set_stmt, 0); +- if (!unmodified_parm_or_parm_agg_item (fbi, set_stmt, op2, &index, &size, +- &aggpos)) ++ if (!decompose_param_expr (fbi, set_stmt, op2, &index, ¶m_type, &aggpos)) + return; + FOR_EACH_EDGE (e, ei, bb->succs) if (e->flags & EDGE_FALSE_VALUE) + { +- predicate p = add_condition (summary, index, size, &aggpos, ++ predicate p = add_condition (summary, params_summary, index, ++ param_type, &aggpos, + predicate::is_not_constant, NULL_TREE); + e->aux = edge_predicate_pool.allocate (); + *(predicate *) e->aux = p; +@@ -1248,63 +1408,200 @@ set_cond_stmt_execution_predicate (struc + + static void + set_switch_stmt_execution_predicate (struct ipa_func_body_info *fbi, +- struct ipa_fn_summary *summary, ++ class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, + basic_block bb) + { + gimple *lastg; + tree op; + int index; +- HOST_WIDE_INT size; + struct agg_position_info aggpos; + edge e; + edge_iterator ei; + size_t n; + size_t case_idx; ++ tree param_type; ++ expr_eval_ops param_ops; + + lastg = last_stmt (bb); + if (!lastg || gimple_code (lastg) != GIMPLE_SWITCH) + return; + gswitch *last = as_a (lastg); + op = gimple_switch_index (last); +- if (!unmodified_parm_or_parm_agg_item (fbi, last, op, &index, &size, &aggpos)) ++ if (!decompose_param_expr (fbi, last, op, &index, ¶m_type, &aggpos, ++ ¶m_ops)) + return; + ++ auto_vec > ranges; ++ tree type = TREE_TYPE (op); ++ int bound_limit = PARAM_VALUE (PARAM_IPA_MAX_SWITCH_PREDICATE_BOUNDS); ++ int bound_count = 0; ++ wide_int vr_wmin, vr_wmax; ++ value_range_kind vr_type = get_range_info (op, &vr_wmin, &vr_wmax); ++ + FOR_EACH_EDGE (e, ei, bb->succs) + { + e->aux = edge_predicate_pool.allocate (); + *(predicate *) e->aux = false; + } ++ ++ e = gimple_switch_edge (cfun, last, 0); ++ /* Set BOUND_COUNT to maximum count to bypass computing predicate for ++ default case if its target basic block is in convergence point of all ++ switch cases, which can be determined by checking whether it ++ post-dominates the switch statement. */ ++ if (dominated_by_p (CDI_POST_DOMINATORS, bb, e->dest)) ++ bound_count = INT_MAX; ++ + n = gimple_switch_num_labels (last); +- for (case_idx = 0; case_idx < n; ++case_idx) ++ for (case_idx = 1; case_idx < n; ++case_idx) + { + tree cl = gimple_switch_label (last, case_idx); +- tree min, max; ++ tree min = CASE_LOW (cl); ++ tree max = CASE_HIGH (cl); + predicate p; + + e = gimple_switch_edge (cfun, last, case_idx); +- min = CASE_LOW (cl); +- max = CASE_HIGH (cl); + +- /* For default we might want to construct predicate that none +- of cases is met, but it is bit hard to do not having negations +- of conditionals handy. */ +- if (!min && !max) ++ /* The case value might not have same type as switch expression, ++ extend the value based on the expression type. 
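For orientation, the per-edge predicates this produces for a small switch (illustration only; GCC's case-range extension used for brevity):

    switch (p)           /* P is a tracked parameter              */
      {
      case 4:            /* edge predicate:  P == 4               */
        ...
      case 10 ... 20:    /* edge predicate:  P >= 10 && P <= 20   */
        ...
      default:           /* negation of all case ranges, built    */
        ...              /* from the `ranges' vector further on   */
      }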
*/ ++ if (TREE_TYPE (min) != type) ++ min = wide_int_to_tree (type, wi::to_wide (min)); ++ ++ if (!max) ++ max = min; ++ else if (TREE_TYPE (max) != type) ++ max = wide_int_to_tree (type, wi::to_wide (max)); ++ ++ /* The case's target basic block is in convergence point of all switch ++ cases, its predicate should be at least as that of the switch ++ statement. */ ++ if (dominated_by_p (CDI_POST_DOMINATORS, bb, e->dest)) + p = true; +- else if (!max) +- p = add_condition (summary, index, size, &aggpos, EQ_EXPR, +- unshare_expr_without_location (min)); ++ else if (min == max) ++ p = add_condition (summary, params_summary, index, param_type, ++ &aggpos, EQ_EXPR, min, param_ops); + else + { + predicate p1, p2; +- p1 = add_condition (summary, index, size, &aggpos, GE_EXPR, +- unshare_expr_without_location (min)); +- p2 = add_condition (summary, index, size, &aggpos, LE_EXPR, +- unshare_expr_without_location (max)); ++ p1 = add_condition (summary, params_summary, index, param_type, ++ &aggpos, GE_EXPR, min, param_ops); ++ p2 = add_condition (summary, params_summary,index, param_type, ++ &aggpos, LE_EXPR, max, param_ops); + p = p1 & p2; + } +- *(struct predicate *) e->aux +- = p.or_with (summary->conds, *(struct predicate *) e->aux); ++ *(class predicate *) e->aux ++ = p.or_with (summary->conds, *(class predicate *) e->aux); ++ ++ /* If there are too many disjoint case ranges, predicate for default ++ case might become too complicated. So add a limit here. */ ++ if (bound_count > bound_limit) ++ continue; ++ ++ bool new_range = true; ++ ++ if (!ranges.is_empty ()) ++ { ++ wide_int curr_wmin = wi::to_wide (min); ++ wide_int last_wmax = wi::to_wide (ranges.last ().second); ++ ++ /* Merge case ranges if they are continuous. */ ++ if (curr_wmin == last_wmax + 1) ++ new_range = false; ++ else if (vr_type == VR_ANTI_RANGE) ++ { ++ /* If two disjoint case ranges can be connected by anti-range ++ of switch index, combine them to one range. */ ++ if (wi::lt_p (vr_wmax, curr_wmin - 1, TYPE_SIGN (type))) ++ vr_type = VR_UNDEFINED; ++ else if (wi::le_p (vr_wmin, last_wmax + 1, TYPE_SIGN (type))) ++ new_range = false; ++ } ++ } ++ ++ /* Create/extend a case range. And we count endpoints of range set, ++ this number nearly equals to number of conditions that we will create ++ for predicate of default case. */ ++ if (new_range) ++ { ++ bound_count += (min == max) ? 1 : 2; ++ ranges.safe_push (std::make_pair (min, max)); ++ } ++ else ++ { ++ bound_count += (ranges.last ().first == ranges.last ().second); ++ ranges.last ().second = max; ++ } ++ } ++ ++ e = gimple_switch_edge (cfun, last, 0); ++ if (bound_count > bound_limit) ++ { ++ *(class predicate *) e->aux = true; ++ vec_free (param_ops); ++ return; ++ } ++ ++ predicate p_seg = true; ++ predicate p_all = false; ++ ++ if (vr_type != VR_RANGE) ++ { ++ vr_wmin = wi::to_wide (TYPE_MIN_VALUE (type)); ++ vr_wmax = wi::to_wide (TYPE_MAX_VALUE (type)); + } ++ ++ /* Construct predicate to represent default range set that is negation of ++ all case ranges. Case range is classified as containing single/non-single ++ values. Suppose a piece of case ranges in the following. ++ ++ [D1...D2] [S1] ... [Sn] [D3...D4] ++ ++ To represent default case's range sets between two non-single value ++ case ranges (From D2 to D3), we construct predicate as: ++ ++ D2 < x < D3 && x != S1 && ... 
&& x != Sn ++ */ ++ for (size_t i = 0; i < ranges.length (); i++) ++ { ++ tree min = ranges[i].first; ++ tree max = ranges[i].second; ++ ++ if (min == max) ++ p_seg &= add_condition (summary, params_summary, index, ++ param_type, &aggpos, NE_EXPR, ++ min, param_ops); ++ else ++ { ++ /* Do not create sub-predicate for range that is beyond low bound ++ of switch index. */ ++ if (wi::lt_p (vr_wmin, wi::to_wide (min), TYPE_SIGN (type))) ++ { ++ p_seg &= add_condition (summary, params_summary, index, ++ param_type, &aggpos, ++ LT_EXPR, min, param_ops); ++ p_all = p_all.or_with (summary->conds, p_seg); ++ } ++ ++ /* Do not create sub-predicate for range that is beyond up bound ++ of switch index. */ ++ if (wi::le_p (vr_wmax, wi::to_wide (max), TYPE_SIGN (type))) ++ { ++ p_seg = false; ++ break; ++ } ++ ++ p_seg = add_condition (summary, params_summary, index, ++ param_type, &aggpos, GT_EXPR, ++ max, param_ops); ++ } ++ } ++ ++ p_all = p_all.or_with (summary->conds, p_seg); ++ *(class predicate *) e->aux ++ = p_all.or_with (summary->conds, *(class predicate *) e->aux); ++ ++ vec_free (param_ops); + } + + +@@ -1314,7 +1611,8 @@ set_switch_stmt_execution_predicate (str + static void + compute_bb_predicates (struct ipa_func_body_info *fbi, + struct cgraph_node *node, +- struct ipa_fn_summary *summary) ++ class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary) + { + struct function *my_function = DECL_STRUCT_FUNCTION (node->decl); + bool done = false; +@@ -1322,8 +1620,8 @@ compute_bb_predicates (struct ipa_func_b + + FOR_EACH_BB_FN (bb, my_function) + { +- set_cond_stmt_execution_predicate (fbi, summary, bb); +- set_switch_stmt_execution_predicate (fbi, summary, bb); ++ set_cond_stmt_execution_predicate (fbi, summary, params_summary, bb); ++ set_switch_stmt_execution_predicate (fbi, summary, params_summary, bb); + } + + /* Entry block is always executable. */ +@@ -1348,16 +1646,16 @@ compute_bb_predicates (struct ipa_func_b + predicate this_bb_predicate + = *(predicate *) e->src->aux; + if (e->aux) +- this_bb_predicate &= (*(struct predicate *) e->aux); ++ this_bb_predicate &= (*(class predicate *) e->aux); + p = p.or_with (summary->conds, this_bb_predicate); + if (p == true) + break; + } + } +- if (p == false) +- gcc_checking_assert (!bb->aux); +- else ++ if (p != false) + { ++ basic_block pdom_bb; ++ + if (!bb->aux) + { + done = false; +@@ -1376,6 +1674,34 @@ compute_bb_predicates (struct ipa_func_b + *((predicate *) bb->aux) = p; + } + } ++ ++ /* For switch/if statement, we can OR-combine predicates of all ++ its cases/branches to get predicate for basic block in their ++ convergence point, but sometimes this will generate very ++ complicated predicate. Actually, we can get simplified ++ predicate in another way by using the fact that predicate ++ for a basic block must also hold true for its post dominators. ++ To be specific, basic block in convergence point of ++ conditional statement should include predicate of the ++ statement. 
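A short example of why the post-dominator propagation above pays off (illustration only):

    if (a) f1 ();     /* three independent conditions on parameters */
    if (b) f2 ();
    if (c) f3 ();
    g ();             /* post-dominates all three condition blocks  */

OR-combining the predicates of every path reaching g () would accumulate conditions on A, B and C for no gain; propagating each condition block's own predicate to its immediate post-dominator lets the predicate of g ()'s block collapse back to that of the enclosing block.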
*/ ++ pdom_bb = get_immediate_dominator (CDI_POST_DOMINATORS, bb); ++ if (pdom_bb == EXIT_BLOCK_PTR_FOR_FN (my_function) || !pdom_bb) ++ ; ++ else if (!pdom_bb->aux) ++ { ++ done = false; ++ pdom_bb->aux = edge_predicate_pool.allocate (); ++ *((predicate *) pdom_bb->aux) = p; ++ } ++ else if (p != *(predicate *) pdom_bb->aux) ++ { ++ p = p.or_with (summary->conds, *(predicate *)pdom_bb->aux); ++ if (p != *(predicate *) pdom_bb->aux) ++ { ++ done = false; ++ *((predicate *) pdom_bb->aux) = p; ++ } ++ } + } + } + } +@@ -1387,21 +1713,21 @@ compute_bb_predicates (struct ipa_func_b + + static predicate + will_be_nonconstant_expr_predicate (ipa_func_body_info *fbi, +- struct ipa_fn_summary *summary, ++ class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, + tree expr, + vec nonconstant_names) + { + tree parm; + int index; +- HOST_WIDE_INT size; + + while (UNARY_CLASS_P (expr)) + expr = TREE_OPERAND (expr, 0); + +- parm = unmodified_parm (fbi, NULL, expr, &size); ++ parm = unmodified_parm (fbi, NULL, expr, NULL); + if (parm && (index = ipa_get_param_decl_index (fbi->info, parm)) >= 0) +- return add_condition (summary, index, size, NULL, predicate::changed, +- NULL_TREE); ++ return add_condition (summary, params_summary, index, TREE_TYPE (parm), NULL, ++ predicate::changed, NULL_TREE); + if (is_gimple_min_invariant (expr)) + return false; + if (TREE_CODE (expr) == SSA_NAME) +@@ -1410,6 +1736,7 @@ will_be_nonconstant_expr_predicate (ipa_ + { + predicate p1 + = will_be_nonconstant_expr_predicate (fbi, summary, ++ params_summary, + TREE_OPERAND (expr, 0), + nonconstant_names); + if (p1 == true) +@@ -1417,6 +1744,7 @@ will_be_nonconstant_expr_predicate (ipa_ + + predicate p2 + = will_be_nonconstant_expr_predicate (fbi, summary, ++ params_summary, + TREE_OPERAND (expr, 1), + nonconstant_names); + return p1.or_with (summary->conds, p2); +@@ -1425,6 +1753,7 @@ will_be_nonconstant_expr_predicate (ipa_ + { + predicate p1 + = will_be_nonconstant_expr_predicate (fbi, summary, ++ params_summary, + TREE_OPERAND (expr, 0), + nonconstant_names); + if (p1 == true) +@@ -1432,12 +1761,14 @@ will_be_nonconstant_expr_predicate (ipa_ + + predicate p2 + = will_be_nonconstant_expr_predicate (fbi, summary, ++ params_summary, + TREE_OPERAND (expr, 1), + nonconstant_names); + if (p2 == true) + return p2; + p1 = p1.or_with (summary->conds, p2); + p2 = will_be_nonconstant_expr_predicate (fbi, summary, ++ params_summary, + TREE_OPERAND (expr, 2), + nonconstant_names); + return p2.or_with (summary->conds, p1); +@@ -1458,17 +1789,18 @@ will_be_nonconstant_expr_predicate (ipa_ + + static predicate + will_be_nonconstant_predicate (struct ipa_func_body_info *fbi, +- struct ipa_fn_summary *summary, ++ class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, + gimple *stmt, + vec nonconstant_names) + { + predicate p = true; + ssa_op_iter iter; + tree use; ++ tree param_type = NULL_TREE; + predicate op_non_const; + bool is_load; + int base_index; +- HOST_WIDE_INT size; + struct agg_position_info aggpos; + + /* What statments might be optimized away +@@ -1489,11 +1821,9 @@ will_be_nonconstant_predicate (struct ip + /* Loads can be optimized when the value is known. 
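A sketch of the load case handled just below (example only):

    struct S { int n; };
    static int f (struct S *s) { return s->n + 1; }

When IPA-CP knows the (offset, value) pair stored at s->n for a particular context, the load folds away in the specialized clone, so the statement is accounted for under a `changed' predicate on that aggregate slot (via decompose_param_expr and add_condition) instead of being counted unconditionally.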
*/ + if (is_load) + { +- tree op; +- gcc_assert (gimple_assign_single_p (stmt)); +- op = gimple_assign_rhs1 (stmt); +- if (!unmodified_parm_or_parm_agg_item (fbi, stmt, op, &base_index, &size, +- &aggpos)) ++ tree op = gimple_assign_rhs1 (stmt); ++ if (!decompose_param_expr (fbi, stmt, op, &base_index, ¶m_type, ++ &aggpos)) + return p; + } + else +@@ -1518,21 +1848,22 @@ will_be_nonconstant_predicate (struct ip + + if (is_load) + op_non_const = +- add_condition (summary, base_index, size, &aggpos, predicate::changed, +- NULL); ++ add_condition (summary, params_summary, ++ base_index, param_type, &aggpos, ++ predicate::changed, NULL_TREE); + else + op_non_const = false; + FOR_EACH_SSA_TREE_OPERAND (use, stmt, iter, SSA_OP_USE) + { +- HOST_WIDE_INT size; +- tree parm = unmodified_parm (fbi, stmt, use, &size); ++ tree parm = unmodified_parm (fbi, stmt, use, NULL); + int index; + + if (parm && (index = ipa_get_param_decl_index (fbi->info, parm)) >= 0) + { + if (index != base_index) +- p = add_condition (summary, index, size, NULL, predicate::changed, +- NULL_TREE); ++ p = add_condition (summary, params_summary, index, ++ TREE_TYPE (parm), NULL, ++ predicate::changed, NULL_TREE); + else + continue; + } +@@ -1566,7 +1897,7 @@ struct record_modified_bb_info + static basic_block + get_minimal_bb (basic_block init_bb, basic_block use_bb) + { +- struct loop *l = find_common_loop (init_bb->loop_father, use_bb->loop_father); ++ class loop *l = find_common_loop (init_bb->loop_father, use_bb->loop_father); + if (l && l->header->count < init_bb->count) + return l->header; + return init_bb; +@@ -1664,7 +1995,7 @@ param_change_prob (ipa_func_body_info *f + return REG_BR_PROB_BASE; + if (dump_file) + { +- fprintf (dump_file, " Analyzing param change probablity of "); ++ fprintf (dump_file, " Analyzing param change probability of "); + print_generic_expr (dump_file, op, TDF_SLIM); + fprintf (dump_file, "\n"); + } +@@ -1718,7 +2049,9 @@ param_change_prob (ipa_func_body_info *f + + static bool + phi_result_unknown_predicate (ipa_func_body_info *fbi, +- ipa_fn_summary *summary, basic_block bb, ++ ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, ++ basic_block bb, + predicate *p, + vec nonconstant_names) + { +@@ -1762,7 +2095,7 @@ phi_result_unknown_predicate (ipa_func_b + || !is_gimple_ip_invariant (gimple_cond_rhs (stmt))) + return false; + +- *p = will_be_nonconstant_expr_predicate (fbi, summary, ++ *p = will_be_nonconstant_expr_predicate (fbi, summary, params_summary, + gimple_cond_lhs (stmt), + nonconstant_names); + if (*p == true) +@@ -1777,7 +2110,7 @@ phi_result_unknown_predicate (ipa_func_b + NONCONSTANT_NAMES, if possible. */ + + static void +-predicate_for_phi_result (struct ipa_fn_summary *summary, gphi *phi, ++predicate_for_phi_result (class ipa_fn_summary *summary, gphi *phi, + predicate *p, + vec nonconstant_names) + { +@@ -1954,7 +2287,8 @@ analyze_function_body (struct cgraph_nod + basic_block bb; + struct function *my_function = DECL_STRUCT_FUNCTION (node->decl); + sreal freq; +- struct ipa_fn_summary *info = ipa_fn_summaries->get_create (node); ++ class ipa_fn_summary *info = ipa_fn_summaries->get_create (node); ++ class ipa_node_params *params_summary = early ? 
NULL : IPA_NODE_REF (node); + predicate bb_predicate; + struct ipa_func_body_info fbi; + vec nonconstant_names = vNULL; +@@ -1980,6 +2314,7 @@ analyze_function_body (struct cgraph_nod + if (opt_for_fn (node->decl, optimize)) + { + calculate_dominance_info (CDI_DOMINATORS); ++ calculate_dominance_info (CDI_POST_DOMINATORS); + if (!early) + loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS); + else +@@ -2019,7 +2354,7 @@ analyze_function_body (struct cgraph_nod + bb_predicate); + + if (fbi.info) +- compute_bb_predicates (&fbi, node, info); ++ compute_bb_predicates (&fbi, node, info, params_summary); + order = XNEWVEC (int, n_basic_blocks_for_fn (cfun)); + nblocks = pre_and_rev_post_order_compute (NULL, order, false); + for (n = 0; n < nblocks; n++) +@@ -2061,7 +2396,9 @@ analyze_function_body (struct cgraph_nod + gsi_next (&bsi)) + { + if (first_phi +- && !phi_result_unknown_predicate (&fbi, info, bb, ++ && !phi_result_unknown_predicate (&fbi, info, ++ params_summary, ++ bb, + &phi_predicate, + nonconstant_names)) + break; +@@ -2159,7 +2496,7 @@ analyze_function_body (struct cgraph_nod + just maximum of the possible paths. */ + if (fbi.info) + will_be_nonconstant +- = will_be_nonconstant_predicate (&fbi, info, ++ = will_be_nonconstant_predicate (&fbi, info, params_summary, + stmt, nonconstant_names); + else + will_be_nonconstant = true; +@@ -2174,7 +2511,7 @@ analyze_function_body (struct cgraph_nod + if (prob == 2 && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\t\tWill be eliminated by inlining\n"); + +- struct predicate p = bb_predicate & will_be_nonconstant; ++ class predicate p = bb_predicate & will_be_nonconstant; + + /* We can ignore statement when we proved it is never going + to happen, but we cannot do that for call statements +@@ -2226,7 +2563,8 @@ analyze_function_body (struct cgraph_nod + predicate p = bb_predicate; + if (fbi.info) + p = p & will_be_nonconstant_expr_predicate +- (&fbi, info, TREE_OPERAND (op, 1), ++ (&fbi, info, params_summary, ++ TREE_OPERAND (op, 1), + nonconstant_names); + if (p != false) + { +@@ -2249,7 +2587,7 @@ analyze_function_body (struct cgraph_nod + + if (nonconstant_names.exists () && !early) + { +- struct loop *loop; ++ class loop *loop; + predicate loop_iterations = true; + predicate loop_stride = true; + +@@ -2261,7 +2599,7 @@ analyze_function_body (struct cgraph_nod + vec exits; + edge ex; + unsigned int j; +- struct tree_niter_desc niter_desc; ++ class tree_niter_desc niter_desc; + bb_predicate = *(predicate *) loop->header->aux; + + exits = get_loop_exit_edges (loop); +@@ -2271,6 +2609,7 @@ analyze_function_body (struct cgraph_nod + { + predicate will_be_nonconstant + = will_be_nonconstant_expr_predicate (&fbi, info, ++ params_summary, + niter_desc.niter, + nonconstant_names); + if (will_be_nonconstant != true) +@@ -2315,7 +2654,9 @@ analyze_function_body (struct cgraph_nod + continue; + + predicate will_be_nonconstant +- = will_be_nonconstant_expr_predicate (&fbi, info, iv.step, ++ = will_be_nonconstant_expr_predicate (&fbi, info, ++ params_summary, ++ iv.step, + nonconstant_names); + if (will_be_nonconstant != true) + will_be_nonconstant = bb_predicate & will_be_nonconstant; +@@ -2349,8 +2690,9 @@ analyze_function_body (struct cgraph_nod + } + } + ipa_fn_summary *s = ipa_fn_summaries->get (node); ++ ipa_size_summary *ss = ipa_size_summaries->get (node); + s->time = time; +- s->self_size = size; ++ ss->self_size = size; + nonconstant_names.release (); + ipa_release_body_info (&fbi); + if (opt_for_fn (node->decl, 
optimize)) +@@ -2360,6 +2702,7 @@ analyze_function_body (struct cgraph_nod + else if (!ipa_edge_args_sum) + ipa_free_all_node_params (); + free_dominance_info (CDI_DOMINATORS); ++ free_dominance_info (CDI_POST_DOMINATORS); + } + if (dump_file) + { +@@ -2377,9 +2720,8 @@ compute_fn_summary (struct cgraph_node * + { + HOST_WIDE_INT self_stack_size; + struct cgraph_edge *e; +- struct ipa_fn_summary *info; + +- gcc_assert (!node->global.inlined_to); ++ gcc_assert (!node->inlined_to); + + if (!ipa_fn_summaries) + ipa_fn_summary_alloc (); +@@ -2387,14 +2729,14 @@ compute_fn_summary (struct cgraph_node * + /* Create a new ipa_fn_summary. */ + ((ipa_fn_summary_t *)ipa_fn_summaries)->remove_callees (node); + ipa_fn_summaries->remove (node); +- info = ipa_fn_summaries->get_create (node); ++ class ipa_fn_summary *info = ipa_fn_summaries->get_create (node); ++ class ipa_size_summary *size_info = ipa_size_summaries->get_create (node); + + /* Estimate the stack size for the function if we're optimizing. */ + self_stack_size = optimize && !node->thunk.thunk_p + ? estimated_stack_frame_size (node) : 0; +- info->estimated_self_stack_size = self_stack_size; ++ size_info->estimated_self_stack_size = self_stack_size; + info->estimated_stack_size = self_stack_size; +- info->stack_frame_offset = 0; + + if (node->thunk.thunk_p) + { +@@ -2412,7 +2754,7 @@ compute_fn_summary (struct cgraph_node * + t = predicate::not_inlined (); + info->account_size_time (2 * ipa_fn_summary::size_scale, 0, t, t); + ipa_update_overall_fn_summary (node); +- info->self_size = info->size; ++ size_info->self_size = size_info->size; + if (stdarg_p (TREE_TYPE (node->decl))) + { + info->inlinable = false; +@@ -2468,16 +2810,15 @@ compute_fn_summary (struct cgraph_node * + node->calls_comdat_local = (e != NULL); + + /* Inlining characteristics are maintained by the cgraph_mark_inline. */ +- info->size = info->self_size; +- info->stack_frame_offset = 0; +- info->estimated_stack_size = info->estimated_self_stack_size; ++ size_info->size = size_info->self_size; ++ info->estimated_stack_size = size_info->estimated_self_stack_size; + + /* Code above should compute exactly the same result as + ipa_update_overall_fn_summary but because computation happens in + different order the roundoff errors result in slight changes. */ + ipa_update_overall_fn_summary (node); + /* In LTO mode we may have speculative edges set. 
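For orientation (inferred from the uses in these hunks, not from the actual declarations): the purely size-related fields now live in the new ipa_size_summary, while timing and predicate data stay in ipa_fn_summary, roughly:

    /* ipa_size_summaries->get (node):  estimated_self_stack_size,
       self_size, size                                             */
    /* ipa_fn_summaries->get (node):    time, min_size, conds,
       size_time_table, estimated_stack_size, loop hints, ...      */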
*/ +- gcc_assert (in_lto_p || info->size == info->self_size); ++ gcc_assert (in_lto_p || size_info->size == size_info->self_size); + } + + +@@ -2499,11 +2840,11 @@ estimate_edge_devirt_benefit (struct cgr + int *size, int *time, + vec known_vals, + vec known_contexts, +- vec known_aggs) ++ vec known_aggs) + { + tree target; + struct cgraph_node *callee; +- struct ipa_fn_summary *isummary; ++ class ipa_fn_summary *isummary; + enum availability avail; + bool speculative; + +@@ -2548,10 +2889,10 @@ estimate_edge_size_and_time (struct cgra + int prob, + vec known_vals, + vec known_contexts, +- vec known_aggs, ++ vec known_aggs, + ipa_hints *hints) + { +- struct ipa_call_summary *es = ipa_call_summaries->get (e); ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + int call_size = es->call_stmt_size; + int call_time = es->call_stmt_time; + int cur_size; +@@ -2583,12 +2924,12 @@ estimate_calls_size_and_time (struct cgr + clause_t possible_truths, + vec known_vals, + vec known_contexts, +- vec known_aggs) ++ vec known_aggs) + { + struct cgraph_edge *e; + for (e = node->callees; e; e = e->next_callee) + { +- struct ipa_call_summary *es = ipa_call_summaries->get_create (e); ++ class ipa_call_summary *es = ipa_call_summaries->get_create (e); + + /* Do not care about zero sized builtins. */ + if (e->inline_failed && !es->call_stmt_size) +@@ -2619,7 +2960,7 @@ estimate_calls_size_and_time (struct cgr + } + for (e = node->indirect_calls; e; e = e->next_callee) + { +- struct ipa_call_summary *es = ipa_call_summaries->get_create (e); ++ class ipa_call_summary *es = ipa_call_summaries->get_create (e); + if (!es->predicate + || es->predicate->evaluate (possible_truths)) + estimate_edge_size_and_time (e, size, +@@ -2630,31 +2971,250 @@ estimate_calls_size_and_time (struct cgr + } + } + ++/* Default constructor for ipa call context. ++ Memory alloction of known_vals, known_contexts ++ and known_aggs vectors is owned by the caller, but can ++ be release by ipa_call_context::release. ++ ++ inline_param_summary is owned by the caller. */ ++ipa_call_context::ipa_call_context (cgraph_node *node, ++ clause_t possible_truths, ++ clause_t nonspec_possible_truths, ++ vec known_vals, ++ vec ++ known_contexts, ++ vec known_aggs, ++ vec ++ inline_param_summary) ++: m_node (node), m_possible_truths (possible_truths), ++ m_nonspec_possible_truths (nonspec_possible_truths), ++ m_inline_param_summary (inline_param_summary), ++ m_known_vals (known_vals), ++ m_known_contexts (known_contexts), ++ m_known_aggs (known_aggs) ++{ ++} ++ ++/* Set THIS to be a duplicate of CTX. Copy all relevant info. */ ++ ++void ++ipa_call_context::duplicate_from (const ipa_call_context &ctx) ++{ ++ m_node = ctx.m_node; ++ m_possible_truths = ctx.m_possible_truths; ++ m_nonspec_possible_truths = ctx.m_nonspec_possible_truths; ++ class ipa_node_params *params_summary = IPA_NODE_REF (m_node); ++ unsigned int nargs = params_summary ++ ? ipa_get_param_count (params_summary) : 0; ++ ++ m_inline_param_summary = vNULL; ++ /* Copy the info only if there is at least one useful entry. 
*/ ++ if (ctx.m_inline_param_summary.exists ()) ++ { ++ unsigned int n = MIN (ctx.m_inline_param_summary.length (), nargs); ++ ++ for (unsigned int i = 0; i < n; i++) ++ if (ipa_is_param_used_by_ipa_predicates (params_summary, i) ++ && !ctx.m_inline_param_summary[i].useless_p ()) ++ { ++ m_inline_param_summary ++ = ctx.m_inline_param_summary.copy (); ++ break; ++ } ++ } ++ m_known_vals = vNULL; ++ if (ctx.m_known_vals.exists ()) ++ { ++ unsigned int n = MIN (ctx.m_known_vals.length (), nargs); ++ ++ for (unsigned int i = 0; i < n; i++) ++ if (ipa_is_param_used_by_indirect_call (params_summary, i) ++ && ctx.m_known_vals[i]) ++ { ++ m_known_vals = ctx.m_known_vals.copy (); ++ break; ++ } ++ } ++ ++ m_known_contexts = vNULL; ++ if (ctx.m_known_contexts.exists ()) ++ { ++ unsigned int n = MIN (ctx.m_known_contexts.length (), nargs); ++ ++ for (unsigned int i = 0; i < n; i++) ++ if (ipa_is_param_used_by_polymorphic_call (params_summary, i) ++ && !ctx.m_known_contexts[i].useless_p ()) ++ { ++ m_known_contexts = ctx.m_known_contexts.copy (); ++ break; ++ } ++ } ++ ++ m_known_aggs = vNULL; ++ if (ctx.m_known_aggs.exists ()) ++ { ++ unsigned int n = MIN (ctx.m_known_aggs.length (), nargs); ++ ++ for (unsigned int i = 0; i < n; i++) ++ if (ipa_is_param_used_by_indirect_call (params_summary, i) ++ && !ctx.m_known_aggs[i].is_empty ()) ++ { ++ m_known_aggs = ipa_copy_agg_values (ctx.m_known_aggs); ++ break; ++ } ++ } ++} ++ ++/* Release memory used by known_vals/contexts/aggs vectors. ++ If ALL is true release also inline_param_summary. ++ This happens when context was previously duplciated to be stored ++ into cache. */ ++ ++void ++ipa_call_context::release (bool all) ++{ ++ /* See if context is initialized at first place. */ ++ if (!m_node) ++ return; ++ m_known_vals.release (); ++ m_known_contexts.release (); ++ ipa_release_agg_values (m_known_aggs); ++ if (all) ++ m_inline_param_summary.release (); ++} ++ ++/* Return true if CTX describes the same call context as THIS. */ ++ ++bool ++ipa_call_context::equal_to (const ipa_call_context &ctx) ++{ ++ if (m_node != ctx.m_node ++ || m_possible_truths != ctx.m_possible_truths ++ || m_nonspec_possible_truths != ctx.m_nonspec_possible_truths) ++ return false; ++ ++ class ipa_node_params *params_summary = IPA_NODE_REF (m_node); ++ unsigned int nargs = params_summary ++ ? 
ipa_get_param_count (params_summary) : 0; ++ ++ if (m_inline_param_summary.exists () || ctx.m_inline_param_summary.exists ()) ++ { ++ for (unsigned int i = 0; i < nargs; i++) ++ { ++ if (!ipa_is_param_used_by_ipa_predicates (params_summary, i)) ++ continue; ++ if (i >= m_inline_param_summary.length () ++ || m_inline_param_summary[i].useless_p ()) ++ { ++ if (i < ctx.m_inline_param_summary.length () ++ && !ctx.m_inline_param_summary[i].useless_p ()) ++ return false; ++ continue; ++ } ++ if (i >= ctx.m_inline_param_summary.length () ++ || ctx.m_inline_param_summary[i].useless_p ()) ++ { ++ if (i < m_inline_param_summary.length () ++ && !m_inline_param_summary[i].useless_p ()) ++ return false; ++ continue; ++ } ++ if (!m_inline_param_summary[i].equal_to ++ (ctx.m_inline_param_summary[i])) ++ return false; ++ } ++ } ++ if (m_known_vals.exists () || ctx.m_known_vals.exists ()) ++ { ++ for (unsigned int i = 0; i < nargs; i++) ++ { ++ if (!ipa_is_param_used_by_indirect_call (params_summary, i)) ++ continue; ++ if (i >= m_known_vals.length () || !m_known_vals[i]) ++ { ++ if (i < ctx.m_known_vals.length () && ctx.m_known_vals[i]) ++ return false; ++ continue; ++ } ++ if (i >= ctx.m_known_vals.length () || !ctx.m_known_vals[i]) ++ { ++ if (i < m_known_vals.length () && m_known_vals[i]) ++ return false; ++ continue; ++ } ++ if (m_known_vals[i] != ctx.m_known_vals[i]) ++ return false; ++ } ++ } ++ if (m_known_contexts.exists () || ctx.m_known_contexts.exists ()) ++ { ++ for (unsigned int i = 0; i < nargs; i++) ++ { ++ if (!ipa_is_param_used_by_polymorphic_call (params_summary, i)) ++ continue; ++ if (i >= m_known_contexts.length () ++ || m_known_contexts[i].useless_p ()) ++ { ++ if (i < ctx.m_known_contexts.length () ++ && !ctx.m_known_contexts[i].useless_p ()) ++ return false; ++ continue; ++ } ++ if (i >= ctx.m_known_contexts.length () ++ || ctx.m_known_contexts[i].useless_p ()) ++ { ++ if (i < m_known_contexts.length () ++ && !m_known_contexts[i].useless_p ()) ++ return false; ++ continue; ++ } ++ if (!m_known_contexts[i].equal_to ++ (ctx.m_known_contexts[i])) ++ return false; ++ } ++ } ++ if (m_known_aggs.exists () || ctx.m_known_aggs.exists ()) ++ { ++ for (unsigned int i = 0; i < nargs; i++) ++ { ++ if (!ipa_is_param_used_by_indirect_call (params_summary, i)) ++ continue; ++ if (i >= m_known_aggs.length () || m_known_aggs[i].is_empty ()) ++ { ++ if (i < ctx.m_known_aggs.length () ++ && !ctx.m_known_aggs[i].is_empty ()) ++ return false; ++ continue; ++ } ++ if (i >= ctx.m_known_aggs.length () ++ || ctx.m_known_aggs[i].is_empty ()) ++ { ++ if (i < m_known_aggs.length () ++ && !m_known_aggs[i].is_empty ()) ++ return false; ++ continue; ++ } ++ if (!m_known_aggs[i].equal_to (ctx.m_known_aggs[i])) ++ return false; ++ } ++ } ++ return true; ++} + +-/* Estimate size and time needed to execute NODE assuming +- POSSIBLE_TRUTHS clause, and KNOWN_VALS, KNOWN_AGGS and KNOWN_CONTEXTS +- information about NODE's arguments. If non-NULL use also probability +- information present in INLINE_PARAM_SUMMARY vector. ++/* Estimate size and time needed to execute call in the given context. + Additionally detemine hints determined by the context. Finally compute + minimal size needed for the call that is independent on the call context and + can be used for fast estimates. Return the values in RET_SIZE, + RET_MIN_SIZE, RET_TIME and RET_HINTS. 
*/ + + void +-estimate_node_size_and_time (struct cgraph_node *node, +- clause_t possible_truths, +- clause_t nonspec_possible_truths, +- vec known_vals, +- vec known_contexts, +- vec known_aggs, +- int *ret_size, int *ret_min_size, +- sreal *ret_time, +- sreal *ret_nonspecialized_time, +- ipa_hints *ret_hints, +- vec +- inline_param_summary) ++ipa_call_context::estimate_size_and_time (int *ret_size, ++ int *ret_min_size, ++ sreal *ret_time, ++ sreal *ret_nonspecialized_time, ++ ipa_hints *ret_hints) + { +- struct ipa_fn_summary *info = ipa_fn_summaries->get_create (node); ++ class ipa_fn_summary *info = ipa_fn_summaries->get_create (m_node); + size_time_entry *e; + int size = 0; + sreal time = 0; +@@ -2666,13 +3226,13 @@ estimate_node_size_and_time (struct cgra + { + bool found = false; + fprintf (dump_file, " Estimating body: %s/%i\n" +- " Known to be false: ", node->name (), +- node->order); ++ " Known to be false: ", m_node->name (), ++ m_node->order); + + for (i = predicate::not_inlined_condition; + i < (predicate::first_dynamic_condition + + (int) vec_safe_length (info->conds)); i++) +- if (!(possible_truths & (1 << i))) ++ if (!(m_possible_truths & (1 << i))) + { + if (found) + fprintf (dump_file, ", "); +@@ -2681,19 +3241,19 @@ estimate_node_size_and_time (struct cgra + } + } + +- estimate_calls_size_and_time (node, &size, &min_size, &time, &hints, possible_truths, +- known_vals, known_contexts, known_aggs); ++ estimate_calls_size_and_time (m_node, &size, &min_size, &time, &hints, m_possible_truths, ++ m_known_vals, m_known_contexts, m_known_aggs); + sreal nonspecialized_time = time; + + for (i = 0; vec_safe_iterate (info->size_time_table, i, &e); i++) + { +- bool exec = e->exec_predicate.evaluate (nonspec_possible_truths); ++ bool exec = e->exec_predicate.evaluate (m_nonspec_possible_truths); + + /* Because predicates are conservative, it can happen that nonconst is 1 + but exec is 0. 
*/ + if (exec) + { +- bool nonconst = e->nonconst_predicate.evaluate (possible_truths); ++ bool nonconst = e->nonconst_predicate.evaluate (m_possible_truths); + + gcc_checking_assert (e->time >= 0); + gcc_checking_assert (time >= 0); +@@ -2709,7 +3269,7 @@ estimate_node_size_and_time (struct cgra + nonspecialized_time += e->time; + if (!nonconst) + ; +- else if (!inline_param_summary.exists ()) ++ else if (!m_inline_param_summary.exists ()) + { + if (nonconst) + time += e->time; +@@ -2717,8 +3277,8 @@ estimate_node_size_and_time (struct cgra + else + { + int prob = e->nonconst_predicate.probability +- (info->conds, possible_truths, +- inline_param_summary); ++ (info->conds, m_possible_truths, ++ m_inline_param_summary); + gcc_checking_assert (prob >= 0); + gcc_checking_assert (prob <= REG_BR_PROB_BASE); + time += e->time * prob / REG_BR_PROB_BASE; +@@ -2742,14 +3302,14 @@ estimate_node_size_and_time (struct cgra + time = nonspecialized_time; + + if (info->loop_iterations +- && !info->loop_iterations->evaluate (possible_truths)) ++ && !info->loop_iterations->evaluate (m_possible_truths)) + hints |= INLINE_HINT_loop_iterations; + if (info->loop_stride +- && !info->loop_stride->evaluate (possible_truths)) ++ && !info->loop_stride->evaluate (m_possible_truths)) + hints |= INLINE_HINT_loop_stride; + if (info->scc_no) + hints |= INLINE_HINT_in_scc; +- if (DECL_DECLARED_INLINE_P (node->decl)) ++ if (DECL_DECLARED_INLINE_P (m_node->decl)) + hints |= INLINE_HINT_declared_inline; + + size = RDIV (size, ipa_fn_summary::size_scale); +@@ -2782,7 +3342,7 @@ estimate_ipcp_clone_size_and_time (struc + vec known_vals, + vec + known_contexts, +- vec known_aggs, ++ vec known_aggs, + int *ret_size, sreal *ret_time, + sreal *ret_nonspec_time, + ipa_hints *hints) +@@ -2791,10 +3351,31 @@ estimate_ipcp_clone_size_and_time (struc + + evaluate_conditions_for_known_args (node, false, known_vals, known_aggs, + &clause, &nonspec_clause); +- estimate_node_size_and_time (node, clause, nonspec_clause, +- known_vals, known_contexts, +- known_aggs, ret_size, NULL, ret_time, +- ret_nonspec_time, hints, vNULL); ++ ipa_call_context ctx (node, clause, nonspec_clause, ++ known_vals, known_contexts, ++ known_aggs, vNULL); ++ ctx.estimate_size_and_time (ret_size, NULL, ret_time, ++ ret_nonspec_time, hints); ++} ++ ++/* Return stack frame offset where frame of NODE is supposed to start inside ++ of the function it is inlined to. ++ Return 0 for functions that are not inlined. 
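A small worked example of the offset computation implemented just below (numbers are made up): if C is inlined into B and B into A, with estimated_self_stack_size of 48 for A and 32 for B, then

    ipa_get_stack_frame_offset (C) == 32 + 48 == 80   /* B's frame, then A's */
    ipa_get_stack_frame_offset (B) == 48              /* A's frame only      */
    ipa_get_stack_frame_offset (A) == 0               /* not inlined at all  */

i.e. the loop walks node->callers->caller until it reaches a node that is not inlined, summing the self stack sizes of the enclosing frames.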
*/ ++ ++HOST_WIDE_INT ++ipa_get_stack_frame_offset (struct cgraph_node *node) ++{ ++ HOST_WIDE_INT offset = 0; ++ if (!node->inlined_to) ++ return 0; ++ node = node->callers->caller; ++ while (true) ++ { ++ offset += ipa_size_summaries->get (node)->estimated_self_stack_size; ++ if (!node->inlined_to) ++ return offset; ++ node = node->callers->caller; ++ } + } + + +@@ -2805,19 +3386,7 @@ static void + inline_update_callee_summaries (struct cgraph_node *node, int depth) + { + struct cgraph_edge *e; +- ipa_fn_summary *callee_info = ipa_fn_summaries->get (node); +- ipa_fn_summary *caller_info = ipa_fn_summaries->get (node->callers->caller); +- HOST_WIDE_INT peak; +- +- callee_info->stack_frame_offset +- = caller_info->stack_frame_offset +- + caller_info->estimated_self_stack_size; +- peak = callee_info->stack_frame_offset +- + callee_info->estimated_self_stack_size; +- +- ipa_fn_summary *s = ipa_fn_summaries->get (node->global.inlined_to); +- if (s->estimated_stack_size < peak) +- s->estimated_stack_size = peak; ++ + ipa_propagate_frequency (node); + for (e = node->callees; e; e = e->next_callee) + { +@@ -2830,7 +3399,7 @@ inline_update_callee_summaries (struct c + } + + /* Update change_prob of EDGE after INLINED_EDGE has been inlined. +- When functoin A is inlined in B and A calls C with parameter that ++ When function A is inlined in B and A calls C with parameter that + changes with probability PROB1 and C is known to be passthroug + of argument if B that change with probability PROB2, the probability + of change is now PROB1*PROB2. */ +@@ -2842,9 +3411,11 @@ remap_edge_change_prob (struct cgraph_ed + if (ipa_node_params_sum) + { + int i; +- struct ipa_edge_args *args = IPA_EDGE_REF (edge); +- struct ipa_call_summary *es = ipa_call_summaries->get (edge); +- struct ipa_call_summary *inlined_es ++ class ipa_edge_args *args = IPA_EDGE_REF (edge); ++ if (!args) ++ return; ++ class ipa_call_summary *es = ipa_call_summaries->get (edge); ++ class ipa_call_summary *inlined_es + = ipa_call_summaries->get (inlined_edge); + + if (es->param.length () == 0) +@@ -2885,8 +3456,9 @@ remap_edge_change_prob (struct cgraph_ed + static void + remap_edge_summaries (struct cgraph_edge *inlined_edge, + struct cgraph_node *node, +- struct ipa_fn_summary *info, +- struct ipa_fn_summary *callee_info, ++ class ipa_fn_summary *info, ++ class ipa_node_params *params_summary, ++ class ipa_fn_summary *callee_info, + vec operand_map, + vec offset_map, + clause_t possible_truths, +@@ -2895,18 +3467,19 @@ remap_edge_summaries (struct cgraph_edge + struct cgraph_edge *e, *next; + for (e = node->callees; e; e = next) + { +- struct ipa_call_summary *es = ipa_call_summaries->get (e); + predicate p; + next = e->next_callee; + + if (e->inline_failed) + { ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + remap_edge_change_prob (inlined_edge, e); + + if (es->predicate) + { + p = es->predicate->remap_after_inlining +- (info, callee_info, operand_map, ++ (info, params_summary, ++ callee_info, operand_map, + offset_map, possible_truths, + *toplev_predicate); + edge_set_predicate (e, &p); +@@ -2915,13 +3488,14 @@ remap_edge_summaries (struct cgraph_edge + edge_set_predicate (e, toplev_predicate); + } + else +- remap_edge_summaries (inlined_edge, e->callee, info, callee_info, ++ remap_edge_summaries (inlined_edge, e->callee, info, ++ params_summary, callee_info, + operand_map, offset_map, possible_truths, + toplev_predicate); + } + for (e = node->indirect_calls; e; e = next) + { +- struct ipa_call_summary *es = 
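ipa_get_stack_frame_offset above replaces the cached stack_frame_offset field: the offset is recomputed on demand by walking from the inline clone up its caller chain and summing the callers' self stack sizes until the offline function is reached. A stand-alone sketch of that walk; the node type is a stand-in, not the real cgraph_node:

#include <cstdio>

/* Hypothetical stand-in for the bits of cgraph_node used here: an inline
   clone records the function it was inlined into and its caller.  */
struct node
{
  node *inlined_to;     /* NULL for an offline (non-inlined) function.  */
  node *caller;         /* Caller of the inline clone.  */
  long self_stack_size; /* Stand-in for estimated_self_stack_size.  */
};

/* Frame offset of N inside the function it ends up inlined into: the sum
   of the self stack sizes of all callers on the inline chain.  */
static long stack_frame_offset (node *n)
{
  long offset = 0;
  if (!n->inlined_to)
    return 0;
  for (n = n->caller; ; n = n->caller)
    {
      offset += n->self_stack_size;
      if (!n->inlined_to)
        return offset;
    }
}

int main ()
{
  node top  = { 0, 0, 64 };        /* offline function, 64-byte frame */
  node mid  = { &top, &top, 32 };  /* inlined into top */
  node leaf = { &top, &mid, 16 };  /* inlined into top via mid */
  printf ("%ld\n", stack_frame_offset (&leaf)); /* 64 + 32 = 96 */
  return 0;
}
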
ipa_call_summaries->get (e); ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + predicate p; + next = e->next_callee; + +@@ -2929,7 +3503,8 @@ remap_edge_summaries (struct cgraph_edge + if (es->predicate) + { + p = es->predicate->remap_after_inlining +- (info, callee_info, operand_map, offset_map, ++ (info, params_summary, ++ callee_info, operand_map, offset_map, + possible_truths, *toplev_predicate); + edge_set_predicate (e, &p); + } +@@ -2941,8 +3516,9 @@ remap_edge_summaries (struct cgraph_edge + /* Same as remap_predicate, but set result into hint *HINT. */ + + static void +-remap_hint_predicate (struct ipa_fn_summary *info, +- struct ipa_fn_summary *callee_info, ++remap_hint_predicate (class ipa_fn_summary *info, ++ class ipa_node_params *params_summary, ++ class ipa_fn_summary *callee_info, + predicate **hint, + vec operand_map, + vec offset_map, +@@ -2954,7 +3530,7 @@ remap_hint_predicate (struct ipa_fn_summ + if (!*hint) + return; + p = (*hint)->remap_after_inlining +- (info, callee_info, ++ (info, params_summary, callee_info, + operand_map, offset_map, + possible_truths, *toplev_predicate); + if (p != false && p != true) +@@ -2972,17 +3548,18 @@ void + ipa_merge_fn_summary_after_inlining (struct cgraph_edge *edge) + { + ipa_fn_summary *callee_info = ipa_fn_summaries->get (edge->callee); +- struct cgraph_node *to = (edge->caller->global.inlined_to +- ? edge->caller->global.inlined_to : edge->caller); +- struct ipa_fn_summary *info = ipa_fn_summaries->get (to); ++ struct cgraph_node *to = (edge->caller->inlined_to ++ ? edge->caller->inlined_to : edge->caller); ++ class ipa_fn_summary *info = ipa_fn_summaries->get (to); + clause_t clause = 0; /* not_inline is known to be false. */ + size_time_entry *e; +- vec operand_map = vNULL; +- vec offset_map = vNULL; ++ auto_vec operand_map; ++ auto_vec offset_map; + int i; + predicate toplev_predicate; +- predicate true_p = true; +- struct ipa_call_summary *es = ipa_call_summaries->get (edge); ++ class ipa_call_summary *es = ipa_call_summaries->get (edge); ++ class ipa_node_params *params_summary = (ipa_node_params_sum ++ ? IPA_NODE_REF (to) : NULL); + + if (es->predicate) + toplev_predicate = *es->predicate; +@@ -2995,8 +3572,8 @@ ipa_merge_fn_summary_after_inlining (str + evaluate_properties_for_edge (edge, true, &clause, NULL, NULL, NULL, NULL); + if (ipa_node_params_sum && callee_info->conds) + { +- struct ipa_edge_args *args = IPA_EDGE_REF (edge); +- int count = ipa_get_cs_argument_count (args); ++ class ipa_edge_args *args = IPA_EDGE_REF (edge); ++ int count = args ? 
ipa_get_cs_argument_count (args) : 0; + int i; + + if (count) +@@ -3029,19 +3606,21 @@ ipa_merge_fn_summary_after_inlining (str + } + } + operand_map[i] = map; +- gcc_assert (map < ipa_get_param_count (IPA_NODE_REF (to))); ++ gcc_assert (map < ipa_get_param_count (params_summary)); + } + } + for (i = 0; vec_safe_iterate (callee_info->size_time_table, i, &e); i++) + { + predicate p; + p = e->exec_predicate.remap_after_inlining +- (info, callee_info, operand_map, ++ (info, params_summary, ++ callee_info, operand_map, + offset_map, clause, + toplev_predicate); + predicate nonconstp; + nonconstp = e->nonconst_predicate.remap_after_inlining +- (info, callee_info, operand_map, ++ (info, params_summary, ++ callee_info, operand_map, + offset_map, clause, + toplev_predicate); + if (p != false && nonconstp != false) +@@ -3059,48 +3638,53 @@ ipa_merge_fn_summary_after_inlining (str + info->account_size_time (e->size, add_time, p, nonconstp); + } + } +- remap_edge_summaries (edge, edge->callee, info, callee_info, operand_map, ++ remap_edge_summaries (edge, edge->callee, info, params_summary, ++ callee_info, operand_map, + offset_map, clause, &toplev_predicate); +- remap_hint_predicate (info, callee_info, ++ remap_hint_predicate (info, params_summary, callee_info, + &callee_info->loop_iterations, + operand_map, offset_map, clause, &toplev_predicate); +- remap_hint_predicate (info, callee_info, ++ remap_hint_predicate (info, params_summary, callee_info, + &callee_info->loop_stride, + operand_map, offset_map, clause, &toplev_predicate); + +- ipa_call_summary *s = ipa_call_summaries->get (edge); +- inline_update_callee_summaries (edge->callee, s->loop_depth); ++ HOST_WIDE_INT stack_frame_offset = ipa_get_stack_frame_offset (edge->callee); ++ HOST_WIDE_INT peak = stack_frame_offset + callee_info->estimated_stack_size; + +- /* We do not maintain predicates of inlined edges, free it. */ +- edge_set_predicate (edge, &true_p); +- /* Similarly remove param summaries. */ +- es->param.release (); +- operand_map.release (); +- offset_map.release (); ++ if (info->estimated_stack_size < peak) ++ info->estimated_stack_size = peak; ++ ++ inline_update_callee_summaries (edge->callee, es->loop_depth); ++ ++ /* Free summaries that are not maintained for inline clones/edges. */ ++ ipa_call_summaries->remove (edge); ++ ipa_fn_summaries->remove (edge->callee); + } + +-/* For performance reasons ipa_merge_fn_summary_after_inlining is not updating overall size +- and time. Recompute it. */ ++/* For performance reasons ipa_merge_fn_summary_after_inlining is not updating ++ overall size and time. Recompute it. 
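After merging the callee summary, the hunk above raises the caller's estimated_stack_size to the peak reached by the newly inlined callee, i.e. its frame offset plus its own estimated stack. A tiny sketch of that update with plain longs standing in for HOST_WIDE_INT:

#include <cstdio>

/* Raise the caller's estimated stack size to the peak reached by an
   inlined callee whose frame starts at FRAME_OFFSET.  */
static void account_inlined_stack (long *caller_estimated_stack,
                                   long frame_offset,
                                   long callee_estimated_stack)
{
  long peak = frame_offset + callee_estimated_stack;
  if (*caller_estimated_stack < peak)
    *caller_estimated_stack = peak;
}

int main ()
{
  long caller = 128;
  account_inlined_stack (&caller, 96, 48);  /* peak 144 > 128 */
  printf ("%ld\n", caller);                 /* prints 144 */
  return 0;
}
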
*/ + + void + ipa_update_overall_fn_summary (struct cgraph_node *node) + { +- struct ipa_fn_summary *info = ipa_fn_summaries->get_create (node); ++ class ipa_fn_summary *info = ipa_fn_summaries->get_create (node); ++ class ipa_size_summary *size_info = ipa_size_summaries->get_create (node); + size_time_entry *e; + int i; + +- info->size = 0; ++ size_info->size = 0; + info->time = 0; + for (i = 0; vec_safe_iterate (info->size_time_table, i, &e); i++) + { +- info->size += e->size; ++ size_info->size += e->size; + info->time += e->time; + } +- estimate_calls_size_and_time (node, &info->size, &info->min_size, ++ estimate_calls_size_and_time (node, &size_info->size, &info->min_size, + &info->time, NULL, + ~(clause_t) (1 << predicate::false_condition), + vNULL, vNULL, vNULL); +- info->size = (info->size + ipa_fn_summary::size_scale / 2) / ipa_fn_summary::size_scale; ++ size_info->size = (size_info->size + ipa_fn_summary::size_scale / 2) ++ / ipa_fn_summary::size_scale; + } + + +@@ -3181,10 +3765,10 @@ ipa_fn_summary_generate (void) + /* Write inline summary for edge E to OB. */ + + static void +-read_ipa_call_summary (struct lto_input_block *ib, struct cgraph_edge *e, ++read_ipa_call_summary (class lto_input_block *ib, struct cgraph_edge *e, + bool prevails) + { +- struct ipa_call_summary *es = prevails ++ class ipa_call_summary *es = prevails + ? ipa_call_summaries->get_create (e) : NULL; + predicate p; + int length, i; +@@ -3235,7 +3819,7 @@ inline_read_section (struct lto_file_dec + const int cfg_offset = sizeof (struct lto_function_header); + const int main_offset = cfg_offset + header->cfg_size; + const int string_offset = main_offset + header->main_size; +- struct data_in *data_in; ++ class data_in *data_in; + unsigned int i, count2, j; + unsigned int f_count; + +@@ -3250,7 +3834,9 @@ inline_read_section (struct lto_file_dec + { + unsigned int index; + struct cgraph_node *node; +- struct ipa_fn_summary *info; ++ class ipa_fn_summary *info; ++ class ipa_node_params *params_summary; ++ class ipa_size_summary *size_info; + lto_symtab_encoder_t encoder; + struct bitpack_d bp; + struct cgraph_edge *e; +@@ -3261,6 +3847,9 @@ inline_read_section (struct lto_file_dec + node = dyn_cast (lto_symtab_encoder_deref (encoder, + index)); + info = node->prevailing_p () ? ipa_fn_summaries->get_create (node) : NULL; ++ params_summary = node->prevailing_p () ? IPA_NODE_REF (node) : NULL; ++ size_info = node->prevailing_p () ++ ? 
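The last statement of ipa_update_overall_fn_summary converts the accumulated size back from the scaled internal units with round-to-nearest integer division rather than truncation. A one-function sketch of the idiom; the scale value of 2 is an assumption here, only the rounding matters:

#include <cstdio>

/* Round-to-nearest integer division, as used when converting the summed
   size back from size_scale units; 2 is an assumed scale.  */
const int size_scale = 2;

static int unscale (int scaled_size)
{
  return (scaled_size + size_scale / 2) / size_scale;
}

int main ()
{
  printf ("%d %d %d\n", unscale (4), unscale (5), unscale (6)); /* 2 3 3 */
  return 0;
}
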
ipa_size_summaries->get_create (node) : NULL; + + int stack_size = streamer_read_uhwi (&ib); + int size = streamer_read_uhwi (&ib); +@@ -3269,8 +3858,8 @@ inline_read_section (struct lto_file_dec + if (info) + { + info->estimated_stack_size +- = info->estimated_self_stack_size = stack_size; +- info->size = info->self_size = size; ++ = size_info->estimated_self_stack_size = stack_size; ++ size_info->size = size_info->self_size = size; + info->time = time; + } + +@@ -3288,26 +3877,70 @@ inline_read_section (struct lto_file_dec + + count2 = streamer_read_uhwi (&ib); + gcc_assert (!info || !info->conds); ++ if (info) ++ vec_safe_reserve_exact (info->conds, count2); + for (j = 0; j < count2; j++) + { + struct condition c; ++ unsigned int k, count3; + c.operand_num = streamer_read_uhwi (&ib); +- c.size = streamer_read_uhwi (&ib); + c.code = (enum tree_code) streamer_read_uhwi (&ib); ++ c.type = stream_read_tree (&ib, data_in); + c.val = stream_read_tree (&ib, data_in); + bp = streamer_read_bitpack (&ib); + c.agg_contents = bp_unpack_value (&bp, 1); + c.by_ref = bp_unpack_value (&bp, 1); + if (c.agg_contents) + c.offset = streamer_read_uhwi (&ib); ++ count3 = streamer_read_uhwi (&ib); ++ c.param_ops = NULL; + if (info) +- vec_safe_push (info->conds, c); ++ vec_safe_reserve_exact (c.param_ops, count3); ++ if (params_summary) ++ ipa_set_param_used_by_ipa_predicates ++ (params_summary, c.operand_num, true); ++ for (k = 0; k < count3; k++) ++ { ++ struct expr_eval_op op; ++ enum gimple_rhs_class rhs_class; ++ op.code = (enum tree_code) streamer_read_uhwi (&ib); ++ op.type = stream_read_tree (&ib, data_in); ++ switch (rhs_class = get_gimple_rhs_class (op.code)) ++ { ++ case GIMPLE_UNARY_RHS: ++ op.index = 0; ++ op.val[0] = NULL_TREE; ++ op.val[1] = NULL_TREE; ++ break; ++ ++ case GIMPLE_BINARY_RHS: ++ case GIMPLE_TERNARY_RHS: ++ bp = streamer_read_bitpack (&ib); ++ op.index = bp_unpack_value (&bp, 2); ++ op.val[0] = stream_read_tree (&ib, data_in); ++ if (rhs_class == GIMPLE_BINARY_RHS) ++ op.val[1] = NULL_TREE; ++ else ++ op.val[1] = stream_read_tree (&ib, data_in); ++ break; ++ ++ default: ++ fatal_error (UNKNOWN_LOCATION, ++ "invalid fnsummary in LTO stream"); ++ } ++ if (info) ++ c.param_ops->quick_push (op); ++ } ++ if (info) ++ info->conds->quick_push (c); + } + count2 = streamer_read_uhwi (&ib); + gcc_assert (!info || !info->size_time_table); ++ if (info && count2) ++ vec_safe_reserve_exact (info->size_time_table, count2); + for (j = 0; j < count2; j++) + { +- struct size_time_entry e; ++ class size_time_entry e; + + e.size = streamer_read_uhwi (&ib); + e.time = sreal::stream_in (&ib); +@@ -3315,7 +3948,7 @@ inline_read_section (struct lto_file_dec + e.nonconst_predicate.stream_in (&ib); + + if (info) +- vec_safe_push (info->size_time_table, e); ++ info->size_time_table->quick_push (e); + } + + p.stream_in (&ib); +@@ -3378,7 +4011,7 @@ ipa_fn_summary_read (void) + static void + write_ipa_call_summary (struct output_block *ob, struct cgraph_edge *e) + { +- struct ipa_call_summary *es = ipa_call_summaries->get (e); ++ class ipa_call_summary *es = ipa_call_summaries->get (e); + int i; + + streamer_write_uhwi (ob, es->call_stmt_size); +@@ -3426,7 +4059,8 @@ ipa_fn_summary_write (void) + cgraph_node *cnode = lsei_cgraph_node (lsei); + if (cnode->definition && !cnode->alias) + { +- struct ipa_fn_summary *info = ipa_fn_summaries->get (cnode); ++ class ipa_fn_summary *info = ipa_fn_summaries->get (cnode); ++ class ipa_size_summary *size_info = ipa_size_summaries->get (cnode); + struct bitpack_d bp; + 
struct cgraph_edge *edge; + int i; +@@ -3434,8 +4068,8 @@ ipa_fn_summary_write (void) + struct condition *c; + + streamer_write_uhwi (ob, lto_symtab_encoder_encode (encoder, cnode)); +- streamer_write_hwi (ob, info->estimated_self_stack_size); +- streamer_write_hwi (ob, info->self_size); ++ streamer_write_hwi (ob, size_info->estimated_self_stack_size); ++ streamer_write_hwi (ob, size_info->self_size); + info->time.stream_out (ob); + bp = bitpack_create (ob->main_stream); + bp_pack_value (&bp, info->inlinable, 1); +@@ -3445,9 +4079,12 @@ ipa_fn_summary_write (void) + streamer_write_uhwi (ob, vec_safe_length (info->conds)); + for (i = 0; vec_safe_iterate (info->conds, i, &c); i++) + { ++ int j; ++ struct expr_eval_op *op; ++ + streamer_write_uhwi (ob, c->operand_num); +- streamer_write_uhwi (ob, c->size); + streamer_write_uhwi (ob, c->code); ++ stream_write_tree (ob, c->type, true); + stream_write_tree (ob, c->val, true); + bp = bitpack_create (ob->main_stream); + bp_pack_value (&bp, c->agg_contents, 1); +@@ -3455,6 +4092,21 @@ ipa_fn_summary_write (void) + streamer_write_bitpack (&bp); + if (c->agg_contents) + streamer_write_uhwi (ob, c->offset); ++ streamer_write_uhwi (ob, vec_safe_length (c->param_ops)); ++ for (j = 0; vec_safe_iterate (c->param_ops, j, &op); j++) ++ { ++ streamer_write_uhwi (ob, op->code); ++ stream_write_tree (ob, op->type, true); ++ if (op->val[0]) ++ { ++ bp = bitpack_create (ob->main_stream); ++ bp_pack_value (&bp, op->index, 2); ++ streamer_write_bitpack (&bp); ++ stream_write_tree (ob, op->val[0], true); ++ if (op->val[1]) ++ stream_write_tree (ob, op->val[1], true); ++ } ++ } + } + streamer_write_uhwi (ob, vec_safe_length (info->size_time_table)); + for (i = 0; vec_safe_iterate (info->size_time_table, i, &e); i++) +@@ -3487,23 +4139,33 @@ ipa_fn_summary_write (void) + } + + +-/* Release inline summary. */ ++/* Release function summary. */ + + void + ipa_free_fn_summary (void) + { +- struct cgraph_node *node; + if (!ipa_call_summaries) + return; +- FOR_EACH_DEFINED_FUNCTION (node) +- if (!node->alias) +- ipa_fn_summaries->remove (node); + ipa_fn_summaries->release (); + ipa_fn_summaries = NULL; + ipa_call_summaries->release (); + delete ipa_call_summaries; + ipa_call_summaries = NULL; + edge_predicate_pool.release (); ++ /* During IPA this is one of largest datastructures to release. */ ++ if (flag_wpa) ++ ggc_trim (); ++} ++ ++/* Release function summary. */ ++ ++void ++ipa_free_size_summary (void) ++{ ++ if (!ipa_size_summaries) ++ return; ++ ipa_size_summaries->release (); ++ ipa_size_summaries = NULL; + } + + namespace { +@@ -3578,10 +4240,12 @@ public: + gcc_assert (n == 0); + small_p = param; + } +- virtual bool gate (function *) { return small_p || !flag_wpa; } ++ virtual bool gate (function *) { return true; } + virtual unsigned int execute (function *) + { + ipa_free_fn_summary (); ++ if (!flag_wpa) ++ ipa_free_size_summary (); + return 0; + } + +diff -Nurp a/gcc/ipa-fnsummary.h b/gcc/ipa-fnsummary.h +--- a/gcc/ipa-fnsummary.h 2020-04-30 15:14:04.588000000 +0800 ++++ b/gcc/ipa-fnsummary.h 2020-04-30 15:14:56.664000000 +0800 +@@ -81,16 +81,40 @@ struct GTY(()) size_time_entry + sreal GTY((skip)) time; + }; + ++/* Summary about function and stack frame sizes. We keep this info ++ for inline clones and also for WPA streaming. For this reason this is not ++ part of ipa_fn_summary which exists only for offline functions. */ ++class ipa_size_summary ++{ ++public: ++ /* Estimated stack frame consumption by the function. 
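The condition writer above and the reader in inline_read_section have to stay in lock-step: the same counts, fields and bitpacks in the same order, with the reader reserving exactly the announced number of elements. A reduced stand-alone sketch of that discipline over a plain integer stream; none of the real LTO streamer types are used:

#include <cassert>
#include <cstdio>
#include <vector>

/* A toy stream standing in for the LTO output/input blocks.  */
struct stream
{
  std::vector<unsigned> data;
  size_t pos = 0;
  void write_uhwi (unsigned v) { data.push_back (v); }
  unsigned read_uhwi () { return data[pos++]; }
};

struct op { unsigned code; unsigned index; };

int main ()
{
  stream s;

  /* Writer: a count followed by one record per operation.  */
  std::vector<op> ops = { { 7, 0 }, { 42, 1 } };
  s.write_uhwi (ops.size ());
  for (const op &o : ops)
    {
      s.write_uhwi (o.code);
      s.write_uhwi (o.index);
    }

  /* Reader: reserve exactly the announced count and read the same fields
     back in the same order.  */
  std::vector<op> read_back;
  unsigned count = s.read_uhwi ();
  read_back.reserve (count);
  for (unsigned i = 0; i < count; i++)
    {
      op o;
      o.code = s.read_uhwi ();
      o.index = s.read_uhwi ();
      read_back.push_back (o);
    }

  assert (read_back.size () == ops.size ()
          && read_back[1].code == 42 && read_back[1].index == 1);
  printf ("round-trip ok: %zu ops\n", read_back.size ());
  return 0;
}
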
*/ ++ HOST_WIDE_INT estimated_self_stack_size; ++ /* Size of the function body. */ ++ int self_size; ++ /* Estimated size of the function after inlining. */ ++ int size; ++ ++ ipa_size_summary () ++ : estimated_self_stack_size (0), self_size (0), size (0) ++ { ++ } ++ /* Copy constructor. */ ++ ipa_size_summary (const ipa_size_summary &s) ++ : estimated_self_stack_size (0), self_size (s.self_size), size (s.size) ++ { ++ } ++}; ++ + /* Function inlining information. */ + struct GTY(()) ipa_fn_summary + { + /* Keep all field empty so summary dumping works during its computation. + This is useful for debugging. */ + ipa_fn_summary () +- : estimated_self_stack_size (0), self_size (0), min_size (0), ++ : min_size (0), + inlinable (false), single_caller (false), + fp_expressions (false), estimated_stack_size (false), +- stack_frame_offset (false), time (0), size (0), conds (NULL), ++ time (0), conds (NULL), + size_time_table (NULL), loop_iterations (NULL), loop_stride (NULL), + growth (0), scc_no (0) + { +@@ -98,13 +122,11 @@ struct GTY(()) ipa_fn_summary + + /* Copy constructor. */ + ipa_fn_summary (const ipa_fn_summary &s) +- : estimated_self_stack_size (s.estimated_self_stack_size), +- self_size (s.self_size), min_size (s.min_size), ++ : min_size (s.min_size), + inlinable (s.inlinable), single_caller (s.single_caller), + fp_expressions (s.fp_expressions), + estimated_stack_size (s.estimated_stack_size), +- stack_frame_offset (s.stack_frame_offset), time (s.time), size (s.size), +- conds (s.conds), size_time_table (s.size_time_table), ++ time (s.time), conds (s.conds), size_time_table (s.size_time_table), + loop_iterations (s.loop_iterations), loop_stride (s.loop_stride), + growth (s.growth), scc_no (s.scc_no) + {} +@@ -114,10 +136,6 @@ struct GTY(()) ipa_fn_summary + + /* Information about the function body itself. */ + +- /* Estimated stack frame consumption by the function. */ +- HOST_WIDE_INT estimated_self_stack_size; +- /* Size of the function body. */ +- int self_size; + /* Minimal size increase after inlining. */ + int min_size; + +@@ -135,11 +153,8 @@ struct GTY(()) ipa_fn_summary + + /* Estimated stack frame consumption by the function. */ + HOST_WIDE_INT estimated_stack_size; +- /* Expected offset of the stack frame of function. */ +- HOST_WIDE_INT stack_frame_offset; +- /* Estimated size of the function after inlining. */ ++ /* Estimated runtime of function after inlining. */ + sreal GTY((skip)) time; +- int size; + + /* Conditional size/time information. The summaries are being + merged during inlining. */ +@@ -177,7 +192,7 @@ public: + + static ipa_fn_summary_t *create_ggc (symbol_table *symtab) + { +- struct ipa_fn_summary_t *summary = new (ggc_alloc ()) ++ class ipa_fn_summary_t *summary = new (ggc_alloc ()) + ipa_fn_summary_t (symtab); + summary->disable_insertion_hook (); + return summary; +@@ -199,6 +214,24 @@ public: + extern GTY(()) fast_function_summary + *ipa_fn_summaries; + ++class ipa_size_summary_t: ++ public fast_function_summary ++{ ++public: ++ ipa_size_summary_t (symbol_table *symtab): ++ fast_function_summary (symtab) {} ++ ++ static ipa_size_summary_t *create_ggc (symbol_table *symtab) ++ { ++ class ipa_size_summary_t *summary = new (ggc_alloc ()) ++ ipa_size_summary_t (symtab); ++ summary->disable_insertion_hook (); ++ return summary; ++ } ++}; ++extern fast_function_summary ++ *ipa_size_summaries; ++ + /* Information kept about callgraph edges. 
*/ + struct ipa_call_summary + { +@@ -245,6 +278,57 @@ public: + ipa_call_summary *dst_data); + }; + ++/* This object describe a context of call. That is a summary of known ++ information about its parameters. Main purpose of this context is ++ to give more realistic esitmations of function runtime, size and ++ inline hints. */ ++class ipa_call_context ++{ ++public: ++ ipa_call_context (cgraph_node *node, ++ clause_t possible_truths, ++ clause_t nonspec_possible_truths, ++ vec known_vals, ++ vec known_contexts, ++ vec known_aggs, ++ vec m_inline_param_summary); ++ ipa_call_context () ++ : m_node(NULL) ++ { ++ } ++ void estimate_size_and_time (int *ret_size, int *ret_min_size, ++ sreal *ret_time, ++ sreal *ret_nonspecialized_time, ++ ipa_hints *ret_hints); ++ void duplicate_from (const ipa_call_context &ctx); ++ void release (bool all = false); ++ bool equal_to (const ipa_call_context &); ++ bool exists_p () ++ { ++ return m_node != NULL; ++ } ++private: ++ /* Called function. */ ++ cgraph_node *m_node; ++ /* Clause describing what predicate conditionals can be satisfied ++ in this context if function is inlined/specialised. */ ++ clause_t m_possible_truths; ++ /* Clause describing what predicate conditionals can be satisfied ++ in this context if function is kept offline. */ ++ clause_t m_nonspec_possible_truths; ++ /* Inline summary maintains info about change probabilities. */ ++ vec m_inline_param_summary; ++ ++ /* The following is used only to resolve indirect calls. */ ++ ++ /* Vector describing known values of parameters. */ ++ vec m_known_vals; ++ /* Vector describing known polymorphic call contexts. */ ++ vec m_known_contexts; ++ /* Vector describing known aggregate values. */ ++ vec m_known_aggs; ++}; ++ + extern fast_call_summary *ipa_call_summaries; + + /* In ipa-fnsummary.c */ +@@ -253,11 +337,12 @@ void ipa_dump_fn_summaries (FILE *f); + void ipa_dump_fn_summary (FILE *f, struct cgraph_node *node); + void ipa_dump_hints (FILE *f, ipa_hints); + void ipa_free_fn_summary (void); ++void ipa_free_size_summary (void); + void inline_analyze_function (struct cgraph_node *node); + void estimate_ipcp_clone_size_and_time (struct cgraph_node *, + vec, + vec, +- vec, ++ vec, + int *, sreal *, sreal *, + ipa_hints *); + void ipa_merge_fn_summary_after_inlining (struct cgraph_edge *edge); +@@ -265,26 +350,16 @@ void ipa_update_overall_fn_summary (stru + void compute_fn_summary (struct cgraph_node *, bool); + + +-void evaluate_properties_for_edge (struct cgraph_edge *e, bool inline_p, ++void evaluate_properties_for_edge (struct cgraph_edge *e, ++ bool inline_p, + clause_t *clause_ptr, + clause_t *nonspec_clause_ptr, + vec *known_vals_ptr, + vec + *known_contexts_ptr, +- vec *); +-void estimate_node_size_and_time (struct cgraph_node *node, +- clause_t possible_truths, +- clause_t nonspec_possible_truths, +- vec known_vals, +- vec, +- vec known_aggs, +- int *ret_size, int *ret_min_size, +- sreal *ret_time, +- sreal *ret_nonspecialized_time, +- ipa_hints *ret_hints, +- vec +- inline_param_summary); ++ vec *); + + void ipa_fnsummary_c_finalize (void); ++HOST_WIDE_INT ipa_get_stack_frame_offset (struct cgraph_node *node); + + #endif /* GCC_IPA_FNSUMMARY_H */ +diff -Nurp a/gcc/ipa-icf.c b/gcc/ipa-icf.c +--- a/gcc/ipa-icf.c 2020-04-30 15:14:04.596000000 +0800 ++++ b/gcc/ipa-icf.c 2020-04-30 15:14:56.632000000 +0800 +@@ -491,7 +491,7 @@ sem_function::param_used_p (unsigned int + + struct ipa_node_params *parms_info = IPA_NODE_REF (get_node ()); + +- if (vec_safe_length (parms_info->descriptors) <= i) 
++ if (!parms_info || vec_safe_length (parms_info->descriptors) <= i) + return true; + + return ipa_is_param_used (IPA_NODE_REF (get_node ()), i); +@@ -1149,8 +1149,8 @@ sem_function::merge (sem_item *alias_ite + "cannot create wrapper of stdarg function.\n"); + } + else if (ipa_fn_summaries +- && ipa_fn_summaries->get (alias) != NULL +- && ipa_fn_summaries->get (alias)->self_size <= 2) ++ && ipa_size_summaries->get (alias) != NULL ++ && ipa_size_summaries->get (alias)->self_size <= 2) + { + if (dump_file) + fprintf (dump_file, "Wrapper creation is not " +@@ -1268,6 +1268,7 @@ sem_function::merge (sem_item *alias_ite + + /* Remove the function's body. */ + ipa_merge_profiles (original, alias); ++ symtab->call_cgraph_removal_hooks (alias); + alias->release_body (true); + alias->reset (); + /* Notice global symbol possibly produced RTL. */ +@@ -1288,11 +1289,13 @@ sem_function::merge (sem_item *alias_ite + { + gcc_assert (!create_alias); + alias->icf_merged = true; ++ symtab->call_cgraph_removal_hooks (alias); + local_original->icf_merged = true; + + /* FIXME update local_original counts. */ + ipa_merge_profiles (original, alias, true); + alias->create_wrapper (local_original); ++ symtab->call_cgraph_insertion_hooks (alias); + + if (dump_file) + fprintf (dump_file, "Unified; Wrapper has been created.\n\n"); +diff -Nurp a/gcc/ipa-inline-analysis.c b/gcc/ipa-inline-analysis.c +--- a/gcc/ipa-inline-analysis.c 2020-04-30 15:14:04.556000000 +0800 ++++ b/gcc/ipa-inline-analysis.c 2020-04-30 15:14:56.680000000 +0800 +@@ -53,6 +53,48 @@ along with GCC; see the file COPYING3. + /* Cached node/edge growths. */ + call_summary *edge_growth_cache = NULL; + ++/* The context cache remembers estimated time/size and hints for given ++ ipa_call_context of a call. */ ++class node_context_cache_entry ++{ ++public: ++ ipa_call_context ctx; ++ sreal time, nonspec_time; ++ int size; ++ ipa_hints hints; ++ ++ node_context_cache_entry () ++ : ctx () ++ { ++ } ++ ~node_context_cache_entry () ++ { ++ ctx.release (); ++ } ++}; ++ ++/* At the moment we implement primitive single entry LRU cache. */ ++class node_context_summary ++{ ++public: ++ node_context_cache_entry entry; ++ ++ node_context_summary () ++ : entry () ++ { ++ } ++ ~node_context_summary () ++ { ++ } ++}; ++ ++/* Summary holding the context cache. */ ++static fast_function_summary ++ *node_context_cache = NULL; ++/* Statistics about the context cache effectivity. */ ++static long node_context_cache_hit, node_context_cache_miss, ++ node_context_cache_clear; ++ + /* Give initial reasons why inlining would fail on EDGE. This gets either + nullified or usually overwritten by more precise reasons later. */ + +@@ -77,6 +119,16 @@ initialize_inline_failed (struct cgraph_ + == CIF_FINAL_ERROR); + } + ++/* Allocate edge growth caches. */ ++ ++void ++initialize_growth_caches () ++{ ++ edge_growth_cache ++ = new call_summary (symtab, false); ++ node_context_cache ++ = new fast_function_summary (symtab); ++} + + /* Free growth caches. */ + +@@ -84,7 +136,17 @@ void + free_growth_caches (void) + { + delete edge_growth_cache; ++ delete node_context_cache; + edge_growth_cache = NULL; ++ node_context_cache = NULL; ++ if (dump_file) ++ fprintf (dump_file, "node context cache: %li hits, %li misses," ++ " %li initializations\n", ++ node_context_cache_hit, node_context_cache_miss, ++ node_context_cache_clear); ++ node_context_cache_hit = 0; ++ node_context_cache_miss = 0; ++ node_context_cache_clear = 0; + } + + /* Return hints derrived from EDGE. 
*/ +@@ -93,8 +155,8 @@ int + simple_edge_hints (struct cgraph_edge *edge) + { + int hints = 0; +- struct cgraph_node *to = (edge->caller->global.inlined_to +- ? edge->caller->global.inlined_to : edge->caller); ++ struct cgraph_node *to = (edge->caller->inlined_to ++ ? edge->caller->inlined_to : edge->caller); + struct cgraph_node *callee = edge->callee->ultimate_alias_target (); + int to_scc_no = ipa_fn_summaries->get (to)->scc_no; + int callee_scc_no = ipa_fn_summaries->get (callee)->scc_no; +@@ -127,9 +189,9 @@ do_estimate_edge_time (struct cgraph_edg + clause_t clause, nonspec_clause; + vec known_vals; + vec known_contexts; +- vec known_aggs; +- struct ipa_call_summary *es = ipa_call_summaries->get (edge); +- int min_size; ++ vec known_aggs; ++ class ipa_call_summary *es = ipa_call_summaries->get (edge); ++ int min_size = -1; + + callee = edge->callee->ultimate_alias_target (); + +@@ -137,9 +199,53 @@ do_estimate_edge_time (struct cgraph_edg + evaluate_properties_for_edge (edge, true, + &clause, &nonspec_clause, &known_vals, + &known_contexts, &known_aggs); +- estimate_node_size_and_time (callee, clause, nonspec_clause, known_vals, +- known_contexts, known_aggs, &size, &min_size, +- &time, &nonspec_time, &hints, es->param); ++ ipa_call_context ctx (callee, clause, nonspec_clause, known_vals, ++ known_contexts, known_aggs, es->param); ++ if (node_context_cache != NULL) ++ { ++ node_context_summary *e = node_context_cache->get_create (callee); ++ if (e->entry.ctx.equal_to (ctx)) ++ { ++ node_context_cache_hit++; ++ size = e->entry.size; ++ time = e->entry.time; ++ nonspec_time = e->entry.nonspec_time; ++ hints = e->entry.hints; ++ if (flag_checking ++ && !callee->count.ipa_p ()) ++ { ++ sreal chk_time, chk_nonspec_time; ++ int chk_size, chk_min_size; ++ ++ ipa_hints chk_hints; ++ ctx.estimate_size_and_time (&chk_size, &chk_min_size, ++ &chk_time, &chk_nonspec_time, ++ &chk_hints); ++ gcc_assert (chk_size == size && chk_time == time ++ && chk_nonspec_time == nonspec_time ++ && chk_hints == hints); ++ } ++ } ++ else ++ { ++ if (e->entry.ctx.exists_p ()) ++ node_context_cache_miss++; ++ else ++ node_context_cache_clear++; ++ e->entry.ctx.release (true); ++ e->entry.ctx = ctx; ++ ctx.estimate_size_and_time (&size, &min_size, ++ &time, &nonspec_time, &hints); ++ e->entry.size = size; ++ e->entry.time = time; ++ e->entry.nonspec_time = nonspec_time; ++ e->entry.hints = hints; ++ e->entry.ctx.duplicate_from (ctx); ++ } ++ } ++ else ++ ctx.estimate_size_and_time (&size, &min_size, ++ &time, &nonspec_time, &hints); + + /* When we have profile feedback, we can quite safely identify hot + edges and for those we disable size limits. Don't do that when +@@ -147,21 +253,21 @@ do_estimate_edge_time (struct cgraph_edg + may hurt optimization of the caller's hot path. */ + if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p () + && (edge->count.ipa ().apply_scale (2, 1) +- > (edge->caller->global.inlined_to +- ? edge->caller->global.inlined_to->count.ipa () ++ > (edge->caller->inlined_to ++ ? edge->caller->inlined_to->count.ipa () + : edge->caller->count.ipa ()))) + hints |= INLINE_HINT_known_hot; + +- known_vals.release (); +- known_contexts.release (); +- known_aggs.release (); ++ ctx.release (); + gcc_checking_assert (size >= 0); + gcc_checking_assert (time >= 0); + + /* When caching, update the cache entry. 
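do_estimate_edge_time now consults a one-entry per-node cache: when the freshly built context compares equal to the stored one, the cached size/time/hints are reused; otherwise the entry is replaced and the estimate recomputed, with separate counters for misses and first-time initializations. A reduced sketch of that policy in which the context key and the expensive estimate are stand-ins:

#include <cstdio>

/* Stand-in for ipa_call_context: the whole context is one integer key;
   equal keys mean the cached estimate can be reused.  */
struct context
{
  int key;
  bool valid;
  bool equal_to (const context &o) const { return valid && key == o.key; }
};

struct cache_entry
{
  context ctx = { 0, false };
  long size = 0;
};

static long hits, misses, clears;

/* Pretend expensive estimate.  */
static long estimate (const context &c) { return c.key * 10; }

static long cached_estimate (cache_entry &e, const context &c)
{
  if (e.ctx.equal_to (c))
    {
      hits++;
      return e.size;
    }
  /* Distinguish a first use from a replaced entry, mirroring the miss
     vs. initialization counters in the dump output.  */
  if (e.ctx.valid)
    misses++;
  else
    clears++;
  e.ctx = c;
  e.size = estimate (c);
  return e.size;
}

int main ()
{
  cache_entry e;
  context a = { 3, true }, b = { 5, true };
  cached_estimate (e, a);   /* initialization */
  cached_estimate (e, a);   /* hit */
  cached_estimate (e, b);   /* miss, entry replaced */
  printf ("hits=%ld misses=%ld initializations=%ld\n", hits, misses, clears);
  return 0;
}
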
*/ + if (edge_growth_cache != NULL) + { +- ipa_fn_summaries->get_create (edge->callee)->min_size = min_size; ++ if (min_size >= 0) ++ ipa_fn_summaries->get (edge->callee->function_symbol ())->min_size ++ = min_size; + edge_growth_cache_entry *entry + = edge_growth_cache->get_create (edge); + entry->time = time; +@@ -174,6 +280,14 @@ do_estimate_edge_time (struct cgraph_edg + return time; + } + ++/* Reset cache for NODE. ++ This must be done each time NODE body is modified. */ ++void ++reset_node_cache (struct cgraph_node *node) ++{ ++ if (node_context_cache) ++ node_context_cache->remove (node); ++} + + /* Return estimated callee growth after inlining EDGE. + Only to be called via estimate_edge_size. */ +@@ -186,7 +300,7 @@ do_estimate_edge_size (struct cgraph_edg + clause_t clause, nonspec_clause; + vec known_vals; + vec known_contexts; +- vec known_aggs; ++ vec known_aggs; + + /* When we do caching, use do_estimate_edge_time to populate the entry. */ + +@@ -206,12 +320,10 @@ do_estimate_edge_size (struct cgraph_edg + &clause, &nonspec_clause, + &known_vals, &known_contexts, + &known_aggs); +- estimate_node_size_and_time (callee, clause, nonspec_clause, known_vals, +- known_contexts, known_aggs, &size, NULL, NULL, +- NULL, NULL, vNULL); +- known_vals.release (); +- known_contexts.release (); +- known_aggs.release (); ++ ipa_call_context ctx (callee, clause, nonspec_clause, known_vals, ++ known_contexts, known_aggs, vNULL); ++ ctx.estimate_size_and_time (&size, NULL, NULL, NULL, NULL); ++ ctx.release (); + return size; + } + +@@ -227,7 +339,7 @@ do_estimate_edge_hints (struct cgraph_ed + clause_t clause, nonspec_clause; + vec known_vals; + vec known_contexts; +- vec known_aggs; ++ vec known_aggs; + + /* When we do caching, use do_estimate_edge_time to populate the entry. 
*/ + +@@ -247,12 +359,10 @@ do_estimate_edge_hints (struct cgraph_ed + &clause, &nonspec_clause, + &known_vals, &known_contexts, + &known_aggs); +- estimate_node_size_and_time (callee, clause, nonspec_clause, known_vals, +- known_contexts, known_aggs, NULL, NULL, +- NULL, NULL, &hints, vNULL); +- known_vals.release (); +- known_contexts.release (); +- known_aggs.release (); ++ ipa_call_context ctx (callee, clause, nonspec_clause, known_vals, ++ known_contexts, known_aggs, vNULL); ++ ctx.estimate_size_and_time (NULL, NULL, NULL, NULL, &hints); ++ ctx.release (); + hints |= simple_edge_hints (edge); + return hints; + } +@@ -264,8 +374,8 @@ int + estimate_size_after_inlining (struct cgraph_node *node, + struct cgraph_edge *edge) + { +- struct ipa_call_summary *es = ipa_call_summaries->get (edge); +- ipa_fn_summary *s = ipa_fn_summaries->get (node); ++ class ipa_call_summary *es = ipa_call_summaries->get (edge); ++ ipa_size_summary *s = ipa_size_summaries->get (node); + if (!es->predicate || *es->predicate != false) + { + int size = s->size + estimate_edge_growth (edge); +@@ -321,7 +431,7 @@ int + estimate_growth (struct cgraph_node *node) + { + struct growth_data d = { node, false, false, 0 }; +- struct ipa_fn_summary *info = ipa_fn_summaries->get (node); ++ class ipa_size_summary *info = ipa_size_summaries->get (node); + + node->call_for_symbol_and_aliases (do_estimate_growth_1, &d, true); + +@@ -396,7 +506,7 @@ growth_likely_positive (struct cgraph_no + || node->address_taken) + return true; + +- max_callers = ipa_fn_summaries->get (node)->size * 4 / edge_growth + 2; ++ max_callers = ipa_size_summaries->get (node)->size * 4 / edge_growth + 2; + + for (e = node->callers; e; e = e->next_caller) + { +diff -Nurp a/gcc/ipa-inline.c b/gcc/ipa-inline.c +--- a/gcc/ipa-inline.c 2020-04-30 15:14:04.652000000 +0800 ++++ b/gcc/ipa-inline.c 2020-04-30 15:14:56.684000000 +0800 +@@ -150,8 +150,7 @@ caller_growth_limits (struct cgraph_edge + int newsize; + int limit = 0; + HOST_WIDE_INT stack_size_limit = 0, inlined_stack; +- ipa_fn_summary *info, *what_info; +- ipa_fn_summary *outer_info = ipa_fn_summaries->get (to); ++ ipa_size_summary *outer_info = ipa_size_summaries->get (to); + + /* Look for function e->caller is inlined to. While doing + so work out the largest function body on the way. As +@@ -163,28 +162,29 @@ caller_growth_limits (struct cgraph_edge + too much in order to prevent compiler from exploding". */ + while (true) + { +- info = ipa_fn_summaries->get (to); +- if (limit < info->self_size) +- limit = info->self_size; +- if (stack_size_limit < info->estimated_self_stack_size) +- stack_size_limit = info->estimated_self_stack_size; +- if (to->global.inlined_to) ++ ipa_size_summary *size_info = ipa_size_summaries->get (to); ++ if (limit < size_info->self_size) ++ limit = size_info->self_size; ++ if (stack_size_limit < size_info->estimated_self_stack_size) ++ stack_size_limit = size_info->estimated_self_stack_size; ++ if (to->inlined_to) + to = to->callers->caller; + else + break; + } + +- what_info = ipa_fn_summaries->get (what); ++ ipa_fn_summary *what_info = ipa_fn_summaries->get (what); ++ ipa_size_summary *what_size_info = ipa_size_summaries->get (what); + +- if (limit < what_info->self_size) +- limit = what_info->self_size; ++ if (limit < what_size_info->self_size) ++ limit = what_size_info->self_size; + + limit += limit * PARAM_VALUE (PARAM_LARGE_FUNCTION_GROWTH) / 100; + + /* Check the size after inlining against the function limits. 
But allow + the function to shrink if it went over the limits by forced inlining. */ + newsize = estimate_size_after_inlining (to, e); +- if (newsize >= info->size ++ if (newsize >= ipa_size_summaries->get (what)->size + && newsize > PARAM_VALUE (PARAM_LARGE_FUNCTION_INSNS) + && newsize > limit) + { +@@ -203,7 +203,7 @@ caller_growth_limits (struct cgraph_edge + stack_size_limit += ((gcov_type)stack_size_limit + * PARAM_VALUE (PARAM_STACK_FRAME_GROWTH) / 100); + +- inlined_stack = (outer_info->stack_frame_offset ++ inlined_stack = (ipa_get_stack_frame_offset (to) + + outer_info->estimated_self_stack_size + + what_info->estimated_stack_size); + /* Check new stack consumption with stack consumption at the place +@@ -213,7 +213,7 @@ caller_growth_limits (struct cgraph_edge + inline call, we can inline, too. + This bit overoptimistically assume that we are good at stack + packing. */ +- && inlined_stack > info->estimated_stack_size ++ && inlined_stack > ipa_fn_summaries->get (to)->estimated_stack_size + && inlined_stack > PARAM_VALUE (PARAM_LARGE_STACK_FRAME)) + { + e->inline_failed = CIF_LARGE_STACK_FRAME_GROWTH_LIMIT; +@@ -321,8 +321,8 @@ can_inline_edge_p (struct cgraph_edge *e + + bool inlinable = true; + enum availability avail; +- cgraph_node *caller = e->caller->global.inlined_to +- ? e->caller->global.inlined_to : e->caller; ++ cgraph_node *caller = (e->caller->inlined_to ++ ? e->caller->inlined_to : e->caller); + cgraph_node *callee = e->callee->ultimate_alias_target (&avail, caller); + + if (!callee->definition) +@@ -414,8 +414,8 @@ can_inline_edge_by_limits_p (struct cgra + + bool inlinable = true; + enum availability avail; +- cgraph_node *caller = e->caller->global.inlined_to +- ? e->caller->global.inlined_to : e->caller; ++ cgraph_node *caller = (e->caller->inlined_to ++ ? e->caller->inlined_to : e->caller); + cgraph_node *callee = e->callee->ultimate_alias_target (&avail, caller); + tree caller_tree = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (caller->decl); + tree callee_tree +@@ -687,8 +687,8 @@ inline sreal + compute_uninlined_call_time (struct cgraph_edge *edge, + sreal uninlined_call_time) + { +- cgraph_node *caller = (edge->caller->global.inlined_to +- ? edge->caller->global.inlined_to ++ cgraph_node *caller = (edge->caller->inlined_to ++ ? edge->caller->inlined_to + : edge->caller); + + sreal freq = edge->sreal_frequency (); +@@ -708,8 +708,8 @@ inline sreal + compute_inlined_call_time (struct cgraph_edge *edge, + sreal time) + { +- cgraph_node *caller = (edge->caller->global.inlined_to +- ? edge->caller->global.inlined_to ++ cgraph_node *caller = (edge->caller->inlined_to ++ ? edge->caller->inlined_to + : edge->caller); + sreal caller_time = ipa_fn_summaries->get (caller)->time; + +@@ -895,7 +895,7 @@ want_inline_self_recursive_call_p (struc + reason = "--param max-inline-recursive-depth exceeded."; + want_inline = false; + } +- else if (outer_node->global.inlined_to ++ else if (outer_node->inlined_to + && (caller_freq = outer_node->callers->sreal_frequency ()) == 0) + { + reason = "caller frequency is 0"; +@@ -1005,7 +1005,7 @@ want_inline_function_to_all_callers_p (s + if (node->alias) + return false; + /* Already inlined? */ +- if (node->global.inlined_to) ++ if (node->inlined_to) + return false; + /* Does it have callers? 
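caller_growth_limits now reads the sizes from ipa_size_summaries, but the arithmetic is unchanged: both the body-size limit and the stack limit start from the largest value seen on the inline chain and are enlarged by a percentage parameter before being compared with the post-inlining estimate. A worked sketch with assumed parameter values:

#include <cstdio>

/* Assumed values for the sketch; the real ones come from
   --param large-function-growth and --param stack-frame-growth.  */
const int large_function_growth = 100;  /* percent */
const int stack_frame_growth = 1000;    /* percent */

int main ()
{
  /* Largest self size / self stack seen while walking the inline chain.  */
  int limit = 900;
  long stack_size_limit = 40;

  /* Allow the containing function to grow by the configured percentage.  */
  limit += limit * large_function_growth / 100;                    /* 1800 */
  stack_size_limit += stack_size_limit * stack_frame_growth / 100; /* 440 */

  /* A candidate is rejected (together with the other thresholds in the
     hunk) when the post-inlining size or the combined stack, i.e. frame
     offset plus caller and callee frames, exceeds these limits.  */
  int newsize = 2100;
  long inlined_stack = 96 + 40 + 48;
  printf ("size limit %d (newsize %d %s), stack limit %ld (inlined %ld %s)\n",
          limit, newsize, newsize > limit ? "rejected" : "ok",
          stack_size_limit, inlined_stack,
          inlined_stack > stack_size_limit ? "rejected" : "ok");
  return 0;
}
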
*/ + if (!node->call_for_symbol_and_aliases (has_caller_p, NULL, true)) +@@ -1037,8 +1037,8 @@ edge_badness (struct cgraph_edge *edge, + struct cgraph_node *callee = edge->callee->ultimate_alias_target (); + struct ipa_fn_summary *callee_info = ipa_fn_summaries->get (callee); + ipa_hints hints; +- cgraph_node *caller = (edge->caller->global.inlined_to +- ? edge->caller->global.inlined_to ++ cgraph_node *caller = (edge->caller->inlined_to ++ ? edge->caller->inlined_to + : edge->caller); + + growth = estimate_edge_growth (edge); +@@ -1051,7 +1051,7 @@ edge_badness (struct cgraph_edge *edge, + gcc_checking_assert ((edge_time * 100 + - callee_info->time * 101).to_int () <= 0 + || callee->count.ipa ().initialized_p ()); +- gcc_checking_assert (growth <= callee_info->size); ++ gcc_checking_assert (growth <= ipa_size_summaries->get (callee)->size); + + if (dump) + { +@@ -1122,7 +1122,7 @@ edge_badness (struct cgraph_edge *edge, + if (need_more_work) + noninline_callee (); + } +- Withhout panilizing this case, we usually inline noninline_callee ++ Withhout penalizing this case, we usually inline noninline_callee + into the inline_caller because overall_growth is small preventing + further inlining of inline_caller. + +@@ -1132,7 +1132,7 @@ edge_badness (struct cgraph_edge *edge, + if (growth > overall_growth + /* ... and having only one caller which is not inlined ... */ + && callee_info->single_caller +- && !edge->caller->global.inlined_to ++ && !edge->caller->inlined_to + /* ... and edges executed only conditionally ... */ + && edge->sreal_frequency () < 1 + /* ... consider case where callee is not inline but caller is ... */ +@@ -1155,7 +1155,7 @@ edge_badness (struct cgraph_edge *edge, + and it is not called once and. */ + if (!caller_info->single_caller && overall_growth < caller_growth + && caller_info->inlinable +- && caller_info->size ++ && ipa_size_summaries->get (caller)->size + < (DECL_DECLARED_INLINE_P (caller->decl) + ? 
MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO)) + { +@@ -1178,7 +1178,7 @@ edge_badness (struct cgraph_edge *edge, + overall_growth += 256 * 256 - 256; + denominator *= overall_growth; + } +- denominator *= ipa_fn_summaries->get (caller)->self_size + growth; ++ denominator *= ipa_size_summaries->get (caller)->size + growth; + + badness = - numerator / denominator; + +@@ -1300,8 +1300,10 @@ reset_edge_caches (struct cgraph_node *n + struct cgraph_node *where = node; + struct ipa_ref *ref; + +- if (where->global.inlined_to) +- where = where->global.inlined_to; ++ if (where->inlined_to) ++ where = where->inlined_to; ++ ++ reset_node_cache (where); + + if (edge_growth_cache != NULL) + for (edge = where->callers; edge; edge = edge->next_caller) +@@ -1351,7 +1353,7 @@ update_caller_keys (edge_heap_t *heap, s + struct ipa_ref *ref; + + if ((!node->alias && !ipa_fn_summaries->get (node)->inlinable) +- || node->global.inlined_to) ++ || node->inlined_to) + return; + if (!bitmap_set_bit (updated_nodes, node->get_uid ())) + return; +@@ -1479,8 +1481,8 @@ recursive_inlining (struct cgraph_edge * + int n = 0; + + node = edge->caller; +- if (node->global.inlined_to) +- node = node->global.inlined_to; ++ if (node->inlined_to) ++ node = node->inlined_to; + + if (DECL_DECLARED_INLINE_P (node->decl)) + limit = PARAM_VALUE (PARAM_MAX_INLINE_INSNS_RECURSIVE); +@@ -1528,7 +1530,7 @@ recursive_inlining (struct cgraph_edge * + + depth = 1; + for (cnode = curr->caller; +- cnode->global.inlined_to; cnode = cnode->callers->caller) ++ cnode->inlined_to; cnode = cnode->callers->caller) + if (node->decl + == curr->callee->ultimate_alias_target ()->decl) + depth++; +@@ -1567,6 +1569,7 @@ recursive_inlining (struct cgraph_edge * + } + + inline_call (curr, false, new_edges, &overall_size, true); ++ reset_node_cache (node); + lookup_recursive_calls (node, curr->callee, &heap); + n++; + } +@@ -1581,8 +1584,8 @@ recursive_inlining (struct cgraph_edge * + dump_printf_loc (MSG_NOTE, edge->call_stmt, + "\n Inlined %i times, " + "body grown from size %i to %i, time %f to %f\n", n, +- ipa_fn_summaries->get (master_clone)->size, +- ipa_fn_summaries->get (node)->size, ++ ipa_size_summaries->get (master_clone)->size, ++ ipa_size_summaries->get (node)->size, + ipa_fn_summaries->get (master_clone)->time.to_double (), + ipa_fn_summaries->get (node)->time.to_double ()); + +@@ -1593,7 +1596,7 @@ recursive_inlining (struct cgraph_edge * + node = next) + { + next = symtab->next_function (node); +- if (node->global.inlined_to == master_clone) ++ if (node->inlined_to == master_clone) + node->remove (); + } + master_clone->remove (); +@@ -1707,8 +1710,8 @@ resolve_noninline_speculation (edge_heap + if (edge->speculative && !speculation_useful_p (edge, false)) + { + struct cgraph_node *node = edge->caller; +- struct cgraph_node *where = node->global.inlined_to +- ? node->global.inlined_to : node; ++ struct cgraph_node *where = node->inlined_to ++ ? node->inlined_to : node; + auto_bitmap updated_nodes; + + if (edge->count.ipa ().initialized_p ()) +@@ -1749,6 +1752,16 @@ sum_callers (struct cgraph_node *node, v + return false; + } + ++/* We only propagate across edges with non-interposable callee. 
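The badness denominator above now uses the caller size from ipa_size_summaries plus the expected growth; because badness is the negated ratio and the heap is ordered by increasing badness, a larger caller or larger growth moves an edge towards the back of the queue. A toy calculation illustrating only that monotonicity; the numerator here is a placeholder, not the real profit term:

#include <cstdio>

/* Toy badness: negated numerator over a denominator that scales with the
   caller size plus the edge growth.  */
static double badness (double numerator, int caller_size, int growth)
{
  double denominator = caller_size + growth;
  return -numerator / denominator;
}

int main ()
{
  /* Same profit, bigger caller/growth -> badness closer to zero, i.e. the
     edge sorts later in a heap ordered by increasing badness.  */
  printf ("%f %f\n", badness (100.0, 50, 10), badness (100.0, 500, 100));
  return 0;
}
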
*/ ++ ++inline bool ++ignore_edge_p (struct cgraph_edge *e) ++{ ++ enum availability avail; ++ e->callee->function_or_virtual_thunk_symbol (&avail, e->caller); ++ return (avail <= AVAIL_INTERPOSABLE); ++} ++ + /* We use greedy algorithm for inlining of small functions: + All inline candidates are put into prioritized heap ordered in + increasing badness. +@@ -1776,11 +1789,11 @@ inline_small_functions (void) + metrics. */ + + max_count = profile_count::uninitialized (); +- ipa_reduced_postorder (order, true, NULL); ++ ipa_reduced_postorder (order, true, ignore_edge_p); + free (order); + + FOR_EACH_DEFINED_FUNCTION (node) +- if (!node->global.inlined_to) ++ if (!node->inlined_to) + { + if (!node->alias && node->analyzed + && (node->has_gimple_body_p () || node->thunk.thunk_p) +@@ -1792,7 +1805,7 @@ inline_small_functions (void) + /* Do not account external functions, they will be optimized out + if not inlined. Also only count the non-cold portion of program. */ + if (inline_account_function_p (node)) +- initial_size += info->size; ++ initial_size += ipa_size_summaries->get (node)->size; + info->growth = estimate_growth (node); + + int num_calls = 0; +@@ -1808,7 +1821,8 @@ inline_small_functions (void) + n2 = ((struct ipa_dfs_info *) n2->aux)->next_cycle) + if (opt_for_fn (n2->decl, optimize)) + { +- ipa_fn_summary *info2 = ipa_fn_summaries->get (n2); ++ ipa_fn_summary *info2 = ipa_fn_summaries->get ++ (n2->inlined_to ? n2->inlined_to : n2); + if (info2->scc_no) + break; + info2->scc_no = id; +@@ -1820,8 +1834,7 @@ inline_small_functions (void) + max_count = max_count.max (edge->count.ipa ()); + } + ipa_free_postorder_info (); +- edge_growth_cache +- = new call_summary (symtab, false); ++ initialize_growth_caches (); + + if (dump_file) + fprintf (dump_file, +@@ -1872,8 +1885,8 @@ inline_small_functions (void) + } + if (update) + { +- struct cgraph_node *where = node->global.inlined_to +- ? node->global.inlined_to : node; ++ struct cgraph_node *where = node->inlined_to ++ ? node->inlined_to : node; + ipa_update_overall_fn_summary (where); + reset_edge_caches (where); + update_caller_keys (&edge_heap, where, +@@ -1902,11 +1915,10 @@ inline_small_functions (void) + if (!edge->inline_failed || !edge->callee->analyzed) + continue; + +-#if CHECKING_P + /* Be sure that caches are maintained consistent. + This check is affected by scaling roundoff errors when compiling for + IPA this we skip it in that case. */ +- if (!edge->callee->count.ipa_p () ++ if (flag_checking && !edge->callee->count.ipa_p () + && (!max_count.initialized_p () || !max_count.nonzero_p ())) + { + sreal cached_badness = edge_badness (edge, false); +@@ -1917,6 +1929,9 @@ inline_small_functions (void) + + if (edge_growth_cache != NULL) + edge_growth_cache->remove (edge); ++ reset_node_cache (edge->caller->inlined_to ++ ? 
edge->caller->inlined_to ++ : edge->caller); + gcc_assert (old_size_est == estimate_edge_size (edge)); + gcc_assert (old_time_est == estimate_edge_time (edge)); + /* FIXME: +@@ -1941,9 +1956,6 @@ inline_small_functions (void) + } + else + current_badness = edge_badness (edge, false); +-#else +- current_badness = edge_badness (edge, false); +-#endif + if (current_badness != badness) + { + if (edge_heap.min () && current_badness > edge_heap.min_key ()) +@@ -1969,7 +1981,7 @@ inline_small_functions (void) + fprintf (dump_file, + "\nConsidering %s with %i size\n", + callee->dump_name (), +- ipa_fn_summaries->get (callee)->size); ++ ipa_size_summaries->get (callee)->size); + fprintf (dump_file, + " to be inlined into %s in %s:%i\n" + " Estimated badness is %f, frequency %.2f.\n", +@@ -2017,8 +2029,8 @@ inline_small_functions (void) + if (edge->recursive_p ()) + { + where = edge->caller; +- if (where->global.inlined_to) +- where = where->global.inlined_to; ++ if (where->inlined_to) ++ where = where->inlined_to; + if (!recursive_inlining (edge, + opt_for_fn (edge->caller->decl, + flag_indirect_inlining) +@@ -2048,7 +2060,7 @@ inline_small_functions (void) + selective. */ + + where = edge->caller; +- while (where->global.inlined_to) ++ while (where->inlined_to) + { + if (where->decl == callee->decl) + outer_node = where, depth++; +@@ -2067,17 +2079,16 @@ inline_small_functions (void) + else if (depth && dump_file) + fprintf (dump_file, " Peeling recursion with depth %i\n", depth); + +- gcc_checking_assert (!callee->global.inlined_to); ++ gcc_checking_assert (!callee->inlined_to); + inline_call (edge, true, &new_indirect_edges, &overall_size, true); +- add_new_edges_to_heap (&edge_heap, new_indirect_edges); +- + reset_edge_caches (edge->callee); ++ add_new_edges_to_heap (&edge_heap, new_indirect_edges); + + update_callee_keys (&edge_heap, where, updated_nodes); + } + where = edge->caller; +- if (where->global.inlined_to) +- where = where->global.inlined_to; ++ if (where->inlined_to) ++ where = where->inlined_to; + + /* Our profitability metric can depend on local properties + such as number of inlinable calls and size of the function body. +@@ -2095,7 +2106,7 @@ inline_small_functions (void) + + if (dump_enabled_p ()) + { +- ipa_fn_summary *s = ipa_fn_summaries->get (edge->caller); ++ ipa_fn_summary *s = ipa_fn_summaries->get (where); + + /* dump_printf can't handle %+i. */ + char buf_net_change[100]; +@@ -2106,7 +2117,9 @@ inline_small_functions (void) + " Inlined %C into %C which now has time %f and " + "size %i, net change of %s.\n", + edge->callee, edge->caller, +- s->time.to_double (), s->size, buf_net_change); ++ s->time.to_double (), ++ ipa_size_summaries->get (edge->caller)->size, ++ buf_net_change); + } + if (min_size > overall_size) + { +@@ -2208,8 +2221,8 @@ flatten_function (struct cgraph_node *no + + node->aux = NULL; + if (update) +- ipa_update_overall_fn_summary (node->global.inlined_to +- ? node->global.inlined_to : node); ++ ipa_update_overall_fn_summary (node->inlined_to ++ ? node->inlined_to : node); + } + + /* Inline NODE to all callers. Worker for cgraph_for_node_and_aliases. 
+@@ -2223,7 +2236,7 @@ inline_to_all_callers_1 (struct cgraph_n + int *num_calls = (int *)data; + bool callee_removed = false; + +- while (node->callers && !node->global.inlined_to) ++ while (node->callers && !node->inlined_to) + { + struct cgraph_node *caller = node->callers->caller; + +@@ -2243,11 +2256,11 @@ inline_to_all_callers_1 (struct cgraph_n + fprintf (dump_file, + "\nInlining %s size %i.\n", + ultimate->name (), +- ipa_fn_summaries->get (ultimate)->size); ++ ipa_size_summaries->get (ultimate)->size); + fprintf (dump_file, + " Called once from %s %i insns.\n", + node->callers->caller->name (), +- ipa_fn_summaries->get (node->callers->caller)->size); ++ ipa_size_summaries->get (node->callers->caller)->size); + } + + /* Remember which callers we inlined to, delaying updating the +@@ -2258,7 +2271,7 @@ inline_to_all_callers_1 (struct cgraph_n + fprintf (dump_file, + " Inlined into %s which now has %i size\n", + caller->name (), +- ipa_fn_summaries->get (caller)->size); ++ ipa_size_summaries->get (caller)->size); + if (!(*num_calls)--) + { + if (dump_file) +@@ -2296,7 +2309,7 @@ dump_overall_stats (void) + struct cgraph_node *node; + + FOR_EACH_DEFINED_FUNCTION (node) +- if (!node->global.inlined_to ++ if (!node->inlined_to + && !node->alias) + { + ipa_fn_summary *s = ipa_fn_summaries->get (node); +@@ -2482,8 +2495,9 @@ ipa_inline (void) + for (i = nnodes - 1, j = i; i >= 0; i--) + { + node = order[i]; +- if (lookup_attribute ("flatten", +- DECL_ATTRIBUTES (node->decl)) != NULL) ++ if (node->definition ++ && lookup_attribute ("flatten", ++ DECL_ATTRIBUTES (node->decl)) != NULL) + order[j--] = order[i]; + } + +@@ -2588,8 +2602,8 @@ ipa_inline (void) + } + if (update) + { +- struct cgraph_node *where = node->global.inlined_to +- ? node->global.inlined_to : node; ++ struct cgraph_node *where = node->inlined_to ++ ? node->inlined_to : node; + reset_edge_caches (where); + ipa_update_overall_fn_summary (where); + } +diff -Nurp a/gcc/ipa-inline.h b/gcc/ipa-inline.h +--- a/gcc/ipa-inline.h 2020-04-30 15:14:04.608000000 +0800 ++++ b/gcc/ipa-inline.h 2020-04-30 15:14:56.608000000 +0800 +@@ -47,6 +47,8 @@ bool growth_likely_positive (struct cgra + int do_estimate_edge_size (struct cgraph_edge *edge); + sreal do_estimate_edge_time (struct cgraph_edge *edge); + ipa_hints do_estimate_edge_hints (struct cgraph_edge *edge); ++void reset_node_cache (struct cgraph_node *node); ++void initialize_growth_caches (); + void free_growth_caches (void); + + /* In ipa-inline.c */ +diff -Nurp a/gcc/ipa-inline-transform.c b/gcc/ipa-inline-transform.c +--- a/gcc/ipa-inline-transform.c 2020-04-30 15:14:04.568000000 +0800 ++++ b/gcc/ipa-inline-transform.c 2020-04-30 15:14:56.624000000 +0800 +@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. + #include "function.h" + #include "cfg.h" + #include "basic-block.h" ++#include "ipa-utils.h" + + int ncalls_inlined; + int nfunctions_inlined; +@@ -166,8 +167,8 @@ clone_inlined_nodes (struct cgraph_edge + struct cgraph_node *inlining_into; + struct cgraph_edge *next; + +- if (e->caller->global.inlined_to) +- inlining_into = e->caller->global.inlined_to; ++ if (e->caller->inlined_to) ++ inlining_into = e->caller->inlined_to; + else + inlining_into = e->caller; + +@@ -193,14 +194,14 @@ clone_inlined_nodes (struct cgraph_edge + + For now we keep the ohter functions in the group in program until + cgraph_remove_unreachable_functions gets rid of them. 
*/ +- gcc_assert (!e->callee->global.inlined_to); ++ gcc_assert (!e->callee->inlined_to); + e->callee->remove_from_same_comdat_group (); + if (e->callee->definition + && inline_account_function_p (e->callee)) + { + gcc_assert (!e->callee->alias); + if (overall_size) +- *overall_size -= ipa_fn_summaries->get (e->callee)->size; ++ *overall_size -= ipa_size_summaries->get (e->callee)->size; + nfunctions_inlined++; + } + duplicate = false; +@@ -226,7 +227,7 @@ clone_inlined_nodes (struct cgraph_edge + else + e->callee->remove_from_same_comdat_group (); + +- e->callee->global.inlined_to = inlining_into; ++ e->callee->inlined_to = inlining_into; + + /* Recursively clone all bodies. */ + for (e = e->callee->callees; e; e = next) +@@ -310,20 +311,24 @@ inline_call (struct cgraph_edge *e, bool + /* Don't inline inlined edges. */ + gcc_assert (e->inline_failed); + /* Don't even think of inlining inline clone. */ +- gcc_assert (!callee->global.inlined_to); ++ gcc_assert (!callee->inlined_to); + + to = e->caller; +- if (to->global.inlined_to) +- to = to->global.inlined_to; ++ if (to->inlined_to) ++ to = to->inlined_to; + if (to->thunk.thunk_p) + { + struct cgraph_node *target = to->callees->callee; ++ thunk_expansion = true; ++ symtab->call_cgraph_removal_hooks (to); + if (in_lto_p) + to->get_untransformed_body (); + to->expand_thunk (false, true); + /* When thunk is instrumented we may have multiple callees. */ + for (e = to->callees; e && e->callee != target; e = e->next_callee) + ; ++ symtab->call_cgraph_insertion_hooks (to); ++ thunk_expansion = false; + gcc_assert (e); + } + +@@ -442,9 +447,9 @@ inline_call (struct cgraph_edge *e, bool + + clone_inlined_nodes (e, true, update_original, overall_size); + +- gcc_assert (curr->callee->global.inlined_to == to); ++ gcc_assert (curr->callee->inlined_to == to); + +- old_size = ipa_fn_summaries->get (to)->size; ++ old_size = ipa_size_summaries->get (to)->size; + ipa_merge_fn_summary_after_inlining (e); + if (e->in_polymorphic_cdtor) + mark_all_inlined_calls_cdtor (e->callee); +@@ -458,8 +463,8 @@ inline_call (struct cgraph_edge *e, bool + work for further inlining into this function. Before inlining + the function we inlined to again we expect the caller to update + the overall summary. */ +- ipa_fn_summaries->get (to)->size += estimated_growth; +- new_size = ipa_fn_summaries->get (to)->size; ++ ipa_size_summaries->get (to)->size += estimated_growth; ++ new_size = ipa_size_summaries->get (to)->size; + + if (callee->calls_comdat_local) + to->calls_comdat_local = true; +diff -Nurp a/gcc/ipa-predicate.c b/gcc/ipa-predicate.c +--- a/gcc/ipa-predicate.c 2020-04-30 15:14:04.620000000 +0800 ++++ b/gcc/ipa-predicate.c 2020-04-30 15:14:56.620000000 +0800 +@@ -33,9 +33,36 @@ along with GCC; see the file COPYING3. + #include "fold-const.h" + #include "tree-pretty-print.h" + #include "gimple.h" ++#include "gimplify.h" + #include "data-streamer.h" + + ++/* Check whether two set of operations have same effects. 
*/ ++static bool ++expr_eval_ops_equal_p (expr_eval_ops ops1, expr_eval_ops ops2) ++{ ++ if (ops1) ++ { ++ if (!ops2 || ops1->length () != ops2->length ()) ++ return false; ++ ++ for (unsigned i = 0; i < ops1->length (); i++) ++ { ++ expr_eval_op &op1 = (*ops1)[i]; ++ expr_eval_op &op2 = (*ops2)[i]; ++ ++ if (op1.code != op2.code ++ || op1.index != op2.index ++ || !vrp_operand_equal_p (op1.val[0], op2.val[0]) ++ || !vrp_operand_equal_p (op1.val[1], op2.val[1]) ++ || !types_compatible_p (op1.type, op2.type)) ++ return false; ++ } ++ return true; ++ } ++ return !ops2; ++} ++ + /* Add clause CLAUSE into the predicate P. + When CONDITIONS is NULL do not perform checking whether NEW_CLAUSE + is obviously true. This is useful only when NEW_CLAUSE is known to be +@@ -110,14 +137,16 @@ predicate::add_clause (conditions condit + for (c2 = c1 + 1; c2 < num_conditions; c2++) + if (new_clause & (1 << c2)) + { +- condition *cc1 = +- &(*conditions)[c1 - predicate::first_dynamic_condition]; + condition *cc2 = + &(*conditions)[c2 - predicate::first_dynamic_condition]; + if (cc1->operand_num == cc2->operand_num +- && cc1->val == cc2->val ++ && vrp_operand_equal_p (cc1->val, cc2->val) + && cc2->code != is_not_constant +- && cc2->code != predicate::changed ++ && cc2->code != changed ++ && expr_eval_ops_equal_p (cc1->param_ops, cc2->param_ops) ++ && cc2->agg_contents == cc1->agg_contents ++ && cc2->by_ref == cc1->by_ref ++ && types_compatible_p (cc2->type, cc1->type) + && cc1->code == invert_tree_comparison (cc2->code, + HONOR_NANS (cc1->val))) + return; +@@ -300,6 +329,83 @@ dump_condition (FILE *f, conditions cond + if (c->agg_contents) + fprintf (f, "[%soffset: " HOST_WIDE_INT_PRINT_DEC "]", + c->by_ref ? "ref " : "", c->offset); ++ ++ for (unsigned i = 0; i < vec_safe_length (c->param_ops); i++) ++ { ++ expr_eval_op &op = (*(c->param_ops))[i]; ++ const char *op_name = op_symbol_code (op.code); ++ ++ if (op_name == op_symbol_code (ERROR_MARK)) ++ op_name = get_tree_code_name (op.code); ++ ++ fprintf (f, ",("); ++ ++ if (!op.val[0]) ++ { ++ switch (op.code) ++ { ++ case FLOAT_EXPR: ++ case FIX_TRUNC_EXPR: ++ case FIXED_CONVERT_EXPR: ++ case VIEW_CONVERT_EXPR: ++ CASE_CONVERT: ++ if (op.code == VIEW_CONVERT_EXPR) ++ fprintf (f, "VCE"); ++ fprintf (f, "("); ++ print_generic_expr (f, op.type); ++ fprintf (f, ")" ); ++ break; ++ ++ default: ++ fprintf (f, "%s", op_name); ++ } ++ fprintf (f, " #"); ++ } ++ else if (!op.val[1]) ++ { ++ if (op.index) ++ { ++ print_generic_expr (f, op.val[0]); ++ fprintf (f, " %s #", op_name); ++ } ++ else ++ { ++ fprintf (f, "# %s ", op_name); ++ print_generic_expr (f, op.val[0]); ++ } ++ } ++ else ++ { ++ fprintf (f, "%s ", op_name); ++ switch (op.index) ++ { ++ case 0: ++ fprintf (f, "#, "); ++ print_generic_expr (f, op.val[0]); ++ fprintf (f, ", "); ++ print_generic_expr (f, op.val[1]); ++ break; ++ ++ case 1: ++ print_generic_expr (f, op.val[0]); ++ fprintf (f, ", #, "); ++ print_generic_expr (f, op.val[1]); ++ break; ++ ++ case 2: ++ print_generic_expr (f, op.val[0]); ++ fprintf (f, ", "); ++ print_generic_expr (f, op.val[1]); ++ fprintf (f, ", #"); ++ break; ++ ++ default: ++ fprintf (f, "*, *, *"); ++ } ++ } ++ fprintf (f, ")"); ++ } ++ + if (c->code == predicate::is_not_constant) + { + fprintf (f, " not constant"); +@@ -398,8 +504,9 @@ predicate::remap_after_duplication (clau + for other purposes). 
*/ + + predicate +-predicate::remap_after_inlining (struct ipa_fn_summary *info, +- struct ipa_fn_summary *callee_info, ++predicate::remap_after_inlining (class ipa_fn_summary *info, ++ class ipa_node_params *params_summary, ++ class ipa_fn_summary *callee_info, + vec operand_map, + vec offset_map, + clause_t possible_truths, +@@ -460,10 +567,10 @@ predicate::remap_after_inlining (struct + ap.offset = c->offset + offset_delta; + ap.agg_contents = c->agg_contents; + ap.by_ref = c->by_ref; +- cond_predicate = add_condition (info, ++ cond_predicate = add_condition (info, params_summary, + operand_map[c->operand_num], +- c->size, &ap, c->code, +- c->val); ++ c->type, &ap, c->code, ++ c->val, c->param_ops); + } + } + /* Fixed conditions remains same, construct single +@@ -483,7 +590,7 @@ predicate::remap_after_inlining (struct + /* Read predicate from IB. */ + + void +-predicate::stream_in (struct lto_input_block *ib) ++predicate::stream_in (class lto_input_block *ib) + { + clause_t clause; + int k = 0; +@@ -516,21 +623,28 @@ predicate::stream_out (struct output_blo + } + + +-/* Add condition to condition list SUMMARY. OPERAND_NUM, SIZE, CODE and VAL +- correspond to fields of condition structure. AGGPOS describes whether the +- used operand is loaded from an aggregate and where in the aggregate it is. +- It can be NULL, which means this not a load from an aggregate. */ ++/* Add condition to condition list SUMMARY. OPERAND_NUM, TYPE, CODE, VAL and ++ PARAM_OPS correspond to fields of condition structure. AGGPOS describes ++ whether the used operand is loaded from an aggregate and where in the ++ aggregate it is. It can be NULL, which means this not a load from an ++ aggregate. */ + + predicate +-add_condition (struct ipa_fn_summary *summary, int operand_num, +- HOST_WIDE_INT size, struct agg_position_info *aggpos, +- enum tree_code code, tree val) ++add_condition (class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, ++ int operand_num, ++ tree type, struct agg_position_info *aggpos, ++ enum tree_code code, tree val, expr_eval_ops param_ops) + { +- int i; ++ int i, j; + struct condition *c; + struct condition new_cond; + HOST_WIDE_INT offset; + bool agg_contents, by_ref; ++ expr_eval_op *op; ++ ++ if (params_summary) ++ ipa_set_param_used_by_ipa_predicates (params_summary, operand_num, true); + + if (aggpos) + { +@@ -549,10 +663,11 @@ add_condition (struct ipa_fn_summary *su + for (i = 0; vec_safe_iterate (summary->conds, i, &c); i++) + { + if (c->operand_num == operand_num +- && c->size == size + && c->code == code +- && c->val == val ++ && types_compatible_p (c->type, type) ++ && vrp_operand_equal_p (c->val, val) + && c->agg_contents == agg_contents ++ && expr_eval_ops_equal_p (c->param_ops, param_ops) + && (!agg_contents || (c->offset == offset && c->by_ref == by_ref))) + return predicate::predicate_testing_cond (i); + } +@@ -562,11 +677,21 @@ add_condition (struct ipa_fn_summary *su + + new_cond.operand_num = operand_num; + new_cond.code = code; +- new_cond.val = val; ++ new_cond.type = unshare_expr_without_location (type); ++ new_cond.val = val ? 
unshare_expr_without_location (val) : val; + new_cond.agg_contents = agg_contents; + new_cond.by_ref = by_ref; + new_cond.offset = offset; +- new_cond.size = size; ++ new_cond.param_ops = vec_safe_copy (param_ops); ++ ++ for (j = 0; vec_safe_iterate (new_cond.param_ops, j, &op); j++) ++ { ++ if (op->val[0]) ++ op->val[0] = unshare_expr_without_location (op->val[0]); ++ if (op->val[1]) ++ op->val[1] = unshare_expr_without_location (op->val[1]); ++ } ++ + vec_safe_push (summary->conds, new_cond); + + return predicate::predicate_testing_cond (i); +diff -Nurp a/gcc/ipa-predicate.h b/gcc/ipa-predicate.h +--- a/gcc/ipa-predicate.h 2020-04-30 15:14:04.612000000 +0800 ++++ b/gcc/ipa-predicate.h 2020-04-30 15:14:56.620000000 +0800 +@@ -22,16 +22,36 @@ along with GCC; see the file COPYING3. + inlined into (i.e. known constant values of function parameters. + + Conditions that are interesting for function body are collected into CONDS +- vector. They are of simple for function_param OP VAL, where VAL is +- IPA invariant. The conditions are then referred by predicates. */ ++ vector. They are of simple as kind of a mathematical transformation on ++ function parameter, T(function_param), in which the parameter occurs only ++ once, and other operands are IPA invariant. The conditions are then ++ referred by predicates. */ ++ ++ ++/* A simplified representation of tree node, for unary, binary and ternary ++ operation. Computations on parameter are decomposed to a series of this ++ kind of structure. */ ++struct GTY(()) expr_eval_op ++{ ++ /* Result type of expression. */ ++ tree type; ++ /* Constant operands in expression, there are at most two. */ ++ tree val[2]; ++ /* Index of parameter operand in expression. */ ++ unsigned index : 2; ++ /* Operation code of expression. */ ++ ENUM_BITFIELD(tree_code) code : 16; ++}; ++ ++typedef vec *expr_eval_ops; + + struct GTY(()) condition + { + /* If agg_contents is set, this is the offset from which the used data was + loaded. */ + HOST_WIDE_INT offset; +- /* Size of the access reading the data (or the PARM_DECL SSA_NAME). */ +- HOST_WIDE_INT size; ++ /* Type of the access reading the data (or the PARM_DECL SSA_NAME). */ ++ tree type; + tree val; + int operand_num; + ENUM_BITFIELD(tree_code) code : 16; +@@ -41,6 +61,9 @@ struct GTY(()) condition + /* If agg_contents is set, this differentiates between loads from data + passed by reference and by value. */ + unsigned by_ref : 1; ++ /* A set of sequential operations on the parameter, which can be seen as ++ a mathmatical function on the parameter. */ ++ expr_eval_ops param_ops; + }; + + /* Information kept about parameter of call site. */ +@@ -54,6 +77,14 @@ struct inline_param_summary + + Value 0 is reserved for compile time invariants. */ + int change_prob; ++ bool equal_to (const inline_param_summary &other) const ++ { ++ return change_prob == other.change_prob; ++ } ++ bool useless_p (void) const ++ { ++ return change_prob == REG_BR_PROB_BASE; ++ } + }; + + typedef vec *conditions; +@@ -205,11 +236,12 @@ public: + predicate remap_after_duplication (clause_t); + + /* Return predicate equal to THIS after inlining. 
*/ +- predicate remap_after_inlining (struct ipa_fn_summary *, +- struct ipa_fn_summary *, ++ predicate remap_after_inlining (class ipa_fn_summary *, ++ class ipa_node_params *params_summary, ++ class ipa_fn_summary *, + vec, vec, clause_t, const predicate &); + +- void stream_in (struct lto_input_block *); ++ void stream_in (class lto_input_block *); + void stream_out (struct output_block *); + + private: +@@ -227,6 +259,9 @@ private: + }; + + void dump_condition (FILE *f, conditions conditions, int cond); +-predicate add_condition (struct ipa_fn_summary *summary, int operand_num, +- HOST_WIDE_INT size, struct agg_position_info *aggpos, +- enum tree_code code, tree val); ++predicate add_condition (class ipa_fn_summary *summary, ++ class ipa_node_params *params_summary, ++ int operand_num, ++ tree type, struct agg_position_info *aggpos, ++ enum tree_code code, tree val, ++ expr_eval_ops param_ops = NULL); +diff -Nurp a/gcc/ipa-profile.c b/gcc/ipa-profile.c +--- a/gcc/ipa-profile.c 2020-04-30 15:14:04.632000000 +0800 ++++ b/gcc/ipa-profile.c 2020-04-30 15:14:56.652000000 +0800 +@@ -326,8 +326,8 @@ ipa_propagate_frequency_1 (struct cgraph + if (profile_info + && !(edge->callee->count.ipa () == profile_count::zero ()) + && (edge->caller->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED +- || (edge->caller->global.inlined_to +- && edge->caller->global.inlined_to->frequency ++ || (edge->caller->inlined_to ++ && edge->caller->inlined_to->frequency + != NODE_FREQUENCY_UNLIKELY_EXECUTED))) + d->maybe_unlikely_executed = false; + if (edge->count.ipa ().initialized_p () +@@ -477,6 +477,29 @@ ipa_propagate_frequency (struct cgraph_n + return changed; + } + ++/* Check that number of arguments of N agrees with E. ++ Be conservative when summaries are not present. */ ++ ++static bool ++check_argument_count (struct cgraph_node *n, struct cgraph_edge *e) ++{ ++ if (!ipa_node_params_sum || !ipa_edge_args_sum) ++ return true; ++ class ipa_node_params *info = IPA_NODE_REF (n->function_symbol ()); ++ if (!info) ++ return true; ++ if (!info->descriptors) ++ return true; ++ ipa_edge_args *e_info = IPA_EDGE_REF (e); ++ if (!e) ++ return true; ++ if (ipa_get_param_count (info) != ipa_get_cs_argument_count (e_info) ++ && (ipa_get_param_count (info) >= ipa_get_cs_argument_count (e_info) ++ || !stdarg_p (TREE_TYPE (n->decl)))) ++ return false; ++ return true; ++} ++ + /* Simple ipa profile pass propagating frequencies across the callgraph. 
*/ + + static unsigned int +@@ -600,14 +623,7 @@ ipa_profile (void) + "Not speculating: target is overwritable " + "and can be discarded.\n"); + } +- else if (ipa_node_params_sum && ipa_edge_args_sum +- && (!vec_safe_is_empty +- (IPA_NODE_REF (n2)->descriptors)) +- && ipa_get_param_count (IPA_NODE_REF (n2)) +- != ipa_get_cs_argument_count (IPA_EDGE_REF (e)) +- && (ipa_get_param_count (IPA_NODE_REF (n2)) +- >= ipa_get_cs_argument_count (IPA_EDGE_REF (e)) +- || !stdarg_p (TREE_TYPE (n2->decl)))) ++ else if (!check_argument_count (n2, e)) + { + nmismatch++; + if (dump_file) +diff -Nurp a/gcc/ipa-prop.c b/gcc/ipa-prop.c +--- a/gcc/ipa-prop.c 2020-04-30 15:14:04.616000000 +0800 ++++ b/gcc/ipa-prop.c 2020-04-30 15:14:56.676000000 +0800 +@@ -203,7 +203,7 @@ ipa_get_param_decl_index_1 (vecdescriptors, ptree); + } +@@ -227,8 +227,10 @@ ipa_populate_param_decls (struct cgraph_ + for (parm = fnargs; parm; parm = DECL_CHAIN (parm)) + { + descriptors[param_num].decl_or_type = parm; +- descriptors[param_num].move_cost = estimate_move_cost (TREE_TYPE (parm), +- true); ++ unsigned int cost = estimate_move_cost (TREE_TYPE (parm), true); ++ descriptors[param_num].move_cost = cost; ++ /* Watch overflow, move_cost is a bitfield. */ ++ gcc_checking_assert (cost == descriptors[param_num].move_cost); + param_num++; + } + } +@@ -253,7 +255,7 @@ count_formal_params (tree fndecl) + using ipa_initialize_node_params. */ + + void +-ipa_dump_param (FILE *file, struct ipa_node_params *info, int i) ++ipa_dump_param (FILE *file, class ipa_node_params *info, int i) + { + fprintf (file, "param #%i", i); + if ((*info->descriptors)[i].decl_or_type) +@@ -269,7 +271,7 @@ ipa_dump_param (FILE *file, struct ipa_n + static bool + ipa_alloc_node_params (struct cgraph_node *node, int param_count) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF_GET_CREATE (node); + + if (!info->descriptors && param_count) + { +@@ -287,7 +289,7 @@ ipa_alloc_node_params (struct cgraph_nod + void + ipa_initialize_node_params (struct cgraph_node *node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF_GET_CREATE (node); + + if (!info->descriptors + && ipa_alloc_node_params (node, count_formal_params (node->decl))) +@@ -359,23 +361,50 @@ ipa_print_node_jump_functions_for_edge ( + + fprintf (f, " Aggregate passed by %s:\n", + jump_func->agg.by_ref ? "reference" : "value"); +- FOR_EACH_VEC_SAFE_ELT (jump_func->agg.items, j, item) ++ FOR_EACH_VEC_ELT (*jump_func->agg.items, j, item) + { + fprintf (f, " offset: " HOST_WIDE_INT_PRINT_DEC ", ", + item->offset); +- if (TYPE_P (item->value)) +- fprintf (f, "clobber of " HOST_WIDE_INT_PRINT_DEC " bits", +- tree_to_uhwi (TYPE_SIZE (item->value))); +- else ++ fprintf (f, "type: "); ++ print_generic_expr (f, item->type); ++ fprintf (f, ", "); ++ if (item->jftype == IPA_JF_PASS_THROUGH) ++ fprintf (f, "PASS THROUGH: %d,", ++ item->value.pass_through.formal_id); ++ else if (item->jftype == IPA_JF_LOAD_AGG) ++ { ++ fprintf (f, "LOAD AGG: %d", ++ item->value.pass_through.formal_id); ++ fprintf (f, " [offset: " HOST_WIDE_INT_PRINT_DEC ", by %s],", ++ item->value.load_agg.offset, ++ item->value.load_agg.by_ref ? 
"reference" ++ : "value"); ++ } ++ ++ if (item->jftype == IPA_JF_PASS_THROUGH ++ || item->jftype == IPA_JF_LOAD_AGG) ++ { ++ fprintf (f, " op %s", ++ get_tree_code_name (item->value.pass_through.operation)); ++ if (item->value.pass_through.operation != NOP_EXPR) ++ { ++ fprintf (f, " "); ++ print_generic_expr (f, item->value.pass_through.operand); ++ } ++ } ++ else if (item->jftype == IPA_JF_CONST) + { +- fprintf (f, "cst: "); +- print_generic_expr (f, item->value); ++ fprintf (f, "CONST: "); ++ print_generic_expr (f, item->value.constant); + } ++ else if (item->jftype == IPA_JF_UNKNOWN) ++ fprintf (f, "UNKNOWN: " HOST_WIDE_INT_PRINT_DEC " bits", ++ tree_to_uhwi (TYPE_SIZE (item->type))); + fprintf (f, "\n"); + } + } + +- struct ipa_polymorphic_call_context *ctx ++ class ipa_polymorphic_call_context *ctx + = ipa_get_ith_polymorhic_call_context (IPA_EDGE_REF (cs), i); + if (ctx && !ctx->useless_p ()) + { +@@ -432,7 +461,7 @@ ipa_print_node_jump_functions (FILE *f, + + for (cs = node->indirect_calls; cs; cs = cs->next_callee) + { +- struct cgraph_indirect_call_info *ii; ++ class cgraph_indirect_call_info *ii; + if (!ipa_edge_args_info_available_for_edge_p (cs)) + continue; + +@@ -1059,7 +1088,7 @@ bool + ipa_load_from_parm_agg (struct ipa_func_body_info *fbi, + vec *descriptors, + gimple *stmt, tree op, int *index_p, +- HOST_WIDE_INT *offset_p, HOST_WIDE_INT *size_p, ++ HOST_WIDE_INT *offset_p, poly_int64 *size_p, + bool *by_ref_p, bool *guaranteed_unmodified) + { + int index; +@@ -1135,6 +1164,67 @@ ipa_load_from_parm_agg (struct ipa_func_ + return false; + } + ++/* If STMT is an assignment that loads a value from a parameter declaration, ++ or from an aggregate passed as the parameter either by value or reference, ++ return the index of the parameter in ipa_node_params. Otherwise return -1. ++ ++ FBI holds gathered information about the function. INFO describes ++ parameters of the function, STMT is the assignment statement. If it is a ++ memory load from an aggregate, *OFFSET_P is filled with offset within the ++ aggregate, and *BY_REF_P specifies whether the aggregate is passed by ++ reference. */ ++ ++static int ++load_from_unmodified_param_or_agg (struct ipa_func_body_info *fbi, ++ class ipa_node_params *info, ++ gimple *stmt, ++ HOST_WIDE_INT *offset_p, ++ bool *by_ref_p) ++{ ++ int index = load_from_unmodified_param (fbi, info->descriptors, stmt); ++ poly_int64 size; ++ ++ /* Load value from a parameter declaration. */ ++ if (index >= 0) ++ { ++ *offset_p = -1; ++ return index; ++ } ++ ++ if (!gimple_assign_load_p (stmt)) ++ return -1; ++ ++ tree rhs = gimple_assign_rhs1 (stmt); ++ ++ /* Skip memory reference containing VIEW_CONVERT_EXPR. */ ++ for (tree t = rhs; handled_component_p (t); t = TREE_OPERAND (t, 0)) ++ if (TREE_CODE (t) == VIEW_CONVERT_EXPR) ++ return -1; ++ ++ /* Skip memory reference containing bit-field. 
*/ ++ if (TREE_CODE (rhs) == BIT_FIELD_REF ++ || contains_bitfld_component_ref_p (rhs)) ++ return -1; ++ ++ if (!ipa_load_from_parm_agg (fbi, info->descriptors, stmt, rhs, &index, ++ offset_p, &size, by_ref_p)) ++ return -1; ++ ++ gcc_assert (!maybe_ne (tree_to_poly_int64 (TYPE_SIZE (TREE_TYPE (rhs))), ++ size)); ++ if (!*by_ref_p) ++ { ++ tree param_type = ipa_get_type (info, index); ++ ++ if (!param_type || !AGGREGATE_TYPE_P (param_type)) ++ return -1; ++ } ++ else if (TREE_THIS_VOLATILE (rhs)) ++ return -1; ++ ++ return index; ++} ++ + /* Given that an actual argument is an SSA_NAME (given in NAME) and is a result + of an assignment statement STMT, try to determine whether we are actually + handling any of the following cases and construct an appropriate jump +@@ -1190,7 +1280,7 @@ ipa_load_from_parm_agg (struct ipa_func_ + + static void + compute_complex_assign_jump_func (struct ipa_func_body_info *fbi, +- struct ipa_node_params *info, ++ class ipa_node_params *info, + struct ipa_jump_func *jfunc, + gcall *call, gimple *stmt, tree name, + tree param_type) +@@ -1346,7 +1436,7 @@ get_ancestor_addr_info (gimple *assign, + + static void + compute_complex_ancestor_jump_func (struct ipa_func_body_info *fbi, +- struct ipa_node_params *info, ++ class ipa_node_params *info, + struct ipa_jump_func *jfunc, + gcall *call, gphi *phi) + { +@@ -1440,11 +1530,11 @@ type_like_member_ptr_p (tree type, tree + } + + /* If RHS is an SSA_NAME and it is defined by a simple copy assign statement, +- return the rhs of its defining statement. Otherwise return RHS as it +- is. */ ++ return the rhs of its defining statement, and this statement is stored in ++ *RHS_STMT. Otherwise return RHS as it is. */ + + static inline tree +-get_ssa_def_if_simple_copy (tree rhs) ++get_ssa_def_if_simple_copy (tree rhs, gimple **rhs_stmt) + { + while (TREE_CODE (rhs) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (rhs)) + { +@@ -1454,100 +1544,323 @@ get_ssa_def_if_simple_copy (tree rhs) + rhs = gimple_assign_rhs1 (def_stmt); + else + break; ++ *rhs_stmt = def_stmt; + } + return rhs; + } + +-/* Simple linked list, describing known contents of an aggregate beforere +- call. */ ++/* Simple linked list, describing contents of an aggregate before call. */ + + struct ipa_known_agg_contents_list + { + /* Offset and size of the described part of the aggregate. */ + HOST_WIDE_INT offset, size; +- /* Known constant value or NULL if the contents is known to be unknown. */ +- tree constant; ++ ++ /* Type of the described part of the aggregate. */ ++ tree type; ++ ++ /* Known constant value or jump function data describing contents. */ ++ struct ipa_load_agg_data value; ++ + /* Pointer to the next structure in the list. */ + struct ipa_known_agg_contents_list *next; + }; + +-/* Find the proper place in linked list of ipa_known_agg_contents_list +- structures where to put a new one with the given LHS_OFFSET and LHS_SIZE, +- unless there is a partial overlap, in which case return NULL, or such +- element is already there, in which case set *ALREADY_THERE to true. */ +- +-static struct ipa_known_agg_contents_list ** +-get_place_in_agg_contents_list (struct ipa_known_agg_contents_list **list, +- HOST_WIDE_INT lhs_offset, +- HOST_WIDE_INT lhs_size, +- bool *already_there) ++/* Add an aggregate content item into a linked list of ++ ipa_known_agg_contents_list structure, in which all elements ++ are sorted ascendingly by offset. 
*/ ++ ++static inline void ++add_to_agg_contents_list (struct ipa_known_agg_contents_list **plist, ++ struct ipa_known_agg_contents_list *item) + { +- struct ipa_known_agg_contents_list **p = list; +- while (*p && (*p)->offset < lhs_offset) ++ struct ipa_known_agg_contents_list *list = *plist; ++ ++ for (; list; list = list->next) + { +- if ((*p)->offset + (*p)->size > lhs_offset) +- return NULL; +- p = &(*p)->next; ++ if (list->offset >= item->offset) ++ break; ++ ++ plist = &list->next; + } + +- if (*p && (*p)->offset < lhs_offset + lhs_size) ++ item->next = list; ++ *plist = item; ++} ++ ++/* Check whether a given aggregate content is clobbered by certain element in ++ a linked list of ipa_known_agg_contents_list. */ ++ ++static inline bool ++clobber_by_agg_contents_list_p (struct ipa_known_agg_contents_list *list, ++ struct ipa_known_agg_contents_list *item) ++{ ++ for (; list; list = list->next) + { +- if ((*p)->offset == lhs_offset && (*p)->size == lhs_size) +- /* We already know this value is subsequently overwritten with +- something else. */ +- *already_there = true; +- else +- /* Otherwise this is a partial overlap which we cannot +- represent. */ +- return NULL; ++ if (list->offset >= item->offset) ++ return list->offset < item->offset + item->size; ++ ++ if (list->offset + list->size > item->offset) ++ return true; + } +- return p; ++ ++ return false; + } + + /* Build aggregate jump function from LIST, assuming there are exactly +- CONST_COUNT constant entries there and that th offset of the passed argument ++ VALUE_COUNT entries there and that offset of the passed argument + is ARG_OFFSET and store it into JFUNC. */ + + static void + build_agg_jump_func_from_list (struct ipa_known_agg_contents_list *list, +- int const_count, HOST_WIDE_INT arg_offset, ++ int value_count, HOST_WIDE_INT arg_offset, + struct ipa_jump_func *jfunc) + { +- vec_alloc (jfunc->agg.items, const_count); +- while (list) ++ vec_alloc (jfunc->agg.items, value_count); ++ for (; list; list = list->next) ++ { ++ struct ipa_agg_jf_item item; ++ tree operand = list->value.pass_through.operand; ++ ++ if (list->value.pass_through.formal_id >= 0) ++ { ++ /* Content value is derived from some formal paramerter. */ ++ if (list->value.offset >= 0) ++ item.jftype = IPA_JF_LOAD_AGG; ++ else ++ item.jftype = IPA_JF_PASS_THROUGH; ++ ++ item.value.load_agg = list->value; ++ if (operand) ++ item.value.pass_through.operand ++ = unshare_expr_without_location (operand); ++ } ++ else if (operand) ++ { ++ /* Content value is known constant. */ ++ item.jftype = IPA_JF_CONST; ++ item.value.constant = unshare_expr_without_location (operand); ++ } ++ else ++ continue; ++ ++ item.type = list->type; ++ gcc_assert (tree_to_shwi (TYPE_SIZE (list->type)) == list->size); ++ ++ item.offset = list->offset - arg_offset; ++ gcc_assert ((item.offset % BITS_PER_UNIT) == 0); ++ ++ jfunc->agg.items->quick_push (item); ++ } ++} ++ ++/* Given an assignment statement STMT, try to collect information into ++ AGG_VALUE that will be used to construct jump function for RHS of the ++ assignment, from which content value of an aggregate part comes. ++ ++ Besides constant and simple pass-through jump functions, also try to ++ identify whether it matches the following pattern that can be described by ++ a load-value-from-aggregate jump function, which is a derivative of simple ++ pass-through jump function. ++ ++ foo (int *p) ++ { ++ ... 
++ ++ *(q_5 + 4) = *(p_3(D) + 28) op 1; ++ bar (q_5); ++ } ++ ++ Here IPA_LOAD_AGG_DATA data structure is informative enough to describe ++ constant, simple pass-through and load-vale-from-aggregate. If value ++ is constant, it will be kept in field OPERAND, and field FORMAL_ID is ++ set to -1. For simple pass-through and load-value-from-aggregate, field ++ FORMAL_ID specifies the related formal parameter index, and field ++ OFFSET can be used to distinguish them, -1 means simple pass-through, ++ otherwise means load-value-from-aggregate. */ ++ ++static void ++analyze_agg_content_value (struct ipa_func_body_info *fbi, ++ struct ipa_load_agg_data *agg_value, ++ gimple *stmt) ++{ ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ enum tree_code code; ++ int index = -1; ++ ++ /* Initialize jump function data for the aggregate part. */ ++ memset (agg_value, 0, sizeof (*agg_value)); ++ agg_value->pass_through.operation = NOP_EXPR; ++ agg_value->pass_through.formal_id = -1; ++ agg_value->offset = -1; ++ ++ if (AGGREGATE_TYPE_P (TREE_TYPE (lhs)) /* TODO: Support aggregate type. */ ++ || TREE_THIS_VOLATILE (lhs) ++ || TREE_CODE (lhs) == BIT_FIELD_REF ++ || contains_bitfld_component_ref_p (lhs)) ++ return; ++ ++ /* Skip SSA copies. */ ++ while (gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS) ++ { ++ if (TREE_CODE (rhs1) != SSA_NAME || SSA_NAME_IS_DEFAULT_DEF (rhs1)) ++ break; ++ ++ stmt = SSA_NAME_DEF_STMT (rhs1); ++ if (!is_gimple_assign (stmt)) ++ return; ++ ++ rhs1 = gimple_assign_rhs1 (stmt); ++ } ++ ++ code = gimple_assign_rhs_code (stmt); ++ switch (gimple_assign_rhs_class (stmt)) + { +- if (list->constant) ++ case GIMPLE_SINGLE_RHS: ++ if (is_gimple_ip_invariant (rhs1)) + { +- struct ipa_agg_jf_item item; +- item.offset = list->offset - arg_offset; +- gcc_assert ((item.offset % BITS_PER_UNIT) == 0); +- item.value = unshare_expr_without_location (list->constant); +- jfunc->agg.items->quick_push (item); ++ agg_value->pass_through.operand = rhs1; ++ return; + } +- list = list->next; ++ code = NOP_EXPR; ++ break; ++ ++ case GIMPLE_UNARY_RHS: ++ /* NOTE: A GIMPLE_UNARY_RHS operation might not be tcc_unary ++ (truth_not_expr is example), GIMPLE_BINARY_RHS does not imply ++ tcc_binary, this subtleness is somewhat misleading. ++ ++ Since tcc_unary is widely used in IPA-CP code to check an operation ++ with one operand, here we only allow tc_unary operation to avoid ++ possible problem. Then we can use (opclass == tc_unary) or not to ++ distinguish unary and binary. 
*/ ++ if (TREE_CODE_CLASS (code) != tcc_unary || CONVERT_EXPR_CODE_P (code)) ++ return; ++ ++ rhs1 = get_ssa_def_if_simple_copy (rhs1, &stmt); ++ break; ++ ++ case GIMPLE_BINARY_RHS: ++ { ++ gimple *rhs1_stmt = stmt; ++ gimple *rhs2_stmt = stmt; ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ ++ rhs1 = get_ssa_def_if_simple_copy (rhs1, &rhs1_stmt); ++ rhs2 = get_ssa_def_if_simple_copy (rhs2, &rhs2_stmt); ++ ++ if (is_gimple_ip_invariant (rhs2)) ++ { ++ agg_value->pass_through.operand = rhs2; ++ stmt = rhs1_stmt; ++ } ++ else if (is_gimple_ip_invariant (rhs1)) ++ { ++ if (TREE_CODE_CLASS (code) == tcc_comparison) ++ code = swap_tree_comparison (code); ++ else if (!commutative_tree_code (code)) ++ return; ++ ++ agg_value->pass_through.operand = rhs1; ++ stmt = rhs2_stmt; ++ rhs1 = rhs2; ++ } ++ else ++ return; ++ ++ if (TREE_CODE_CLASS (code) != tcc_comparison ++ && !useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs1))) ++ return; ++ } ++ break; ++ ++ default: ++ return; ++ } ++ ++ if (TREE_CODE (rhs1) != SSA_NAME) ++ index = load_from_unmodified_param_or_agg (fbi, fbi->info, stmt, ++ &agg_value->offset, ++ &agg_value->by_ref); ++ else if (SSA_NAME_IS_DEFAULT_DEF (rhs1)) ++ index = ipa_get_param_decl_index (fbi->info, SSA_NAME_VAR (rhs1)); ++ ++ if (index >= 0) ++ { ++ if (agg_value->offset >= 0) ++ agg_value->type = TREE_TYPE (rhs1); ++ agg_value->pass_through.formal_id = index; ++ agg_value->pass_through.operation = code; + } ++ else ++ agg_value->pass_through.operand = NULL_TREE; ++} ++ ++/* If STMT is a memory store to the object whose address is BASE, extract ++ information (offset, size, and value) into CONTENT, and return true, ++ otherwise we conservatively assume the whole object is modified with ++ unknown content, and return false. CHECK_REF means that access to object ++ is expected to be in form of MEM_REF expression. */ ++ ++static bool ++extract_mem_content (struct ipa_func_body_info *fbi, ++ gimple *stmt, tree base, bool check_ref, ++ struct ipa_known_agg_contents_list *content) ++{ ++ HOST_WIDE_INT lhs_offset, lhs_size; ++ bool reverse; ++ ++ if (!is_gimple_assign (stmt)) ++ return false; ++ ++ tree lhs = gimple_assign_lhs (stmt); ++ tree lhs_base = get_ref_base_and_extent_hwi (lhs, &lhs_offset, &lhs_size, ++ &reverse); ++ if (!lhs_base) ++ return false; ++ ++ if (check_ref) ++ { ++ if (TREE_CODE (lhs_base) != MEM_REF ++ || TREE_OPERAND (lhs_base, 0) != base ++ || !integer_zerop (TREE_OPERAND (lhs_base, 1))) ++ return false; ++ } ++ else if (lhs_base != base) ++ return false; ++ ++ content->offset = lhs_offset; ++ content->size = lhs_size; ++ content->type = TREE_TYPE (lhs); ++ content->next = NULL; ++ ++ analyze_agg_content_value (fbi, &content->value, stmt); ++ return true; + } + + /* Traverse statements from CALL backwards, scanning whether an aggregate given +- in ARG is filled in with constant values. ARG can either be an aggregate +- expression or a pointer to an aggregate. ARG_TYPE is the type of the +- aggregate. JFUNC is the jump function into which the constants are +- subsequently stored. */ ++ in ARG is filled in constants or values that are derived from caller's ++ formal parameter in the way described by some kinds of jump functions. FBI ++ is the context of the caller function for interprocedural analysis. ARG can ++ either be an aggregate expression or a pointer to an aggregate. ARG_TYPE is ++ the type of the aggregate, JFUNC is the jump function for the aggregate. 
*/ + + static void +-determine_locally_known_aggregate_parts (gcall *call, tree arg, +- tree arg_type, +- struct ipa_jump_func *jfunc) +-{ +- struct ipa_known_agg_contents_list *list = NULL; +- int item_count = 0, const_count = 0; ++determine_known_aggregate_parts (struct ipa_func_body_info *fbi, ++ gcall *call, tree arg, ++ tree arg_type, ++ struct ipa_jump_func *jfunc) ++{ ++ struct ipa_known_agg_contents_list *list = NULL, *all_list = NULL; ++ bitmap visited = NULL; ++ int item_count = 0, value_count = 0; + HOST_WIDE_INT arg_offset, arg_size; +- gimple_stmt_iterator gsi; + tree arg_base; + bool check_ref, by_ref; + ao_ref r; + +- if (PARAM_VALUE (PARAM_IPA_MAX_AGG_ITEMS) == 0) ++ if ( PARAM_VALUE (PARAM_IPA_MAX_AGG_ITEMS) == 0) + return; + + /* The function operates in three stages. First, we prepare check_ref, r, +@@ -1606,91 +1919,73 @@ determine_locally_known_aggregate_parts + ao_ref_init (&r, arg); + } + +- /* Second stage walks back the BB, looks at individual statements and as long +- as it is confident of how the statements affect contents of the +- aggregates, it builds a sorted linked list of ipa_agg_jf_list structures +- describing it. */ +- gsi = gsi_for_stmt (call); +- gsi_prev (&gsi); +- for (; !gsi_end_p (gsi); gsi_prev (&gsi)) +- { +- struct ipa_known_agg_contents_list *n, **p; +- gimple *stmt = gsi_stmt (gsi); +- HOST_WIDE_INT lhs_offset, lhs_size; +- tree lhs, rhs, lhs_base; +- bool reverse; +- +- if (!stmt_may_clobber_ref_p_1 (stmt, &r)) +- continue; +- if (!gimple_assign_single_p (stmt)) +- break; +- +- lhs = gimple_assign_lhs (stmt); +- rhs = gimple_assign_rhs1 (stmt); +- if (!is_gimple_reg_type (TREE_TYPE (rhs)) +- || TREE_CODE (lhs) == BIT_FIELD_REF +- || contains_bitfld_component_ref_p (lhs)) +- break; ++ /* Second stage traverses virtual SSA web backwards starting from the call ++ statement, only looks at individual dominating virtual operand (its ++ definition dominates the call), as long as it is confident that content ++ of the aggregate is affected by definition of the virtual operand, it ++ builds a sorted linked list of ipa_agg_jf_list describing that. */ + +- lhs_base = get_ref_base_and_extent_hwi (lhs, &lhs_offset, +- &lhs_size, &reverse); +- if (!lhs_base) +- break; ++ for (tree dom_vuse = gimple_vuse (call); dom_vuse;) ++ { ++ gimple *stmt = SSA_NAME_DEF_STMT (dom_vuse); + +- if (check_ref) ++ if (gimple_code (stmt) == GIMPLE_PHI) + { +- if (TREE_CODE (lhs_base) != MEM_REF +- || TREE_OPERAND (lhs_base, 0) != arg_base +- || !integer_zerop (TREE_OPERAND (lhs_base, 1))) +- break; ++ dom_vuse = get_continuation_for_phi (stmt, &r, true, ++ fbi->aa_walk_budget, ++ &visited, false, NULL, NULL); ++ continue; + } +- else if (lhs_base != arg_base) ++ ++ if (stmt_may_clobber_ref_p_1 (stmt, &r)) + { +- if (DECL_P (lhs_base)) +- continue; +- else ++ struct ipa_known_agg_contents_list *content ++ = XALLOCA (struct ipa_known_agg_contents_list); ++ ++ if (!extract_mem_content (fbi, stmt, arg_base, check_ref, content)) + break; +- } + +- bool already_there = false; +- p = get_place_in_agg_contents_list (&list, lhs_offset, lhs_size, +- &already_there); +- if (!p) +- break; +- if (already_there) +- continue; ++ /* Now we get a dominating virtual operand, and need to check ++ whether its value is clobbered any other dominating one. 
*/ ++ if ((content->value.pass_through.formal_id >= 0 ++ || content->value.pass_through.operand) ++ && !clobber_by_agg_contents_list_p (all_list, content)) ++ { ++ struct ipa_known_agg_contents_list *copy ++ = XALLOCA (struct ipa_known_agg_contents_list); + +- rhs = get_ssa_def_if_simple_copy (rhs); +- n = XALLOCA (struct ipa_known_agg_contents_list); +- n->size = lhs_size; +- n->offset = lhs_offset; +- if (is_gimple_ip_invariant (rhs)) +- { +- n->constant = rhs; +- const_count++; ++ /* Add to the list consisting of only dominating virtual ++ operands, whose definitions can finally reach the call. */ ++ add_to_agg_contents_list (&list, (*copy = *content, copy)); ++ ++ if (++value_count == PARAM_VALUE (PARAM_IPA_MAX_AGG_ITEMS)) ++ break; ++ } ++ ++ /* Add to the list consisting of all dominating virtual operands. */ ++ add_to_agg_contents_list (&all_list, content); ++ ++ if (++item_count == 2 * PARAM_VALUE (PARAM_IPA_MAX_AGG_ITEMS)) ++ break; + } +- else +- n->constant = NULL_TREE; +- n->next = *p; +- *p = n; +- +- item_count++; +- if (const_count == PARAM_VALUE (PARAM_IPA_MAX_AGG_ITEMS) +- || item_count == 2 * PARAM_VALUE (PARAM_IPA_MAX_AGG_ITEMS)) +- break; +- } ++ dom_vuse = gimple_vuse (stmt); ++ } ++ ++ if (visited) ++ BITMAP_FREE (visited); + + /* Third stage just goes over the list and creates an appropriate vector of +- ipa_agg_jf_item structures out of it, of sourse only if there are +- any known constants to begin with. */ ++ ipa_agg_jf_item structures out of it, of course only if there are ++ any meaningful items to begin with. */ + +- if (const_count) ++ if (value_count) + { + jfunc->agg.by_ref = by_ref; +- build_agg_jump_func_from_list (list, const_count, arg_offset, jfunc); ++ build_agg_jump_func_from_list (list, value_count, arg_offset, jfunc); + } + } + ++ + /* Return the Ith param type of callee associated with call graph + edge E. */ + +@@ -1797,7 +2092,7 @@ ipa_set_jfunc_vr (ipa_jump_func *jf, enu + jf->m_vr = ipa_get_value_range (type, min, max); + } + +-/* Assign to JF a pointer to a value_range just liek TMP but either fetch a ++/* Assign to JF a pointer to a value_range just like TMP but either fetch a + copy from ipa_vr_hash_table or allocate a new on in GC memory. 
*/ + + static void +@@ -1814,8 +2109,8 @@ static void + ipa_compute_jump_functions_for_edge (struct ipa_func_body_info *fbi, + struct cgraph_edge *cs) + { +- struct ipa_node_params *info = IPA_NODE_REF (cs->caller); +- struct ipa_edge_args *args = IPA_EDGE_REF (cs); ++ class ipa_node_params *info = IPA_NODE_REF (cs->caller); ++ class ipa_edge_args *args = IPA_EDGE_REF_GET_CREATE (cs); + gcall *call = cs->call_stmt; + int n, arg_num = gimple_call_num_args (call); + bool useful_context = false; +@@ -1839,7 +2134,7 @@ ipa_compute_jump_functions_for_edge (str + if (flag_devirtualize && POINTER_TYPE_P (TREE_TYPE (arg))) + { + tree instance; +- struct ipa_polymorphic_call_context context (cs->caller->decl, ++ class ipa_polymorphic_call_context context (cs->caller->decl, + arg, cs->call_stmt, + &instance); + context.get_dynamic_type (instance, arg, NULL, cs->call_stmt, +@@ -1978,7 +2273,7 @@ ipa_compute_jump_functions_for_edge (str + || !ipa_get_jf_ancestor_agg_preserved (jfunc)) + && (AGGREGATE_TYPE_P (TREE_TYPE (arg)) + || POINTER_TYPE_P (param_type))) +- determine_locally_known_aggregate_parts (call, arg, param_type, jfunc); ++ determine_known_aggregate_parts (fbi, call, arg, param_type, jfunc); + } + if (!useful_context) + vec_free (args->polymorphic_call_contexts); +@@ -2076,11 +2371,12 @@ ipa_is_ssa_with_stmt_def (tree t) + + /* Find the indirect call graph edge corresponding to STMT and mark it as a + call to a parameter number PARAM_INDEX. NODE is the caller. Return the +- indirect call graph edge. */ ++ indirect call graph edge. ++ If POLYMORPHIC is true record is as a destination of polymorphic call. */ + + static struct cgraph_edge * + ipa_note_param_call (struct cgraph_node *node, int param_index, +- gcall *stmt) ++ gcall *stmt, bool polymorphic) + { + struct cgraph_edge *cs; + +@@ -2089,6 +2385,11 @@ ipa_note_param_call (struct cgraph_node + cs->indirect_info->agg_contents = 0; + cs->indirect_info->member_ptr = 0; + cs->indirect_info->guaranteed_unmodified = 0; ++ ipa_set_param_used_by_indirect_call (IPA_NODE_REF (node), ++ param_index, true); ++ if (cs->indirect_info->polymorphic || polymorphic) ++ ipa_set_param_used_by_polymorphic_call ++ (IPA_NODE_REF (node), param_index, true); + return cs; + } + +@@ -2155,7 +2456,7 @@ static void + ipa_analyze_indirect_call_uses (struct ipa_func_body_info *fbi, gcall *call, + tree target) + { +- struct ipa_node_params *info = fbi->info; ++ class ipa_node_params *info = fbi->info; + HOST_WIDE_INT offset; + bool by_ref; + +@@ -2164,7 +2465,7 @@ ipa_analyze_indirect_call_uses (struct i + tree var = SSA_NAME_VAR (target); + int index = ipa_get_param_decl_index (info, var); + if (index >= 0) +- ipa_note_param_call (fbi->node, index, call); ++ ipa_note_param_call (fbi->node, index, call, false); + return; + } + +@@ -2176,7 +2477,8 @@ ipa_analyze_indirect_call_uses (struct i + gimple_assign_rhs1 (def), &index, &offset, + NULL, &by_ref, &guaranteed_unmodified)) + { +- struct cgraph_edge *cs = ipa_note_param_call (fbi->node, index, call); ++ struct cgraph_edge *cs = ipa_note_param_call (fbi->node, index, ++ call, false); + cs->indirect_info->offset = offset; + cs->indirect_info->agg_contents = 1; + cs->indirect_info->by_ref = by_ref; +@@ -2277,7 +2579,8 @@ ipa_analyze_indirect_call_uses (struct i + if (index >= 0 + && parm_preserved_before_stmt_p (fbi, index, call, rec)) + { +- struct cgraph_edge *cs = ipa_note_param_call (fbi->node, index, call); ++ struct cgraph_edge *cs = ipa_note_param_call (fbi->node, index, ++ call, false); + 
cs->indirect_info->offset = offset; + cs->indirect_info->agg_contents = 1; + cs->indirect_info->member_ptr = 1; +@@ -2306,7 +2609,7 @@ ipa_analyze_virtual_call_uses (struct ip + if (TREE_CODE (obj) != SSA_NAME) + return; + +- struct ipa_node_params *info = fbi->info; ++ class ipa_node_params *info = fbi->info; + if (SSA_NAME_IS_DEFAULT_DEF (obj)) + { + struct ipa_jump_func jfunc; +@@ -2337,8 +2640,9 @@ ipa_analyze_virtual_call_uses (struct ip + return; + } + +- struct cgraph_edge *cs = ipa_note_param_call (fbi->node, index, call); +- struct cgraph_indirect_call_info *ii = cs->indirect_info; ++ struct cgraph_edge *cs = ipa_note_param_call (fbi->node, index, ++ call, true); ++ class cgraph_indirect_call_info *ii = cs->indirect_info; + ii->offset = anc_offset; + ii->otr_token = tree_to_uhwi (OBJ_TYPE_REF_TOKEN (target)); + ii->otr_type = obj_type_ref_class (target); +@@ -2410,7 +2714,7 @@ ipa_analyze_stmt_uses (struct ipa_func_b + static bool + visit_ref_for_mod_analysis (gimple *, tree op, tree, void *data) + { +- struct ipa_node_params *info = (struct ipa_node_params *) data; ++ class ipa_node_params *info = (class ipa_node_params *) data; + + op = get_base_address (op); + if (op +@@ -2458,7 +2762,7 @@ ipa_analyze_params_uses_in_bb (struct ip + static void + ipa_analyze_controlled_uses (struct cgraph_node *node) + { +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + + for (int i = 0; i < ipa_get_param_count (info); i++) + { +@@ -2550,11 +2854,11 @@ void + ipa_analyze_node (struct cgraph_node *node) + { + struct ipa_func_body_info fbi; +- struct ipa_node_params *info; ++ class ipa_node_params *info; + + ipa_check_create_node_params (); + ipa_check_create_edge_args (); +- info = IPA_NODE_REF (node); ++ info = IPA_NODE_REF_GET_CREATE (node); + + if (info->analysis_done) + return; +@@ -2610,22 +2914,96 @@ static void + update_jump_functions_after_inlining (struct cgraph_edge *cs, + struct cgraph_edge *e) + { +- struct ipa_edge_args *top = IPA_EDGE_REF (cs); +- struct ipa_edge_args *args = IPA_EDGE_REF (e); ++ class ipa_edge_args *top = IPA_EDGE_REF (cs); ++ class ipa_edge_args *args = IPA_EDGE_REF (e); ++ if (!args) ++ return; + int count = ipa_get_cs_argument_count (args); + int i; + + for (i = 0; i < count; i++) + { + struct ipa_jump_func *dst = ipa_get_ith_jump_func (args, i); +- struct ipa_polymorphic_call_context *dst_ctx ++ class ipa_polymorphic_call_context *dst_ctx + = ipa_get_ith_polymorhic_call_context (args, i); + ++ if (dst->agg.items) ++ { ++ struct ipa_agg_jf_item *item; ++ int j; ++ ++ FOR_EACH_VEC_ELT (*dst->agg.items, j, item) ++ { ++ int dst_fid; ++ struct ipa_jump_func *src; ++ ++ if (item->jftype != IPA_JF_PASS_THROUGH ++ && item->jftype != IPA_JF_LOAD_AGG) ++ continue; ++ ++ dst_fid = item->value.pass_through.formal_id; ++ if (!top || dst_fid >= ipa_get_cs_argument_count (top)) ++ { ++ item->jftype = IPA_JF_UNKNOWN; ++ continue; ++ } ++ ++ item->value.pass_through.formal_id = -1; ++ src = ipa_get_ith_jump_func (top, dst_fid); ++ if (src->type == IPA_JF_CONST) ++ { ++ if (item->jftype == IPA_JF_PASS_THROUGH ++ && item->value.pass_through.operation == NOP_EXPR) ++ { ++ item->jftype = IPA_JF_CONST; ++ item->value.constant = src->value.constant.value; ++ continue; ++ } ++ } ++ else if (src->type == IPA_JF_PASS_THROUGH ++ && src->value.pass_through.operation == NOP_EXPR) ++ { ++ if (item->jftype == IPA_JF_PASS_THROUGH ++ || !item->value.load_agg.by_ref ++ || src->value.pass_through.agg_preserved) ++ 
item->value.pass_through.formal_id ++ = src->value.pass_through.formal_id; ++ } ++ else if (src->type == IPA_JF_ANCESTOR) ++ { ++ if (item->jftype == IPA_JF_PASS_THROUGH) ++ { ++ if (!src->value.ancestor.offset) ++ item->value.pass_through.formal_id ++ = src->value.ancestor.formal_id; ++ } ++ else if (src->value.ancestor.agg_preserved) ++ { ++ gcc_checking_assert (item->value.load_agg.by_ref); ++ ++ item->value.pass_through.formal_id ++ = src->value.ancestor.formal_id; ++ item->value.load_agg.offset ++ += src->value.ancestor.offset; ++ } ++ } ++ ++ if (item->value.pass_through.formal_id < 0) ++ item->jftype = IPA_JF_UNKNOWN; ++ } ++ } ++ ++ if (!top) ++ { ++ ipa_set_jf_unknown (dst); ++ continue; ++ } ++ + if (dst->type == IPA_JF_ANCESTOR) + { + struct ipa_jump_func *src; + int dst_fid = dst->value.ancestor.formal_id; +- struct ipa_polymorphic_call_context *src_ctx ++ class ipa_polymorphic_call_context *src_ctx + = ipa_get_ith_polymorhic_call_context (top, dst_fid); + + /* Variable number of arguments can cause havoc if we try to access +@@ -2641,7 +3019,7 @@ update_jump_functions_after_inlining (st + + if (src_ctx && !src_ctx->useless_p ()) + { +- struct ipa_polymorphic_call_context ctx = *src_ctx; ++ class ipa_polymorphic_call_context ctx = *src_ctx; + + /* TODO: Make type preserved safe WRT contexts. */ + if (!ipa_get_jf_ancestor_type_preserved (dst)) +@@ -2660,8 +3038,11 @@ update_jump_functions_after_inlining (st + } + } + +- if (src->agg.items +- && (dst->value.ancestor.agg_preserved || !src->agg.by_ref)) ++ /* Parameter and argument in ancestor jump function must be pointer ++ type, which means access to aggregate must be by-reference. */ ++ gcc_assert (!src->agg.items || src->agg.by_ref); ++ ++ if (src->agg.items && dst->value.ancestor.agg_preserved) + { + struct ipa_agg_jf_item *item; + int j; +@@ -2705,18 +3086,18 @@ update_jump_functions_after_inlining (st + /* We must check range due to calls with variable number of arguments + and we cannot combine jump functions with operations. */ + if (dst->value.pass_through.operation == NOP_EXPR +- && (dst->value.pass_through.formal_id ++ && (top && dst->value.pass_through.formal_id + < ipa_get_cs_argument_count (top))) + { + int dst_fid = dst->value.pass_through.formal_id; + src = ipa_get_ith_jump_func (top, dst_fid); + bool dst_agg_p = ipa_get_jf_pass_through_agg_preserved (dst); +- struct ipa_polymorphic_call_context *src_ctx ++ class ipa_polymorphic_call_context *src_ctx + = ipa_get_ith_polymorhic_call_context (top, dst_fid); + + if (src_ctx && !src_ctx->useless_p ()) + { +- struct ipa_polymorphic_call_context ctx = *src_ctx; ++ class ipa_polymorphic_call_context ctx = *src_ctx; + + /* TODO: Make type preserved safe WRT contexts. */ + if (!ipa_get_jf_pass_through_type_preserved (dst)) +@@ -2856,7 +3237,7 @@ ipa_make_edge_direct_to_target (struct c + + /* Because may-edges are not explicitely represented and vtable may be external, + we may create the first reference to the object in the unit. */ +- if (!callee || callee->global.inlined_to) ++ if (!callee || callee->inlined_to) + { + + /* We are better to ensure we can refer to it. +@@ -2909,7 +3290,7 @@ ipa_make_edge_direct_to_target (struct c + + /* We cannot make edges to inline clones. It is bug that someone removed + the cgraph node too early. 
*/ +- gcc_assert (!callee->global.inlined_to); ++ gcc_assert (!callee->inlined_to); + + if (dump_file && !unreachable) + { +@@ -3059,18 +3440,19 @@ ipa_find_agg_cst_from_init (tree scalar, + return find_constructor_constant_at_offset (DECL_INITIAL (scalar), offset); + } + +-/* Retrieve value from aggregate jump function AGG or static initializer of +- SCALAR (which can be NULL) for the given OFFSET or return NULL if there is +- none. BY_REF specifies whether the value has to be passed by reference or +- by value. If FROM_GLOBAL_CONSTANT is non-NULL, then the boolean it points +- to is set to true if the value comes from an initializer of a constant. */ ++/* Retrieve value from AGG, a set of known offset/value for an aggregate or ++ static initializer of SCALAR (which can be NULL) for the given OFFSET or ++ return NULL if there is none. BY_REF specifies whether the value has to be ++ passed by reference or by value. If FROM_GLOBAL_CONSTANT is non-NULL, then ++ the boolean it points to is set to true if the value comes from an ++ initializer of a constant. */ + + tree +-ipa_find_agg_cst_for_param (struct ipa_agg_jump_function *agg, tree scalar, ++ipa_find_agg_cst_for_param (struct ipa_agg_value_set *agg, tree scalar, + HOST_WIDE_INT offset, bool by_ref, + bool *from_global_constant) + { +- struct ipa_agg_jf_item *item; ++ struct ipa_agg_value *item; + int i; + + if (scalar) +@@ -3088,7 +3470,7 @@ ipa_find_agg_cst_for_param (struct ipa_a + || by_ref != agg->by_ref) + return NULL; + +- FOR_EACH_VEC_SAFE_ELT (agg->items, i, item) ++ FOR_EACH_VEC_ELT (agg->items, i, item) + if (item->offset == offset) + { + /* Currently we do not have clobber values, return NULL for them once +@@ -3184,12 +3566,14 @@ try_decrement_rdesc_refcount (struct ipa + pointer formal parameter described by jump function JFUNC. TARGET_TYPE is + the type of the parameter to which the result of JFUNC is passed. If it can + be determined, return the newly direct edge, otherwise return NULL. +- NEW_ROOT_INFO is the node info that JFUNC lattices are relative to. */ ++ NEW_ROOT and NEW_ROOT_INFO is the node and its info that JFUNC lattices are ++ relative to. */ + + static struct cgraph_edge * + try_make_edge_direct_simple_call (struct cgraph_edge *ie, + struct ipa_jump_func *jfunc, tree target_type, +- struct ipa_node_params *new_root_info) ++ struct cgraph_node *new_root, ++ class ipa_node_params *new_root_info) + { + struct cgraph_edge *cs; + tree target; +@@ -3198,10 +3582,14 @@ try_make_edge_direct_simple_call (struct + if (agg_contents) + { + bool from_global_constant; +- target = ipa_find_agg_cst_for_param (&jfunc->agg, scalar, ++ ipa_agg_value_set agg = ipa_agg_value_set_from_jfunc (new_root_info, ++ new_root, ++ &jfunc->agg); ++ target = ipa_find_agg_cst_for_param (&agg, scalar, + ie->indirect_info->offset, + ie->indirect_info->by_ref, + &from_global_constant); ++ agg.release (); + if (target + && !from_global_constant + && !ie->indirect_info->guaranteed_unmodified) +@@ -3255,12 +3643,16 @@ ipa_impossible_devirt_target (struct cgr + call based on a formal parameter which is described by jump function JFUNC + and if it can be determined, make it direct and return the direct edge. + Otherwise, return NULL. CTX describes the polymorphic context that the +- parameter the call is based on brings along with it. */ ++ parameter the call is based on brings along with it. NEW_ROOT and ++ NEW_ROOT_INFO is the node and its info that JFUNC lattices are relative ++ to. 
*/ + + static struct cgraph_edge * + try_make_edge_direct_virtual_call (struct cgraph_edge *ie, + struct ipa_jump_func *jfunc, +- struct ipa_polymorphic_call_context ctx) ++ class ipa_polymorphic_call_context ctx, ++ struct cgraph_node *new_root, ++ class ipa_node_params *new_root_info) + { + tree target = NULL; + bool speculative = false; +@@ -3278,9 +3670,13 @@ try_make_edge_direct_virtual_call (struc + unsigned HOST_WIDE_INT offset; + tree scalar = (jfunc->type == IPA_JF_CONST) ? ipa_get_jf_constant (jfunc) + : NULL; +- tree t = ipa_find_agg_cst_for_param (&jfunc->agg, scalar, ++ ipa_agg_value_set agg = ipa_agg_value_set_from_jfunc (new_root_info, ++ new_root, ++ &jfunc->agg); ++ tree t = ipa_find_agg_cst_for_param (&agg, scalar, + ie->indirect_info->offset, + true); ++ agg.release (); + if (t && vtable_pointer_value_to_vtable (t, &vtable, &offset)) + { + bool can_refer; +@@ -3370,21 +3766,22 @@ update_indirect_edges_after_inlining (st + struct cgraph_node *node, + vec *new_edges) + { +- struct ipa_edge_args *top; ++ class ipa_edge_args *top; + struct cgraph_edge *ie, *next_ie, *new_direct_edge; +- struct ipa_node_params *new_root_info, *inlined_node_info; ++ struct cgraph_node *new_root; ++ class ipa_node_params *new_root_info, *inlined_node_info; + bool res = false; + + ipa_check_create_edge_args (); + top = IPA_EDGE_REF (cs); +- new_root_info = IPA_NODE_REF (cs->caller->global.inlined_to +- ? cs->caller->global.inlined_to +- : cs->caller); ++ new_root = cs->caller->inlined_to ++ ? cs->caller->inlined_to : cs->caller; ++ new_root_info = IPA_NODE_REF (new_root); + inlined_node_info = IPA_NODE_REF (cs->callee->function_symbol ()); + + for (ie = node->indirect_calls; ie; ie = next_ie) + { +- struct cgraph_indirect_call_info *ici = ie->indirect_info; ++ class cgraph_indirect_call_info *ici = ie->indirect_info; + struct ipa_jump_func *jfunc; + int param_index; + cgraph_node *spec_target = NULL; +@@ -3395,7 +3792,7 @@ update_indirect_edges_after_inlining (st + continue; + + /* We must check range due to calls with variable number of arguments: */ +- if (ici->param_index >= ipa_get_cs_argument_count (top)) ++ if (!top || ici->param_index >= ipa_get_cs_argument_count (top)) + { + ici->param_index = -1; + continue; +@@ -3418,13 +3815,16 @@ update_indirect_edges_after_inlining (st + { + ipa_polymorphic_call_context ctx; + ctx = ipa_context_from_jfunc (new_root_info, cs, param_index, jfunc); +- new_direct_edge = try_make_edge_direct_virtual_call (ie, jfunc, ctx); ++ new_direct_edge = try_make_edge_direct_virtual_call (ie, jfunc, ctx, ++ new_root, ++ new_root_info); + } + else + { + tree target_type = ipa_get_type (inlined_node_info, param_index); + new_direct_edge = try_make_edge_direct_simple_call (ie, jfunc, + target_type, ++ new_root, + new_root_info); + } + +@@ -3470,6 +3870,11 @@ update_indirect_edges_after_inlining (st + if (ici->polymorphic + && !ipa_get_jf_pass_through_type_preserved (jfunc)) + ici->vptr_changed = true; ++ ipa_set_param_used_by_indirect_call (new_root_info, ++ ici->param_index, true); ++ if (ici->polymorphic) ++ ipa_set_param_used_by_polymorphic_call (new_root_info, ++ ici->param_index, true); + } + } + else if (jfunc->type == IPA_JF_ANCESTOR) +@@ -3485,6 +3890,11 @@ update_indirect_edges_after_inlining (st + if (ici->polymorphic + && !ipa_get_jf_ancestor_type_preserved (jfunc)) + ici->vptr_changed = true; ++ ipa_set_param_used_by_indirect_call (new_root_info, ++ ici->param_index, true); ++ if (ici->polymorphic) ++ ipa_set_param_used_by_polymorphic_call (new_root_info, 
++ ici->param_index, true); + } + } + else +@@ -3541,13 +3951,18 @@ combine_controlled_uses_counters (int c, + static void + propagate_controlled_uses (struct cgraph_edge *cs) + { +- struct ipa_edge_args *args = IPA_EDGE_REF (cs); +- struct cgraph_node *new_root = cs->caller->global.inlined_to +- ? cs->caller->global.inlined_to : cs->caller; +- struct ipa_node_params *new_root_info = IPA_NODE_REF (new_root); +- struct ipa_node_params *old_root_info = IPA_NODE_REF (cs->callee); ++ class ipa_edge_args *args = IPA_EDGE_REF (cs); ++ if (!args) ++ return; ++ struct cgraph_node *new_root = cs->caller->inlined_to ++ ? cs->caller->inlined_to : cs->caller; ++ class ipa_node_params *new_root_info = IPA_NODE_REF (new_root); ++ class ipa_node_params *old_root_info = IPA_NODE_REF (cs->callee); + int count, i; + ++ if (!old_root_info) ++ return; ++ + count = MIN (ipa_get_cs_argument_count (args), + ipa_get_param_count (old_root_info)); + for (i = 0; i < count; i++) +@@ -3608,9 +4023,9 @@ propagate_controlled_uses (struct cgraph + gcc_checking_assert (ok); + + clone = cs->caller; +- while (clone->global.inlined_to +- && clone != rdesc->cs->caller +- && IPA_NODE_REF (clone)->ipcp_orig_node) ++ while (clone->inlined_to ++ && clone->ipcp_clone ++ && clone != rdesc->cs->caller) + { + struct ipa_ref *ref; + ref = clone->find_reference (n, NULL, 0); +@@ -3669,6 +4084,7 @@ ipa_propagate_indirect_call_infos (struc + + propagate_controlled_uses (cs); + changed = propagate_info_to_inlined_callees (cs, cs->callee, new_edges); ++ ipa_node_params_sum->remove (cs->callee); + + return changed; + } +@@ -3830,16 +4246,16 @@ ipa_edge_args_sum_t::duplicate (cgraph_e + We need to find the duplicate that refers to our tree of + inline clones. */ + +- gcc_assert (dst->caller->global.inlined_to); ++ gcc_assert (dst->caller->inlined_to); + for (dst_rdesc = src_rdesc->next_duplicate; + dst_rdesc; + dst_rdesc = dst_rdesc->next_duplicate) + { + struct cgraph_node *top; +- top = dst_rdesc->cs->caller->global.inlined_to +- ? dst_rdesc->cs->caller->global.inlined_to ++ top = dst_rdesc->cs->caller->inlined_to ++ ? dst_rdesc->cs->caller->inlined_to + : dst_rdesc->cs->caller; +- if (dst->caller->global.inlined_to == top) ++ if (dst->caller->inlined_to == top) + break; + } + gcc_assert (dst_rdesc); +@@ -3849,9 +4265,9 @@ ipa_edge_args_sum_t::duplicate (cgraph_e + else if (dst_jf->type == IPA_JF_PASS_THROUGH + && src->caller == dst->caller) + { +- struct cgraph_node *inline_root = dst->caller->global.inlined_to +- ? dst->caller->global.inlined_to : dst->caller; +- struct ipa_node_params *root_info = IPA_NODE_REF (inline_root); ++ struct cgraph_node *inline_root = dst->caller->inlined_to ++ ? 
dst->caller->inlined_to : dst->caller; ++ class ipa_node_params *root_info = IPA_NODE_REF (inline_root); + int idx = ipa_get_jf_pass_through_formal_id (dst_jf); + + int c = ipa_get_controlled_uses (root_info, idx); +@@ -3995,7 +4411,7 @@ void + ipa_print_node_params (FILE *f, struct cgraph_node *node) + { + int i, count; +- struct ipa_node_params *info; ++ class ipa_node_params *info; + + if (!node->definition) + return; +@@ -4010,6 +4426,12 @@ ipa_print_node_params (FILE *f, struct c + ipa_dump_param (f, info, i); + if (ipa_is_param_used (info, i)) + fprintf (f, " used"); ++ if (ipa_is_param_used_by_ipa_predicates (info, i)) ++ fprintf (f, " used_by_ipa_predicates"); ++ if (ipa_is_param_used_by_indirect_call (info, i)) ++ fprintf (f, " used_by_indirect_call"); ++ if (ipa_is_param_used_by_polymorphic_call (info, i)) ++ fprintf (f, " used_by_polymorphic_call"); + c = ipa_get_controlled_uses (info, i); + if (c == IPA_UNDESCRIBED_USE) + fprintf (f, " undescribed_use"); +@@ -4104,6 +4526,8 @@ ipa_write_jump_function (struct output_b + bp_pack_value (&bp, jump_func->value.ancestor.agg_preserved, 1); + streamer_write_bitpack (&bp); + break; ++ default: ++ fatal_error (UNKNOWN_LOCATION, "invalid jump function in LTO stream"); + } + + count = vec_safe_length (jump_func->agg.items); +@@ -4117,8 +4541,36 @@ ipa_write_jump_function (struct output_b + + FOR_EACH_VEC_SAFE_ELT (jump_func->agg.items, i, item) + { ++ stream_write_tree (ob, item->type, true); + streamer_write_uhwi (ob, item->offset); +- stream_write_tree (ob, item->value, true); ++ streamer_write_uhwi (ob, item->jftype); ++ switch (item->jftype) ++ { ++ case IPA_JF_UNKNOWN: ++ break; ++ case IPA_JF_CONST: ++ stream_write_tree (ob, item->value.constant, true); ++ break; ++ case IPA_JF_PASS_THROUGH: ++ case IPA_JF_LOAD_AGG: ++ streamer_write_uhwi (ob, item->value.pass_through.operation); ++ streamer_write_uhwi (ob, item->value.pass_through.formal_id); ++ if (TREE_CODE_CLASS (item->value.pass_through.operation) ++ != tcc_unary) ++ stream_write_tree (ob, item->value.pass_through.operand, true); ++ if (item->jftype == IPA_JF_LOAD_AGG) ++ { ++ stream_write_tree (ob, item->value.load_agg.type, true); ++ streamer_write_uhwi (ob, item->value.load_agg.offset); ++ bp = bitpack_create (ob->main_stream); ++ bp_pack_value (&bp, item->value.load_agg.by_ref, 1); ++ streamer_write_bitpack (&bp); ++ } ++ break; ++ default: ++ fatal_error (UNKNOWN_LOCATION, ++ "invalid jump function in LTO stream"); ++ } + } + + bp = bitpack_create (ob->main_stream); +@@ -4143,10 +4595,10 @@ ipa_write_jump_function (struct output_b + /* Read in jump function JUMP_FUNC from IB. 
*/ + + static void +-ipa_read_jump_function (struct lto_input_block *ib, ++ipa_read_jump_function (class lto_input_block *ib, + struct ipa_jump_func *jump_func, + struct cgraph_edge *cs, +- struct data_in *data_in, ++ class data_in *data_in, + bool prevails) + { + enum jump_func_type jftype; +@@ -4215,8 +4667,39 @@ ipa_read_jump_function (struct lto_input + for (i = 0; i < count; i++) + { + struct ipa_agg_jf_item item; ++ item.type = stream_read_tree (ib, data_in); + item.offset = streamer_read_uhwi (ib); +- item.value = stream_read_tree (ib, data_in); ++ item.jftype = (enum jump_func_type) streamer_read_uhwi (ib); ++ ++ switch (item.jftype) ++ { ++ case IPA_JF_UNKNOWN: ++ break; ++ case IPA_JF_CONST: ++ item.value.constant = stream_read_tree (ib, data_in); ++ break; ++ case IPA_JF_PASS_THROUGH: ++ case IPA_JF_LOAD_AGG: ++ operation = (enum tree_code) streamer_read_uhwi (ib); ++ item.value.pass_through.operation = operation; ++ item.value.pass_through.formal_id = streamer_read_uhwi (ib); ++ if (TREE_CODE_CLASS (operation) == tcc_unary) ++ item.value.pass_through.operand = NULL_TREE; ++ else ++ item.value.pass_through.operand = stream_read_tree (ib, data_in); ++ if (item.jftype == IPA_JF_LOAD_AGG) ++ { ++ struct bitpack_d bp; ++ item.value.load_agg.type = stream_read_tree (ib, data_in); ++ item.value.load_agg.offset = streamer_read_uhwi (ib); ++ bp = streamer_read_bitpack (ib); ++ item.value.load_agg.by_ref = bp_unpack_value (&bp, 1); ++ } ++ break; ++ default: ++ fatal_error (UNKNOWN_LOCATION, ++ "invalid jump function in LTO stream"); ++ } + if (prevails) + jump_func->agg.items->quick_push (item); + } +@@ -4255,7 +4738,7 @@ static void + ipa_write_indirect_edge_info (struct output_block *ob, + struct cgraph_edge *cs) + { +- struct cgraph_indirect_call_info *ii = cs->indirect_info; ++ class cgraph_indirect_call_info *ii = cs->indirect_info; + struct bitpack_d bp; + + streamer_write_hwi (ob, ii->param_index); +@@ -4284,11 +4767,12 @@ ipa_write_indirect_edge_info (struct out + relevant to indirect inlining from IB. */ + + static void +-ipa_read_indirect_edge_info (struct lto_input_block *ib, +- struct data_in *data_in, +- struct cgraph_edge *cs) ++ipa_read_indirect_edge_info (class lto_input_block *ib, ++ class data_in *data_in, ++ struct cgraph_edge *cs, ++ class ipa_node_params *info) + { +- struct cgraph_indirect_call_info *ii = cs->indirect_info; ++ class cgraph_indirect_call_info *ii = cs->indirect_info; + struct bitpack_d bp; + + ii->param_index = (int) streamer_read_hwi (ib); +@@ -4309,6 +4793,14 @@ ipa_read_indirect_edge_info (struct lto_ + ii->otr_type = stream_read_tree (ib, data_in); + ii->context.stream_in (ib, data_in); + } ++ if (info && ii->param_index >= 0) ++ { ++ if (ii->polymorphic) ++ ipa_set_param_used_by_polymorphic_call (info, ++ ii->param_index , true); ++ ipa_set_param_used_by_indirect_call (info, ++ ii->param_index, true); ++ } + } + + /* Stream out NODE info to OB. 
*/ +@@ -4318,7 +4810,7 @@ ipa_write_node_info (struct output_block + { + int node_ref; + lto_symtab_encoder_t encoder; +- struct ipa_node_params *info = IPA_NODE_REF (node); ++ class ipa_node_params *info = IPA_NODE_REF (node); + int j; + struct cgraph_edge *e; + struct bitpack_d bp; +@@ -4345,7 +4837,13 @@ ipa_write_node_info (struct output_block + } + for (e = node->callees; e; e = e->next_callee) + { +- struct ipa_edge_args *args = IPA_EDGE_REF (e); ++ class ipa_edge_args *args = IPA_EDGE_REF (e); ++ ++ if (!args) ++ { ++ streamer_write_uhwi (ob, 0); ++ continue; ++ } + + streamer_write_uhwi (ob, + ipa_get_cs_argument_count (args) * 2 +@@ -4359,16 +4857,20 @@ ipa_write_node_info (struct output_block + } + for (e = node->indirect_calls; e; e = e->next_callee) + { +- struct ipa_edge_args *args = IPA_EDGE_REF (e); +- +- streamer_write_uhwi (ob, +- ipa_get_cs_argument_count (args) * 2 +- + (args->polymorphic_call_contexts != NULL)); +- for (j = 0; j < ipa_get_cs_argument_count (args); j++) ++ class ipa_edge_args *args = IPA_EDGE_REF (e); ++ if (!args) ++ streamer_write_uhwi (ob, 0); ++ else + { +- ipa_write_jump_function (ob, ipa_get_ith_jump_func (args, j)); +- if (args->polymorphic_call_contexts != NULL) +- ipa_get_ith_polymorhic_call_context (args, j)->stream_out (ob); ++ streamer_write_uhwi (ob, ++ ipa_get_cs_argument_count (args) * 2 ++ + (args->polymorphic_call_contexts != NULL)); ++ for (j = 0; j < ipa_get_cs_argument_count (args); j++) ++ { ++ ipa_write_jump_function (ob, ipa_get_ith_jump_func (args, j)); ++ if (args->polymorphic_call_contexts != NULL) ++ ipa_get_ith_polymorhic_call_context (args, j)->stream_out (ob); ++ } + } + ipa_write_indirect_edge_info (ob, e); + } +@@ -4377,8 +4879,8 @@ ipa_write_node_info (struct output_block + /* Stream in edge E from IB. */ + + static void +-ipa_read_edge_info (struct lto_input_block *ib, +- struct data_in *data_in, ++ipa_read_edge_info (class lto_input_block *ib, ++ class data_in *data_in, + struct cgraph_edge *e, bool prevails) + { + int count = streamer_read_uhwi (ib); +@@ -4389,7 +4891,7 @@ ipa_read_edge_info (struct lto_input_blo + return; + if (prevails && e->possibly_call_in_translation_unit_p ()) + { +- struct ipa_edge_args *args = IPA_EDGE_REF (e); ++ class ipa_edge_args *args = IPA_EDGE_REF_GET_CREATE (e); + vec_safe_grow_cleared (args->jump_functions, count); + if (contexts_computed) + vec_safe_grow_cleared (args->polymorphic_call_contexts, count); +@@ -4411,7 +4913,7 @@ ipa_read_edge_info (struct lto_input_blo + data_in, prevails); + if (contexts_computed) + { +- struct ipa_polymorphic_call_context ctx; ++ class ipa_polymorphic_call_context ctx; + ctx.stream_in (ib, data_in); + } + } +@@ -4421,14 +4923,15 @@ ipa_read_edge_info (struct lto_input_blo + /* Stream in NODE info from IB. */ + + static void +-ipa_read_node_info (struct lto_input_block *ib, struct cgraph_node *node, +- struct data_in *data_in) ++ipa_read_node_info (class lto_input_block *ib, struct cgraph_node *node, ++ class data_in *data_in) + { + int k; + struct cgraph_edge *e; + struct bitpack_d bp; + bool prevails = node->prevailing_p (); +- struct ipa_node_params *info = prevails ? IPA_NODE_REF (node) : NULL; ++ class ipa_node_params *info = prevails ++ ? 
IPA_NODE_REF_GET_CREATE (node) : NULL; + + int param_count = streamer_read_uhwi (ib); + if (prevails) +@@ -4468,7 +4971,7 @@ ipa_read_node_info (struct lto_input_blo + for (e = node->indirect_calls; e; e = e->next_callee) + { + ipa_read_edge_info (ib, data_in, e, prevails); +- ipa_read_indirect_edge_info (ib, data_in, e); ++ ipa_read_indirect_edge_info (ib, data_in, e, info); + } + } + +@@ -4525,7 +5028,7 @@ ipa_prop_read_section (struct lto_file_d + const int cfg_offset = sizeof (struct lto_function_header); + const int main_offset = cfg_offset + header->cfg_size; + const int string_offset = main_offset + header->main_size; +- struct data_in *data_in; ++ class data_in *data_in; + unsigned int i; + unsigned int count; + +@@ -4774,7 +5277,7 @@ read_replacements_section (struct lto_fi + const int cfg_offset = sizeof (struct lto_function_header); + const int main_offset = cfg_offset + header->cfg_size; + const int string_offset = main_offset + header->main_size; +- struct data_in *data_in; ++ class data_in *data_in; + unsigned int i; + unsigned int count; + +@@ -4888,7 +5391,8 @@ ipcp_modif_dom_walker::before_dom_childr + struct ipa_agg_replacement_value *v; + gimple *stmt = gsi_stmt (gsi); + tree rhs, val, t; +- HOST_WIDE_INT offset, size; ++ HOST_WIDE_INT offset; ++ poly_int64 size; + int index; + bool by_ref, vce; + +@@ -4923,7 +5427,8 @@ ipcp_modif_dom_walker::before_dom_childr + break; + if (!v + || v->by_ref != by_ref +- || tree_to_shwi (TYPE_SIZE (TREE_TYPE (v->value))) != size) ++ || maybe_ne (tree_to_poly_int64 (TYPE_SIZE (TREE_TYPE (v->value))), ++ size)) + continue; + + gcc_checking_assert (is_gimple_ip_invariant (v->value)); +@@ -5194,4 +5699,12 @@ ipcp_transform_function (struct cgraph_n + return TODO_update_ssa_only_virtuals; + } + ++ ++/* Return true if OTHER describes same agg value. */ ++bool ++ipa_agg_value::equal_to (const ipa_agg_value &other) ++{ ++ return offset == other.offset ++ && operand_equal_p (value, other.value, 0); ++} + #include "gt-ipa-prop.h" +diff -Nurp a/gcc/ipa-prop.h b/gcc/ipa-prop.h +--- a/gcc/ipa-prop.h 2020-04-30 15:14:04.624000000 +0800 ++++ b/gcc/ipa-prop.h 2020-04-30 15:14:56.696000000 +0800 +@@ -39,6 +39,15 @@ along with GCC; see the file COPYING3. + argument. + Unknown - neither of the above. + ++ IPA_JF_LOAD_AGG is a compound pass-through jump function, in which primary ++ operation on formal parameter is memory dereference that loads a value from ++ a part of an aggregate, which is represented or pointed to by the formal ++ parameter. Moreover, an additional unary/binary operation can be applied on ++ the loaded value, and final result is passed as actual argument of callee ++ (e.g. *(param_1(D) + 4) op 24 ). It is meant to describe usage of aggregate ++ parameter or by-reference parameter referenced in argument passing, commonly ++ found in C++ and Fortran. ++ + IPA_JF_ANCESTOR is a special pass-through jump function, which means that + the result is an address of a part of the object pointed to by the formal + parameter to which the function refers. 
It is mainly intended to represent +@@ -60,6 +69,7 @@ enum jump_func_type + IPA_JF_UNKNOWN = 0, /* newly allocated and zeroed jump functions default */ + IPA_JF_CONST, /* represented by field costant */ + IPA_JF_PASS_THROUGH, /* represented by field pass_through */ ++ IPA_JF_LOAD_AGG, /* represented by field load_agg */ + IPA_JF_ANCESTOR /* represented by field ancestor */ + }; + +@@ -97,6 +107,26 @@ struct GTY(()) ipa_pass_through_data + unsigned agg_preserved : 1; + }; + ++/* Structure holding data required to describe a load-value-from-aggregate ++ jump function. */ ++ ++struct GTY(()) ipa_load_agg_data ++{ ++ /* Inherit from pass through jump function, describing unary/binary ++ operation on the value loaded from aggregate that is represented or ++ pointed to by the formal parameter, specified by formal_id in this ++ pass_through jump function data structure. */ ++ struct ipa_pass_through_data pass_through; ++ /* Type of the value loaded from the aggregate. */ ++ tree type; ++ /* Offset at which the value is located within the aggregate. */ ++ HOST_WIDE_INT offset; ++ /* True if loaded by reference (the aggregate is pointed to by the formal ++ parameter) or false if loaded by value (the aggregate is represented ++ by the formal parameter). */ ++ bool by_ref; ++}; ++ + /* Structure holding data required to describe an ancestor pass-through + jump function. */ + +@@ -110,38 +140,139 @@ struct GTY(()) ipa_ancestor_jf_data + unsigned agg_preserved : 1; + }; + +-/* An element in an aggegate part of a jump function describing a known value +- at a given offset. When it is part of a pass-through jump function with +- agg_preserved set or an ancestor jump function with agg_preserved set, all +- unlisted positions are assumed to be preserved but the value can be a type +- node, which means that the particular piece (starting at offset and having +- the size of the type) is clobbered with an unknown value. When +- agg_preserved is false or the type of the containing jump function is +- different, all unlisted parts are assumed to be unknown and all values must +- fulfill is_gimple_ip_invariant. */ ++/* A jump function for an aggregate part at a given offset, which describes how ++ it content value is generated. All unlisted positions are assumed to have a ++ value defined in an unknown way. */ + + struct GTY(()) ipa_agg_jf_item + { +- /* The offset at which the known value is located within the aggregate. */ ++ /* The offset for the aggregate part. */ + HOST_WIDE_INT offset; + +- /* The known constant or type if this is a clobber. */ +- tree value; +-}; ++ /* Data type of the aggregate part. */ ++ tree type; + ++ /* Jump function type. */ ++ enum jump_func_type jftype; + +-/* Aggregate jump function - i.e. description of contents of aggregates passed +- either by reference or value. */ ++ /* Represents a value of jump function. constant represents the actual constant ++ in constant jump function content. pass_through is used only in simple pass ++ through jump function context. load_agg is for load-value-from-aggregate ++ jump function context. */ ++ union jump_func_agg_value ++ { ++ tree GTY ((tag ("IPA_JF_CONST"))) constant; ++ struct ipa_pass_through_data GTY ((tag ("IPA_JF_PASS_THROUGH"))) pass_through; ++ struct ipa_load_agg_data GTY ((tag ("IPA_JF_LOAD_AGG"))) load_agg; ++ } GTY ((desc ("%1.jftype"))) value; ++}; ++ ++/* Jump functions describing a set of aggregate contents. */ + + struct GTY(()) ipa_agg_jump_function + { +- /* Description of the individual items. 
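For intuition, here is a minimal C sketch (not taken from the patch; every name in it is hypothetical) of the argument-passing shape that an IPA_JF_LOAD_AGG jump function describes: the actual argument is loaded from an aggregate reachable through a formal parameter, optionally with a further arithmetic operation applied.

/* forward() passes "p->len + 24", i.e. a by_ref load from the aggregate
   pointed to by the formal parameter P at the offset of LEN, followed by a
   PLUS_EXPR -- roughly the "*(param_1(D) + 4) op 24" shape mentioned above.  */
struct pkt { int id; int len; };

static int consume (int n) { return n * 2; }

static int forward (struct pkt *p)
{
  return consume (p->len + 24);
}

int main (void)
{
  struct pkt q = { 1, 8 };
  return forward (&q);
}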
*/ ++ /* Description of the individual jump function item. */ + vec *items; +- /* True if the data was passed by reference (as opposed to by value). */ ++ /* True if the data was passed by reference (as opposed to by value). */ ++ bool by_ref; ++}; ++ ++/* An element in an aggregate part describing a known value at a given offset. ++ All unlisted positions are assumed to be unknown and all listed values must ++ fulfill is_gimple_ip_invariant. */ ++ ++struct ipa_agg_value ++{ ++ /* The offset at which the known value is located within the aggregate. */ ++ HOST_WIDE_INT offset; ++ ++ /* The known constant. */ ++ tree value; ++ ++ /* Return true if OTHER describes same agg value. */ ++ bool equal_to (const ipa_agg_value &other); ++}; ++ ++/* Structure describing a set of known offset/value for aggregate. */ ++ ++struct ipa_agg_value_set ++{ ++ /* Description of the individual item. */ ++ vec items; ++ /* True if the data was passed by reference (as opposed to by value). */ + bool by_ref; ++ ++ /* Return true if OTHER describes same agg values. */ ++ bool equal_to (const ipa_agg_value_set &other) ++ { ++ if (by_ref != other.by_ref) ++ return false; ++ if (items.length () != other.items.length ()) ++ return false; ++ for (unsigned int i = 0; i < items.length (); i++) ++ if (!items[i].equal_to (other.items[i])) ++ return false; ++ return true; ++ } ++ ++ /* Return true if there is any value for aggregate. */ ++ bool is_empty () const ++ { ++ return items.is_empty (); ++ } ++ ++ ipa_agg_value_set copy () const ++ { ++ ipa_agg_value_set new_copy; ++ ++ new_copy.items = items.copy (); ++ new_copy.by_ref = by_ref; ++ ++ return new_copy; ++ } ++ ++ void release () ++ { ++ items.release (); ++ } + }; + +-typedef struct ipa_agg_jump_function *ipa_agg_jump_function_p; ++/* Return copy of a vec. */ ++ ++static inline vec ++ipa_copy_agg_values (const vec &aggs) ++{ ++ vec aggs_copy = vNULL; ++ ++ if (!aggs.is_empty ()) ++ { ++ ipa_agg_value_set *agg; ++ int i; ++ ++ aggs_copy.reserve_exact (aggs.length ()); ++ ++ FOR_EACH_VEC_ELT (aggs, i, agg) ++ aggs_copy.quick_push (agg->copy ()); ++ } ++ ++ return aggs_copy; ++} ++ ++/* For vec, DO NOT call release(), use below function ++ instead. Because ipa_agg_value_set contains a field of vector type, we ++ should release this child vector in each element before reclaiming the ++ whole vector. */ ++ ++static inline void ++ipa_release_agg_values (vec &aggs) ++{ ++ ipa_agg_value_set *agg; ++ int i; ++ ++ FOR_EACH_VEC_ELT (aggs, i, agg) ++ agg->release (); ++ aggs.release (); ++} + + /* Information about zero/non-zero bits. */ + struct GTY(()) ipa_bits +@@ -170,19 +301,19 @@ struct GTY(()) ipa_vr + types of jump functions supported. */ + struct GTY (()) ipa_jump_func + { +- /* Aggregate contants description. See struct ipa_agg_jump_function and its +- description. */ ++ /* Aggregate jump function description. See struct ipa_agg_jump_function ++ and its description. */ + struct ipa_agg_jump_function agg; + + /* Information about zero/non-zero bits. The pointed to structure is shared + betweed different jump functions. Use ipa_set_jfunc_bits to set this + field. */ +- struct ipa_bits *bits; ++ class ipa_bits *bits; + + /* Information about value range, containing valid data only when vr_known is + true. The pointed to structure is shared betweed different jump + functions. Use ipa_set_jfunc_vr to set this field. */ +- struct value_range_base *m_vr; ++ class value_range_base *m_vr; + + enum jump_func_type type; + /* Represents a value of a jump function. 
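As a usage note for the helpers above, a short sketch (the enclosing function and its argument name are placeholders) of the copy/release discipline for vec<ipa_agg_value_set>, whose elements each own a nested items vector:

/* Sketch only: AGGS_COPY is created element-wise and must be released
   element-wise; calling aggs_copy.release () directly would leak the
   nested items vectors.  */
static void
use_known_aggs (const vec<ipa_agg_value_set> &known_aggs)
{
  vec<ipa_agg_value_set> aggs_copy = ipa_copy_agg_values (known_aggs);
  /* ... inspect aggs_copy[i].items[j].offset and .value here ...  */
  ipa_release_agg_values (aggs_copy);
}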
pass_through is used only in jump +@@ -310,9 +441,12 @@ struct GTY(()) ipa_param_descriptor + says how many there are. If any use could not be described by means of + ipa-prop structures, this is IPA_UNDESCRIBED_USE. */ + int controlled_uses; +- unsigned int move_cost : 31; ++ unsigned int move_cost : 28; + /* The parameter is used. */ + unsigned used : 1; ++ unsigned used_by_ipa_predicates : 1; ++ unsigned used_by_indirect_call : 1; ++ unsigned used_by_polymorphic_call : 1; + }; + + /* ipa_node_params stores information related to formal parameters of functions +@@ -332,7 +466,7 @@ struct GTY((for_user)) ipa_node_params + vec *descriptors; + /* Pointer to an array of structures describing individual formal + parameters. */ +- struct ipcp_param_lattices * GTY((skip)) lattices; ++ class ipcp_param_lattices * GTY((skip)) lattices; + /* Only for versioned nodes this field would not be NULL, + it points to the node that IPA cp cloned from. */ + struct cgraph_node * GTY((skip)) ipcp_orig_node; +@@ -357,6 +491,8 @@ struct GTY((for_user)) ipa_node_params + unsigned node_dead : 1; + /* Node is involved in a recursion, potentionally indirect. */ + unsigned node_within_scc : 1; ++ /* Node contains only direct recursion. */ ++ unsigned node_is_self_scc : 1; + /* Node is calling a private function called only once. */ + unsigned node_calling_single_call : 1; + /* False when there is something makes versioning impossible. */ +@@ -420,7 +556,7 @@ struct ipa_func_body_info + cgraph_node *node; + + /* Its info. */ +- struct ipa_node_params *info; ++ class ipa_node_params *info; + + /* Information about individual BBs. */ + vec bb_infos; +@@ -439,7 +575,7 @@ struct ipa_func_body_info + /* Return the number of formal parameters. */ + + static inline int +-ipa_get_param_count (struct ipa_node_params *info) ++ipa_get_param_count (class ipa_node_params *info) + { + return vec_safe_length (info->descriptors); + } +@@ -450,10 +586,9 @@ ipa_get_param_count (struct ipa_node_par + WPA. */ + + static inline tree +-ipa_get_param (struct ipa_node_params *info, int i) ++ipa_get_param (class ipa_node_params *info, int i) + { + gcc_checking_assert (info->descriptors); +- gcc_checking_assert (!flag_wpa); + tree t = (*info->descriptors)[i].decl_or_type; + gcc_checking_assert (TREE_CODE (t) == PARM_DECL); + return t; +@@ -463,7 +598,7 @@ ipa_get_param (struct ipa_node_params *i + to INFO if it is known or NULL if not. */ + + static inline tree +-ipa_get_type (struct ipa_node_params *info, int i) ++ipa_get_type (class ipa_node_params *info, int i) + { + if (vec_safe_length (info->descriptors) <= (unsigned) i) + return NULL; +@@ -480,7 +615,7 @@ ipa_get_type (struct ipa_node_params *in + to INFO. */ + + static inline int +-ipa_get_param_move_cost (struct ipa_node_params *info, int i) ++ipa_get_param_move_cost (class ipa_node_params *info, int i) + { + gcc_checking_assert (info->descriptors); + return (*info->descriptors)[i].move_cost; +@@ -490,17 +625,47 @@ ipa_get_param_move_cost (struct ipa_node + associated with INFO to VAL. */ + + static inline void +-ipa_set_param_used (struct ipa_node_params *info, int i, bool val) ++ipa_set_param_used (class ipa_node_params *info, int i, bool val) + { + gcc_checking_assert (info->descriptors); + (*info->descriptors)[i].used = val; + } + ++/* Set the used_by_ipa_predicates flag corresponding to the Ith formal ++ parameter of the function associated with INFO to VAL. 
*/ ++ ++static inline void ++ipa_set_param_used_by_ipa_predicates (class ipa_node_params *info, int i, bool val) ++{ ++ gcc_checking_assert (info->descriptors); ++ (*info->descriptors)[i].used_by_ipa_predicates = val; ++} ++ ++/* Set the used_by_indirect_call flag corresponding to the Ith formal ++ parameter of the function associated with INFO to VAL. */ ++ ++static inline void ++ipa_set_param_used_by_indirect_call (class ipa_node_params *info, int i, bool val) ++{ ++ gcc_checking_assert (info->descriptors); ++ (*info->descriptors)[i].used_by_indirect_call = val; ++} ++ ++/* Set the .used_by_polymorphic_call flag corresponding to the Ith formal ++ parameter of the function associated with INFO to VAL. */ ++ ++static inline void ++ipa_set_param_used_by_polymorphic_call (class ipa_node_params *info, int i, bool val) ++{ ++ gcc_checking_assert (info->descriptors); ++ (*info->descriptors)[i].used_by_polymorphic_call = val; ++} ++ + /* Return how many uses described by ipa-prop a parameter has or + IPA_UNDESCRIBED_USE if there is a use that is not described by these + structures. */ + static inline int +-ipa_get_controlled_uses (struct ipa_node_params *info, int i) ++ipa_get_controlled_uses (class ipa_node_params *info, int i) + { + /* FIXME: introducing speculation causes out of bounds access here. */ + if (vec_safe_length (info->descriptors) > (unsigned)i) +@@ -511,7 +676,7 @@ ipa_get_controlled_uses (struct ipa_node + /* Set the controlled counter of a given parameter. */ + + static inline void +-ipa_set_controlled_uses (struct ipa_node_params *info, int i, int val) ++ipa_set_controlled_uses (class ipa_node_params *info, int i, int val) + { + gcc_checking_assert (info->descriptors); + (*info->descriptors)[i].controlled_uses = val; +@@ -521,12 +686,42 @@ ipa_set_controlled_uses (struct ipa_node + function associated with INFO. */ + + static inline bool +-ipa_is_param_used (struct ipa_node_params *info, int i) ++ipa_is_param_used (class ipa_node_params *info, int i) + { + gcc_checking_assert (info->descriptors); + return (*info->descriptors)[i].used; + } + ++/* Return the used_by_ipa_predicates flag corresponding to the Ith formal ++ parameter of the function associated with INFO. */ ++ ++static inline bool ++ipa_is_param_used_by_ipa_predicates (class ipa_node_params *info, int i) ++{ ++ gcc_checking_assert (info->descriptors); ++ return (*info->descriptors)[i].used_by_ipa_predicates; ++} ++ ++/* Return the used_by_indirect_call flag corresponding to the Ith formal ++ parameter of the function associated with INFO. */ ++ ++static inline bool ++ipa_is_param_used_by_indirect_call (class ipa_node_params *info, int i) ++{ ++ gcc_checking_assert (info->descriptors); ++ return (*info->descriptors)[i].used_by_indirect_call; ++} ++ ++/* Return the used_by_polymorphic_call flag corresponding to the Ith formal ++ parameter of the function associated with INFO. */ ++ ++static inline bool ++ipa_is_param_used_by_polymorphic_call (class ipa_node_params *info, int i) ++{ ++ gcc_checking_assert (info->descriptors); ++ return (*info->descriptors)[i].used_by_polymorphic_call; ++} ++ + /* Information about replacements done in aggregates for a given node (each + node has its linked list). */ + struct GTY(()) ipa_agg_replacement_value +@@ -590,7 +785,7 @@ class GTY((for_user)) ipa_edge_args + /* Return the number of actual arguments. 
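To make the three new per-parameter flags concrete, a small hypothetical C++ sketch (not part of the patch) of parameter uses that the analysis and the streaming-in code above would record:

/* CB is recorded as used_by_indirect_call (the call goes through the
   parameter value); B is recorded as used_by_polymorphic_call (and, on the
   LTO read-in path shown earlier, also as used_by_indirect_call); X is a
   plain "used" parameter.  Roughly speaking, a parameter tested in a
   condition that the function summary turns into a size/time predicate
   would additionally get used_by_ipa_predicates.  */
struct Base
{
  virtual int f () { return 0; }
};

static int indirect (int (*cb) (int), int x)
{
  return cb (x);
}

static int polymorphic (Base *b)
{
  return b->f ();
}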
*/ + + static inline int +-ipa_get_cs_argument_count (struct ipa_edge_args *args) ++ipa_get_cs_argument_count (class ipa_edge_args *args) + { + return vec_safe_length (args->jump_functions); + } +@@ -600,15 +795,15 @@ ipa_get_cs_argument_count (struct ipa_ed + ipa_compute_jump_functions. */ + + static inline struct ipa_jump_func * +-ipa_get_ith_jump_func (struct ipa_edge_args *args, int i) ++ipa_get_ith_jump_func (class ipa_edge_args *args, int i) + { + return &(*args->jump_functions)[i]; + } + + /* Returns a pointer to the polymorphic call context for the ith argument. + NULL if contexts are not computed. */ +-static inline struct ipa_polymorphic_call_context * +-ipa_get_ith_polymorhic_call_context (struct ipa_edge_args *args, int i) ++static inline class ipa_polymorphic_call_context * ++ipa_get_ith_polymorhic_call_context (class ipa_edge_args *args, int i) + { + if (!args->polymorphic_call_contexts) + return NULL; +@@ -637,7 +832,12 @@ class GTY((user)) ipa_edge_args_sum_t : + ipa_edge_args_sum_t (symbol_table *table, bool ggc) + : call_summary (table, ggc) { } + +- /* Hook that is called by summary when an edge is duplicated. */ ++ void remove (cgraph_edge *edge) ++ { ++ call_summary ::remove (edge); ++ } ++ ++ /* Hook that is called by summary when an edge is removed. */ + virtual void remove (cgraph_edge *cs, ipa_edge_args *args); + /* Hook that is called by summary when an edge is duplicated. */ + virtual void duplicate (cgraph_edge *src, +@@ -675,8 +875,10 @@ extern GTY(()) function_summary get_create (NODE)) +-#define IPA_EDGE_REF(EDGE) (ipa_edge_args_sum->get_create (EDGE)) ++#define IPA_NODE_REF(NODE) (ipa_node_params_sum->get (NODE)) ++#define IPA_NODE_REF_GET_CREATE(NODE) (ipa_node_params_sum->get_create (NODE)) ++#define IPA_EDGE_REF(EDGE) (ipa_edge_args_sum->get (EDGE)) ++#define IPA_EDGE_REF_GET_CREATE(EDGE) (ipa_edge_args_sum->get_create (EDGE)) + /* This macro checks validity of index returned by + ipa_get_param_decl_index function. */ + #define IS_VALID_JUMP_FUNC_INDEX(I) ((I) != -1) +@@ -740,9 +942,9 @@ bool ipa_propagate_indirect_call_infos ( + + /* Indirect edge and binfo processing. */ + tree ipa_get_indirect_edge_target (struct cgraph_edge *ie, +- vec , ++ vec, + vec, +- vec, ++ vec, + bool *); + struct cgraph_edge *ipa_make_edge_direct_to_target (struct cgraph_edge *, tree, + bool speculative = false); +@@ -755,13 +957,13 @@ ipa_bits *ipa_get_ipa_bits_for_value (co + void ipa_analyze_node (struct cgraph_node *); + + /* Aggregate jump function related functions. */ +-tree ipa_find_agg_cst_for_param (struct ipa_agg_jump_function *agg, tree scalar, ++tree ipa_find_agg_cst_for_param (struct ipa_agg_value_set *agg, tree scalar, + HOST_WIDE_INT offset, bool by_ref, + bool *from_global_constant = NULL); + bool ipa_load_from_parm_agg (struct ipa_func_body_info *fbi, + vec *descriptors, + gimple *stmt, tree op, int *index_p, +- HOST_WIDE_INT *offset_p, HOST_WIDE_INT *size_p, ++ HOST_WIDE_INT *offset_p, poly_int64 *size_p, + bool *by_ref, bool *guaranteed_unmodified = NULL); + + /* Debugging interface. 
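A minimal sketch (hypothetical helper functions, GCC-internal types assumed) of the intended division of labour between the plain accessor macros and their new _GET_CREATE variants, matching the NULL checks added earlier in this patch:

/* Read-only users take the plain accessor and must cope with a missing
   summary; only analysis code that is allowed to materialize a summary
   uses the _GET_CREATE form.  */
static void
inspect_params (struct cgraph_node *node)
{
  class ipa_node_params *info = IPA_NODE_REF (node);	/* get: may be NULL */
  if (!info)
    return;
  /* ... read-only queries such as ipa_get_param_count (info) ...  */
}

static void
create_params (struct cgraph_node *node)
{
  class ipa_node_params *info = IPA_NODE_REF_GET_CREATE (node);	/* never NULL */
  gcc_checking_assert (info);
}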
*/ +@@ -779,11 +981,11 @@ extern object_allocator +-class ipcp_value_source; ++struct ipcp_value_source; + + extern object_allocator > ipcp_sources_pool; + +-class ipcp_agg_lattice; ++struct ipcp_agg_lattice; + + extern object_allocator ipcp_agg_lattice_pool; + +@@ -793,15 +995,18 @@ void ipa_prop_write_jump_functions (void + void ipa_prop_read_jump_functions (void); + void ipcp_write_transformation_summaries (void); + void ipcp_read_transformation_summaries (void); +-int ipa_get_param_decl_index (struct ipa_node_params *, tree); +-tree ipa_value_from_jfunc (struct ipa_node_params *info, ++int ipa_get_param_decl_index (class ipa_node_params *, tree); ++tree ipa_value_from_jfunc (class ipa_node_params *info, + struct ipa_jump_func *jfunc, tree type); + unsigned int ipcp_transform_function (struct cgraph_node *node); + ipa_polymorphic_call_context ipa_context_from_jfunc (ipa_node_params *, + cgraph_edge *, + int, + ipa_jump_func *); +-void ipa_dump_param (FILE *, struct ipa_node_params *info, int i); ++ipa_agg_value_set ipa_agg_value_set_from_jfunc (ipa_node_params *, ++ cgraph_node *, ++ ipa_agg_jump_function *); ++void ipa_dump_param (FILE *, class ipa_node_params *info, int i); + void ipa_release_body_info (struct ipa_func_body_info *); + tree ipa_get_callee_param_type (struct cgraph_edge *e, int i); + +diff -Nurp a/gcc/ipa-pure-const.c b/gcc/ipa-pure-const.c +--- a/gcc/ipa-pure-const.c 2020-04-30 15:14:04.600000000 +0800 ++++ b/gcc/ipa-pure-const.c 2020-04-30 15:14:56.588000000 +0800 +@@ -1360,12 +1360,14 @@ ignore_edge_for_nothrow (struct cgraph_e + return true; + + enum availability avail; +- cgraph_node *n = e->callee->function_or_virtual_thunk_symbol (&avail, +- e->caller); +- if (avail <= AVAIL_INTERPOSABLE || TREE_NOTHROW (n->decl)) ++ cgraph_node *ultimate_target ++ = e->callee->function_or_virtual_thunk_symbol (&avail, e->caller); ++ if (avail <= AVAIL_INTERPOSABLE || TREE_NOTHROW (ultimate_target->decl)) + return true; +- return opt_for_fn (e->callee->decl, flag_non_call_exceptions) +- && !e->callee->binds_to_current_def_p (e->caller); ++ return ((opt_for_fn (e->callee->decl, flag_non_call_exceptions) ++ && !e->callee->binds_to_current_def_p (e->caller)) ++ || !opt_for_fn (e->caller->decl, flag_ipa_pure_const) ++ || !opt_for_fn (ultimate_target->decl, flag_ipa_pure_const)); + } + + /* Return true if NODE is self recursive function. +@@ -1395,16 +1397,21 @@ cdtor_p (cgraph_node *n, void *) + return false; + } + +-/* We only propagate across edges with non-interposable callee. */ ++/* Skip edges from and to nodes without ipa_pure_const enabled. ++ Ignore not available symbols. */ + + static bool + ignore_edge_for_pure_const (struct cgraph_edge *e) + { + enum availability avail; +- e->callee->function_or_virtual_thunk_symbol (&avail, e->caller); +- return (avail <= AVAIL_INTERPOSABLE); +-} ++ cgraph_node *ultimate_target ++ = e->callee->function_or_virtual_thunk_symbol (&avail, e->caller); + ++ return (avail <= AVAIL_INTERPOSABLE ++ || !opt_for_fn (e->caller->decl, flag_ipa_pure_const) ++ || !opt_for_fn (ultimate_target->decl, ++ flag_ipa_pure_const)); ++} + + /* Produce transitive closure over the callgraph and compute pure/const + attributes. */ +@@ -1670,7 +1677,7 @@ propagate_pure_const (void) + /* Inline clones share declaration with their offline copies; + do not modify their declarations since the offline copy may + be different. 
*/ +- if (!w->global.inlined_to) ++ if (!w->inlined_to) + switch (this_state) + { + case IPA_CONST: +@@ -1831,7 +1838,7 @@ propagate_nothrow (void) + /* Inline clones share declaration with their offline copies; + do not modify their declarations since the offline copy may + be different. */ +- if (!w->global.inlined_to) ++ if (!w->inlined_to) + { + w->set_nothrow_flag (true); + if (dump_file) +@@ -1958,7 +1965,7 @@ propagate_malloc (void) + funct_state l = funct_state_summaries->get (node); + if (!node->alias + && l->malloc_state == STATE_MALLOC +- && !node->global.inlined_to) ++ && !node->inlined_to) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Function %s found to be malloc\n", +diff -Nurp a/gcc/ipa-reference.c b/gcc/ipa-reference.c +--- a/gcc/ipa-reference.c 2020-04-30 15:14:04.644000000 +0800 ++++ b/gcc/ipa-reference.c 2020-04-30 15:14:56.588000000 +0800 +@@ -46,7 +46,6 @@ along with GCC; see the file COPYING3. + #include "cgraph.h" + #include "data-streamer.h" + #include "calls.h" +-#include "splay-tree.h" + #include "ipa-utils.h" + #include "ipa-reference.h" + #include "symbol-summary.h" +@@ -75,8 +74,8 @@ struct ipa_reference_global_vars_info_d + + struct ipa_reference_optimization_summary_d + { +- bitmap statics_not_read; +- bitmap statics_not_written; ++ bitmap statics_read; ++ bitmap statics_written; + }; + + typedef ipa_reference_local_vars_info_d *ipa_reference_local_vars_info_t; +@@ -92,14 +91,20 @@ struct ipa_reference_vars_info_d + + typedef struct ipa_reference_vars_info_d *ipa_reference_vars_info_t; + +-/* This splay tree contains all of the static variables that are ++/* This map contains all of the static variables that are + being considered by the compilation level alias analysis. */ +-static splay_tree reference_vars_to_consider; ++typedef hash_map reference_vars_map_t; ++static reference_vars_map_t *ipa_reference_vars_map; ++static int ipa_reference_vars_uids; ++static vec *reference_vars_to_consider; ++varpool_node_hook_list *varpool_node_hooks; + + /* Set of all interesting module statics. A bit is set for every module + static we are considering. This is added to the local info when asm + code is found that clobbers all memory. */ + static bitmap all_module_statics; ++/* Zero bitmap. */ ++static bitmap no_module_statics; + /* Set of all statics that should be ignored because they are touched by + -fno-ipa-reference code. */ + static bitmap ignore_module_statics; +@@ -136,6 +141,31 @@ public: + + static ipa_ref_opt_summary_t *ipa_ref_opt_sum_summaries = NULL; + ++/* Return ID used by ipa-reference bitmaps. -1 if failed. */ ++int ++ipa_reference_var_uid (tree t) ++{ ++ if (!ipa_reference_vars_map) ++ return -1; ++ int *id = ipa_reference_vars_map->get ++ (symtab_node::get (t)->ultimate_alias_target (NULL)->decl); ++ if (!id) ++ return -1; ++ return *id; ++} ++ ++/* Return ID used by ipa-reference bitmaps. Create new entry if ++ T is not in map. Set EXISTED accordinly */ ++int ++ipa_reference_var_get_or_insert_uid (tree t, bool *existed) ++{ ++ int &id = ipa_reference_vars_map->get_or_insert ++ (symtab_node::get (t)->ultimate_alias_target (NULL)->decl, existed); ++ if (!*existed) ++ id = ipa_reference_vars_uids++; ++ return id; ++} ++ + /* Return the ipa_reference_vars structure starting from the cgraph NODE. */ + static inline ipa_reference_vars_info_t + get_reference_vars_info (struct cgraph_node *node) +@@ -165,7 +195,7 @@ get_reference_optimization_summary (stru + NULL if no data is available. 
*/ + + bitmap +-ipa_reference_get_not_read_global (struct cgraph_node *fn) ++ipa_reference_get_read_global (struct cgraph_node *fn) + { + if (!opt_for_fn (current_function_decl, flag_ipa_reference)) + return NULL; +@@ -180,10 +210,10 @@ ipa_reference_get_not_read_global (struc + || (avail == AVAIL_INTERPOSABLE + && flags_from_decl_or_type (fn->decl) & ECF_LEAF)) + && opt_for_fn (fn2->decl, flag_ipa_reference)) +- return info->statics_not_read; ++ return info->statics_read; + else if (avail == AVAIL_NOT_AVAILABLE + && flags_from_decl_or_type (fn->decl) & ECF_LEAF) +- return all_module_statics; ++ return no_module_statics; + else + return NULL; + } +@@ -194,7 +224,7 @@ ipa_reference_get_not_read_global (struc + call. Returns NULL if no data is available. */ + + bitmap +-ipa_reference_get_not_written_global (struct cgraph_node *fn) ++ipa_reference_get_written_global (struct cgraph_node *fn) + { + if (!opt_for_fn (current_function_decl, flag_ipa_reference)) + return NULL; +@@ -209,10 +239,10 @@ ipa_reference_get_not_written_global (st + || (avail == AVAIL_INTERPOSABLE + && flags_from_decl_or_type (fn->decl) & ECF_LEAF)) + && opt_for_fn (fn2->decl, flag_ipa_reference)) +- return info->statics_not_written; ++ return info->statics_written; + else if (avail == AVAIL_NOT_AVAILABLE + && flags_from_decl_or_type (fn->decl) & ECF_LEAF) +- return all_module_statics; ++ return no_module_statics; + else + return NULL; + } +@@ -256,7 +286,9 @@ is_improper (symtab_node *n, void *v ATT + static inline bool + is_proper_for_analysis (tree t) + { +- if (bitmap_bit_p (ignore_module_statics, ipa_reference_var_uid (t))) ++ int id = ipa_reference_var_uid (t); ++ ++ if (id != -1 && bitmap_bit_p (ignore_module_statics, id)) + return false; + + if (symtab_node::get (t) +@@ -272,9 +304,7 @@ is_proper_for_analysis (tree t) + static const char * + get_static_name (int index) + { +- splay_tree_node stn = +- splay_tree_lookup (reference_vars_to_consider, index); +- return fndecl_name ((tree)(stn->value)); ++ return fndecl_name ((*reference_vars_to_consider)[index]); + } + + /* Dump a set of static vars to FILE. */ +@@ -287,6 +317,8 @@ dump_static_vars_set_to_file (FILE *f, b + return; + else if (set == all_module_statics) + fprintf (f, "ALL"); ++ else if (set == no_module_statics) ++ fprintf (f, "NO"); + else + EXECUTE_IF_SET_IN_BITMAP (set, 0, index, bi) + { +@@ -330,10 +362,12 @@ union_static_var_sets (bitmap &x, bitmap + But if SET is NULL or the maximum set, return that instead. */ + + static bitmap +-copy_static_var_set (bitmap set) ++copy_static_var_set (bitmap set, bool for_propagation) + { + if (set == NULL || set == all_module_statics) + return set; ++ if (!for_propagation && set == no_module_statics) ++ return set; + bitmap_obstack *o = set->obstack; + gcc_checking_assert (o); + bitmap copy = BITMAP_ALLOC (o); +@@ -403,6 +437,14 @@ propagate_bits (ipa_reference_global_var + } + } + ++/* Delete NODE from map. */ ++ ++static void ++varpool_removal_hook (varpool_node *node, void *) ++{ ++ ipa_reference_vars_map->remove (node->decl); ++} ++ + static bool ipa_init_p = false; + + /* The init routine for analyzing global static variable usage. 
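A short sketch of how a consumer might query the renamed interface (the wrapper function is hypothetical; CALLEE and VAR_DECL are assumed to be a call graph node and a static variable's decl):

/* Sketch only: a NULL bitmap means "no information", and the uid lookup
   returns -1 for statics the pass never registered, so both cases fall
   back to the conservative answer.  The all_module_statics and
   no_module_statics singletons need no special-casing here -- they are
   ordinary bitmaps that are simply full (over the registered statics) or
   empty.  */
static bool
call_may_read_static (struct cgraph_node *callee, tree var_decl)
{
  bitmap read = ipa_reference_get_read_global (callee);
  int id = ipa_reference_var_uid (var_decl);
  if (!read || id == -1)
    return true;
  return bitmap_bit_p (read, id);
}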
See +@@ -415,22 +457,28 @@ ipa_init (void) + + ipa_init_p = true; + +- if (dump_file) +- reference_vars_to_consider = splay_tree_new (splay_tree_compare_ints, 0, 0); ++ vec_alloc (reference_vars_to_consider, 10); ++ ++ ++ if (ipa_ref_opt_sum_summaries != NULL) ++ { ++ delete ipa_ref_opt_sum_summaries; ++ ipa_ref_opt_sum_summaries = NULL; ++ delete ipa_reference_vars_map; ++ } ++ ipa_reference_vars_map = new reference_vars_map_t(257); ++ varpool_node_hooks ++ = symtab->add_varpool_removal_hook (varpool_removal_hook, NULL); ++ ipa_reference_vars_uids = 0; + + bitmap_obstack_initialize (&local_info_obstack); + bitmap_obstack_initialize (&optimization_summary_obstack); + all_module_statics = BITMAP_ALLOC (&optimization_summary_obstack); ++ no_module_statics = BITMAP_ALLOC (&optimization_summary_obstack); + ignore_module_statics = BITMAP_ALLOC (&optimization_summary_obstack); + + if (ipa_ref_var_info_summaries == NULL) + ipa_ref_var_info_summaries = new ipa_ref_var_info_summary_t (symtab); +- +- if (ipa_ref_opt_sum_summaries != NULL) +- { +- delete ipa_ref_opt_sum_summaries; +- ipa_ref_opt_sum_summaries = NULL; +- } + } + + +@@ -465,6 +513,8 @@ analyze_function (struct cgraph_node *fn + local = init_function_info (fn); + for (i = 0; fn->iterate_reference (i, ref); i++) + { ++ int id; ++ bool existed; + if (!is_a (ref->referred)) + continue; + var = ref->referred->decl; +@@ -472,23 +522,22 @@ analyze_function (struct cgraph_node *fn + continue; + /* This is a variable we care about. Check if we have seen it + before, and if not add it the set of variables we care about. */ +- if (all_module_statics +- && bitmap_set_bit (all_module_statics, ipa_reference_var_uid (var))) ++ id = ipa_reference_var_get_or_insert_uid (var, &existed); ++ if (!existed) + { ++ bitmap_set_bit (all_module_statics, id); + if (dump_file) +- splay_tree_insert (reference_vars_to_consider, +- ipa_reference_var_uid (var), +- (splay_tree_value)var); ++ reference_vars_to_consider->safe_push (var); + } + switch (ref->use) + { + case IPA_REF_LOAD: +- bitmap_set_bit (local->statics_read, ipa_reference_var_uid (var)); ++ bitmap_set_bit (local->statics_read, id); + break; + case IPA_REF_STORE: + if (ref->cannot_lead_to_return ()) + break; +- bitmap_set_bit (local->statics_written, ipa_reference_var_uid (var)); ++ bitmap_set_bit (local->statics_written, id); + break; + case IPA_REF_ADDR: + break; +@@ -510,10 +559,10 @@ ipa_ref_opt_summary_t::duplicate (cgraph + ipa_reference_optimization_summary_d + *dst_ginfo) + { +- dst_ginfo->statics_not_read = +- copy_static_var_set (ginfo->statics_not_read); +- dst_ginfo->statics_not_written = +- copy_static_var_set (ginfo->statics_not_written); ++ dst_ginfo->statics_read = ++ copy_static_var_set (ginfo->statics_read, false); ++ dst_ginfo->statics_written = ++ copy_static_var_set (ginfo->statics_written, false); + } + + /* Called when node is removed. 
*/ +@@ -522,13 +571,15 @@ void + ipa_ref_opt_summary_t::remove (cgraph_node *, + ipa_reference_optimization_summary_d *ginfo) + { +- if (ginfo->statics_not_read +- && ginfo->statics_not_read != all_module_statics) +- BITMAP_FREE (ginfo->statics_not_read); +- +- if (ginfo->statics_not_written +- && ginfo->statics_not_written != all_module_statics) +- BITMAP_FREE (ginfo->statics_not_written); ++ if (ginfo->statics_read ++ && ginfo->statics_read != all_module_statics ++ && ginfo->statics_read != no_module_statics) ++ BITMAP_FREE (ginfo->statics_read); ++ ++ if (ginfo->statics_written ++ && ginfo->statics_written != all_module_statics ++ && ginfo->statics_written != no_module_statics) ++ BITMAP_FREE (ginfo->statics_written); + } + + /* Analyze each function in the cgraph to see which global or statics +@@ -676,16 +727,23 @@ get_read_write_all_from_node (struct cgr + } + } + +-/* Skip edges from and to nodes without ipa_reference enables. This leave +- them out of strongy connected coponents and makes them easyto skip in the ++/* Skip edges from and to nodes without ipa_reference enabled. ++ Ignore not available symbols. This leave ++ them out of strongly connected components and makes them easy to skip in the + propagation loop bellow. */ + + static bool + ignore_edge_p (cgraph_edge *e) + { +- return (!opt_for_fn (e->caller->decl, flag_ipa_reference) +- || !opt_for_fn (e->callee->function_symbol ()->decl, +- flag_ipa_reference)); ++ enum availability avail; ++ cgraph_node *ultimate_target ++ = e->callee->function_or_virtual_thunk_symbol (&avail, e->caller); ++ ++ return (avail < AVAIL_INTERPOSABLE ++ || (avail == AVAIL_INTERPOSABLE ++ && !(flags_from_decl_or_type (e->callee->decl) & ECF_LEAF)) ++ || !opt_for_fn (e->caller->decl, flag_ipa_reference) ++ || !opt_for_fn (ultimate_target->decl, flag_ipa_reference)); + } + + /* Produce the global information by preforming a transitive closure +@@ -753,11 +811,12 @@ propagate (void) + if (read_all) + node_g->statics_read = all_module_statics; + else +- node_g->statics_read = copy_static_var_set (node_l->statics_read); ++ node_g->statics_read = copy_static_var_set (node_l->statics_read, true); + if (write_all) + node_g->statics_written = all_module_statics; + else +- node_g->statics_written = copy_static_var_set (node_l->statics_written); ++ node_g->statics_written ++ = copy_static_var_set (node_l->statics_written, true); + + /* Merge the sets of this cycle with all sets of callees reached + from this cycle. */ +@@ -841,12 +900,26 @@ propagate (void) + ipa_reference_vars_info_t node_info; + ipa_reference_global_vars_info_t node_g; + ++ /* No need to produce summaries for inline clones. */ ++ if (node->inlined_to) ++ continue; ++ + node_info = get_reference_vars_info (node); +- if (!node->alias && opt_for_fn (node->decl, flag_ipa_reference) +- && (node->get_availability () > AVAIL_INTERPOSABLE +- || (flags_from_decl_or_type (node->decl) & ECF_LEAF))) ++ if (!node->alias && opt_for_fn (node->decl, flag_ipa_reference)) + { + node_g = &node_info->global; ++ bool read_all = ++ (node_g->statics_read == all_module_statics ++ || bitmap_equal_p (node_g->statics_read, all_module_statics)); ++ bool written_all = ++ (node_g->statics_written == all_module_statics ++ || bitmap_equal_p (node_g->statics_written, ++ all_module_statics)); ++ ++ /* There is no need to produce summary if we collected nothing ++ useful. 
*/ ++ if (read_all && written_all) ++ continue; + + ipa_reference_optimization_summary_d *opt + = ipa_ref_opt_sum_summaries->get_create (node); +@@ -854,27 +927,25 @@ propagate (void) + /* Create the complimentary sets. */ + + if (bitmap_empty_p (node_g->statics_read)) +- opt->statics_not_read = all_module_statics; ++ opt->statics_read = no_module_statics; ++ else if (read_all) ++ opt->statics_read = all_module_statics; + else + { +- opt->statics_not_read ++ opt->statics_read + = BITMAP_ALLOC (&optimization_summary_obstack); +- if (node_g->statics_read != all_module_statics) +- bitmap_and_compl (opt->statics_not_read, +- all_module_statics, +- node_g->statics_read); ++ bitmap_copy (opt->statics_read, node_g->statics_read); + } + + if (bitmap_empty_p (node_g->statics_written)) +- opt->statics_not_written = all_module_statics; ++ opt->statics_written = no_module_statics; ++ else if (written_all) ++ opt->statics_written = all_module_statics; + else + { +- opt->statics_not_written ++ opt->statics_written + = BITMAP_ALLOC (&optimization_summary_obstack); +- if (node_g->statics_written != all_module_statics) +- bitmap_and_compl (opt->statics_not_written, +- all_module_statics, +- node_g->statics_written); ++ bitmap_copy (opt->statics_written, node_g->statics_written); + } + } + } +@@ -892,7 +963,7 @@ propagate (void) + + ipa_ref_var_info_summaries = NULL; + if (dump_file) +- splay_tree_delete (reference_vars_to_consider); ++ vec_free (reference_vars_to_consider); + reference_vars_to_consider = NULL; + return remove_p ? TODO_remove_functions : 0; + } +@@ -907,12 +978,10 @@ write_node_summary_p (struct cgraph_node + ipa_reference_optimization_summary_t info; + + /* See if we have (non-empty) info. */ +- if (!node->definition || node->global.inlined_to) ++ if (!node->definition || node->inlined_to) + return false; + info = get_reference_optimization_summary (node); +- if (!info +- || (bitmap_empty_p (info->statics_not_read) +- && bitmap_empty_p (info->statics_not_written))) ++ if (!info) + return false; + + /* See if we want to encode it. +@@ -925,11 +994,17 @@ write_node_summary_p (struct cgraph_node + && !referenced_from_this_partition_p (node, encoder)) + return false; + +- /* See if the info has non-empty intersections with vars we want to encode. */ +- if (!bitmap_intersect_p (info->statics_not_read, ltrans_statics) +- && !bitmap_intersect_p (info->statics_not_written, ltrans_statics)) +- return false; +- return true; ++ /* See if the info has non-empty intersections with vars we want to ++ encode. */ ++ bitmap_iterator bi; ++ unsigned int i; ++ EXECUTE_IF_AND_COMPL_IN_BITMAP (ltrans_statics, info->statics_read, 0, ++ i, bi) ++ return true; ++ EXECUTE_IF_AND_COMPL_IN_BITMAP (ltrans_statics, info->statics_written, 0, ++ i, bi) ++ return true; ++ return false; + } + + /* Stream out BITS<RANS_STATICS as list of decls to OB. 
+@@ -962,8 +1037,7 @@ stream_out_bitmap (struct lto_simple_out + return; + EXECUTE_IF_AND_IN_BITMAP (bits, ltrans_statics, 0, index, bi) + { +- tree decl = (tree)splay_tree_lookup (reference_vars_to_consider, +- index)->value; ++ tree decl = (*reference_vars_to_consider) [index]; + lto_output_var_decl_index (ob->decl_state, ob->main_stream, decl); + } + } +@@ -981,23 +1055,23 @@ ipa_reference_write_optimization_summary + auto_bitmap ltrans_statics; + int i; + +- reference_vars_to_consider = splay_tree_new (splay_tree_compare_ints, 0, 0); ++ vec_alloc (reference_vars_to_consider, ipa_reference_vars_uids); ++ reference_vars_to_consider->safe_grow (ipa_reference_vars_uids); + + /* See what variables we are interested in. */ + for (i = 0; i < lto_symtab_encoder_size (encoder); i++) + { + symtab_node *snode = lto_symtab_encoder_deref (encoder, i); + varpool_node *vnode = dyn_cast (snode); ++ int id; ++ + if (vnode +- && bitmap_bit_p (all_module_statics, +- ipa_reference_var_uid (vnode->decl)) ++ && (id = ipa_reference_var_uid (vnode->decl)) != -1 + && referenced_from_this_partition_p (vnode, encoder)) + { + tree decl = vnode->decl; +- bitmap_set_bit (ltrans_statics, ipa_reference_var_uid (decl)); +- splay_tree_insert (reference_vars_to_consider, +- ipa_reference_var_uid (decl), +- (splay_tree_value)decl); ++ bitmap_set_bit (ltrans_statics, id); ++ (*reference_vars_to_consider)[id] = decl; + ltrans_statics_bitcount ++; + } + } +@@ -1032,14 +1106,14 @@ ipa_reference_write_optimization_summary + node_ref = lto_symtab_encoder_encode (encoder, snode); + streamer_write_uhwi_stream (ob->main_stream, node_ref); + +- stream_out_bitmap (ob, info->statics_not_read, ltrans_statics, ++ stream_out_bitmap (ob, info->statics_read, ltrans_statics, + ltrans_statics_bitcount); +- stream_out_bitmap (ob, info->statics_not_written, ltrans_statics, ++ stream_out_bitmap (ob, info->statics_written, ltrans_statics, + ltrans_statics_bitcount); + } + } + lto_destroy_simple_output_block (ob); +- splay_tree_delete (reference_vars_to_consider); ++ delete reference_vars_to_consider; + } + + /* Deserialize the ipa info for lto. 
*/ +@@ -1053,10 +1127,15 @@ ipa_reference_read_optimization_summary + unsigned int j = 0; + bitmap_obstack_initialize (&optimization_summary_obstack); + +- if (ipa_ref_opt_sum_summaries == NULL) +- ipa_ref_opt_sum_summaries = new ipa_ref_opt_summary_t (symtab); ++ gcc_checking_assert (ipa_ref_opt_sum_summaries == NULL); ++ ipa_ref_opt_sum_summaries = new ipa_ref_opt_summary_t (symtab); ++ ipa_reference_vars_map = new reference_vars_map_t(257); ++ varpool_node_hooks ++ = symtab->add_varpool_removal_hook (varpool_removal_hook, NULL); ++ ipa_reference_vars_uids = 0; + + all_module_statics = BITMAP_ALLOC (&optimization_summary_obstack); ++ no_module_statics = BITMAP_ALLOC (&optimization_summary_obstack); + + while ((file_data = file_data_vec[j++])) + { +@@ -1081,8 +1160,11 @@ ipa_reference_read_optimization_summary + unsigned int var_index = streamer_read_uhwi (ib); + tree v_decl = lto_file_decl_data_get_var_decl (file_data, + var_index); ++ bool existed; + bitmap_set_bit (all_module_statics, +- ipa_reference_var_uid (v_decl)); ++ ipa_reference_var_get_or_insert_uid ++ (v_decl, &existed)); ++ gcc_checking_assert (!existed); + if (dump_file) + fprintf (dump_file, " %s", fndecl_name (v_decl)); + } +@@ -1102,57 +1184,65 @@ ipa_reference_read_optimization_summary + ipa_reference_optimization_summary_d *info + = ipa_ref_opt_sum_summaries->get_create (node); + +- info->statics_not_read = BITMAP_ALLOC +- (&optimization_summary_obstack); +- info->statics_not_written = BITMAP_ALLOC +- (&optimization_summary_obstack); + if (dump_file) + fprintf (dump_file, +- "\nFunction name:%s:\n static not read:", ++ "\nFunction name:%s:\n static read:", + node->dump_asm_name ()); + +- /* Set the statics not read. */ ++ /* Set the statics read. */ + v_count = streamer_read_hwi (ib); + if (v_count == -1) + { +- info->statics_not_read = all_module_statics; ++ info->statics_read = all_module_statics; + if (dump_file) + fprintf (dump_file, " all module statics"); + } ++ else if (v_count == 0) ++ info->statics_read = no_module_statics; + else +- for (j = 0; j < (unsigned int)v_count; j++) +- { +- unsigned int var_index = streamer_read_uhwi (ib); +- tree v_decl = lto_file_decl_data_get_var_decl (file_data, +- var_index); +- bitmap_set_bit (info->statics_not_read, +- ipa_reference_var_uid (v_decl)); +- if (dump_file) +- fprintf (dump_file, " %s", fndecl_name (v_decl)); +- } ++ { ++ info->statics_read = BITMAP_ALLOC ++ (&optimization_summary_obstack); ++ for (j = 0; j < (unsigned int)v_count; j++) ++ { ++ unsigned int var_index = streamer_read_uhwi (ib); ++ tree v_decl = lto_file_decl_data_get_var_decl (file_data, ++ var_index); ++ bitmap_set_bit (info->statics_read, ++ ipa_reference_var_uid (v_decl)); ++ if (dump_file) ++ fprintf (dump_file, " %s", fndecl_name (v_decl)); ++ } ++ } + + if (dump_file) + fprintf (dump_file, +- "\n static not written:"); +- /* Set the statics not written. */ ++ "\n static written:"); ++ /* Set the statics written. 
*/ + v_count = streamer_read_hwi (ib); + if (v_count == -1) + { +- info->statics_not_written = all_module_statics; ++ info->statics_written = all_module_statics; + if (dump_file) + fprintf (dump_file, " all module statics"); + } ++ else if (v_count == 0) ++ info->statics_written = no_module_statics; + else +- for (j = 0; j < (unsigned int)v_count; j++) +- { +- unsigned int var_index = streamer_read_uhwi (ib); +- tree v_decl = lto_file_decl_data_get_var_decl (file_data, +- var_index); +- bitmap_set_bit (info->statics_not_written, +- ipa_reference_var_uid (v_decl)); +- if (dump_file) +- fprintf (dump_file, " %s", fndecl_name (v_decl)); +- } ++ { ++ info->statics_written = BITMAP_ALLOC ++ (&optimization_summary_obstack); ++ for (j = 0; j < (unsigned int)v_count; j++) ++ { ++ unsigned int var_index = streamer_read_uhwi (ib); ++ tree v_decl = lto_file_decl_data_get_var_decl (file_data, ++ var_index); ++ bitmap_set_bit (info->statics_written, ++ ipa_reference_var_uid (v_decl)); ++ if (dump_file) ++ fprintf (dump_file, " %s", fndecl_name (v_decl)); ++ } ++ } + if (dump_file) + fprintf (dump_file, "\n"); + } +@@ -1233,6 +1323,9 @@ ipa_reference_c_finalize (void) + { + delete ipa_ref_opt_sum_summaries; + ipa_ref_opt_sum_summaries = NULL; ++ delete ipa_reference_vars_map; ++ ipa_reference_vars_map = NULL; ++ symtab->remove_varpool_removal_hook (varpool_node_hooks); + } + + if (ipa_init_p) +diff -Nurp a/gcc/ipa-reference.h b/gcc/ipa-reference.h +--- a/gcc/ipa-reference.h 2020-04-30 15:14:04.580000000 +0800 ++++ b/gcc/ipa-reference.h 2020-04-30 15:14:56.540000000 +0800 +@@ -22,15 +22,10 @@ along with GCC; see the file COPYING3. + #define GCC_IPA_REFERENCE_H + + /* In ipa-reference.c */ +-bitmap ipa_reference_get_not_read_global (struct cgraph_node *fn); +-bitmap ipa_reference_get_not_written_global (struct cgraph_node *fn); ++bitmap ipa_reference_get_read_global (struct cgraph_node *fn); ++bitmap ipa_reference_get_written_global (struct cgraph_node *fn); + void ipa_reference_c_finalize (void); +- +-inline int +-ipa_reference_var_uid (tree t) +-{ +- return DECL_UID (symtab_node::get (t)->ultimate_alias_target (NULL)->decl); +-} ++int ipa_reference_var_uid (tree t); + + #endif /* GCC_IPA_REFERENCE_H */ + +diff -Nurp a/gcc/ipa-utils.c b/gcc/ipa-utils.c +--- a/gcc/ipa-utils.c 2020-04-30 15:14:04.576000000 +0800 ++++ b/gcc/ipa-utils.c 2020-04-30 15:14:56.588000000 +0800 +@@ -103,8 +103,7 @@ searchc (struct searchc_env* env, struct + continue; + + if (w->aux +- && (avail > AVAIL_INTERPOSABLE +- || avail == AVAIL_INTERPOSABLE)) ++ && (avail >= AVAIL_INTERPOSABLE)) + { + w_info = (struct ipa_dfs_info *) w->aux; + if (w_info->new_node) +@@ -297,7 +296,7 @@ ipa_reverse_postorder (struct cgraph_nod + if (!node->aux + && (pass + || (!node->address_taken +- && !node->global.inlined_to ++ && !node->inlined_to + && !node->alias && !node->thunk.thunk_p + && !node->only_called_directly_p ()))) + { +diff -Nurp a/gcc/ipa-utils.h b/gcc/ipa-utils.h +--- a/gcc/ipa-utils.h 2020-04-30 15:14:04.652000000 +0800 ++++ b/gcc/ipa-utils.h 2020-04-30 15:14:56.624000000 +0800 +@@ -47,6 +47,9 @@ void ipa_merge_profiles (struct cgraph_n + struct cgraph_node *src, bool preserve_body = false); + bool recursive_call_p (tree, tree); + ++/* In ipa-prop.c */ ++void ipa_remove_useless_jump_functions (); ++ + /* In ipa-profile.c */ + bool ipa_propagate_frequency (struct cgraph_node *node); + +@@ -54,6 +57,7 @@ bool ipa_propagate_frequency (struct cgr + + struct odr_type_d; + typedef odr_type_d *odr_type; ++extern bool thunk_expansion; + void 
build_type_inheritance_graph (void); + void rebuild_type_inheritance_graph (void); + void update_type_inheritance_graph (void); +@@ -263,5 +267,3 @@ odr_type_p (const_tree t) + } + + #endif /* GCC_IPA_UTILS_H */ +- +- +diff -Nurp a/gcc/ipa-visibility.c b/gcc/ipa-visibility.c +--- a/gcc/ipa-visibility.c 2020-04-30 15:14:04.568000000 +0800 ++++ b/gcc/ipa-visibility.c 2020-04-30 15:14:56.588000000 +0800 +@@ -707,7 +707,7 @@ function_and_variable_visibility (bool w + || DECL_EXTERNAL (node->decl)); + if (cgraph_externally_visible_p (node, whole_program)) + { +- gcc_assert (!node->global.inlined_to); ++ gcc_assert (!node->inlined_to); + node->externally_visible = true; + } + else +diff -Nurp a/gcc/lto/lto.c b/gcc/lto/lto.c +--- a/gcc/lto/lto.c 2020-04-30 15:14:04.664000000 +0800 ++++ b/gcc/lto/lto.c 2020-04-30 15:14:56.552000000 +0800 +@@ -3211,9 +3211,9 @@ do_whole_program_analysis (void) + else + gcc_unreachable (); + +- /* Inline summaries are needed for balanced partitioning. Free them now so ++ /* Size summaries are needed for balanced partitioning. Free them now so + the memory can be used for streamer caches. */ +- ipa_free_fn_summary (); ++ ipa_free_size_summary (); + + /* AUX pointers are used by partitioning code to bookkeep number of + partitions symbol is in. This is no longer needed. */ +diff -Nurp a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c +--- a/gcc/lto/lto-partition.c 2020-04-30 15:14:04.664000000 +0800 ++++ b/gcc/lto/lto-partition.c 2020-04-30 15:14:56.592000000 +0800 +@@ -171,7 +171,7 @@ add_symbol_to_partition_1 (ltrans_partit + { + struct cgraph_edge *e; + if (!node->alias && c == SYMBOL_PARTITION) +- part->insns += ipa_fn_summaries->get (cnode)->size; ++ part->insns += ipa_size_summaries->get (cnode)->size; + + /* Add all inline clones and callees that are duplicated. */ + for (e = cnode->callees; e; e = e->next_callee) +@@ -182,7 +182,7 @@ add_symbol_to_partition_1 (ltrans_partit + + /* Add all thunks associated with the function. 
*/ + for (e = cnode->callers; e; e = e->next_caller) +- if (e->caller->thunk.thunk_p && !e->caller->global.inlined_to) ++ if (e->caller->thunk.thunk_p && !e->caller->inlined_to) + add_symbol_to_partition_1 (part, e->caller); + } + +@@ -233,8 +233,8 @@ contained_in_symbol (symtab_node *node) + if (cgraph_node *cnode = dyn_cast (node)) + { + cnode = cnode->function_symbol (); +- if (cnode->global.inlined_to) +- cnode = cnode->global.inlined_to; ++ if (cnode->inlined_to) ++ cnode = cnode->inlined_to; + return cnode; + } + else if (varpool_node *vnode = dyn_cast (node)) +@@ -291,7 +291,7 @@ undo_partition (ltrans_partition partiti + + if (!node->alias && (cnode = dyn_cast (node)) + && node->get_partitioning_class () == SYMBOL_PARTITION) +- partition->insns -= ipa_fn_summaries->get (cnode)->size; ++ partition->insns -= ipa_size_summaries->get (cnode)->size; + lto_symtab_encoder_delete_node (partition->encoder, node); + node->aux = (void *)((size_t)node->aux - 1); + } +@@ -529,7 +529,7 @@ lto_balanced_map (int n_lto_partitions, + else + order.safe_push (node); + if (!node->alias) +- total_size += ipa_fn_summaries->get (node)->size; ++ total_size += ipa_size_summaries->get (node)->size; + } + + original_total_size = total_size; +diff -Nurp a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c +--- a/gcc/lto/lto-symtab.c 2020-04-30 15:14:04.664000000 +0800 ++++ b/gcc/lto/lto-symtab.c 2020-04-30 15:14:56.592000000 +0800 +@@ -63,7 +63,7 @@ lto_cgraph_replace_node (struct cgraph_n + prevailing_node->forced_by_abi = true; + if (node->address_taken) + { +- gcc_assert (!prevailing_node->global.inlined_to); ++ gcc_assert (!prevailing_node->inlined_to); + prevailing_node->mark_address_taken (); + } + if (node->definition && prevailing_node->definition +@@ -909,7 +909,7 @@ lto_symtab_merge_symbols_1 (symtab_node + cgraph_node *ce = dyn_cast (e); + + if ((!TREE_PUBLIC (e->decl) && !DECL_EXTERNAL (e->decl)) +- || (ce != NULL && ce->global.inlined_to)) ++ || (ce != NULL && ce->inlined_to)) + continue; + symtab_node *to = symtab_node::get (lto_symtab_prevailing_decl (e->decl)); + +diff -Nurp a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c +--- a/gcc/lto-cgraph.c 2020-04-30 15:14:04.636000000 +0800 ++++ b/gcc/lto-cgraph.c 2020-04-30 15:14:56.588000000 +0800 +@@ -329,7 +329,7 @@ reachable_from_other_partition_p (struct + struct cgraph_edge *e; + if (!node->definition) + return false; +- if (node->global.inlined_to) ++ if (node->inlined_to) + return false; + for (e = node->callers; e; e = e->next_caller) + { +@@ -399,7 +399,7 @@ lto_output_node (struct lto_simple_outpu + boundary_p = !lto_symtab_encoder_in_partition_p (encoder, node); + + if (node->analyzed && (!boundary_p || node->alias +- || (node->thunk.thunk_p && !node->global.inlined_to))) ++ || (node->thunk.thunk_p && !node->inlined_to))) + tag = LTO_symtab_analyzed_node; + else + tag = LTO_symtab_unavail_node; +@@ -422,7 +422,7 @@ lto_output_node (struct lto_simple_outpu + && node->get_partitioning_class () == SYMBOL_PARTITION) + { + /* Inline clones cannot be part of boundary. +- gcc_assert (!node->global.inlined_to); ++ gcc_assert (!node->inlined_to); + + FIXME: At the moment they can be, when partition contains an inline + clone that is clone of inline clone from outside partition. 
We can +@@ -468,9 +468,9 @@ lto_output_node (struct lto_simple_outpu + + if (tag == LTO_symtab_analyzed_node) + { +- if (node->global.inlined_to) ++ if (node->inlined_to) + { +- ref = lto_symtab_encoder_lookup (encoder, node->global.inlined_to); ++ ref = lto_symtab_encoder_lookup (encoder, node->inlined_to); + gcc_assert (ref != LCC_NOT_FOUND); + } + else +@@ -884,7 +884,7 @@ compute_ltrans_boundary (lto_symtab_enco + if (!lto_symtab_encoder_in_partition_p (encoder, callee)) + { + /* We should have moved all the inlines. */ +- gcc_assert (!callee->global.inlined_to); ++ gcc_assert (!callee->inlined_to); + add_node_to (encoder, callee, false); + } + } +@@ -911,7 +911,7 @@ compute_ltrans_boundary (lto_symtab_enco + && !lto_symtab_encoder_in_partition_p + (encoder, callee)) + { +- gcc_assert (!callee->global.inlined_to); ++ gcc_assert (!callee->inlined_to); + add_node_to (encoder, callee, false); + } + } +@@ -928,7 +928,7 @@ compute_ltrans_boundary (lto_symtab_enco + if (node->alias && node->analyzed) + create_references (encoder, node); + if (cnode +- && cnode->thunk.thunk_p && !cnode->global.inlined_to) ++ && cnode->thunk.thunk_p && !cnode->inlined_to) + add_node_to (encoder, cnode->callees->callee, false); + while (node->transparent_alias && node->analyzed) + { +@@ -984,7 +984,7 @@ output_symtab (void) + { + node = dyn_cast (lto_symtab_encoder_deref (encoder, i)); + if (node +- && ((node->thunk.thunk_p && !node->global.inlined_to) ++ && ((node->thunk.thunk_p && !node->inlined_to) + || lto_symtab_encoder_in_partition_p (encoder, node))) + { + output_outgoing_cgraph_edges (node->callees, ob, encoder); +@@ -1283,7 +1283,7 @@ input_node (struct lto_file_decl_data *f + input_overwrite_node (file_data, node, tag, &bp); + + /* Store a reference for now, and fix up later to be a pointer. */ +- node->global.inlined_to = (cgraph_node *) (intptr_t) ref; ++ node->inlined_to = (cgraph_node *) (intptr_t) ref; + + if (group) + { +@@ -1542,7 +1542,7 @@ input_cgraph_1 (struct lto_file_decl_dat + int ref; + if (cgraph_node *cnode = dyn_cast (node)) + { +- ref = (int) (intptr_t) cnode->global.inlined_to; ++ ref = (int) (intptr_t) cnode->inlined_to; + + /* We share declaration of builtins, so we may read same node twice. */ + if (!node->aux) +@@ -1551,10 +1551,10 @@ input_cgraph_1 (struct lto_file_decl_dat + + /* Fixup inlined_to from reference to pointer. 
*/ + if (ref != LCC_NOT_FOUND) +- dyn_cast (node)->global.inlined_to ++ dyn_cast (node)->inlined_to + = dyn_cast (nodes[ref]); + else +- cnode->global.inlined_to = NULL; ++ cnode->inlined_to = NULL; + } + + ref = (int) (intptr_t) node->same_comdat_group; +diff -Nurp a/gcc/omp-simd-clone.c b/gcc/omp-simd-clone.c +--- a/gcc/omp-simd-clone.c 2020-04-30 15:14:04.644000000 +0800 ++++ b/gcc/omp-simd-clone.c 2020-04-30 15:14:56.592000000 +0800 +@@ -1635,7 +1635,7 @@ expand_simd_clones (struct cgraph_node * + tree attr = lookup_attribute ("omp declare simd", + DECL_ATTRIBUTES (node->decl)); + if (attr == NULL_TREE +- || node->global.inlined_to ++ || node->inlined_to + || lookup_attribute ("noclone", DECL_ATTRIBUTES (node->decl))) + return; + +diff -Nurp a/gcc/params.def b/gcc/params.def +--- a/gcc/params.def 2020-04-30 15:14:04.560000000 +0800 ++++ b/gcc/params.def 2020-04-30 15:14:56.700000000 +0800 +@@ -1093,6 +1093,18 @@ DEFPARAM (PARAM_IPA_CP_VALUE_LIST_SIZE, + "interprocedural constant propagation.", + 8, 0, 0) + ++DEFPARAM (PARAM_IPA_CP_MIN_RECURSIVE_PROBABILITY, ++ "ipa-cp-min-recursive-probability", ++ "Recursive cloning only when the probability of call being executed " ++ "exceeds the parameter. ", ++ 2, 0, 0) ++ ++DEFPARAM (PARAM_IPA_CP_MAX_RECURSIVE_DEPTH, ++ "ipa-cp-max-recursive-depth", ++ "Threshold ipa-cp opportunity evaluation that is still considered " ++ "Maximum depth of recursive cloning for self-recursive function.", ++ 8, 0, 0) ++ + DEFPARAM (PARAM_IPA_CP_EVAL_THRESHOLD, + "ipa-cp-eval-threshold", + "Threshold ipa-cp opportunity evaluation that is still considered " +@@ -1129,6 +1141,18 @@ DEFPARAM (PARAM_IPA_MAX_AA_STEPS, + "parameter analysis based on alias analysis in any given function.", + 25000, 0, 0) + ++DEFPARAM (PARAM_IPA_MAX_SWITCH_PREDICATE_BOUNDS, ++ "ipa-max-switch-predicate-bounds", ++ "Maximal number of boundary endpoints of case ranges of switch " ++ "statement used during IPA functoin summary generation.", ++ 5, 0, 0) ++ ++DEFPARAM (PARAM_IPA_MAX_PARAM_EXPR_OPS, ++ "ipa-max-param-expr-ops", ++ "Maximum number of operations in a parameter expression that can " ++ "be handled by IPA analysis.", ++ 10, 0, 0) ++ + /* WHOPR partitioning configuration. */ + + DEFPARAM (PARAM_LTO_PARTITIONS, +diff -Nurp a/gcc/passes.c b/gcc/passes.c +--- a/gcc/passes.c 2020-04-30 15:14:04.632000000 +0800 ++++ b/gcc/passes.c 2020-04-30 15:14:56.592000000 +0800 +@@ -3047,7 +3047,7 @@ function_called_by_processed_nodes_p (vo + continue; + if (TREE_ASM_WRITTEN (e->caller->decl)) + continue; +- if (!e->caller->process && !e->caller->global.inlined_to) ++ if (!e->caller->process && !e->caller->inlined_to) + break; + } + if (dump_file && e) +diff -Nurp a/gcc/symtab.c b/gcc/symtab.c +--- a/gcc/symtab.c 2020-04-30 15:14:04.636000000 +0800 ++++ b/gcc/symtab.c 2020-04-30 15:14:56.592000000 +0800 +@@ -1874,7 +1874,7 @@ symtab_node::get_partitioning_class (voi + if (DECL_ABSTRACT_P (decl)) + return SYMBOL_EXTERNAL; + +- if (cnode && cnode->global.inlined_to) ++ if (cnode && cnode->inlined_to) + return SYMBOL_DUPLICATE; + + /* Transparent aliases are always duplicated. */ +@@ -2274,7 +2274,7 @@ symtab_node::binds_to_current_def_p (sym + return true; + + /* Inline clones always binds locally. 
*/ +- if (cnode && cnode->global.inlined_to) ++ if (cnode && cnode->inlined_to) + return true; + + if (DECL_EXTERNAL (decl)) +@@ -2286,7 +2286,7 @@ symtab_node::binds_to_current_def_p (sym + { + cgraph_node *cref = dyn_cast (ref); + if (cref) +- ref = cref->global.inlined_to; ++ ref = cref->inlined_to; + } + + /* If this is a reference from symbol itself and there are no aliases, we +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/flatten.c b/gcc/testsuite/gcc.c-torture/compile/flatten.c +--- a/gcc/testsuite/gcc.c-torture/compile/flatten.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.c-torture/compile/flatten.c 2020-04-30 15:14:56.684000000 +0800 +@@ -0,0 +1,5 @@ ++int you_shall_not_flatten_me () __attribute__ ((flatten)); ++main() ++{ ++ you_shall_not_flatten_me (); ++} +diff -Nurp a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c +--- a/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-2.c 2020-04-30 15:14:56.696000000 +0800 +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -fdump-ipa-cp-details -fno-early-inlining --param ipa-cp-max-recursive-depth=8" } */ ++ ++int fn(); ++ ++int data[100]; ++ ++int recur_fn (int i) ++{ ++ int j; ++ ++ if (i == 6) ++ { ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ fn(); ++ return 10; ++ } ++ ++ data[i] = i; ++ ++ for (j = 0; j < 100; j++) ++ recur_fn (i + 1); ++ ++ return i; ++} ++ ++int main () ++{ ++ int i; ++ ++ for (i = 0; i < 100; i++) ++ recur_fn (1) + recur_fn (-5); ++ ++ return 1; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 12 "cp" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/ipa/ipcp-agg-10.c b/gcc/testsuite/gcc.dg/ipa/ipcp-agg-10.c +--- a/gcc/testsuite/gcc.dg/ipa/ipcp-agg-10.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/ipa/ipcp-agg-10.c 2020-04-30 15:14:56.664000000 +0800 +@@ -0,0 +1,78 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -fdump-ipa-cp-details -fno-inline" } */ ++ ++int data1; ++ ++int callee1(int *v) ++{ ++ if (*v < 2) ++ return 0; ++ else ++ { ++ int t = data1; ++ ++ data1 = *v; ++ *v = t; ++ ++ return 1; ++ } ++} ++ ++int __attribute__((pure)) callee2(int *v) ++{ ++ if (*v < 2) ++ return 0; ++ else ++ { ++ data1 = v[0] + v[2]; ++ ++ return 1; ++ } ++} ++ ++int caller1(int c, int *r) ++{ ++ int a = 1; ++ ++ if (c) ++ return callee1(&a); ++ else ++ { ++ *r = 2; ++ return callee1(r); ++ } ++} ++ ++int data2[200]; ++int data3; ++ ++int __attribute__((const)) gen_cond(int); ++ ++int caller2(void) ++{ ++ int i, j; ++ int sum = 0; ++ int a[8]; ++ ++ a[0] = 3; ++ for (i = 0; i < 100; i++) ++ { ++ if (gen_cond (i)) ++ continue; ++ ++ a[2] = 4; ++ for (j = 0; j < 100; j++) ++ { ++ data2[i + j] = (i ^ j) + data3; ++ ++ sum += callee2(a); ++ } ++ } ++ ++ return sum; ++} ++ ++/* { dg-final { scan-ipa-dump-times "offset: 0, type: int, CONST: 1" 1 "cp" } } */ ++/* { dg-final { scan-ipa-dump-times "offset: 0, type: int, CONST: 2" 1 "cp" } } */ ++/* { dg-final { scan-ipa-dump-times "offset: 0, type: int, CONST: 3" 1 "cp" } } */ ++/* { dg-final { scan-ipa-dump-times "offset: 64, type: int, CONST: 4" 1 "cp" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/ipa/ipcp-agg-11.c b/gcc/testsuite/gcc.dg/ipa/ipcp-agg-11.c +--- a/gcc/testsuite/gcc.dg/ipa/ipcp-agg-11.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/ipa/ipcp-agg-11.c 2020-04-30 15:14:56.664000000 +0800 +@@ -0,0 +1,77 @@ 
++/* { dg-do compile } */ ++/* { dg-options "-O3 -fno-ipa-sra -fdump-ipa-cp-details -fno-early-inlining" } */ ++/* { dg-add-options bind_pic_locally } */ ++ ++struct S ++{ ++ int a, b, c; ++}; ++ ++void *blah(int, void *); ++ ++#define foo_body(p)\ ++{ \ ++ int i, c = (p)->c; \ ++ int b = (p)->b; \ ++ void *v = (void *) (p); \ ++ \ ++ for (i= 0; i< c; i++) \ ++ v = blah(b + i, v); \ ++} ++ ++static void __attribute__ ((noinline)) ++foo_v (struct S s) ++{ ++ foo_body (&s); ++} ++ ++static void __attribute__ ((noinline)) ++foo_r (struct S *p) ++{ ++ foo_body (p); ++} ++ ++static void ++goo_v (int a, int *p) ++{ ++ struct S s; ++ s.a = 101; ++ s.b = a % 7; ++ s.c = *p + 6; ++ foo_v (s); ++} ++ ++static void ++goo_r (int a, struct S n) ++{ ++ struct S s; ++ s.a = 1; ++ s.b = a + 5; ++ s.c = -n.b; ++ foo_r (&s); ++} ++ ++void ++entry () ++{ ++ int a; ++ int v; ++ struct S s; ++ ++ a = 9; ++ v = 3; ++ goo_v (a, &v); ++ ++ a = 100; ++ s.b = 18; ++ goo_r (a, s); ++} ++ ++/* { dg-final { scan-ipa-dump "offset: 0, type: int, CONST: 1" "cp" } } */ ++/* { dg-final { scan-ipa-dump "offset: 32, type: int, PASS THROUGH: 0, op plus_expr 5" "cp" } } */ ++/* { dg-final { scan-ipa-dump "offset: 64, type: int, LOAD AGG: 1 \\\[offset: 32, by value], op negate_expr" "cp" } } */ ++/* { dg-final { scan-ipa-dump "offset: 0, type: int, CONST: 101" "cp" } } */ ++/* { dg-final { scan-ipa-dump "offset: 32, type: int, PASS THROUGH: 0, op trunc_mod_expr 7" "cp" } } */ ++/* { dg-final { scan-ipa-dump "offset: 64, type: int, LOAD AGG: 1 \\\[offset: 0, by reference], op plus_expr 6" "cp" } } */ ++/* { dg-final { scan-ipa-dump "Aggregate replacements: 0\\\[0]=1, 0\\\[32]=105, 0\\\[64]=-18" "cp" } } */ ++/* { dg-final { scan-ipa-dump "Aggregate replacements: 0\\\[0]=101, 0\\\[32]=2, 0\\\[64]=9" "cp" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/ipa/pr91089.c b/gcc/testsuite/gcc.dg/ipa/pr91089.c +--- a/gcc/testsuite/gcc.dg/ipa/pr91089.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/ipa/pr91089.c 2020-04-30 15:14:56.516000000 +0800 +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -fdump-ipa-cp-details -fdump-ipa-fnsummary-details --param ipa-max-switch-predicate-bounds=10 -fno-inline" } */ ++ ++int fn (); ++ ++int data; ++ ++int callee (int i) ++{ ++ switch (i) ++ { ++ case -126: return i + 13; ++ case -127: return i + 5; ++ case -8: return i * i; ++ case 0: return i % 9; ++ case 5: ++ case 7: ++ case 6: return 3; ++ default: ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ fn (); ++ } ++ ++ return data += i; ++} ++ ++int caller () ++{ ++ return callee (-127) + ++ callee (-126) + ++ callee (-8) + ++ callee (0) + ++ callee (5) + ++ callee (6) + ++ callee (7) + ++ callee (100); ++} ++ ++/* { dg-final { scan-ipa-dump-times "Creating a specialized node of callee" 7 "cp" } } */ ++/* { dg-final { scan-ipa-dump "op0 < -127" "fnsummary" } } */ ++/* { dg-final { scan-ipa-dump "op0 > -126" "fnsummary" } } */ ++/* { dg-final { scan-ipa-dump "op0 != -8" "fnsummary" } } */ ++/* { dg-final { scan-ipa-dump "op0 != 0" "fnsummary" } } */ ++/* { dg-final { scan-ipa-dump "op0 < 5" "fnsummary" } } */ ++/* { dg-final { scan-ipa-dump "op0 > 7" "fnsummary" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr46076.c b/gcc/testsuite/gcc.dg/tree-ssa/pr46076.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr46076.c 2020-04-30 15:14:05.756000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr46076.c 
2020-04-30 15:14:56.640000000 +0800 +@@ -19,9 +19,12 @@ main() + { + /* Make sure we perform indirect inlining of one and two and optimize + the result to a constant. */ +- if (print(one) != 3) +- link_error (); +- if (print(two) != 5) +- link_error (); ++ for (int i = 0; i < 100; i++) ++ { ++ if (print(one) != 3) ++ link_error (); ++ if (print(two) != 5) ++ link_error (); ++ } + return 0; + } +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-73.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-73.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-73.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-73.c 2020-04-30 15:14:56.472000000 +0800 +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fdump-tree-fre1" } */ ++ ++typedef int v2si __attribute__((vector_size(__SIZEOF_INT__ * 2))); ++int foo (int *a) ++{ ++ a[0] = 1; ++ a[1] = 2; ++ v2si x = *(v2si *)a; ++ *(v2si *)&a[2] = x; ++ return a[3]; ++} ++ ++/* { dg-final { scan-tree-dump "return 2;" "fre1" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-74.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-74.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-74.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-74.c 2020-04-30 15:14:56.472000000 +0800 +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fdump-tree-fre1" } */ ++ ++typedef int v4si __attribute__((vector_size(__SIZEOF_INT__ * 4))); ++int foo (int *a) ++{ ++ a[2] = 2; ++ a[0] = 0; ++ a[1] = 1; ++ a[3] = 4; ++ v4si x = *(v4si *)a; ++ *(v4si *)&a[4] = x; ++ return a[4] + a[7]; ++} ++ ++/* { dg-final { scan-tree-dump "return 4;" "fre1" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-76.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-76.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-76.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-76.c 2020-04-30 15:14:56.472000000 +0800 +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fdump-tree-fre1" } */ ++ ++typedef int v4si __attribute__((vector_size(__SIZEOF_INT__ * 4))); ++int foo (int *a) ++{ ++ __builtin_memset (a, 0, 2 * __SIZEOF_INT__); ++ a[2] = 2; ++ a[0] = 1; ++ a[3] = 3; ++ v4si x = *(v4si *)a; ++ *(v4si *)&a[4] = x; ++ return a[4] + a[5] + a[7]; ++} ++ ++/* { dg-final { scan-tree-dump "return 4;" "fre1" } } */ +diff -Nurp a/gcc/tree-sra.c b/gcc/tree-sra.c +--- a/gcc/tree-sra.c 2020-04-30 15:14:04.568000000 +0800 ++++ b/gcc/tree-sra.c 2020-04-30 15:14:56.556000000 +0800 +@@ -5488,7 +5488,7 @@ ipa_sra_preliminary_function_checks (str + + if ((DECL_ONE_ONLY (node->decl) || DECL_EXTERNAL (node->decl)) + && ipa_fn_summaries->get (node) +- && ipa_fn_summaries->get (node)->size >= MAX_INLINE_INSNS_AUTO) ++ && ipa_size_summaries->get (node)->size >= MAX_INLINE_INSNS_AUTO) + { + if (dump_file) + fprintf (dump_file, "Function too big to be made truly local.\n"); +diff -Nurp a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c +--- a/gcc/tree-ssa-alias.c 2020-04-30 15:14:04.648000000 +0800 ++++ b/gcc/tree-ssa-alias.c 2020-04-30 15:14:56.540000000 +0800 +@@ -1822,14 +1822,16 @@ ref_maybe_used_by_call_p_1 (gcall *call, + if (callee != NULL_TREE && VAR_P (base) && TREE_STATIC (base)) + { + struct cgraph_node *node = cgraph_node::get (callee); +- bitmap not_read; ++ bitmap read; ++ int id; + + /* FIXME: Callee can be an OMP builtin that does not have a call graph + node yet. We should enforce that there are nodes for all decls in the + IL and remove this check instead. 
*/ + if (node +- && (not_read = ipa_reference_get_not_read_global (node)) +- && bitmap_bit_p (not_read, ipa_reference_var_uid (base))) ++ && (id = ipa_reference_var_uid (base)) != -1 ++ && (read = ipa_reference_get_read_global (node)) ++ && !bitmap_bit_p (read, id)) + goto process_args; + } + +@@ -2217,11 +2219,13 @@ call_may_clobber_ref_p_1 (gcall *call, a + if (callee != NULL_TREE && VAR_P (base) && TREE_STATIC (base)) + { + struct cgraph_node *node = cgraph_node::get (callee); +- bitmap not_written; ++ bitmap written; ++ int id; + + if (node +- && (not_written = ipa_reference_get_not_written_global (node)) +- && bitmap_bit_p (not_written, ipa_reference_var_uid (base))) ++ && (id = ipa_reference_var_uid (base)) != -1 ++ && (written = ipa_reference_get_written_global (node)) ++ && !bitmap_bit_p (written, id)) + return false; + } + +diff -Nurp a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c +--- a/gcc/tree-ssa-sccvn.c 2020-04-30 15:14:04.632000000 +0800 ++++ b/gcc/tree-ssa-sccvn.c 2020-04-30 15:14:56.480000000 +0800 +@@ -21,6 +21,7 @@ along with GCC; see the file COPYING3. + #include "config.h" + #include "system.h" + #include "coretypes.h" ++#include "splay-tree.h" + #include "backend.h" + #include "rtl.h" + #include "tree.h" +@@ -361,6 +362,8 @@ static void init_vn_nary_op_from_stmt (v + static void init_vn_nary_op_from_pieces (vn_nary_op_t, unsigned int, + enum tree_code, tree, tree *); + static tree vn_lookup_simplify_result (gimple_match_op *); ++static vn_reference_t vn_reference_lookup_or_insert_for_pieces ++ (tree, alias_set_type, tree, vec, tree); + + /* Return whether there is value numbering information for a given SSA name. */ + +@@ -1676,20 +1679,245 @@ vn_reference_lookup_1 (vn_reference_t vr + return NULL_TREE; + } + ++ ++/* Partial definition tracking support. */ ++ ++struct pd_range ++{ ++ HOST_WIDE_INT offset; ++ HOST_WIDE_INT size; ++}; ++ ++struct pd_data ++{ ++ tree rhs; ++ HOST_WIDE_INT offset; ++ HOST_WIDE_INT size; ++}; ++ ++/* Context for alias walking. */ ++ + struct vn_walk_cb_data + { + vn_walk_cb_data (vn_reference_t vr_, tree *last_vuse_ptr_, +- vn_lookup_kind vn_walk_kind_, bool tbaa_p_) ++ vn_lookup_kind vn_walk_kind_, bool tbaa_p_) + : vr (vr_), last_vuse_ptr (last_vuse_ptr_), vn_walk_kind (vn_walk_kind_), +- tbaa_p (tbaa_p_) +- {} ++ tbaa_p (tbaa_p_), known_ranges (NULL) ++ {} ++ ~vn_walk_cb_data (); ++ void *push_partial_def (const pd_data& pd, tree, HOST_WIDE_INT); + + vn_reference_t vr; + tree *last_vuse_ptr; + vn_lookup_kind vn_walk_kind; + bool tbaa_p; ++ ++ /* The VDEFs of partial defs we come along. */ ++ auto_vec partial_defs; ++ /* The first defs range to avoid splay tree setup in most cases. */ ++ pd_range first_range; ++ tree first_vuse; ++ splay_tree known_ranges; ++ obstack ranges_obstack; + }; + ++vn_walk_cb_data::~vn_walk_cb_data () ++{ ++ if (known_ranges) ++ { ++ splay_tree_delete (known_ranges); ++ obstack_free (&ranges_obstack, NULL); ++ } ++} ++ ++/* pd_range splay-tree helpers. 
*/ ++ ++static int ++pd_range_compare (splay_tree_key offset1p, splay_tree_key offset2p) ++{ ++ HOST_WIDE_INT offset1 = *(HOST_WIDE_INT *)offset1p; ++ HOST_WIDE_INT offset2 = *(HOST_WIDE_INT *)offset2p; ++ if (offset1 < offset2) ++ return -1; ++ else if (offset1 > offset2) ++ return 1; ++ return 0; ++} ++ ++static void * ++pd_tree_alloc (int size, void *data_) ++{ ++ vn_walk_cb_data *data = (vn_walk_cb_data *)data_; ++ return obstack_alloc (&data->ranges_obstack, size); ++} ++ ++static void ++pd_tree_dealloc (void *, void *) ++{ ++} ++ ++/* Push PD to the vector of partial definitions returning a ++ value when we are ready to combine things with VUSE and MAXSIZEI, ++ NULL when we want to continue looking for partial defs or -1 ++ on failure. */ ++ ++void * ++vn_walk_cb_data::push_partial_def (const pd_data &pd, tree vuse, ++ HOST_WIDE_INT maxsizei) ++{ ++ if (partial_defs.is_empty ()) ++ { ++ partial_defs.safe_push (pd); ++ first_range.offset = pd.offset; ++ first_range.size = pd.size; ++ first_vuse = vuse; ++ last_vuse_ptr = NULL; ++ } ++ else ++ { ++ if (!known_ranges) ++ { ++ /* ??? Optimize the case where the second partial def ++ completes things. */ ++ gcc_obstack_init (&ranges_obstack); ++ known_ranges ++ = splay_tree_new_with_allocator (pd_range_compare, 0, 0, ++ pd_tree_alloc, ++ pd_tree_dealloc, this); ++ splay_tree_insert (known_ranges, ++ (splay_tree_key)&first_range.offset, ++ (splay_tree_value)&first_range); ++ } ++ if (known_ranges) ++ { ++ pd_range newr = { pd.offset, pd.size }; ++ splay_tree_node n; ++ pd_range *r; ++ /* Lookup the predecessor of offset + 1 and see if ++ we need to merge with it. */ ++ HOST_WIDE_INT loffset = newr.offset + 1; ++ if ((n = splay_tree_predecessor (known_ranges, ++ (splay_tree_key)&loffset)) ++ && ((r = (pd_range *)n->value), true) ++ && ranges_known_overlap_p (r->offset, r->size + 1, ++ newr.offset, newr.size)) ++ { ++ /* Ignore partial defs already covered. */ ++ if (known_subrange_p (newr.offset, newr.size, ++ r->offset, r->size)) ++ return NULL; ++ r->size = MAX (r->offset + r->size, ++ newr.offset + newr.size) - r->offset; ++ } ++ else ++ { ++ /* newr.offset wasn't covered yet, insert the ++ range. */ ++ r = XOBNEW (&ranges_obstack, pd_range); ++ *r = newr; ++ splay_tree_insert (known_ranges, ++ (splay_tree_key)&r->offset, ++ (splay_tree_value)r); ++ } ++ /* Merge r which now contains newr and is a member ++ of the splay tree with adjacent overlapping ranges. */ ++ pd_range *rafter; ++ while ((n = splay_tree_successor (known_ranges, ++ (splay_tree_key)&r->offset)) ++ && ((rafter = (pd_range *)n->value), true) ++ && ranges_known_overlap_p (r->offset, r->size + 1, ++ rafter->offset, rafter->size)) ++ { ++ r->size = MAX (r->offset + r->size, ++ rafter->offset + rafter->size) - r->offset; ++ splay_tree_remove (known_ranges, ++ (splay_tree_key)&rafter->offset); ++ } ++ partial_defs.safe_push (pd); ++ ++ /* Now we have merged newr into the range tree. ++ When we have covered [offseti, sizei] then the ++ tree will contain exactly one node which has ++ the desired properties and it will be 'r'. */ ++ if (known_subrange_p (0, maxsizei / BITS_PER_UNIT, ++ r->offset, r->size)) ++ { ++ /* Now simply native encode all partial defs ++ in reverse order. */ ++ unsigned ndefs = partial_defs.length (); ++ /* We support up to 512-bit values (for V8DFmode). */ ++ unsigned char buffer[64]; ++ int len; ++ ++ while (!partial_defs.is_empty ()) ++ { ++ pd_data pd = partial_defs.pop (); ++ if (TREE_CODE (pd.rhs) == CONSTRUCTOR) ++ /* Empty CONSTRUCTOR. 
*/ ++ memset (buffer + MAX (0, pd.offset), ++ 0, MIN ((HOST_WIDE_INT)sizeof (buffer), pd.size)); ++ else ++ { ++ len = native_encode_expr (pd.rhs, ++ buffer + MAX (0, pd.offset), ++ sizeof (buffer - MAX (0, pd.offset)), ++ MAX (0, -pd.offset)); ++ if (len <= 0 ++ || len < (pd.size - MAX (0, -pd.offset))) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Failed to encode %u " ++ "partial definitions\n", ndefs); ++ return (void *)-1; ++ } ++ } ++ } ++ ++ tree type = vr->type; ++ /* Make sure to interpret in a type that has a range ++ covering the whole access size. */ ++ if (INTEGRAL_TYPE_P (vr->type) ++ && maxsizei != TYPE_PRECISION (vr->type)) ++ type = build_nonstandard_integer_type (maxsizei, ++ TYPE_UNSIGNED (type)); ++ tree val = native_interpret_expr (type, buffer, ++ maxsizei / BITS_PER_UNIT); ++ /* If we chop off bits because the types precision doesn't ++ match the memory access size this is ok when optimizing ++ reads but not when called from the DSE code during ++ elimination. */ ++ if (val ++ && type != vr->type) ++ { ++ if (! int_fits_type_p (val, vr->type)) ++ val = NULL_TREE; ++ else ++ val = fold_convert (vr->type, val); ++ } ++ ++ if (val) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Successfully combined %u " ++ "partial definitions\n", ndefs); ++ return vn_reference_lookup_or_insert_for_pieces ++ (first_vuse, ++ vr->set, vr->type, vr->operands, val); ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Failed to interpret %u " ++ "encoded partial definitions\n", ndefs); ++ return (void *)-1; ++ } ++ } ++ } ++ } ++ /* Continue looking for partial defs. */ ++ return NULL; ++} ++ + /* Callback for walk_non_aliased_vuses. Adjusts the vn_reference_t VR_ + with the current VUSE and performs the expression lookup. */ + +@@ -1701,6 +1929,11 @@ vn_reference_lookup_2 (ao_ref *op ATTRIB + vn_reference_s **slot; + hashval_t hash; + ++ /* If we have partial definitions recorded we have to go through ++ vn_reference_lookup_3. */ ++ if (!data->partial_defs.is_empty ()) ++ return NULL; ++ + if (data->last_vuse_ptr) + *data->last_vuse_ptr = vuse; + +@@ -1964,6 +2197,33 @@ public: + static rpo_elim *rpo_avail; + basic_block vn_context_bb; + ++/* Return true if BASE1 and BASE2 can be adjusted so they have the ++ same address and adjust *OFFSET1 and *OFFSET2 accordingly. ++ Otherwise return false. */ ++ ++static bool ++adjust_offsets_for_equal_base_address (tree base1, poly_int64 *offset1, ++ tree base2, poly_int64 *offset2) ++{ ++ poly_int64 soff; ++ if (TREE_CODE (base1) == MEM_REF ++ && TREE_CODE (base2) == MEM_REF) ++ { ++ if (mem_ref_offset (base1).to_shwi (&soff)) ++ { ++ base1 = TREE_OPERAND (base1, 0); ++ *offset1 += soff * BITS_PER_UNIT; ++ } ++ if (mem_ref_offset (base2).to_shwi (&soff)) ++ { ++ base2 = TREE_OPERAND (base2, 0); ++ *offset2 += soff * BITS_PER_UNIT; ++ } ++ return operand_equal_p (base1, base2, 0); ++ } ++ return operand_equal_p (base1, base2, OEP_ADDRESS_OF); ++} ++ + /* Callback for walk_non_aliased_vuses. 
Tries to perform a lookup + from the statement defining VUSE and if not successful tries to + translate *REFP and VR_ through an aggregate copy at the definition +@@ -2175,8 +2435,10 @@ vn_reference_lookup_3 (ao_ref *ref, tree + else + return (void *)-1; + tree len = gimple_call_arg (def_stmt, 2); +- if (known_subrange_p (offset, maxsize, offset2, +- wi::to_poly_offset (len) << LOG2_BITS_PER_UNIT)) ++ HOST_WIDE_INT leni, offset2i, offseti; ++ if (data->partial_defs.is_empty () ++ && known_subrange_p (offset, maxsize, offset2, ++ wi::to_poly_offset (len) << LOG2_BITS_PER_UNIT)) + { + tree val; + if (integer_zerop (gimple_call_arg (def_stmt, 1))) +@@ -2205,6 +2467,19 @@ vn_reference_lookup_3 (ao_ref *ref, tree + return vn_reference_lookup_or_insert_for_pieces + (vuse, vr->set, vr->type, vr->operands, val); + } ++ /* For now handle clearing memory with partial defs. */ ++ else if (integer_zerop (gimple_call_arg (def_stmt, 1)) ++ && tree_to_poly_int64 (len).is_constant (&leni) ++ && offset.is_constant (&offseti) ++ && offset2.is_constant (&offset2i) ++ && maxsize.is_constant (&maxsizei)) ++ { ++ pd_data pd; ++ pd.rhs = build_constructor (NULL_TREE, NULL); ++ pd.offset = offset2i - offseti; ++ pd.size = leni; ++ return data->push_partial_def (pd, vuse, maxsizei); ++ } + } + + /* 2) Assignment from an empty CONSTRUCTOR. */ +@@ -2215,17 +2490,37 @@ vn_reference_lookup_3 (ao_ref *ref, tree + { + tree base2; + poly_int64 offset2, size2, maxsize2; ++ HOST_WIDE_INT offset2i, size2i; + bool reverse; + base2 = get_ref_base_and_extent (gimple_assign_lhs (def_stmt), + &offset2, &size2, &maxsize2, &reverse); + if (known_size_p (maxsize2) + && known_eq (maxsize2, size2) +- && operand_equal_p (base, base2, 0) +- && known_subrange_p (offset, maxsize, offset2, size2)) ++ && adjust_offsets_for_equal_base_address (base, &offset, ++ base2, &offset2)) + { +- tree val = build_zero_cst (vr->type); +- return vn_reference_lookup_or_insert_for_pieces +- (vuse, vr->set, vr->type, vr->operands, val); ++ if (data->partial_defs.is_empty () ++ && known_subrange_p (offset, maxsize, offset2, size2)) ++ { ++ tree val = build_zero_cst (vr->type); ++ return vn_reference_lookup_or_insert_for_pieces ++ (vuse, vr->set, vr->type, vr->operands, val); ++ } ++ else if (maxsize.is_constant (&maxsizei) ++ && maxsizei % BITS_PER_UNIT == 0 ++ && offset.is_constant (&offseti) ++ && offseti % BITS_PER_UNIT == 0 ++ && offset2.is_constant (&offset2i) ++ && offset2i % BITS_PER_UNIT == 0 ++ && size2.is_constant (&size2i) ++ && size2i % BITS_PER_UNIT == 0) ++ { ++ pd_data pd; ++ pd.rhs = gimple_assign_rhs1 (def_stmt); ++ pd.offset = (offset2i - offseti) / BITS_PER_UNIT; ++ pd.size = size2i / BITS_PER_UNIT; ++ return data->push_partial_def (pd, vuse, maxsizei); ++ } + } + } + +@@ -2247,65 +2542,85 @@ vn_reference_lookup_3 (ao_ref *ref, tree + && is_gimple_min_invariant (SSA_VAL (gimple_assign_rhs1 (def_stmt)))))) + { + tree base2; +- HOST_WIDE_INT offset2, size2; ++ poly_int64 offset2, size2, maxsize2; ++ HOST_WIDE_INT offset2i, size2i; + bool reverse; +- base2 = get_ref_base_and_extent_hwi (gimple_assign_lhs (def_stmt), +- &offset2, &size2, &reverse); ++ base2 = get_ref_base_and_extent (gimple_assign_lhs (def_stmt), ++ &offset2, &size2, &maxsize2, &reverse); + if (base2 + && !reverse +- && size2 % BITS_PER_UNIT == 0 +- && offset2 % BITS_PER_UNIT == 0 +- && operand_equal_p (base, base2, 0) +- && known_subrange_p (offseti, maxsizei, offset2, size2)) +- { +- /* We support up to 512-bit values (for V8DFmode). 
*/ +- unsigned char buffer[64]; +- int len; +- +- tree rhs = gimple_assign_rhs1 (def_stmt); +- if (TREE_CODE (rhs) == SSA_NAME) +- rhs = SSA_VAL (rhs); +- unsigned pad = 0; +- if (BYTES_BIG_ENDIAN +- && is_a (TYPE_MODE (TREE_TYPE (rhs)))) +- { +- /* On big-endian the padding is at the 'front' so +- just skip the initial bytes. */ +- fixed_size_mode mode +- = as_a (TYPE_MODE (TREE_TYPE (rhs))); +- pad = GET_MODE_SIZE (mode) - size2 / BITS_PER_UNIT; +- } +- len = native_encode_expr (rhs, +- buffer, sizeof (buffer), +- ((offseti - offset2) / BITS_PER_UNIT +- + pad)); +- if (len > 0 && len * BITS_PER_UNIT >= maxsizei) +- { +- tree type = vr->type; +- /* Make sure to interpret in a type that has a range +- covering the whole access size. */ +- if (INTEGRAL_TYPE_P (vr->type) +- && maxsizei != TYPE_PRECISION (vr->type)) +- type = build_nonstandard_integer_type (maxsizei, +- TYPE_UNSIGNED (type)); +- tree val = native_interpret_expr (type, buffer, +- maxsizei / BITS_PER_UNIT); +- /* If we chop off bits because the types precision doesn't +- match the memory access size this is ok when optimizing +- reads but not when called from the DSE code during +- elimination. */ +- if (val +- && type != vr->type) ++ && known_eq (maxsize2, size2) ++ && multiple_p (size2, BITS_PER_UNIT) ++ && multiple_p (offset2, BITS_PER_UNIT) ++ && adjust_offsets_for_equal_base_address (base, &offset, ++ base2, &offset2) ++ && offset.is_constant (&offseti) ++ && offset2.is_constant (&offset2i) ++ && size2.is_constant (&size2i)) ++ { ++ if (data->partial_defs.is_empty () ++ && known_subrange_p (offseti, maxsizei, offset2, size2)) ++ { ++ /* We support up to 512-bit values (for V8DFmode). */ ++ unsigned char buffer[64]; ++ int len; ++ ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == SSA_NAME) ++ rhs = SSA_VAL (rhs); ++ unsigned pad = 0; ++ if (BYTES_BIG_ENDIAN ++ && is_a (TYPE_MODE (TREE_TYPE (rhs)))) + { +- if (! int_fits_type_p (val, vr->type)) +- val = NULL_TREE; +- else +- val = fold_convert (vr->type, val); ++ /* On big-endian the padding is at the 'front' so ++ just skip the initial bytes. */ ++ fixed_size_mode mode ++ = as_a (TYPE_MODE (TREE_TYPE (rhs))); ++ pad = GET_MODE_SIZE (mode) - size2i / BITS_PER_UNIT; + } +- +- if (val) +- return vn_reference_lookup_or_insert_for_pieces +- (vuse, vr->set, vr->type, vr->operands, val); ++ len = native_encode_expr (rhs, ++ buffer, sizeof (buffer), ++ ((offseti - offset2i) / BITS_PER_UNIT ++ + pad)); ++ if (len > 0 && len * BITS_PER_UNIT >= maxsizei) ++ { ++ tree type = vr->type; ++ /* Make sure to interpret in a type that has a range ++ covering the whole access size. */ ++ if (INTEGRAL_TYPE_P (vr->type) ++ && maxsizei != TYPE_PRECISION (vr->type)) ++ type = build_nonstandard_integer_type (maxsizei, ++ TYPE_UNSIGNED (type)); ++ tree val = native_interpret_expr (type, buffer, ++ maxsizei / BITS_PER_UNIT); ++ /* If we chop off bits because the types precision doesn't ++ match the memory access size this is ok when optimizing ++ reads but not when called from the DSE code during ++ elimination. */ ++ if (val ++ && type != vr->type) ++ { ++ if (! 
int_fits_type_p (val, vr->type)) ++ val = NULL_TREE; ++ else ++ val = fold_convert (vr->type, val); ++ } ++ ++ if (val) ++ return vn_reference_lookup_or_insert_for_pieces ++ (vuse, vr->set, vr->type, vr->operands, val); ++ } ++ } ++ else if (ranges_known_overlap_p (offseti, maxsizei, offset2i, size2i)) ++ { ++ pd_data pd; ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == SSA_NAME) ++ rhs = SSA_VAL (rhs); ++ pd.rhs = rhs; ++ pd.offset = (offset2i - offseti) / BITS_PER_UNIT; ++ pd.size = size2i / BITS_PER_UNIT; ++ return data->push_partial_def (pd, vuse, maxsizei); + } + } + } +@@ -2316,7 +2631,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree + && is_gimple_reg_type (vr->type) + && !contains_storage_order_barrier_p (vr->operands) + && gimple_assign_single_p (def_stmt) +- && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME) ++ && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME ++ /* A subset of partial defs from non-constants can be handled ++ by for example inserting a CONSTRUCTOR, a COMPLEX_EXPR or ++ even a (series of) BIT_INSERT_EXPR hoping for simplifications ++ downstream, not so much for actually doing the insertion. */ ++ && data->partial_defs.is_empty ()) + { + tree base2; + poly_int64 offset2, size2, maxsize2; +@@ -2328,7 +2648,8 @@ vn_reference_lookup_3 (ao_ref *ref, tree + if (!reverse + && known_size_p (maxsize2) + && known_eq (maxsize2, size2) +- && operand_equal_p (base, base2, 0) ++ && adjust_offsets_for_equal_base_address (base, &offset, ++ base2, &offset2) + && known_subrange_p (offset, maxsize, offset2, size2) + /* ??? We can't handle bitfield precision extracts without + either using an alternate type for the BIT_FIELD_REF and +@@ -2363,7 +2684,9 @@ vn_reference_lookup_3 (ao_ref *ref, tree + && gimple_assign_single_p (def_stmt) + && (DECL_P (gimple_assign_rhs1 (def_stmt)) + || TREE_CODE (gimple_assign_rhs1 (def_stmt)) == MEM_REF +- || handled_component_p (gimple_assign_rhs1 (def_stmt)))) ++ || handled_component_p (gimple_assign_rhs1 (def_stmt))) ++ /* Handling this is more complicated, give up for now. */ ++ && data->partial_defs.is_empty ()) + { + tree base2; + int i, j, k; +@@ -2497,7 +2820,9 @@ vn_reference_lookup_3 (ao_ref *ref, tree + || TREE_CODE (gimple_call_arg (def_stmt, 0)) == SSA_NAME) + && (TREE_CODE (gimple_call_arg (def_stmt, 1)) == ADDR_EXPR + || TREE_CODE (gimple_call_arg (def_stmt, 1)) == SSA_NAME) +- && poly_int_tree_p (gimple_call_arg (def_stmt, 2), ©_size)) ++ && poly_int_tree_p (gimple_call_arg (def_stmt, 2), ©_size) ++ /* Handling this is more complicated, give up for now. */ ++ && data->partial_defs.is_empty ()) + { + tree lhs, rhs; + ao_ref r; +diff -Nurp a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c +--- a/gcc/tree-ssa-structalias.c 2020-04-30 15:14:04.644000000 +0800 ++++ b/gcc/tree-ssa-structalias.c 2020-04-30 15:14:56.592000000 +0800 +@@ -7817,7 +7817,7 @@ associate_varinfo_to_alias (struct cgrap + { + if ((node->alias + || (node->thunk.thunk_p +- && ! node->global.inlined_to)) ++ && ! node->inlined_to)) + && node->analyzed + && !node->ifunc_resolver) + insert_vi_for_tree (node->decl, (varinfo_t)data); +@@ -7987,7 +7987,7 @@ ipa_pta_execute (void) + /* Nodes without a body are not interesting. Especially do not + visit clones at this point for now - we get duplicate decls + there for inline clones at least. 
*/ +- if (!node->has_gimple_body_p () || node->global.inlined_to) ++ if (!node->has_gimple_body_p () || node->inlined_to) + continue; + node->get_body (); + diff --git a/ipa-struct-reorg-bugfix.patch b/ipa-struct-reorg-bugfix.patch new file mode 100644 index 0000000..0ec8ba2 --- /dev/null +++ b/ipa-struct-reorg-bugfix.patch @@ -0,0 +1,613 @@ +diff -Nurp a/gcc/fold-const.c b/gcc/fold-const.c +--- a/gcc/fold-const.c 2020-06-16 22:27:46.116000000 -0400 ++++ b/gcc/fold-const.c 2020-06-16 22:27:58.412000000 -0400 +@@ -7165,15 +7165,9 @@ fold_plusminus_mult_expr (location_t loc + increased the number of multiplications necessary. */ + && TREE_CODE (arg10) != INTEGER_CST) + { +- HOST_WIDE_INT tmp1 = int01 / int11; +- HOST_WIDE_INT t = exact_log2 (absu_hwi (int11)); +- HOST_WIDE_INT size = tree_to_shwi (TYPE_SIZE_UNIT (TREE_TYPE (arg00))) * BITS_PER_UNIT; +- HOST_WIDE_INT sign_bit = HOST_WIDE_INT_1U << (size - t - 1); +- if (tmp1 & sign_bit) +- tmp1 |= HOST_WIDE_INT_M1U << (size - t); +- tree tmp2 = build_int_cst (TREE_TYPE (arg00), tmp1); + alt0 = fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg00), arg00, +- tmp2); ++ build_int_cst (TREE_TYPE (arg00), ++ int01 / int11)); + alt1 = arg10; + same = maybe_same; + if (swap) +diff -Nurp a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c 2020-06-16 22:27:46.116000000 -0400 ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c 2020-06-16 22:33:18.968000000 -0400 +@@ -112,6 +112,23 @@ is_va_list_type (tree type) + return TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (va_list_type_node); + } + ++static const char * ++get_type_name (tree type) ++{ ++ const char *tname = NULL; ++ if (TYPE_NAME (type) != NULL) ++ { ++ if (TREE_CODE (TYPE_NAME (type)) == IDENTIFIER_NODE) ++ { ++ tname = IDENTIFIER_POINTER (TYPE_NAME (type)); ++ } ++ else if (DECL_NAME (TYPE_NAME (type)) != NULL) ++ { ++ tname = IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (type))); ++ } ++ } ++ return tname; ++} + + /* Return the inner most type for arrays and pointers of TYPE. */ + +@@ -463,10 +480,10 @@ srtype::analyze (void) + if (fields.length () == 2) + fields[1]->clusternum = 1; + +- /* REMOVEME: FIXME: this is here for testing more testcases. */ ++ /* FIXME: Currently Return. */ + if (fields.length () >= 3) + { +- fields[1]->clusternum = 1; ++ return; + } + } + +@@ -875,6 +892,7 @@ private: + void analyze_types (void); + void clear_visited (void); + bool create_new_types (void); ++ void restore_field_type (void); + void create_new_decls (void); + srdecl *find_decl (tree); + void create_new_functions (void); +@@ -1096,6 +1114,11 @@ ipa_struct_reorg::record_type (tree type + { + tree t = TREE_TYPE (field); + process_union (t); ++ if (TREE_CODE (inner_type (t)) == UNION_TYPE ++ || TREE_CODE (inner_type (t)) == QUAL_UNION_TYPE) ++ { ++ type1->mark_escape (escape_union, NULL); ++ } + if (isvolatile_type (t)) + type1->mark_escape (escape_volatile, NULL); + escape_type e = escape_type_volatile_array_or_ptrptr (t); +@@ -2818,6 +2841,49 @@ ipa_struct_reorg::analyze_types (void) + } + } + ++/* When struct A has a struct B member, B's type info ++ is not stored in ++ TYPE_FIELDS (TREE_TYPE (TYPE_FIELDS (typeA))) ++ Try to restore B's type information. 
*/ ++void ++ipa_struct_reorg::restore_field_type (void) ++{ ++ for (unsigned i = 0; i < types.length (); i++) ++ { ++ for (unsigned j = 0; j < types[i]->fields.length (); j++) ++ { ++ srfield *field = types[i]->fields[j]; ++ if (TREE_CODE (inner_type (field->fieldtype)) == RECORD_TYPE) ++ { ++ /* If field type has TYPE_FIELDS information, ++ we do not need to do this. */ ++ if (TYPE_FIELDS (field->type->type) != NULL) ++ { ++ continue; ++ } ++ for (unsigned k = 0; k < types.length (); k++) ++ { ++ if (i == k) ++ { ++ continue; ++ } ++ const char *type1 = get_type_name (field->type->type); ++ const char *type2 = get_type_name (types[k]->type); ++ if (type1 == NULL || type2 == NULL) ++ { ++ continue; ++ } ++ if (type1 == type2 ++ && TYPE_FIELDS (types[k]->type)) ++ { ++ field->type = types[k]; ++ } ++ } ++ } ++ } ++ } ++} ++ + /* Create all new types we want to create. */ + + bool +@@ -3669,7 +3735,7 @@ ipa_struct_reorg::rewrite_functions (voi + { + unsigned retval = 0; + +- ++ restore_field_type (); + /* Create new types, if we did not create any new types, + then don't rewrite any accesses. */ + if (!create_new_types ()) +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/20170404-1.c b/gcc/testsuite/gcc.c-torture/compile/20170404-1.c +--- a/gcc/testsuite/gcc.c-torture/compile/20170404-1.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.c-torture/compile/20170404-1.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,19 +0,0 @@ +-struct a +-{ +- int t, t1; +-}; +- +-static struct a *b; +- +-void *xmalloc(int); +- +- +-void f(void) +-{ +- b = xmalloc (sizeof(*b)); +-} +- +-int g(void) +-{ +- return b->t; +-} +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/nested-3.c b/gcc/testsuite/gcc.c-torture/compile/nested-3.c +--- a/gcc/testsuite/gcc.c-torture/compile/nested-3.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.c-torture/compile/nested-3.c 2020-06-16 22:27:58.416000000 -0400 +@@ -1,4 +1,3 @@ +-/* This used to crash Struct reorg. */ + struct a + { + int t; +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c b/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c +--- a/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,18 +0,0 @@ +-#include +-typedef struct { +- long laststart_offset; +- unsigned regnum; +-} compile_stack_elt_t; +-typedef struct { +- compile_stack_elt_t *stack; +- unsigned size; +-} compile_stack_type; +-void f (const char *p, const char *pend, int c) +-{ +- compile_stack_type compile_stack; +- while (p != pend) +- if (c) +- compile_stack.stack = realloc (compile_stack.stack, +- (compile_stack.size << 1) +- * sizeof (compile_stack_elt_t)); +-} +diff -Nurp a/gcc/testsuite/gcc.dg/pr33136-4.c b/gcc/testsuite/gcc.dg/pr33136-4.c +--- a/gcc/testsuite/gcc.dg/pr33136-4.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.dg/pr33136-4.c 1969-12-31 19:00:00.000000000 -0500 +@@ -1,59 +0,0 @@ +-/* PR tree-optimization/33136 */ +-/* { dg-do run } */ +-/* { dg-options "-O2" } */ +- +-extern void abort (void); +- +-struct S +-{ +- int b; +- int *c; +-}; +-static int d, e; +- +-static struct S s; +- +-static int * +-__attribute__((noinline, const)) +-foo (void) +-{ +- return &s.b; +-} +- +-int * +-__attribute__((noinline)) +-bar (int **f) +-{ +- s.c = &d; +- *f = &e; +- /* As nothing ever takes the address of any int * field in struct S, +- the write to *f can't alias with the s.c field. 
*/ +- return s.c; +-} +- +-int +-__attribute__((noinline)) +-baz (int *x) +-{ +- s.b = 1; +- *x = 4; +- /* Function foo takes address of an int field in struct S, +- so *x can alias with the s.b field (and it does in this testcase). */ +- return s.b; +-} +- +-int +-__attribute__((noinline)) +-t (void) +-{ +- int *f = (int *) 0; +- return 10 * (bar (&f) != &d) + baz (foo ()); +-} +- +-int +-main (void) +-{ +- if (t () != 4) +- abort (); +- return 0; +-} +diff -Nurp a/gcc/testsuite/gcc.dg/struct/struct_reorg-1.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-1.c +--- a/gcc/testsuite/gcc.dg/struct/struct_reorg-1.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-1.c 2020-06-16 22:27:58.436000000 -0400 +@@ -0,0 +1,24 @@ ++// { dg-do compile } ++// { dg-options "-O3 -flto-partition=one -fipa-struct-reorg -fdump-ipa-all" } ++ ++struct a ++{ ++ int t, t1; ++}; ++ ++static struct a *b; ++ ++void *xmalloc(int); ++ ++ ++void f(void) ++{ ++ b = xmalloc (sizeof(*b)); ++} ++ ++int g(void) ++{ ++ return b->t; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/struct_reorg-2.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-2.c +--- a/gcc/testsuite/gcc.dg/struct/struct_reorg-2.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-2.c 2020-06-16 22:27:58.440000000 -0400 +@@ -0,0 +1,29 @@ ++// { dg-do run } ++ ++#include ++ ++struct a ++{ ++ int t; ++ int t1; ++}; ++ ++__attribute__((noinline)) int f(int i, int j) ++{ ++ struct a *t; ++ struct a t1 = {i, j}; ++ t = &t1; ++ auto int g(void) __attribute__((noinline)); ++ int g(void) ++ { ++ return t->t + t->t1; ++ } ++ return g(); ++} ++ ++int main() ++{ ++ assert (f(1, 2) == 3); ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/struct_reorg-3.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-3.c +--- a/gcc/testsuite/gcc.dg/struct/struct_reorg-3.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-3.c 2020-06-16 22:27:58.440000000 -0400 +@@ -0,0 +1,23 @@ ++// { dg-do compile } ++// { dg-options "-O3 -flto-partition=one -fipa-struct-reorg -fdump-ipa-all" } ++ ++#include ++typedef struct { ++ long laststart_offset; ++ unsigned regnum; ++} compile_stack_elt_t; ++typedef struct { ++ compile_stack_elt_t *stack; ++ unsigned size; ++} compile_stack_type; ++void f (const char *p, const char *pend, int c) ++{ ++ compile_stack_type compile_stack; ++ while (p != pend) ++ if (c) ++ compile_stack.stack = realloc (compile_stack.stack, ++ (compile_stack.size << 1) ++ * sizeof (compile_stack_elt_t)); ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/struct_reorg-4.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-4.c +--- a/gcc/testsuite/gcc.dg/struct/struct_reorg-4.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-4.c 2020-06-16 22:27:58.440000000 -0400 +@@ -0,0 +1,59 @@ ++/* { dg-do run } */ ++ ++extern void abort (void); ++ ++struct S ++{ ++ int b; ++ int *c; ++}; ++static int d, e; ++ ++static struct S s; ++ ++static int * ++__attribute__((noinline, const)) ++foo (void) ++{ ++ return &s.b; ++} ++ ++int * ++__attribute__((noinline)) ++bar (int **f) ++{ ++ s.c = &d; ++ *f = &e; ++ /* As nothing ever takes the address of any int * field in struct S, ++ the write to *f can't alias with the s.c 
field. */ ++ return s.c; ++} ++ ++int ++__attribute__((noinline)) ++baz (int *x) ++{ ++ s.b = 1; ++ *x = 4; ++ /* Function foo takes address of an int field in struct S, ++ so *x can alias with the s.b field (and it does in this testcase). */ ++ return s.b; ++} ++ ++int ++__attribute__((noinline)) ++t (void) ++{ ++ int *f = (int *) 0; ++ return 10 * (bar (&f) != &d) + baz (foo ()); ++} ++ ++int ++main (void) ++{ ++ if (t () != 4) ++ abort (); ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp 2020-06-16 22:27:58.440000000 -0400 +@@ -1,5 +1,4 @@ +-# Copyright (C) 2007, 2008, 2009, 2010 +-# Free Software Foundation, Inc. ++# Copyright (C) 1997-2019 Free Software Foundation, Inc. + + # This program is free software; you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by +@@ -12,12 +11,9 @@ + # GNU General Public License for more details. + # + # You should have received a copy of the GNU General Public License +-# along with this program; see the file COPYING3. If not see ++# along with GCC; see the file COPYING3. If not see + # . + +-# Test the functionality of programs compiled with profile-directed structure +-# rearrangement using -fprofile-generate followed by -fprofile-use. +- + load_lib gcc-dg.exp + load_lib target-supports.exp + +@@ -26,62 +22,14 @@ dg-init + torture-init + + set STRUCT_REORG_TORTURE_OPTIONS [list \ +- { -O1 } \ +- { -O1 -g } \ +- { -O2 } \ +- { -O2 -g } \ +- { -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions } \ +- { -O3 -g } \ +- { -Os } ] +- ++ { -O3 } \ ++ { -Ofast } ] + +-set-torture-options $STRUCT_REORG_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS ++set-torture-options $STRUCT_REORG_TORTURE_OPTIONS {{}} + +-gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wo_prof_*.c]] "" "-fipa-struct-reorg -fdump-ipa-all -fwhole-program" ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++ "" "-fipa-struct-reorg -fdump-ipa-all -flto-partition=one -fwhole-program" + ++# All done. + torture-finish +-dg-final +- +-# Some targets don't support tree profiling. +-if { ![check_profiling_available ""] } { +- return +-} +- +-# The procedures in profopt.exp need these parameters. +-set tool gcc +-set prof_ext "gcda" +- +-# Override the list defined in profopt.exp. +-set PROFOPT_OPTIONS [list {}] +- +-if $tracelevel then { +- strace $tracelevel +-} +- +-# Load support procs. +-load_lib profopt.exp +- +-# These are globals used by profopt-execute. The first is options +-# needed to generate profile data, the second is options to use the +-# profile data. +-set common "-O3 -fwhole-program" +-set profile_option [concat $common " -fprofile-generate"] +-set feedback_option [concat $common " -fprofile-use -fipa-struct-reorg -fdump-ipa-all"] +- +-foreach src [lsort [glob -nocomplain $srcdir/$subdir/w_prof_*.c]] { +- # If we're only testing specific files and this isn't one of them, skip it. 
+- if ![runtest_file_p $runtests $src] then { +- continue +- } +- profopt-execute $src +-} +- +-set feedback_option [concat $feedback_option " --param struct-reorg-cold-struct-ratio=30"] +- +-foreach src [lsort [glob -nocomplain $srcdir/$subdir/w_ratio_*.c]] { +- # If we're only testing specific files and this isn't one of them, skip it. +- if ![runtest_file_p $runtests $src] then { +- continue +- } +- profopt-execute $src +-} ++dg-finish +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c 2020-06-16 22:27:58.440000000 -0400 +@@ -28,4 +28,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final { scan-ipa-dump "has escaped...Type is used in an array" "struct_reorg" } } */ ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c b/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c 2020-06-16 22:27:58.440000000 -0400 +@@ -38,5 +38,5 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* The structure str_t is erroneously peeled into 4 structures instead of 2. */ +-/* { dg-final { scan-ipa-dump "the number of new types is 2" "struct_reorg" } } */ ++/* Two more fields structure is not splitted. */ ++/* { dg-final { scan-ipa-dump "No structures to transform." "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c b/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c 2020-06-16 22:27:58.440000000 -0400 +@@ -26,4 +26,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c b/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c 2020-06-16 22:27:46.120000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c 2020-06-16 22:27:58.440000000 -0400 +@@ -39,4 +39,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c b/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c 2020-06-16 22:27:46.124000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c 2020-06-16 22:27:58.472000000 -0400 +@@ -34,4 +34,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { 
xfail *-*-* } } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c b/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c 2020-06-16 22:27:46.124000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c 2020-06-16 22:27:58.472000000 -0400 +@@ -37,4 +37,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c b/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c 2020-06-16 22:27:46.124000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c 2020-06-16 22:27:58.472000000 -0400 +@@ -28,4 +28,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_two_strs.c b/gcc/testsuite/gcc.dg/struct/w_prof_two_strs.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_two_strs.c 2020-06-16 22:27:46.124000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_two_strs.c 2020-06-16 22:27:58.472000000 -0400 +@@ -61,4 +61,4 @@ main () + } + + /*--------------------------------------------------------------------------*/ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 2" "struct_reorg" } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c b/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c +--- a/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c 2020-06-16 22:27:46.124000000 -0400 ++++ b/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c 2020-06-16 22:27:58.472000000 -0400 +@@ -40,4 +40,4 @@ main () + + /*--------------------------------------------------------------------------*/ + /* Arrays are not handled. */ +-/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/g++.dg/torture/pr38355.C b/gcc/testsuite/g++.dg/torture/pr38355.C +--- a/gcc/testsuite/g++.dg/torture/pr38355.C 2020-06-16 22:27:46.124000000 -0400 ++++ b/gcc/testsuite/g++.dg/torture/pr38355.C 1969-12-31 19:00:00.000000000 -0500 +@@ -1,25 +0,0 @@ +-// { dg-do run } +-// { dg-options "-fwhole-program -fipa-struct-reorg" } +-template struct A +-{ +- char c; +- void foo(int); +- void bar(int i) { foo(i+1); } +-}; +- +-template struct B : virtual A<0> {}; +- +-template inline void baz(B& b, int i) +-{ +- if (i) b.bar(0); +-} +- +-extern template class A<0>; +-extern template void baz(B<0>&, int); +- +-int main() +-{ +- B<0> b; +- baz(b, 0); +- return 0; +-} diff --git a/ipa-struct-reorg.patch b/ipa-struct-reorg.patch new file mode 100644 index 0000000..cf3ae23 --- /dev/null +++ b/ipa-struct-reorg.patch @@ -0,0 +1,5846 @@ +This backport contains 1 patch from gcc personal branch tree. 
+ +ipa-struct-reorg-2019-06-07-Update-with-Andrew-Pinski-s-struct-reorg-patch.patch +commit 6e1bd1c900533c627b5e4fbbecb41dcd7974b522 + +The original of this commit can be found on + https://gcc.gnu.org/git/?p=gcc-old.git;a=shortlog;h=refs/heads/sje/struct-reorg + +diff -Nurp a/gcc/common.opt b/gcc/common.opt +--- a/gcc/common.opt 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/common.opt 2020-06-16 22:56:07.720000000 -0400 +@@ -1762,8 +1762,8 @@ Common Ignore + Does nothing. Preserved for backward compatibility. + + fipa-struct-reorg +-Common Ignore +-Does nothing. Preserved for backward compatibility. ++Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization ++Perform structure layout optimizations. + + fipa-vrp + Common Report Var(flag_ipa_vrp) Optimization +diff -Nurp a/gcc/configure b/gcc/configure +--- a/gcc/configure 2020-03-12 07:08:30.000000000 -0400 ++++ b/gcc/configure 2020-06-16 22:56:07.724000000 -0400 +@@ -31614,7 +31614,7 @@ $as_echo "$as_me: executing $ac_file com + "depdir":C) $SHELL $ac_aux_dir/mkinstalldirs $DEPDIR ;; + "gccdepdir":C) + ${CONFIG_SHELL-/bin/sh} $ac_aux_dir/mkinstalldirs build/$DEPDIR +- for lang in $subdirs c-family common ++ for lang in $subdirs c-family common ipa-struct-reorg + do + ${CONFIG_SHELL-/bin/sh} $ac_aux_dir/mkinstalldirs $lang/$DEPDIR + done ;; +diff -Nurp a/gcc/configure.ac b/gcc/configure.ac +--- a/gcc/configure.ac 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/configure.ac 2020-06-16 22:56:07.724000000 -0400 +@@ -1170,7 +1170,7 @@ AC_CHECK_HEADERS(ext/hash_map) + ZW_CREATE_DEPDIR + AC_CONFIG_COMMANDS([gccdepdir],[ + ${CONFIG_SHELL-/bin/sh} $ac_aux_dir/mkinstalldirs build/$DEPDIR +- for lang in $subdirs c-family common ++ for lang in $subdirs c-family common ipa-struct-reorg + do + ${CONFIG_SHELL-/bin/sh} $ac_aux_dir/mkinstalldirs $lang/$DEPDIR + done], [subdirs="$subdirs" ac_aux_dir=$ac_aux_dir DEPDIR=$DEPDIR]) +diff -Nurp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +--- a/gcc/doc/invoke.texi 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/doc/invoke.texi 2020-06-16 22:56:07.728000000 -0400 +@@ -420,6 +420,7 @@ Objective-C and Objective-C++ Dialects}. + -finline-functions -finline-functions-called-once -finline-limit=@var{n} @gol + -finline-small-functions -fipa-cp -fipa-cp-clone @gol + -fipa-bit-cp -fipa-vrp -fipa-pta -fipa-profile -fipa-pure-const @gol ++-fipa-struct-reorg @gol + -fipa-reference -fipa-reference-addressable @gol + -fipa-stack-alignment -fipa-icf -fira-algorithm=@var{algorithm} @gol + -flive-patching=@var{level} @gol +@@ -9312,6 +9313,19 @@ Enabled by default at @option{-O} and hi + Reduce stack alignment on call sites if possible. + Enabled by default. + ++@item -fipa-struct-reorg ++@opindex fipa-struct-reorg ++Perform structure reorganization optimization, that change C-like structures ++layout in order to better utilize spatial locality. This transformation is ++affective for programs containing arrays of structures. Available in two ++compilation modes: profile-based (enabled with @option{-fprofile-generate}) ++or static (which uses built-in heuristics). It works only in whole program ++mode, so it requires @option{-fwhole-program} to be ++enabled. Structures considered @samp{cold} by this transformation are not ++affected (see @option{--param struct-reorg-cold-struct-ratio=@var{value}}). ++ ++With this flag, the program debug info reflects a new structure layout. 
++ + @item -fipa-pta + @opindex fipa-pta + Perform interprocedural pointer analysis and interprocedural modification +@@ -11025,6 +11039,15 @@ In each case, the @var{value} is an inte + @var{name} are: + + @table @gcctabopt ++@item struct-reorg-cold-struct-ratio ++The threshold ratio (as a percentage) between a structure frequency ++and the frequency of the hottest structure in the program. This parameter ++is used by struct-reorg optimization enabled by @option{-fipa-struct-reorg}. ++We say that if the ratio of a structure frequency, calculated by profiling, ++to the hottest structure frequency in the program is less than this ++parameter, then structure reorganization is not applied to this structure. ++The default is 10. ++ + @item predictable-branch-outcome + When branch is predicted to be taken with probability lower than this threshold + (in percent), then it is considered well predictable. +diff -Nurp a/gcc/fold-const.c b/gcc/fold-const.c +--- a/gcc/fold-const.c 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/fold-const.c 2020-06-16 22:56:07.732000000 -0400 +@@ -7165,9 +7165,15 @@ fold_plusminus_mult_expr (location_t loc + increased the number of multiplications necessary. */ + && TREE_CODE (arg10) != INTEGER_CST) + { ++ HOST_WIDE_INT tmp1 = int01 / int11; ++ HOST_WIDE_INT t = exact_log2 (absu_hwi (int11)); ++ HOST_WIDE_INT size = tree_to_shwi (TYPE_SIZE_UNIT (TREE_TYPE (arg00))) * BITS_PER_UNIT; ++ HOST_WIDE_INT sign_bit = HOST_WIDE_INT_1U << (size - t - 1); ++ if (tmp1 & sign_bit) ++ tmp1 |= HOST_WIDE_INT_M1U << (size - t); ++ tree tmp2 = build_int_cst (TREE_TYPE (arg00), tmp1); + alt0 = fold_build2_loc (loc, MULT_EXPR, TREE_TYPE (arg00), arg00, +- build_int_cst (TREE_TYPE (arg00), +- int01 / int11)); ++ tmp2); + alt1 = arg10; + same = maybe_same; + if (swap) +diff -Nurp a/gcc/ipa-struct-reorg/escapes.def b/gcc/ipa-struct-reorg/escapes.def +--- a/gcc/ipa-struct-reorg/escapes.def 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/ipa-struct-reorg/escapes.def 2020-06-16 22:56:07.732000000 -0400 +@@ -0,0 +1,60 @@ ++/* Copyright (C) 2016 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++/* Before including this file, you should define a macro: ++ DEF_ESCAPE (ENUM, TEXT) ++ ++ This macro will be called once for each escape reason. The ++ ENUM will be of type "escape_type". The TEXT is describing ++ the reason for the escape. 
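(Editorial sketch; not part of the patch.) The comment above describes the usual GCC ".def" idiom: the includer defines DEF_ESCAPE, includes escapes.def once to build the enum and once to build the parallel reason strings, and the file #undefs the macro at its end. The real consumer is presumably ipa-struct-reorg.h elsewhere in this patch; the declarations below are only an illustration, although the names escape_type and escape_type_string do appear in the pass source further down.

/* Build the enumeration of escape reasons ...  */
enum escape_type
{
  does_not_escape,
#define DEF_ESCAPE(ENUM, TEXT) ENUM,
#include "escapes.def"
  escape_max_escape
};

/* ... and the matching table of reason strings.  Entry N of the table
   describes enum value N + 1, which is why the pass indexes it as
   escape_type_string[escapes - 1].  */
static const char *escape_type_string[] =
{
#define DEF_ESCAPE(ENUM, TEXT) TEXT,
#include "escapes.def"
};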
++*/ ++DEF_ESCAPE (escape_marked_as_used, "Type used in variable marked as used") ++DEF_ESCAPE (escape_via_global_var, "Type used via a external visible variable") ++DEF_ESCAPE (escape_via_global_init, "Type used via a global init of a variable") ++DEF_ESCAPE (escape_non_supported_allocator, "Type used by allocation which is not currently supported") ++DEF_ESCAPE (escape_dependent_type_escapes, "Type uses a type which escapes or is used by a type which escapes") ++DEF_ESCAPE (escape_var_arg_function, "Types escapes via a variable argument function") ++DEF_ESCAPE (escape_bitfields, "Types has bitfields") ++DEF_ESCAPE (escape_recusive_type, "Type has a recusive relationship") ++DEF_ESCAPE (escape_variable_sized_array, "Type has a variable sized type") ++DEF_ESCAPE (escape_external_function, "Type escapes via an external function call") ++DEF_ESCAPE (escape_visible_function, "Type escapes via expternally visible function call") ++DEF_ESCAPE (escape_pointer_function, "Type escapes via an function pointer call") ++DEF_ESCAPE (escape_unkown_field, "Type escapes via an unkown field accessed") ++DEF_ESCAPE (escape_union, "Type escapes via an union") ++DEF_ESCAPE (escape_inline_asm, "Type escapes via inline-asm") ++DEF_ESCAPE (escape_non_multiply_size, "Type escapes a pointer plus which is not a multiplicate of the size") ++DEF_ESCAPE (escape_cast_void, "Type escapes a cast to/from void*") ++DEF_ESCAPE (escape_cast_another_ptr, "Type escapes a cast to a different pointer") ++DEF_ESCAPE (escape_cast_int, "Type escapes a cast from/to intergral type") ++DEF_ESCAPE (escape_int_const, "Type escapes via integer constant") ++DEF_ESCAPE (escape_vce, "Type escapes via a VIEW_CONVERT_EXPR") ++DEF_ESCAPE (escape_array_access, "Type escapes via an array access") ++DEF_ESCAPE (escape_noclonable_function, "Type escapes via a non-clonable function") ++DEF_ESCAPE (escape_rescusive_type, "Recusive type") ++DEF_ESCAPE (escape_user_alignment, "Type has an user alignment set") ++DEF_ESCAPE (escape_volatile, "Type has an variable which is volatile") ++DEF_ESCAPE (escape_non_eq, "Type has a comparison other than equals or not equals") ++DEF_ESCAPE (escape_addr, "Type escapes via taking the address of field") ++DEF_ESCAPE (escape_cannot_change_signature, "Type used in a call that cannot change signature") ++DEF_ESCAPE (escape_non_optimize, "Type used by a function which turns off struct reorg") ++DEF_ESCAPE (escape_array, "Type is used in an array [not handled yet]") ++DEF_ESCAPE (escape_ptr_ptr, "Type is used in a pointer to a pointer [not handled yet]") ++DEF_ESCAPE (escape_return, "Type escapes via a return [not handled yet]") ++ ++#undef DEF_ESCAPE +diff -Nurp a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c 2020-06-16 22:56:27.548000000 -0400 +@@ -0,0 +1,3840 @@ ++/* Struct-reorg optimizations. ++ Copyright (C) 2016-2017 Free Software Foundation, Inc. ++ Contributed by Andrew Pinski ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++/* This pass implements the structure reorganization organization (struct-reorg). ++ Right now it handles just splitting off the hottest fields for a struct of 2 fields: ++ struct s { ++ type1 field1; // Hot field ++ type2 field2; ++ }; ++ s *v; ++ into: ++ struct s_hot { ++ type1 field1; ++ }; ++ struct c_cold { ++ type2 field2; ++ }; ++ s_hot *v_hot; ++ s_cold *v_cold; ++ ++ TODO: This pass can be extended to more fields, and other alogrothims like reordering. ++ ++ This pass operate in four stages: ++ 1. All of the field accesses, declarations (struct types and pointers to that type) ++ and struct types are scanned and recorded. This includes global declarations. ++ Also record all allocation and freeing sites; this is needed for the rewriting ++ phase. ++ ++ FIXME: If there is a top-level inline-asm, the pass immediately returns. ++ ++ 2. Prune out the types which are considered escaping. ++ Examples of types which are considered escaping: ++ 1. A declaration has been marked as having the attribute used or has user defined ++ alignment (type too). ++ 2. Accesses are via a BIT_FIELD_REF. FIXME: Handle VECTOR_TYPE for this case. ++ 3. The "allocation" site is not a known builtin function. ++ 4. Casting to/from an integer. ++ ++ 3. Analyze the types for which optimization to do. ++ a. Split the fields into two different structs. ++ (FIXME: two field case handled only) ++ Look at all structs which contain two fields, if one of the fields is hotter ++ then split it and put it on the rewritting for accesses. ++ Allocations and freeing are marked to split into two functions; all uses of ++ that type will now be considered as two. ++ b. Reorder fields hottest to the coldest. TODO: Implement. ++ ++ 4. Rewrite each access and allocation and free which is marked as rewriting. ++ ++ */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "tm.h" ++#include "tree.h" ++#include "tree-pass.h" ++#include "cgraph.h" ++#include "diagnostic-core.h" ++#include "function.h" ++#include "basic-block.h" ++#include "gimple.h" ++#include "vec.h" ++#include "tree-pretty-print.h" ++#include "gimple-pretty-print.h" ++#include "gimple-iterator.h" ++#include "cfg.h" ++#include "ssa.h" ++#include "tree-dfa.h" ++#include "fold-const.h" ++#include "tree-inline.h" ++#include "stor-layout.h" ++#include "tree-into-ssa.h" ++#include "tree-cfg.h" ++#include "symbol-summary.h" ++#include "alloc-pool.h" ++#include "ipa-prop.h" ++#include "ipa-struct-reorg.h" ++#include "tree-eh.h" ++#include "bitmap.h" ++#include "ipa-param-manipulation.h" ++#include "tree-ssa-live.h" /* For remove_unused_locals. */ ++ ++#define VOID_POINTER_P(type) (POINTER_TYPE_P (type) && VOID_TYPE_P (TREE_TYPE (type))) ++ ++namespace { ++ ++using namespace struct_reorg; ++ ++/* Return true iff TYPE is stdarg va_list type. */ ++ ++static inline bool ++is_va_list_type (tree type) ++{ ++ return TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (va_list_type_node); ++} ++ ++ ++/* Return the inner most type for arrays and pointers of TYPE. */ ++ ++tree ++inner_type (tree type) ++{ ++ while (POINTER_TYPE_P (type) ++ || TREE_CODE (type) == ARRAY_TYPE) ++ type = TREE_TYPE (type); ++ return type; ++} ++ ++/* Return true if TYPE is a type which struct reorg should handled. 
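(Editorial illustration; not part of the patch.) A source-level sketch of the transformation the overview above describes: a two-field structure split into a hot and a cold part, with the allocation doubled and every field access redirected. The "after" form is a hand-written approximation of what the rewrite stage aims for, not compiler output; the s_hot/s_cold naming follows the example in the comment.

#include <stdlib.h>

/* Before: the user's type and code.  */
struct s { int field1; double field2; };

double
before (size_t n)
{
  struct s *v = malloc (n * sizeof (struct s));
  v[0].field1 = 1;
  v[0].field2 = 2.0;
  double r = v[0].field1 + v[0].field2;
  free (v);
  return r;
}

/* After (conceptually): one structure, one allocation and one pointer
   per cluster, each field access redirected to its cluster.  */
struct s_hot  { int    field1; };
struct s_cold { double field2; };

double
after (size_t n)
{
  struct s_hot  *v_hot  = malloc (n * sizeof (struct s_hot));
  struct s_cold *v_cold = malloc (n * sizeof (struct s_cold));
  v_hot[0].field1 = 1;
  v_cold[0].field2 = 2.0;
  double r = v_hot[0].field1 + v_cold[0].field2;
  free (v_hot);
  free (v_cold);
  return r;
}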
*/ ++ ++bool ++handled_type (tree type) ++{ ++ type = inner_type (type); ++ if (TREE_CODE (type) == RECORD_TYPE) ++ return !is_va_list_type (type); ++ return false; ++} ++ ++} // anon namespace ++ ++namespace struct_reorg { ++ ++/* Constructor of srfunction. */ ++ ++srfunction::srfunction (cgraph_node *n) ++ : node (n), ++ old (NULL), ++ newnode (NULL), ++ newf (NULL) ++{ ++} ++ ++/* Add an ARG to the list of arguments for the function. */ ++ ++void ++srfunction::add_arg(srdecl *arg) ++{ ++ args.safe_push(arg); ++} ++ ++/* Dump the SRFUNCTION to the file FILE. */ ++ ++void ++srfunction::dump (FILE *file) ++{ ++ if (node) ++ { ++ fprintf (file, "function : "); ++ print_generic_expr (file, node->decl); ++ fprintf (file, " with arguments: "); ++ for (unsigned i = 0; i < args.length (); i++) ++ { ++ if (i == 0) ++ fprintf (file, "\n "); ++ else ++ fprintf (file, "\n, "); ++ args[i]->dump (file); ++ } ++ ++ fprintf (file, "\nuses globals: "); ++ for(unsigned i = 0; i < globals.length (); i++) ++ { ++ fprintf (file, "\n "); ++ globals[i]->dump (file); ++ } ++ ++ fprintf (file, "\ndecls: "); ++ } ++ else ++ fprintf (file, "globals : "); ++ ++ for(unsigned i = 0; i < decls.length (); i++) ++ { ++ fprintf (file, "\n "); ++ decls[i]->dump (file); ++ } ++} ++ ++/* Simple dump the SRFUNCTION to the file FILE; used so it is not recusive. */ ++ ++void ++srfunction::simple_dump (FILE *file) ++{ ++ print_generic_expr (file, node->decl); ++} ++ ++ ++/* Constructor of FIELD. */ ++ ++srfield::srfield (tree field, srtype *base) ++ : offset(int_byte_position (field)), ++ fieldtype (TREE_TYPE (field)), ++ fielddecl (field), ++ base(base), ++ type(NULL), ++ clusternum(0) ++{ ++ for(int i = 0;i < max_split; i++) ++ newfield[i] = NULL_TREE; ++} ++ ++/* Constructor of TYPE. */ ++ ++srtype::srtype (tree type) ++ : type (type), ++ chain_type (false), ++ escapes (does_not_escape), ++ visited (false) ++{ ++ for (int i = 0; i < max_split; i++) ++ newtype[i] = NULL_TREE; ++ ++ for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ if (DECL_BIT_FIELD (field)) ++ { ++ escapes = escape_bitfields; ++ continue; ++ } ++ else if (!DECL_SIZE (field) ++ || TREE_CODE (DECL_SIZE (field)) != INTEGER_CST) ++ { ++ escapes = escape_variable_sized_array; ++ break; ++ } ++ srfield *t = new srfield (field, this); ++ fields.safe_push(t); ++ } ++ } ++} ++ ++/* Mark the type as escaping type E at statement STMT. */ ++ ++void ++srtype::mark_escape (escape_type e, gimple *stmt) ++{ ++ /* Once the type has escaped, it should never ++ change back to non escaping. */ ++ gcc_assert (e != does_not_escape); ++ if (has_escaped ()) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nO type: "); ++ simple_dump (dump_file); ++ fprintf (dump_file, " has already escaped."); ++ fprintf (dump_file, " old = \"%s\" ", escape_type_string[escapes - 1]); ++ fprintf (dump_file, " new = \"%s\"\n", escape_type_string[e - 1]); ++ if (stmt) ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ escapes = e; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nN type: "); ++ simple_dump (dump_file); ++ fprintf (dump_file, " new = \"%s\"\n", escape_reason ()); ++ if (stmt) ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Add FIELD to the list of fields that use this type. 
*/ ++ ++void ++srtype::add_field_site (srfield *field) ++{ ++ field_sites.safe_push(field); ++} ++ ++ ++/* Constructor of DECL. */ ++ ++srdecl::srdecl (srtype *tp, tree decl, int argnum) ++ : type (tp), ++ decl (decl), ++ func (NULL_TREE), ++ argumentnum (argnum), ++ visited (false) ++{ ++ if (TREE_CODE (decl) == SSA_NAME) ++ func = current_function_decl; ++ else if (!is_global_var (decl)) ++ func = DECL_CONTEXT (decl); ++ for(int i = 0;i < max_split; i++) ++ newdecl[i] = NULL_TREE; ++} ++ ++/* Find DECL in the function. */ ++ ++srdecl * ++srfunction::find_decl (tree decl) ++{ ++ for (unsigned i = 0; i < decls.length (); i++) ++ if (decls[i]->decl == decl) ++ return decls[i]; ++ return NULL; ++} ++ ++/* Record DECL of the TYPE with argument num ARG. */ ++ ++srdecl * ++srfunction::record_decl (srtype *type, tree decl, int arg) ++{ ++ // Search for the decl to see if it is already there. ++ srdecl *decl1 = find_decl (decl); ++ ++ if (decl1) ++ return decl1; ++ ++ gcc_assert (type); ++ ++ decl1 = new srdecl (type, decl, arg); ++ decls.safe_push(decl1); ++ return decl1; ++} ++ ++/* Find the field at OFF offset. */ ++ ++srfield * ++srtype::find_field (unsigned HOST_WIDE_INT off) ++{ ++ unsigned int i; ++ srfield *field; ++ ++ /* FIXME: handle array/struct field inside the current struct. */ ++ /* NOTE This does not need to be fixed to handle libquatumn */ ++ FOR_EACH_VEC_ELT (fields, i, field) ++ { ++ if (off == field->offset) ++ return field; ++ } ++ return NULL; ++} ++ ++/* Add the function FN to the list of functions if it ++ is there not already. */ ++ ++void ++srtype::add_function (srfunction *fn) ++{ ++ unsigned decluid; ++ unsigned i; ++ decluid = DECL_UID (fn->node->decl); ++ ++ srfunction *fn1; ++ // Search for the decl to see if it is already there. ++ FOR_EACH_VEC_ELT (functions, i, fn1) ++ { ++ if (DECL_UID (fn1->node->decl) == decluid) ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Recording new function: %u.\n", decluid); ++ ++ functions.safe_push(fn); ++} ++ ++/* Dump out the type structure to FILE. */ ++ ++void ++srtype::dump (FILE *f) ++{ ++ unsigned int i; ++ srfield *field; ++ srfunction *fn; ++ sraccess *access; ++ ++ if (chain_type) ++ fprintf (f, "chain decl "); ++ ++ fprintf (f, "type : "); ++ print_generic_expr (f, type); ++ fprintf (f, "(%d) { ", TYPE_UID (type)); ++ if (escapes != does_not_escape) ++ fprintf (f, " escapes = \"%s\"\n", escape_reason ()); ++ fprintf (f, " fields = { "); ++ FOR_EACH_VEC_ELT (fields, i, field) ++ { ++ if (i == 0) ++ fprintf (f, "\n "); ++ else ++ fprintf (f, "\n, "); ++ field->dump (f); ++ } ++ fprintf (f, " }\n "); ++ fprintf (f, "\n accesses = {"); ++ FOR_EACH_VEC_ELT (accesses, i, access) ++ { ++ fprintf (f, "\n"); ++ access->dump (f); ++ } ++ fprintf (f, " }\n "); ++ fprintf (f, "\n functions = {"); ++ FOR_EACH_VEC_ELT (functions, i, fn) ++ { ++ fprintf (f, " \n"); ++ fn->simple_dump (f); ++ } ++ fprintf (f, "\n }\n"); ++ fprintf (f, "\n field_sites = {"); ++ FOR_EACH_VEC_ELT (field_sites, i, field) ++ { ++ fprintf (f, " \n"); ++ field->simple_dump (f); ++ } ++ fprintf (f, "\n }\n"); ++ fprintf (f, "}\n"); ++} ++ ++/* A simplified dump out the type structure to FILE. */ ++ ++void ++srtype::simple_dump (FILE *f) ++{ ++ print_generic_expr (f, type); ++} ++ ++/* Analyze the type and decide what to be done with it. */ ++ ++void ++srtype::analyze (void) ++{ ++ /* Chain decl types can't be split ++ so don't try. 
*/ ++ if (chain_type) ++ return; ++ ++ /* If there is only one field then there is nothing ++ to be done. */ ++ if (fields.length () == 1) ++ return; ++ ++ /* For now we unconditionally split only structures with 2 fields ++ into 2 different structures. In future we intend to add profile ++ info and/or static heuristics to differentiate splitting process. */ ++ if (fields.length () == 2) ++ fields[1]->clusternum = 1; ++ ++ /* REMOVEME: FIXME: this is here for testing more testcases. */ ++ if (fields.length () >= 3) ++ { ++ fields[1]->clusternum = 1; ++ } ++} ++ ++/* Create the new fields for this field. */ ++ ++void ++srfield::create_new_fields (tree newtype[max_split], ++ tree newfields[max_split], ++ tree newlast[max_split]) ++{ ++ tree nt[max_split]; ++ ++ for (unsigned i = 0; i < max_split; i++) ++ nt[i] = NULL; ++ ++ if (type == NULL) ++ nt[0] = fieldtype; ++ else ++ memcpy (nt, type->newtype, sizeof(type->newtype)); ++ ++ for (unsigned i = 0; i < max_split && nt[i] != NULL; i++) ++ { ++ tree field = make_node (FIELD_DECL); ++ if (nt[1] != NULL && DECL_NAME (fielddecl)) ++ { ++ const char *tname = IDENTIFIER_POINTER (DECL_NAME (fielddecl)); ++ char id[10]; ++ char *name; ++ ++ sprintf(id, "%d", i); ++ name = concat (tname, ".reorg.", id, NULL); ++ DECL_NAME (field) = get_identifier (name); ++ free (name); ++ } ++ else ++ DECL_NAME (field) = DECL_NAME (fielddecl); ++ ++ TREE_TYPE (field) = reconstruct_complex_type (TREE_TYPE (fielddecl), nt[i]); ++ DECL_SOURCE_LOCATION (field) = DECL_SOURCE_LOCATION (fielddecl); ++ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); ++ DECL_USER_ALIGN (field) = DECL_USER_ALIGN (fielddecl); ++ TREE_ADDRESSABLE (field) = TREE_ADDRESSABLE (fielddecl); ++ DECL_NONADDRESSABLE_P (field) = !TREE_ADDRESSABLE (fielddecl); ++ TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (fielddecl); ++ DECL_CONTEXT (field) = newtype[clusternum]; ++ ++ if (newfields[clusternum] == NULL) ++ newfields[clusternum] = newlast[clusternum] = field; ++ else ++ { ++ DECL_CHAIN (newlast[clusternum]) = field; ++ newlast[clusternum] = field; ++ } ++ newfield[i] = field; ++ } ++ ++} ++ ++/* Create the new TYPE corresponding to THIS type. */ ++ ++bool ++srtype::create_new_type (void) ++{ ++ /* If the type has been visited, ++ then return if a new type was ++ created or not. */ ++ if (visited) ++ return has_new_type (); ++ ++ visited = true; ++ ++ if (escapes != does_not_escape) ++ { ++ newtype[0] = type; ++ return false; ++ } ++ ++ bool createnewtype = false; ++ unsigned maxclusters = 0; ++ ++ /* Create a new type for each field. */ ++ for (unsigned i = 0; i < fields.length (); i++) ++ { ++ srfield *field = fields[i]; ++ if (field->type) ++ createnewtype |= field->type->create_new_type (); ++ if (field->clusternum > maxclusters) ++ maxclusters = field->clusternum; ++ } ++ ++ /* If the fields' types did have a change or ++ we are not splitting the struct into two clusters, ++ then just return false and don't change the type. */ ++ if (!createnewtype && maxclusters == 0) ++ { ++ newtype[0] = type; ++ return false; ++ } ++ ++ /* Should have at most max_split clusters. 
*/ ++ gcc_assert (maxclusters < max_split); ++ ++ tree newfields[max_split]; ++ tree newlast[max_split]; ++ ++ maxclusters++; ++ ++ const char *tname = NULL; ++ ++ if (TYPE_NAME (type) != NULL) ++ { ++ if (TREE_CODE (TYPE_NAME (type)) == IDENTIFIER_NODE) ++ tname = IDENTIFIER_POINTER (TYPE_NAME (type)); ++ else if (DECL_NAME (TYPE_NAME (type)) != NULL) ++ tname = IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (type))); ++ } ++ ++ for (unsigned i = 0; i < maxclusters; i++) ++ { ++ newfields[i] = NULL_TREE; ++ newlast[i] = NULL_TREE; ++ newtype[i] = make_node (RECORD_TYPE); ++ ++ char *name = NULL; ++ char id[10]; ++ sprintf(id, "%d", i); ++ if (tname) ++ { ++ name = concat (tname, ".reorg.", id, NULL); ++ TYPE_NAME (newtype[i]) = get_identifier (name); ++ free (name); ++ } ++ } ++ ++ for (unsigned i = 0; i < fields.length (); i++) ++ { ++ srfield *f = fields[i]; ++ f->create_new_fields (newtype, newfields, newlast); ++ } ++ ++ ++ /* No reason to warn about these structs since the warning would ++ have happened already. */ ++ int save_warn_padded = warn_padded; ++ warn_padded = 0; ++ ++ for (unsigned i = 0; i < maxclusters; i++) ++ { ++ TYPE_FIELDS (newtype[i]) = newfields[i]; ++ layout_type (newtype[i]); ++ } ++ ++ warn_padded = save_warn_padded; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Created %d types:\n", maxclusters); ++ for (unsigned i = 0; i < maxclusters; i++) ++ { ++ print_generic_expr (dump_file, newtype[i]); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ ++ return true; ++} ++ ++/* Helper function to copy some attributes from ORIG_DECL to the NEW_DECL. */ ++ ++static inline void ++copy_var_attributes (tree new_decl, tree orig_decl) ++{ ++ DECL_ARTIFICIAL (new_decl) = 1; ++ DECL_EXTERNAL (new_decl) = DECL_EXTERNAL (orig_decl); ++ TREE_STATIC (new_decl) = TREE_STATIC (orig_decl); ++ TREE_PUBLIC (new_decl) = TREE_PUBLIC (orig_decl); ++ TREE_USED (new_decl) = TREE_USED (orig_decl); ++ DECL_CONTEXT (new_decl) = DECL_CONTEXT (orig_decl); ++ TREE_THIS_VOLATILE (new_decl) = TREE_THIS_VOLATILE (orig_decl); ++ TREE_ADDRESSABLE (new_decl) = TREE_ADDRESSABLE (orig_decl); ++ TREE_READONLY (new_decl) = TREE_READONLY (orig_decl); ++ if (is_global_var (orig_decl)) ++ set_decl_tls_model (new_decl, DECL_TLS_MODEL (orig_decl)); ++} ++ ++/* Create all of the new decls (SSA_NAMES included) for THIS function. */ ++ ++void ++srfunction::create_new_decls (void) ++{ ++ /* If this function has been cloned, we don't need to ++ create the new decls. */ ++ if (newnode) ++ return; ++ ++ if (node) ++ set_cfun (DECL_STRUCT_FUNCTION (node->decl)); ++ ++ for (unsigned i = 0; i < decls.length (); i++) ++ { ++ srdecl *decl = decls[i]; ++ srtype *type = decl->type; ++ /* If the type of the decl does not change, ++ then don't create a new decl. */ ++ if (!type->has_new_type ()) ++ { ++ decl->newdecl[0] = decl->decl; ++ continue; ++ } ++ ++ /* Handle SSA_NAMEs. */ ++ if (TREE_CODE (decl->decl) == SSA_NAME) ++ { ++ tree newtype1[max_split]; ++ tree inner = SSA_NAME_VAR (decl->decl); ++ tree newinner[max_split]; ++ memset (newinner, 0, sizeof(newinner)); ++ for (unsigned j = 0; j < max_split && type->newtype[j]; j++) ++ newtype1[j] = reconstruct_complex_type (TREE_TYPE (decls[i]->decl), type->newtype[j]); ++ if (inner) ++ { ++ srdecl *in = find_decl (inner); ++ gcc_assert (in); ++ memcpy (newinner, in->newdecl, sizeof(newinner)); ++ } ++ tree od = decls[i]->decl; ++ /* Create the new ssa names and copy some attributes from the old one. 
*/ ++ for (unsigned j = 0; j < max_split && type->newtype[j]; j++) ++ { ++ tree nd = make_ssa_name (newinner[j] ? newinner[j] : newtype1[j]); ++ decl->newdecl[j] = nd; ++ /* If the old decl was a default defition, handle it specially. */ ++ if (SSA_NAME_IS_DEFAULT_DEF (od)) ++ { ++ SSA_NAME_IS_DEFAULT_DEF (nd) = true; ++ SSA_NAME_DEF_STMT (nd) = gimple_build_nop (); ++ ++ /* Set the default definition for the ssaname if needed. */ ++ if (inner) ++ { ++ gcc_assert (newinner[j]); ++ set_ssa_default_def (cfun, newinner[j], nd); ++ } ++ } ++ SSA_NAME_OCCURS_IN_ABNORMAL_PHI (nd) ++ = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (od); ++ statistics_counter_event (cfun, "Create new ssa_name", 1); ++ } ++ } ++ else if (TREE_CODE (decls[i]->decl) == VAR_DECL) ++ { ++ tree orig_var = decl->decl; ++ const char *tname = NULL; ++ if (DECL_NAME (orig_var)) ++ tname = IDENTIFIER_POINTER (DECL_NAME (orig_var)); ++ for (unsigned j = 0; j < max_split && type->newtype[j]; j++) ++ { ++ tree new_name = NULL; ++ char *name = NULL; ++ char id[10]; ++ sprintf(id, "%d", j); ++ if (tname) ++ { ++ name = concat (tname, ".reorg.", id, NULL); ++ new_name = get_identifier (name); ++ free (name); ++ } ++ tree newtype1 = reconstruct_complex_type (TREE_TYPE (orig_var), type->newtype[j]); ++ decl->newdecl[j] = build_decl (DECL_SOURCE_LOCATION (orig_var), ++ VAR_DECL, new_name, newtype1); ++ copy_var_attributes (decl->newdecl[j], orig_var); ++ if (!is_global_var (orig_var)) ++ add_local_decl (cfun, decl->newdecl[j]); ++ else ++ varpool_node::add (decl->newdecl[j]); ++ statistics_counter_event (cfun, "Create new var decl", 1); ++ } ++ } ++ /* Paramater decls are already handled in create_new_functions. */ ++ else if (TREE_CODE (decls[i]->decl) == PARM_DECL) ++ ; ++ else ++ internal_error ("Unhandled decl type stored"); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Created New decls for decl:\n"); ++ fprintf (dump_file, "\n"); ++ decls[i]->dump (dump_file); ++ fprintf (dump_file, "\n"); ++ for (unsigned j = 0; j < max_split && decls[i]->newdecl[j]; j++) ++ { ++ print_generic_expr (dump_file, decls[i]->newdecl[j]); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ } ++ ++ set_cfun (NULL); ++ ++} ++ ++/* Dump out the field structure to FILE. */ ++ ++void ++srfield::dump (FILE *f) ++{ ++ fprintf (f, "field (%d) { ", DECL_UID (fielddecl)); ++ fprintf (f, "base = "); ++ base->simple_dump (f); ++ fprintf (f, ", offset = " HOST_WIDE_INT_PRINT_DEC, offset); ++ fprintf (f, ", type = "); ++ print_generic_expr (f, fieldtype); ++ if (type) ++ { ++ fprintf (f, "( srtype = "); ++ type->simple_dump (f); ++ fprintf (f, ")"); ++ } ++ fprintf (f, "\n}\n"); ++} ++ ++ ++/* A simplified dump out the field structure to FILE. */ ++ ++void ++srfield::simple_dump (FILE *f) ++{ ++ fprintf (f, "field (%d)", DECL_UID (fielddecl)); ++} ++ ++/* Dump out the access structure to FILE. */ ++ ++void ++sraccess::dump (FILE *f) ++{ ++ fprintf (f, "access { "); ++ fprintf (f, "type = '("); ++ type->simple_dump (f); ++ fprintf (f, ")'"); ++ if (field) ++ { ++ fprintf (f, ", field = '("); ++ field->simple_dump (f); ++ fprintf (f, ")'"); ++ } ++ else ++ fprintf (f, ", whole type"); ++ fprintf (f, " in function: %s/%d", node->name (), node->order); ++ fprintf (f, ", stmt:\n"); ++ print_gimple_stmt (f, stmt, 0); ++ fprintf (f, "\n }\n"); ++ ++} ++ ++/* Dump out the decl structure to FILE. 
*/ ++ ++void ++srdecl::dump (FILE *file) ++{ ++ if (!func) ++ fprintf (file, "global "); ++ if (argumentnum != -1) ++ fprintf (file, "argument(%d) ", argumentnum); ++ fprintf (file, "decl: "); ++ print_generic_expr (file, decl); ++ fprintf (file, " type: "); ++ type->simple_dump (file); ++} ++ ++} // namespace struct_reorg ++ ++namespace { ++ ++struct ipa_struct_reorg ++{ ++ // Constructors ++ ipa_struct_reorg(void) ++ : current_function (NULL), ++ done_recording(false) ++ { ++ } ++ ++ // public methods ++ unsigned execute(void); ++ void mark_type_as_escape (tree type, escape_type, gimple *stmt = NULL); ++private: ++ // fields ++ auto_vec_del types; ++ auto_vec_del functions; ++ srglobal globals; ++ srfunction *current_function; ++ ++ bool done_recording; ++ ++ // private methods ++ void dump_types (FILE *f); ++ void dump_types_escaped (FILE *f); ++ void dump_functions (FILE *f); ++ void record_accesses (void); ++ void detect_cycles (void); ++ bool walk_field_for_cycles (srtype*); ++ void prune_escaped_types (void); ++ void propagate_escape (void); ++ void analyze_types (void); ++ void clear_visited (void); ++ bool create_new_types (void); ++ void create_new_decls (void); ++ srdecl *find_decl (tree); ++ void create_new_functions (void); ++ void create_new_args (cgraph_node *new_node); ++ unsigned rewrite_functions (void); ++ srdecl *record_var (tree decl, escape_type escapes = does_not_escape, int arg = -1); ++ srfunction *record_function (cgraph_node *node); ++ srfunction *find_function (cgraph_node *node); ++ srtype *record_type (tree type); ++ void process_union (tree type); ++ srtype *find_type (tree type); ++ void maybe_record_stmt (cgraph_node *, gimple *); ++ void maybe_record_assign (cgraph_node *, gassign *); ++ void maybe_record_call (cgraph_node *, gcall *); ++ void maybe_record_allocation_site (cgraph_node *, gimple *); ++ void record_stmt_expr (tree expr, cgraph_node *node, gimple *stmt); ++ void mark_expr_escape(tree, escape_type, gimple *stmt); ++ tree allocate_size (srtype *t, gimple *stmt); ++ ++ void mark_decls_in_as_not_needed (tree fn); ++ ++ bool rewrite_stmt (gimple*, gimple_stmt_iterator *); ++ bool rewrite_assign (gassign *, gimple_stmt_iterator *); ++ bool rewrite_call (gcall *, gimple_stmt_iterator *); ++ bool rewrite_cond (gcond *, gimple_stmt_iterator *); ++ bool rewrite_debug (gimple *, gimple_stmt_iterator *); ++ bool rewrite_phi (gphi *); ++ bool rewrite_expr (tree expr, tree newexpr[max_split], bool ignore_missing_decl = false); ++ bool rewrite_lhs_rhs (tree lhs, tree rhs, tree newlhs[max_split], tree newrhs[max_split]); ++ bool get_type_field (tree expr, tree &base, bool &indirect, srtype *&type, srfield *&field, bool &realpart, bool &imagpart, bool &address, bool should_create = false, bool can_escape = false); ++ bool wholeaccess (tree expr, tree base, tree accesstype, srtype *t); ++ ++ void check_definition (srdecl *decl, vec&); ++ void check_uses (srdecl *decl, vec&); ++ void check_use (srdecl *decl, gimple *stmt, vec&); ++ void check_type_and_push (tree newdecl, srtype *type, vec &worklist, gimple *stmt); ++ void check_other_side (srdecl *decl, tree other, gimple *stmt, vec &worklist); ++ ++ void find_vars (gimple *stmt); ++ void find_var (tree expr, gimple *stmt); ++ void mark_types_asm (gasm *astmt); ++ ++ bool has_rewritten_type (srfunction*); ++ void maybe_mark_or_record_other_side (tree side, tree other, gimple *stmt); ++}; ++ ++/* Dump all of the recorded types to file F. 
*/ ++ ++void ++ipa_struct_reorg::dump_types (FILE *f) ++{ ++ unsigned i; ++ srtype *type; ++ FOR_EACH_VEC_ELT (types, i, type) ++ { ++ type->dump(f); ++ } ++ fprintf (f, "\n"); ++} ++ ++/* Dump all of the recorded types to file F. */ ++ ++void ++ipa_struct_reorg::dump_types_escaped (FILE *f) ++{ ++ unsigned i; ++ srtype *type; ++ FOR_EACH_VEC_ELT (types, i, type) ++ { ++ if (type->has_escaped ()) ++ { ++ type->simple_dump (f); ++ fprintf (f, " has escaped: \"%s\"\n", type->escape_reason()); ++ } ++ } ++ fprintf (f, "\n"); ++} ++ ++ ++/* Dump all of the record functions to file F. */ ++ ++void ++ipa_struct_reorg::dump_functions (FILE *f) ++{ ++ unsigned i; ++ srfunction *fn; ++ ++ fprintf (f, "\n\n"); ++ globals.dump (f); ++ fprintf (f, "\n\n"); ++ FOR_EACH_VEC_ELT (functions, i, fn) ++ { ++ fn->dump(f); ++ fprintf (f, "\n"); ++ } ++ fprintf (f, "\n\n"); ++} ++ ++/* Find the recorded srtype corresponding to TYPE. */ ++ ++srtype * ++ipa_struct_reorg::find_type (tree type) ++{ ++ unsigned i; ++ /* Get the main variant as we are going ++ to find that type only. */ ++ type = TYPE_MAIN_VARIANT (type); ++ ++ srtype *type1; ++ // Search for the type to see if it is already there. ++ FOR_EACH_VEC_ELT (types, i, type1) ++ { ++ if (types_compatible_p (type1->type, type)) ++ return type1; ++ } ++ return NULL; ++} ++ ++/* Is TYPE a volatile type or one which points ++ to a volatile type. */ ++ ++bool isvolatile_type (tree type) ++{ ++ if (TYPE_VOLATILE (type)) ++ return true; ++ while (POINTER_TYPE_P (type) || TREE_CODE (type) == ARRAY_TYPE) ++ { ++ type = TREE_TYPE (type); ++ if (TYPE_VOLATILE (type)) ++ return true; ++ } ++ return false; ++} ++ ++/* Is TYPE an array type or points to an array type. */ ++ ++bool isarraytype (tree type) ++{ ++ if (TREE_CODE (type) == ARRAY_TYPE) ++ return true; ++ while (POINTER_TYPE_P (type)) ++ { ++ type = TREE_TYPE (type); ++ if (TREE_CODE (type) == ARRAY_TYPE) ++ return true; ++ } ++ return false; ++} ++ ++/* Is TYPE a pointer to another pointer. */ ++ ++bool isptrptr (tree type) ++{ ++ bool firstptr = false; ++ while (POINTER_TYPE_P (type) || TREE_CODE (type) == ARRAY_TYPE) ++ { ++ if (POINTER_TYPE_P (type)) ++ { ++ if (firstptr) ++ return true; ++ firstptr = true; ++ } ++ type = TREE_TYPE (type); ++ } ++ return false; ++} ++ ++/* Return the escape type which corresponds to if ++ this is an volatile type, an array type or a pointer ++ to a pointer type. */ ++ ++escape_type escape_type_volatile_array_or_ptrptr (tree type) ++{ ++ if (isvolatile_type (type)) ++ return escape_volatile; ++ if (isarraytype (type)) ++ return escape_array; ++ if (isptrptr (type)) ++ return escape_ptr_ptr; ++ return does_not_escape; ++} ++ ++/* Record TYPE if not already recorded. */ ++ ++srtype * ++ipa_struct_reorg::record_type (tree type) ++{ ++ unsigned typeuid; ++ ++ /* Get the main variant as we are going ++ to record that type only. */ ++ type = TYPE_MAIN_VARIANT (type); ++ typeuid = TYPE_UID (type); ++ ++ srtype *type1; ++ ++ type1 = find_type (type); ++ if (type1) ++ return type1; ++ ++ /* If already done recording just return NULL. */ ++ if (done_recording) ++ return NULL; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Recording new type: %u.\n", typeuid); ++ ++ type1 = new srtype (type); ++ types.safe_push(type1); ++ ++ /* If the type has an user alignment set, ++ that means the user most likely already setup the type. 
*/ ++ if (TYPE_USER_ALIGN (type)) ++ type1->mark_escape (escape_user_alignment, NULL); ++ ++ for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ tree t = TREE_TYPE (field); ++ process_union (t); ++ if (isvolatile_type (t)) ++ type1->mark_escape (escape_volatile, NULL); ++ escape_type e = escape_type_volatile_array_or_ptrptr (t); ++ if (e != does_not_escape) ++ type1->mark_escape (e, NULL); ++ if (handled_type (t)) ++ { ++ srtype *t1 = record_type (inner_type (t)); ++ srfield *f = type1->find_field (int_byte_position (field)); ++ /* We might have an variable sized type which we don't set the handle. */ ++ if (f) ++ { ++ f->type = t1; ++ t1->add_field_site (f); ++ } ++ if (t1 == type1) ++ type1->mark_escape (escape_rescusive_type, NULL); ++ } ++ } ++ } ++ ++ return type1; ++} ++ ++/* Mark TYPE as escaping with ESCAPES as the reason. */ ++ ++void ++ipa_struct_reorg::mark_type_as_escape (tree type, escape_type escapes, gimple *stmt) ++{ ++ if (handled_type (type)) ++ { ++ srtype *stype = record_type (inner_type (type)); ++ ++ if (!stype) ++ return; ++ ++ stype->mark_escape (escapes, stmt); ++ } ++} ++ ++/* Maybe process the union of type TYPE, such that marking all of the fields' ++ types as being escaping. */ ++ ++void ++ipa_struct_reorg::process_union (tree type) ++{ ++ static hash_set unions_recorded; ++ ++ type = inner_type (type); ++ if (TREE_CODE (type) != UNION_TYPE ++ && TREE_CODE (type) != QUAL_UNION_TYPE) ++ return; ++ ++ type = TYPE_MAIN_VARIANT (type); ++ ++ /* We already processed this type. */ ++ if (unions_recorded.add (type)) ++ return; ++ ++ for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ mark_type_as_escape (TREE_TYPE (field), escape_union); ++ process_union (TREE_TYPE (field)); ++ } ++ } ++} ++ ++/* Used by record_var function as a callback to walk_tree. ++ Mark the type as escaping if it has expressions which ++ cannot be converted for global initializations. */ ++ ++static tree ++record_init_types (tree *tp, int *walk_subtrees, void *data) ++{ ++ ipa_struct_reorg *c = (ipa_struct_reorg *)data; ++ switch (TREE_CODE (*tp)) ++ { ++ CASE_CONVERT: ++ case COMPONENT_REF: ++ case VIEW_CONVERT_EXPR: ++ case ARRAY_REF: ++ { ++ tree typeouter = TREE_TYPE (*tp); ++ tree typeinner = TREE_TYPE (TREE_OPERAND (*tp, 0)); ++ c->mark_type_as_escape (typeouter, escape_via_global_init); ++ c->mark_type_as_escape (typeinner, escape_via_global_init); ++ break; ++ } ++ case INTEGER_CST: ++ if (!integer_zerop (*tp)) ++ c->mark_type_as_escape (TREE_TYPE (*tp), escape_via_global_init); ++ break; ++ case VAR_DECL: ++ case PARM_DECL: ++ case FIELD_DECL: ++ c->mark_type_as_escape (TREE_TYPE (*tp), escape_via_global_init); ++ *walk_subtrees = false; ++ break; ++ default: ++ *walk_subtrees = true; ++ break; ++ } ++ return NULL_TREE; ++} ++ ++/* Record var DECL; optionally specify the escape reason and the argument ++ number in a function. 
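(Editorial illustration; not part of the patch.) Declarations that the recording code above excludes, annotated with the reason from escapes.def that the shown checks assign, assuming each type is actually referenced in whole-program mode. This is a reading of the code, not verified dump output, and only one reason is listed per declaration even where several could apply.

struct cand { int a; long b; };            /* remains a candidate           */
struct bf   { int a : 3; int b; };         /* escape_bitfields              */
struct vol  { volatile int a; long b; };   /* escape_volatile               */
struct rec  { int a; struct rec *next; };  /* escape_rescusive_type         */
struct ali  { int a; long b; } __attribute__ ((aligned (64)));
                                           /* escape_user_alignment         */
struct um   { int a; long b; };
union  any  { struct um u; double d; };
union  any  ga;                            /* struct um: escape_union       */
struct arr  { int a; long b; };
struct arr  garray[16];                    /* escape_array (not handled)    */
struct pp   { int a; long b; };
struct pp   **gpp;                         /* escape_ptr_ptr (not handled)  */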
*/ ++ ++srdecl * ++ipa_struct_reorg::record_var (tree decl, escape_type escapes, int arg) ++{ ++ srtype *type; ++ srdecl *sd = NULL; ++ ++ process_union (TREE_TYPE (decl)); ++ ++ /* */ ++ if (handled_type (TREE_TYPE (decl))) ++ { ++ type = record_type (inner_type (TREE_TYPE (decl))); ++ escape_type e; ++ ++ if (done_recording && !type) ++ return NULL; ++ ++ gcc_assert (type); ++ if (TREE_CODE (decl) == VAR_DECL && is_global_var (decl)) ++ sd = globals.record_decl (type, decl, arg); ++ else ++ { ++ gcc_assert (current_function); ++ sd = current_function->record_decl (type, decl, arg); ++ } ++ ++ /* If the variable has the "used" attribute, then treat the type as escaping. */ ++ if (escapes != does_not_escape) ++ e = escapes; ++ else if (TREE_CODE (decl) != SSA_NAME && DECL_PRESERVE_P (decl)) ++ e = escape_marked_as_used; ++ else if (TREE_THIS_VOLATILE (decl)) ++ e = escape_volatile; ++ else if (TREE_CODE (decl) != SSA_NAME && DECL_USER_ALIGN (decl)) ++ e = escape_user_alignment; ++ else if (TREE_CODE (decl) != SSA_NAME && TREE_STATIC (decl) && TREE_PUBLIC (decl)) ++ e = escape_via_global_var; ++ /* We don't have an initlizer. */ ++ else if (TREE_CODE (decl) != SSA_NAME && DECL_INITIAL (decl) == error_mark_node) ++ e = escape_via_global_var; ++ else ++ e = escape_type_volatile_array_or_ptrptr (TREE_TYPE (decl)); ++ ++ if (e != does_not_escape) ++ type->mark_escape (e, NULL); ++ } ++ ++ /* Record the initial usage of variables as types escapes. */ ++ if (TREE_CODE (decl) != SSA_NAME && TREE_STATIC (decl) && DECL_INITIAL (decl)) ++ { ++ walk_tree_without_duplicates (&DECL_INITIAL (decl), record_init_types, this); ++ if (!integer_zerop (DECL_INITIAL (decl)) ++ && DECL_INITIAL (decl) != error_mark_node) ++ mark_type_as_escape (TREE_TYPE (decl), escape_via_global_init); ++ } ++ return sd; ++} ++ ++/* Find void* ssa_names which are used inside MEM[] or if we have &a.c, ++ mark the type as escaping. */ ++ ++void ++ipa_struct_reorg::find_var (tree expr, gimple *stmt) ++{ ++ /* If we have VCE mark the outer type as escaping and the inner one ++ Also mark the inner most operand. */ ++ if (TREE_CODE (expr) == VIEW_CONVERT_EXPR) ++ { ++ mark_type_as_escape (TREE_TYPE (expr), escape_vce, stmt); ++ mark_type_as_escape (TREE_TYPE (TREE_OPERAND (expr, 0)), ++ escape_vce, stmt); ++ } ++ ++ /* If we have &b.c then we need to mark the type of b ++ as escaping as tracking a will be hard. 
*/ ++ if (TREE_CODE (expr) == ADDR_EXPR ++ || TREE_CODE (expr) == VIEW_CONVERT_EXPR) ++ { ++ tree r = TREE_OPERAND (expr, 0); ++ if (handled_component_p (r) ++ || TREE_CODE (r) == MEM_REF) ++ { ++ while (handled_component_p (r) ++ || TREE_CODE (r) == MEM_REF) ++ { ++ if (TREE_CODE (r) == VIEW_CONVERT_EXPR) ++ { ++ mark_type_as_escape (TREE_TYPE (r), escape_vce, stmt); ++ mark_type_as_escape (TREE_TYPE (TREE_OPERAND (r, 0)), ++ escape_vce, stmt); ++ } ++ if (TREE_CODE (r) == MEM_REF) ++ mark_type_as_escape (TREE_TYPE (TREE_OPERAND (r, 1)), ++ escape_addr, stmt); ++ r = TREE_OPERAND (r, 0); ++ } ++ mark_expr_escape (r, escape_addr, stmt); ++ } ++ } ++ ++ tree base; ++ bool indirect; ++ srtype *type; ++ srfield *field; ++ bool realpart, imagpart, address; ++ get_type_field (expr, base, indirect, type, field, ++ realpart, imagpart, address, true, true); ++} ++ ++ ++void ++ipa_struct_reorg::find_vars (gimple *stmt) ++{ ++ gasm *astmt; ++ switch (gimple_code (stmt)) ++ { ++ case GIMPLE_ASSIGN: ++ if (gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS ++ || gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ find_var (gimple_assign_lhs (stmt), stmt); ++ find_var (gimple_assign_rhs1 (stmt), stmt); ++ if (TREE_CODE (lhs) == SSA_NAME ++ && VOID_POINTER_P (TREE_TYPE (lhs)) ++ && handled_type (TREE_TYPE (rhs))) ++ { ++ srtype *t = find_type (inner_type (TREE_TYPE (rhs))); ++ srdecl *d = find_decl (lhs); ++ if (!d && t) ++ current_function->record_decl (t, lhs, -1); ++ } ++ if (TREE_CODE (rhs) == SSA_NAME ++ && VOID_POINTER_P (TREE_TYPE (rhs)) ++ && handled_type (TREE_TYPE (lhs))) ++ { ++ srtype *t = find_type (inner_type (TREE_TYPE (lhs))); ++ srdecl *d = find_decl (rhs); ++ if (!d && t) ++ current_function->record_decl (t, rhs, -1); ++ } ++ } ++ break; ++ ++ case GIMPLE_CALL: ++ if (gimple_call_lhs (stmt)) ++ find_var (gimple_call_lhs (stmt), stmt); ++ ++ if (gimple_call_chain (stmt)) ++ find_var (gimple_call_chain (stmt), stmt); ++ ++ for (unsigned i = 0; i < gimple_call_num_args (stmt); i++) ++ find_var (gimple_call_arg (stmt, i), stmt); ++ break; ++ ++ case GIMPLE_ASM: ++ astmt = as_a (stmt); ++ for (unsigned i = 0; i < gimple_asm_ninputs (astmt); i++) ++ find_var (TREE_VALUE (gimple_asm_input_op (astmt, i)), stmt); ++ for (unsigned i = 0; i < gimple_asm_noutputs (astmt); i++) ++ find_var (TREE_VALUE (gimple_asm_output_op (astmt, i)), stmt); ++ mark_types_asm (astmt); ++ break; ++ ++ case GIMPLE_RETURN: ++ { ++ tree expr = gimple_return_retval (as_a(stmt)); ++ if (expr) ++ find_var (expr, stmt); ++ /* return &a; should mark the type of a as escaping through a return. */ ++ if (expr && TREE_CODE (expr) == ADDR_EXPR) ++ { ++ expr = TREE_OPERAND (expr, 0); ++ srdecl *d = find_decl (expr); ++ if (d) ++ d->type->mark_escape (escape_return, stmt); ++ } ++ } ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++/* Maybe record access of statement for further analaysis. */ ++ ++void ++ipa_struct_reorg::maybe_record_stmt (cgraph_node *node, gimple *stmt) ++{ ++ switch (gimple_code (stmt)) ++ { ++ case GIMPLE_ASSIGN: ++ maybe_record_assign (node, as_a (stmt)); ++ break; ++ case GIMPLE_CALL: ++ maybe_record_call (node, as_a (stmt)); ++ break; ++ case GIMPLE_DEBUG: ++ break; ++ case GIMPLE_GOTO: ++ case GIMPLE_SWITCH: ++ break; ++ default: ++ break; ++ } ++} ++ ++/* This function checks whether ARG is a result of multiplication ++ of some number by STRUCT_SIZE. 
If yes, the function returns true ++ and this number is filled into NUM. */ ++ ++static bool ++is_result_of_mult (tree arg, tree *num, tree struct_size) ++{ ++ if (!struct_size ++ || TREE_CODE (struct_size) != INTEGER_CST ++ || integer_zerop (struct_size)) ++ return false; ++ ++ /* If we have a integer, just check if it is a multiply of STRUCT_SIZE. */ ++ if (TREE_CODE (arg) == INTEGER_CST) ++ { ++ if (integer_zerop (size_binop (FLOOR_MOD_EXPR, arg, struct_size))) ++ { ++ *num = size_binop (FLOOR_DIV_EXPR, arg, struct_size); ++ return true; ++ } ++ return false; ++ } ++ gimple *size_def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ /* If the allocation statement was of the form ++ D.2229_10 = (D.2228_9); ++ then size_def_stmt can be D.2228_9 = num.3_8 * 8; */ ++ ++ while (size_def_stmt && is_gimple_assign (size_def_stmt)) ++ { ++ tree lhs = gimple_assign_lhs (size_def_stmt); ++ ++ /* We expect temporary here. */ ++ if (!is_gimple_reg (lhs)) ++ return false; ++ ++ // FIXME: this should handle SHIFT also. ++ if (gimple_assign_rhs_code (size_def_stmt) == PLUS_EXPR) ++ { ++ tree num1, num2; ++ tree arg0 = gimple_assign_rhs1 (size_def_stmt); ++ tree arg1 = gimple_assign_rhs2 (size_def_stmt); ++ if (!is_result_of_mult (arg0, &num1, struct_size)) ++ return false; ++ if (!is_result_of_mult (arg1, &num2, struct_size)) ++ return false; ++ *num = size_binop (PLUS_EXPR, num1, num2); ++ return true; ++ } ++ if (gimple_assign_rhs_code (size_def_stmt) == MULT_EXPR) ++ { ++ tree arg0 = gimple_assign_rhs1 (size_def_stmt); ++ tree arg1 = gimple_assign_rhs2 (size_def_stmt); ++ tree num1; ++ ++ if (is_result_of_mult (arg0, &num1, struct_size)) ++ { ++ *num = size_binop (MULT_EXPR, arg1, num1); ++ return true; ++ } ++ if (is_result_of_mult (arg1, &num1, struct_size)) ++ { ++ *num = size_binop (MULT_EXPR, arg0, num1); ++ return true; ++ } ++ ++ *num = NULL_TREE; ++ return false; ++ } ++ else if (gimple_assign_rhs_code (size_def_stmt) == SSA_NAME) ++ { ++ arg = gimple_assign_rhs1 (size_def_stmt); ++ size_def_stmt = SSA_NAME_DEF_STMT (arg); ++ } ++ else ++ { ++ *num = NULL_TREE; ++ return false; ++ } ++ } ++ ++ *num = NULL_TREE; ++ return false; ++} ++ ++/* Return TRUE if STMT is an allocation statement that is handled. */ ++ ++static bool ++handled_allocation_stmt (gimple *stmt) ++{ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_CALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALIGNED_ALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA_WITH_ALIGN)) ++ return true; ++ return false; ++} ++ ++ ++/* Returns the allocated size / T size for STMT. That is the number of ++ elements in the array allocated. 
*/ ++ ++tree ++ipa_struct_reorg::allocate_size (srtype *type, gimple *stmt) ++{ ++ if (!stmt ++ || gimple_code (stmt) != GIMPLE_CALL ++ || !handled_allocation_stmt (stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nNot a allocate statment:\n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ return NULL; ++ } ++ ++ if (type->has_escaped ()) ++ return NULL; ++ ++ tree struct_size = TYPE_SIZE_UNIT (type->type); ++ ++ tree size = gimple_call_arg (stmt, 0); ++ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALIGNED_ALLOC)) ++ size = gimple_call_arg (stmt, 1); ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ { ++ tree arg1; ++ arg1 = gimple_call_arg (stmt, 1); ++ /* Check that second argument is a constant equal to the size of structure. */ ++ if (operand_equal_p (arg1, struct_size, 0)) ++ return size; ++ /* Check that first argument is a constant equal to the size of structure. */ ++ if (operand_equal_p (size, struct_size, 0)) ++ return arg1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\ncalloc the correct size:\n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ return NULL; ++ } ++ ++ tree num; ++ if (!is_result_of_mult (size, &num, struct_size)) ++ return NULL; ++ ++ return num; ++ ++} ++ ++ ++void ++ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, gimple *stmt) ++{ ++ gcc_assert (TREE_CODE (side) == SSA_NAME || TREE_CODE (side) == ADDR_EXPR); ++ srtype *type = NULL; ++ if (handled_type (TREE_TYPE (other))) ++ type = record_type (inner_type (TREE_TYPE (other))); ++ if (TREE_CODE (side) == ADDR_EXPR) ++ side = TREE_OPERAND (side, 0); ++ srdecl *d = find_decl (side); ++ if (!type) ++ { ++ if (!d) ++ return; ++ if (TREE_CODE (side) == SSA_NAME ++ && VOID_POINTER_P (TREE_TYPE (side))) ++ return; ++ d->type->mark_escape (escape_cast_another_ptr, stmt); ++ return; ++ } ++ ++ if (!d) ++ { ++ if (VOID_POINTER_P (TREE_TYPE (side)) ++ && TREE_CODE (side) == SSA_NAME) ++ current_function->record_decl (type, side, -1); ++ else ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ } ++ else if (type != d->type) ++ { ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ d->type->mark_escape (escape_cast_another_ptr, stmt); ++ } ++} ++ ++/* Record accesses in an assignment statement STMT. */ ++ ++void ++ipa_struct_reorg::maybe_record_assign (cgraph_node *node, gassign *stmt) ++{ ++ ++ /* */ ++ ++ if (gimple_clobber_p (stmt)) ++ { ++ record_stmt_expr (gimple_assign_lhs (stmt), node, stmt); ++ return; ++ } ++ ++ if (gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ tree num; ++ if (!handled_type (TREE_TYPE (lhs))) ++ return; ++ /* Check if rhs2 is a multiplication of the size of the type. */ ++ if (is_result_of_mult (rhs2, &num, TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (lhs))))) ++ { ++ record_stmt_expr (lhs, node, stmt); ++ record_stmt_expr (rhs1, node, stmt); ++ } ++ else ++ { ++ mark_expr_escape (lhs, escape_non_multiply_size, stmt); ++ mark_expr_escape (rhs1, escape_non_multiply_size, stmt); ++ } ++ return; ++ } ++ /* Copies, References, Taking addresses. 
*/ ++ if (gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ /* If we have a = &b.c then we need to mark the type of b ++ as escaping as tracking a will be hard. */ ++ if (TREE_CODE (rhs) == ADDR_EXPR) ++ { ++ tree r = TREE_OPERAND (rhs, 0); ++ if (handled_component_p (r)) ++ { ++ while (handled_component_p (r)) ++ r = TREE_OPERAND (r, 0); ++ mark_expr_escape (r, escape_addr, stmt); ++ return; ++ } ++ } ++ if ((TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == ADDR_EXPR)) ++ maybe_mark_or_record_other_side (rhs, lhs, stmt); ++ if (TREE_CODE (lhs) == SSA_NAME) ++ maybe_mark_or_record_other_side (lhs, rhs, stmt); ++ } ++} ++ ++tree ++get_ref_base_and_offset (tree &e, HOST_WIDE_INT &offset, bool &realpart, bool &imagpart, tree &accesstype) ++{ ++ offset = 0; ++ realpart = false; ++ imagpart = false; ++ accesstype = NULL_TREE; ++ if (TREE_CODE (e) == REALPART_EXPR) ++ { ++ e = TREE_OPERAND (e, 0); ++ realpart = true; ++ } ++ if (TREE_CODE (e) == IMAGPART_EXPR) ++ { ++ e = TREE_OPERAND (e, 0); ++ imagpart = true; ++ } ++ tree expr = e; ++ while (true) ++ { ++ switch (TREE_CODE (expr)) ++ { ++ case COMPONENT_REF: ++ { ++ tree field = TREE_OPERAND (expr, 1); ++ tree field_off = byte_position (field); ++ if (TREE_CODE (field_off) != INTEGER_CST) ++ return NULL; ++ offset += tree_to_shwi (field_off); ++ expr = TREE_OPERAND (expr, 0); ++ accesstype = NULL; ++ break; ++ } ++ case MEM_REF: ++ { ++ tree field_off = TREE_OPERAND (expr, 1); ++ gcc_assert (TREE_CODE (field_off) == INTEGER_CST); ++ /* So we can mark the types as escaping if different. */ ++ accesstype = TREE_TYPE (field_off); ++ offset += tree_to_uhwi (field_off); ++ return TREE_OPERAND (expr, 0); ++ } ++ default: ++ return expr; ++ } ++ } ++} ++ ++/* Return true if EXPR was accessing the whole type T. */ ++ ++bool ++ipa_struct_reorg::wholeaccess (tree expr, tree base, tree accesstype, srtype *t) ++{ ++ if (expr == base) ++ return true; ++ ++ if (TREE_CODE (expr) == ADDR_EXPR && TREE_OPERAND (expr, 0) == base) ++ return true; ++ ++ if (!accesstype) ++ return false; ++ ++ if (!types_compatible_p (TREE_TYPE (expr), TREE_TYPE (accesstype))) ++ return false; ++ ++ if (!handled_type (TREE_TYPE (expr))) ++ return false; ++ ++ srtype *other_type = find_type (inner_type (TREE_TYPE (expr))); ++ ++ if (t == other_type) ++ return true; ++ ++ return false; ++} ++ ++bool ++ipa_struct_reorg::get_type_field (tree expr, tree &base, bool &indirect, srtype *&type, srfield *&field, bool &realpart, bool &imagpart, bool &address, bool should_create, bool can_escape) ++{ ++ HOST_WIDE_INT offset; ++ tree accesstype; ++ address = false; ++ bool mark_as_bit_field = false; ++ ++ if (TREE_CODE (expr) == BIT_FIELD_REF) ++ { ++ expr = TREE_OPERAND (expr, 0); ++ mark_as_bit_field = true; ++ } ++ ++ base = get_ref_base_and_offset (expr, offset, realpart, imagpart, accesstype); ++ ++ /* Variable access, unkown type. 
*/ ++ if (base == NULL) ++ return false; ++ ++ if (TREE_CODE (base) == ADDR_EXPR) ++ { ++ address = true; ++ base = TREE_OPERAND (base, 0); ++ } ++ ++ if (offset != 0 && accesstype) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Non zero offset (%d) with MEM.\n", (int)offset); ++ print_generic_expr (dump_file, expr); ++ fprintf (dump_file, "\n"); ++ print_generic_expr (dump_file, base); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ ++ srdecl *d = find_decl (base); ++ srtype *t; ++ ++ if (integer_zerop (base)) ++ { ++ gcc_assert (!d); ++ if (!accesstype) ++ return false; ++ t = find_type (inner_type (inner_type (accesstype))); ++ if (!t && should_create && handled_type (accesstype)) ++ t = record_type (inner_type (accesstype)); ++ if (!t) ++ return false; ++ } ++ else if (!d && accesstype) ++ { ++ if (!should_create) ++ return false; ++ if (!handled_type (accesstype)) ++ return false; ++ t = find_type (inner_type (inner_type (accesstype))); ++ if (!t) ++ t = record_type (inner_type (accesstype)); ++ if (!t || t->has_escaped ()) ++ return false; ++ /* If base is not void* mark the type as escaping. */ ++ if (!VOID_POINTER_P (TREE_TYPE (base))) ++ { ++ gcc_assert (can_escape); ++ t->mark_escape (escape_cast_another_ptr, NULL); ++ return false; ++ } ++ if (TREE_CODE (base) == SSA_NAME) ++ current_function->record_decl (t, base, -1); ++ } ++ else if (!d) ++ return false; ++ else ++ t = d->type; ++ ++ if (t->has_escaped ()) ++ return false; ++ ++ if (mark_as_bit_field) ++ { ++ gcc_assert (can_escape); ++ t->mark_escape (escape_bitfields, NULL); ++ return false; ++ } ++ ++ if (wholeaccess (expr, base, accesstype, t)) ++ { ++ field = NULL; ++ type = t; ++ indirect = accesstype != NULL; ++ return true; ++ } ++ ++ srfield *f = t->find_field (offset); ++ if (!f) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nunkown field\n"); ++ print_generic_expr (dump_file, expr); ++ fprintf (dump_file, "\n"); ++ print_generic_expr (dump_file, base); ++ fprintf (dump_file, "\n"); ++ } ++ gcc_assert (can_escape); ++ t->mark_escape (escape_unkown_field, NULL); ++ return false; ++ } ++ if (!types_compatible_p (f->fieldtype, TREE_TYPE (expr))) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nfieldtype = "); ++ print_generic_expr (dump_file, f->fieldtype); ++ fprintf (dump_file, "\naccess type = "); ++ print_generic_expr (dump_file, TREE_TYPE (expr)); ++ fprintf (dump_file, "original expr = "); ++ print_generic_expr (dump_file, expr); ++ fprintf (dump_file, "\n"); ++ } ++ gcc_assert (can_escape); ++ t->mark_escape (escape_unkown_field, NULL); ++ return false; ++ } ++ field = f; ++ type = t; ++ indirect = accesstype != NULL; ++ return true; ++} ++ ++/* Mark the type used in EXPR as escaping. */ ++ ++void ++ipa_struct_reorg::mark_expr_escape (tree expr, escape_type escapes, gimple *stmt) ++{ ++ tree base; ++ bool indirect; ++ srtype *type; ++ srfield *field; ++ bool realpart, imagpart, address; ++ if (!get_type_field (expr, base, indirect, type, field, realpart, imagpart, address)) ++ return; ++ ++ type->mark_escape (escapes, stmt); ++} ++ ++/* Record accesses in a call statement STMT. 
*/ ++ ++void ++ipa_struct_reorg::maybe_record_call (cgraph_node *node, gcall *stmt) ++{ ++ tree argtype; ++ tree fndecl; ++ escape_type escapes = does_not_escape; ++ bool free_or_realloc = gimple_call_builtin_p (stmt, BUILT_IN_FREE) ++ || gimple_call_builtin_p (stmt, BUILT_IN_REALLOC); ++ ++ /* We check allocation sites in a different location. */ ++ if (handled_allocation_stmt (stmt)) ++ return; ++ ++ ++ /* A few cases here: ++ 1) assigned from the lhs ++ 2) Used in argument ++ If a function being called is global (or indirect) ++ then we reject the types as being escaping. */ ++ ++ if (tree chain = gimple_call_chain (stmt)) ++ record_stmt_expr (chain, node, stmt); ++ ++ /* Assigned from LHS. */ ++ if (tree lhs = gimple_call_lhs (stmt)) ++ { ++ /* FIXME: handle return types.. */ ++ mark_type_as_escape (TREE_TYPE (lhs), escape_return); ++ } ++ ++ /* If we have an internal call, just record the stmt. */ ++ if (gimple_call_internal_p (stmt)) ++ { ++ for (unsigned i = 0; i < gimple_call_num_args (stmt); i++) ++ record_stmt_expr (gimple_call_arg (stmt, i), node, stmt); ++ return; ++ } ++ ++ fndecl = gimple_call_fndecl (stmt); ++ ++ /* If we have an indrect call, just mark the types as escape. */ ++ if (!fndecl) ++ escapes = escape_pointer_function; ++ /* Non local functions cause escape except for calls to free ++ and realloc. ++ FIXME: should support function annotations too. */ ++ else if (!free_or_realloc ++ && !cgraph_node::local_info (fndecl)->local) ++ escapes = escape_external_function; ++ else if (!free_or_realloc ++ && !cgraph_node::local_info (fndecl)->can_change_signature) ++ escapes = escape_cannot_change_signature; ++ /* FIXME: we should be able to handle functions in other partitions. */ ++ else if (symtab_node::get(fndecl)->in_other_partition) ++ escapes = escape_external_function; ++ ++ if (escapes != does_not_escape) ++ { ++ for (unsigned i = 0; i < gimple_call_num_args (stmt); i++) ++ mark_type_as_escape (TREE_TYPE (gimple_call_arg (stmt, i)), ++ escapes); ++ return; ++ } ++ ++ argtype = TYPE_ARG_TYPES (gimple_call_fntype (stmt)); ++ for (unsigned i = 0; i < gimple_call_num_args (stmt); i++) ++ { ++ tree arg = gimple_call_arg (stmt, i); ++ if (argtype) ++ { ++ tree argtypet = TREE_VALUE (argtype); ++ if (!free_or_realloc ++ && VOID_POINTER_P (argtypet)) ++ mark_type_as_escape (TREE_TYPE (arg), escape_cast_void); ++ else ++ record_stmt_expr (arg, node, stmt); ++ } ++ else ++ mark_type_as_escape (TREE_TYPE (arg), escape_var_arg_function); ++ ++ argtype = argtype ? TREE_CHAIN (argtype) : NULL_TREE; ++ } ++ ++} ++ ++ ++void ++ipa_struct_reorg::record_stmt_expr (tree expr, cgraph_node *node, gimple *stmt) ++{ ++ tree base; ++ bool indirect; ++ srtype *type; ++ srfield *field; ++ bool realpart, imagpart, address; ++ if (!get_type_field (expr, base, indirect, type, field, realpart, imagpart, address)) ++ return; ++ ++ if (!opt_for_fn (current_function_decl, flag_ipa_struct_reorg)) ++ type->mark_escape (escape_non_optimize, stmt); ++ ++ /* Record it. */ ++ type->add_access (new sraccess (stmt, node, type, field)); ++} ++ ++/* Find function corresponding to NODE. 
*/ ++ ++srfunction * ++ipa_struct_reorg::find_function (cgraph_node *node) ++{ ++ for (unsigned i = 0; i < functions.length (); i++) ++ if (functions[i]->node == node) ++ return functions[i]; ++ return NULL; ++} ++ ++void ++ipa_struct_reorg::check_type_and_push (tree newdecl, srtype *type, vec &worklist, gimple *stmt) ++{ ++ if (integer_zerop (newdecl)) ++ return; ++ ++ if (TREE_CODE (newdecl) == ADDR_EXPR) ++ { ++ srdecl *d = find_decl (TREE_OPERAND (newdecl, 0)); ++ if (!d) ++ { ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ return; ++ } ++ if (d->type == type) ++ return; ++ ++ srtype *type1 = d->type; ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ type1->mark_escape (escape_cast_another_ptr, stmt); ++ return; ++ } ++ ++ srdecl *d = find_decl (newdecl); ++ if (!d) ++ { ++ if (TREE_CODE (newdecl) == INTEGER_CST) ++ { ++ type->mark_escape (escape_int_const, stmt); ++ return; ++ } ++ /* If we have a non void* or a decl (which is hard to track), ++ then mark the type as escaping. */ ++ if (!VOID_POINTER_P (TREE_TYPE (newdecl)) ++ || DECL_P (newdecl)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nunkown decl: "); ++ print_generic_expr (dump_file, newdecl); ++ fprintf (dump_file, " in type:\n"); ++ print_generic_expr (dump_file, TREE_TYPE (newdecl)); ++ fprintf (dump_file, "\n"); ++ } ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ return; ++ } ++ /* At this point there should only be unkown void* ssa names. */ ++ gcc_assert (TREE_CODE (newdecl) == SSA_NAME); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nrecording unkown decl: "); ++ print_generic_expr (dump_file, newdecl); ++ fprintf (dump_file, " as type:\n"); ++ type->simple_dump (dump_file); ++ fprintf (dump_file, "\n"); ++ } ++ d = current_function->record_decl (type, newdecl, -1); ++ worklist.safe_push (d); ++ return; ++ } ++ ++ /* Only add to the worklist if the decl is a SSA_NAME. */ ++ if (TREE_CODE (newdecl) == SSA_NAME) ++ worklist.safe_push (d); ++ if (d->type == type) ++ return; ++ ++ srtype *type1 = d->type; ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ type1->mark_escape (escape_cast_another_ptr, stmt); ++ ++} ++ ++/* ++ 2) Check SSA_NAMEs for non type usages (source or use) (worlist of srdecl) ++ a) if the SSA_NAME is sourced from a pointer plus, record the pointer and ++ check to make sure the addition was a multiple of the size. ++ check the pointer type too. 
++ b) If the name is sourced from an allocation check the allocation ++ i) Add SSA_NAME (void*) to the worklist if allocated from realloc ++ c) if the name is from a param, make sure the param type was of the original type ++ d) if the name is from a cast/assignment, make sure it is used as that type or void* ++ i) If void* then push the ssa_name into worklist ++*/ ++void ++ipa_struct_reorg::check_definition (srdecl *decl, vec &worklist) ++{ ++ tree ssa_name = decl->decl; ++ srtype *type = decl->type; ++ ++ /* c) if the name is from a param, make sure the param type was ++ of the original type */ ++ if (SSA_NAME_IS_DEFAULT_DEF (ssa_name)) ++ { ++ tree var = SSA_NAME_VAR (ssa_name); ++ if (var ++ && TREE_CODE (var) == PARM_DECL ++ && VOID_POINTER_P (TREE_TYPE (ssa_name))) ++ type->mark_escape (escape_cast_void, NULL); ++ return; ++ } ++ gimple *stmt = SSA_NAME_DEF_STMT (ssa_name); ++ ++ /* ++ b) If the name is sourced from an allocation check the allocation ++ i) Add SSA_NAME (void*) to the worklist if allocated from realloc ++ */ ++ if (gimple_code (stmt) == GIMPLE_CALL) ++ { ++ /* For realloc, check the type of the argument. */ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) ++ check_type_and_push (gimple_call_arg (stmt, 0), type, worklist, stmt); ++ ++ if (!handled_allocation_stmt (stmt) ++ || !allocate_size (type, stmt)) ++ type->mark_escape (escape_return, stmt); ++ return; ++ } ++ /* If the SSA_NAME is sourced from an inline-asm, just mark the type as escaping. */ ++ if (gimple_code (stmt) == GIMPLE_ASM) ++ { ++ type->mark_escape (escape_inline_asm, stmt); ++ return; ++ } ++ ++ /* If the SSA_NAME is sourced from a PHI check add each name to the worklist and ++ check to make sure they are used correctly. */ ++ if (gimple_code (stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (stmt); i++) ++ check_type_and_push (gimple_phi_arg_def (stmt, i), type, worklist, stmt); ++ return; ++ } ++ ++ gcc_assert (gimple_code (stmt) == GIMPLE_ASSIGN); ++ /* ++ a) if the SSA_NAME is sourced from a pointer plus, record the pointer and ++ check to make sure the addition was a multiple of the size. ++ check the pointer type too. ++ */ ++ ++ tree rhs = gimple_assign_rhs1 (stmt); ++ if (gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ { ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ tree num; ++ if (!is_result_of_mult (rhs2, &num, TYPE_SIZE_UNIT (type->type))) ++ type->mark_escape (escape_non_multiply_size, stmt); ++ ++ if (TREE_CODE (rhs) == SSA_NAME) ++ check_type_and_push (rhs, type, worklist, stmt); ++ return; ++ } ++ ++ /* Casts between pointers and integer are escaping. */ ++ if (gimple_assign_cast_p (stmt)) ++ { ++ type->mark_escape (escape_cast_int, stmt); ++ return; ++ } ++ ++ /* ++ d) if the name is from a cast/assignment, make sure it is used as that type or void* ++ i) If void* then push the ssa_name into worklist ++ */ ++ gcc_assert (gimple_assign_single_p (stmt)); ++ check_other_side (decl, rhs, stmt, worklist); ++} ++ ++/* Mark the types used by the inline-asm as escaping. It is unkown what happens inside ++ an inline-asm. */ ++ ++void ++ipa_struct_reorg::mark_types_asm (gasm *astmt) ++{ ++ for (unsigned i = 0; i < gimple_asm_ninputs (astmt); i++) ++ { ++ tree v = TREE_VALUE (gimple_asm_input_op (astmt, i)); ++ /* If we have &b, just strip the & here. 
*/ ++ if (TREE_CODE (v) == ADDR_EXPR) ++ v = TREE_OPERAND (v, 0); ++ mark_expr_escape (v, escape_inline_asm, astmt); ++ } ++ for (unsigned i = 0; i < gimple_asm_noutputs (astmt); i++) ++ { ++ tree v = TREE_VALUE (gimple_asm_output_op (astmt, i)); ++ /* If we have &b, just strip the & here. */ ++ if (TREE_CODE (v) == ADDR_EXPR) ++ v = TREE_OPERAND (v, 0); ++ mark_expr_escape (v, escape_inline_asm, astmt); ++ } ++} ++ ++void ++ipa_struct_reorg::check_other_side (srdecl *decl, tree other, gimple *stmt, vec &worklist) ++{ ++ srtype *type = decl->type; ++ ++ if (TREE_CODE (other) == SSA_NAME ++ || DECL_P (other) ++ || TREE_CODE (other) == INTEGER_CST) ++ { ++ check_type_and_push (other, type, worklist, stmt); ++ return; ++ } ++ ++ tree t = TREE_TYPE (other); ++ if (!handled_type (t)) ++ { ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ return; ++ } ++ ++ srtype *t1 = find_type (inner_type (t)); ++ if (t1 == type) ++ { ++ tree base; ++ bool indirect; ++ srtype *type1; ++ srfield *field; ++ bool realpart, imagpart, address; ++ if (!get_type_field (other, base, indirect, type1, field, realpart, imagpart, address)) ++ type->mark_escape (escape_cast_another_ptr, stmt); ++ ++ return; ++ } ++ ++ if (t1) ++ t1->mark_escape (escape_cast_another_ptr, stmt); ++ ++ type->mark_escape (escape_cast_another_ptr, stmt); ++} ++ ++ ++void ++ipa_struct_reorg::check_use (srdecl *decl, gimple *stmt, vec &worklist) ++{ ++ srtype *type = decl->type; ++ ++ if (gimple_code (stmt) == GIMPLE_RETURN) ++ { ++ type->mark_escape (escape_return, stmt); ++ return; ++ } ++ /* If the SSA_NAME PHI check and add the src to the worklist and ++ check to make sure they are used correctly. */ ++ if (gimple_code (stmt) == GIMPLE_PHI) ++ { ++ check_type_and_push (gimple_phi_result (stmt), type, worklist, stmt); ++ return; ++ } ++ ++ if (gimple_code (stmt) == GIMPLE_ASM) ++ { ++ mark_types_asm (as_a (stmt)); ++ return; ++ } ++ ++ if (gimple_code (stmt) == GIMPLE_COND) ++ { ++ tree rhs1 = gimple_cond_lhs (stmt); ++ tree rhs2 = gimple_cond_rhs (stmt); ++ tree orhs = rhs1; ++ if (gimple_cond_code (stmt) != EQ_EXPR ++ && gimple_cond_code (stmt) != NE_EXPR) ++ { ++ mark_expr_escape (rhs1, escape_non_eq, stmt); ++ mark_expr_escape (rhs2, escape_non_eq, stmt); ++ } ++ if (rhs1 == decl->decl) ++ orhs = rhs2; ++ if (integer_zerop (orhs)) ++ return; ++ if (TREE_CODE (orhs) != SSA_NAME) ++ mark_expr_escape (rhs1, escape_non_eq, stmt); ++ check_type_and_push (orhs, type, worklist, stmt); ++ return; ++ } ++ ++ ++ /* Casts between pointers and integer are escaping. 
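A rough source-level sketch of the definition and use checks described above; the function and names are hypothetical, and the comments paraphrase the escape reasons used by the pass.

    #include <stdint.h>

    struct node { long key; long val; };

    long
    walk (struct node *base, long n)
    {
      struct node *p = base + n;      /* POINTER_PLUS by n * sizeof (struct node); an offset that
                                         is not a whole multiple of the size would be flagged
                                         escape_non_multiply_size                                 */
      uintptr_t bits = (uintptr_t) p; /* pointer/integer cast: escape_cast_int                    */
      long r = 0;
      if (p != base)                  /* EQ_EXPR / NE_EXPR comparisons are accepted               */
        r = base->val;
      if (p > base)                   /* any other comparison marks escape_non_eq                 */
        r += (long) (bits & 1);
      return r;
    }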
*/ ++ if (gimple_assign_cast_p (stmt)) ++ { ++ type->mark_escape (escape_cast_int, stmt); ++ return; ++ } ++ ++ /* We might have a_1 = ptr_2 == ptr_3; */ ++ if (is_gimple_assign (stmt) ++ && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison) ++ { ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ tree orhs = rhs1; ++ if (gimple_assign_rhs_code (stmt) != EQ_EXPR ++ && gimple_assign_rhs_code (stmt) != NE_EXPR) ++ { ++ mark_expr_escape (rhs1, escape_non_eq, stmt); ++ mark_expr_escape (rhs2, escape_non_eq, stmt); ++ } ++ if (rhs1 == decl->decl) ++ orhs = rhs2; ++ if (integer_zerop (orhs)) ++ return; ++ if (TREE_CODE (orhs) != SSA_NAME) ++ mark_expr_escape (rhs1, escape_non_eq, stmt); ++ check_type_and_push (orhs, type, worklist, stmt); ++ return; ++ } ++ ++ if (gimple_assign_single_p (stmt)) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ /* Check if we have a_1 = b_2; that a_1 is in the correct type. */ ++ if (decl->decl == rhs) ++ { ++ check_other_side (decl, lhs, stmt, worklist); ++ return; ++ } ++ } ++ ++ if (is_gimple_assign (stmt) ++ && gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ { ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ tree lhs = gimple_assign_lhs (stmt); ++ tree num; ++ check_other_side (decl, lhs, stmt, worklist); ++ if (!is_result_of_mult (rhs2, &num, TYPE_SIZE_UNIT (type->type))) ++ type->mark_escape (escape_non_multiply_size, stmt); ++ } ++ ++} ++ ++/* ++ 2) Check SSA_NAMEs for non type usages (source or use) (worlist of srdecl) ++ d) if the name is used in a cast/assignment, make sure it is used as that type or void* ++ i) If void* then push the ssa_name into worklist ++ e) if used in conditional check the other side ++ i) If the conditional is non NE/EQ then mark the type as non rejecting ++ f) Check if the use in a Pointer PLUS EXPR Is used by mulitplication of its size ++ */ ++void ++ipa_struct_reorg::check_uses (srdecl *decl, vec &worklist) ++{ ++ tree ssa_name = decl->decl; ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, ssa_name) ++ { ++ gimple *stmt = USE_STMT (use_p); ++ ++ if (is_gimple_debug (stmt)) ++ continue; ++ ++ check_use (decl, stmt, worklist); ++ } ++} ++ ++/* Record function corresponding to NODE. */ ++ ++srfunction * ++ipa_struct_reorg::record_function (cgraph_node *node) ++{ ++ function *fn; ++ tree parm, var; ++ unsigned int i; ++ srfunction *sfn; ++ escape_type escapes = does_not_escape; ++ ++ sfn = new srfunction (node); ++ functions.safe_push (sfn); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nRecording accesses and types from function: %s/%u\n", ++ node->name (), node->order); ++ ++ /* Nodes without a body are not interesting. Especially do not ++ visit clones at this point for now - we get duplicate decls ++ there for inline clones at least. 
*/ ++ if (!node->has_gimple_body_p () || node->inlined_to) ++ return sfn; ++ ++ node->get_body (); ++ fn = DECL_STRUCT_FUNCTION (node->decl); ++ ++ if (!fn) ++ return sfn; ++ ++ current_function = sfn; ++ ++ if (DECL_PRESERVE_P (node->decl)) ++ escapes = escape_marked_as_used; ++ else if (!node->local.local) ++ escapes = escape_visible_function; ++ else if (!node->local.can_change_signature) ++ escapes = escape_cannot_change_signature; ++ else if (!tree_versionable_function_p (node->decl)) ++ escapes = escape_noclonable_function; ++ else if (!opt_for_fn (node->decl, flag_ipa_struct_reorg)) ++ escapes = escape_non_optimize; ++ ++ basic_block bb; ++ gimple_stmt_iterator si; ++ ++ /* Record the static chain decl. */ ++ if (fn->static_chain_decl) ++ { ++ srdecl *sd = record_var (fn->static_chain_decl, ++ escapes, ++ -2); ++ if (sd) ++ { ++ /* Specify that this type is used by the static ++ chain so it cannot be split. */ ++ sd->type->chain_type = true; ++ sfn->add_arg (sd); ++ sd->type->add_function (sfn); ++ } ++ } ++ ++ /* Record the arguments. */ ++ for (parm = DECL_ARGUMENTS (node->decl), i = 0; ++ parm; ++ parm = DECL_CHAIN (parm), i++) ++ { ++ srdecl *sd = record_var (parm, escapes, i); ++ if (sd) ++ { ++ sfn->add_arg (sd); ++ sd->type->add_function (sfn); ++ } ++ } ++ ++ /* Mark the return type as escaping */ ++ { ++ tree return_type = TREE_TYPE (TREE_TYPE (node->decl)); ++ mark_type_as_escape (return_type, escape_return, NULL); ++ } ++ ++ /* If the cfg does not exist for the function, don't process the function. */ ++ if (!fn->cfg) ++ { ++ current_function = NULL; ++ return sfn; ++ } ++ ++ /* The following order is done for recording stage: ++ 0) Record all variables/SSA_NAMES that are of struct type ++ 1) Record MEM_REF/COMPONENT_REFs ++ a) Record SSA_NAMEs (void*) and record that as the accessed type. ++ */ ++ ++ push_cfun (fn); ++ ++ FOR_EACH_LOCAL_DECL (cfun, i, var) ++ { ++ if (TREE_CODE (var) != VAR_DECL) ++ continue; ++ ++ record_var (var); ++ } ++ ++ for (i = 1; i < num_ssa_names; ++i) ++ { ++ tree name = ssa_name (i); ++ if (!name ++ || has_zero_uses (name) ++ || virtual_operand_p (name)) ++ continue; ++ ++ record_var (name); ++ } ++ ++ /* Find the variables which are used via MEM_REF and are void* types. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) ++ { ++ gimple *stmt = gsi_stmt (si); ++ find_vars (stmt); ++ } ++ } ++ ++ auto_vec worklist; ++ for (unsigned i = 0; i < current_function->decls.length (); i++) ++ { ++ srdecl *decl = current_function->decls[i]; ++ if (TREE_CODE (decl->decl) == SSA_NAME) ++ { ++ decl->visited = false; ++ worklist.safe_push (decl); ++ } ++ } ++ ++ /* ++ 2) Check SSA_NAMEs for non type usages (source or use) (worlist of srdecl) ++ a) if the SSA_NAME is sourced from a pointer plus, record the pointer and ++ check to make sure the addition was a multiple of the size. ++ check the pointer type too. 
++ b) If the name is sourced from an allocation check the allocation ++ i) Add SSA_NAME (void*) to the worklist if allocated from realloc ++ c) if the name is from a param, make sure the param type was of the original type ++ d) if the name is used in a cast/assignment, make sure it is used as that type or void* ++ i) If void* then push the ssa_name into worklist ++ e) if used in conditional check the other side ++ i) If the conditional is non NE/EQ then mark the type as non rejecting ++ f) Check if the use in a POinter PLUS EXPR Is used by mulitplication of its size ++ */ ++ ++ while (!worklist.is_empty ()) ++ { ++ srdecl *decl = worklist.pop (); ++ if (decl->visited) ++ continue; ++ decl->visited = true; ++ check_definition (decl, worklist); ++ check_uses (decl, worklist); ++ } ++ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) ++ { ++ gimple *stmt = gsi_stmt (si); ++ maybe_record_stmt (node, stmt); ++ } ++ } ++ ++ pop_cfun (); ++ current_function = NULL; ++ return sfn; ++} ++ ++ ++/* Record all accesses for all types including global variables. */ ++ ++void ++ipa_struct_reorg::record_accesses (void) ++{ ++ varpool_node *var; ++ cgraph_node *cnode; ++ ++ /* Record global (non-auto) variables first. */ ++ FOR_EACH_VARIABLE (var) ++ { ++ if (!var->real_symbol_p ()) ++ continue; ++ ++ /* Record all variables including the accesses inside a variable. */ ++ escape_type escapes = does_not_escape; ++ if (var->externally_visible || !var->definition) ++ escapes = escape_via_global_var; ++ if (var->in_other_partition) ++ escapes = escape_via_global_var; ++ if (!var->externally_visible && var->definition) ++ var->get_constructor (); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Recording global variable: "); ++ print_generic_expr (dump_file, var->decl); ++ fprintf (dump_file, "\n"); ++ } ++ record_var (var->decl, escapes); ++ } ++ ++ FOR_EACH_FUNCTION (cnode) ++ { ++ if (!cnode->real_symbol_p ()) ++ continue; ++ ++ /* Record accesses inside a function. */ ++ if(cnode->definition) ++ record_function (cnode); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "all types (before pruning):\n"); ++ dump_types (dump_file); ++ fprintf (dump_file, "all functions (before pruning):\n"); ++ dump_functions (dump_file); ++ } ++ done_recording = true; ++} ++ ++/* A helper function to detect cycles (recusive) types. ++ Return TRUE if TYPE was a rescusive type. */ ++ ++bool ++ipa_struct_reorg::walk_field_for_cycles (srtype *type) ++{ ++ unsigned i; ++ srfield *field; ++ ++ type->visited = true; ++ if (type->escaped_rescusive ()) ++ return true; ++ ++ if (type->has_escaped ()) ++ return false; ++ ++ FOR_EACH_VEC_ELT (type->fields, i, field) ++ { ++ if (!field->type) ++ ; ++ else if (field->type->visited ++ || walk_field_for_cycles (field->type)) ++ { ++ type->mark_escape (escape_rescusive_type, NULL); ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Clear visited on all types. */ ++ ++void ++ipa_struct_reorg::clear_visited (void) ++{ ++ for (unsigned i = 0; i < types.length (); i++) ++ types[i]->visited = false; ++} ++ ++/* Detect recusive types and mark them as escaping. */ ++ ++void ++ipa_struct_reorg::detect_cycles (void) ++{ ++ for (unsigned i = 0; i < types.length (); i++) ++ { ++ if (types[i]->has_escaped ()) ++ continue; ++ ++ clear_visited (); ++ walk_field_for_cycles (types[i]); ++ } ++} ++ ++/* Propagate escaping to depdenent types. 
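As an illustration of the cycle rule, compare a hypothetical self-referential type with a flat one:

    /* Self-referential: walk_field_for_cycles finds the cycle through 'next'
       and marks the type with escape_rescusive_type.  */
    struct list_node
    {
      int value;
      struct list_node *next;
    };

    /* No cycle through its fields, so it stays a candidate unless something
       else makes it escape.  */
    struct flat_record
    {
      int id;
      double weight;
    };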
*/ ++ ++void ++ipa_struct_reorg::propagate_escape (void) ++{ ++ ++ unsigned i; ++ srtype *type; ++ bool changed = false; ++ ++ do ++ { ++ changed = false; ++ FOR_EACH_VEC_ELT (types, i, type) ++ { ++ for (tree field = TYPE_FIELDS (type->type); ++ field; ++ field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL ++ && handled_type (TREE_TYPE (field))) ++ { ++ tree t = inner_type (TREE_TYPE (field)); ++ srtype *type1 = find_type (t); ++ if (!type1) ++ continue; ++ if (type1->has_escaped () ++ && !type->has_escaped ()) ++ { ++ type->mark_escape (escape_dependent_type_escapes, NULL); ++ changed = true; ++ } ++ if (type->has_escaped () ++ && !type1->has_escaped ()) ++ { ++ type1->mark_escape (escape_dependent_type_escapes, NULL); ++ changed = true; ++ } ++ } ++ } ++ } ++ } while (changed); ++} ++ ++/* Prune the escaped types and their decls from what was recorded. */ ++ ++void ++ipa_struct_reorg::prune_escaped_types (void) ++{ ++ detect_cycles (); ++ propagate_escape (); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "all types (after prop but before pruning):\n"); ++ dump_types (dump_file); ++ fprintf (dump_file, "all functions (after prop but before pruning):\n"); ++ dump_functions (dump_file); ++ } ++ ++ if (dump_file) ++ dump_types_escaped (dump_file); ++ ++ ++ /* Prune the function arguments which escape ++ and functions which have no types as arguments. */ ++ for (unsigned i = 0; i < functions.length (); ) ++ { ++ srfunction *function = functions[i]; ++ ++ /* Prune function arguments of types that escape. */ ++ for (unsigned j = 0; j < function->args.length ();) ++ { ++ if (function->args[j]->type->has_escaped ()) ++ function->args.ordered_remove (j); ++ else ++ j++; ++ } ++ ++ /* Prune global variables that the function uses of types that escape. */ ++ for (unsigned j = 0; j < function->globals.length ();) ++ { ++ if (function->globals[j]->type->has_escaped ()) ++ function->globals.ordered_remove (j); ++ else ++ j++; ++ } ++ ++ /* Prune variables that the function uses of types that escape. */ ++ for (unsigned j = 0; j < function->decls.length ();) ++ { ++ srdecl *decl = function->decls[j]; ++ if (decl->type->has_escaped ()) ++ { ++ function->decls.ordered_remove (j); ++ delete decl; ++ } ++ else ++ j++; ++ } ++ ++ /* Prune functions which don't refer to any variables any more. */ ++ if (function->args.is_empty () ++ && function->decls.is_empty () ++ && function->globals.is_empty ()) ++ { ++ delete function; ++ functions.ordered_remove (i); ++ } ++ else ++ i++; ++ } ++ ++ /* Prune globals of types that escape, all references to those decls ++ will have been removed in the first loop. */ ++ for (unsigned j = 0; j < globals.decls.length ();) ++ { ++ srdecl *decl = globals.decls[j]; ++ if (decl->type->has_escaped ()) ++ { ++ globals.decls.ordered_remove (j); ++ delete decl; ++ } ++ else ++ j++; ++ } ++ ++ /* Prune types that escape, all references to those types ++ will have been removed in the above loops. */ ++ for (unsigned i = 0; i < types.length (); ) ++ { ++ srtype *type = types[i]; ++ if (type->has_escaped ()) ++ { ++ /* All references to this type should have been removed now. */ ++ delete type; ++ types.ordered_remove (i); ++ } ++ else ++ i++; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "all types (after pruning):\n"); ++ dump_types (dump_file); ++ fprintf (dump_file, "all functions (after pruning):\n"); ++ dump_functions (dump_file); ++ } ++} ++ ++/* Analyze all of the types. 
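A minimal sketch of the dependent-type propagation described above, with hypothetical types:

    struct inner { int a; };

    /* If 'struct inner' has escaped for any reason, the wrapper below is marked
       with escape_dependent_type_escapes because one of its fields refers to it;
       the same propagation also runs in the other direction.  */
    struct outer
    {
      struct inner *child;
      long tag;
    };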
*/ ++ ++void ++ipa_struct_reorg::analyze_types (void) ++{ ++ for (unsigned i = 0; i < types.length (); i++) ++ { ++ if (!types[i]->has_escaped ()) ++ types[i]->analyze(); ++ } ++} ++ ++/* Create all new types we want to create. */ ++ ++bool ++ipa_struct_reorg::create_new_types (void) ++{ ++ int newtypes = 0; ++ clear_visited (); ++ for (unsigned i = 0; i < types.length (); i++) ++ newtypes += types[i]->create_new_type (); ++ ++ if (dump_file) ++ { ++ if (newtypes) ++ fprintf (dump_file, "\nNumber of structures to transform is %d\n", newtypes); ++ else ++ fprintf (dump_file, "\nNo structures to transform.\n"); ++ } ++ ++ return newtypes != 0; ++} ++ ++/* Create all the new decls except for the new arguments ++ which create_new_functions would have created. */ ++ ++void ++ipa_struct_reorg::create_new_decls (void) ++{ ++ globals.create_new_decls (); ++ for (unsigned i = 0; i < functions.length (); i++) ++ functions[i]->create_new_decls (); ++} ++ ++/* Create the new arguments for the function corresponding to NODE. */ ++ ++void ++ipa_struct_reorg::create_new_args (cgraph_node *new_node) ++{ ++ tree decl = new_node->decl; ++ vec params = ipa_get_vector_of_formal_parms (decl); ++ ipa_parm_adjustment_vec adjs; ++ adjs.create (params.length ()); ++ for (unsigned i = 0; i < params.length (); i++) ++ { ++ struct ipa_parm_adjustment adj; ++ tree parm = params[i]; ++ memset (&adj, 0, sizeof (adj)); ++ adj.base_index = i; ++ adj.base = parm; ++ srtype *t = find_type (inner_type (TREE_TYPE (parm))); ++ if (!t ++ || t->has_escaped () ++ || !t->has_new_type ()) ++ { ++ adj.op = IPA_PARM_OP_COPY; ++ adjs.safe_push (adj); ++ continue; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Creating a new argument for: "); ++ print_generic_expr (dump_file, params[i]); ++ fprintf (dump_file, " in function: "); ++ print_generic_expr (dump_file, decl); ++ fprintf (dump_file, "\n"); ++ } ++ adj.arg_prefix = "struct_reorg"; ++ adj.op = IPA_PARM_OP_NONE; ++ for (unsigned j = 0; j < max_split && t->newtype[j]; j++) ++ { ++ adj.type = reconstruct_complex_type (TREE_TYPE (parm), t->newtype[j]); ++ adjs.safe_push (adj); ++ } ++ } ++ ipa_modify_formal_parameters (decl, adjs); ++ params.release (); ++ for (unsigned i = 0; i < adjs.length (); i++) ++ { ++ if (adjs[i].op != IPA_PARM_OP_NONE) ++ continue; ++ tree decl = adjs[i].base; ++ srdecl *d = find_decl (decl); ++ if (!d) ++ continue; ++ unsigned j = 0; ++ while (j < max_split && d->newdecl[j]) ++ j++; ++ d->newdecl[j] = adjs[i].new_decl; ++ } ++ adjs.release (); ++ ++ function *fn = DECL_STRUCT_FUNCTION (decl); ++ ++ if (!fn->static_chain_decl) ++ return; ++ srdecl *chain = find_decl (fn->static_chain_decl); ++ if (!chain) ++ return; ++ ++ srtype *type = chain->type; ++ tree orig_var = chain->decl; ++ const char *tname = NULL; ++ if (DECL_NAME (orig_var)) ++ tname = IDENTIFIER_POINTER (DECL_NAME (orig_var)); ++ gcc_assert (!type->newtype[1]); ++ tree new_name = NULL; ++ char *name = NULL; ++ if (tname) ++ { ++ name = concat (tname, ".reorg.0", NULL); ++ new_name = get_identifier (name); ++ free (name); ++ } ++ tree newtype1 = reconstruct_complex_type (TREE_TYPE (orig_var), type->newtype[0]); ++ chain->newdecl[0] = build_decl (DECL_SOURCE_LOCATION (orig_var), ++ PARM_DECL, new_name, newtype1); ++ copy_var_attributes (chain->newdecl[0], orig_var); ++ fn->static_chain_decl = chain->newdecl[0]; ++ ++} ++ ++/* Find the refered DECL in the current function or globals. 
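A hypothetical source-level picture of what the argument rewriting amounts to, assuming the fields of the original type are placed in two different clusters (max_split is 2); the split type and function names are illustrative only.

    /* Original type and function signature.  */
    struct pair { int hot; double cold; };
    extern long use_pair (struct pair *p, long i);

    /* Conceptual clone after splitting: each piece of the old pointer parameter
       becomes its own parameter, as built by create_new_args.  */
    struct pair_reorg_0 { int hot; };
    struct pair_reorg_1 { double cold; };
    extern long use_pair_struct_reorg (struct pair_reorg_0 *p0,
                                       struct pair_reorg_1 *p1, long i);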
++ If this is a global decl, record that as being used ++ in the current function. */ ++ ++srdecl * ++ipa_struct_reorg::find_decl (tree decl) ++{ ++ srdecl *d; ++ d = globals.find_decl (decl); ++ if (d) ++ { ++ /* Record the global usage in the current function. */ ++ if (!done_recording && current_function) ++ { ++ bool add = true; ++ /* No reason to add it to the current function if it is ++ already recorded as such. */ ++ for (unsigned i = 0; i < current_function->globals.length (); i++) ++ { ++ if (current_function->globals[i] == d) ++ { ++ add = false; ++ break; ++ } ++ } ++ if (add) ++ current_function->globals.safe_push (d); ++ } ++ return d; ++ } ++ if (current_function) ++ return current_function->find_decl (decl); ++ return NULL; ++} ++ ++/* Create new function clones for the cases where the arguments ++ need to be changed. */ ++ ++void ++ipa_struct_reorg::create_new_functions (void) ++{ ++ for (unsigned i = 0; i < functions.length (); i++) ++ { ++ srfunction *f = functions[i]; ++ bool anyargchanges = false; ++ cgraph_node *new_node; ++ cgraph_node *node = f->node; ++ int newargs = 0; ++ if (f->old) ++ continue; ++ ++ if (f->args.length () == 0) ++ continue; ++ ++ for (unsigned j = 0; j < f->args.length (); j++) ++ { ++ srdecl *d = f->args[j]; ++ srtype *t = d->type; ++ if (t->has_new_type ()) ++ { ++ newargs += t->newtype[1] != NULL; ++ anyargchanges = true; ++ } ++ } ++ if (!anyargchanges) ++ continue; ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "Creating a clone of function: "); ++ f->simple_dump (dump_file); ++ fprintf (dump_file, "\n"); ++ } ++ statistics_counter_event (NULL, "Create new function", 1); ++ new_node = node->create_version_clone_with_body (vNULL, NULL, ++ NULL, false, NULL, NULL, ++ "struct_reorg"); ++ new_node->make_local (); ++ f->newnode = new_node; ++ srfunction *n = record_function (new_node); ++ current_function = n; ++ n->old = f; ++ f->newf = n; ++ /* Create New arguments. */ ++ create_new_args (new_node); ++ current_function = NULL; ++ } ++} ++ ++bool ++ipa_struct_reorg::rewrite_lhs_rhs (tree lhs, tree rhs, tree newlhs[max_split], tree newrhs[max_split]) ++{ ++ bool l = rewrite_expr (lhs, newlhs); ++ bool r = rewrite_expr (rhs, newrhs); ++ ++ /* Handle NULL pointer specially. */ ++ if (l && !r && integer_zerop (rhs)) ++ { ++ r = true; ++ for (unsigned i = 0; i < max_split && newlhs[i]; i++) ++ newrhs[i] = fold_convert (TREE_TYPE (newlhs[i]), rhs); ++ } ++ ++ return l || r; ++} ++ ++bool ++ipa_struct_reorg::rewrite_expr (tree expr, tree newexpr[max_split], bool ignore_missing_decl) ++{ ++ tree base; ++ bool indirect; ++ srtype *t; ++ srfield *f; ++ bool realpart, imagpart; ++ bool address; ++ ++ tree newbase[max_split]; ++ memset (newexpr, 0, sizeof(tree[max_split])); ++ ++ if (TREE_CODE (expr) == CONSTRUCTOR) ++ { ++ srtype *t = find_type (TREE_TYPE (expr)); ++ if (!t) ++ return false; ++ gcc_assert (CONSTRUCTOR_NELTS (expr) == 0); ++ if (!t->has_new_type ()) ++ return false; ++ for (unsigned i = 0; i < max_split && t->newtype[i]; i++) ++ newexpr[i] = build_constructor (t->newtype[i], NULL); ++ return true; ++ } ++ ++ if (!get_type_field (expr, base, indirect, t, f, realpart, imagpart, address)) ++ return false; ++ ++ /* If the type is not changed, then just return false. */ ++ if (!t->has_new_type ()) ++ return false; ++ ++ /* NULL pointer handling is "special". 
*/ ++ if (integer_zerop (base)) ++ { ++ gcc_assert (indirect && !address); ++ for (unsigned i = 0; i < max_split && t->newtype[i]; i++) ++ { ++ tree newtype1 = reconstruct_complex_type (TREE_TYPE (base), t->newtype[i]); ++ newbase[i] = fold_convert (newtype1, base); ++ } ++ } ++ else ++ { ++ srdecl *d = find_decl (base); ++ ++ if (!d && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Can't find decl:\n"); ++ print_generic_expr (dump_file, base); ++ fprintf (dump_file, "\ntype:\n"); ++ t->dump (dump_file); ++ } ++ if (!d && ignore_missing_decl) ++ return true; ++ gcc_assert (d); ++ memcpy (newbase, d->newdecl, sizeof(d->newdecl)); ++ } ++ ++ if (f == NULL) ++ { ++ memcpy (newexpr, newbase, sizeof(newbase)); ++ for (unsigned i = 0; i < max_split && newexpr[i]; i++) ++ { ++ if (address) ++ newexpr[i] = build_fold_addr_expr (newexpr[i]); ++ if (indirect) ++ newexpr[i] = build_simple_mem_ref (newexpr[i]); ++ if (imagpart) ++ newexpr[i] = build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (newexpr[i])), newexpr[i]); ++ if (realpart) ++ newexpr[i] = build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (newexpr[i])), newexpr[i]); ++ } ++ return true; ++ } ++ ++ tree newdecl = newbase[f->clusternum]; ++ for (unsigned i = 0; i < max_split && f->newfield[i]; i++) ++ { ++ tree newbase1 = newdecl; ++ if (address) ++ newbase1 = build_fold_addr_expr (newbase1); ++ if (indirect) ++ newbase1 = build_simple_mem_ref (newbase1); ++ newexpr[i] = build3 (COMPONENT_REF, TREE_TYPE (f->newfield[i]), ++ newbase1, f->newfield[i], NULL_TREE); ++ if (imagpart) ++ newexpr[i] = build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (newexpr[i])), newexpr[i]); ++ if (realpart) ++ newexpr[i] = build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (newexpr[i])), newexpr[i]); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "cluster: %d. decl = ", (int)f->clusternum); ++ print_generic_expr (dump_file, newbase1); ++ fprintf (dump_file, "\nnewexpr = "); ++ print_generic_expr (dump_file, newexpr[i]); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ return true; ++} ++ ++bool ++ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) ++{ ++ bool remove = false; ++ if (gimple_clobber_p (stmt)) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree newlhs[max_split]; ++ if (!rewrite_expr (lhs, newlhs)) ++ return false; ++ for (unsigned i = 0; i < max_split && newlhs[i]; i++) ++ { ++ tree clobber = build_constructor (TREE_TYPE (newlhs[i]), NULL); ++ TREE_THIS_VOLATILE (clobber) = true; ++ gimple *newstmt = gimple_build_assign (newlhs[i], clobber); ++ gsi_insert_before (gsi, newstmt, GSI_SAME_STMT); ++ remove = true; ++ } ++ return remove; ++ } ++ ++ if (gimple_assign_rhs_code (stmt) == EQ_EXPR ++ || gimple_assign_rhs_code (stmt) == NE_EXPR) ++ { ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ tree newrhs1[max_split]; ++ tree newrhs2[max_split]; ++ tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ tree_code code = rhs_code == EQ_EXPR ? 
BIT_AND_EXPR : BIT_IOR_EXPR; ++ if (!rewrite_lhs_rhs (rhs1, rhs2, newrhs1, newrhs2)) ++ return false; ++ tree newexpr = NULL_TREE; ++ for (unsigned i = 0; i < max_split && newrhs1[i]; i++) ++ { ++ tree expr = gimplify_build2 (gsi, rhs_code, boolean_type_node, newrhs1[i], newrhs2[i]); ++ if (!newexpr) ++ newexpr = expr; ++ else ++ newexpr = gimplify_build2 (gsi, code, boolean_type_node, newexpr, expr); ++ } ++ ++ if (newexpr) ++ { ++ newexpr = fold_convert (TREE_TYPE (gimple_assign_lhs (stmt)), newexpr); ++ gimple_assign_set_rhs_from_tree (gsi, newexpr); ++ update_stmt (stmt); ++ } ++ return false; ++ } ++ ++ if (gimple_assign_rhs_code (stmt) == POINTER_PLUS_EXPR) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs1 = gimple_assign_rhs1 (stmt); ++ tree rhs2 = gimple_assign_rhs2 (stmt); ++ tree newlhs[max_split]; ++ tree newrhs[max_split]; ++ ++ if (!rewrite_lhs_rhs (lhs, rhs1, newlhs, newrhs)) ++ return false; ++ tree size = TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (lhs))); ++ tree num; ++ /* Check if rhs2 is a multiplication of the size of the type. */ ++ if (!is_result_of_mult (rhs2, &num, size)) ++ internal_error ("the rhs of pointer was not a multiplicate and it slipped through."); ++ ++ num = gimplify_build1 (gsi, NOP_EXPR, sizetype, num); ++ for (unsigned i = 0; i < max_split && newlhs[i]; i++) ++ { ++ gimple *new_stmt; ++ ++ tree newsize = TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (newlhs[i]))); ++ newsize = gimplify_build2 (gsi, MULT_EXPR, sizetype, num, newsize); ++ new_stmt = gimple_build_assign (newlhs[i], POINTER_PLUS_EXPR, newrhs[i], newsize); ++ gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); ++ remove = true; ++ } ++ return remove; ++ } ++ if (gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS) ++ { ++ tree lhs = gimple_assign_lhs (stmt); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "rewriting stamtenet:\n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ tree newlhs[max_split]; ++ tree newrhs[max_split]; ++ if (!rewrite_lhs_rhs (lhs, rhs, newlhs, newrhs)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nDid nothing to statement.\n"); ++ return false; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nreplaced with:\n"); ++ for (unsigned i = 0; i < max_split && (newlhs[i] || newrhs[i]); i++) ++ { ++ gimple *newstmt = gimple_build_assign (newlhs[i] ? newlhs[i] : lhs, newrhs[i] ? newrhs[i] : rhs); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_gimple_stmt (dump_file, newstmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ gsi_insert_before (gsi, newstmt, GSI_SAME_STMT); ++ remove = true; ++ } ++ return remove; ++ } ++ ++ return remove; ++} ++ ++/* Rewrite function call statement STMT. Return TRUE if the statement ++ is to be removed. */ ++ ++bool ++ipa_struct_reorg::rewrite_call (gcall *stmt, gimple_stmt_iterator *gsi) ++{ ++ /* Handled allocation calls are handled seperately from normal ++ function calls. */ ++ if (handled_allocation_stmt (stmt)) ++ { ++ tree lhs = gimple_call_lhs (stmt); ++ tree newrhs1[max_split]; ++ srdecl *decl = find_decl (lhs); ++ if (!decl || !decl->type) ++ return false; ++ srtype *type = decl->type; ++ tree num = allocate_size (type, stmt); ++ gcc_assert (num); ++ memset (newrhs1, 0, sizeof(newrhs1)); ++ ++ /* The realloc call needs to have its first argument rewritten. 
*/ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) ++ { ++ tree rhs1 = gimple_call_arg (stmt, 0); ++ if (integer_zerop (rhs1)) ++ { ++ for (unsigned i = 0; i < max_split; i++) ++ newrhs1[i] = rhs1; ++ } ++ else if (!rewrite_expr (rhs1, newrhs1)) ++ internal_error ("rewrite failed for realloc"); ++ } ++ ++ /* Go through each new lhs. */ ++ for (unsigned i = 0; i < max_split && decl->newdecl[i]; i++) ++ { ++ tree newsize = TYPE_SIZE_UNIT (type->type); ++ gimple *g; ++ /* Every allocation except for calloc needs the size multiplied out. */ ++ if (!gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ newsize = gimplify_build2 (gsi, MULT_EXPR, sizetype, num, newsize); ++ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_MALLOC) ++ || gimple_call_builtin_p (stmt, BUILT_IN_ALLOCA)) ++ g = gimple_build_call (gimple_call_fndecl (stmt), ++ 1, newsize); ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_CALLOC)) ++ g = gimple_build_call (gimple_call_fndecl (stmt), ++ 2, num, newsize); ++ else if (gimple_call_builtin_p (stmt, BUILT_IN_REALLOC)) ++ g = gimple_build_call (gimple_call_fndecl (stmt), ++ 2, newrhs1[i], newsize); ++ else ++ gcc_assert (false); ++ gimple_call_set_lhs (g, decl->newdecl[i]); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ } ++ return true; ++ } ++ ++ /* The function call free needs to be handled special. */ ++ if (gimple_call_builtin_p (stmt, BUILT_IN_FREE)) ++ { ++ tree expr = gimple_call_arg (stmt, 0); ++ tree newexpr[max_split]; ++ if (!rewrite_expr (expr, newexpr)) ++ return false; ++ ++ if (newexpr[1] == NULL) ++ { ++ gimple_call_set_arg (stmt, 0, newexpr[0]); ++ update_stmt (stmt); ++ return false; ++ } ++ ++ for (unsigned i = 0; i < max_split && newexpr[i]; i++) ++ { ++ gimple *g = gimple_build_call (gimple_call_fndecl (stmt), ++ 1, newexpr[i]); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ } ++ return true; ++ } ++ ++ /* Otherwise, look up the function to see if we have cloned it ++ and rewrite the arguments. */ ++ tree fndecl = gimple_call_fndecl (stmt); ++ ++ /* Indirect calls are already marked as escaping so ignore. */ ++ if (!fndecl) ++ return false; ++ ++ cgraph_node *node = cgraph_node::get (fndecl); ++ gcc_assert (node); ++ srfunction *f = find_function (node); ++ ++ /* Did not find the function or had not cloned it return saying don't ++ change the function call. */ ++ if (!f || !f->newf) ++ return false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Changing arguments for function call :\n"); ++ print_gimple_expr (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Move over to the new function. */ ++ f = f->newf; ++ ++ tree chain = gimple_call_chain (stmt); ++ unsigned nargs = gimple_call_num_args (stmt); ++ auto_vec vargs (nargs); ++ ++ if (chain) ++ { ++ tree newchains[max_split]; ++ if (rewrite_expr (chain, newchains)) ++ { ++ /* Chain decl's type cannot be split and but it can change. */ ++ gcc_assert (newchains[1] == NULL); ++ chain = newchains[0]; ++ } ++ } ++ ++ for (unsigned i = 0; i < nargs; i++) ++ vargs.quick_push (gimple_call_arg (stmt, i)); ++ ++ int extraargs = 0; ++ ++ for (unsigned i = 0; i < f->args.length (); i++) ++ { ++ srdecl *d = f->args[i]; ++ if (d->argumentnum == -2) ++ continue; ++ gcc_assert (d->argumentnum != -1); ++ tree arg = vargs[d->argumentnum + extraargs]; ++ tree newargs[max_split]; ++ if (!rewrite_expr (arg, newargs)) ++ continue; ++ ++ /* If this ARG has a replacement handle the replacement. 
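In source terms, the allocation and free handling above corresponds roughly to the following sketch; the split types and variable names are hypothetical.

    #include <stdlib.h>

    struct pair_reorg_0 { int hot; };
    struct pair_reorg_1 { double cold; };

    void
    alloc_demo (size_t n)
    {
      /* Before the rewrite:
           struct pair *p = malloc (n * sizeof (struct pair));
           ...
           free (p);
         After: one allocation and one free per replacement type, each sized by
         the new type (for calloc the element count stays a separate argument).  */
      struct pair_reorg_0 *p0 = malloc (n * sizeof (struct pair_reorg_0));
      struct pair_reorg_1 *p1 = malloc (n * sizeof (struct pair_reorg_1));
      /* ... accesses p[i].hot / p[i].cold become p0[i].hot / p1[i].cold ... */
      free (p0);
      free (p1);
    }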
*/ ++ for (unsigned j = 0; j < max_split && d->newdecl[j]; j++) ++ { ++ gcc_assert (newargs[j]); ++ /* If this is the first replacement of the arugment, ++ then just replace it. */ ++ if (j == 0) ++ vargs[d->argumentnum + extraargs] = newargs[j]; ++ else ++ { ++ /* More than one replacement, we need to insert into the array. */ ++ extraargs++; ++ vargs.safe_insert(d->argumentnum + extraargs, newargs[j]); ++ } ++ } ++ } ++ ++ gcall *new_stmt; ++ ++ new_stmt = gimple_build_call_vec (f->node->decl, vargs); ++ ++ if (gimple_call_lhs (stmt)) ++ gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); ++ ++ gimple_set_vuse (new_stmt, gimple_vuse (stmt)); ++ gimple_set_vdef (new_stmt, gimple_vdef (stmt)); ++ ++ if (gimple_has_location (stmt)) ++ gimple_set_location (new_stmt, gimple_location (stmt)); ++ gimple_call_copy_flags (new_stmt, stmt); ++ gimple_call_set_chain (new_stmt, chain); ++ ++ gimple_set_modified (new_stmt, true); ++ ++ if (gimple_vdef (new_stmt) ++ && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME) ++ SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; ++ ++ gsi_replace (gsi, new_stmt, false); ++ ++ /* We need to defer cleaning EH info on the new statement to ++ fixup-cfg. We may not have dominator information at this point ++ and thus would end up with unreachable blocks and have no way ++ to communicate that we need to run CFG cleanup then. */ ++ int lp_nr = lookup_stmt_eh_lp (stmt); ++ if (lp_nr != 0) ++ { ++ remove_stmt_from_eh_lp (stmt); ++ add_stmt_to_eh_lp (new_stmt, lp_nr); ++ } ++ ++ ++ return false; ++} ++ ++/* Rewrite the conditional statement STMT. Return TRUE if the ++ old statement is to be removed. */ ++ ++bool ++ipa_struct_reorg::rewrite_cond (gcond *stmt, gimple_stmt_iterator *gsi) ++{ ++ tree_code rhs_code = gimple_cond_code (stmt); ++ ++ /* Handle only equals or not equals conditionals. */ ++ if (rhs_code != EQ_EXPR ++ && rhs_code != NE_EXPR) ++ return false; ++ tree rhs1 = gimple_cond_lhs (stmt); ++ tree rhs2 = gimple_cond_rhs (stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "COND: Rewriting\n"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ print_generic_expr (dump_file, rhs1); ++ fprintf (dump_file, "\n"); ++ print_generic_expr (dump_file, rhs2); ++ fprintf (dump_file, "\n"); ++ } ++ ++ tree newrhs1[max_split]; ++ tree newrhs2[max_split]; ++ tree_code code = rhs_code == EQ_EXPR ? BIT_AND_EXPR : BIT_IOR_EXPR; ++ if (!rewrite_lhs_rhs (rhs1, rhs2, newrhs1, newrhs2)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nDid nothing to statement.\n"); ++ return false; ++ } ++ ++ tree newexpr = NULL_TREE; ++ for (unsigned i = 0; i < max_split && newrhs1[i]; i++) ++ { ++ tree expr = gimplify_build2 (gsi, rhs_code, boolean_type_node, newrhs1[i], newrhs2[i]); ++ if (!newexpr) ++ newexpr = expr; ++ else ++ newexpr = gimplify_build2 (gsi, code, boolean_type_node, newexpr, expr); ++ } ++ ++ if (newexpr) ++ { ++ gimple_cond_set_lhs (stmt, newexpr); ++ gimple_cond_set_rhs (stmt, boolean_true_node); ++ update_stmt (stmt); ++ } ++ return false; ++} ++ ++/* Rewrite debug statments if possible. Return TRUE if the statement ++ should be removed. 
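A hypothetical source-level view of the comparison rewriting above:

    struct pair_reorg_0 { int hot; };
    struct pair_reorg_1 { double cold; };

    /* Before:  if (p == q) ...  on pointers to the original type.
       After: the per-part comparisons are combined with BIT_AND_EXPR; a '!='
       test would be combined with BIT_IOR_EXPR instead.  */
    int
    same_object (struct pair_reorg_0 *p0, struct pair_reorg_1 *p1,
                 struct pair_reorg_0 *q0, struct pair_reorg_1 *q1)
    {
      return (p0 == q0) & (p1 == q1);
    }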
*/ ++ ++bool ++ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *) ++{ ++ bool remove = false; ++ if (gimple_debug_bind_p (stmt)) ++ { ++ tree var = gimple_debug_bind_get_var (stmt); ++ tree newvar[max_split]; ++ if (rewrite_expr (var, newvar, true)) ++ remove = true; ++ if (gimple_debug_bind_has_value_p (stmt)) ++ { ++ var = gimple_debug_bind_get_value (stmt); ++ if (TREE_CODE (var) == POINTER_PLUS_EXPR) ++ var = TREE_OPERAND (var, 0); ++ if (rewrite_expr (var, newvar, true)) ++ remove = true; ++ } ++ } ++ else if (gimple_debug_source_bind_p (stmt)) ++ { ++ tree var = gimple_debug_source_bind_get_var (stmt); ++ tree newvar[max_split]; ++ if (rewrite_expr (var, newvar, true)) ++ remove = true; ++ var = gimple_debug_source_bind_get_value (stmt); ++ if (TREE_CODE (var) == POINTER_PLUS_EXPR) ++ var = TREE_OPERAND (var, 0); ++ if (rewrite_expr (var, newvar, true)) ++ remove = true; ++ } ++ ++ return remove; ++} ++ ++/* Rewrite PHI nodes, return true if the PHI was replaced. */ ++ ++bool ++ipa_struct_reorg::rewrite_phi (gphi *phi) ++{ ++ tree newlhs[max_split]; ++ gphi *newphi[max_split]; ++ tree result = gimple_phi_result (phi); ++ gphi_iterator gsi; ++ ++ memset(newphi, 0, sizeof(newphi)); ++ ++ if (!rewrite_expr (result, newlhs)) ++ return false; ++ ++ if (newlhs[0] == NULL) ++ return false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nrewriting PHI:"); ++ print_gimple_stmt (dump_file, phi, 0); ++ } ++ ++ for (unsigned i = 0; i < max_split && newlhs[i]; i++) ++ newphi[i] = create_phi_node (newlhs[i], gimple_bb (phi)); ++ ++ for(unsigned i = 0; i < gimple_phi_num_args (phi); i++) ++ { ++ tree newrhs[max_split]; ++ phi_arg_d rhs = *gimple_phi_arg (phi, i); ++ rewrite_expr (rhs.def, newrhs); ++ for (unsigned j = 0; j < max_split && newlhs[j]; j++) ++ { ++ SET_PHI_ARG_DEF (newphi[j], i, newrhs[j]); ++ gimple_phi_arg_set_location (newphi[j], i, rhs.locus); ++ update_stmt (newphi[j]); ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\ninto\n:"); ++ for (unsigned i = 0; i < max_split && newlhs[i]; i++) ++ { ++ print_gimple_stmt (dump_file, newphi[i], 0); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ ++ gsi = gsi_for_phi (phi); ++ remove_phi_node (&gsi, false); ++ ++ return true; ++} ++ ++/* Rewrite gimple statement STMT, return true if the STATEMENT ++ is to be removed. */ ++ ++bool ++ipa_struct_reorg::rewrite_stmt (gimple *stmt, gimple_stmt_iterator *gsi) ++{ ++ switch (gimple_code (stmt)) ++ { ++ case GIMPLE_ASSIGN: ++ return rewrite_assign (as_a (stmt), gsi); ++ case GIMPLE_CALL: ++ return rewrite_call (as_a (stmt), gsi); ++ case GIMPLE_COND: ++ return rewrite_cond (as_a (stmt), gsi); ++ break; ++ case GIMPLE_GOTO: ++ case GIMPLE_SWITCH: ++ break; ++ case GIMPLE_DEBUG: ++ case GIMPLE_ASM: ++ break; ++ default: ++ break; ++ } ++ return false; ++} ++ ++/* Does the function F uses any decl which has changed. */ ++ ++bool ++ipa_struct_reorg::has_rewritten_type (srfunction *f) ++{ ++ for (unsigned i = 0; i < f->decls.length (); i++) ++ { ++ srdecl *d = f->decls[i]; ++ if (d->newdecl[0] != d->decl) ++ return true; ++ } ++ ++ for (unsigned i = 0; i < f->globals.length (); i++) ++ { ++ srdecl *d = f->globals[i]; ++ if (d->newdecl[0] != d->decl) ++ return true; ++ } ++ return false; ++ ++} ++ ++/* Rewrite the functions if needed, return ++ the TODOs requested. 
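Roughly, at the source level, the PHI rewriting amounts to the following hypothetical sketch:

    struct pair_reorg_0 { int hot; };
    struct pair_reorg_1 { double cold; };

    /* A PHI merging two pointers of the original type, p = c ? a : b, is
       recreated once per replacement pointer with its arguments rewritten
       piecewise.  */
    void
    select_demo (int c,
                 struct pair_reorg_0 *a0, struct pair_reorg_1 *a1,
                 struct pair_reorg_0 *b0, struct pair_reorg_1 *b1,
                 struct pair_reorg_0 **out0, struct pair_reorg_1 **out1)
    {
      *out0 = c ? a0 : b0;
      *out1 = c ? a1 : b1;
    }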
*/ ++ ++unsigned ++ipa_struct_reorg::rewrite_functions (void) ++{ ++ unsigned retval = 0; ++ ++ ++ /* Create new types, if we did not create any new types, ++ then don't rewrite any accesses. */ ++ if (!create_new_types ()) ++ return 0; ++ ++ if (functions.length ()) ++ { ++ retval = TODO_remove_functions; ++ create_new_functions (); ++ } ++ ++ create_new_decls (); ++ ++ for (unsigned i = 0; i < functions.length (); i++) ++ { ++ srfunction *f = functions[i]; ++ if (f->newnode) ++ continue; ++ ++ /* Function uses no rewriten types so don't cause a rewrite. */ ++ if (!has_rewritten_type (f)) ++ continue; ++ ++ cgraph_node *node = f->node; ++ basic_block bb; ++ ++ push_cfun (DECL_STRUCT_FUNCTION (node->decl)); ++ current_function = f; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nBefore rewrite:\n"); ++ dump_function_to_file (current_function_decl, dump_file, dump_flags | TDF_VOPS); ++ } ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); ) ++ { ++ if (rewrite_phi (si.phi ())) ++ si = gsi_start_phis (bb); ++ else ++ gsi_next (&si); ++ } ++ ++ for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); ) ++ { ++ gimple *stmt = gsi_stmt (si); ++ if (rewrite_stmt (stmt, &si)) ++ gsi_remove (&si, true); ++ else ++ gsi_next (&si); ++ } ++ } ++ ++ /* Debug statements need to happen after all other statements ++ have changed. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); ) ++ { ++ gimple *stmt = gsi_stmt (si); ++ if (gimple_code (stmt) == GIMPLE_DEBUG ++ && rewrite_debug (stmt, &si)) ++ gsi_remove (&si, true); ++ else ++ gsi_next (&si); ++ } ++ } ++ ++ /* Release the old SSA_NAMES for old arguments. */ ++ if (f->old) ++ { ++ for (unsigned i = 0; i < f->args.length (); i++) ++ { ++ srdecl *d = f->args[i]; ++ if (d->newdecl[0] != d->decl) ++ { ++ tree ssa_name = ssa_default_def (cfun, d->decl); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Found "); ++ print_generic_expr (dump_file, ssa_name); ++ fprintf (dump_file, " to be released.\n"); ++ } ++ release_ssa_name (ssa_name); ++ } ++ } ++ } ++ ++ update_ssa (TODO_update_ssa_only_virtuals); ++ ++ if (flag_tree_pta) ++ compute_may_aliases (); ++ ++ remove_unused_locals (); ++ ++ cgraph_edge::rebuild_edges (); ++ ++ free_dominance_info (CDI_DOMINATORS); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nAfter rewrite:\n"); ++ dump_function_to_file (current_function_decl, dump_file, dump_flags | TDF_VOPS); ++ } ++ ++ pop_cfun (); ++ current_function = NULL; ++ } ++ ++ return retval | TODO_verify_all; ++} ++ ++unsigned int ++ipa_struct_reorg::execute (void) ++{ ++ /* FIXME: If there is a top-level inline-asm, the pass immediately returns. 
*/ ++ if (symtab->first_asm_symbol ()) ++ return 0; ++ record_accesses (); ++ prune_escaped_types (); ++ analyze_types (); ++ ++ return rewrite_functions (); ++} ++ ++const pass_data pass_data_ipa_struct_reorg = ++{ ++ SIMPLE_IPA_PASS, /* type */ ++ "struct_reorg", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_IPA_STRUCT_REORG, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ 0, /* todo_flags_finish */ ++}; ++ ++class pass_ipa_struct_reorg : public simple_ipa_opt_pass ++{ ++public: ++ pass_ipa_struct_reorg (gcc::context *ctxt) ++ : simple_ipa_opt_pass (pass_data_ipa_struct_reorg, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *); ++ virtual unsigned int execute (function *) { return ipa_struct_reorg ().execute(); } ++ ++}; // class pass_ipa_struct_reorg ++ ++bool ++pass_ipa_struct_reorg::gate (function *) ++{ ++ return (optimize ++ && flag_ipa_struct_reorg ++ /* Don't bother doing anything if the program has errors. */ ++ && !seen_error ()); ++} ++ ++} // anon namespace ++ ++simple_ipa_opt_pass * ++make_pass_ipa_struct_reorg (gcc::context *ctxt) ++{ ++ return new pass_ipa_struct_reorg (ctxt); ++} +diff -Nurp a/gcc/ipa-struct-reorg/ipa-struct-reorg.h b/gcc/ipa-struct-reorg/ipa-struct-reorg.h +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.h 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.h 2020-06-16 22:56:07.732000000 -0400 +@@ -0,0 +1,235 @@ ++/* Struct-reorg optimizations. ++ Copyright (C) 2016-2017 Free Software Foundation, Inc. ++ Contributed by Andrew Pinski ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. 
*/ ++ ++#ifndef IPA_STRUCT_REORG_H ++#define IPA_STRUCT_REORG_H ++ ++namespace struct_reorg { ++ ++const int max_split = 2; ++ ++template ++struct auto_vec_del : auto_vec ++{ ++ ~auto_vec_del(); ++}; ++ ++template ++auto_vec_del::~auto_vec_del(void) ++{ ++ unsigned i; ++ T *t; ++ FOR_EACH_VEC_ELT (*this, i, t) ++ { ++ delete t; ++ } ++} ++ ++enum escape_type ++{ ++ does_not_escape, ++#define DEF_ESCAPE(ENUM, TEXT) ENUM, ++#include "escapes.def" ++ escape_max_escape ++}; ++ ++const char *escape_type_string[escape_max_escape - 1] = ++{ ++#define DEF_ESCAPE(ENUM, TEXT) TEXT, ++#include "escapes.def" ++}; ++ ++struct srfield; ++struct srtype; ++struct sraccess; ++struct srdecl; ++struct srfunction; ++ ++struct srfunction ++{ ++ cgraph_node *node; ++ auto_vec args; ++ auto_vec globals; ++ auto_vec_del decls; ++ srdecl *record_decl (srtype *, tree, int arg); ++ ++ srfunction *old; ++ cgraph_node *newnode; ++ srfunction *newf; ++ ++ // Constructors ++ srfunction (cgraph_node *n); ++ ++ // Methods ++ void add_arg (srdecl *arg); ++ void dump (FILE *file); ++ void simple_dump (FILE *file); ++ ++ bool check_args (void); ++ void create_new_decls (void); ++ srdecl *find_decl (tree); ++}; ++ ++struct srglobal : private srfunction ++{ ++ srglobal () ++ : srfunction (NULL) ++ { ++ } ++ ++ using srfunction::dump; ++ using srfunction::create_new_decls; ++ using srfunction::find_decl; ++ using srfunction::record_decl; ++ using srfunction::decls; ++}; ++ ++struct srtype ++{ ++ tree type; ++ auto_vec_del fields; ++ ++ // array of fields that use this type. ++ auto_vec field_sites; ++ ++ // array of functions which use directly the type ++ auto_vec functions; ++ ++ auto_vec_del accesses; ++ bool chain_type; ++ ++private: ++ escape_type escapes; ++public: ++ ++ tree newtype[max_split]; ++ bool visited; ++ ++ // Constructors ++ srtype(tree type); ++ ++ // Methods ++ void dump (FILE *file); ++ void simple_dump (FILE *file); ++ void add_function (srfunction *); ++ void add_access (sraccess *a) ++ { ++ accesses.safe_push (a); ++ } ++ void add_field_site (srfield *); ++ ++ srfield *find_field (unsigned HOST_WIDE_INT offset); ++ ++ bool create_new_type (void); ++ void analyze (void); ++ void mark_escape (escape_type, gimple *stmt); ++ bool has_escaped (void) ++ { ++ return escapes != does_not_escape; ++ } ++ const char *escape_reason (void) ++ { ++ if (!has_escaped()) ++ return NULL; ++ return escape_type_string[escapes-1]; ++ } ++ bool escaped_rescusive (void) ++ { ++ return escapes == escape_rescusive_type; ++ } ++ bool has_new_type (void) ++ { ++ return newtype[0] && newtype[0] != type; ++ } ++}; ++ ++struct srfield ++{ ++ unsigned HOST_WIDE_INT offset; ++ tree fieldtype; ++ tree fielddecl; ++ srtype *base; ++ srtype *type; ++ ++ unsigned clusternum; ++ ++ tree newfield[max_split]; ++ ++ // Constructors ++ srfield (tree field, srtype *base); ++ ++ // Methods ++ void dump (FILE *file); ++ void simple_dump (FILE *file); ++ ++ void create_new_fields (tree newtype[max_split], ++ tree newfields[max_split], ++ tree newlast[max_split]); ++}; ++ ++struct sraccess ++{ ++ gimple *stmt; ++ cgraph_node *node; ++ ++ srtype *type; ++ // NULL field means the whole type is accessed ++ srfield *field; ++ ++ // constructors ++ sraccess (gimple *s, cgraph_node *n, srtype *t, srfield *f = NULL) ++ : stmt (s), ++ node (n), ++ type (t), ++ field (f) ++ {} ++ ++ // Methods ++ void dump (FILE *file); ++}; ++ ++struct srdecl ++{ ++ srtype *type; ++ tree decl; ++ tree func; ++ /* -1 : not an argument ++ -2 : static chain */ ++ int 
argumentnum; ++ ++ bool visited; ++ ++ tree newdecl[max_split]; ++ ++ // Constructors ++ srdecl (srtype *type, tree decl, int argumentnum = -1); ++ ++ // Methods ++ void dump (FILE *file); ++ bool has_new_decl (void) ++ { ++ return newdecl[0] && newdecl[0] != decl; ++ } ++}; ++ ++ ++} // namespace struct_reorg ++ ++#endif +diff -Nurp a/gcc/Makefile.in b/gcc/Makefile.in +--- a/gcc/Makefile.in 2020-03-12 07:07:20.000000000 -0400 ++++ b/gcc/Makefile.in 2020-06-16 22:56:07.732000000 -0400 +@@ -1367,6 +1367,7 @@ OBJS = \ + incpath.o \ + init-regs.o \ + internal-fn.o \ ++ ipa-struct-reorg/ipa-struct-reorg.o \ + ipa-cp.o \ + ipa-devirt.o \ + ipa-fnsummary.o \ +diff -Nurp a/gcc/params.def b/gcc/params.def +--- a/gcc/params.def 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/params.def 2020-06-16 22:56:07.732000000 -0400 +@@ -42,6 +42,16 @@ along with GCC; see the file COPYING3. + + Be sure to add an entry to invoke.texi summarizing the parameter. */ + ++/* The threshold ratio between current and hottest structure counts. ++ We say that if the ratio of the current structure count, ++ calculated by profiling, to the hottest structure count ++ in the program is less than this parameter, then structure ++ reorganization is not applied. The default is 10%. */ ++DEFPARAM (PARAM_STRUCT_REORG_COLD_STRUCT_RATIO, ++ "struct-reorg-cold-struct-ratio", ++ "The threshold ratio between current and hottest structure counts", ++ 10, 0, 100) ++ + /* When branch is predicted to be taken with probability lower than this + threshold (in percent), then it is considered well predictable. */ + DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME, +diff -Nurp a/gcc/params.h b/gcc/params.h +--- a/gcc/params.h 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/params.h 2020-06-16 22:56:07.732000000 -0400 +@@ -130,6 +130,8 @@ extern int default_param_value (compiler + extern void init_param_values (int *params); + + /* Macros for the various parameters. */ ++#define STRUCT_REORG_COLD_STRUCT_RATIO \ ++ PARAM_VALUE (PARAM_STRUCT_REORG_COLD_STRUCT_RATIO) + #define MAX_INLINE_INSNS_SINGLE \ + PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SINGLE) + #define MAX_INLINE_INSNS \ +diff -Nurp a/gcc/passes.def b/gcc/passes.def +--- a/gcc/passes.def 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/passes.def 2020-06-16 22:56:07.732000000 -0400 +@@ -169,6 +169,8 @@ along with GCC; see the file COPYING3. + INSERT_PASSES_AFTER (all_late_ipa_passes) + NEXT_PASS (pass_materialize_all_clones); + NEXT_PASS (pass_ipa_pta); ++ /* FIXME: this should a normal IP pass */ ++ NEXT_PASS (pass_ipa_struct_reorg); + NEXT_PASS (pass_omp_simd_clone); + TERMINATE_PASS_LIST (all_late_ipa_passes) + +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/20170404-1.c b/gcc/testsuite/gcc.c-torture/compile/20170404-1.c +--- a/gcc/testsuite/gcc.c-torture/compile/20170404-1.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.c-torture/compile/20170404-1.c 2020-06-16 22:56:07.732000000 -0400 +@@ -0,0 +1,19 @@ ++struct a ++{ ++ int t, t1; ++}; ++ ++static struct a *b; ++ ++void *xmalloc(int); ++ ++ ++void f(void) ++{ ++ b = xmalloc (sizeof(*b)); ++} ++ ++int g(void) ++{ ++ return b->t; ++} +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/nested-3.c b/gcc/testsuite/gcc.c-torture/compile/nested-3.c +--- a/gcc/testsuite/gcc.c-torture/compile/nested-3.c 2020-03-12 07:07:22.000000000 -0400 ++++ b/gcc/testsuite/gcc.c-torture/compile/nested-3.c 2020-06-16 22:56:07.736000000 -0400 +@@ -1,3 +1,4 @@ ++/* This used to crash Struct reorg. 
*/ + struct a + { + int t; +diff -Nurp a/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c b/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c +--- a/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.c-torture/compile/struct-reorg-1.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,18 @@ ++#include ++typedef struct { ++ long laststart_offset; ++ unsigned regnum; ++} compile_stack_elt_t; ++typedef struct { ++ compile_stack_elt_t *stack; ++ unsigned size; ++} compile_stack_type; ++void f (const char *p, const char *pend, int c) ++{ ++ compile_stack_type compile_stack; ++ while (p != pend) ++ if (c) ++ compile_stack.stack = realloc (compile_stack.stack, ++ (compile_stack.size << 1) ++ * sizeof (compile_stack_elt_t)); ++} +diff -Nurp a/gcc/testsuite/gcc.dg/pr33136-4.c b/gcc/testsuite/gcc.dg/pr33136-4.c +--- a/gcc/testsuite/gcc.dg/pr33136-4.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/pr33136-4.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,59 @@ ++/* PR tree-optimization/33136 */ ++/* { dg-do run } */ ++/* { dg-options "-O2" } */ ++ ++extern void abort (void); ++ ++struct S ++{ ++ int b; ++ int *c; ++}; ++static int d, e; ++ ++static struct S s; ++ ++static int * ++__attribute__((noinline, const)) ++foo (void) ++{ ++ return &s.b; ++} ++ ++int * ++__attribute__((noinline)) ++bar (int **f) ++{ ++ s.c = &d; ++ *f = &e; ++ /* As nothing ever takes the address of any int * field in struct S, ++ the write to *f can't alias with the s.c field. */ ++ return s.c; ++} ++ ++int ++__attribute__((noinline)) ++baz (int *x) ++{ ++ s.b = 1; ++ *x = 4; ++ /* Function foo takes address of an int field in struct S, ++ so *x can alias with the s.b field (and it does in this testcase). */ ++ return s.b; ++} ++ ++int ++__attribute__((noinline)) ++t (void) ++{ ++ int *f = (int *) 0; ++ return 10 * (bar (&f) != &d) + baz (foo ()); ++} ++ ++int ++main (void) ++{ ++ if (t () != 4) ++ abort (); ++ return 0; ++} +diff -Nurp a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp +--- a/gcc/testsuite/gcc.dg/struct/struct-reorg.exp 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/struct-reorg.exp 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,87 @@ ++# Copyright (C) 2007, 2008, 2009, 2010 ++# Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; see the file COPYING3. If not see ++# . ++ ++# Test the functionality of programs compiled with profile-directed structure ++# rearrangement using -fprofile-generate followed by -fprofile-use. ++ ++load_lib gcc-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. 
++dg-init ++torture-init ++ ++set STRUCT_REORG_TORTURE_OPTIONS [list \ ++ { -O1 } \ ++ { -O1 -g } \ ++ { -O2 } \ ++ { -O2 -g } \ ++ { -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions } \ ++ { -O3 -g } \ ++ { -Os } ] ++ ++ ++set-torture-options $STRUCT_REORG_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS ++ ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wo_prof_*.c]] "" "-fipa-struct-reorg -fdump-ipa-all -fwhole-program" ++ ++torture-finish ++dg-final ++ ++# Some targets don't support tree profiling. ++if { ![check_profiling_available ""] } { ++ return ++} ++ ++# The procedures in profopt.exp need these parameters. ++set tool gcc ++set prof_ext "gcda" ++ ++# Override the list defined in profopt.exp. ++set PROFOPT_OPTIONS [list {}] ++ ++if $tracelevel then { ++ strace $tracelevel ++} ++ ++# Load support procs. ++load_lib profopt.exp ++ ++# These are globals used by profopt-execute. The first is options ++# needed to generate profile data, the second is options to use the ++# profile data. ++set common "-O3 -fwhole-program" ++set profile_option [concat $common " -fprofile-generate"] ++set feedback_option [concat $common " -fprofile-use -fipa-struct-reorg -fdump-ipa-all"] ++ ++foreach src [lsort [glob -nocomplain $srcdir/$subdir/w_prof_*.c]] { ++ # If we're only testing specific files and this isn't one of them, skip it. ++ if ![runtest_file_p $runtests $src] then { ++ continue ++ } ++ profopt-execute $src ++} ++ ++set feedback_option [concat $feedback_option " --param struct-reorg-cold-struct-ratio=30"] ++ ++foreach src [lsort [glob -nocomplain $srcdir/$subdir/w_ratio_*.c]] { ++ # If we're only testing specific files and this isn't one of them, skip it. ++ if ![runtest_file_p $runtests $src] then { ++ continue ++ } ++ profopt-execute $src ++} +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_array_field.c b/gcc/testsuite/gcc.dg/struct/wo_prof_array_field.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_array_field.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_array_field.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct basic ++{ ++ int a; ++ int b[10]; ++} type_struct; ++ ++type_struct *str1; ++ ++int main() ++{ ++ int i; ++ ++ str1 = malloc (10 * sizeof (type_struct)); ++ ++ for (i=0; i<=9; i++) ++ str1[i].a = str1[i].b[0]; ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_array_through_pointer.c b/gcc/testsuite/gcc.dg/struct/wo_prof_array_through_pointer.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_array_through_pointer.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_array_through_pointer.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ int i; ++ str_t A[N]; ++ str_t *p = A; ++ ++ for (i = 0; i < N; i++) ++ p[i].a = 0; ++ ++ for (i = 0; i < N; i++) ++ if (p[i].a != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } 
*/ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_double_malloc.c b/gcc/testsuite/gcc.dg/struct/wo_prof_double_malloc.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_double_malloc.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_double_malloc.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++ ++typedef struct test_struct ++{ ++ int a; ++ int b; ++} type_struct; ++ ++typedef type_struct **struct_pointer2; ++ ++struct_pointer2 str1; ++ ++int main() ++{ ++ int i, j; ++ ++ str1 = malloc (2 * sizeof (type_struct *)); ++ ++ for (i = 0; i <= 1; i++) ++ str1[i] = malloc (2 * sizeof (type_struct)); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_empty_str.c b/gcc/testsuite/gcc.dg/struct/wo_prof_empty_str.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_empty_str.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_empty_str.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,44 @@ ++/* { dg-do run } */ ++ ++#include ++ ++struct S { int a; struct V *b; }; ++typedef struct { int c; } T; ++typedef struct { int d; int e; } U; ++ ++void * ++fn (void *x) ++{ ++ return x; ++} ++ ++int ++foo (struct S *s) ++{ ++ T x; ++ ++ T y = *(T *)fn (&x); ++ return y.c; ++} ++ ++int ++bar (struct S *s) ++{ ++ U x; ++ ++ U y = *(U *)fn (&x); ++ return y.d + s->a; ++} ++ ++int ++main () ++{ ++ struct S s; ++ ++ foo(&s) + bar (&s); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "No structures to transform" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_arg_to_local.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_arg_to_local.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_arg_to_local.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_arg_to_local.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,44 @@ ++/* { dg-do run } */ ++ ++#include ++struct str ++{ ++ int a; ++ float b; ++}; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++foo (struct str * p_str) ++{ ++ static int sum = 0; ++ ++ sum = sum + p_str->a; ++ return sum; ++} ++ ++int ++main () ++{ ++ int i, sum; ++ struct str * p = malloc (N * sizeof (struct str)); ++ if (p == NULL) ++ return 0; ++ for (i = 0; i < N; i++) ++ sum = foo (p+i); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ ++ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return-1.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return-1.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return-1.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return-1.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,32 @@ ++/* { dg-do run } */ ++ ++#include ++ ++struct A { ++ int d; ++ int d1; ++}; ++ ++struct A a; ++ ++struct A *foo () __attribute__((noinline)); ++struct A *foo () ++{ ++ a.d = 5; ++ return &a; ++} ++ ++int ++main () ++{ ++ a.d = 0; ++ foo (); ++ ++ if (a.d != 5) ++ abort (); ++ ++ return 0; ++} ++ 
++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "has escaped. .Type escapes via a return" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_return.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,31 @@ ++/* { dg-do run } */ ++ ++#include ++ ++struct A { ++ int d; ++}; ++ ++struct A a; ++ ++struct A foo () __attribute__((noinline)); ++struct A foo () ++{ ++ a.d = 5; ++ return a; ++} ++ ++int ++main () ++{ ++ a.d = 0; ++ foo (); ++ ++ if (a.d != 5) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "has escaped: \"Type escapes via a return" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_str_init.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,31 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ int b; ++}str_t; ++ ++#define N 2 ++ ++str_t A[2] = {{1,1},{2,2}}; ++ ++int ++main () ++{ ++ int i; ++ ++ for (i = 0; i < N; i++) ++ A[i].b = A[i].a; ++ ++ for (i = 0; i < N; i++) ++ if (A[i].b != A[i].a) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "has escaped...Type is used in an array" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_array.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_array.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_array.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_array.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#define N 1000 ++ ++typedef struct ++{ ++ str_t A[N]; ++ int c; ++}str_with_substr_t; ++ ++str_with_substr_t a; ++ ++int ++main () ++{ ++ int i; ++ ++ for (i = 0; i < N; i++) ++ a.A[i].b = 0; ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_pointer.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_pointer.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_pointer.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_pointer.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,48 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 16000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/16) ++#endif ++#else ++#define N 1000 ++#endif ++ ++typedef struct ++{ ++ str_t * sub_str; ++ int c; ++}str_with_substr_t; ++ ++int foo; ++ ++int ++main (void) ++{ ++ int i; ++ str_with_substr_t A[N]; ++ str_t a[N]; ++ ++ for (i=0; i < N; i++) ++ A[i].sub_str = &(a[i]); ++ ++ for (i=0; 
i < N; i++) ++ A[i].sub_str->a = 5; ++ ++ foo = A[56].sub_str->a; ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "has escaped...Type is used in an array" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_value.c b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_value.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_value.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_escape_substr_value.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++ ++typedef struct ++{ ++ str_t sub_str; ++ int c; ++}str_with_substr_t; ++ ++int ++main () ++{ ++ int i; ++ str_with_substr_t A[N]; ++ ++ for (i = 0; i < N; i++) ++ A[i].sub_str.a = 5; ++ ++ for (i = 0; i < N; i++) ++ if (A[i].sub_str.a != 5) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "has escaped...Type is used in an array" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_global_array.c b/gcc/testsuite/gcc.dg/struct/wo_prof_global_array.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_global_array.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_global_array.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,32 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#define N 1000 ++str_t A[N]; ++ ++int ++main () ++{ ++ int i; ++ ++ for (i = 0; i < N; i++) ++ { ++ A[i].a = 0; ++ } ++ ++ for (i = 0; i < N; i++) ++ if (A[i].a != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_global_var.c b/gcc/testsuite/gcc.dg/struct/wo_prof_global_var.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_global_var.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_global_var.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++str_t *p; ++ ++int ++main () ++{ ++ int i, sum; ++ ++ p = malloc (N * sizeof (str_t)); ++ if (p == NULL) ++ return 0; ++ for (i = 0; i < N; i++) ++ p[i].b = i; ++ ++ for (i = 0; i < N; i++) ++ p[i].b = p[i].a + 1; ++ ++ for (i = 0; i < N; i++) ++ if (p[i].b != p[i].a + 1) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_local_array.c b/gcc/testsuite/gcc.dg/struct/wo_prof_local_array.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_local_array.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_local_array.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,40 
@@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ int i; ++ str_t A[N]; ++ ++ for (i = 0; i < N; i++) ++ { ++ A[i].a = 0; ++ } ++ ++ for (i = 0; i < N; i++) ++ if (A[i].a != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_local_var.c b/gcc/testsuite/gcc.dg/struct/wo_prof_local_var.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_local_var.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_local_var.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,43 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ int i, sum; ++ ++ str_t * p = malloc (N * sizeof (str_t)); ++ if (p == NULL) ++ return 0; ++ for (i = 0; i < N; i++) ++ p[i].b = i; ++ ++ for (i = 0; i < N; i++) ++ p[i].b = p[i].a + 1; ++ ++ for (i = 0; i < N; i++) ++ if (p[i].b != p[i].a + 1) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var-1.c b/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var-1.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var-1.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var-1.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ long i, num; ++ ++ num = rand(); ++ num = num > N ? N : num; ++ str_t * p = malloc (num * sizeof (str_t)); ++ ++ if (p == 0) ++ return 0; ++ ++ for (i = 1; i <= num; i++) ++ p[i-1].b = i; ++ ++ for (i = 1; i <= num; i++) ++ p[i-1].a = p[i-1].b + 1; ++ ++ for (i = 0; i < num; i++) ++ if (p[i].a != p[i].b + 1) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var.c b/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_malloc_size_var.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ int i, num; ++ ++ num = rand(); ++ num = num > N ? 
N : num; ++ str_t * p = malloc (num * sizeof (str_t)); ++ ++ if (p == 0) ++ return 0; ++ ++ for (i = 0; i < num; i++) ++ p[i].b = i; ++ ++ for (i = 0; i < num; i++) ++ p[i].a = p[i].b + 1; ++ ++ for (i = 0; i < num; i++) ++ if (p[i].a != p[i].b + 1) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c b/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_mult_field_peeling.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,42 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++ int c; ++ float d; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 1600 ++#define N 100 ++#else ++#define N (STACK_SIZE/16) ++#endif ++#else ++#define N 100 ++#endif ++ ++int ++main () ++{ ++ int i; ++ str_t *p = malloc (N * sizeof (str_t)); ++ if (p == NULL) ++ return 0; ++ for (i = 0; i < N; i++) ++ p[i].a = 5; ++ ++ for (i = 0; i < N; i++) ++ if (p[i].a != 5) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* The structure str_t is erroneously peeled into 4 structures instead of 2. */ ++/* { dg-final { scan-ipa-dump "the number of new types is 2" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_single_str_global.c b/gcc/testsuite/gcc.dg/struct/wo_prof_single_str_global.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_single_str_global.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_single_str_global.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++typedef struct ++{ ++ int a; ++ int b; ++}str_t; ++ ++#define N 3 ++ ++str_t str; ++ ++int ++main () ++{ ++ int i; ++ int res = 1<<(1< ++typedef struct ++{ ++ int a; ++ int b; ++}str_t; ++ ++#define N 3 ++ ++int ++main () ++{ ++ int i; ++ int res = 1<<(1< ++typedef struct ++{ ++ int a; ++ int *b; ++}str_t; ++ ++#define N 3 ++ ++str_t *p; ++ ++int ++main () ++{ ++ str_t str; ++ int i; ++ int res = 1 << (1 << N); ++ p = &str; ++ str.a = 2; ++ ++ p->b = &(p->a); ++ ++ for (i=0; i < N; i++) ++ p->a = *(p->b)*(*(p->b)); ++ ++ if (p->a != res) ++ abort (); ++ ++ /* POSIX ignores all but the 8 low-order bits, but other ++ environments may not. 
*/ ++ return (p->a & 255); ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "has escaped...Type escapes a cast to a different" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/wo_prof_two_strs.c b/gcc/testsuite/gcc.dg/struct/wo_prof_two_strs.c +--- a/gcc/testsuite/gcc.dg/struct/wo_prof_two_strs.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/wo_prof_two_strs.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,67 @@ ++/* { dg-do compile } */ ++/* { dg-do run } */ ++ ++#include ++ ++typedef struct ++{ ++ int a; ++ float b; ++}str_t1; ++ ++typedef struct ++{ ++ int c; ++ float d; ++}str_t2; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 16000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/16) ++#endif ++#else ++#define N 1000 ++#endif ++ ++str_t1 *p1; ++str_t2 *p2; ++int num; ++ ++void ++foo (void) ++{ ++ int i; ++ ++ for (i=0; i < num; i++) ++ p2[i].c = 2; ++} ++ ++int ++main () ++{ ++ int i, r; ++ ++ r = rand (); ++ num = r > N ? N : r; ++ p1 = malloc (num * sizeof (str_t1)); ++ p2 = malloc (num * sizeof (str_t2)); ++ ++ if (p1 == NULL || p2 == NULL) ++ return 0; ++ ++ for (i = 0; i < num; i++) ++ p1[i].a = 1; ++ ++ foo (); ++ ++ for (i = 0; i < num; i++) ++ if (p1[i].a != 1 || p2[i].c != 2) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final { scan-ipa-dump "Number of structures to transform is 2" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c b/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_global_array.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,29 @@ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#define N 1000 ++str_t A[N]; ++ ++int ++main () ++{ ++ int i; ++ ++ for (i = 0; i < N; i++) ++ { ++ A[i].a = 0; ++ } ++ ++ for (i = 0; i < N; i++) ++ if (A[i].a != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c b/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_global_var.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,42 @@ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++str_t *p; ++ ++int ++main () ++{ ++ int i, sum; ++ ++ p = malloc (N * sizeof (str_t)); ++ if (p == NULL) ++ return 0; ++ for (i = 0; i < N; i++) ++ p[i].b = i; ++ ++ for (i = 0; i < N; i++) ++ p[i].a = p[i].b + 1; ++ ++ for (i = 0; i < N; i++) ++ if (p[i].a != p[i].b + 1) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c b/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c 1969-12-31 19:00:00.000000000 -0500 ++++ 
b/gcc/testsuite/gcc.dg/struct/w_prof_local_array.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,37 @@ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ int i; ++ str_t A[N]; ++ ++ for (i = 0; i < N; i++) ++ { ++ A[i].a = 0; ++ } ++ ++ for (i = 0; i < N; i++) ++ if (A[i].a != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c b/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_local_var.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,40 @@ ++#include ++typedef struct ++{ ++ int a; ++ float b; ++}str_t; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 8000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/8) ++#endif ++#else ++#define N 1000 ++#endif ++ ++int ++main () ++{ ++ int i, sum; ++ ++ str_t * p = malloc (N * sizeof (str_t)); ++ if (p == NULL) ++ return 0; ++ for (i = 0; i < N; i++) ++ p[i].b = i; ++ ++ for (i = 0; i < N; i++) ++ p[i].a = p[i].b + 1; ++ ++ for (i = 0; i < N; i++) ++ if (p[i].a != p[i].b + 1) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c b/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c +--- a/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/w_prof_single_str_global.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,31 @@ ++#include ++typedef struct ++{ ++ int a; ++ int b; ++}str_t; ++ ++#define N 3 ++ ++str_t str; ++ ++int ++main () ++{ ++ int i; ++ int res = 1<<(1< ++ ++typedef struct ++{ ++ int a; ++ float b; ++}str_t1; ++ ++typedef struct ++{ ++ int c; ++ float d; ++}str_t2; ++ ++#ifdef STACK_SIZE ++#if STACK_SIZE > 16000 ++#define N 1000 ++#else ++#define N (STACK_SIZE/16) ++#endif ++#else ++#define N 1000 ++#endif ++ ++str_t1 *p1; ++str_t2 *p2; ++int num; ++ ++void ++foo (void) ++{ ++ int i; ++ ++ for (i=0; i < num; i++) ++ p2[i].c = 2; ++} ++ ++int ++main () ++{ ++ int i, r; ++ ++ r = rand (); ++ num = r > N ? 
N : r; ++ p1 = malloc (num * sizeof (str_t1)); ++ p2 = malloc (num * sizeof (str_t2)); ++ ++ if (p1 == NULL || p2 == NULL) ++ return 0; ++ ++ for (i = 0; i < num; i++) ++ p1[i].a = 1; ++ ++ foo (); ++ ++ for (i = 0; i < num; i++) ++ if (p1[i].a != 1 || p2[i].c != 2) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 2" "struct_reorg" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c b/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c +--- a/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/struct/w_ratio_cold_str.c 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,43 @@ ++#include ++typedef struct ++{ ++ int a; ++ int b; ++}str_t1; ++ ++typedef struct ++{ ++ float a; ++ float b; ++}str_t2; ++ ++#define N1 1000 ++#define N2 100 ++str_t1 A1[N1]; ++str_t2 A2[N2]; ++ ++int ++main () ++{ ++ int i; ++ ++ for (i = 0; i < N1; i++) ++ A1[i].a = 0; ++ ++ for (i = 0; i < N2; i++) ++ A2[i].a = 0; ++ ++ for (i = 0; i < N1; i++) ++ if (A1[i].a != 0) ++ abort (); ++ ++ for (i = 0; i < N2; i++) ++ if (A2[i].a != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/*--------------------------------------------------------------------------*/ ++/* Arrays are not handled. */ ++/* { dg-final-use { scan-ipa-dump "Number of structures to transform is 1" "struct_reorg" { xfail *-*-* } } } */ +diff -Nurp a/gcc/testsuite/g++.dg/torture/pr38355.C b/gcc/testsuite/g++.dg/torture/pr38355.C +--- a/gcc/testsuite/g++.dg/torture/pr38355.C 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/g++.dg/torture/pr38355.C 2020-06-16 22:56:07.736000000 -0400 +@@ -0,0 +1,25 @@ ++// { dg-do run } ++// { dg-options "-fwhole-program -fipa-struct-reorg" } ++template struct A ++{ ++ char c; ++ void foo(int); ++ void bar(int i) { foo(i+1); } ++}; ++ ++template struct B : virtual A<0> {}; ++ ++template inline void baz(B& b, int i) ++{ ++ if (i) b.bar(0); ++} ++ ++extern template class A<0>; ++extern template void baz(B<0>&, int); ++ ++int main() ++{ ++ B<0> b; ++ baz(b, 0); ++ return 0; ++} +diff -Nurp a/gcc/timevar.def b/gcc/timevar.def +--- a/gcc/timevar.def 2020-03-12 07:07:23.000000000 -0400 ++++ b/gcc/timevar.def 2020-06-16 22:56:07.736000000 -0400 +@@ -77,6 +77,7 @@ DEFTIMEVAR (TV_IPA_CONSTANT_PROP , " + DEFTIMEVAR (TV_IPA_INLINING , "ipa inlining heuristics") + DEFTIMEVAR (TV_IPA_FNSPLIT , "ipa function splitting") + DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") ++DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") + DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream inflate") + DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream deflate") +diff -Nurp a/gcc/tree-pass.h b/gcc/tree-pass.h +--- a/gcc/tree-pass.h 2020-03-12 07:07:23.000000000 -0400 ++++ b/gcc/tree-pass.h 2020-06-16 22:56:07.736000000 -0400 +@@ -504,6 +504,7 @@ extern ipa_opt_pass_d *make_pass_ipa_dev + extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); diff --git 
a/ivopts-1.patch b/ivopts-1.patch index 2c5e62c..6f2e041 100644 --- a/ivopts-1.patch +++ b/ivopts-1.patch @@ -1,3 +1,6 @@ +re-PR-tree-optimization-90240-ICE-in-try_improve_iv_.patch: +commit 98d8f142132ac670da2dc99cce530048343ab948 + diff -urpN a/gcc/testsuite/gfortran.dg/graphite/pr90240.f b/gcc/testsuite/gfortran.dg/graphite/pr90240.f new file mode 100644 --- /dev/null diff --git a/ivopts-2.patch b/ivopts-2.patch index c9cbec1..9bd0b68 100644 --- a/ivopts-2.patch +++ b/ivopts-2.patch @@ -1,3 +1,6 @@ +re-PR-tree-optimization-90078-ICE-with-deep-template.patch: +commit 8363a2f1f7c47d7b3d1760ce631a6824e91c0d80 + diff -urpN a/gcc/testsuite/g++.dg/tree-ssa/pr90078.C b/gcc/testsuite/g++.dg/tree-ssa/pr90078.C new file mode 100644 --- /dev/null diff --git a/loop-finite-bugfix.patch b/loop-finite-bugfix.patch index c159a8b..a290507 100644 --- a/loop-finite-bugfix.patch +++ b/loop-finite-bugfix.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-c-94392-only-enable-ffinite-loops-for-C.patch +75efe9cb1f8938a713ce540dc3b27bc2afcd3fae + diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c index 6b6c754ad86..58ba0948e79 100644 --- a/gcc/c-family/c-opts.c diff --git a/loop-finite.patch b/loop-finite.patch index 945ea28..cc2543e 100644 --- a/loop-finite.patch +++ b/loop-finite.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-PR-tree-optimization-89713-Assume-loop-with-an-exit-.patch +c29c92c789d93848cc1c929838771bfc68cb272c + diff --git a/gcc/common.opt b/gcc/common.opt index e1404165feb..a1544d06824 100644 --- a/gcc/common.opt diff --git a/loop-split.patch b/loop-split.patch index c689060..d99db75 100644 --- a/loop-split.patch +++ b/loop-split.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
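An illustrative sketch, not part of loop-split.patch: the backported change listed just below splits a loop at a "semi-invariant" condition, i.e. one that, once it changes value, keeps that value for the remaining iterations. A minimal, hypothetical example of the shape it targets:

    int
    foo (int *a, int n, int b)
    {
      int s = 0;
      for (int i = 0; i < n; i++)
        {
          if (b < 16)   /* semi-invariant: flips to false once, then stays false */
            b++;
          s += a[i];
        }
      return s;
    }

After splitting, one copy of the loop runs while b < 16 and a second, branch-free copy handles the remaining iterations.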
+ +0001-Loop-split-on-semi-invariant-conditional-statement.patch +095f78c62157124ad479a3f98b6995ced090b807 + diff --git a/gcc/params.def b/gcc/params.def index 942447d77e6..df7d1f7c5e7 100644 --- a/gcc/params.def diff --git a/medium-code-mode.patch b/medium-code-mode.patch new file mode 100644 index 0000000..9133683 --- /dev/null +++ b/medium-code-mode.patch @@ -0,0 +1,426 @@ +diff -Nurp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +--- a/gcc/config/aarch64/aarch64.c 2020-07-16 14:54:30.588000000 +0800 ++++ b/gcc/config/aarch64/aarch64.c 2020-07-16 15:06:33.000000000 +0800 +@@ -2030,6 +2030,32 @@ aarch64_load_symref_appropriately (rtx d + emit_insn (gen_add_losym (dest, tmp_reg, imm)); + return; + } ++ case SYMBOL_MEDIUM_ABSOLUTE: ++ { ++ rtx tmp_reg = dest; ++ machine_mode mode = GET_MODE (dest); ++ ++ gcc_assert (mode == Pmode || mode == ptr_mode); ++ if (can_create_pseudo_p ()) ++ tmp_reg = gen_reg_rtx (mode); ++ ++ if (mode == DImode) ++ { ++ emit_insn ( ++ gen_load_symbol_medium_di (dest, tmp_reg, imm)); ++ } ++ else ++ { ++ emit_insn ( ++ gen_load_symbol_medium_si (dest, tmp_reg, imm)); ++ } ++ if (REG_P (dest)) ++ { ++ set_unique_reg_note ( ++ get_last_insn (), REG_EQUIV, copy_rtx (imm)); ++ } ++ return; ++ } + + case SYMBOL_TINY_ABSOLUTE: + emit_insn (gen_rtx_SET (dest, imm)); +@@ -2152,6 +2178,64 @@ aarch64_load_symref_appropriately (rtx d + return; + } + ++ case SYMBOL_MEDIUM_GOT_4G: ++ { ++ rtx tmp_reg = dest; ++ machine_mode mode = GET_MODE (dest); ++ if (can_create_pseudo_p ()) ++ { ++ tmp_reg = gen_reg_rtx (mode); ++ } ++ rtx insn; ++ rtx mem; ++ rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_"); ++ ++ if (mode == DImode) ++ { ++ emit_insn ( ++ gen_load_symbol_medium_di (tmp_reg, dest, s)); ++ } ++ else ++ { ++ emit_insn ( ++ gen_load_symbol_medium_si (tmp_reg, dest, s)); ++ } ++ if (REG_P (dest)) ++ { ++ set_unique_reg_note ( ++ get_last_insn (), REG_EQUIV, copy_rtx (s)); ++ } ++ ++ if (mode == ptr_mode) ++ { ++ if (mode == DImode) ++ { ++ emit_insn (gen_get_gotoff_di (dest, imm)); ++ insn = gen_ldr_got_medium_di ( ++ dest, tmp_reg, dest); ++ } ++ else ++ { ++ emit_insn (gen_get_gotoff_si (dest, imm)); ++ insn = gen_ldr_got_medium_si ( ++ dest, tmp_reg, dest); ++ } ++ mem = XVECEXP (SET_SRC (insn), 0, 0); ++ } ++ else ++ { ++ gcc_assert (mode == Pmode); ++ emit_insn (gen_get_gotoff_di (dest, imm)); ++ insn = gen_ldr_got_medium_sidi (dest, tmp_reg, dest); ++ mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0); ++ } ++ ++ gcc_assert (GET_CODE (mem) == MEM); ++ MEM_READONLY_P (mem) = 1; ++ MEM_NOTRAP_P (mem) = 1; ++ emit_insn (insn); ++ return; ++ } + case SYMBOL_SMALL_TLSGD: + { + rtx_insn *insns; +@@ -3372,11 +3456,12 @@ aarch64_expand_mov_immediate (rtx dest, + + return; + +- case SYMBOL_SMALL_TLSGD: +- case SYMBOL_SMALL_TLSDESC: ++ case SYMBOL_SMALL_TLSGD: ++ case SYMBOL_SMALL_TLSDESC: + case SYMBOL_SMALL_TLSIE: + case SYMBOL_SMALL_GOT_28K: + case SYMBOL_SMALL_GOT_4G: ++ case SYMBOL_MEDIUM_GOT_4G: + case SYMBOL_TINY_GOT: + case SYMBOL_TINY_TLSIE: + if (const_offset != 0) +@@ -3395,6 +3480,7 @@ aarch64_expand_mov_immediate (rtx dest, + case SYMBOL_TLSLE24: + case SYMBOL_TLSLE32: + case SYMBOL_TLSLE48: ++ case SYMBOL_MEDIUM_ABSOLUTE: + aarch64_load_symref_appropriately (dest, imm, sty); + return; + +@@ -10334,6 +10420,13 @@ cost_plus: + if (speed) + *cost += extra_cost->alu.arith; + } ++ else if (aarch64_cmodel == AARCH64_CMODEL_MEDIUM ++ || aarch64_cmodel == AARCH64_CMODEL_MEDIUM_PIC) ++ { ++ /* 4 movs adr sub add 2movs ldr. 
*/ ++ if (speed) ++ *cost += 7*extra_cost->alu.arith; ++ } + + if (flag_pic) + { +@@ -10341,6 +10434,8 @@ cost_plus: + *cost += COSTS_N_INSNS (1); + if (speed) + *cost += extra_cost->ldst.load; ++ if (aarch64_cmodel == AARCH64_CMODEL_MEDIUM_PIC) ++ *cost += 2*extra_cost->alu.arith; + } + return true; + +@@ -11395,6 +11490,7 @@ initialize_aarch64_tls_size (struct gcc_ + if (aarch64_tls_size > 32) + aarch64_tls_size = 32; + break; ++ case AARCH64_CMODEL_MEDIUM: + case AARCH64_CMODEL_LARGE: + /* The maximum TLS size allowed under large is 16E. + FIXME: 16E should be 64bit, we only support 48bit offset now. */ +@@ -12187,6 +12283,9 @@ initialize_aarch64_code_model (struct gc + aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC; + #endif + break; ++ case AARCH64_CMODEL_MEDIUM: ++ aarch64_cmodel = AARCH64_CMODEL_MEDIUM_PIC; ++ break; + case AARCH64_CMODEL_LARGE: + sorry ("code model %qs with %<-f%s%>", "large", + opts->x_flag_pic > 1 ? "PIC" : "pic"); +@@ -12205,6 +12304,7 @@ static void + aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts) + { + ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string; ++ ptr->x_aarch64_data_threshold = opts->x_aarch64_data_threshold; + ptr->x_aarch64_branch_protection_string + = opts->x_aarch64_branch_protection_string; + } +@@ -12220,6 +12320,7 @@ aarch64_option_restore (struct gcc_optio + opts->x_explicit_arch = ptr->x_explicit_arch; + selected_arch = aarch64_get_arch (ptr->x_explicit_arch); + opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string; ++ opts->x_aarch64_data_threshold = ptr->x_aarch64_data_threshold; + opts->x_aarch64_branch_protection_string + = ptr->x_aarch64_branch_protection_string; + if (opts->x_aarch64_branch_protection_string) +@@ -13067,6 +13168,8 @@ aarch64_classify_symbol (rtx x, HOST_WID + + case AARCH64_CMODEL_SMALL_SPIC: + case AARCH64_CMODEL_SMALL_PIC: ++ case AARCH64_CMODEL_MEDIUM_PIC: ++ case AARCH64_CMODEL_MEDIUM: + case AARCH64_CMODEL_SMALL: + return SYMBOL_SMALL_ABSOLUTE; + +@@ -13100,6 +13203,7 @@ aarch64_classify_symbol (rtx x, HOST_WID + return SYMBOL_TINY_ABSOLUTE; + + case AARCH64_CMODEL_SMALL: ++ AARCH64_SMALL_ROUTINE: + /* Same reasoning as the tiny code model, but the offset cap here is + 4G. */ + if ((SYMBOL_REF_WEAK (x) +@@ -13121,7 +13225,48 @@ aarch64_classify_symbol (rtx x, HOST_WID + ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G); + return SYMBOL_SMALL_ABSOLUTE; + ++ case AARCH64_CMODEL_MEDIUM: ++ { ++ tree decl_local = SYMBOL_REF_DECL (x); ++ if (decl_local != NULL ++ && tree_fits_uhwi_p (DECL_SIZE_UNIT (decl_local))) ++ { ++ HOST_WIDE_INT size = tree_to_uhwi ( ++ DECL_SIZE_UNIT (decl_local)); ++ /* If the data is smaller than the threshold, goto ++ the small code model. Else goto the large code ++ model. 
*/ ++ if (size >= HOST_WIDE_INT (aarch64_data_threshold)) ++ goto AARCH64_LARGE_ROUTINE; ++ } ++ goto AARCH64_SMALL_ROUTINE; ++ } ++ ++ case AARCH64_CMODEL_MEDIUM_PIC: ++ { ++ tree decl_local = SYMBOL_REF_DECL (x); ++ if (decl_local != NULL ++ && tree_fits_uhwi_p (DECL_SIZE_UNIT (decl_local))) ++ { ++ HOST_WIDE_INT size = tree_to_uhwi ( ++ DECL_SIZE_UNIT (decl_local)); ++ if (size < HOST_WIDE_INT (aarch64_data_threshold)) ++ { ++ if (!aarch64_symbol_binds_local_p (x)) ++ { ++ return SYMBOL_SMALL_GOT_4G; ++ } ++ return SYMBOL_SMALL_ABSOLUTE; ++ } ++ } ++ if (!aarch64_symbol_binds_local_p (x)) ++ { ++ return SYMBOL_MEDIUM_GOT_4G; ++ } ++ return SYMBOL_MEDIUM_ABSOLUTE; ++ } + case AARCH64_CMODEL_LARGE: ++ AARCH64_LARGE_ROUTINE: + /* This is alright even in PIC code as the constant + pool reference is always PC relative and within + the same translation unit. */ +@@ -15364,6 +15509,8 @@ aarch64_asm_preferred_eh_data_format (in + case AARCH64_CMODEL_SMALL: + case AARCH64_CMODEL_SMALL_PIC: + case AARCH64_CMODEL_SMALL_SPIC: ++ case AARCH64_CMODEL_MEDIUM: ++ case AARCH64_CMODEL_MEDIUM_PIC: + /* text+got+data < 4Gb. 4-byte signed relocs are sufficient + for everything. */ + type = DW_EH_PE_sdata4; +@@ -18454,7 +18601,8 @@ aarch64_empty_mask_is_expensive (unsigne + bool + aarch64_use_pseudo_pic_reg (void) + { +- return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC; ++ return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC ++ || aarch64_cmodel == AARCH64_CMODEL_MEDIUM_PIC ; + } + + /* Implement TARGET_UNSPEC_MAY_TRAP_P. */ +@@ -18464,6 +18612,7 @@ aarch64_unspec_may_trap_p (const_rtx x, + { + switch (XINT (x, 1)) + { ++ case UNSPEC_GOTMEDIUMPIC4G: + case UNSPEC_GOTSMALLPIC: + case UNSPEC_GOTSMALLPIC28K: + case UNSPEC_GOTTINYPIC: +diff -Nurp a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +--- a/gcc/config/aarch64/aarch64.h 2020-07-16 14:54:30.592000000 +0800 ++++ b/gcc/config/aarch64/aarch64.h 2020-07-16 14:55:05.672000000 +0800 +@@ -33,6 +33,10 @@ + + #define REGISTER_TARGET_PRAGMAS() aarch64_register_pragmas () + ++/* Default threshold 64-bit relocation data ++ with aarch64 medium memory model. */ ++#define AARCH64_DEFAULT_LARGE_DATA_THRESHOLD 65536 ++ + /* Target machine storage layout. 
*/ + + #define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE) \ +diff -Nurp a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +--- a/gcc/config/aarch64/aarch64.md 2020-07-16 14:54:30.588000000 +0800 ++++ b/gcc/config/aarch64/aarch64.md 2020-07-16 14:55:05.676000000 +0800 +@@ -209,6 +209,11 @@ + UNSPEC_RSQRTS + UNSPEC_NZCV + UNSPEC_XPACLRI ++ UNSPEC_MOV_MEDIUM_SYMBOL ++ UNSPEC_GET_LAST_PC ++ UNSPEC_GOTMEDIUMPIC4G ++ UNSPEC_GET_GOTOFF ++ UNSPEC_LOAD_SYMBOL_MEDIUM + UNSPEC_LD1_SVE + UNSPEC_ST1_SVE + UNSPEC_LD1RQ +@@ -6548,6 +6553,39 @@ + [(set_attr "type" "load_4")] + ) + ++(define_insn "get_gotoff_" ++ [(set (match_operand:GPI 0 "register_operand" "=r") ++ (unspec:GPI [(match_operand 1 "aarch64_valid_symref" "S")] ++ UNSPEC_GET_GOTOFF))] ++ "" ++ "movz\\t%x0, :gotoff_g1:%A1\;movk\\t%x0, :gotoff_g0_nc:%A1" ++ [(set_attr "type" "multiple") ++ (set_attr "length" "8")] ++) ++ ++(define_insn "ldr_got_medium_" ++ [(set (match_operand:PTR 0 "register_operand" "=r") ++ (unspec:PTR [(mem:PTR (lo_sum:PTR ++ (match_operand:PTR 1 "register_operand" "r") ++ (match_operand:PTR 2 "register_operand" "r")))] ++ UNSPEC_GOTMEDIUMPIC4G))] ++ "" ++ "ldr\\t%0, [%1, %2]" ++ [(set_attr "type" "load_4")] ++) ++ ++(define_insn "ldr_got_medium_sidi" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (zero_extend:DI ++ (unspec:SI [(mem:SI (lo_sum:DI ++ (match_operand:DI 1 "register_operand" "r") ++ (match_operand:DI 2 "register_operand" "r")))] ++ UNSPEC_GOTMEDIUMPIC4G)))] ++ "TARGET_ILP32" ++ "ldr\\t%0, [%1, %2]" ++ [(set_attr "type" "load_4")] ++) ++ + (define_insn "ldr_got_small_28k_" + [(set (match_operand:PTR 0 "register_operand" "=r") + (unspec:PTR [(mem:PTR (lo_sum:PTR +@@ -6709,6 +6747,23 @@ + (set_attr "length" "12")] + ) + ++(define_insn "load_symbol_medium_" ++ [(set (match_operand:GPI 0 "register_operand" "=r") ++ (unspec:GPI [(match_operand 2 "aarch64_valid_symref" "S")] ++ UNSPEC_LOAD_SYMBOL_MEDIUM)) ++ (clobber (match_operand:GPI 1 "register_operand" "=r"))] ++ "" ++ "movz\\t%x0, :prel_g3:%A2\;\\ ++ movk\\t%x0, :prel_g2_nc:%A2\;\\ ++ movk\\t%x0, :prel_g1_nc:%A2\;\\ ++ movk\\t%x0, :prel_g0_nc:%A2\;\\ ++ adr\\t%x1, .\;\\ ++ sub\\t%x1, %x1, 0x4\;\\ ++ add\\t%x0, %x0, %x1" ++ [(set_attr "type" "multiple") ++ (set_attr "length" "28")] ++) ++ + (define_expand "tlsdesc_small_" + [(unspec:PTR [(match_operand 0 "aarch64_valid_symref")] UNSPEC_TLSDESC)] + "TARGET_TLS_DESC" +diff -Nurp a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt +--- a/gcc/config/aarch64/aarch64.opt 2020-07-16 14:54:30.580000000 +0800 ++++ b/gcc/config/aarch64/aarch64.opt 2020-07-16 14:55:05.676000000 +0800 +@@ -27,6 +27,10 @@ enum aarch64_processor explicit_tune_cor + TargetVariable + enum aarch64_arch explicit_arch = aarch64_no_arch + ++;; -mlarge-data-threshold= ++TargetSave ++int x_aarch64_data_threshold ++ + TargetSave + const char *x_aarch64_override_tune_string + +@@ -61,8 +65,15 @@ EnumValue + Enum(cmodel) String(small) Value(AARCH64_CMODEL_SMALL) + + EnumValue ++Enum(cmodel) String(medium) Value(AARCH64_CMODEL_MEDIUM) ++ ++EnumValue + Enum(cmodel) String(large) Value(AARCH64_CMODEL_LARGE) + ++mlarge-data-threshold= ++Target RejectNegative Joined UInteger Var(aarch64_data_threshold) Init(AARCH64_DEFAULT_LARGE_DATA_THRESHOLD) ++-mlarge-data-threshold= Data greater than given threshold will be assume that it should be relocated using 64-bit relocation. ++ + mbig-endian + Target Report RejectNegative Mask(BIG_END) + Assume target CPU is configured as big endian. 
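An illustrative sketch, not part of medium-code-mode.patch: with the -mcmodel=medium and -mlarge-data-threshold= options added above, symbols are classified by size -- data at or above the threshold (default 65536 bytes, AARCH64_DEFAULT_LARGE_DATA_THRESHOLD) is addressed through the new 64-bit relocation sequences, while smaller data keeps the small code model's addressing. The names and sizes below are hypothetical:

    static char small_buf[4096];     /* below the threshold: small-model addressing */
    static char big_buf[1 << 20];    /* above the threshold: 64-bit relocations     */

    char
    get (long i)
    {
      return small_buf[i & 4095] + big_buf[i];
    }

Built with something like: gcc -O2 -mcmodel=medium -mlarge-data-threshold=65536 file.c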
+diff -Nurp a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h +--- a/gcc/config/aarch64/aarch64-opts.h 2020-07-16 14:54:30.584000000 +0800 ++++ b/gcc/config/aarch64/aarch64-opts.h 2020-07-16 14:55:05.676000000 +0800 +@@ -66,6 +66,10 @@ enum aarch64_code_model { + /* -fpic for small memory model. + GOT size to 28KiB (4K*8-4K) or 3580 entries. */ + AARCH64_CMODEL_SMALL_SPIC, ++ /* Using movk insn sequence to do 64bit PC relative relocation. */ ++ AARCH64_CMODEL_MEDIUM, ++ /* Using movk insn sequence to do 64bit PC relative got relocation. */ ++ AARCH64_CMODEL_MEDIUM_PIC, + /* No assumptions about addresses of code and data. + The PIC variant is not yet implemented. */ + AARCH64_CMODEL_LARGE +diff -Nurp a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +--- a/gcc/config/aarch64/aarch64-protos.h 2020-07-16 14:54:30.584000000 +0800 ++++ b/gcc/config/aarch64/aarch64-protos.h 2020-07-16 14:55:05.676000000 +0800 +@@ -95,9 +95,11 @@ + */ + enum aarch64_symbol_type + { ++ SYMBOL_MEDIUM_ABSOLUTE, + SYMBOL_SMALL_ABSOLUTE, + SYMBOL_SMALL_GOT_28K, + SYMBOL_SMALL_GOT_4G, ++ SYMBOL_MEDIUM_GOT_4G, + SYMBOL_SMALL_TLSGD, + SYMBOL_SMALL_TLSDESC, + SYMBOL_SMALL_TLSIE, diff --git a/reduction-chain-slp-option.patch b/reduction-chain-slp-option.patch new file mode 100644 index 0000000..1b10c5c --- /dev/null +++ b/reduction-chain-slp-option.patch @@ -0,0 +1,52 @@ +diff -Nurp a/gcc/common.opt b/gcc/common.opt +--- a/gcc/common.opt 2020-06-20 23:53:56.124000000 +0800 ++++ b/gcc/common.opt 2020-06-22 23:02:18.808000000 +0800 +@@ -2858,6 +2858,10 @@ ftree-slp-vectorize + Common Report Var(flag_tree_slp_vectorize) Optimization EnabledBy(ftree-vectorize) + Enable basic block vectorization (SLP) on trees. + ++ftree-vect-analyze-slp-group ++Common Report Var(flag_tree_slp_group) Init(0) ++Disable SLP vectorization for reduction chain on tree. ++ + fvect-cost-model= + Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization + -fvect-cost-model=[unlimited|dynamic|cheap] Specifies the cost model for vectorization. +diff -Nurp a/gcc/testsuite/gcc.dg/vect/vect-reduc-12.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-12.c +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-12.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-12.c 2020-06-22 23:04:08.260000000 +0800 +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -funsafe-math-optimizations -fno-tree-reassoc -ftree-vect-analyze-slp-group" } */ ++void f(double *a, double *res, double m) { ++ double res1, res0; ++ res1 = 0; ++ res0 = 0; ++ for (int i = 0; i < 1000; i+=8) { ++ res0 += a[i] * m; ++ res1 += a[i+1] * m; ++ res0 += a[i+2] * m; ++ res1 += a[i+3] * m; ++ res0 += a[i+4] * m; ++ res1 += a[i+5] * m; ++ res0 += a[i+6] * m; ++ res1 += a[i+7] * m; ++ } ++ res[0] += res0; ++ res[1] += res1; ++} ++/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ +diff -Nurp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-06-21 01:07:56.516000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-06-22 23:02:54.540000000 +0800 +@@ -2327,8 +2327,9 @@ vect_analyze_slp (vec_info *vinfo, unsig + { + /* Find SLP sequences starting from reduction chains. */ + FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element) +- if (! vect_analyze_slp_instance (vinfo, bst_map, first_element, ++ if (flag_tree_slp_group ++ || ! 
vect_analyze_slp_instance (vinfo, bst_map, first_element, + max_tree_size)) + { + /* Dissolve reduction chain group. */ + stmt_vec_info vinfo = first_element; diff --git a/reductions-slp-enhancement.patch b/reductions-slp-enhancement.patch new file mode 100644 index 0000000..de426a3 --- /dev/null +++ b/reductions-slp-enhancement.patch @@ -0,0 +1,59 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-vect-slp.c-vect_analyze_slp-When-reduction-grou.patch +0214d31a48f867b9b00134cea7223d35ed7865aa + +diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-9.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-9.c +new file mode 100644 +index 00000000000..bee642ee999 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-9.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target vect_int_mult } */ ++ ++int ++bar (int *x, int a, int b, int n) ++{ ++ x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__); ++ int sum1 = 0; ++ int sum2 = 0; ++ for (int i = 0; i < n; ++i) ++ { ++ /* Reduction chain vectorization fails here because of the ++ different operations but we can still vectorize both ++ reductions as SLP reductions, saving IVs. */ ++ sum1 += x[2*i] - a; ++ sum1 += x[2*i+1] * b; ++ sum2 += x[2*i] - b; ++ sum2 += x[2*i+1] * a; ++ } ++ return sum1 + sum2; ++} ++ ++/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ ++/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ +diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +index e1061ede061..0af51197a84 100644 +--- a/gcc/tree-vect-slp.c ++++ b/gcc/tree-vect-slp.c +@@ -2271,14 +2271,18 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) + { + /* Dissolve reduction chain group. */ + stmt_vec_info vinfo = first_element; ++ stmt_vec_info last = NULL; + while (vinfo) + { + stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo); + REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL; + REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL; ++ last = vinfo; + vinfo = next; + } + STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def; ++ /* It can be still vectorized as part of an SLP reduction. */ ++ loop_vinfo->reductions.safe_push (last); + } + } diff --git a/remove-array-index-inliner-hint.patch b/remove-array-index-inliner-hint.patch index e0c09fb..416cd9a 100644 --- a/remove-array-index-inliner-hint.patch +++ b/remove-array-index-inliner-hint.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-ipa-fnsummary.c-ipa_dump_hints-Do-not-dump-array_ind.patch +a20f263ba1a76af40eb4e6734529739a2a30ed65 + diff -uprN a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi diff --git a/simplify-removing-subregs.patch b/simplify-removing-subregs.patch new file mode 100644 index 0000000..199ff7a --- /dev/null +++ b/simplify-removing-subregs.patch @@ -0,0 +1,117 @@ +diff -Nurp a/gcc/expr.c b/gcc/expr.c +--- a/gcc/expr.c 2020-08-05 20:33:04.068000000 +0800 ++++ b/gcc/expr.c 2020-08-05 20:33:21.420000000 +0800 +@@ -3770,6 +3770,78 @@ emit_move_insn (rtx x, rtx y) + gcc_assert (mode != BLKmode + && (GET_MODE (y) == mode || GET_MODE (y) == VOIDmode)); + ++ /* If we have a copy that looks like one of the following patterns: ++ (set (subreg:M1 (reg:M2 ...)) (subreg:M1 (reg:M2 ...))) ++ (set (subreg:M1 (reg:M2 ...)) (mem:M1 ADDR)) ++ (set (mem:M1 ADDR) (subreg:M1 (reg:M2 ...))) ++ (set (subreg:M1 (reg:M2 ...)) (constant C)) ++ where mode M1 is equal in size to M2, try to detect whether the ++ mode change involves an implicit round trip through memory. ++ If so, see if we can avoid that by removing the subregs and ++ doing the move in mode M2 instead. */ ++ ++ rtx x_inner = NULL_RTX; ++ rtx y_inner = NULL_RTX; ++ ++ #define CANDIDATE_SUBREG_P(subreg) \ ++ (REG_P (SUBREG_REG (subreg)) \ ++ && known_eq (GET_MODE_SIZE (GET_MODE (SUBREG_REG (subreg))), \ ++ GET_MODE_SIZE (GET_MODE (subreg))) \ ++ && optab_handler (mov_optab, GET_MODE (SUBREG_REG (subreg))) \ ++ != CODE_FOR_nothing) ++ ++ #define CANDIDATE_MEM_P(innermode, mem) \ ++ (!targetm.can_change_mode_class ((innermode), GET_MODE (mem), ALL_REGS) \ ++ && !push_operand ((mem), GET_MODE (mem)) \ ++ /* Not a candiate if innermode requires too much alignment. */ \ ++ && (MEM_ALIGN (mem) >= GET_MODE_ALIGNMENT (innermode) \ ++ || targetm.slow_unaligned_access (GET_MODE (mem), \ ++ MEM_ALIGN (mem)) \ ++ || !targetm.slow_unaligned_access ((innermode), \ ++ MEM_ALIGN (mem)))) ++ ++ if (SUBREG_P (x) && CANDIDATE_SUBREG_P (x)) ++ x_inner = SUBREG_REG (x); ++ ++ if (SUBREG_P (y) && CANDIDATE_SUBREG_P (y)) ++ y_inner = SUBREG_REG (y); ++ ++ if (x_inner != NULL_RTX ++ && y_inner != NULL_RTX ++ && GET_MODE (x_inner) == GET_MODE (y_inner) ++ && !targetm.can_change_mode_class (GET_MODE (x_inner), mode, ALL_REGS)) ++ { ++ x = x_inner; ++ y = y_inner; ++ mode = GET_MODE (x_inner); ++ } ++ else if (x_inner != NULL_RTX ++ && MEM_P (y) ++ && CANDIDATE_MEM_P (GET_MODE (x_inner), y)) ++ { ++ x = x_inner; ++ y = adjust_address (y, GET_MODE (x_inner), 0); ++ mode = GET_MODE (x_inner); ++ } ++ else if (y_inner != NULL_RTX ++ && MEM_P (x) ++ && CANDIDATE_MEM_P (GET_MODE (y_inner), x)) ++ { ++ x = adjust_address (x, GET_MODE (y_inner), 0); ++ y = y_inner; ++ mode = GET_MODE (y_inner); ++ } ++ else if (x_inner != NULL_RTX ++ && CONSTANT_P (y) ++ && !targetm.can_change_mode_class (GET_MODE (x_inner), ++ mode, ALL_REGS) ++ && (y_inner = simplify_subreg (GET_MODE (x_inner), y, mode, 0))) ++ { ++ x = x_inner; ++ y = y_inner; ++ mode = GET_MODE (x_inner); ++ } ++ + if (CONSTANT_P (y)) + { + if (optimize +diff -Nurp a/gcc/testsuite/gcc.target/aarch64/pr95254.c b/gcc/testsuite/gcc.target/aarch64/pr95254.c +--- a/gcc/testsuite/gcc.target/aarch64/pr95254.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.target/aarch64/pr95254.c 2020-08-05 20:33:21.424000000 +0800 +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-slp-vectorize -march=armv8.2-a+sve -msve-vector-bits=256" } */ ++ ++typedef short 
__attribute__((vector_size (8))) v4hi; ++ ++typedef union U4HI { v4hi v; short a[4]; } u4hi; ++ ++short b[4]; ++ ++void pass_v4hi (v4hi v) ++{ ++ int i; ++ u4hi u; ++ u.v = v; ++ for (i = 0; i < 4; i++) ++ b[i] = u.a[i]; ++}; ++ ++/* { dg-final { scan-assembler-not "ptrue" } } */ +diff -Nurp a/gcc/testsuite/gcc.target/i386/pr67609.c b/gcc/testsuite/gcc.target/i386/pr67609.c +--- a/gcc/testsuite/gcc.target/i386/pr67609.c 2020-08-05 20:33:04.628000000 +0800 ++++ b/gcc/testsuite/gcc.target/i386/pr67609.c 2020-08-05 20:33:21.424000000 +0800 +@@ -1,7 +1,7 @@ + /* { dg-do compile } */ + /* { dg-options "-O2 -msse2" } */ + /* { dg-require-effective-target lp64 } */ +-/* { dg-final { scan-assembler "movdqa" } } */ ++/* { dg-final { scan-assembler "movq\t%xmm0" } } */ + + #include + __m128d reg; diff --git a/skip-debug-insns-when-computing-inline-costs.patch b/skip-debug-insns-when-computing-inline-costs.patch index 6155590..2f09c27 100644 --- a/skip-debug-insns-when-computing-inline-costs.patch +++ b/skip-debug-insns-when-computing-inline-costs.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-PR91176-Skip-debug-insns-when-computing-inline-costs.patch +d3ed5b56646511a52db9992f4024969bfc9a13f9 + diff -uprN a/gcc/ipa-fnsummary.c b/gcc/ipa-fnsummary.c --- a/gcc/ipa-fnsummary.c +++ b/gcc/ipa-fnsummary.c diff --git a/tighten-range-for-generating-csel.patch b/tighten-range-for-generating-csel.patch new file mode 100644 index 0000000..8e628f8 --- /dev/null +++ b/tighten-range-for-generating-csel.patch @@ -0,0 +1,132 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-cselim-Don-t-assume-it-is-safe-to-cstore-replace-a-s.patch +cf39dccf9284d2fd9f9aa7050760adea110c8d88 + +diff -uprN a/gcc/testsuite/gcc.c-torture/execute/pr94734.c b/gcc/testsuite/gcc.c-torture/execute/pr94734.c +new file mode 100644 +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/execute/pr94734.c +@@ -0,0 +1,59 @@ ++/* PR tree-optimization/94734 */ ++ ++__attribute__((noipa)) int ++foo (int n) ++{ ++ int arr[16], s = 0; ++ for (int i = 0; i < n; i++) ++ { ++ if (i < 16) ++ arr[i] = i; ++ } ++ for (int i = 0; i < 16; i++) ++ s += arr[i]; ++ return s; ++} ++ ++__attribute__((noipa)) int ++bar (int n, int x, unsigned long y, unsigned long z) ++{ ++ int arr[16], s = 0; ++ arr[4] = 42; ++ for (int i = 0; i < n; i++) ++ { ++ if (x == (i & 0x25)) ++ arr[y] = i; ++ } ++ return arr[z]; ++} ++ ++__attribute__((noipa)) int ++baz (int n, int x, unsigned long z) ++{ ++ int arr[16], s = 0; ++ arr[12] = 42; ++ for (int i = 0; i < n; i++) ++ { ++ if (x == (i & 0x25)) ++ arr[7] = i; ++ } ++ return arr[z]; ++} ++ ++int ++main () ++{ ++ if (foo (10374) != 15 * 16 / 2) ++ __builtin_abort (); ++ if (bar (25, 0x25, (unsigned long) 0xdeadbeefbeefdeadULL, 4) != 42) ++ __builtin_abort (); ++ if (bar (25, 4, 15, 15) != 22) ++ __builtin_abort (); ++ if (baz (25, 0x25, 12) != 42) ++ __builtin_abort (); ++ if (baz (25, 4, 7) != 22) ++ __builtin_abort (); ++ if (baz (25, 4, 12) != 42) ++ __builtin_abort (); ++ return 0; ++} +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-1.c +@@ -9,4 +9,4 @@ unsigned test(unsigned k, unsigned b) { + return a[0]+a[1]; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" 
} } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-2.c +@@ -11,4 +11,4 @@ unsigned test(unsigned k, unsigned b) { + return a[0]+a[1]; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-5.c +@@ -13,4 +13,4 @@ int test(int b, int k) { + return a.data[0] + a.data[1]; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ +diff -uprN a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr89430-6.c +@@ -16,4 +16,4 @@ int test(int b, int k) { + return a.data[0].x + a.data[1].x; + } + +-/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" } } */ ++/* { dg-final { scan-tree-dump "Conditional store replacement" "cselim" { xfail *-*-* } } } */ +diff -uprN a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c +--- a/gcc/tree-ssa-phiopt.c ++++ b/gcc/tree-ssa-phiopt.c +@@ -45,6 +45,7 @@ along with GCC; see the file COPYING3. If not see + #include "tree-inline.h" + #include "params.h" + #include "case-cfn-macros.h" ++#include "tree-eh.h" + + static unsigned int tree_ssa_phiopt_worker (bool, bool, bool); + static bool two_value_replacement (basic_block, basic_block, edge, gphi *, +@@ -2237,10 +2238,13 @@ cond_store_replacement (basic_block middle_bb, basic_block join_bb, + whose value is not available readily, which we want to avoid. */ + if (!nontrap->contains (lhs)) + { +- /* If LHS is a local variable without address-taken, we could ++ /* If LHS is an access to a local variable without address-taken ++ (or when we allow data races) and known not to trap, we could + always safely move down the store. */ + tree base = get_base_address (lhs); +- if (!auto_var_p (base) || TREE_ADDRESSABLE (base)) ++ if (!auto_var_p (base) ++ || (TREE_ADDRESSABLE (base) && !flag_store_data_races) ++ || tree_could_trap_p (lhs)) + return false; + } diff --git a/vectorization-enhancement.patch b/vectorization-enhancement.patch new file mode 100644 index 0000000..3c7f0af --- /dev/null +++ b/vectorization-enhancement.patch @@ -0,0 +1,20239 @@ +This backport contains 128 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-Aarch64-SVE-Dot-product-support.patch +9feeafd7f95ea9f7211908c137c60074b3a52da2 + +0002-tree-vect-stmts.c-get_group_load_store_type-Avoid-pe.patch +419c5f99876d9ee517f6b646dd785cdcaf5cb6fe + +0003-re-PR-tree-optimization-90358-526.blender_r-train-ru.patch +898758504fa87d9f5e72c2c8b32139b413276a10 + +0004-tree-vect-slp.c-vect_build_slp_tree_2-Bump-size-when.patch +9f708a844853eb2fe87e696d27de14cbd68896f8 + +0005-cfgloop.h-struct-loop-Add-simdlen-member.patch +f63445e56c265757ebd50dc12fcd01773341b49f + +0006-Current-vectoriser-doesn-t-support-masked-loads-for-.patch +997636716c5dde7d59d026726a6f58918069f122 + +0007-tree-vrp.h-value_range_base-nonzero_p-New.patch +f2b00d2ba461d6dafdeccf6d93828b349b5e7f76 + +0008-AArch64-PR-tree-optimization-90332-Implement-vec_ini.patch +41dab855dce20d5d7042c9330dd8124d0ece19c0 + +0009-Fix-a-thinko-in-tree-ssa-loop.c.patch +cc261f66c268107b120add99942d729b3a489452 + +0010-re-PR-tree-optimization-90883-Generated-code-is-wors.patch +3fe0ddc88334f9afd622458653a6d103948994bd + +0011-re-PR-tree-optimization-90883-Generated-code-is-wors.patch +08c1638dab9becfafc65064891c1c59f5711c27f + +0012-Remove-quite-obvious-dead-assignments.patch +45309d286c80ecad8b7a4efba0e9aba35d847af6 + +0013-Fix-various-issues-seen-with-clang-static-analyzer.patch +ef874db611879d5004e1d834543e55d31f2bfe1c + +0014-re-PR-tree-optimization-91033-ICE-in-vect_analyze_lo.patch +a7b3509eb6aa51d696be5edba6f4e451ceff03a0 + +0015-re-PR-tree-optimization-91069-Miscompare-of-453.povr.patch +75da268e1a563a1a52389cd2ecee12d07c45a655 + +0016-tree-vrp.c-extract_range_from_multiplicative_op-Add-.patch +e2cfa983c31fa7886f496a47feb8714297ca0063 + +0017-re-PR-tree-optimization-91257-Compile-time-and-memor.patch +a55d6091230ae8d0d6f6c20dcc55158f6705090e + +0018-re-PR-tree-optimization-91257-Compile-time-and-memor.patch +ce52e0ffb4f1ea7bd4fb99aea5dda75d260e438f + +0019-Enforce-canonicalization-in-value_range.patch: +c7cf3a9bb00b6d64ba0c0e0761f000758e9428a6 + +0020-tree-vectorizer.h-get_initial_def_for_reduction-Remo.patch +5fdd6038147e4ba30c8c01332dae8ab0d717bc14 + +0021-tree-parloops.c-report_ploop_op-Copy-from-report_vec.patch +31de92e39bbeffb9f1641d292e94b48f70809ae1 + +0022-tree-vect-loop.c-vect_is_simple_reduction-Remove-ope.patch +901083b9bdf69a7b1382f9682c6fd1d5759667dd + +0023-Enforce-correct-COND_EXPR-order-for-EXTRACT_LAST_RED.patch +c449d3ae28ff4e133114fb67dbf7dcc7a95ca5d5 + +0024-tree-vect-loop.c-vect_is_slp_reduction-Remove.patch +b3c4d0dd309b7027f6e0f0b9a84829fcd53f7d64 + +0025-re-PR-tree-optimization-91822-FAIL-gcc.dg-pr88031.c-.patch +6e222b2a3aede20f3093802d1649e75848e3bd2b + +0026-re-PR-target-91269-unaligned-floating-point-register.patch +d63eadac7db10d4846bdffa93fd164cb035fb102 + +0027-tree-vect-loop.c-get_initial_def_for_reduction-Simpl.patch +d469a71e5a0eb512b522248841c56496abca8cd6 + +0028-tree-vectorizer.h-_stmt_vec_info-const_cond_reduc_co.patch +a7701dd16103048432ec8051e4773760c0e2cf90 + +0029-re-PR-tree-optimization-91896-ICE-in-vect_get_vec_de.patch +fadb01364d36a50836201bc9a6a03e525d267967 + +0030-tree-vect-loop.c-vect_analyze_loop_operations-Also-c.patch +9593e8e5e391e77bb065d4689b7511bed6a640a3 + +0031-tree-vect-loop.c-vect_analyze_loop_operations-Analyz.patch +1b4dbccc1f828fa00e6acc8b88d24301c65552df + +0032-Fix-reduc_index-1-handling-for-COND_REDUCTION-PR9190.patch +18908a56e18f15f84a91a4529923dd0878b2294f + +0033-tree-vectorizer.h-_stmt_vec_info-reduc_fn-New.patch +29f26978866f32bddd656847441a3a953ffd7a21 + +0034-gimple.c-gimple_get_lhs-For-PHIs-return-the-result.patch 
+61362d9d18916bd5b694385982cf4a02b7537b0e + +0035-tree-vect-loop.c-vectorizable_reduction-Move-variabl.patch +c7ea76ea5629e9f0357de49847274cf80e35f2f8 + +0036-tree-if-conv.c-tree_if_conversion-Move-call-to-ifcvt.patch +f30b3d2891cef9803badb3f85d739c0fcfafd585 + +0037-tree-vectorizer.h-stmt_vec_info_type-cycle_phi_info_.patch +291fa23ac04e317877c1e102937532f080180bb2 + +0038-re-PR-tree-optimization-91940-__builtin_bswap16-loop.patch +9ff9a0a5e6edd8729f559bf86ca06f781c4da246 + +0039-tree-vectorizer.h-vect_transform_reduction-Declare.patch +9f4d9a366b3299c276043ab987234c7bed7d29f2 + +0040-re-PR-target-91982-gcc.target-aarch64-sve-clastb_-.c.patch +48528394eafa9d1db9f956570f910c76d429a3e5 + +0041-re-PR-tree-optimization-91532-SVE-Redundant-predicat.patch +b238b34ea47222ffca7addc5fe4e8c052ade88b3 + +0042-tree-vectorizer.h-_stmt_vec_info-v_reduc_type-Remove.patch +69f8c1aef5cdcc54d5cb2ca4f99f4f26c2f822a9 + +0043-tree-vectorizer.h-_stmt_vec_info-reduc_vectype_in-Ne.patch +f78347996e02a8a767a525bfb764e769afe29d67 + +0044-tree-vect-loop.c-vect_is_simple_reduction-Simplify-a.patch +4a8841c0413d52261a8d024577381582d07a866a + +0045-re-PR-tree-optimization-92069-ice-in-vect_analyze_sc.patch +7bd8bec53f0e43c7a7852c54650746e65324514b + +0046-Deal-with-incoming-POLY_INT_CST-ranges-PR92033.patch +96eb7d7a642085f651e9940f0ee75568d7c4441d + +0047-tree-vect-loop.c-vect_valid_reduction_input_p-Remove.patch +aab8c2fd6542a52663243eec160b80bdd61516d5 + +0048-tree-vect-loop.c-needs_fold_left_reduction_p-Export.patch +aa9dffac731d0359a0e7a925ff8f4a1bef182eac + +0049-vect-Refactor-versioning-threshold.patch +a421fe9e610b5dbfce1913cd724c8ba193addd47 + +0050-vect-Outline-code-into-new-function-determine_peel_f.patch +31b35fd503e1c6713839db24044812d237aba5f1 + +0051-vect-Be-consistent-in-versioning-threshold-use.patch +f261d4808cc28a2dfd47fe06c97364c0869bb78f + +0052-tree-vect-loop.c-check_reduction_path-Compute-reduct.patch +58baf7ab85cbb1068a651c96f7d56e2902ead6cc + +0053-tree-vectorizer.h-_stmt_vec_info-cond_reduc_code-Rem.patch +c11cccc0285f02f117a1e80924fb7673b6486ce9 + +0054-re-PR-target-86753-gcc.target-aarch64-sve-vcond_-45-.patch +cc1facefe3b4e3b067d95291a7dba834b830ff18 + +0055-Avoid-recomputing-data-references-in-BB-SLP.patch +fa0c8df71d4f0476834db0b7cd88524878b46cf7 + +0056-Move-code-out-of-vect_slp_analyze_bb_1.patch +1d778697b37aec23db5b6003dfe08d2d78bd9424 + +0057-Avoid-setting-current_vector_size-in-get_vec_alignme.patch +da157e2ee9e12348df78246ee33b244b7cc334df + +0058-Pass-a-vec_info-to-vect_supportable_shift.patch +a5c3185a503fbdbc1bf05efe8ab9d12850a211c1 + +0059-Pass-a-vec_info-to-vect_supportable_direct_optab_p.patch +dcab2a0d1d4b2c0b4bba6f5e3834ec0678a2a5c8 + +0060-Pass-a-vec_info-to-get_mask_type_for_scalar_type.patch +1bd5196c9b1a0cd7280adadd6d788f81a82ca023 + +0061-Pass-a-vec_info-to-get_vectype_for_scalar_type.patch +7ed54790da87bbb4a134020a9fb8bd1b72fd0acb + +0062-Pass-a-vec_info-to-duplicate_and_interleave.patch +cdbe6e9bb4ae2882f77f94993783085fa342a9f9 + +0063-Pass-a-vec_info-to-can_duplicate_and_interleave_p.patch +43fdde5738ea0554fa000987e9769add027f4876 + +0064-Pass-a-vec_info-to-simple_integer_narrowing.patch +6c261c667801eee46a6221d3681d17493c0bbd65 + +0065-Pass-a-vec_info-to-supportable_narrowing_operation.patch +db8374a63fd0ea84f72ac76cc899be44df36df6a + +0066-Pass-a-loop_vec_info-to-vect_maybe_permute_loop_mask.patch +b0dab10e71b03441beefbbf951c0812056413cd3 + +0067-Pass-a-vec_info-to-vect_halve_mask_nunits.patch +830e90dab3dee5c8129c7760ff09ab112c2cd271 + 
+0068-Pass-a-vec_info-to-vect_double_mask_nunits.patch +8d1473958808fe4714ec24991ac83ee6cbf45397 + +0069-Replace-current_vector_size-with-vec_info-vector_siz.patch +ba7f76dd6bbf038948bbe516764a8bb0c851f750 + +0070-tree-vectorizer.h-_slp_tree-ops-New-member.patch +30c0d1e3cf8b03992e08cfd00ccf1fcb638d3c03 + +0071-re-PR-tree-optimization-92162-ICE-in-vect_create_epi.patch +53b15ca96116544a7a3ca8bc5f4e1649b74f3d45 + +0072-Fix-use-after-free-in-vector_size-change.patch +87121696fb2ddbec5f33daa359234850f7fd306d + +0073-re-PR-tree-optimization-92173-ICE-in-optab_for_tree_.patch +9107d6526b938eba8168025c0d90d06ad3634e69 + +0074-re-PR-tree-optimization-92173-ICE-in-optab_for_tree_.patch +6c7b0df8029d01e05577668333660d0bc58a3023 + +0075-AArch64-Don-t-apply-mode_for_int_vector-to-scalars.patch +d7814449f229cecdee48afe381519a61ea7e3378 + +0076-re-PR-tree-optimization-65930-Reduction-with-sign-ch.patch +82e8e335f917b9ce40801838c06f7945cf88da43 + +0077-re-PR-tree-optimization-92205-ICE-in-vect_get_vec_de.patch +e227594789d909fbad56f6036910938678738f92 + +0078-tree-vect-slp.c-vect_get_and_check_slp_defs-For-redu.patch +4352288a3df915575a2b820f702242908740106f + +0079-tree-vect-loop.c-vectorizable_reduction-Verify-STMT_.patch +ea133b14f48ed5730748a7e02e322fb07ccc2d85 + +0080-Fix-reductions-for-fully-masked-loops.patch +89d0345ad7b8d84045813972ee60557a6b511c57 + +0081-tree-vect-loop.c-vect_create_epilog_for_reduction-Us.patch +e0c4f7fbd6a4ee8e3a1468514044bd941fa28522 + +0082-re-PR-tree-optimization-92241-ice-in-vect_mark_patte.patch +97c6bea819ec0a773041308e62a7c05c33f093b0 + +0083-re-PR-tree-optimization-65930-Reduction-with-sign-ch.patch +b7ff7cef5005721e78d6936bed3ae1c059b4e8d2 + +0084-Fix-reduc_index-calculation-in-vectorizable_conditio.patch +1d149b7260bcc4c0c6367b3aea47a8b91a1cf345 + +0085-vect-PR-88915-Vectorize-epilogues-when-versioning-lo.patch +97c146036750e7cb3966d292572ec158a78f356e + +0086-re-PR-tree-optimization-65930-Reduction-with-sign-ch.patch +b4673569c2a8b974e3f84ffaa547941c5d40cfe5 + +0087-Come-up-with-an-abstraction.patch +7f4a8ee03d404c560dcb75ba684fd57ffbc77e85 + +0088-re-PR-tree-optimization-92275-ICE-error-definition-i.patch +b81f2dafdbd2c5aa49213b35dc12d4610834e39e + +0089-vect-Make-vect-epilogues-nomask-1-default.patch +1297712fb4af6c6bfd827e0f0a9695b14669f87d + +0090-vect-Clean-up-orig_loop_vinfo-from-vect_analyze_loop.patch +494d6c28c53d0852bb6468b1f1ca189159775fcc + +0091-re-PR-tree-optimization-92371-ICE-in-info_for_reduct.patch +02bf7e6fa219f939b3225c54fbe8bab2133b1aeb + +0092-vect-PR92317-fix-skip_epilogue-creation-for-epilogue.patch +2e7a4f579b1157754ea20a03431b4fa80cd4567a + +0093-Restructure-vect_analyze_loop.patch +72d6aeecd95ec49fff1d258e4631167a03351cbb + +0094-Check-the-VF-is-small-enough-for-an-epilogue-loop.patch +8ec5b16a9a3dbd6d825596c22f1bc32646de28fe + +0095-tree-vect-loop.c-vectorizable_reduction-Remember-red.patch +06af1f1a0def9de076ec629ea634122f15882ce6 + +0096-Don-t-vectorise-single-iteration-epilogues.patch +4b205bf82d06c4d9d0ae7b78e54c712d79d5b021 + +0097-re-PR-tree-optimization-92405-ICE-in-vect_get_vec_de.patch +084d390246c2172853f9e12ce04aef23cba79590 + +0098-re-PR-tree-optimization-92324-ICE-in-expand_direct_o.patch +f1e1ed3314b7c6308f64cbbcf6d1916e239c8e35 + +0099-vect-Disable-vectorization-of-epilogues-for-loops-wi.patch +b602712b3ea2a0729a2eda61bd9ee795aba6138f + +0100-Use-correct-vector-type-in-neutral_op_for_slp_reduct.patch +d308ca27c71e43625b378dc6c2774105867d4fa7 + +0101-vect-Account-for-epilogue-s-peeling-for-gaps-when-ch.patch 
+87b47251924c7539a9a8e191587d118a14496473 + +0102-Add-a-targetm.vectorize.related_mode-hook.patch +f09552335030433018fd5f7f6b9848339b5ca2da + +0103-Replace-mode_for_int_vector-with-related_int_vector_.patch +d083ee47a9828236016841356fc7207e7c90bbbd + +0104-Add-build_truth_vector_type_for_mode.patch +0a0ef2387cc1561d537d8d949aef9479ef17ba35 + +0105-Remove-build_-same_sized_-truth_vector_type.patch +e8738f4e9686203451fd11f05b268b8a31b95ebd + +0106-Pass-the-data-vector-mode-to-get_mask_mode.patch +10116ec1c147a76522cafba6b6a5b4ed1cb37b77 + +0107-Use-build_vector_type_for_mode-in-get_vectype_for_sc.patch +95da266b86fcdeff84fcadc5e3cde3d0027e571d + +0108-Use-consistent-compatibility-checks-in-vectorizable_.patch +0203c4f3bfb3e3242635b0cee0b9deedb4070a62 + +0109-Use-consistent-compatibility-checks-in-vectorizable_.patch +e021fb865564b62a10adb1e98f75b5ea05058047 + +0110-Replace-vec_info-vector_size-with-vec_info-vector_mo.patch +1c84a2d25ecd4c03dde745f36a4762dd45f97c85 + +0111-Make-less-use-of-get_same_sized_vectype.patch +2df4150075c03f8a292c40afd3bb25febb673578 + +0112-Require-equal-type-sizes-for-vectorised-calls.patch +7f52eb891b738337d5cf82c7c440a5eea8c7b0c9 + +0113-Support-vectorisation-with-mixed-vector-sizes.patch +df7c22831f1e48dba49479c5960c1c180d8eab2c + +0114-Avoid-retrying-with-the-same-vector-modes.patch +a55d8232df3dd4f7a3f5b70025074c3919b802a6 + +0115-AArch64-Support-vectorising-with-multiple-vector-siz.patch +74166aabeb7f22990476b1169bba031b8323ee92 + +0116-Allow-mixed-vector-sizes-within-a-single-vectorised-.patch +05101d1b575a57ca26e4275e971da85a0dd1d52a + +0117-Vectorise-conversions-between-differently-sized-inte.patch +9c437a108a14b9bdc44659c131b0da944e5ffeab + +0118-Consider-building-nodes-from-scalars-in-vect_slp_ana.patch +60838d634634a70d65a126166c944b159ac7649c + +0119-Optionally-pick-the-cheapest-loop_vec_info.patch +bcc7e346bf9b5dc77797ea949d6adc740deb30ca + +0120-Move-canonicalisation-of-dr_with_seg_len_pair_ts.patch +1fb2b0f69ee849142b669ba1b82264ce6d0f75f9 + +0121-Delay-swapping-data-refs-in-prune_runtime_alias_test.patch +97602450b04e94aff034381bf6ee4236b95727ed + +0122-Add-flags-to-dr_with_seg_len_pair_t.patch +e9acf80c96d681917d930869b7cbfb7d2fa54d51 + +0123-Record-whether-a-dr_with_seg_len-contains-mixed-step.patch +52c29905259363ce2b78dd7aa8a25cf531cddb3a + +0124-Dump-the-list-of-merged-alias-pairs.patch +cad984b289e2b3aca786314c673339eb0500fefa + +0125-Print-the-type-of-alias-check-in-a-dump-message.patch +b4d1b635737a4780e5be247f8be9550eaf83dae5 + +0126-Use-a-single-comparison-for-index-based-alias-checks.patch +f9d6338bd15ce1fae36bf25d3a0545e9678ddc58 + +0127-Optimise-WAR-and-WAW-alias-checks.patch +8489e1f45b50600c01eb8ed8c5d0ca914ded281c + +0128-Avoid-quadratic-behaviour-in-prune_runtime_alias_tes.patch +ea1ff9e46c7ec5e49ec671616cfcf405ef665054 + +diff --git a/gcc/asan.c b/gcc/asan.c +index 3b800b26b69..605d04f87f7 100644 +--- a/gcc/asan.c ++++ b/gcc/asan.c +@@ -1713,8 +1713,8 @@ asan_emit_allocas_unpoison (rtx top, rtx bot, rtx_insn *before) + rtx ret = init_one_libfunc ("__asan_allocas_unpoison"); + top = convert_memory_address (ptr_mode, top); + bot = convert_memory_address (ptr_mode, bot); +- ret = emit_library_call_value (ret, NULL_RTX, LCT_NORMAL, ptr_mode, +- top, ptr_mode, bot, ptr_mode); ++ emit_library_call (ret, LCT_NORMAL, ptr_mode, ++ top, ptr_mode, bot, ptr_mode); + + do_pending_stack_adjust (); + rtx_insn *insns = get_insns (); +diff --git a/gcc/bt-load.c b/gcc/bt-load.c +index a7d9d53954e..f68879ca49a 100644 +--- a/gcc/bt-load.c ++++ 
b/gcc/bt-load.c +@@ -1169,7 +1169,6 @@ move_btr_def (basic_block new_def_bb, int btr, btr_def *def, bitmap live_range, + + if (def->other_btr_uses_before_def) + { +- insp = BB_END (b); + for (insp = BB_END (b); ! INSN_P (insp); insp = PREV_INSN (insp)) + gcc_assert (insp != BB_HEAD (b)); + +diff --git a/gcc/builtins.c b/gcc/builtins.c +index ed11f79ff0b..910e614a4d1 100644 +--- a/gcc/builtins.c ++++ b/gcc/builtins.c +@@ -1653,11 +1653,8 @@ expand_builtin_apply_args_1 (void) + /* Save the structure value address unless this is passed as an + "invisible" first argument. */ + if (struct_incoming_value) +- { +- emit_move_insn (adjust_address (registers, Pmode, size), +- copy_to_reg (struct_incoming_value)); +- size += GET_MODE_SIZE (Pmode); +- } ++ emit_move_insn (adjust_address (registers, Pmode, size), ++ copy_to_reg (struct_incoming_value)); + + /* Return the address of the block. */ + return copy_addr_to_reg (XEXP (registers, 0)); +@@ -1806,7 +1803,6 @@ expand_builtin_apply (rtx function, rtx arguments, rtx argsize) + emit_move_insn (struct_value, value); + if (REG_P (struct_value)) + use_reg (&call_fusage, struct_value); +- size += GET_MODE_SIZE (Pmode); + } + + /* All arguments and registers used for the call are set up by now! */ +diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c +index c0582a54c93..cb999cbf82f 100644 +--- a/gcc/c/c-typeck.c ++++ b/gcc/c/c-typeck.c +@@ -5424,7 +5424,7 @@ build_conditional_expr (location_t colon_loc, tree ifexp, bool ifexp_bcp, + tree elem_type = TREE_TYPE (vectype); + tree zero = build_int_cst (elem_type, 0); + tree zero_vec = build_vector_from_val (vectype, zero); +- tree cmp_type = build_same_sized_truth_vector_type (vectype); ++ tree cmp_type = truth_type_for (vectype); + ifexp = build2 (NE_EXPR, cmp_type, ifexp, zero_vec); + } + +@@ -11327,7 +11327,7 @@ build_vec_cmp (tree_code code, tree type, + { + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); +- tree cmp_type = build_same_sized_truth_vector_type (type); ++ tree cmp_type = truth_type_for (type); + tree cmp = build2 (code, cmp_type, arg0, arg1); + return build3 (VEC_COND_EXPR, type, cmp, minus_one_vec, zero_vec); + } +diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c +index e252975f546..4ae8e3b3297 100644 +--- a/gcc/cfgexpand.c ++++ b/gcc/cfgexpand.c +@@ -3029,7 +3029,6 @@ expand_asm_stmt (gasm *stmt) + } + } + } +- unsigned nclobbers = clobber_rvec.length(); + + /* First pass over inputs and outputs checks validity and sets + mark_addressable if needed. */ +@@ -3301,7 +3300,7 @@ expand_asm_stmt (gasm *stmt) + gcc_assert (constraints.length() == noutputs + ninputs); + + /* But it certainly can adjust the clobbers. */ +- nclobbers = clobber_rvec.length(); ++ unsigned nclobbers = clobber_rvec.length (); + + /* Third pass checks for easy conflicts. */ + /* ??? Why are we doing this on trees instead of rtx. 
*/ +@@ -5979,11 +5978,11 @@ construct_init_block (void) + { + first_block = e->dest; + redirect_edge_succ (e, init_block); +- e = make_single_succ_edge (init_block, first_block, flags); ++ make_single_succ_edge (init_block, first_block, flags); + } + else +- e = make_single_succ_edge (init_block, EXIT_BLOCK_PTR_FOR_FN (cfun), +- EDGE_FALLTHRU); ++ make_single_succ_edge (init_block, EXIT_BLOCK_PTR_FOR_FN (cfun), ++ EDGE_FALLTHRU); + + update_bb_for_insn (init_block); + return init_block; +diff --git a/gcc/cfghooks.c b/gcc/cfghooks.c +index a1d603a207e..a18b6490bdd 100644 +--- a/gcc/cfghooks.c ++++ b/gcc/cfghooks.c +@@ -253,8 +253,6 @@ verify_flow_info (void) + err = 1; + } + +- last_bb_seen = ENTRY_BLOCK_PTR_FOR_FN (cfun); +- + /* Clean up. */ + free (last_visited); + free (edge_checksum); +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index b78d87d22f1..98bf6d2adda 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -174,6 +174,9 @@ struct GTY ((chain_next ("%h.next"))) loop { + of the loop can be safely evaluated concurrently. */ + int safelen; + ++ /* Preferred vectorization factor for the loop if non-zero. */ ++ int simdlen; ++ + /* Constraints are generally set by consumers and affect certain + semantics of niter analyzer APIs. Currently the APIs affected are + number_of_iterations_exit* functions and their callers. One typical +diff --git a/gcc/cfgloopmanip.c b/gcc/cfgloopmanip.c +index ea4b914c15b..8fc697ecf5d 100644 +--- a/gcc/cfgloopmanip.c ++++ b/gcc/cfgloopmanip.c +@@ -364,7 +364,6 @@ remove_path (edge e, bool *irred_invalidated, + + for (i = 0; i < nrem; i++) + { +- bb = rem_bbs[i]; + FOR_EACH_EDGE (ae, ei, rem_bbs[i]->succs) + if (ae->dest != EXIT_BLOCK_PTR_FOR_FN (cfun) + && !bitmap_bit_p (seen, ae->dest->index)) +@@ -1016,6 +1015,7 @@ copy_loop_info (struct loop *loop, struct loop *target) + target->nb_iterations_estimate = loop->nb_iterations_estimate; + target->estimate_state = loop->estimate_state; + target->safelen = loop->safelen; ++ target->simdlen = loop->simdlen; + target->constraints = loop->constraints; + target->can_be_parallel = loop->can_be_parallel; + target->warned_aggressive_loop_optimizations +diff --git a/gcc/cfgrtl.c b/gcc/cfgrtl.c +index 08e534f2485..b5f15907bde 100644 +--- a/gcc/cfgrtl.c ++++ b/gcc/cfgrtl.c +@@ -2958,7 +2958,6 @@ rtl_verify_bb_layout (void) + basic_block last_bb_seen = ENTRY_BLOCK_PTR_FOR_FN (cfun), curr_bb = NULL; + + num_bb_notes = 0; +- last_bb_seen = ENTRY_BLOCK_PTR_FOR_FN (cfun); + + for (x = rtx_first; x; x = NEXT_INSN (x)) + { +diff --git a/gcc/cgraph.c b/gcc/cgraph.c +index a16f4668b3c..bed6838d22b 100644 +--- a/gcc/cgraph.c ++++ b/gcc/cgraph.c +@@ -2717,8 +2717,6 @@ bool + cgraph_node::set_pure_flag (bool pure, bool looping) + { + struct set_pure_flag_info info = {pure, looping, false}; +- if (!pure) +- looping = false; + call_for_symbol_thunks_and_aliases (set_pure_flag_1, &info, !pure, true); + return info.changed; + } +diff --git a/gcc/combine.c b/gcc/combine.c +index 567aa2c3715..b9d674c96cc 100644 +--- a/gcc/combine.c ++++ b/gcc/combine.c +@@ -6591,7 +6591,6 @@ simplify_if_then_else (rtx x) + || reg_mentioned_p (true_rtx, false_rtx) + || rtx_equal_p (false_rtx, XEXP (cond, 0)))) + { +- true_code = reversed_comparison_code (cond, NULL); + SUBST (XEXP (x, 0), reversed_comparison (cond, GET_MODE (cond))); + SUBST (XEXP (x, 1), false_rtx); + SUBST (XEXP (x, 2), true_rtx); +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index e3852c5d182..28f93a70801 100644 +--- 
a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -3183,7 +3183,7 @@ + ;; In this insn, operand 1 should be low, and operand 2 the high part of the + ;; dest vector. + +-(define_insn "*aarch64_combinez" ++(define_insn "@aarch64_combinez" + [(set (match_operand: 0 "register_operand" "=w,w,w") + (vec_concat: + (match_operand:VDC 1 "general_operand" "w,?r,m") +@@ -3197,7 +3197,7 @@ + (set_attr "arch" "simd,fp,simd")] + ) + +-(define_insn "*aarch64_combinez_be" ++(define_insn "@aarch64_combinez_be" + [(set (match_operand: 0 "register_operand" "=w,w,w") + (vec_concat: + (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") +@@ -5926,6 +5926,15 @@ + DONE; + }) + ++(define_expand "vec_init" ++ [(match_operand:VQ_NO2E 0 "register_operand" "") ++ (match_operand 1 "" "")] ++ "TARGET_SIMD" ++{ ++ aarch64_expand_vector_init (operands[0], operands[1]); ++ DONE; ++}) ++ + (define_insn "*aarch64_simd_ld1r" + [(set (match_operand:VALL_F16 0 "register_operand" "=w") + (vec_duplicate:VALL_F16 +@@ -6937,3 +6946,21 @@ + "pmull2\\t%0.1q, %1.2d, %2.2d" + [(set_attr "type" "crypto_pmull")] + ) ++ ++;; Sign- or zero-extend a 64-bit integer vector to a 128-bit vector. ++(define_insn "2" ++ [(set (match_operand:VQN 0 "register_operand" "=w") ++ (ANY_EXTEND:VQN (match_operand: 1 "register_operand" "w")))] ++ "TARGET_SIMD" ++ "xtl\t%0., %1." ++ [(set_attr "type" "neon_shift_imm_long")] ++) ++ ++;; Truncate a 128-bit integer vector to a 64-bit vector. ++(define_insn "trunc2" ++ [(set (match_operand: 0 "register_operand" "=w") ++ (truncate: (match_operand:VQN 1 "register_operand" "w")))] ++ "TARGET_SIMD" ++ "xtn\t%0., %1." ++ [(set_attr "type" "neon_shift_imm_narrow_q")] ++) +diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md +index 3f39c4c5b63..02d33b7276f 100644 +--- a/gcc/config/aarch64/aarch64-sve.md ++++ b/gcc/config/aarch64/aarch64-sve.md +@@ -3132,3 +3132,19 @@ + DONE; + } + ) ++ ++;; Unpredicated DOT product. ++(define_insn "dot_prod" ++ [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") ++ (plus:SVE_SDI ++ (unspec:SVE_SDI ++ [(match_operand: 1 "register_operand" "w, w") ++ (match_operand: 2 "register_operand" "w, w")] ++ DOTPROD) ++ (match_operand:SVE_SDI 3 "register_operand" "0, w")))] ++ "TARGET_SVE" ++ "@ ++ dot\\t%0., %1., %2. ++ movprfx\t%0, %3\;dot\\t%0., %1., %2." ++ [(set_attr "movprfx" "*,yes")] ++) +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index 2ff0bc0a686..128c250dffe 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -1549,17 +1549,37 @@ aarch64_sve_pred_mode (unsigned int elem_nbytes) + /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ + + static opt_machine_mode +-aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes) ++aarch64_get_mask_mode (machine_mode mode) + { +- if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR)) ++ unsigned int vec_flags = aarch64_classify_vector_mode (mode); ++ if (vec_flags & VEC_SVE_DATA) ++ return aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)); ++ ++ return default_get_mask_mode (mode); ++} ++ ++/* Implement TARGET_VECTORIZE_RELATED_MODE. */ ++ ++static opt_machine_mode ++aarch64_vectorize_related_mode (machine_mode vector_mode, ++ scalar_mode element_mode, ++ poly_uint64 nunits) ++{ ++ unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode); ++ ++ /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. 
*/ ++ if ((vec_flags & VEC_ADVSIMD) ++ && known_eq (nunits, 0U) ++ && known_eq (GET_MODE_BITSIZE (vector_mode), 64U) ++ && maybe_ge (GET_MODE_BITSIZE (element_mode) ++ * GET_MODE_NUNITS (vector_mode), 128U)) + { +- unsigned int elem_nbytes = vector_element_size (nbytes, nunits); +- machine_mode pred_mode; +- if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode)) +- return pred_mode; ++ machine_mode res = aarch64_simd_container_mode (element_mode, 128); ++ if (VECTOR_MODE_P (res)) ++ return res; + } + +- return default_get_mask_mode (nunits, nbytes); ++ return default_vectorize_related_mode (vector_mode, element_mode, nunits); + } + + /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, +@@ -10897,7 +10917,9 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) + /* Caller assumes we cannot fail. */ + gcc_assert (use_rsqrt_p (mode)); + +- machine_mode mmsk = mode_for_int_vector (mode).require (); ++ machine_mode mmsk = (VECTOR_MODE_P (mode) ++ ? related_int_vector_mode (mode).require () ++ : int_mode_for_mode (mode).require ()); + rtx xmsk = gen_reg_rtx (mmsk); + if (!recp) + /* When calculating the approximate square root, compare the +@@ -14226,13 +14248,34 @@ aarch64_preferred_simd_mode (scalar_mode mode) + + /* Return a list of possible vector sizes for the vectorizer + to iterate over. */ +-static void +-aarch64_autovectorize_vector_sizes (vector_sizes *sizes) ++static unsigned int ++aarch64_autovectorize_vector_modes (vector_modes *modes, bool) + { + if (TARGET_SVE) +- sizes->safe_push (BYTES_PER_SVE_VECTOR); +- sizes->safe_push (16); +- sizes->safe_push (8); ++ modes->safe_push (VNx16QImode); ++ ++ /* Try using 128-bit vectors for all element types. */ ++ modes->safe_push (V16QImode); ++ ++ /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors ++ for wider elements. */ ++ modes->safe_push (V8QImode); ++ ++ /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors ++ for wider elements. ++ ++ TODO: We could support a limited form of V4QImode too, so that ++ we use 32-bit vectors for 8-bit elements. */ ++ modes->safe_push (V4HImode); ++ ++ /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors ++ for 64-bit elements. ++ ++ TODO: We could similarly support limited forms of V2QImode and V2HImode ++ for this case. */ ++ modes->safe_push (V2SImode); ++ ++ return 0; + } + + /* Implement TARGET_MANGLE_TYPE. */ +@@ -15191,6 +15234,45 @@ aarch64_expand_vector_init (rtx target, rtx vals) + rtx v0 = XVECEXP (vals, 0, 0); + bool all_same = true; + ++ /* This is a special vec_init where N is not an element mode but a ++ vector mode with half the elements of M. We expect to find two entries ++ of mode N in VALS and we must put their concatentation into TARGET. */ ++ if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0)))) ++ { ++ gcc_assert (known_eq (GET_MODE_SIZE (mode), ++ 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0))))); ++ rtx lo = XVECEXP (vals, 0, 0); ++ rtx hi = XVECEXP (vals, 0, 1); ++ machine_mode narrow_mode = GET_MODE (lo); ++ gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode); ++ gcc_assert (narrow_mode == GET_MODE (hi)); ++ ++ /* When we want to concatenate a half-width vector with zeroes we can ++ use the aarch64_combinez[_be] patterns. Just make sure that the ++ zeroes are in the right half. 
*/ ++ if (BYTES_BIG_ENDIAN ++ && aarch64_simd_imm_zero (lo, narrow_mode) ++ && general_operand (hi, narrow_mode)) ++ emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo)); ++ else if (!BYTES_BIG_ENDIAN ++ && aarch64_simd_imm_zero (hi, narrow_mode) ++ && general_operand (lo, narrow_mode)) ++ emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi)); ++ else ++ { ++ /* Else create the two half-width registers and combine them. */ ++ if (!REG_P (lo)) ++ lo = force_reg (GET_MODE (lo), lo); ++ if (!REG_P (hi)) ++ hi = force_reg (GET_MODE (hi), hi); ++ ++ if (BYTES_BIG_ENDIAN) ++ std::swap (lo, hi); ++ emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi)); ++ } ++ return; ++ } ++ + /* Count the number of variable elements to initialise. */ + for (int i = 0; i < n_elts; ++i) + { +@@ -16684,7 +16766,7 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) + if (d->testing_p) + return true; + +- machine_mode sel_mode = mode_for_int_vector (d->vmode).require (); ++ machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); + rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); + if (d->one_vector_p) + emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel)); +@@ -17064,9 +17146,7 @@ void + aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode, + rtx *ops) + { +- machine_mode pred_mode +- = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode), +- GET_MODE_SIZE (cmp_mode)).require (); ++ machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require (); + rtx pred = gen_reg_rtx (pred_mode); + if (FLOAT_MODE_P (cmp_mode)) + { +@@ -19363,9 +19443,9 @@ aarch64_libgcc_floating_mode_supported_p + #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ + aarch64_builtin_vectorized_function + +-#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +-#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ +- aarch64_autovectorize_vector_sizes ++#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES ++#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ ++ aarch64_autovectorize_vector_modes + + #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV + #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \ +@@ -19398,6 +19478,8 @@ aarch64_libgcc_floating_mode_supported_p + #define TARGET_VECTORIZE_VEC_PERM_CONST \ + aarch64_vectorize_vec_perm_const + ++#undef TARGET_VECTORIZE_RELATED_MODE ++#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode + #undef TARGET_VECTORIZE_GET_MASK_MODE + #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode + #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE +diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md +index 6caeeac8086..c7ccd5bf6fe 100644 +--- a/gcc/config/aarch64/iterators.md ++++ b/gcc/config/aarch64/iterators.md +@@ -663,6 +663,9 @@ + (QI "b") (HI "h") + (SI "s") (DI "d")]) + ++;; Like Vetype, but map to types that are a quarter of the element size. ++(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")]) ++ + ;; Equivalent of "size" for a vector element. + (define_mode_attr Vesize [(VNx16QI "b") + (VNx8HI "h") (VNx8HF "h") +@@ -765,6 +768,7 @@ + ;; Half modes of all vector modes, in lower-case. + (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi") + (V4HI "v2hi") (V8HI "v4hi") ++ (V8HF "v4hf") + (V2SI "si") (V4SI "v2si") + (V2DI "di") (V2SF "sf") + (V4SF "v2sf") (V2DF "df")]) +@@ -800,6 +804,8 @@ + (V2DI "V2SI") + (DI "SI") (SI "HI") + (HI "QI")]) ++(define_mode_attr Vnarrowq [(V8HI "v8qi") (V4SI "v4hi") ++ (V2DI "v2si")]) + + ;; Narrowed quad-modes for VQN (Used for XTN2). 
+ (define_mode_attr VNARROWQ2 [(V8HI "V16QI") (V4SI "V8HI") +@@ -1029,8 +1035,10 @@ + (V2SF "p") (V4SF "v") + (V4HF "v") (V8HF "v")]) + +-(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")]) +-(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")]) ++(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi") ++ (VNx4SI "vnx16qi") (VNx2DI "vnx8hi")]) ++(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI") ++ (VNx4SI "VNx16QI") (VNx2DI "VNx8HI")]) + + + ;; Register suffix for DOTPROD input types from the return type. +diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c +index f7ff95a0edf..325dd3cea9a 100644 +--- a/gcc/config/arc/arc.c ++++ b/gcc/config/arc/arc.c +@@ -477,16 +477,17 @@ arc_preferred_simd_mode (scalar_mode mode) + } + + /* Implements target hook +- TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES. */ ++ TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */ + +-static void +-arc_autovectorize_vector_sizes (vector_sizes *sizes) ++static unsigned int ++arc_autovectorize_vector_modes (vector_modes *modes, bool) + { + if (TARGET_PLUS_QMACW) + { +- sizes->quick_push (8); +- sizes->quick_push (4); ++ modes->quick_push (V4HImode); ++ modes->quick_push (V2HImode); + } ++ return 0; + } + + +@@ -596,8 +597,8 @@ static rtx arc_legitimize_address_0 (rtx, rtx, machine_mode mode); + #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE + #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arc_preferred_simd_mode + +-#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +-#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES arc_autovectorize_vector_sizes ++#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES ++#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES arc_autovectorize_vector_modes + + #undef TARGET_CAN_USE_DOLOOP_P + #define TARGET_CAN_USE_DOLOOP_P arc_can_use_doloop_p +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index cdfc0f9e72f..1a4a4b7bc58 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_misalignment (machine_mode mode, + static void arm_conditional_register_usage (void); + static enum flt_eval_method arm_excess_precision (enum excess_precision_type); + static reg_class_t arm_preferred_rename_class (reg_class_t rclass); +-static void arm_autovectorize_vector_sizes (vector_sizes *); ++static unsigned int arm_autovectorize_vector_modes (vector_modes *, bool); + static int arm_default_branch_cost (bool, bool); + static int arm_cortex_a5_branch_cost (bool, bool); + static int arm_cortex_m_branch_cost (bool, bool); +@@ -519,9 +519,9 @@ static const struct attribute_spec arm_attribute_table[] = + #define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p + #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE + #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode +-#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +-#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ +- arm_autovectorize_vector_sizes ++#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES ++#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ ++ arm_autovectorize_vector_modes + + #undef TARGET_MACHINE_DEPENDENT_REORG + #define TARGET_MACHINE_DEPENDENT_REORG arm_reorg +@@ -28446,14 +28446,15 @@ arm_vector_alignment (const_tree type) + return align; + } + +-static void +-arm_autovectorize_vector_sizes (vector_sizes *sizes) ++static unsigned int ++arm_autovectorize_vector_modes (vector_modes *modes, bool) + { + if (!TARGET_NEON_VECTORIZE_DOUBLE) + { +- sizes->safe_push (16); +- sizes->safe_push (8); ++ modes->safe_push (V16QImode); ++ 
modes->safe_push (V8QImode); + } ++ return 0; + } + + static bool +diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c +index 99fa45edcd4..eb06ff9e05b 100644 +--- a/gcc/config/gcn/gcn.c ++++ b/gcc/config/gcn/gcn.c +@@ -3800,8 +3800,7 @@ gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + a vector. */ + + opt_machine_mode +-gcn_vectorize_get_mask_mode (poly_uint64 ARG_UNUSED (nunits), +- poly_uint64 ARG_UNUSED (length)) ++gcn_vectorize_get_mask_mode (machine_mode) + { + /* GCN uses a DImode bit-mask. */ + return DImode; +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 1bca5a7eea6..5a0f8a0eb72 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -9647,7 +9647,6 @@ ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS next_cum; + tree fntype; +- int max; + + gcc_assert (!no_rtl); + +@@ -9663,10 +9662,6 @@ ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, + if (stdarg_p (fntype)) + ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, + true); +- +- max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; +- if (max > X86_64_REGPARM_MAX) +- max = X86_64_REGPARM_MAX; + } + + +@@ -11806,7 +11801,6 @@ choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, + { + base_reg = hard_frame_pointer_rtx; + base_offset = toffset; +- len = tlen; + } + } + } +@@ -39699,12 +39693,10 @@ ix86_preferred_reload_class (rtx x, reg_class_t regclass) + static reg_class_t + ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) + { +- machine_mode mode = GET_MODE (x); +- + /* Restrict the output reload class to the register bank that we are doing + math on. If we would like not to return a subset of CLASS, reject this + alternative: if reload cannot do this, it will still use its choice. */ +- mode = GET_MODE (x); ++ machine_mode mode = GET_MODE (x); + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; + +@@ -45666,14 +45658,13 @@ ix86_expand_rounddf_32 (rtx operand0, rtx operand1) + 0, OPTAB_DIRECT); + + /* Compensate. */ +- tmp = gen_reg_rtx (mode); + /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); + xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); + xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + + /* res = copysign (xa2, operand1) */ +@@ -50238,27 +50229,42 @@ ix86_split_reduction (machine_mode mode) + vectors. If AVX512F is enabled then try vectorizing with 512bit, + 256bit and 128bit vectors. 
*/ + +-static void +-ix86_autovectorize_vector_sizes (vector_sizes *sizes) ++static unsigned int ++ix86_autovectorize_vector_modes (vector_modes *modes, bool all) + { + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) + { +- sizes->safe_push (64); +- sizes->safe_push (32); +- sizes->safe_push (16); ++ modes->safe_push (V64QImode); ++ modes->safe_push (V32QImode); ++ modes->safe_push (V16QImode); ++ } ++ else if (TARGET_AVX512F && all) ++ { ++ modes->safe_push (V32QImode); ++ modes->safe_push (V16QImode); ++ modes->safe_push (V64QImode); + } + else if (TARGET_AVX && !TARGET_PREFER_AVX128) + { +- sizes->safe_push (32); +- sizes->safe_push (16); ++ modes->safe_push (V32QImode); ++ modes->safe_push (V16QImode); ++ } ++ else if (TARGET_AVX && all) ++ { ++ modes->safe_push (V16QImode); ++ modes->safe_push (V32QImode); + } ++ ++ return 0; + } + + /* Implemenation of targetm.vectorize.get_mask_mode. */ + + static opt_machine_mode +-ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size) ++ix86_get_mask_mode (machine_mode data_mode) + { ++ unsigned vector_size = GET_MODE_SIZE (data_mode); ++ unsigned nunits = GET_MODE_NUNITS (data_mode); + unsigned elem_size = vector_size / nunits; + + /* Scalar mask case. */ +@@ -51849,9 +51855,9 @@ ix86_run_selftests (void) + #undef TARGET_VECTORIZE_SPLIT_REDUCTION + #define TARGET_VECTORIZE_SPLIT_REDUCTION \ + ix86_split_reduction +-#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +-#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ +- ix86_autovectorize_vector_sizes ++#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES ++#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ ++ ix86_autovectorize_vector_modes + #undef TARGET_VECTORIZE_GET_MASK_MODE + #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode + #undef TARGET_VECTORIZE_INIT_COST +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 18cc39ae521..8c961f12a42 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -16441,10 +16441,9 @@ + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "register_operand" "0,0,x") + (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") +- (subreg:VF_128_256 +- (lt: +- (match_operand: 3 "register_operand" "Yz,Yz,x") +- (match_operand: 4 "const0_operand" "C,C,C")) 0)] ++ (lt:VF_128_256 ++ (match_operand: 3 "register_operand" "Yz,Yz,x") ++ (match_operand: 4 "const0_operand" "C,C,C"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "#" +diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c +index d758fbf1be6..1008947209e 100644 +--- a/gcc/config/mips/mips.c ++++ b/gcc/config/mips/mips.c +@@ -13457,13 +13457,14 @@ mips_preferred_simd_mode (scalar_mode mode) + return word_mode; + } + +-/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES. */ ++/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */ + +-static void +-mips_autovectorize_vector_sizes (vector_sizes *sizes) ++static unsigned int ++mips_autovectorize_vector_modes (vector_modes *modes, bool) + { + if (ISA_HAS_MSA) +- sizes->safe_push (16); ++ modes->safe_push (V16QImode); ++ return 0; + } + + /* Implement TARGET_INIT_LIBFUNCS. 
*/ +@@ -22676,9 +22677,9 @@ mips_starting_frame_offset (void) + + #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE + #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE mips_preferred_simd_mode +-#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +-#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ +- mips_autovectorize_vector_sizes ++#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES ++#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ ++ mips_autovectorize_vector_modes + + #undef TARGET_INIT_BUILTINS + #define TARGET_INIT_BUILTINS mips_init_builtins +diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c +index 87d60078bb0..8f046de424c 100644 +--- a/gcc/config/rs6000/rs6000.c ++++ b/gcc/config/rs6000/rs6000.c +@@ -15457,7 +15457,7 @@ static tree + fold_build_vec_cmp (tree_code code, tree type, + tree arg0, tree arg1) + { +- tree cmp_type = build_same_sized_truth_vector_type (type); ++ tree cmp_type = truth_type_for (type); + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); + tree cmp = fold_build2 (code, cmp_type, arg0, arg1); +diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c +index db3f94978ec..c35666dec83 100644 +--- a/gcc/config/s390/s390.c ++++ b/gcc/config/s390/s390.c +@@ -6588,7 +6588,7 @@ s390_expand_vec_compare_cc (rtx target, enum rtx_code code, + case LE: cc_producer_mode = CCVFHEmode; code = GE; swap_p = true; break; + default: gcc_unreachable (); + } +- scratch_mode = mode_for_int_vector (GET_MODE (cmp1)).require (); ++ scratch_mode = related_int_vector_mode (GET_MODE (cmp1)).require (); + + if (inv_p) + all_p = !all_p; +@@ -6694,7 +6694,7 @@ s390_expand_vcond (rtx target, rtx then, rtx els, + + /* We always use an integral type vector to hold the comparison + result. */ +- result_mode = mode_for_int_vector (cmp_mode).require (); ++ result_mode = related_int_vector_mode (cmp_mode).require (); + result_target = gen_reg_rtx (result_mode); + + /* We allow vector immediates as comparison operands that +diff --git a/gcc/cp/call.c b/gcc/cp/call.c +index f365a5a7f7b..23a54f3c332 100644 +--- a/gcc/cp/call.c ++++ b/gcc/cp/call.c +@@ -5161,7 +5161,7 @@ build_conditional_expr_1 (const op_location_t &loc, + + if (!COMPARISON_CLASS_P (arg1)) + { +- tree cmp_type = build_same_sized_truth_vector_type (arg1_type); ++ tree cmp_type = truth_type_for (arg1_type); + arg1 = build2 (NE_EXPR, cmp_type, arg1, build_zero_cst (arg1_type)); + } + return build3_loc (loc, VEC_COND_EXPR, arg2_type, arg1, arg2, arg3); +diff --git a/gcc/cp/class.c b/gcc/cp/class.c +index 6b57184e081..5b0a60d61cc 100644 +--- a/gcc/cp/class.c ++++ b/gcc/cp/class.c +@@ -4760,8 +4760,6 @@ adjust_clone_args (tree decl) + tree orig_decl_parms = TYPE_ARG_TYPES (TREE_TYPE (decl)); + tree decl_parms, clone_parms; + +- clone_parms = orig_clone_parms; +- + /* Skip the 'this' parameter. 
*/ + orig_clone_parms = TREE_CHAIN (orig_clone_parms); + orig_decl_parms = TREE_CHAIN (orig_decl_parms); +@@ -8581,7 +8579,6 @@ dump_class_hierarchy_r (FILE *stream, + tree base_binfo; + int i; + +- indented = maybe_indent_hierarchy (stream, indent, 0); + fprintf (stream, "%s (0x" HOST_WIDE_INT_PRINT_HEX ") ", + type_as_string (BINFO_TYPE (binfo), TFF_PLAIN_IDENTIFIER), + (HOST_WIDE_INT) (uintptr_t) binfo); +@@ -8602,7 +8599,6 @@ dump_class_hierarchy_r (FILE *stream, + fprintf (stream, " virtual"); + fprintf (stream, "\n"); + +- indented = 0; + if (BINFO_PRIMARY_P (binfo)) + { + indented = maybe_indent_hierarchy (stream, indent + 3, indented); +diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c +index 39d55589ef3..5c82c2272c2 100644 +--- a/gcc/cp/decl.c ++++ b/gcc/cp/decl.c +@@ -6387,7 +6387,7 @@ build_aggr_init_full_exprs (tree decl, tree init, int flags) + static tree + check_initializer (tree decl, tree init, int flags, vec **cleanups) + { +- tree type = TREE_TYPE (decl); ++ tree type; + tree init_code = NULL; + tree core_type; + +diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c +index e1c02d7b718..60fe58e0313 100644 +--- a/gcc/cp/parser.c ++++ b/gcc/cp/parser.c +@@ -10485,7 +10485,7 @@ cp_parser_lambda_expression (cp_parser* parser) + if (ok) + maybe_add_lambda_conv_op (type); + +- type = finish_struct (type, /*attributes=*/NULL_TREE); ++ finish_struct (type, /*attributes=*/NULL_TREE); + + in_discarded_stmt = discarded; + +diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c +index 4787747b6ff..ff7921533cb 100644 +--- a/gcc/cp/pt.c ++++ b/gcc/cp/pt.c +@@ -7459,8 +7459,7 @@ unify_bound_ttp_args (tree tparms, tree targs, tree parm, tree& arg, + { + /* In keeping with P0522R0, adjust P's template arguments + to apply to A's template; then flatten it again. */ +- tree nparmvec = parmvec; +- nparmvec = coerce_ttp_args_for_tta (arg, parmvec, tf_none); ++ tree nparmvec = coerce_ttp_args_for_tta (arg, parmvec, tf_none); + nparmvec = expand_template_argument_pack (nparmvec); + + if (unify (tparms, targs, nparmvec, argvec, +@@ -7887,7 +7886,6 @@ convert_template_argument (tree parm, + invalid, but static members are OK. In any + case, grab the underlying fields/functions + and issue an error later if required. 
*/ +- orig_arg = TREE_VALUE (arg); + TREE_TYPE (arg) = unknown_type_node; + } + +diff --git a/gcc/cp/rtti.c b/gcc/cp/rtti.c +index 3ca2b5e7b88..9aea6b939ec 100644 +--- a/gcc/cp/rtti.c ++++ b/gcc/cp/rtti.c +@@ -209,8 +209,8 @@ build_headof (tree exp) + offset = build_vtbl_ref (cp_build_fold_indirect_ref (exp), + index); + +- type = cp_build_qualified_type (ptr_type_node, +- cp_type_quals (TREE_TYPE (exp))); ++ cp_build_qualified_type (ptr_type_node, ++ cp_type_quals (TREE_TYPE (exp))); + return fold_build_pointer_plus (exp, offset); + } + +diff --git a/gcc/cp/typeck.c b/gcc/cp/typeck.c +index 2169f8c4efd..c42fd731cd2 100644 +--- a/gcc/cp/typeck.c ++++ b/gcc/cp/typeck.c +@@ -4293,7 +4293,7 @@ build_vec_cmp (tree_code code, tree type, + { + tree zero_vec = build_zero_cst (type); + tree minus_one_vec = build_minus_one_cst (type); +- tree cmp_type = build_same_sized_truth_vector_type(type); ++ tree cmp_type = truth_type_for (type); + tree cmp = build2 (code, cmp_type, arg0, arg1); + return build3 (VEC_COND_EXPR, type, cmp, minus_one_vec, zero_vec); + } +@@ -9189,8 +9189,6 @@ convert_for_initialization (tree exp, tree type, tree rhs, int flags, + if (exp == error_mark_node) + return error_mark_node; + +- rhstype = non_reference (rhstype); +- + type = complete_type (type); + + if (DIRECT_INIT_EXPR_P (type, rhs)) +diff --git a/gcc/cselib.c b/gcc/cselib.c +index 84c17c23f6d..108b2588cf9 100644 +--- a/gcc/cselib.c ++++ b/gcc/cselib.c +@@ -2518,13 +2518,12 @@ cselib_record_sets (rtx_insn *insn) + int n_sets = 0; + int i; + struct cselib_set sets[MAX_SETS]; +- rtx body = PATTERN (insn); + rtx cond = 0; + int n_sets_before_autoinc; + int n_strict_low_parts = 0; + struct cselib_record_autoinc_data data; + +- body = PATTERN (insn); ++ rtx body = PATTERN (insn); + if (GET_CODE (body) == COND_EXEC) + { + cond = COND_EXEC_TEST (body); +diff --git a/gcc/d/d-codegen.cc b/gcc/d/d-codegen.cc +index 2abff92fc88..6f5499b08ee 100644 +--- a/gcc/d/d-codegen.cc ++++ b/gcc/d/d-codegen.cc +@@ -1397,7 +1397,7 @@ build_boolop (tree_code code, tree arg0, tree arg1) + /* Build a vector comparison. + VEC_COND_EXPR ; */ + tree type = TREE_TYPE (arg0); +- tree cmptype = build_same_sized_truth_vector_type (type); ++ tree cmptype = truth_type_for (type); + tree cmp = fold_build2_loc (input_location, code, cmptype, arg0, arg1); + + return fold_build3_loc (input_location, VEC_COND_EXPR, type, cmp, +diff --git a/gcc/df-scan.c b/gcc/df-scan.c +index 08d7af33371..84c2e54c855 100644 +--- a/gcc/df-scan.c ++++ b/gcc/df-scan.c +@@ -229,7 +229,6 @@ void + df_scan_alloc (bitmap all_blocks ATTRIBUTE_UNUSED) + { + struct df_scan_problem_data *problem_data; +- unsigned int insn_num = get_max_uid () + 1; + basic_block bb; + + /* Given the number of pools, this is really faster than tearing +@@ -257,7 +256,6 @@ df_scan_alloc (bitmap all_blocks ATTRIBUTE_UNUSED) + bitmap_obstack_initialize (&problem_data->reg_bitmaps); + bitmap_obstack_initialize (&problem_data->insn_bitmaps); + +- insn_num += insn_num / 4; + df_grow_reg_info (); + + df_grow_insn_info (); +diff --git a/gcc/doc/poly-int.texi b/gcc/doc/poly-int.texi +index 1023e823cb3..d60bb02aabf 100644 +--- a/gcc/doc/poly-int.texi ++++ b/gcc/doc/poly-int.texi +@@ -803,6 +803,18 @@ the assertion is known to hold. + @item constant_lower_bound (@var{a}) + Assert that @var{a} is nonnegative and return the smallest value it can have. 
+ ++@item constant_lower_bound_with_limit (@var{a}, @var{b}) ++Return the least value @var{a} can have, given that the context in ++which @var{a} appears guarantees that the answer is no less than @var{b}. ++In other words, the caller is asserting that @var{a} is greater than or ++equal to @var{b} even if @samp{known_ge (@var{a}, @var{b})} doesn't hold. ++ ++@item constant_upper_bound_with_limit (@var{a}, @var{b}) ++Return the greatest value @var{a} can have, given that the context in ++which @var{a} appears guarantees that the answer is no greater than @var{b}. ++In other words, the caller is asserting that @var{a} is less than or equal ++to @var{b} even if @samp{known_le (@var{a}, @var{b})} doesn't hold. ++ + @item lower_bound (@var{a}, @var{b}) + Return a value that is always less than or equal to both @var{a} and @var{b}. + It will be the greatest such value for some indeterminate values +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 8c8978bb13a..73db70867b4 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -6016,27 +6016,71 @@ against lower halves of vectors recursively until the specified mode is + reached. The default is @var{mode} which means no splitting. + @end deftypefn + +-@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}) +-If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not +-the only one that is worth considering, this hook should add all suitable +-vector sizes to @var{sizes}, in order of decreasing preference. The first +-one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}. ++@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all}) ++If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} ++is not the only approach worth considering, this hook should add one mode to ++@var{modes} for each useful alternative approach. These modes are then ++passed to @code{TARGET_VECTORIZE_RELATED_MODE} to obtain the vector mode ++for a given element mode. ++ ++The modes returned in @var{modes} should use the smallest element mode ++possible for the vectorization approach that they represent, preferring ++integer modes over floating-poing modes in the event of a tie. The first ++mode should be the @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} for its ++element mode. ++ ++If @var{all} is true, add suitable vector modes even when they are generally ++not expected to be worthwhile. ++ ++The hook returns a bitmask of flags that control how the modes in ++@var{modes} are used. The flags are: ++@table @code ++@item VECT_COMPARE_COSTS ++Tells the loop vectorizer to try all the provided modes and pick the one ++with the lowest cost. By default the vectorizer will choose the first ++mode that works. ++@end table + + The hook does not need to do anything if the vector returned by + @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant +-for autovectorization. The default implementation does nothing. +-@end deftypefn +- +-@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_GET_MASK_MODE (poly_uint64 @var{nunits}, poly_uint64 @var{length}) +-A vector mask is a value that holds one boolean result for every element +-in a vector. This hook returns the machine mode that should be used to +-represent such a mask when the vector in question is @var{length} bytes +-long and contains @var{nunits} elements. The hook returns an empty +-@code{opt_machine_mode} if no such mode exists. 
+- +-The default implementation returns the mode of an integer vector that +-is @var{length} bytes long and that contains @var{nunits} elements, +-if such a mode exists. ++for autovectorization. The default implementation adds no modes and ++returns 0. ++@end deftypefn ++ ++@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits}) ++If a piece of code is using vector mode @var{vector_mode} and also wants ++to operate on elements of mode @var{element_mode}, return the vector mode ++it should use for those elements. If @var{nunits} is nonzero, ensure that ++the mode has exactly @var{nunits} elements, otherwise pick whichever vector ++size pairs the most naturally with @var{vector_mode}. Return an empty ++@code{opt_machine_mode} if there is no supported vector mode with the ++required properties. ++ ++There is no prescribed way of handling the case in which @var{nunits} ++is zero. One common choice is to pick a vector mode with the same size ++as @var{vector_mode}; this is the natural choice if the target has a ++fixed vector size. Another option is to choose a vector mode with the ++same number of elements as @var{vector_mode}; this is the natural choice ++if the target has a fixed number of elements. Alternatively, the hook ++might choose a middle ground, such as trying to keep the number of ++elements as similar as possible while applying maximum and minimum ++vector sizes. ++ ++The default implementation uses @code{mode_for_vector} to find the ++requested mode, returning a mode with the same size as @var{vector_mode} ++when @var{nunits} is zero. This is the correct behavior for most targets. ++@end deftypefn ++ ++@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_GET_MASK_MODE (machine_mode @var{mode}) ++Return the mode to use for a vector mask that holds one boolean ++result for each element of vector mode @var{mode}. The returned mask mode ++can be a vector of integers (class @code{MODE_VECTOR_INT}), a vector of ++booleans (class @code{MODE_VECTOR_BOOL}) or a scalar integer (class ++@code{MODE_INT}). Return an empty @code{opt_machine_mode} if no such ++mask mode exists. ++ ++The default implementation returns a @code{MODE_VECTOR_INT} with the ++same size and number of elements as @var{mode}, if such a mode exists. + @end deftypefn + + @deftypefn {Target Hook} bool TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE (unsigned @var{ifn}) +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index fe1194ef91a..bc362dca0f5 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -4172,7 +4172,9 @@ address; but often a machine-dependent strategy can generate better code. 
+ + @hook TARGET_VECTORIZE_SPLIT_REDUCTION + +-@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES ++@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES ++ ++@hook TARGET_VECTORIZE_RELATED_MODE + + @hook TARGET_VECTORIZE_GET_MASK_MODE + +diff --git a/gcc/dojump.c b/gcc/dojump.c +index 8626689463e..bac37a357a9 100644 +--- a/gcc/dojump.c ++++ b/gcc/dojump.c +@@ -668,8 +668,6 @@ do_jump_by_parts_greater_rtx (scalar_int_mode mode, int unsignedp, rtx op0, + code = LE; + if_true_label = if_false_label; + if_false_label = drop_through_label; +- drop_through_if_true = false; +- drop_through_if_false = true; + prob = prob.invert (); + } + +diff --git a/gcc/early-remat.c b/gcc/early-remat.c +index 122891c1edb..0396f16babf 100644 +--- a/gcc/early-remat.c ++++ b/gcc/early-remat.c +@@ -1123,7 +1123,6 @@ early_remat::record_equiv_candidates (unsigned int cand1_index, + ec->representative = cand1_index; + cand1->equiv_class = ec; + } +- cand1 = &m_candidates[ec->representative]; + cand2->equiv_class = ec; + bitmap_set_bit (ec->members, cand2_index); + if (cand2_index > ec->representative) +diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c +index 15dffa58a2e..78104603c76 100644 +--- a/gcc/emit-rtl.c ++++ b/gcc/emit-rtl.c +@@ -3993,7 +3993,7 @@ try_split (rtx pat, rtx_insn *trial, int last) + before = PREV_INSN (trial); + after = NEXT_INSN (trial); + +- tem = emit_insn_after_setloc (seq, trial, INSN_LOCATION (trial)); ++ emit_insn_after_setloc (seq, trial, INSN_LOCATION (trial)); + + delete_insn (trial); + +diff --git a/gcc/expmed.c b/gcc/expmed.c +index c5f5499c013..34cdfbf151a 100644 +--- a/gcc/expmed.c ++++ b/gcc/expmed.c +@@ -1662,12 +1662,10 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum, + poly_uint64 nunits; + if (!multiple_p (GET_MODE_BITSIZE (GET_MODE (op0)), + GET_MODE_UNIT_BITSIZE (tmode), &nunits) +- || !mode_for_vector (inner_mode, nunits).exists (&new_mode) +- || !VECTOR_MODE_P (new_mode) ++ || !related_vector_mode (tmode, inner_mode, ++ nunits).exists (&new_mode) + || maybe_ne (GET_MODE_SIZE (new_mode), +- GET_MODE_SIZE (GET_MODE (op0))) +- || GET_MODE_INNER (new_mode) != GET_MODE_INNER (tmode) +- || !targetm.vector_mode_supported_p (new_mode)) ++ GET_MODE_SIZE (GET_MODE (op0)))) + new_mode = VOIDmode; + } + poly_uint64 pos; +diff --git a/gcc/expr.c b/gcc/expr.c +index fa15b7eceae..5e3700fe15f 100644 +--- a/gcc/expr.c ++++ b/gcc/expr.c +@@ -249,6 +249,31 @@ convert_move (rtx to, rtx from, int unsignedp) + + if (VECTOR_MODE_P (to_mode) || VECTOR_MODE_P (from_mode)) + { ++ if (GET_MODE_UNIT_PRECISION (to_mode) ++ > GET_MODE_UNIT_PRECISION (from_mode)) ++ { ++ optab op = unsignedp ? zext_optab : sext_optab; ++ insn_code icode = convert_optab_handler (op, to_mode, from_mode); ++ if (icode != CODE_FOR_nothing) ++ { ++ emit_unop_insn (icode, to, from, ++ unsignedp ? 
ZERO_EXTEND : SIGN_EXTEND); ++ return; ++ } ++ } ++ ++ if (GET_MODE_UNIT_PRECISION (to_mode) ++ < GET_MODE_UNIT_PRECISION (from_mode)) ++ { ++ insn_code icode = convert_optab_handler (trunc_optab, ++ to_mode, from_mode); ++ if (icode != CODE_FOR_nothing) ++ { ++ emit_unop_insn (icode, to, from, TRUNCATE); ++ return; ++ } ++ } ++ + gcc_assert (known_eq (GET_MODE_BITSIZE (from_mode), + GET_MODE_BITSIZE (to_mode))); + +diff --git a/gcc/fold-const.h b/gcc/fold-const.h +index 049fee91876..e2e66246315 100644 +--- a/gcc/fold-const.h ++++ b/gcc/fold-const.h +@@ -83,7 +83,7 @@ extern bool fold_deferring_overflow_warnings_p (void); + extern void fold_overflow_warning (const char*, enum warn_strict_overflow_code); + extern enum tree_code fold_div_compare (enum tree_code, tree, tree, + tree *, tree *, bool *); +-extern int operand_equal_p (const_tree, const_tree, unsigned int); ++extern int operand_equal_p (const_tree, const_tree, unsigned int flags = 0); + extern int multiple_of_p (tree, const_tree, const_tree); + #define omit_one_operand(T1,T2,T3)\ + omit_one_operand_loc (UNKNOWN_LOCATION, T1, T2, T3) +diff --git a/gcc/fwprop.c b/gcc/fwprop.c +index cf2c9de2d35..f2966fadae8 100644 +--- a/gcc/fwprop.c ++++ b/gcc/fwprop.c +@@ -448,6 +448,18 @@ enum { + PR_OPTIMIZE_FOR_SPEED = 4 + }; + ++/* Check that X has a single def. */ ++ ++static bool ++reg_single_def_p (rtx x) ++{ ++ if (!REG_P (x)) ++ return false; ++ ++ int regno = REGNO (x); ++ return (DF_REG_DEF_COUNT (regno) == 1 ++ && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno)); ++} + + /* Replace all occurrences of OLD in *PX with NEW and try to simplify the + resulting expression. Replace *PX with a new RTL expression if an +@@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) + tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), + SUBREG_BYTE (x)); + } ++ ++ else ++ { ++ rtvec vec; ++ rtvec newvec; ++ const char *fmt = GET_RTX_FORMAT (code); ++ rtx op; ++ ++ for (int i = 0; fmt[i]; i++) ++ switch (fmt[i]) ++ { ++ case 'E': ++ vec = XVEC (x, i); ++ newvec = vec; ++ for (int j = 0; j < GET_NUM_ELEM (vec); j++) ++ { ++ op = RTVEC_ELT (vec, j); ++ valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); ++ if (op != RTVEC_ELT (vec, j)) ++ { ++ if (newvec == vec) ++ { ++ newvec = shallow_copy_rtvec (vec); ++ if (!tem) ++ tem = shallow_copy_rtx (x); ++ XVEC (tem, i) = newvec; ++ } ++ RTVEC_ELT (newvec, j) = op; ++ } ++ } ++ break; ++ ++ case 'e': ++ if (XEXP (x, i)) ++ { ++ op = XEXP (x, i); ++ valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); ++ if (op != XEXP (x, i)) ++ { ++ if (!tem) ++ tem = shallow_copy_rtx (x); ++ XEXP (tem, i) = op; ++ } ++ } ++ break; ++ } ++ } ++ + break; + + case RTX_OBJ: +@@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) + + /* Given a use USE of an insn, if it has a single reaching + definition, try to forward propagate it into that insn. +- Return true if cfg cleanup will be needed. */ ++ Return true if cfg cleanup will be needed. ++ REG_PROP_ONLY is true if we should only propagate register copies. */ + + static bool +-forward_propagate_into (df_ref use) ++forward_propagate_into (df_ref use, bool reg_prop_only = false) + { + df_ref def; + rtx_insn *def_insn, *use_insn; +@@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use) + if (DF_REF_IS_ARTIFICIAL (def)) + return false; + +- /* Do not propagate loop invariant definitions inside the loop. 
*/ +- if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) +- return false; +- + /* Check if the use is still present in the insn! */ + use_insn = DF_REF_INSN (use); + if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) +@@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use) + if (!def_set) + return false; + ++ if (reg_prop_only ++ && (!reg_single_def_p (SET_SRC (def_set)) ++ || !reg_single_def_p (SET_DEST (def_set)))) ++ return false; ++ ++ /* Allow propagations into a loop only for reg-to-reg copies, since ++ replacing one register by another shouldn't increase the cost. */ ++ ++ if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father ++ && (!reg_single_def_p (SET_SRC (def_set)) ++ || !reg_single_def_p (SET_DEST (def_set)))) ++ return false; ++ + /* Only try one kind of propagation. If two are possible, we'll + do it on the following iterations. */ + if (forward_propagate_and_simplify (use, def_insn, def_set) +@@ -1483,7 +1553,7 @@ gate_fwprop (void) + } + + static unsigned int +-fwprop (void) ++fwprop (bool fwprop_addr_p) + { + unsigned i; + +@@ -1502,11 +1572,16 @@ fwprop (void) + + df_ref use = DF_USES_GET (i); + if (use) +- if (DF_REF_TYPE (use) == DF_REF_REG_USE +- || DF_REF_BB (use)->loop_father == NULL +- /* The outer most loop is not really a loop. */ +- || loop_outer (DF_REF_BB (use)->loop_father) == NULL) +- forward_propagate_into (use); ++ { ++ if (DF_REF_TYPE (use) == DF_REF_REG_USE ++ || DF_REF_BB (use)->loop_father == NULL ++ /* The outer most loop is not really a loop. */ ++ || loop_outer (DF_REF_BB (use)->loop_father) == NULL) ++ forward_propagate_into (use, fwprop_addr_p); ++ ++ else if (fwprop_addr_p) ++ forward_propagate_into (use, false); ++ } + } + + fwprop_done (); +@@ -1537,7 +1612,7 @@ public: + + /* opt_pass methods: */ + virtual bool gate (function *) { return gate_fwprop (); } +- virtual unsigned int execute (function *) { return fwprop (); } ++ virtual unsigned int execute (function *) { return fwprop (false); } + + }; // class pass_rtl_fwprop + +@@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) + return new pass_rtl_fwprop (ctxt); + } + +-static unsigned int +-fwprop_addr (void) +-{ +- unsigned i; +- +- fwprop_init (); +- +- /* Go through all the uses. df_uses_create will create new ones at the +- end, and we'll go through them as well. */ +- for (i = 0; i < DF_USES_TABLE_SIZE (); i++) +- { +- if (!propagations_left) +- break; +- +- df_ref use = DF_USES_GET (i); +- if (use) +- if (DF_REF_TYPE (use) != DF_REF_REG_USE +- && DF_REF_BB (use)->loop_father != NULL +- /* The outer most loop is not really a loop. 
*/ +- && loop_outer (DF_REF_BB (use)->loop_father) != NULL) +- forward_propagate_into (use); +- } +- +- fwprop_done (); +- return 0; +-} +- + namespace { + + const pass_data pass_data_rtl_fwprop_addr = +@@ -1600,7 +1648,7 @@ public: + + /* opt_pass methods: */ + virtual bool gate (function *) { return gate_fwprop (); } +- virtual unsigned int execute (function *) { return fwprop_addr (); } ++ virtual unsigned int execute (function *) { return fwprop (true); } + + }; // class pass_rtl_fwprop_addr + +diff --git a/gcc/gimple.c b/gcc/gimple.c +index 8fae60fb848..bf362dbe545 100644 +--- a/gcc/gimple.c ++++ b/gcc/gimple.c +@@ -1771,6 +1771,8 @@ gimple_get_lhs (const gimple *stmt) + return gimple_assign_lhs (stmt); + else if (code == GIMPLE_CALL) + return gimple_call_lhs (stmt); ++ else if (code == GIMPLE_PHI) ++ return gimple_phi_result (stmt); + else + return NULL_TREE; + } +diff --git a/gcc/graphite-scop-detection.c b/gcc/graphite-scop-detection.c +index 4534d43721f..489d0b93b42 100644 +--- a/gcc/graphite-scop-detection.c ++++ b/gcc/graphite-scop-detection.c +@@ -1105,14 +1105,12 @@ assign_parameter_index_in_region (tree name, sese_info_p region) + gcc_assert (TREE_CODE (name) == SSA_NAME + && INTEGRAL_TYPE_P (TREE_TYPE (name)) + && ! defined_in_sese_p (name, region->region)); +- + int i; + tree p; + FOR_EACH_VEC_ELT (region->params, i, p) + if (p == name) + return; + +- i = region->params.length (); + region->params.safe_push (name); + } + +diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c +index 95788dfee7d..21ecd566766 100644 +--- a/gcc/internal-fn.c ++++ b/gcc/internal-fn.c +@@ -100,7 +100,7 @@ init_internal_fns () + /* Create static initializers for the information returned by + direct_internal_fn. */ + #define not_direct { -2, -2, false } +-#define mask_load_direct { -1, 2, false } ++#define mask_load_direct { -1, 2, true } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { -1, -1, false } +diff --git a/gcc/ira-color.c b/gcc/ira-color.c +index aa91b56c81f..8a90ae1b4e6 100644 +--- a/gcc/ira-color.c ++++ b/gcc/ira-color.c +@@ -1108,7 +1108,6 @@ setup_profitable_hard_regs (void) + || empty_profitable_hard_regs (a)) + continue; + data = ALLOCNO_COLOR_DATA (a); +- mode = ALLOCNO_MODE (a); + if ((costs = ALLOCNO_UPDATED_HARD_REG_COSTS (a)) != NULL + || (costs = ALLOCNO_HARD_REG_COSTS (a)) != NULL) + { +diff --git a/gcc/ira.c b/gcc/ira.c +index b330f2a287b..4262e5cf3b7 100644 +--- a/gcc/ira.c ++++ b/gcc/ira.c +@@ -4414,10 +4414,9 @@ rtx_moveable_p (rtx *loc, enum op_type type) + { + const char *fmt; + rtx x = *loc; +- enum rtx_code code = GET_CODE (x); + int i, j; + +- code = GET_CODE (x); ++ enum rtx_code code = GET_CODE (x); + switch (code) + { + case CONST: +diff --git a/gcc/lra-eliminations.c b/gcc/lra-eliminations.c +index ee9fd51f129..7a345a52ae1 100644 +--- a/gcc/lra-eliminations.c ++++ b/gcc/lra-eliminations.c +@@ -1146,7 +1146,6 @@ eliminate_regs_in_insn (rtx_insn *insn, bool replace_p, bool first_p, + single_set without having put new body into the insn and the + re-recognition won't hurt in this rare case. */ + id = lra_update_insn_recog_data (insn); +- static_id = id->insn_static_data; + } + + /* Spill pseudos which are assigned to hard registers in SET. 
Add +diff --git a/gcc/lra.c b/gcc/lra.c +index 1d2578f8c12..10b85340fc5 100644 +--- a/gcc/lra.c ++++ b/gcc/lra.c +@@ -1029,12 +1029,8 @@ lra_set_insn_recog_data (rtx_insn *insn) + data->operand_loc, + constraints, operand_mode, NULL); + if (nop > 0) +- { +- const char *p = recog_data.constraints[0]; +- +- for (p = constraints[0]; *p; p++) +- nalt += *p == ','; +- } ++ for (const char *p =constraints[0]; *p; p++) ++ nalt += *p == ','; + data->insn_static_data = insn_static_data + = get_static_insn_data (-1, nop, 0, nalt); + for (i = 0; i < nop; i++) +diff --git a/gcc/machmode.h b/gcc/machmode.h +index 3a7cee88962..d564f9c6458 100644 +--- a/gcc/machmode.h ++++ b/gcc/machmode.h +@@ -257,6 +257,9 @@ public: + bool exists () const; + template bool exists (U *) const; + ++ bool operator== (const T &m) const { return m_mode == m; } ++ bool operator!= (const T &m) const { return m_mode != m; } ++ + private: + machine_mode m_mode; + }; +@@ -841,20 +844,9 @@ smallest_int_mode_for_size (poly_uint64 size) + extern opt_scalar_int_mode int_mode_for_mode (machine_mode); + extern opt_machine_mode bitwise_mode_for_mode (machine_mode); + extern opt_machine_mode mode_for_vector (scalar_mode, poly_uint64); +-extern opt_machine_mode mode_for_int_vector (unsigned int, poly_uint64); +- +-/* Return the integer vector equivalent of MODE, if one exists. In other +- words, return the mode for an integer vector that has the same number +- of bits as MODE and the same number of elements as MODE, with the +- latter being 1 if MODE is scalar. The returned mode can be either +- an integer mode or a vector mode. */ +- +-inline opt_machine_mode +-mode_for_int_vector (machine_mode mode) +-{ +- return mode_for_int_vector (GET_MODE_UNIT_BITSIZE (mode), +- GET_MODE_NUNITS (mode)); +-} ++extern opt_machine_mode related_vector_mode (machine_mode, scalar_mode, ++ poly_uint64 = 0); ++extern opt_machine_mode related_int_vector_mode (machine_mode); + + /* A class for iterating through possible bitfield modes. */ + class bit_field_mode_iterator +diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c +index 74159734fc8..0d7f104a2f2 100644 +--- a/gcc/omp-expand.c ++++ b/gcc/omp-expand.c +@@ -4974,6 +4974,13 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) + && loop->safelen > 1) + { + loop->force_vectorize = true; ++ if (simdlen && tree_fits_uhwi_p (OMP_CLAUSE_SIMDLEN_EXPR (simdlen))) ++ { ++ unsigned HOST_WIDE_INT v ++ = tree_to_uhwi (OMP_CLAUSE_SIMDLEN_EXPR (simdlen)); ++ if (v < INT_MAX && v <= (unsigned HOST_WIDE_INT) loop->safelen) ++ loop->simdlen = v; ++ } + cfun->has_force_vectorize_loops = true; + } + else if (dont_vectorize) +diff --git a/gcc/omp-general.c b/gcc/omp-general.c +index 356772ff458..4fb53af7587 100644 +--- a/gcc/omp-general.c ++++ b/gcc/omp-general.c +@@ -468,13 +468,16 @@ omp_max_vf (void) + && global_options_set.x_flag_tree_loop_vectorize)) + return 1; + +- auto_vector_sizes sizes; +- targetm.vectorize.autovectorize_vector_sizes (&sizes); +- if (!sizes.is_empty ()) ++ auto_vector_modes modes; ++ targetm.vectorize.autovectorize_vector_modes (&modes, true); ++ if (!modes.is_empty ()) + { + poly_uint64 vf = 0; +- for (unsigned int i = 0; i < sizes.length (); ++i) +- vf = ordered_max (vf, sizes[i]); ++ for (unsigned int i = 0; i < modes.length (); ++i) ++ /* The returned modes use the smallest element size (and thus ++ the largest nunits) for the vectorization approach that they ++ represent. 
*/ ++ vf = ordered_max (vf, GET_MODE_NUNITS (modes[i])); + return vf; + } + +diff --git a/gcc/omp-low.c b/gcc/omp-low.c +index 813cefd69b9..7866639f76c 100644 +--- a/gcc/omp-low.c ++++ b/gcc/omp-low.c +@@ -3650,11 +3650,8 @@ omp_clause_aligned_alignment (tree clause) + /* Otherwise return implementation defined alignment. */ + unsigned int al = 1; + opt_scalar_mode mode_iter; +- auto_vector_sizes sizes; +- targetm.vectorize.autovectorize_vector_sizes (&sizes); +- poly_uint64 vs = 0; +- for (unsigned int i = 0; i < sizes.length (); ++i) +- vs = ordered_max (vs, sizes[i]); ++ auto_vector_modes modes; ++ targetm.vectorize.autovectorize_vector_modes (&modes, true); + static enum mode_class classes[] + = { MODE_INT, MODE_VECTOR_INT, MODE_FLOAT, MODE_VECTOR_FLOAT }; + for (int i = 0; i < 4; i += 2) +@@ -3665,19 +3662,18 @@ omp_clause_aligned_alignment (tree clause) + machine_mode vmode = targetm.vectorize.preferred_simd_mode (mode); + if (GET_MODE_CLASS (vmode) != classes[i + 1]) + continue; +- while (maybe_ne (vs, 0U) +- && known_lt (GET_MODE_SIZE (vmode), vs) +- && GET_MODE_2XWIDER_MODE (vmode).exists ()) +- vmode = GET_MODE_2XWIDER_MODE (vmode).require (); ++ machine_mode alt_vmode; ++ for (unsigned int j = 0; j < modes.length (); ++j) ++ if (related_vector_mode (modes[j], mode).exists (&alt_vmode) ++ && known_ge (GET_MODE_SIZE (alt_vmode), GET_MODE_SIZE (vmode))) ++ vmode = alt_vmode; + + tree type = lang_hooks.types.type_for_mode (mode, 1); + if (type == NULL_TREE || TYPE_MODE (type) != mode) + continue; +- poly_uint64 nelts = exact_div (GET_MODE_SIZE (vmode), +- GET_MODE_SIZE (mode)); +- type = build_vector_type (type, nelts); +- if (TYPE_MODE (type) != vmode) +- continue; ++ type = build_vector_type_for_mode (type, vmode); ++ /* The functions above are not allowed to return invalid modes. 
*/ ++ gcc_assert (TYPE_MODE (type) == vmode); + if (TYPE_ALIGN_UNIT (type) > al) + al = TYPE_ALIGN_UNIT (type); + } +diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c +index 71c73fb43cc..61de7dc283b 100644 +--- a/gcc/optabs-query.c ++++ b/gcc/optabs-query.c +@@ -354,11 +354,8 @@ can_conditionally_move_p (machine_mode mode) + opt_machine_mode + qimode_for_vec_perm (machine_mode mode) + { +- machine_mode qimode; +- if (GET_MODE_INNER (mode) != QImode +- && mode_for_vector (QImode, GET_MODE_SIZE (mode)).exists (&qimode) +- && VECTOR_MODE_P (qimode)) +- return qimode; ++ if (GET_MODE_INNER (mode) != QImode) ++ return related_vector_mode (mode, QImode, GET_MODE_SIZE (mode)); + return opt_machine_mode (); + } + +@@ -587,22 +584,21 @@ can_vec_mask_load_store_p (machine_mode mode, + if (!VECTOR_MODE_P (vmode)) + return false; + +- if ((targetm.vectorize.get_mask_mode +- (GET_MODE_NUNITS (vmode), GET_MODE_SIZE (vmode)).exists (&mask_mode)) ++ if (targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode) + && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing) + return true; + +- auto_vector_sizes vector_sizes; +- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); +- for (unsigned int i = 0; i < vector_sizes.length (); ++i) ++ auto_vector_modes vector_modes; ++ targetm.vectorize.autovectorize_vector_modes (&vector_modes, true); ++ for (unsigned int i = 0; i < vector_modes.length (); ++i) + { +- poly_uint64 cur = vector_sizes[i]; ++ poly_uint64 cur = GET_MODE_SIZE (vector_modes[i]); + poly_uint64 nunits; + if (!multiple_p (cur, GET_MODE_SIZE (smode), &nunits)) + continue; + if (mode_for_vector (smode, nunits).exists (&vmode) + && VECTOR_MODE_P (vmode) +- && targetm.vectorize.get_mask_mode (nunits, cur).exists (&mask_mode) ++ && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode) + && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing) + return true; + } +diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c +index 8157798cc71..341e02bd51c 100644 +--- a/gcc/optabs-tree.c ++++ b/gcc/optabs-tree.c +@@ -300,6 +300,20 @@ supportable_convert_operation (enum tree_code code, + return true; + } + ++ if (GET_MODE_UNIT_PRECISION (m1) > GET_MODE_UNIT_PRECISION (m2) ++ && can_extend_p (m1, m2, TYPE_UNSIGNED (vectype_in))) ++ { ++ *code1 = code; ++ return true; ++ } ++ ++ if (GET_MODE_UNIT_PRECISION (m1) < GET_MODE_UNIT_PRECISION (m2) ++ && convert_optab_handler (trunc_optab, m1, m2) != CODE_FOR_nothing) ++ { ++ *code1 = code; ++ return true; ++ } ++ + /* Now check for builtin. 
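The extend and truncate cases added above are what allow loops whose statements mix element widths to vectorize without a target builtin. A minimal, self-contained C example of the kind of conversion involved (an illustration, not code from the patch):

/* Each iteration zero-extends an 8-bit element to 16 bits; with the new
   supportable_convert_operation paths the vectorizer can map the loop onto
   a vector zero-extend pattern when the target provides one.  */
void
widen_u8_to_u16 (unsigned short *restrict dst,
                 const unsigned char *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = src[i];
}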
*/ + if (targetm.vectorize.builtin_conversion + && targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in)) +diff --git a/gcc/optabs.c b/gcc/optabs.c +index 7d7efe0a4a2..c2c1274ebdb 100644 +--- a/gcc/optabs.c ++++ b/gcc/optabs.c +@@ -2095,8 +2095,8 @@ expand_twoval_binop (optab binoptab, rtx op0, rtx op1, rtx targ0, rtx targ1, + xop1 = avoid_expensive_constant (mode1, binoptab, 1, xop1, unsignedp); + + create_fixed_operand (&ops[0], targ0); +- create_convert_operand_from (&ops[1], op0, mode, unsignedp); +- create_convert_operand_from (&ops[2], op1, mode, unsignedp); ++ create_convert_operand_from (&ops[1], xop0, mode, unsignedp); ++ create_convert_operand_from (&ops[2], xop1, mode, unsignedp); + create_fixed_operand (&ops[3], targ1); + if (maybe_expand_insn (icode, 4, ops)) + return 1; +@@ -5486,7 +5486,7 @@ expand_vec_perm_1 (enum insn_code icode, rtx target, + struct expand_operand ops[4]; + + gcc_assert (GET_MODE_CLASS (smode) == MODE_VECTOR_INT +- || mode_for_int_vector (tmode).require () == smode); ++ || related_int_vector_mode (tmode).require () == smode); + create_output_operand (&ops[0], target, tmode); + create_input_operand (&ops[3], sel, smode); + +@@ -5611,8 +5611,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1, + /* The optabs are only defined for selectors with the same width + as the values being permuted. */ + machine_mode required_sel_mode; +- if (!mode_for_int_vector (mode).exists (&required_sel_mode) +- || !VECTOR_MODE_P (required_sel_mode)) ++ if (!related_int_vector_mode (mode).exists (&required_sel_mode)) + { + delete_insns_since (last); + return NULL_RTX; +diff --git a/gcc/params.def b/gcc/params.def +index 3f18642475a..b269045fb9c 100644 +--- a/gcc/params.def ++++ b/gcc/params.def +@@ -1403,7 +1403,7 @@ DEFPARAM (PARAM_MAX_VRP_SWITCH_ASSERTIONS, + DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK, + "vect-epilogues-nomask", + "Enable loop epilogue vectorization using smaller vector size.", +- 0, 0, 1) ++ 1, 0, 1) + + DEFPARAM(PARAM_UNROLL_JAM_MIN_PERCENT, + "unroll-jam-min-percent", +diff --git a/gcc/poly-int.h b/gcc/poly-int.h +index d68a652b5fa..ba39ca471be 100644 +--- a/gcc/poly-int.h ++++ b/gcc/poly-int.h +@@ -1528,6 +1528,29 @@ constant_lower_bound (const poly_int_pod &a) + return a.coeffs[0]; + } + ++/* Return the constant lower bound of A, given that it is no less than B. */ ++ ++template ++inline POLY_CONST_COEFF (Ca, Cb) ++constant_lower_bound_with_limit (const poly_int_pod &a, const Cb &b) ++{ ++ if (known_ge (a, b)) ++ return a.coeffs[0]; ++ return b; ++} ++ ++/* Return the constant upper bound of A, given that it is no greater ++ than B. */ ++ ++template ++inline POLY_CONST_COEFF (Ca, Cb) ++constant_upper_bound_with_limit (const poly_int_pod &a, const Cb &b) ++{ ++ if (known_le (a, b)) ++ return a.coeffs[0]; ++ return b; ++} ++ + /* Return a value that is known to be no greater than A and B. This + will be the greatest lower bound for some indeterminate values but + not necessarily for all. 
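A short worked example of the two bound helpers defined above, assuming GCC's poly_int conventions with a = 4 + 4x (x a runtime indeterminate that is never negative): constant_lower_bound_with_limit (a, 2) is 4, because known_ge (a, 2) already holds, while constant_lower_bound_with_limit (a, 8) is 8, falling back to the caller-asserted limit. The standalone sketch below mirrors that single-coefficient behaviour with plain integers.

#include <assert.h>

/* Single-coefficient mirror of constant_lower_bound_with_limit: when the
   relation to the limit is already known, the constant term is the answer;
   otherwise the caller-asserted limit is used.  */
static long
lower_bound_with_limit_sketch (long coeff0, int known_ge_limit, long limit)
{
  return known_ge_limit ? coeff0 : limit;
}

int
main (void)
{
  /* a = 4 + 4x: the bound against 2 is known, the bound against 8 is not.  */
  assert (lower_bound_with_limit_sketch (4, 1, 2) == 4);
  assert (lower_bound_with_limit_sketch (4, 0, 8) == 8);
  return 0;
}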
*/ +diff --git a/gcc/read-rtl.c b/gcc/read-rtl.c +index ebd69bde531..1af51f686c7 100644 +--- a/gcc/read-rtl.c ++++ b/gcc/read-rtl.c +@@ -1282,7 +1282,7 @@ read_subst_mapping (htab_t subst_iters_table, htab_t subst_attrs_table, + m = add_mapping (&substs, subst_iters_table, attr_operands[1]); + end_ptr = &m->values; + end_ptr = add_map_value (end_ptr, 1, ""); +- end_ptr = add_map_value (end_ptr, 2, ""); ++ add_map_value (end_ptr, 2, ""); + + add_define_attr_for_define_subst (attr_operands[1], queue); + } +@@ -1290,7 +1290,7 @@ read_subst_mapping (htab_t subst_iters_table, htab_t subst_attrs_table, + m = add_mapping (&substs, subst_attrs_table, attr_operands[0]); + end_ptr = &m->values; + end_ptr = add_map_value (end_ptr, 1, attr_operands[2]); +- end_ptr = add_map_value (end_ptr, 2, attr_operands[3]); ++ add_map_value (end_ptr, 2, attr_operands[3]); + } + + /* Check newly-created code iterator ITERATOR to see whether every code has the +diff --git a/gcc/regrename.c b/gcc/regrename.c +index 637b3cbe6d7..5259d565e58 100644 +--- a/gcc/regrename.c ++++ b/gcc/regrename.c +@@ -1426,10 +1426,9 @@ scan_rtx (rtx_insn *insn, rtx *loc, enum reg_class cl, enum scan_actions action, + { + const char *fmt; + rtx x = *loc; +- enum rtx_code code = GET_CODE (x); + int i, j; + +- code = GET_CODE (x); ++ enum rtx_code code = GET_CODE (x); + switch (code) + { + case CONST: +diff --git a/gcc/reorg.c b/gcc/reorg.c +index 81349382b81..bdfcf8851cd 100644 +--- a/gcc/reorg.c ++++ b/gcc/reorg.c +@@ -2708,14 +2708,13 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, + && GET_CODE (PATTERN (new_thread)) != ASM_INPUT + && asm_noperands (PATTERN (new_thread)) < 0) + { +- rtx pat = PATTERN (new_thread); + rtx dest; + rtx src; + + /* We know "new_thread" is an insn due to NONJUMP_INSN_P (new_thread) + above. */ + trial = as_a (new_thread); +- pat = PATTERN (trial); ++ rtx pat = PATTERN (trial); + + if (!NONJUMP_INSN_P (trial) + || GET_CODE (pat) != SET +diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c +index 50bbb79655b..bdbd1b98eba 100644 +--- a/gcc/simplify-rtx.c ++++ b/gcc/simplify-rtx.c +@@ -6709,6 +6709,17 @@ simplify_subreg (machine_mode outermode, rtx op, + } + } + ++ /* If OP is a vector comparison and the subreg is not changing the ++ number of elements or the size of the elements, change the result ++ of the comparison to the new mode. */ ++ if (COMPARISON_P (op) ++ && VECTOR_MODE_P (outermode) ++ && VECTOR_MODE_P (innermode) ++ && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)) ++ && known_eq (GET_MODE_UNIT_SIZE (outermode), ++ GET_MODE_UNIT_SIZE (innermode))) ++ return simplify_gen_relational (GET_CODE (op), outermode, innermode, ++ XEXP (op, 0), XEXP (op, 1)); + return NULL_RTX; + } + +diff --git a/gcc/stor-layout.c b/gcc/stor-layout.c +index 5d6f2e0166c..a054b7887e7 100644 +--- a/gcc/stor-layout.c ++++ b/gcc/stor-layout.c +@@ -514,18 +514,43 @@ mode_for_vector (scalar_mode innermode, poly_uint64 nunits) + return opt_machine_mode (); + } + +-/* Return the mode for a vector that has NUNITS integer elements of +- INT_BITS bits each, if such a mode exists. The mode can be either +- an integer mode or a vector mode. */ ++/* If a piece of code is using vector mode VECTOR_MODE and also wants ++ to operate on elements of mode ELEMENT_MODE, return the vector mode ++ it should use for those elements. 
If NUNITS is nonzero, ensure that ++ the mode has exactly NUNITS elements, otherwise pick whichever vector ++ size pairs the most naturally with VECTOR_MODE; this may mean choosing ++ a mode with a different size and/or number of elements, depending on ++ what the target prefers. Return an empty opt_machine_mode if there ++ is no supported vector mode with the required properties. ++ ++ Unlike mode_for_vector. any returned mode is guaranteed to satisfy ++ both VECTOR_MODE_P and targetm.vector_mode_supported_p. */ + + opt_machine_mode +-mode_for_int_vector (unsigned int int_bits, poly_uint64 nunits) ++related_vector_mode (machine_mode vector_mode, scalar_mode element_mode, ++ poly_uint64 nunits) + { ++ gcc_assert (VECTOR_MODE_P (vector_mode)); ++ return targetm.vectorize.related_mode (vector_mode, element_mode, nunits); ++} ++ ++/* If a piece of code is using vector mode VECTOR_MODE and also wants ++ to operate on integer vectors with the same element size and number ++ of elements, return the vector mode it should use. Return an empty ++ opt_machine_mode if there is no supported vector mode with the ++ required properties. ++ ++ Unlike mode_for_vector. any returned mode is guaranteed to satisfy ++ both VECTOR_MODE_P and targetm.vector_mode_supported_p. */ ++ ++opt_machine_mode ++related_int_vector_mode (machine_mode vector_mode) ++{ ++ gcc_assert (VECTOR_MODE_P (vector_mode)); + scalar_int_mode int_mode; +- machine_mode vec_mode; +- if (int_mode_for_size (int_bits, 0).exists (&int_mode) +- && mode_for_vector (int_mode, nunits).exists (&vec_mode)) +- return vec_mode; ++ if (int_mode_for_mode (GET_MODE_INNER (vector_mode)).exists (&int_mode)) ++ return related_vector_mode (vector_mode, int_mode, ++ GET_MODE_NUNITS (vector_mode)); + return opt_machine_mode (); + } + +diff --git a/gcc/target.def b/gcc/target.def +index 66cee075018..f998470fffd 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -1894,33 +1894,80 @@ reached. The default is @var{mode} which means no splitting.", + /* Returns a mask of vector sizes to iterate over when auto-vectorizing + after processing the preferred one derived from preferred_simd_mode. */ + DEFHOOK +-(autovectorize_vector_sizes, +- "If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not\n\ +-the only one that is worth considering, this hook should add all suitable\n\ +-vector sizes to @var{sizes}, in order of decreasing preference. The first\n\ +-one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.\n\ ++(autovectorize_vector_modes, ++ "If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}\n\ ++is not the only approach worth considering, this hook should add one mode to\n\ ++@var{modes} for each useful alternative approach. These modes are then\n\ ++passed to @code{TARGET_VECTORIZE_RELATED_MODE} to obtain the vector mode\n\ ++for a given element mode.\n\ ++\n\ ++The modes returned in @var{modes} should use the smallest element mode\n\ ++possible for the vectorization approach that they represent, preferring\n\ ++integer modes over floating-poing modes in the event of a tie. The first\n\ ++mode should be the @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} for its\n\ ++element mode.\n\ ++\n\ ++If @var{all} is true, add suitable vector modes even when they are generally\n\ ++not expected to be worthwhile.\n\ ++\n\ ++The hook returns a bitmask of flags that control how the modes in\n\ ++@var{modes} are used. 
The flags are:\n\ ++@table @code\n\ ++@item VECT_COMPARE_COSTS\n\ ++Tells the loop vectorizer to try all the provided modes and pick the one\n\ ++with the lowest cost. By default the vectorizer will choose the first\n\ ++mode that works.\n\ ++@end table\n\ + \n\ + The hook does not need to do anything if the vector returned by\n\ + @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\ +-for autovectorization. The default implementation does nothing.", +- void, +- (vector_sizes *sizes), +- default_autovectorize_vector_sizes) ++for autovectorization. The default implementation adds no modes and\n\ ++returns 0.", ++ unsigned int, ++ (vector_modes *modes, bool all), ++ default_autovectorize_vector_modes) ++ ++DEFHOOK ++(related_mode, ++ "If a piece of code is using vector mode @var{vector_mode} and also wants\n\ ++to operate on elements of mode @var{element_mode}, return the vector mode\n\ ++it should use for those elements. If @var{nunits} is nonzero, ensure that\n\ ++the mode has exactly @var{nunits} elements, otherwise pick whichever vector\n\ ++size pairs the most naturally with @var{vector_mode}. Return an empty\n\ ++@code{opt_machine_mode} if there is no supported vector mode with the\n\ ++required properties.\n\ ++\n\ ++There is no prescribed way of handling the case in which @var{nunits}\n\ ++is zero. One common choice is to pick a vector mode with the same size\n\ ++as @var{vector_mode}; this is the natural choice if the target has a\n\ ++fixed vector size. Another option is to choose a vector mode with the\n\ ++same number of elements as @var{vector_mode}; this is the natural choice\n\ ++if the target has a fixed number of elements. Alternatively, the hook\n\ ++might choose a middle ground, such as trying to keep the number of\n\ ++elements as similar as possible while applying maximum and minimum\n\ ++vector sizes.\n\ ++\n\ ++The default implementation uses @code{mode_for_vector} to find the\n\ ++requested mode, returning a mode with the same size as @var{vector_mode}\n\ ++when @var{nunits} is zero. This is the correct behavior for most targets.", ++ opt_machine_mode, ++ (machine_mode vector_mode, scalar_mode element_mode, poly_uint64 nunits), ++ default_vectorize_related_mode) + + /* Function to get a target mode for a vector mask. */ + DEFHOOK + (get_mask_mode, +- "A vector mask is a value that holds one boolean result for every element\n\ +-in a vector. This hook returns the machine mode that should be used to\n\ +-represent such a mask when the vector in question is @var{length} bytes\n\ +-long and contains @var{nunits} elements. The hook returns an empty\n\ +-@code{opt_machine_mode} if no such mode exists.\n\ +-\n\ +-The default implementation returns the mode of an integer vector that\n\ +-is @var{length} bytes long and that contains @var{nunits} elements,\n\ +-if such a mode exists.", ++ "Return the mode to use for a vector mask that holds one boolean\n\ ++result for each element of vector mode @var{mode}. The returned mask mode\n\ ++can be a vector of integers (class @code{MODE_VECTOR_INT}), a vector of\n\ ++booleans (class @code{MODE_VECTOR_BOOL}) or a scalar integer (class\n\ ++@code{MODE_INT}). 
Return an empty @code{opt_machine_mode} if no such\n\ ++mask mode exists.\n\ ++\n\ ++The default implementation returns a @code{MODE_VECTOR_INT} with the\n\ ++same size and number of elements as @var{mode}, if such a mode exists.", + opt_machine_mode, +- (poly_uint64 nunits, poly_uint64 length), ++ (machine_mode mode), + default_get_mask_mode) + + /* Function to say whether a masked operation is expensive when the +diff --git a/gcc/target.h b/gcc/target.h +index 008932b5dbd..057e6ae8768 100644 +--- a/gcc/target.h ++++ b/gcc/target.h +@@ -199,11 +199,19 @@ enum vect_cost_model_location { + class vec_perm_indices; + + /* The type to use for lists of vector sizes. */ +-typedef vec vector_sizes; ++typedef vec vector_modes; + + /* Same, but can be used to construct local lists that are + automatically freed. */ +-typedef auto_vec auto_vector_sizes; ++typedef auto_vec auto_vector_modes; ++ ++/* Flags returned by TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES: ++ ++ VECT_COMPARE_COSTS ++ Tells the loop vectorizer to try all the provided modes and ++ pick the one with the lowest cost. By default the vectorizer ++ will choose the first mode that works. */ ++const unsigned int VECT_COMPARE_COSTS = 1U << 0; + + /* The target structure. This holds all the backend hooks. */ + #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME; +diff --git a/gcc/targhooks.c b/gcc/targhooks.c +index 02b9dc59611..6396f6f4bdf 100644 +--- a/gcc/targhooks.c ++++ b/gcc/targhooks.c +@@ -1312,32 +1312,39 @@ default_split_reduction (machine_mode mode) + return mode; + } + +-/* By default only the size derived from the preferred vector mode +- is tried. */ ++/* By default only the preferred vector mode is tried. */ + +-void +-default_autovectorize_vector_sizes (vector_sizes *) ++unsigned int ++default_autovectorize_vector_modes (vector_modes *, bool) + { ++ return 0; + } + +-/* By default a vector of integers is used as a mask. */ ++/* The default implementation of TARGET_VECTORIZE_RELATED_MODE. */ + + opt_machine_mode +-default_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size) +-{ +- unsigned int elem_size = vector_element_size (vector_size, nunits); +- scalar_int_mode elem_mode +- = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT); +- machine_mode vector_mode; ++default_vectorize_related_mode (machine_mode vector_mode, ++ scalar_mode element_mode, ++ poly_uint64 nunits) ++{ ++ machine_mode result_mode; ++ if ((maybe_ne (nunits, 0U) ++ || multiple_p (GET_MODE_SIZE (vector_mode), ++ GET_MODE_SIZE (element_mode), &nunits)) ++ && mode_for_vector (element_mode, nunits).exists (&result_mode) ++ && VECTOR_MODE_P (result_mode) ++ && targetm.vector_mode_supported_p (result_mode)) ++ return result_mode; + +- gcc_assert (known_eq (elem_size * nunits, vector_size)); ++ return opt_machine_mode (); ++} + +- if (mode_for_vector (elem_mode, nunits).exists (&vector_mode) +- && VECTOR_MODE_P (vector_mode) +- && targetm.vector_mode_supported_p (vector_mode)) +- return vector_mode; ++/* By default a vector of integers is used as a mask. */ + +- return opt_machine_mode (); ++opt_machine_mode ++default_get_mask_mode (machine_mode mode) ++{ ++ return related_int_vector_mode (mode); + } + + /* By default consider masked stores to be expensive. 
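Spelling out the default just above: the mask mode is simply the integer-element sibling of the data mode, so (assuming the usual 128-bit modes exist) default_get_mask_mode maps V4SFmode to V4SImode and leaves V4SImode as is. A hypothetical caller in target-independent code, with invented names and assuming GCC's internal environment, might query it as in this sketch.

/* Return true if DATA_MODE has a usable mask mode under the new
   single-argument hook (hypothetical helper, for illustration).  */
static bool
example_have_mask_mode_p (machine_mode data_mode)
{
  machine_mode mask_mode;
  return targetm.vectorize.get_mask_mode (data_mode).exists (&mask_mode);
}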
*/ +diff --git a/gcc/targhooks.h b/gcc/targhooks.h +index 59436278dcf..2d599190891 100644 +--- a/gcc/targhooks.h ++++ b/gcc/targhooks.h +@@ -110,8 +110,11 @@ default_builtin_support_vector_misalignment (machine_mode mode, + int, bool); + extern machine_mode default_preferred_simd_mode (scalar_mode mode); + extern machine_mode default_split_reduction (machine_mode); +-extern void default_autovectorize_vector_sizes (vector_sizes *); +-extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64); ++extern unsigned int default_autovectorize_vector_modes (vector_modes *, bool); ++extern opt_machine_mode default_vectorize_related_mode (machine_mode, ++ scalar_mode, ++ poly_uint64); ++extern opt_machine_mode default_get_mask_mode (machine_mode); + extern bool default_empty_mask_is_expensive (unsigned); + extern void *default_init_cost (struct loop *); + extern unsigned default_add_stmt_cost (void *, int, enum vect_cost_for_stmt, +diff --git a/gcc/testsuite/g++.dg/opt/pr92317.C b/gcc/testsuite/g++.dg/opt/pr92317.C +new file mode 100644 +index 00000000000..2bb9729fc96 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/opt/pr92317.C +@@ -0,0 +1,51 @@ ++// Copied from pr87967.C ++// { dg-do compile { target c++11 } } ++// { dg-options "-O2 -ftree-vectorize -fno-tree-pre --param vect-epilogues-nomask=1" } ++ ++void h(); ++template struct k { using d = b; }; ++template class> using e = k; ++template class f> ++using g = typename e::d; ++struct l { ++ template using ab = typename i::j; ++}; ++struct n : l { ++ using j = g; ++}; ++class o { ++public: ++ long r(); ++}; ++char m; ++char s() { ++ if (m) ++ return '0'; ++ return 'A'; ++} ++class t { ++public: ++ typedef char *ad; ++ ad m_fn2(); ++}; ++void fn3() { ++ char *a; ++ t b; ++ bool p = false; ++ while (*a) { ++ h(); ++ o c; ++ if (*a) ++ a++; ++ if (c.r()) { ++ n::j q; ++ for (t::ad d = b.m_fn2(), e; d != e; d++) { ++ char f = *q; ++ *d = f + s(); ++ } ++ p = true; ++ } ++ } ++ if (p) ++ throw; ++} +diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr90883.C b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C +new file mode 100644 +index 00000000000..0e622f263d2 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C +@@ -0,0 +1,20 @@ ++// { dg-options "-O2 -Os -fdump-tree-dse-details -std=c++11 --param max-inline-insns-size=1" } ++ ++ ++ class C ++ { ++ char a[7]{}; ++ int b{}; ++ }; ++ ++ C slow() ++ { ++ return {}; ++ } ++ ++ ++// We want to match enough here to capture that we deleted an empty ++// constructor store ++// aarch64 and mips will expand to loop to clear because CLEAR_RATIO. 
++// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { aarch64-*-* mips*-*-* } } } } ++ +diff --git a/gcc/testsuite/gcc.dg/pr92162.c b/gcc/testsuite/gcc.dg/pr92162.c +new file mode 100644 +index 00000000000..ed82595a752 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/pr92162.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-Ofast" } */ ++ ++short int s8; ++ ++void __attribute__ ((simd)) ++gn (void) ++{ ++ s8 = 0; ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr91896.c b/gcc/testsuite/gcc.dg/torture/pr91896.c +new file mode 100644 +index 00000000000..e728538bb9a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr91896.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++unsigned int ++zj (unsigned int et) ++{ ++ signed char jr = 0; ++ ++ do { ++ et *= 3; ++ jr += 2; ++ } while (jr >= 0); ++ ++ if (et == (unsigned int) jr) ++ et = 0; ++ ++ return et; ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92069.c b/gcc/testsuite/gcc.dg/torture/pr92069.c +new file mode 100644 +index 00000000000..806ff5fba14 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92069.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++unsigned int a, c, d; ++double b; ++void e() ++{ ++ for (; d; d++) ++ { ++ double f; ++ a = 2; ++ for (; a; a++) ++ { ++ c = b; ++ b = f; ++ f = c; ++ } ++ } ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92173.c b/gcc/testsuite/gcc.dg/torture/pr92173.c +new file mode 100644 +index 00000000000..fcb3548b716 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92173.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++unsigned int ++yo (unsigned int o0, signed char s1) ++{ ++ for (s1 = 0; s1 < 1; s1 -= 2) ++ o0 += o0; ++ ++ return o0 + s1; ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92241.c b/gcc/testsuite/gcc.dg/torture/pr92241.c +new file mode 100644 +index 00000000000..331d03b3d44 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92241.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++int a, b; ++char c[2]; ++void d() { ++ char e; ++ for (; b; b--) { ++ e = 0; ++ for (; e <= 1; e++) ++ a &= c[b + e] && 1; ++ } ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92275.c b/gcc/testsuite/gcc.dg/torture/pr92275.c +new file mode 100644 +index 00000000000..b9f70889758 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92275.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++unsigned long a, c; ++int *b, *b2; ++long d; ++ ++void fn1() ++{ ++ for (; b < b2; b++) ++ d += *b * c; ++ d *= a; ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92371.c b/gcc/testsuite/gcc.dg/torture/pr92371.c +new file mode 100644 +index 00000000000..0c78d32f471 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92371.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-ftree-vectorize" } */ ++ ++int a, b; ++void d() ++{ ++ int c = sizeof(int); ++ for (; a; a++) ++ c *= sizeof(int); ++ c *= sizeof(int); ++ b = c; ++} +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-36.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-36.c +new file mode 100644 +index 00000000000..23a53bb4ad2 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-36.c +@@ -0,0 +1,65 @@ ++/* { dg-options "-O2 -fdump-tree-dse-details -fno-tree-fre" } */ ++#include ++#include ++ ++struct X ++{ ++ char mem0[10]; ++ char 
mem1[10]; ++}; ++ ++ ++void blah (struct X); ++ ++ ++void ++foo1() ++{ ++ struct X x = { }; ++ memset (x.mem1, 0, sizeof x.mem1); ++ blah (x); ++} ++ ++void ++foo2() ++{ ++ struct X x = { }; ++ x.mem1[5] = 0; ++ blah (x); ++} ++ ++void ++bar1 () ++{ ++ struct X x; ++ memset (&x, 0, sizeof x); ++ memset (&x.mem1, 0, sizeof x.mem1); ++ blah (x); ++} ++void ++bar2 () ++{ ++ struct X x; ++ memset (&x, 0, sizeof x); ++ x.mem1[5] = 0; ++ blah (x); ++} ++ ++void ++baz1 () ++{ ++ struct X *x = calloc (sizeof (struct X), 1); ++ memset (&x->mem1, 0, sizeof x->mem1); ++ blah (*x); ++} ++ ++void ++baz2 () ++{ ++ struct X *x = calloc (sizeof (struct X), 1); ++ x->mem1[5] = 0; ++ blah (*x); ++} ++/* { dg-final { scan-tree-dump-times "Deleted redundant call" 3 "dse1" } } */ ++/* { dg-final { scan-tree-dump-times "Deleted redundant store" 3 "dse1" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-div-2.c b/gcc/testsuite/gcc.dg/vect/bb-slp-div-2.c +new file mode 100644 +index 00000000000..715c22ac6c6 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-div-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++int x[4], y[4], z[4]; ++ ++void ++f (void) ++{ ++ x[0] += y[0] / z[0] * 2; ++ x[1] += y[1] / z[1] * 2; ++ x[2] += y[2] / z[2] * 2; ++ x[3] += y[3] / z[3] * 2; ++} ++ ++/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_int } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c +index 85f9a02582f..813b1af089a 100644 +--- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c +@@ -18,5 +18,6 @@ void foo(unsigned *p1, unsigned short *p2) + } + + /* Disable for SVE because for long or variable-length vectors we don't +- get an unrolled epilogue loop. */ +-/* { dg-final { scan-tree-dump "BB vectorization with gaps at the end of a load is not supported" "slp1" { target { ! aarch64_sve } } } } */ ++ get an unrolled epilogue loop. Also disable for AArch64 Advanced SIMD, ++ because there we can vectorize the epilogue using mixed vector sizes. */ ++/* { dg-final { scan-tree-dump "BB vectorization with gaps at the end of a load is not supported" "slp1" { target { ! aarch64*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c +index 228190ab05d..877de4eb5be 100644 +--- a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c ++++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + #include "tree-vect.h" + + extern float copysignf (float, float); +diff --git a/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c b/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c +index 7a148e41d51..5f871289337 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c ++++ b/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float_strict } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c +index 1d674504e2c..022d49f1175 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c ++++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c +index e4202b10d06..b5f8c3c88e4 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c ++++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c +@@ -46,4 +46,4 @@ int main (void) + } + + /* Until we support multiple types in the inner loop */ +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail { ! aarch64*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c +index 50b4998bb6c..7049e4936b9 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c ++++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-require-effective-target vect_int } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c +index c3b242157ce..d2ae7976781 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c ++++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c +index 470bbfb5537..243e01e6dad 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c ++++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c +index 805024d8058..e339590bacb 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c ++++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c +index 726c0de652f..c403a8302d8 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c ++++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-skip-if "AArch64 tiny code model does not support programs larger than 1MiB" {aarch64_tiny} } */ + /* { dg-add-options bind_pic_locally } */ +diff --git a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c +index 4513c40b34f..dcb53701795 100644 +--- a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c ++++ b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/pr33804.c b/gcc/testsuite/gcc.dg/vect/pr33804.c +index 86babbe60e7..0db13674b42 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr33804.c ++++ b/gcc/testsuite/gcc.dg/vect/pr33804.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-require-effective-target vect_int } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/pr53773.c b/gcc/testsuite/gcc.dg/vect/pr53773.c +index 0bcc021767e..7f8229571ec 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr53773.c ++++ b/gcc/testsuite/gcc.dg/vect/pr53773.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-additional-options "-fdump-tree-optimized" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/pr65930-1.c b/gcc/testsuite/gcc.dg/vect/pr65930-1.c +new file mode 100644 +index 00000000000..895fbf8869d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/pr65930-1.c +@@ -0,0 +1,26 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#include "tree-vect.h" ++ ++unsigned __attribute__((noipa)) ++bar (unsigned int *x) ++{ ++ int sum = 4; ++ x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__); ++ for (int i = 0; i < 16; ++i) ++ sum += x[i]; ++ return sum; ++} ++ ++int ++main() ++{ ++ static int a[16] __attribute__((aligned(__BIGGEST_ALIGNMENT__))) ++ = { 1, 3, 5, 8, 9, 10, 17, 18, 23, 29, 30, 55, 42, 2, 3, 1 }; ++ check_vect (); ++ if (bar (a) != 260) ++ abort (); ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65930-2.c b/gcc/testsuite/gcc.dg/vect/pr65930-2.c +new file mode 100644 +index 00000000000..9cfb9b102d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/pr65930-2.c +@@ -0,0 +1,28 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#include "tree-vect.h" ++ ++int __attribute__((noipa)) ++bar (unsigned int *x, int n) ++{ ++ int sum = 4; ++ x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__); ++ for (int i = 0; i < n; ++i) ++ sum += x[i*4+0]+ x[i*4 + 1] + x[i*4 + 2] + x[i*4 + 3]; ++ return sum; ++} ++ ++int ++main () ++{ ++ static int a[16] __attribute__((aligned(__BIGGEST_ALIGNMENT__))) ++ = { 1, 3, 5, 8, 9, 10, 17, 18, 23, 29, 30, 55, 42, 2, 3, 1 }; ++ check_vect (); ++ if (bar (a, 4) != 260) ++ abort (); ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ ++/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-1.c b/gcc/testsuite/gcc.dg/vect/pr65947-1.c +index 879819d576a..9fc74a1ab28 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-1.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_condition } */ + + #include "tree-vect.h" +@@ -41,5 +43,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { target { ! vect_fold_extract_last } } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! 
vect_fold_extract_last } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-10.c b/gcc/testsuite/gcc.dg/vect/pr65947-10.c +index f37aecab082..e4a1d9419c2 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-10.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-10.c +@@ -42,6 +42,6 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-12.c b/gcc/testsuite/gcc.dg/vect/pr65947-12.c +index b84fd41bc63..a47f4146a29 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-12.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-12.c +@@ -42,5 +42,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-13.c b/gcc/testsuite/gcc.dg/vect/pr65947-13.c +index e1d3ff52f5c..a703923151d 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-13.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-13.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_condition } */ + + #include "tree-vect.h" +@@ -41,5 +43,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { xfail vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { xfail vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-14.c b/gcc/testsuite/gcc.dg/vect/pr65947-14.c +index 9f1e4e1eb6a..3b76fda2122 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-14.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-14.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_condition } */ + + #include "tree-vect.h" +@@ -41,5 +43,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { target { ! 
vect_fold_extract_last } } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-2.c b/gcc/testsuite/gcc.dg/vect/pr65947-2.c +index 18d33c436a5..58ba5f764d0 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-2.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-2.c +@@ -42,5 +42,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-3.c b/gcc/testsuite/gcc.dg/vect/pr65947-3.c +index 427abdb4140..6b4077e1a62 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-3.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-3.c +@@ -52,5 +52,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-4.c b/gcc/testsuite/gcc.dg/vect/pr65947-4.c +index 186e03a6346..471fbe2da21 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-4.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-4.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_condition } */ + + #include "tree-vect.h" +@@ -41,6 +43,6 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { target { ! vect_fold_extract_last } } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-5.c b/gcc/testsuite/gcc.dg/vect/pr65947-5.c +index c91b648aa05..4e3f765cd0c 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-5.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-5.c +@@ -53,5 +53,5 @@ main (void) + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 1 "vect" { target { ! 
vect_fold_extract_last } } } } */ + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump "loop size is greater than data size" "vect" { xfail vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-6.c b/gcc/testsuite/gcc.dg/vect/pr65947-6.c +index b072c8d33a2..dde96d7a553 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-6.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-6.c +@@ -41,5 +41,5 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-9.c b/gcc/testsuite/gcc.dg/vect/pr65947-9.c +index e43e0e473be..1f295306016 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr65947-9.c ++++ b/gcc/testsuite/gcc.dg/vect/pr65947-9.c +@@ -48,5 +48,5 @@ main () + /* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target { ! vect_fold_extract_last } } } } */ + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 1 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump "loop size is greater than data size" "vect" { target { ! vect_fold_extract_last } } } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 1 "vect" { target vect_fold_extract_last } } } */ + /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr80631-1.c b/gcc/testsuite/gcc.dg/vect/pr80631-1.c +index f2405198a10..cbb9a6ff69a 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr80631-1.c ++++ b/gcc/testsuite/gcc.dg/vect/pr80631-1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* PR tree-optimization/80631 */ + + #include "tree-vect.h" +@@ -72,5 +74,5 @@ main () + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 5 "vect" { target vect_condition } } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 10 "vect" { target vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 10 "vect" { target { { ! vect_fold_extract_last } && vect_condition } } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 5 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 5 "vect" { target { { ! 
vect_fold_extract_last } && vect_condition } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr80631-2.c b/gcc/testsuite/gcc.dg/vect/pr80631-2.c +index b334ca2345b..61e11316af2 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr80631-2.c ++++ b/gcc/testsuite/gcc.dg/vect/pr80631-2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* PR tree-optimization/80631 */ + + #include "tree-vect.h" +@@ -72,5 +74,5 @@ main () + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 5 "vect" { target vect_condition } } } */ +-/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 10 "vect" { target vect_condition xfail vect_fold_extract_last } } } */ +-/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 10 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 5 "vect" { target vect_condition xfail vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 5 "vect" { target vect_fold_extract_last } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr92205.c b/gcc/testsuite/gcc.dg/vect/pr92205.c +new file mode 100644 +index 00000000000..a031c1fe297 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/pr92205.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target vect_int } */ ++ ++int b(int n, unsigned char *a) ++{ ++ int d = 0; ++ a = __builtin_assume_aligned (a, __BIGGEST_ALIGNMENT__); ++ for (int c = 0; c < n; ++c) ++ d |= a[c]; ++ return d; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/slp-23.c b/gcc/testsuite/gcc.dg/vect/slp-23.c +index 7d330c787d1..d7c67fe2c6e 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-23.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-23.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/slp-25.c b/gcc/testsuite/gcc.dg/vect/slp-25.c +index ff7eff202cb..1c33927c434 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-25.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-25.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/slp-9.c b/gcc/testsuite/gcc.dg/vect/slp-9.c +index d0c94f1986b..d5212dca3dd 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-9.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-9.c +@@ -44,5 +44,5 @@ int main (void) + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_widen_mult_hi_to_si } } }*/ +-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_widen_mult_hi_to_si } } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target vect_widen_mult_hi_to_si } } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c +index 07c96c00eb0..15dd59922fc 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c +index fc689e46ba1..f457c11aa3c 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c +index 88591c5bdcb..1fd15aa3c87 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c +new file mode 100644 +index 00000000000..7d9255e48f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c +@@ -0,0 +1,31 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ ++/* { dg-do compile } */ ++/* { dg-require-effective-target vect_usad_char } */ ++/* With AVX256 or more we do not pull off the trick eliding the epilogue. 
*/ ++/* { dg-additional-options "-mprefer-avx128" { target { x86_64-*-* i?86-*-* } } } */ ++ ++typedef unsigned char uint8_t; ++int x264_pixel_sad_8x8( uint8_t *pix1, uint8_t *pix2, int i_stride_pix2 ) ++{ ++ int i_sum = 0; ++ for( int y = 0; y < 8; y++ ) ++ { ++ i_sum += __builtin_abs( pix1[0] - pix2[0] ); ++ i_sum += __builtin_abs( pix1[1] - pix2[1] ); ++ i_sum += __builtin_abs( pix1[2] - pix2[2] ); ++ i_sum += __builtin_abs( pix1[3] - pix2[3] ); ++ i_sum += __builtin_abs( pix1[4] - pix2[4] ); ++ i_sum += __builtin_abs( pix1[5] - pix2[5] ); ++ i_sum += __builtin_abs( pix1[6] - pix2[6] ); ++ i_sum += __builtin_abs( pix1[7] - pix2[7] ); ++ pix1 += 16; ++ pix2 += i_stride_pix2; ++ } ++ return i_sum; ++} ++ ++/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "access with gaps requires scalar epilogue loop" "vect" } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c +index f5fb63e19f1..e3bfee33348 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include "tree-vect.h" +diff --git a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c +index 4460d59b5a1..abb10fde45b 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c +@@ -38,5 +38,5 @@ int main (void) + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ +-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c +index 6e72c4878c2..0756119afb4 100644 +--- a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c ++++ b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c +@@ -38,5 +38,5 @@ int main (void) + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */ +-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c +index 8a57eb69a91..f09c964fdc1 100644 +--- a/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c ++++ b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-require-effective-target vect_int } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-103.c b/gcc/testsuite/gcc.dg/vect/vect-103.c +index 4a9e1574eb0..2a4510482d4 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-103.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-103.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-109.c b/gcc/testsuite/gcc.dg/vect/vect-109.c +index 9a507105899..ac5d0827899 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-109.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-109.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-skip-if "" { vect_no_align } } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-119.c b/gcc/testsuite/gcc.dg/vect/vect-119.c +index aa8c3002bff..29a9c51cd29 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-119.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-119.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-require-effective-target vect_int } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-24.c b/gcc/testsuite/gcc.dg/vect/vect-24.c +index cbff6c55fa4..fa4c0620d29 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-24.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-24.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-26.c b/gcc/testsuite/gcc.dg/vect/vect-26.c +index 4f0472b5d0f..8a141f38400 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-26.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-26.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-27.c b/gcc/testsuite/gcc.dg/vect/vect-27.c +index 590217feee7..ac86b21aceb 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-27.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-27.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-29.c b/gcc/testsuite/gcc.dg/vect/vect-29.c +index 86ec2cc1ddf..bbd446dfe63 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-29.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-29.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-42.c b/gcc/testsuite/gcc.dg/vect/vect-42.c +index a65b4a62276..086cbf20c0a 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-42.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-42.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-44.c b/gcc/testsuite/gcc.dg/vect/vect-44.c +index 03ef2c0f671..f7f1fd28665 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-44.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-44.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + /* { dg-additional-options "--param vect-max-peeling-for-alignment=0" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-48.c b/gcc/testsuite/gcc.dg/vect/vect-48.c +index bac6ef6b8dd..b29fe47635a 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-48.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-48.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-50.c b/gcc/testsuite/gcc.dg/vect/vect-50.c +index c9500ca91e5..f43676896af 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-50.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-50.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + /* { dg-additional-options "--param vect-max-peeling-for-alignment=0" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-52.c b/gcc/testsuite/gcc.dg/vect/vect-52.c +index 0343d9a24d1..c20a4be2ede 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-52.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-52.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-54.c b/gcc/testsuite/gcc.dg/vect/vect-54.c +index 58201abe069..2b236e48e19 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-54.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-54.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-56.c b/gcc/testsuite/gcc.dg/vect/vect-56.c +index 8060b05e781..c914126ece5 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-56.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-56.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-58.c b/gcc/testsuite/gcc.dg/vect/vect-58.c +index 441af51860e..da4f9740e33 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-58.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-58.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-60.c b/gcc/testsuite/gcc.dg/vect/vect-60.c +index 3b7477c96ab..121c503c63a 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-60.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-60.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-72.c b/gcc/testsuite/gcc.dg/vect/vect-72.c +index 472d8d57549..9e8e91b7ae6 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-72.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-72.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c +index 42b2b8d91aa..a3fb5053037 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-75.c b/gcc/testsuite/gcc.dg/vect/vect-75.c +index 2cdd7032242..88da97f0bb7 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-75.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-75.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c b/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c +index 56ee797d10b..fb3e4992782 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-77-global.c b/gcc/testsuite/gcc.dg/vect/vect-77-global.c +index f0b73505d68..1580d6e075b 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-77-global.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-77-global.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c b/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c +index c3ef8a36591..57e8da0a909 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-78-global.c b/gcc/testsuite/gcc.dg/vect/vect-78-global.c +index 241e7fa94b5..ea039b389b2 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-78-global.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-78-global.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c +index decfbee318a..59e1aae0017 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-89.c b/gcc/testsuite/gcc.dg/vect/vect-89.c +index 051698eada2..356ab96d330 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-89.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-89.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-91.c b/gcc/testsuite/gcc.dg/vect/vect-91.c +index 9430da3290a..91264d9841d 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-91.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-91.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "--param vect-max-peeling-for-alignment=0" } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-92.c b/gcc/testsuite/gcc.dg/vect/vect-92.c +index b9a1ce23d02..9ceb0fbadcd 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-92.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-92.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-96.c b/gcc/testsuite/gcc.dg/vect/vect-96.c +index 0cb935b9f16..c0d6c37b21d 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-96.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-96.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c +index c2b1c773047..3887120b747 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c +@@ -15,3 +15,5 @@ fn1 () + } + + /* { dg-final { scan-tree-dump "improved number of alias checks from \[0-9\]* to 1" "vect" } } */ ++/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c +index 0e6285e4a23..b6cc309dbe8 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c +@@ -65,3 +65,6 @@ main (void) + FOR_EACH_TYPE (DO_TEST) + return 0; + } ++ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c +index a0d5abc3aa4..09a4ebfa69e 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c +@@ -95,3 +95,6 @@ main (void) + /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-24, 24\)} "vect" { target vect_double } } } */ + /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-32, 32\)} "vect" { target vect_double } } } */ + /* { dg-final { scan-tree-dump {run-time check [^\n]* abs \([^*]* \* 8[)]* >= 32} "vect" { target vect_double } } } */ ++ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c +index 788cdfc3cdc..63a897f4bad 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c +@@ -95,3 +95,6 @@ main (void) + /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 24\)} "vect" { target vect_double } } } */ + /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 32\)} "vect" { target vect_double } } } */ + /* { dg-final { scan-tree-dump {run-time check [^\n]* unsigned \([^*]* \* 8[)]* >= 32} "vect" { target vect_double } } } */ ++ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c +index 60bc4730724..812aa9027dd 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c +@@ -18,4 +18,6 @@ f2 (int *x, long step2, int n) + + /* { dg-final { scan-tree-dump {need run-time check that [^\n]*step1[^\n]* is nonzero} "vect" } } */ + /* { dg-final { scan-tree-dump-not {need run-time check that [^\n]*step2[^\n]* is nonzero} "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ ++/* { dg-final { 
scan-tree-dump-not "using an index-based" "vect" } } */ + /* { dg-final { scan-tree-dump-times {LOOP VECTORIZED} 2 "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c +new file mode 100644 +index 00000000000..1d148a04918 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c +@@ -0,0 +1,64 @@ ++#define N 200 ++#define M 4 ++ ++typedef signed char sc; ++typedef unsigned char uc; ++typedef signed short ss; ++typedef unsigned short us; ++typedef int si; ++typedef unsigned int ui; ++typedef signed long long sll; ++typedef unsigned long long ull; ++ ++#define FOR_EACH_TYPE(M) \ ++ M (sc) M (uc) \ ++ M (ss) M (us) \ ++ M (si) M (ui) \ ++ M (sll) M (ull) \ ++ M (float) M (double) ++ ++#define TEST_VALUE(I) ((I) * 17 / 2) ++ ++#define ADD_TEST(TYPE) \ ++ void __attribute__((noinline, noclone)) \ ++ test_##TYPE (TYPE *a, TYPE *b) \ ++ { \ ++ for (int i = 0; i < N; i += 2) \ ++ { \ ++ TYPE b0 = b[i + 0]; \ ++ TYPE b1 = b[i + 1]; \ ++ a[i + 0] = b0 + 2; \ ++ a[i + 1] = b1 + 3; \ ++ } \ ++ } ++ ++#define DO_TEST(TYPE) \ ++ for (int j = 0; j < M; ++j) \ ++ { \ ++ TYPE a[N + M]; \ ++ for (int i = 0; i < N + M; ++i) \ ++ a[i] = TEST_VALUE (i); \ ++ test_##TYPE (a + j, a); \ ++ for (int i = 0; i < N; i += 2) \ ++ { \ ++ TYPE base1 = j == 0 ? TEST_VALUE (i) : a[i]; \ ++ TYPE base2 = j <= 1 ? TEST_VALUE (i + 1) : a[i + 1]; \ ++ if (a[i + j] != (TYPE) (base1 + 2) \ ++ || a[i + j + 1] != (TYPE) (base2 + 3)) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++FOR_EACH_TYPE (ADD_TEST) ++ ++int ++main (void) ++{ ++ FOR_EACH_TYPE (DO_TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump-not {flags: [^\n]*ARBITRARY\n} "vect" } } */ ++/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c +new file mode 100644 +index 00000000000..fbe3f8431ff +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c +@@ -0,0 +1,61 @@ ++#define N 200 ++#define DIST 32 ++ ++typedef signed char sc; ++typedef unsigned char uc; ++typedef signed short ss; ++typedef unsigned short us; ++typedef int si; ++typedef unsigned int ui; ++typedef signed long long sll; ++typedef unsigned long long ull; ++ ++#define FOR_EACH_TYPE(M) \ ++ M (sc) M (uc) \ ++ M (ss) M (us) \ ++ M (si) M (ui) \ ++ M (sll) M (ull) \ ++ M (float) M (double) ++ ++#define ADD_TEST(TYPE) \ ++ void __attribute__((noinline, noclone)) \ ++ test_##TYPE (TYPE *x, TYPE *y) \ ++ { \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ x[i] = i; \ ++ y[i] = 42 - i * 2; \ ++ } \ ++ } ++ ++#define DO_TEST(TYPE) \ ++ for (int i = 0; i < DIST * 2; ++i) \ ++ { \ ++ TYPE a[N + DIST * 2] = {}; \ ++ test_##TYPE (a + DIST, a + i); \ ++ for (int j = 0; j < N + DIST * 2; ++j) \ ++ { \ ++ TYPE expected = 0; \ ++ if (i > DIST && j >= i && j < i + N) \ ++ expected = 42 - (j - i) * 2; \ ++ if (j >= DIST && j < DIST + N) \ ++ expected = j - DIST; \ ++ if (i <= DIST && j >= i && j < i + N) \ ++ expected = 42 - (j - i) * 2; \ ++ if (expected != a[j]) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++FOR_EACH_TYPE (ADD_TEST) ++ ++int ++main (void) ++{ ++ FOR_EACH_TYPE (DO_TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an 
address-based WAR/WAW test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c +new file mode 100644 +index 00000000000..81c252dfc23 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c +@@ -0,0 +1,66 @@ ++#define N 200 ++#define DIST 32 ++ ++typedef signed char sc; ++typedef unsigned char uc; ++typedef signed short ss; ++typedef unsigned short us; ++typedef int si; ++typedef unsigned int ui; ++typedef signed long long sll; ++typedef unsigned long long ull; ++ ++#define FOR_EACH_TYPE(M) \ ++ M (sc) M (uc) \ ++ M (ss) M (us) \ ++ M (si) M (ui) \ ++ M (sll) M (ull) \ ++ M (float) M (double) ++ ++#define TEST_VALUE(I) ((I) * 13 / 2) ++ ++#define ADD_TEST(TYPE) \ ++ TYPE __attribute__((noinline, noclone)) \ ++ test_##TYPE (TYPE *x, TYPE *y) \ ++ { \ ++ TYPE res = 0; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ x[i] = i; \ ++ res += y[i]; \ ++ } \ ++ return res; \ ++ } ++ ++#define DO_TEST(TYPE) \ ++ for (int i = 0; i < DIST * 2; ++i) \ ++ { \ ++ TYPE a[N + DIST * 2]; \ ++ for (int j = 0; j < N + DIST * 2; ++j) \ ++ a[j] = TEST_VALUE (j); \ ++ TYPE res = test_##TYPE (a + DIST, a + i); \ ++ for (int j = 0; j < N; ++j) \ ++ if (a[j + DIST] != (TYPE) j) \ ++ __builtin_abort (); \ ++ TYPE expected_res = 0; \ ++ for (int j = i; j < i + N; ++j) \ ++ if (i <= DIST && j >= DIST && j < DIST + N) \ ++ expected_res += j - DIST; \ ++ else \ ++ expected_res += TEST_VALUE (j); \ ++ if (expected_res != res) \ ++ __builtin_abort (); \ ++ } ++ ++FOR_EACH_TYPE (ADD_TEST) ++ ++int ++main (void) ++{ ++ FOR_EACH_TYPE (DO_TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *RAW\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-17.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-17.c +new file mode 100644 +index 00000000000..c49c497c2d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-17.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target vect_load_lanes } */ ++ ++struct s { int x[100]; }; ++ ++void ++f (struct s *s1, int a, int b) ++{ ++ for (int i = 0; i < 32; ++i) ++ s1->x[a + i] = s1->x[b + i * 2] + s1->x[b + i * 3]; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *[^\n]*MIXED_STEPS} "vect" } } */ ++/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c +new file mode 100644 +index 00000000000..9d0739151d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c +@@ -0,0 +1,64 @@ ++#define N 200 ++#define DIST 32 ++ ++typedef signed char sc; ++typedef unsigned char uc; ++typedef signed short ss; ++typedef unsigned short us; ++typedef int si; ++typedef unsigned int ui; ++typedef signed long long sll; ++typedef unsigned long long ull; ++ ++#define FOR_EACH_TYPE(M) \ ++ M (sc) M (uc) \ ++ M (ss) M (us) \ ++ M (si) M (ui) \ ++ M (sll) M (ull) \ ++ M (float) M (double) ++ ++#define TEST_VALUE(I) ((I) * 11 / 2) ++ ++#define ADD_TEST(TYPE) \ ++ TYPE a_##TYPE[N * 2]; \ ++ void __attribute__((noinline, noclone)) \ ++ test_##TYPE (int x, int y) \ ++ { \ ++ for (int i = 0; i < N; 
++i) \ ++ a_##TYPE[x - i] += a_##TYPE[y - i]; \ ++ } ++ ++#define DO_TEST(TYPE) \ ++ for (int i = 0; i < DIST * 2; ++i) \ ++ { \ ++ for (int j = 0; j < N + DIST * 2; ++j) \ ++ a_##TYPE[j] = TEST_VALUE (j); \ ++ test_##TYPE (i + N - 1, DIST + N - 1); \ ++ for (int j = 0; j < N + DIST * 2; ++j) \ ++ { \ ++ TYPE expected; \ ++ if (j < i || j >= i + N) \ ++ expected = TEST_VALUE (j); \ ++ else if (i >= DIST) \ ++ expected = ((TYPE) TEST_VALUE (j) \ ++ + (TYPE) TEST_VALUE (j + DIST - i)); \ ++ else \ ++ expected = ((TYPE) TEST_VALUE (j) \ ++ + a_##TYPE[j + DIST - i]); \ ++ if (expected != a_##TYPE[j]) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++FOR_EACH_TYPE (ADD_TEST) ++ ++int ++main (void) ++{ ++ FOR_EACH_TYPE (DO_TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c +new file mode 100644 +index 00000000000..7c0ff36a8c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c +@@ -0,0 +1,62 @@ ++#define N 200 ++#define DIST 32 ++ ++typedef signed char sc; ++typedef unsigned char uc; ++typedef signed short ss; ++typedef unsigned short us; ++typedef int si; ++typedef unsigned int ui; ++typedef signed long long sll; ++typedef unsigned long long ull; ++ ++#define FOR_EACH_TYPE(M) \ ++ M (sc) M (uc) \ ++ M (ss) M (us) \ ++ M (si) M (ui) \ ++ M (sll) M (ull) \ ++ M (float) M (double) ++ ++#define ADD_TEST(TYPE) \ ++ TYPE a_##TYPE[N * 2]; \ ++ void __attribute__((noinline, noclone)) \ ++ test_##TYPE (int x, int y) \ ++ { \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a_##TYPE[i + x] = i; \ ++ a_##TYPE[i + y] = 42 - i * 2; \ ++ } \ ++ } ++ ++#define DO_TEST(TYPE) \ ++ for (int i = 0; i < DIST * 2; ++i) \ ++ { \ ++ __builtin_memset (a_##TYPE, 0, sizeof (a_##TYPE)); \ ++ test_##TYPE (DIST, i); \ ++ for (int j = 0; j < N + DIST * 2; ++j) \ ++ { \ ++ TYPE expected = 0; \ ++ if (i > DIST && j >= i && j < i + N) \ ++ expected = 42 - (j - i) * 2; \ ++ if (j >= DIST && j < DIST + N) \ ++ expected = j - DIST; \ ++ if (i <= DIST && j >= i && j < i + N) \ ++ expected = 42 - (j - i) * 2; \ ++ if (expected != a_##TYPE[j]) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++FOR_EACH_TYPE (ADD_TEST) ++ ++int ++main (void) ++{ ++ FOR_EACH_TYPE (DO_TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-20.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-20.c +new file mode 100644 +index 00000000000..8a699ebfda8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-20.c +@@ -0,0 +1,66 @@ ++#define N 200 ++#define DIST 32 ++ ++typedef signed char sc; ++typedef unsigned char uc; ++typedef signed short ss; ++typedef unsigned short us; ++typedef int si; ++typedef unsigned int ui; ++typedef signed long long sll; ++typedef unsigned long long ull; ++ ++#define FOR_EACH_TYPE(M) \ ++ M (sc) M (uc) \ ++ M (ss) M (us) \ ++ M (si) M (ui) \ ++ M (sll) M (ull) \ ++ M (float) M (double) ++ ++#define TEST_VALUE(I) ((I) * 11 / 2) ++ ++#define ADD_TEST(TYPE) \ ++ TYPE a_##TYPE[N * 2]; \ ++ TYPE __attribute__((noinline, noclone)) \ ++ 
test_##TYPE (int x, int y) \ ++ { \ ++ TYPE res = 0; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a_##TYPE[i + x] = i; \ ++ res += a_##TYPE[i + y]; \ ++ } \ ++ return res; \ ++ } ++ ++#define DO_TEST(TYPE) \ ++ for (int i = 0; i < DIST * 2; ++i) \ ++ { \ ++ for (int j = 0; j < N + DIST * 2; ++j) \ ++ a_##TYPE[j] = TEST_VALUE (j); \ ++ TYPE res = test_##TYPE (DIST, i); \ ++ for (int j = 0; j < N; ++j) \ ++ if (a_##TYPE[j + DIST] != (TYPE) j) \ ++ __builtin_abort (); \ ++ TYPE expected_res = 0; \ ++ for (int j = i; j < i + N; ++j) \ ++ if (i <= DIST && j >= DIST && j < DIST + N) \ ++ expected_res += j - DIST; \ ++ else \ ++ expected_res += TEST_VALUE (j); \ ++ if (expected_res != res) \ ++ __builtin_abort (); \ ++ } ++ ++FOR_EACH_TYPE (ADD_TEST) ++ ++int ++main (void) ++{ ++ FOR_EACH_TYPE (DO_TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {flags: *RAW\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an index-based overlap test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c +index 0569ca487b5..7e5df138999 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c +@@ -58,3 +58,7 @@ main (void) + FOR_EACH_TYPE (DO_TEST) + return 0; + } ++ ++/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c +index 5685bfee576..a7fc1fcebbb 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c +@@ -17,7 +17,7 @@ typedef unsigned long long ull; + M (sll) M (ull) \ + M (float) M (double) + +-#define TEST_VALUE(I) ((I) * 5 / 2) ++#define TEST_VALUE(I) ((I) * 17 / 2) + + #define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ +@@ -51,3 +51,7 @@ main (void) + FOR_EACH_TYPE (DO_TEST) + return 0; + } ++ ++/* { dg-final { scan-tree-dump {flags: [^\n]*ARBITRARY\n} "vect" { target vect_int } } } */ ++/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ ++/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-bswap16.c b/gcc/testsuite/gcc.dg/vect/vect-bswap16.c +index 3c98b07e425..d29b352b832 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-bswap16.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-bswap16.c +@@ -1,4 +1,4 @@ +-/* { dg-require-effective-target vect_bswap } */ ++/* { dg-additional-options "-msse4" { target sse4_runtime } } */ + + #include "tree-vect.h" + +@@ -39,4 +39,4 @@ main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_bswap || sse4_runtime } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-bswap16a.c b/gcc/testsuite/gcc.dg/vect/vect-bswap16a.c +new file mode 100644 +index 00000000000..730dc4e8352 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-bswap16a.c +@@ -0,0 +1,5 @@ ++/* { dg-additional-options "-msse2 -mno-sse3" { target sse2_runtime } } */ ++ ++#include "vect-bswap16.c" ++ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_shift 
} } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-3.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-3.c +new file mode 100644 +index 00000000000..bb99b95eca5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-3.c +@@ -0,0 +1,47 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ ++/* { dg-require-effective-target vect_condition } */ ++/* { dg-require-effective-target vect_float } */ ++ ++#include "tree-vect.h" ++ ++extern void abort (void) __attribute__ ((noreturn)); ++ ++#define N 27 ++ ++/* Condition reduction with different types. */ ++ ++int ++condition_reduction (float *a, float min_v) ++{ ++ int last = 0; ++ ++ for (int i = 0; i < N; i++) ++ if (a[i] < min_v) ++ last = i; ++ ++ return last; ++} ++ ++int ++main (void) ++{ ++ float a[N] = { ++ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ++ 21, 22, 23, 24, 25, 26, 27 ++ }; ++ ++ check_vect (); ++ ++ int ret = condition_reduction (a, 10); ++ if (ret != 18) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c +new file mode 100644 +index 00000000000..8820075b1dc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c +@@ -0,0 +1,47 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ ++/* { dg-require-effective-target vect_condition } */ ++/* { dg-require-effective-target vect_double } */ ++ ++#include "tree-vect.h" ++ ++extern void abort (void) __attribute__ ((noreturn)); ++ ++#define N 27 ++ ++/* Condition reduction with different types. */ ++ ++int ++condition_reduction (double *a, double min_v) ++{ ++ int last = 0; ++ ++ for (int i = 0; i < N; i++) ++ if (a[i] < min_v) ++ last = i; ++ ++ return last; ++} ++ ++int ++main (void) ++{ ++ double a[N] = { ++ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ++ 21, 22, 23, 24, 25, 26, 27 ++ }; ++ ++ check_vect (); ++ ++ int ret = condition_reduction (a, 10); ++ if (ret != 18) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ ++/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ ++/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c +index 0ba33895592..079704cee81 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c +@@ -52,5 +52,5 @@ int main () + + /* Vectorization of loops with multiple types and double reduction is not + supported yet. */ +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! 
aarch64*-*-* } } } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c +new file mode 100644 +index 00000000000..946666e918f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++ ++/* Copied from PR 88915. */ ++void pixel_avg( unsigned char *dst, int i_dst_stride, ++ unsigned char *src1, int i_src1_stride, ++ unsigned char *src2, int i_src2_stride, ++ int i_width, int i_height ) ++ { ++ for( int y = 0; y < i_height; y++ ) ++ { ++ for( int x = 0; x < i_width; x++ ) ++ dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; ++ dst += i_dst_stride; ++ src1 += i_src1_stride; ++ src2 += i_src2_stride; ++ } ++ } ++ ++/* { dg-final { scan-tree-dump "LOOP EPILOGUE VECTORIZED" "vect" { xfail { arm*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-1.c b/gcc/testsuite/gcc.dg/vect/vect-live-1.c +index e170875d7ab..f628c5d3998 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-1.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-fno-tree-scev-cprop" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-2.c b/gcc/testsuite/gcc.dg/vect/vect-live-2.c +index a6daa61829e..19d8c22859e 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-2.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_long } */ + /* { dg-require-effective-target vect_shift } */ + /* { dg-additional-options "-fno-tree-scev-cprop" } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-3.c b/gcc/testsuite/gcc.dg/vect/vect-live-3.c +index 3ffa5166f45..8f5ccb27365 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-3.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-3.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include "tree-vect.h" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-4.c b/gcc/testsuite/gcc.dg/vect/vect-live-4.c +index 21cc27320ac..553ffcd49f7 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-4.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-4.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include "tree-vect.h" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c +index aff37c100f0..965437c8f03 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-fno-tree-scev-cprop" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c +index 35689665b54..0d2f17f9003 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-fno-tree-scev-cprop" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c +index 854116fa36e..a3f60f6ce6d 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_long } */ + /* { dg-additional-options "-fno-tree-scev-cprop" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c +index 18bf5e80917..1f82121df06 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c +index 43887865bf4..b0f74083f2b 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c +index b47a93ab326..864b17ac640 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options double_vectors } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c +index 13238dbe2f9..e65a092f5bf 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c +@@ -23,4 +23,4 @@ foo (){ + return; + } + +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! 
aarch64*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c +index d1fbe346a48..a88014a2fbf 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c +@@ -65,4 +65,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c +index d1fbe346a48..a88014a2fbf 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c +@@ -65,4 +65,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c +index d1fbe346a48..a88014a2fbf 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c +@@ -65,4 +65,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c +index d1fbe346a48..4f95c652ee3 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c +@@ -65,4 +65,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } }*/ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-call-1.c b/gcc/testsuite/gcc.dg/vect/vect-outer-call-1.c +new file mode 100644 +index 00000000000..f26d4220532 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-outer-call-1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target vect_float } */ ++/* { dg-additional-options "-fno-math-errno" } */ ++ ++void ++foo (float * __restrict x, float *y, int n, int m) ++{ ++ if (m > 0) ++ for (int i = 0; i < n; ++i) ++ { ++ float tem = x[i], tem1; ++ for (int j = 0; j < m; ++j) ++ { ++ tem += y[j]; ++ tem1 = tem; ++ tem = __builtin_sqrtf (tem); ++ } ++ x[i] = tem - tem1; ++ } ++} ++ ++/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_call_sqrtf } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1-epilogue.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1-epilogue.c +new file mode 100644 +index 00000000000..cc23c6b0866 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1-epilogue.c +@@ -0,0 +1,3 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#include "vect-peel-1-src.c" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1-src.c +new file mode 100644 +index 00000000000..7980d4dd643 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1-src.c +@@ -0,0 +1,48 @@ ++#include ++#include "tree-vect.h" ++ ++#define N 128 ++ ++int ib[N+7]; ++ ++__attribute__ ((noinline)) ++int main1 () ++{ ++ int i; ++ int ia[N+1]; ++ ++ /* All the accesses are misaligned. 
With cost model disabled, we ++ count the number of aligned accesses for each peeling option, and ++ in this case we align the two loads if possible (i.e., if ++ misaligned stores are supported). */ ++ for (i = 1; i <= N; i++) ++ { ++ ia[i] = ib[i+2] + ib[i+6]; ++ } ++ ++ /* check results: */ ++ for (i = 1; i <= N; i++) ++ { ++ if (ia[i] != ib[i+2] + ib[i+6]) ++ abort (); ++ } ++ ++ return 0; ++} ++ ++int main (void) ++{ ++ int i; ++ ++ check_vect (); ++ ++ for (i = 0; i <= N+6; i++) ++ { ++ asm volatile ("" : "+r" (i)); ++ ib[i] = i; ++ } ++ ++ return main1 (); ++} ++ ++ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c +index fae99ab0b08..a7660a381c4 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-peel-1.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c +@@ -1,51 +1,8 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + +-#include +-#include "tree-vect.h" +- +-#define N 128 +- +-int ib[N+7]; +- +-__attribute__ ((noinline)) +-int main1 () +-{ +- int i; +- int ia[N+1]; +- +- /* All the accesses are misaligned. With cost model disabled, we +- count the number of aligned accesses for each peeling option, and +- in this case we align the two loads if possible (i.e., if +- misaligned stores are supported). */ +- for (i = 1; i <= N; i++) +- { +- ia[i] = ib[i+2] + ib[i+6]; +- } +- +- /* check results: */ +- for (i = 1; i <= N; i++) +- { +- if (ia[i] != ib[i+2] + ib[i+6]) +- abort (); +- } +- +- return 0; +-} +- +-int main (void) +-{ +- int i; +- +- check_vect (); +- +- for (i = 0; i <= N+6; i++) +- { +- asm volatile ("" : "+r" (i)); +- ib[i] = i; +- } +- +- return main1 (); +-} ++#include "vect-peel-1-src.c" + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { { vect_element_align } && { vect_aligned_arrays } } xfail { ! vect_unaligned_possible } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3-epilogue.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3-epilogue.c +new file mode 100644 +index 00000000000..8af0fcdca0e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3-epilogue.c +@@ -0,0 +1,4 @@ ++/* { dg-require-effective-target vect_int } */ ++/* { dg-add-options bind_pic_locally } */ ++ ++#include "vect-peel-3-src.c" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3-src.c +new file mode 100644 +index 00000000000..a21ce8c3d6a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3-src.c +@@ -0,0 +1,58 @@ ++#include ++#include "tree-vect.h" ++ ++#if VECTOR_BITS > 128 ++#define NINTS (VECTOR_BITS / 32) ++#define EXTRA (NINTS * 2) ++#else ++#define NINTS 4 ++#define EXTRA 10 ++#endif ++ ++#define N 128 ++ ++#define RES_A (N * N / 4) ++#define RES_B (N * (N + 1) / 2 + (NINTS + 3) * (N + 1)) ++#define RES_C (N * (N + 1) / 2 + (N + 1)) ++#define RES (RES_A + RES_B + RES_C) ++ ++int ib[N + EXTRA]; ++int ia[N + EXTRA]; ++int ic[N + EXTRA]; ++ ++__attribute__ ((noinline)) ++int main1 () ++{ ++ int i, suma = 0, sumb = 0, sumc = 0; ++ ++ /* ib and ic have same misalignment, we peel to align them. 
*/ ++ for (i = 0; i <= N; i++) ++ { ++ suma += ia[i]; ++ sumb += ib[i + NINTS + 1]; ++ sumc += ic[i + 1]; ++ } ++ ++ /* check results: */ ++ if (suma + sumb + sumc != RES) ++ abort (); ++ ++ return 0; ++} ++ ++int main (void) ++{ ++ int i; ++ ++ check_vect (); ++ ++ for (i = 0; i < N + EXTRA; i++) ++ { ++ asm volatile ("" : "+r" (i)); ++ ib[i] = i; ++ ic[i] = i+2; ++ ia[i] = i/2; ++ } ++ ++ return main1 (); ++} +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c +index d5c0cf10ce1..2cd99573fd1 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-peel-3.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c +@@ -1,64 +1,9 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +-#include +-#include "tree-vect.h" +- +-#if VECTOR_BITS > 128 +-#define NINTS (VECTOR_BITS / 32) +-#define EXTRA (NINTS * 2) +-#else +-#define NINTS 4 +-#define EXTRA 10 +-#endif +- +-#define N 128 +- +-#define RES_A (N * N / 4) +-#define RES_B (N * (N + 1) / 2 + (NINTS + 3) * (N + 1)) +-#define RES_C (N * (N + 1) / 2 + (N + 1)) +-#define RES (RES_A + RES_B + RES_C) +- +-int ib[N + EXTRA]; +-int ia[N + EXTRA]; +-int ic[N + EXTRA]; +- +-__attribute__ ((noinline)) +-int main1 () +-{ +- int i, suma = 0, sumb = 0, sumc = 0; +- +- /* ib and ic have same misalignment, we peel to align them. */ +- for (i = 0; i <= N; i++) +- { +- suma += ia[i]; +- sumb += ib[i + NINTS + 1]; +- sumc += ic[i + 1]; +- } +- +- /* check results: */ +- if (suma + sumb + sumc != RES) +- abort (); +- +- return 0; +-} +- +-int main (void) +-{ +- int i; +- +- check_vect (); +- +- for (i = 0; i < N + EXTRA; i++) +- { +- asm volatile ("" : "+r" (i)); +- ib[i] = i; +- ic[i] = i+2; +- ia[i] = i/2; +- } +- +- return main1 (); +-} ++#include "vect-peel-3-src.c" + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { xfail { { ! vect_unaligned_possible } || vect_sizes_32B_16B } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4-epilogue.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4-epilogue.c +new file mode 100644 +index 00000000000..783982f04f6 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4-epilogue.c +@@ -0,0 +1,4 @@ ++/* { dg-require-effective-target vect_int } */ ++/* { dg-add-options bind_pic_locally } */ ++ ++#include "vect-peel-4-src.c" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4-src.c +new file mode 100644 +index 00000000000..33088fb0902 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4-src.c +@@ -0,0 +1,45 @@ ++#include ++#include "tree-vect.h" ++ ++#define N 128 ++ ++int ib[N+7]; ++int ia[N+1]; ++ ++__attribute__ ((noinline)) ++int main1 () ++{ ++ int i; ++ ++ /* Don't peel keeping one load and the store aligned. 
*/ ++ for (i = 0; i <= N; i++) ++ { ++ ia[i] = ib[i] + ib[i+5]; ++ } ++ ++ /* check results: */ ++ for (i = 1; i <= N; i++) ++ { ++ if (ia[i] != ib[i] + ib[i+5]) ++ abort (); ++ } ++ ++ return 0; ++} ++ ++int main (void) ++{ ++ int i; ++ ++ check_vect (); ++ ++ for (i = 0; i <= N+6; i++) ++ { ++ asm volatile ("" : "+r" (i)); ++ ib[i] = i; ++ } ++ ++ return main1 (); ++} ++ ++ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c +index 88f9f0ddcba..3b5272f284f 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-peel-4.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c +@@ -1,49 +1,9 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-add-options bind_pic_locally } */ + +-#include +-#include "tree-vect.h" +- +-#define N 128 +- +-int ib[N+7]; +-int ia[N+1]; +- +-__attribute__ ((noinline)) +-int main1 () +-{ +- int i; +- +- /* Don't peel keeping one load and the store aligned. */ +- for (i = 0; i <= N; i++) +- { +- ia[i] = ib[i] + ib[i+5]; +- } +- +- /* check results: */ +- for (i = 1; i <= N; i++) +- { +- if (ia[i] != ib[i] + ib[i+5]) +- abort (); +- } +- +- return 0; +-} +- +-int main (void) +-{ +- int i; +- +- check_vect (); +- +- for (i = 0; i <= N+6; i++) +- { +- asm volatile ("" : "+r" (i)); +- ib[i] = i; +- } +- +- return main1 (); +-} ++#include "vect-peel-4-src.c" + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { xfail { ! vect_unaligned_possible } } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c +index e246ae7f3c6..c40f8625b84 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c +@@ -62,4 +62,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c +index 5f0551ee372..dd3045502f1 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c +@@ -46,4 +46,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c +index 02c2bee8612..1a2d8d04f4e 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c +@@ -45,4 +45,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c +index ad148046a8e..cc0d9694a4f 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_float_strict } */ + /* { dg-additional-options "-fno-fast-math" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c +index 171451872e5..ffbc9706901 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c +index ac674749b6f..05e343ad782 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ + /* { dg-additional-options "-march=armv8.2-a+dotprod" { target { aarch64*-*-* } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c +index b036ad5b0b4..e0f47d8a4f2 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +@@ -12,12 +14,6 @@ signed char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); + + /* char->short->short dot product. + The dot-product pattern should be detected. +- The reduction is currently not vectorized becaus of the signed->unsigned->signed +- casts, since this patch: +- +- 2005-12-26 Kazu Hirata +- +- PR tree-optimization/25125 + + When the dot-product is detected, the loop should be vectorized on vect_sdot_qi + targets (targets that support dot-product of signed char). +@@ -60,5 +56,5 @@ int main (void) + /* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */ + /* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" } } */ + +-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c +index 57e18040cf2..0fc112012cf 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c +index d020f643bb8..e23ebd9b072 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ + /* { dg-additional-options "-march=armv8.2-a+dotprod" { target { aarch64*-*-* } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c +index 3155d97b3cd..288be13440d 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-epilogue-gaps.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-epilogue-gaps.c +new file mode 100644 +index 00000000000..dc5704f5607 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-epilogue-gaps.c +@@ -0,0 +1,45 @@ ++/* { dg-options "-O3 -fno-vect-cost-model" } */ ++struct { ++ float real; ++ float img; ++} g[11]; ++ ++float __attribute__ ((noclone)) ++foo_11 (void) ++{ ++ float sum = 0.0; ++ for (int i = 0; i < 11; ++i) ++ sum += g[i].real; ++ return sum; ++} ++ ++float __attribute__ ((noclone)) ++foo_10 (void) ++{ ++ float sum = 0.0; ++ for (int i = 0; i < 10; ++i) ++ sum += g[i].real; ++ return sum; ++} ++ ++int main (void) ++{ ++ float check_10 = 0.0; ++ float check_11 = 0.0; ++ for (int i = 0; i < 11; ++i) ++ { ++ asm volatile ("" : : : "memory"); ++ g[i].real = (float) i; ++ g[i].img = (float) -i; ++ if (i < 10) ++ check_10 += (float) i; ++ check_11 += (float) i; ++ } ++ ++ if (foo_10 () != check_10) ++ __builtin_abort (); ++ if (foo_11 () != check_11) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c +index b06b234072b..1ddbe96ebc3 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c +index be03c7d011d..7ae2c838344 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c +index c30c85ce911..91ce0ef934e 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c +index a98edd3045a..2190eaa6242 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c +index 570e56a8c9b..6ad645b3bdd 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c +index 8190622d5d7..71df5741e16 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c +@@ -21,6 +21,8 @@ foo () + 2005-12-26 Kazu Hirata + + PR tree-optimization/25125 ++ ++ but we still handle the reduction. + */ + + for (i = 0; i < N; i++) +@@ -43,5 +45,4 @@ main (void) + } + + /* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 1 "vect" { xfail *-*-* } } } */ +-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ +-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" { target { ! vect_widen_sum_qi_to_hi } } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c +index a033a7d27d1..2f0bb692564 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_usad_char } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c b/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c +index b912a3431f7..e5bbeaede09 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c +@@ -106,4 +106,4 @@ main (int argc, const char **argv) + } + + /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" { target avx2_runtime } } } */ +-/* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */ ++/* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED \\(MODE=V16QI\\)" 2 "vect" { target avx2_runtime } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c +index 89f983cad06..4c95dd20179 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-fno-ipa-icf" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c +index e319699cd92..4075f815cea 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-fno-ipa-icf" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c +index ee0538c0635..c4ac88e186d 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-fno-ipa-icf" } */ + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c +index 6d74c693316..ebbf4f5e841 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include "tree-vect.h" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c +index 942f63d6f31..2e28baae0b8 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c +index 98f78d3b37a..d277f0b2b94 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c +index 176f183f3ce..6fc7a282351 100644 +--- a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c ++++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-require-effective-target vect_int } */ + + #include +diff --git a/gcc/testsuite/gcc.dg/vshift-5.c b/gcc/testsuite/gcc.dg/vshift-5.c +index daa5f1c5cd8..62e6328cb28 100644 +--- a/gcc/testsuite/gcc.dg/vshift-5.c ++++ b/gcc/testsuite/gcc.dg/vshift-5.c +@@ -40,6 +40,42 @@ f2 (void) + a[3] = a3; + } + ++__attribute__((noinline, noclone)) void ++f2a (int x) ++{ ++ long long a0, a1, a2, a3; ++ a0 = a[0]; ++ a1 = a[1]; ++ a2 = a[2]; ++ a3 = a[3]; ++ a0 = a0 << x; ++ a1 = a1 << 2; ++ a2 = a2 << 2; ++ a3 = a3 << 2; ++ a[0] = a0; ++ a[1] = a1; ++ a[2] = a2; ++ a[3] = a3; ++} ++ ++__attribute__((noinline, noclone)) void ++f2b (int x) ++{ ++ long long a0, a1, a2, a3; ++ a0 = a[0]; ++ a1 = a[1]; ++ a2 = a[2]; ++ a3 = a[3]; ++ a0 = a0 << 2; ++ a1 = a1 << 2; ++ a2 = a2 << x; ++ a3 = a3 << 2; ++ a[0] = a0; ++ a[1] = a1; ++ a[2] = a2; ++ a[3] = a3; ++} ++ + __attribute__((noinline, noclone)) void + f3 (int x) + { +@@ -77,5 +113,13 @@ main () + if (a[0] != (4LL << 7) || a[1] != (3LL << 8) + || a[2] != (2LL << 9) || a[3] != (1LL << 10)) + abort (); ++ f2a (3); ++ if (a[0] != (4LL << 10) || a[1] != (3LL << 10) ++ || a[2] != (2LL << 11) || a[3] != (1LL << 12)) ++ abort (); ++ f2b (3); ++ if (a[0] != (4LL << 12) || a[1] != (3LL << 12) ++ || a[2] != (2LL << 14) || a[3] != (1LL << 14)) ++ abort (); + return 0; + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c +new file mode 100644 +index 00000000000..8ff66714e9b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_DOT(TYPE1, TYPE2) \ ++TYPE1 __attribute__ ((noinline, noclone)) \ ++dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \ ++{ \ ++ TYPE1 sum = 0; \ ++ for (int i = 0; i < n; i++) \ ++ { \ ++ sum += x[i] * y[i]; \ ++ } \ ++ return sum; \ ++} ++ ++DEF_DOT(uint32_t, uint8_t) ++DEF_DOT(int32_t, int8_t) ++DEF_DOT(int64_t, int16_t) ++ ++/* The uint16_t->uint64_t dot product requires a casting to satisfy the C ++ language rules. 
*/ ++uint64_t __attribute__ ((noinline, noclone)) ++dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n) ++{ ++ uint64_t sum = 0; ++ for (int i = 0; i < n; i++) ++ { ++ sum += (unsigned int)x[i] * y[i]; ++ } ++ return sum; ++} ++ ++/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c +index 5c04bcdb3f5..51925fa8f50 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c +@@ -17,3 +17,4 @@ f (double *restrict a, double *restrict b, double *restrict c, + + /* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + /* { dg-final { scan-assembler-not {\tfmad\t} } } */ ++/* { dg-final { scan-assembler-times {\tst1d} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c +new file mode 100644 +index 00000000000..78c70b2be32 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c +@@ -0,0 +1,90 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define MASK_SLP_2(TYPE_COND, ALT_VAL) \ ++void __attribute__ ((noinline, noclone)) \ ++mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, \ ++ TYPE_COND *restrict z, int n) \ ++{ \ ++ for (int i = 0; i < n; i += 2) \ ++ { \ ++ x[i] = y[i] ? z[i] : 1; \ ++ x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ ++ } \ ++} ++ ++#define MASK_SLP_4(TYPE_COND, ALT_VAL) \ ++void __attribute__ ((noinline, noclone)) \ ++mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, \ ++ TYPE_COND *restrict z, int n) \ ++{ \ ++ for (int i = 0; i < n; i += 4) \ ++ { \ ++ x[i] = y[i] ? z[i] : 1; \ ++ x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ ++ x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ ++ x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \ ++ } \ ++} ++ ++#define MASK_SLP_8(TYPE_COND, ALT_VAL) \ ++void __attribute__ ((noinline, noclone)) \ ++mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, \ ++ TYPE_COND *restrict z, int n) \ ++{ \ ++ for (int i = 0; i < n; i += 8) \ ++ { \ ++ x[i] = y[i] ? z[i] : 1; \ ++ x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ ++ x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ ++ x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \ ++ x[i + 4] = y[i + 4] ? z[i + 4] : 1; \ ++ x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL; \ ++ x[i + 6] = y[i + 6] ? z[i + 6] : 1; \ ++ x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL; \ ++ } \ ++} ++ ++#define MASK_SLP_FAIL(TYPE_COND) \ ++void __attribute__ ((noinline, noclone)) \ ++mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y, \ ++ TYPE_COND *restrict z, int n) \ ++{ \ ++ for (int i = 0; i < n; i += 2) \ ++ { \ ++ x[i] = y[i] ? z[i] : 1; \ ++ x[i + 1] = y[i + 1] ? 
z[i + 1] : x[z[i + 1]]; \ ++ } \ ++} ++ ++MASK_SLP_2(int8_t, 1) ++MASK_SLP_2(int8_t, 2) ++MASK_SLP_2(int, 1) ++MASK_SLP_2(int, 2) ++MASK_SLP_2(int64_t, 1) ++MASK_SLP_2(int64_t, 2) ++ ++MASK_SLP_4(int8_t, 1) ++MASK_SLP_4(int8_t, 2) ++MASK_SLP_4(int, 1) ++MASK_SLP_4(int, 2) ++MASK_SLP_4(int64_t, 1) ++MASK_SLP_4(int64_t, 2) ++ ++MASK_SLP_8(int8_t, 1) ++MASK_SLP_8(int8_t, 2) ++MASK_SLP_8(int, 1) ++MASK_SLP_8(int, 2) ++MASK_SLP_8(int64_t, 1) ++MASK_SLP_8(int64_t, 2) ++ ++MASK_SLP_FAIL(int8_t) ++MASK_SLP_FAIL(int) ++MASK_SLP_FAIL(int64_t) ++ ++/* { dg-final { scan-assembler-not {\tld2w\t} } } */ ++/* { dg-final { scan-assembler-not {\tst2w\t} } } */ ++/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */ ++/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c +index a258344b0a9..f152d04b473 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c +@@ -105,8 +105,8 @@ reduc_##NAME##_##TYPE (TYPE *a, int n) \ + + TEST_BITWISE (DEF_REDUC_BITWISE) + +-/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +@@ -157,8 +157,8 @@ TEST_BITWISE (DEF_REDUC_BITWISE) + /* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + /* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c +index 376a453fc73..0640cba8e0f 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c +@@ -116,8 +116,8 @@ reduc_##NAME##TYPE (TYPE (*restrict a)[NUM_ELEMS(TYPE)], \ + + TEST_BITWISE (DEF_REDUC_BITWISE) + +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ + 
/* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c +index ff535942331..cced4ad488e 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c +@@ -23,16 +23,12 @@ REDUC (uint64_t) + REDUC (float) + REDUC (double) + +-/* XFAILed until we support sub-int reductions for signed types. */ +-/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m} 2 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m} 2 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m} 1 } } */ +-/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m} 2 } } */ + /* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m} 2 } } */ + /* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m} 2 } } */ + /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m} 1 } } */ + /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m} 1 } } */ + +-/* XFAILed until we support sub-int reductions for signed types. */ +-/* { dg-final { scan-assembler-times {\tsub\t} 8 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tsub\t} 8 } } */ + /* { dg-final { scan-assembler-times {\tfsub\t} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c +index 3913b8848c0..dec4c87e54d 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c +@@ -15,6 +15,5 @@ reduc (int *restrict a, int *restrict b, int *restrict c) + } + + /* { dg-final { scan-assembler-times {\tcmpne\tp[0-9]+\.s, } 1 } } */ +-/* We ought to use the CMPNE result for the SEL too. 
*/ +-/* { dg-final { scan-assembler-not {\tcmpeq\tp[0-9]+\.s, } { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tcmpeq\tp[0-9]+\.s, } } } */ + /* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, } 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c +index a718e9d2ebf..83ebec50bc6 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c +@@ -1,10 +1,7 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -ftree-vectorize -fno-inline -msve-vector-bits=256 -fdump-tree-vect-details" } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ + +-double mat[100][4]; +-double mat2[100][8]; +-double mat3[100][12]; +-double mat4[100][3]; ++double mat[100][2]; + + double + slp_reduc_plus (int n) +@@ -14,115 +11,8 @@ slp_reduc_plus (int n) + { + tmp = tmp + mat[i][0]; + tmp = tmp + mat[i][1]; +- tmp = tmp + mat[i][2]; +- tmp = tmp + mat[i][3]; + } + return tmp; + } + +-double +-slp_reduc_plus2 (int n) +-{ +- double tmp = 0.0; +- for (int i = 0; i < n; i++) +- { +- tmp = tmp + mat2[i][0]; +- tmp = tmp + mat2[i][1]; +- tmp = tmp + mat2[i][2]; +- tmp = tmp + mat2[i][3]; +- tmp = tmp + mat2[i][4]; +- tmp = tmp + mat2[i][5]; +- tmp = tmp + mat2[i][6]; +- tmp = tmp + mat2[i][7]; +- } +- return tmp; +-} +- +-double +-slp_reduc_plus3 (int n) +-{ +- double tmp = 0.0; +- for (int i = 0; i < n; i++) +- { +- tmp = tmp + mat3[i][0]; +- tmp = tmp + mat3[i][1]; +- tmp = tmp + mat3[i][2]; +- tmp = tmp + mat3[i][3]; +- tmp = tmp + mat3[i][4]; +- tmp = tmp + mat3[i][5]; +- tmp = tmp + mat3[i][6]; +- tmp = tmp + mat3[i][7]; +- tmp = tmp + mat3[i][8]; +- tmp = tmp + mat3[i][9]; +- tmp = tmp + mat3[i][10]; +- tmp = tmp + mat3[i][11]; +- } +- return tmp; +-} +- +-void +-slp_non_chained_reduc (int n, double * restrict out) +-{ +- for (int i = 0; i < 3; i++) +- out[i] = 0; +- +- for (int i = 0; i < n; i++) +- { +- out[0] = out[0] + mat4[i][0]; +- out[1] = out[1] + mat4[i][1]; +- out[2] = out[2] + mat4[i][2]; +- } +-} +- +-/* Strict FP reductions shouldn't be used for the outer loops, only the +- inner loops. */ +- +-float +-double_reduc1 (float (*restrict i)[16]) +-{ +- float l = 0; +- +- for (int a = 0; a < 8; a++) +- for (int b = 0; b < 8; b++) +- l += i[b][a]; +- return l; +-} +- +-float +-double_reduc2 (float *restrict i) +-{ +- float l = 0; +- +- for (int a = 0; a < 8; a++) +- for (int b = 0; b < 16; b++) +- { +- l += i[b * 4]; +- l += i[b * 4 + 1]; +- l += i[b * 4 + 2]; +- l += i[b * 4 + 3]; +- } +- return l; +-} +- +-float +-double_reduc3 (float *restrict i, float *restrict j) +-{ +- float k = 0, l = 0; +- +- for (int a = 0; a < 8; a++) +- for (int b = 0; b < 8; b++) +- { +- k += i[b]; +- l += j[b]; +- } +- return l * k; +-} +- +-/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 4 } } */ +-/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */ +-/* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3. Each one +- is reported three times, once for SVE, once for 128-bit AdvSIMD and once +- for 64-bit AdvSIMD. */ +-/* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */ +-/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3. 
*/ +-/* { dg-final { scan-tree-dump-times "Detected reduction" 10 "vect" } } */ ++/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c +index 0b2a7ad57e3..37b5f1148a3 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c +@@ -32,7 +32,6 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ + + TEST_ALL (VEC_PERM) + +-/* ??? We don't treat the int8_t and int16_t loops as reductions. */ + /* ??? We don't treat the uint loops as SLP. */ + /* The loop should be fully-masked. */ + /* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */ +@@ -41,15 +40,15 @@ TEST_ALL (VEC_PERM) + /* { dg-final { scan-assembler-times {\tld1w\t} 2 } } */ + /* { dg-final { scan-assembler-times {\tld1d\t} 3 { xfail *-*-* } } } */ + /* { dg-final { scan-assembler-times {\tld1d\t} 2 } } */ +-/* { dg-final { scan-assembler-not {\tldr} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tldr} } } */ + +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ + +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c +index b75edc69e2d..6a199d00659 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c +@@ -33,34 +33,24 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \ + + TEST_ALL (VEC_PERM) + +-/* ??? We don't think it's worth using SLP for the 64-bit loops and fall +- back to the less efficient non-SLP implementation instead. */ +-/* ??? At present we don't treat the int8_t and int16_t loops as +- reductions. 
*/ +-/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */ +-/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */ ++/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ ++/* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1d\t} 3 } } */ + /* { dg-final { scan-assembler-not {\tld2b\t} } } */ + /* { dg-final { scan-assembler-not {\tld2h\t} } } */ + /* { dg-final { scan-assembler-not {\tld2w\t} } } */ + /* { dg-final { scan-assembler-not {\tld2d\t} } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */ + /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */ + /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */ + /* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */ + +-/* Should be 4 and 6 respectively, if we used reductions for int8_t and +- int16_t. */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ + +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c +index 9e6aa8ccbf8..19207207999 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c +@@ -31,45 +31,27 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ +- T (int64_t) \ +- T (uint64_t) \ + T (_Float16) \ +- T (float) \ +- T (double) ++ T (float) + + TEST_ALL (VEC_PERM) + +-/* We can't use SLP for the 64-bit loops, since the number of reduction +- results might be greater than the number of elements in the vector. +- Otherwise we have two loads per loop, one for the initial vector +- and one for the loop body. */ +-/* ??? At present we don't treat the int8_t and int16_t loops as +- reductions. */ +-/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */ +-/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */ ++/* We have two loads per loop, one for the initial vector and one for ++ the loop body. 
*/ ++/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ ++/* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */ +-/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */ + /* { dg-final { scan-assembler-not {\tld4b\t} } } */ + /* { dg-final { scan-assembler-not {\tld4h\t} } } */ + /* { dg-final { scan-assembler-not {\tld4w\t} } } */ +-/* { dg-final { scan-assembler-not {\tld1d\t} } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 } } */ ++/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 } } */ + /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */ +-/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */ + /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ + /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */ +-/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */ + +-/* Should be 4 and 6 respectively, if we used reductions for int8_t and +- int16_t. */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ + + /* { dg-final { scan-assembler-not {\tuqdec} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c +index 68baba9e965..40ff2d561a8 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c +@@ -15,12 +15,9 @@ f (TYPE *x, TYPE *y, unsigned short n, l + /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ + /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ + /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +-/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ +-/* { dg-final { scan-assembler-not {, 1024} } } */ +-/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ +-/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ +-/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +-/* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ +-/* Two range checks and a check for n being zero. */ +-/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +-/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ ++/* Should use a WAR check that multiplies by (VF-2)*4 rather than ++ an overlap check that multiplies by (257-1)*4. */ ++/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ ++/* One range check and a check for n being zero. 
*/ ++/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 1 } } */ ++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +index 30f6d2691b8..b8afea70207 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +@@ -15,7 +15,7 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) + /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ + /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ + /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +-/* Should multiply by (257-1)*4 rather than (VF-1)*4. */ ++/* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4. */ + /* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */ + /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c +index 70792ff9f33..5ab6859ad4e 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c +@@ -15,13 +15,10 @@ f (TYPE *x, TYPE *y, int n, long m __attribute__((unused))) + /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ + /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ + /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +-/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ +-/* { dg-final { scan-assembler-not {, 1024} } } */ +-/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ +-/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ +-/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ +-/* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ +-/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ +-/* Two range checks and a check for n being zero. */ +-/* { dg-final { scan-assembler {\tcmp\t} } } */ +-/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ ++/* Should use a WAR check that multiplies by (VF-2)*4 rather than ++ an overlap check that multiplies by (257-1)*4. */ ++/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ ++/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+[^\n]*xzr} 1 } } */ ++/* One range check and a check for n being zero. */ ++/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ ++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c +index 688f3be61d7..93c114193e9 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c +@@ -15,13 +15,10 @@ f (TYPE *x, TYPE *y, long n, long m __attribute__((unused))) + /* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ + /* { dg-final { scan-assembler {\tldr\td[0-9]+} } } */ + /* { dg-final { scan-assembler {\tstr\td[0-9]+} } } */ +-/* Should multiply by (VF-1)*8 rather than (257-1)*8. 
*/ +-/* { dg-final { scan-assembler-not {, 2048} } } */ +-/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ +-/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]11} } } */ +-/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ +-/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ +-/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ +-/* Two range checks and a check for n being zero. */ +-/* { dg-final { scan-assembler {\tcmp\t} } } */ +-/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ ++/* Should use a WAR check that multiplies by (VF-2)*8 rather than ++ an overlap check that multiplies by (257-1)*4. */ ++/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #16\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ ++/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+[^\n]*xzr} 1 } } */ ++/* One range check and a check for n being zero. */ ++/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ ++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c +index 00d84760a19..b38f23e87ba 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c +@@ -98,24 +98,24 @@ TEST_CMP (nugt) + /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ + + /* 5 for lt, 5 for ult and 5 for nult. */ +-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ ++/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ + + /* 5 for le, 5 for ule and 5 for nule. */ +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ + + /* 5 for gt, 5 for ugt and 5 for nugt. */ +-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ ++/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ + + /* 5 for ge, 5 for uge and 5 for nuge. 
*/ +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ + + /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */ + /* 3 loops * 5 invocations for all 12 unordered comparisons. */ +-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */ + + /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */ + /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */ +@@ -123,19 +123,19 @@ TEST_CMP (nugt) + /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ + /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ + +-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ ++/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ + +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ + +-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ ++/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ + +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ + + /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */ + /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, + for all 12 unordered comparisons. 
*/ +-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c +index 23bfb7b2649..2f16fbff522 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c +@@ -19,16 +19,16 @@ + /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */ + + /* 5 for le, 5 for ule and 5 for nule. */ +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */ + + /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt. */ + /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */ + /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */ + + /* 5 for ge, 5 for uge and 5 for nuge. */ +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */ + + /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */ + /* 3 loops * 5 invocations for ordered, unordered amd ueq. 
*/ +@@ -43,14 +43,14 @@ + /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */ + /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */ + +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */ ++/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */ + + /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */ + /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */ + +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */ +-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */ ++/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */ + + /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */ + /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_1.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_1.c +new file mode 100644 +index 00000000000..fe490cfbf3f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_1.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int64_t *x, int64_t *y, int32_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] += y[i]; ++ z[i] += z[i - 2]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2s,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_10.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_10.c +new file mode 100644 +index 00000000000..81e77a8bb04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_10.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int16_t *x, int16_t *y, uint8_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 8]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_11.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_11.c +new file mode 100644 +index 00000000000..d9da6c1f12a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_11.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int32_t *x, int64_t *y, int64_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 2]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\txtn\tv[0-9]+\.2s, v[0-9]+\.2d\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */ +diff 
--git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_12.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_12.c +new file mode 100644 +index 00000000000..80dab8bf55f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_12.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int16_t *x, int32_t *y, int32_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 4]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\txtn\tv[0-9]+\.4h, v[0-9]+\.4s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_13.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_13.c +new file mode 100644 +index 00000000000..655fa7d4bf1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_13.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int8_t *x, int16_t *y, int16_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 8]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\txtn\tv[0-9]+\.8b, v[0-9]+\.8h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_2.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_2.c +new file mode 100644 +index 00000000000..1fe69cad259 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_2.c +@@ -0,0 +1,19 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int32_t *x, int32_t *y, int16_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] += y[i]; ++ z[i] += z[i - 4]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4h,} 1 } } */ ++/* { dg-final { scan-assembler-not {\tadd\tv[0-9]+\.2s,} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c +new file mode 100644 +index 00000000000..1290772216e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c +@@ -0,0 +1,19 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int16_t *x, int16_t *y, int8_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] += y[i]; ++ z[i] += z[i - 8]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8b,} 1 } } */ ++/* { dg-final { scan-assembler-not {\tadd\tv[0-9]+\.4h,} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_4.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_4.c +new file mode 100644 +index 00000000000..768ea8c7164 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_4.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int64_t *x, int64_t *y, int8_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] += y[i]; ++ z[i] += z[i - 8]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8b,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_5.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_5.c +new file 
mode 100644 +index 00000000000..ca8a65a16e7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_5.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int64_t *x, int64_t *y, int32_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 2]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.2d, v[0-9]+\.2s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c +new file mode 100644 +index 00000000000..6c09b5b146b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int32_t *x, int32_t *y, int16_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 4]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c +new file mode 100644 +index 00000000000..94a66c545ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int16_t *x, int16_t *y, int8_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 8]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_8.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_8.c +new file mode 100644 +index 00000000000..9531966c294 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_8.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int64_t *x, int64_t *y, uint32_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 2]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.2d, v[0-9]+\.2s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_9.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_9.c +new file mode 100644 +index 00000000000..de8f6988685 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_9.c +@@ -0,0 +1,18 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#pragma GCC target "+nosve" ++ ++#include ++ ++void ++f (int32_t *x, int32_t *y, uint16_t *z, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ x[i] = z[i]; ++ y[i] += y[i - 4]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c +index ae2f8611ea6..9d926ca5dfe 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with 
scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -mavx -mtune=generic -dp" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c +index 2a105601c71..51765900fcf 100644 +--- a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c ++++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ + /* { dg-require-effective-target avx2 } */ + +diff --git a/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c b/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c +index a26aa6529e8..4de04511934 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c ++++ b/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ /* PR59617 */ + /* { dg-options "-O3 -mavx512f -fdump-tree-vect-details -mtune=knl" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c b/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c +index 2bb9c5c090b..946117d9d30 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c ++++ b/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -mavx512f -mtune=knl" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c +new file mode 100644 +index 00000000000..235fb917e17 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-fopenmp-simd -O2 -mavx512f -masm=att" } */ ++/* { dg-final { scan-assembler "vpadd\[^\n\r]*%xmm" } } */ ++/* { dg-final { scan-assembler "vpadd\[^\n\r]*%ymm" } } */ ++/* { dg-final { scan-assembler "vpadd\[^\n\r]*%zmm" } } */ ++ ++#define N 1024 ++int a[N]; ++ ++void ++f1 (void) ++{ ++ int i; ++ #pragma omp simd simdlen (4) ++ for (i = 0; i < N; ++i) ++ a[i] = a[i] + 1; ++} ++ ++void ++f2 (void) ++{ ++ int i; ++ #pragma omp simd simdlen (8) ++ for (i = 0; i < N; ++i) ++ a[i] = a[i] + 2; ++} ++ ++void ++f3 (void) ++{ ++ int i; ++ #pragma omp simd simdlen (16) ++ for (i = 0; i < N; ++i) ++ a[i] = a[i] + 3; ++} +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_1.c b/gcc/testsuite/gcc.target/i386/l_fma_double_1.c +index e5bcdabcf79..2472fb016ee 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_double_1.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_double_1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_2.c b/gcc/testsuite/gcc.target/i386/l_fma_double_2.c +index dbd078abc81..3d569733b1e 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_double_2.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_double_2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_3.c b/gcc/testsuite/gcc.target/i386/l_fma_double_3.c +index d0844f208e5..8e5ec4150cc 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_double_3.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_double_3.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_4.c b/gcc/testsuite/gcc.target/i386/l_fma_double_4.c +index b9498a0ff13..0d2a0408d0b 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_double_4.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_double_4.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_5.c b/gcc/testsuite/gcc.target/i386/l_fma_double_5.c +index 0292ba040a3..fcf1a6ceac1 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_double_5.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_double_5.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_6.c b/gcc/testsuite/gcc.target/i386/l_fma_double_6.c +index a716006eda8..650e608117f 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_double_6.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_double_6.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_1.c b/gcc/testsuite/gcc.target/i386/l_fma_float_1.c +index b386b83e39a..c29198ba666 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_float_1.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_float_1.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_2.c b/gcc/testsuite/gcc.target/i386/l_fma_float_2.c +index 81193b2d8b1..cb38b77344f 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_float_2.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_float_2.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_3.c b/gcc/testsuite/gcc.target/i386/l_fma_float_3.c +index d86cb904357..10a350e9e10 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_float_3.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_float_3.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_4.c b/gcc/testsuite/gcc.target/i386/l_fma_float_4.c +index 68ca8388d70..020e5d86f35 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_float_4.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_float_4.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_5.c b/gcc/testsuite/gcc.target/i386/l_fma_float_5.c +index 4db4749c024..3ff23c17aab 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_float_5.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_float_5.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_6.c b/gcc/testsuite/gcc.target/i386/l_fma_float_6.c +index 0b86e6256bd..34671baa28a 100644 +--- a/gcc/testsuite/gcc.target/i386/l_fma_float_6.c ++++ b/gcc/testsuite/gcc.target/i386/l_fma_float_6.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ + +diff --git a/gcc/testsuite/gcc.target/i386/mask-pack.c b/gcc/testsuite/gcc.target/i386/mask-pack.c +index 0b564ef4284..a607dfb460c 100644 +--- a/gcc/testsuite/gcc.target/i386/mask-pack.c ++++ b/gcc/testsuite/gcc.target/i386/mask-pack.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. 
*/ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-mavx512bw -O3 -fopenmp-simd -fdump-tree-vect-details" } */ + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/mask-unpack.c b/gcc/testsuite/gcc.target/i386/mask-unpack.c +index 4291480cfff..ca71ea2e29d 100644 +--- a/gcc/testsuite/gcc.target/i386/mask-unpack.c ++++ b/gcc/testsuite/gcc.target/i386/mask-unpack.c +@@ -1,3 +1,5 @@ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + /* { dg-do compile } */ + /* { dg-options "-mavx512bw -mavx512dq -mno-stackrealign -O3 -fopenmp-simd -fdump-tree-vect-details" } */ + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr90358.c b/gcc/testsuite/gcc.target/i386/pr90358.c +new file mode 100644 +index 00000000000..4894fdbd079 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr90358.c +@@ -0,0 +1,35 @@ ++/* PR target/90358 */ ++/* { dg-do run { target { sse4_runtime } } } */ ++/* { dg-options "-O3 -msse4" } */ ++ ++struct s { unsigned int a, b, c; }; ++ ++void __attribute__ ((noipa)) ++foo (struct s *restrict s1, struct s *restrict s2, int n) ++{ ++ for (int i = 0; i < n; ++i) ++ { ++ s1[i].b = s2[i].b; ++ s1[i].c = s2[i].c; ++ s2[i].c = 0; ++ } ++} ++ ++#define N 12 ++ ++int ++main () ++{ ++ struct s s1[N], s2[N]; ++ for (unsigned int j = 0; j < N; ++j) ++ { ++ s2[j].a = j * 5; ++ s2[j].b = j * 5 + 2; ++ s2[j].c = j * 5 + 4; ++ } ++ foo (s1, s2, N); ++ for (unsigned int j = 0; j < N; ++j) ++ if (s1[j].b != j * 5 + 2) ++ __builtin_abort (); ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr91033.c b/gcc/testsuite/gcc.target/i386/pr91033.c +new file mode 100644 +index 00000000000..43d99d5a7dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr91033.c +@@ -0,0 +1,15 @@ ++/* PR tree-optimization/91033 */ ++/* { dg-do compile { target pthread } } */ ++/* { dg-options "-march=knl -O2 -fopenmp-simd -ftree-parallelize-loops=2" } */ ++ ++#define N 1024 ++int a[N]; ++ ++void ++foo (void) ++{ ++ int i; ++ #pragma omp simd simdlen (4) ++ for (i = 0; i < N; ++i) ++ a[i] = a[i] + 1; ++} +diff --git a/gcc/testsuite/gfortran.dg/vect/vect-4.f90 b/gcc/testsuite/gfortran.dg/vect/vect-4.f90 +index b567cbd8644..c2eeafd3900 100644 +--- a/gcc/testsuite/gfortran.dg/vect/vect-4.f90 ++++ b/gcc/testsuite/gfortran.dg/vect/vect-4.f90 +@@ -1,3 +1,5 @@ ++! Disabling epilogues until we find a better way to deal with scans. ++! { dg-additional-options "--param vect-epilogues-nomask=0" } + ! { dg-do compile } + ! { dg-require-effective-target vect_float } + ! { dg-additional-options "--param vect-max-peeling-for-alignment=0" } +diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +index 0ac5f1c390b..1c243308476 100644 +--- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 ++++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +@@ -704,5 +704,6 @@ CALL track('KERNEL ') + RETURN + END SUBROUTINE kernel + +-! { dg-final { scan-tree-dump-times "vectorized 22 loops" 1 "vect" { target vect_intdouble_cvt } } } +-! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { ! vect_intdouble_cvt } } } } ++! { dg-final { scan-tree-dump-times "vectorized 23 loops" 1 "vect" { target aarch64*-*-* } } } ++! 
{ dg-final { scan-tree-dump-times "vectorized 22 loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } ++! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } } +diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c +index f8aeec9bae8..621c8ea3dad 100644 +--- a/gcc/tree-cfg.c ++++ b/gcc/tree-cfg.c +@@ -3557,6 +3557,24 @@ verify_gimple_assign_unary (gassign *stmt) + { + CASE_CONVERT: + { ++ /* Allow conversions between vectors with the same number of elements, ++ provided that the conversion is OK for the element types too. */ ++ if (VECTOR_TYPE_P (lhs_type) ++ && VECTOR_TYPE_P (rhs1_type) ++ && known_eq (TYPE_VECTOR_SUBPARTS (lhs_type), ++ TYPE_VECTOR_SUBPARTS (rhs1_type))) ++ { ++ lhs_type = TREE_TYPE (lhs_type); ++ rhs1_type = TREE_TYPE (rhs1_type); ++ } ++ else if (VECTOR_TYPE_P (lhs_type) || VECTOR_TYPE_P (rhs1_type)) ++ { ++ error ("invalid vector types in nop conversion"); ++ debug_generic_expr (lhs_type); ++ debug_generic_expr (rhs1_type); ++ return true; ++ } ++ + /* Allow conversions from pointer type to integral type only if + there is no sign or zero extension involved. + For targets were the precision of ptrofftype doesn't match that +diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c +index d00c1bd31e6..c95dd204870 100644 +--- a/gcc/tree-data-ref.c ++++ b/gcc/tree-data-ref.c +@@ -1287,7 +1287,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt, + return dr; + } + +-/* A helper function computes order between two tree epxressions T1 and T2. ++/* A helper function computes order between two tree expressions T1 and T2. + This is used in comparator functions sorting objects based on the order + of tree expressions. The function returns -1, 0, or 1. */ + +@@ -1454,6 +1454,54 @@ comp_dr_with_seg_len_pair (const void *pa_, const void *pb_) + return 0; + } + ++/* Dump information about ALIAS_PAIR, indenting each line by INDENT. */ ++ ++static void ++dump_alias_pair (dr_with_seg_len_pair_t *alias_pair, const char *indent) ++{ ++ dump_printf (MSG_NOTE, "%sreference: %T vs. %T\n", indent, ++ DR_REF (alias_pair->first.dr), ++ DR_REF (alias_pair->second.dr)); ++ ++ dump_printf (MSG_NOTE, "%ssegment length: %T", indent, ++ alias_pair->first.seg_len); ++ if (!operand_equal_p (alias_pair->first.seg_len, ++ alias_pair->second.seg_len, 0)) ++ dump_printf (MSG_NOTE, " vs. %T", alias_pair->second.seg_len); ++ ++ dump_printf (MSG_NOTE, "\n%saccess size: ", indent); ++ dump_dec (MSG_NOTE, alias_pair->first.access_size); ++ if (maybe_ne (alias_pair->first.access_size, alias_pair->second.access_size)) ++ { ++ dump_printf (MSG_NOTE, " vs. "); ++ dump_dec (MSG_NOTE, alias_pair->second.access_size); ++ } ++ ++ dump_printf (MSG_NOTE, "\n%salignment: %d", indent, ++ alias_pair->first.align); ++ if (alias_pair->first.align != alias_pair->second.align) ++ dump_printf (MSG_NOTE, " vs. 
%d", alias_pair->second.align); ++ ++ dump_printf (MSG_NOTE, "\n%sflags: ", indent); ++ if (alias_pair->flags & DR_ALIAS_RAW) ++ dump_printf (MSG_NOTE, " RAW"); ++ if (alias_pair->flags & DR_ALIAS_WAR) ++ dump_printf (MSG_NOTE, " WAR"); ++ if (alias_pair->flags & DR_ALIAS_WAW) ++ dump_printf (MSG_NOTE, " WAW"); ++ if (alias_pair->flags & DR_ALIAS_ARBITRARY) ++ dump_printf (MSG_NOTE, " ARBITRARY"); ++ if (alias_pair->flags & DR_ALIAS_SWAPPED) ++ dump_printf (MSG_NOTE, " SWAPPED"); ++ if (alias_pair->flags & DR_ALIAS_UNSWAPPED) ++ dump_printf (MSG_NOTE, " UNSWAPPED"); ++ if (alias_pair->flags & DR_ALIAS_MIXED_STEPS) ++ dump_printf (MSG_NOTE, " MIXED_STEPS"); ++ if (alias_pair->flags == 0) ++ dump_printf (MSG_NOTE, " "); ++ dump_printf (MSG_NOTE, "\n"); ++} ++ + /* Merge alias checks recorded in ALIAS_PAIRS and remove redundant ones. + FACTOR is number of iterations that each data reference is accessed. + +@@ -1488,19 +1536,50 @@ void + prune_runtime_alias_test_list (vec *alias_pairs, + poly_uint64) + { ++ if (alias_pairs->is_empty ()) ++ return; ++ ++ /* Canonicalize each pair so that the base components are ordered wrt ++ data_ref_compare_tree. This allows the loop below to merge more ++ cases. */ ++ unsigned int i; ++ dr_with_seg_len_pair_t *alias_pair; ++ FOR_EACH_VEC_ELT (*alias_pairs, i, alias_pair) ++ { ++ data_reference_p dr_a = alias_pair->first.dr; ++ data_reference_p dr_b = alias_pair->second.dr; ++ int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), ++ DR_BASE_ADDRESS (dr_b)); ++ if (comp_res == 0) ++ comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); ++ if (comp_res == 0) ++ comp_res = data_ref_compare_tree (DR_INIT (dr_a), DR_INIT (dr_b)); ++ if (comp_res > 0) ++ { ++ std::swap (alias_pair->first, alias_pair->second); ++ alias_pair->flags |= DR_ALIAS_SWAPPED; ++ } ++ else ++ alias_pair->flags |= DR_ALIAS_UNSWAPPED; ++ } ++ + /* Sort the collected data ref pairs so that we can scan them once to + combine all possible aliasing checks. */ + alias_pairs->qsort (comp_dr_with_seg_len_pair); + + /* Scan the sorted dr pairs and check if we can combine alias checks + of two neighboring dr pairs. */ +- for (size_t i = 1; i < alias_pairs->length (); ++i) ++ unsigned int last = 0; ++ for (i = 1; i < alias_pairs->length (); ++i) + { + /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ +- dr_with_seg_len *dr_a1 = &(*alias_pairs)[i-1].first, +- *dr_b1 = &(*alias_pairs)[i-1].second, +- *dr_a2 = &(*alias_pairs)[i].first, +- *dr_b2 = &(*alias_pairs)[i].second; ++ dr_with_seg_len_pair_t *alias_pair1 = &(*alias_pairs)[last]; ++ dr_with_seg_len_pair_t *alias_pair2 = &(*alias_pairs)[i]; ++ ++ dr_with_seg_len *dr_a1 = &alias_pair1->first; ++ dr_with_seg_len *dr_b1 = &alias_pair1->second; ++ dr_with_seg_len *dr_a2 = &alias_pair2->first; ++ dr_with_seg_len *dr_b2 = &alias_pair2->second; + + /* Remove duplicate data ref pairs. */ + if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) +@@ -1509,10 +1588,16 @@ prune_runtime_alias_test_list (vec *alias_pairs, + dump_printf (MSG_NOTE, "found equal ranges %T, %T and %T, %T\n", + DR_REF (dr_a1->dr), DR_REF (dr_b1->dr), + DR_REF (dr_a2->dr), DR_REF (dr_b2->dr)); +- alias_pairs->ordered_remove (i--); ++ alias_pair1->flags |= alias_pair2->flags; + continue; + } + ++ /* Assume that we won't be able to merge the pairs, then correct ++ if we do. 
*/ ++ last += 1; ++ if (last != i) ++ (*alias_pairs)[last] = (*alias_pairs)[i]; ++ + if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) + { + /* We consider the case that DR_B1 and DR_B2 are same memrefs, +@@ -1538,13 +1623,6 @@ prune_runtime_alias_test_list (vec *alias_pairs, + if (!ordered_p (init_a1, init_a2)) + continue; + +- /* Make sure dr_a1 starts left of dr_a2. */ +- if (maybe_gt (init_a1, init_a2)) +- { +- std::swap (*dr_a1, *dr_a2); +- std::swap (init_a1, init_a2); +- } +- + /* Work out what the segment length would be if we did combine + DR_A1 and DR_A2: + +@@ -1561,7 +1639,10 @@ prune_runtime_alias_test_list (vec *alias_pairs, + + The lengths both have sizetype, so the sign is taken from + the step instead. */ +- if (!operand_equal_p (dr_a1->seg_len, dr_a2->seg_len, 0)) ++ poly_uint64 new_seg_len = 0; ++ bool new_seg_len_p = !operand_equal_p (dr_a1->seg_len, ++ dr_a2->seg_len, 0); ++ if (new_seg_len_p) + { + poly_uint64 seg_len_a1, seg_len_a2; + if (!poly_int_tree_p (dr_a1->seg_len, &seg_len_a1) +@@ -1579,14 +1660,29 @@ prune_runtime_alias_test_list (vec *alias_pairs, + int sign_a = tree_int_cst_sgn (indicator_a); + int sign_b = tree_int_cst_sgn (indicator_b); + +- poly_uint64 new_seg_len; + if (sign_a <= 0 && sign_b <= 0) + new_seg_len = lower_bound (seg_len_a1, seg_len_a2); + else if (sign_a >= 0 && sign_b >= 0) + new_seg_len = upper_bound (seg_len_a1, seg_len_a2); + else + continue; ++ } ++ /* At this point we're committed to merging the refs. */ + ++ /* Make sure dr_a1 starts left of dr_a2. */ ++ if (maybe_gt (init_a1, init_a2)) ++ { ++ std::swap (*dr_a1, *dr_a2); ++ std::swap (init_a1, init_a2); ++ } ++ ++ /* The DR_Bs are equal, so only the DR_As can introduce ++ mixed steps. */ ++ if (!operand_equal_p (DR_STEP (dr_a1->dr), DR_STEP (dr_a2->dr), 0)) ++ alias_pair1->flags |= DR_ALIAS_MIXED_STEPS; ++ ++ if (new_seg_len_p) ++ { + dr_a1->seg_len = build_int_cst (TREE_TYPE (dr_a1->seg_len), + new_seg_len); + dr_a1->align = MIN (dr_a1->align, known_alignment (new_seg_len)); +@@ -1608,17 +1704,40 @@ prune_runtime_alias_test_list (vec *alias_pairs, + dump_printf (MSG_NOTE, "merging ranges for %T, %T and %T, %T\n", + DR_REF (dr_a1->dr), DR_REF (dr_b1->dr), + DR_REF (dr_a2->dr), DR_REF (dr_b2->dr)); +- alias_pairs->ordered_remove (i); +- i--; ++ alias_pair1->flags |= alias_pair2->flags; ++ last -= 1; + } + } ++ alias_pairs->truncate (last + 1); ++ ++ /* Try to restore the original dr_with_seg_len order within each ++ dr_with_seg_len_pair_t. If we ended up combining swapped and ++ unswapped pairs into the same check, we have to invalidate any ++ RAW, WAR and WAW information for it. */ ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "merged alias checks:\n"); ++ FOR_EACH_VEC_ELT (*alias_pairs, i, alias_pair) ++ { ++ unsigned int swap_mask = (DR_ALIAS_SWAPPED | DR_ALIAS_UNSWAPPED); ++ unsigned int swapped = (alias_pair->flags & swap_mask); ++ if (swapped == DR_ALIAS_SWAPPED) ++ std::swap (alias_pair->first, alias_pair->second); ++ else if (swapped != DR_ALIAS_UNSWAPPED) ++ alias_pair->flags |= DR_ALIAS_ARBITRARY; ++ alias_pair->flags &= ~swap_mask; ++ if (dump_enabled_p ()) ++ dump_alias_pair (alias_pair, " "); ++ } + } + +-/* Given LOOP's two data references and segment lengths described by DR_A +- and DR_B, create expression checking if the two addresses ranges intersect +- with each other based on index of the two addresses. This can only be +- done if DR_A and DR_B referring to the same (array) object and the index +- is the only difference. 
For example: ++/* Try to generate a runtime condition that is true if ALIAS_PAIR is ++ free of aliases, using a condition based on index values instead ++ of a condition based on addresses. Return true on success, ++ storing the condition in *COND_EXPR. ++ ++ This can only be done if the two data references in ALIAS_PAIR access ++ the same array object and the index is the only difference. For example, ++ if the two data references are DR_A and DR_B: + + DR_A DR_B + data-ref arr[i] arr[j] +@@ -1635,16 +1754,20 @@ prune_runtime_alias_test_list (vec *alias_pairs, + + We can create expression based on index rather than address: + +- (i_0 + 4 < j_0 || j_0 + 4 < i_0) ++ (unsigned) (i_0 - j_0 + 3) <= 6 ++ ++ i.e. the indices are less than 4 apart. + + Note evolution step of index needs to be considered in comparison. */ + + static bool + create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, +- const dr_with_seg_len& dr_a, +- const dr_with_seg_len& dr_b) ++ const dr_with_seg_len_pair_t &alias_pair) + { +- if (integer_zerop (DR_STEP (dr_a.dr)) ++ const dr_with_seg_len &dr_a = alias_pair.first; ++ const dr_with_seg_len &dr_b = alias_pair.second; ++ if ((alias_pair.flags & DR_ALIAS_MIXED_STEPS) ++ || integer_zerop (DR_STEP (dr_a.dr)) + || integer_zerop (DR_STEP (dr_b.dr)) + || DR_NUM_DIMENSIONS (dr_a.dr) != DR_NUM_DIMENSIONS (dr_b.dr)) + return false; +@@ -1670,15 +1793,8 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, + if (neg_step) + { + abs_step = -abs_step; +- seg_len1 = -seg_len1; +- seg_len2 = -seg_len2; +- } +- else +- { +- /* Include the access size in the length, so that we only have one +- tree addition below. */ +- seg_len1 += dr_a.access_size; +- seg_len2 += dr_b.access_size; ++ seg_len1 = (-wi::to_poly_wide (dr_a.seg_len)).force_uhwi (); ++ seg_len2 = (-wi::to_poly_wide (dr_b.seg_len)).force_uhwi (); + } + + /* Infer the number of iterations with which the memory segment is accessed +@@ -1692,16 +1808,15 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, + || !can_div_trunc_p (seg_len2 + abs_step - 1, abs_step, &niter_len2)) + return false; + +- poly_uint64 niter_access1 = 0, niter_access2 = 0; +- if (neg_step) +- { +- /* Divide each access size by the byte step, rounding up. */ +- if (!can_div_trunc_p (dr_a.access_size - abs_step - 1, +- abs_step, &niter_access1) +- || !can_div_trunc_p (dr_b.access_size + abs_step - 1, +- abs_step, &niter_access2)) +- return false; +- } ++ /* Divide each access size by the byte step, rounding up. */ ++ poly_uint64 niter_access1, niter_access2; ++ if (!can_div_trunc_p (dr_a.access_size + abs_step - 1, ++ abs_step, &niter_access1) ++ || !can_div_trunc_p (dr_b.access_size + abs_step - 1, ++ abs_step, &niter_access2)) ++ return false; ++ ++ bool waw_or_war_p = (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) == 0; + + unsigned int i; + for (i = 0; i < DR_NUM_DIMENSIONS (dr_a.dr); i++) +@@ -1741,44 +1856,298 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, + index of data reference. Like segment length, index length is + linear function of the number of iterations with index_step as + the coefficient, i.e, niter_len * idx_step. 
*/ +- tree idx_len1 = fold_build2 (MULT_EXPR, TREE_TYPE (min1), idx_step, +- build_int_cst (TREE_TYPE (min1), +- niter_len1)); +- tree idx_len2 = fold_build2 (MULT_EXPR, TREE_TYPE (min2), idx_step, +- build_int_cst (TREE_TYPE (min2), +- niter_len2)); +- tree max1 = fold_build2 (PLUS_EXPR, TREE_TYPE (min1), min1, idx_len1); +- tree max2 = fold_build2 (PLUS_EXPR, TREE_TYPE (min2), min2, idx_len2); +- /* Adjust ranges for negative step. */ ++ offset_int abs_idx_step = offset_int::from (wi::to_wide (idx_step), ++ SIGNED); + if (neg_step) +- { +- /* IDX_LEN1 and IDX_LEN2 are negative in this case. */ +- std::swap (min1, max1); +- std::swap (min2, max2); +- +- /* As with the lengths just calculated, we've measured the access +- sizes in iterations, so multiply them by the index step. */ +- tree idx_access1 +- = fold_build2 (MULT_EXPR, TREE_TYPE (min1), idx_step, +- build_int_cst (TREE_TYPE (min1), niter_access1)); +- tree idx_access2 +- = fold_build2 (MULT_EXPR, TREE_TYPE (min2), idx_step, +- build_int_cst (TREE_TYPE (min2), niter_access2)); +- +- /* MINUS_EXPR because the above values are negative. */ +- max1 = fold_build2 (MINUS_EXPR, TREE_TYPE (max1), max1, idx_access1); +- max2 = fold_build2 (MINUS_EXPR, TREE_TYPE (max2), max2, idx_access2); +- } +- tree part_cond_expr +- = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, +- fold_build2 (LE_EXPR, boolean_type_node, max1, min2), +- fold_build2 (LE_EXPR, boolean_type_node, max2, min1)); ++ abs_idx_step = -abs_idx_step; ++ poly_offset_int idx_len1 = abs_idx_step * niter_len1; ++ poly_offset_int idx_len2 = abs_idx_step * niter_len2; ++ poly_offset_int idx_access1 = abs_idx_step * niter_access1; ++ poly_offset_int idx_access2 = abs_idx_step * niter_access2; ++ ++ gcc_assert (known_ge (idx_len1, 0) ++ && known_ge (idx_len2, 0) ++ && known_ge (idx_access1, 0) ++ && known_ge (idx_access2, 0)); ++ ++ /* Each access has the following pattern, with lengths measured ++ in units of INDEX: ++ ++ <-- idx_len --> ++ <--- A: -ve step ---> ++ +-----+-------+-----+-------+-----+ ++ | n-1 | ..... | 0 | ..... | n-1 | ++ +-----+-------+-----+-------+-----+ ++ <--- B: +ve step ---> ++ <-- idx_len --> ++ | ++ min ++ ++ where "n" is the number of scalar iterations covered by the segment ++ and where each access spans idx_access units. ++ ++ A is the range of bytes accessed when the step is negative, ++ B is the range when the step is positive. ++ ++ When checking for general overlap, we need to test whether ++ the range: ++ ++ [min1 + low_offset1, min2 + high_offset1 + idx_access1 - 1] ++ ++ overlaps: ++ ++ [min2 + low_offset2, min2 + high_offset2 + idx_access2 - 1] ++ ++ where: ++ ++ low_offsetN = +ve step ? 0 : -idx_lenN; ++ high_offsetN = +ve step ? idx_lenN : 0; ++ ++ This is equivalent to testing whether: ++ ++ min1 + low_offset1 <= min2 + high_offset2 + idx_access2 - 1 ++ && min2 + low_offset2 <= min1 + high_offset1 + idx_access1 - 1 ++ ++ Converting this into a single test, there is an overlap if: ++ ++ 0 <= min2 - min1 + bias <= limit ++ ++ where bias = high_offset2 + idx_access2 - 1 - low_offset1 ++ limit = (high_offset1 - low_offset1 + idx_access1 - 1) ++ + (high_offset2 - low_offset2 + idx_access2 - 1) ++ i.e. limit = idx_len1 + idx_access1 - 1 + idx_len2 + idx_access2 - 1 ++ ++ Combining the tests requires limit to be computable in an unsigned ++ form of the index type; if it isn't, we fall back to the usual ++ pointer-based checks. 
++ ++ We can do better if DR_B is a write and if DR_A and DR_B are ++ well-ordered in both the original and the new code (see the ++ comment above the DR_ALIAS_* flags for details). In this case ++ we know that for each i in [0, n-1], the write performed by ++ access i of DR_B occurs after access numbers j<=i of DR_A in ++ both the original and the new code. Any write or anti ++ dependencies wrt those DR_A accesses are therefore maintained. ++ ++ We just need to make sure that each individual write in DR_B does not ++ overlap any higher-indexed access in DR_A; such DR_A accesses happen ++ after the DR_B access in the original code but happen before it in ++ the new code. ++ ++ We know the steps for both accesses are equal, so by induction, we ++ just need to test whether the first write of DR_B overlaps a later ++ access of DR_A. In other words, we need to move min1 along by ++ one iteration: ++ ++ min1' = min1 + idx_step ++ ++ and use the ranges: ++ ++ [min1' + low_offset1', min1' + high_offset1' + idx_access1 - 1] ++ ++ and: ++ ++ [min2, min2 + idx_access2 - 1] ++ ++ where: ++ ++ low_offset1' = +ve step ? 0 : -(idx_len1 - |idx_step|) ++ high_offset1' = +ve_step ? idx_len1 - |idx_step| : 0. */ ++ if (waw_or_war_p) ++ idx_len1 -= abs_idx_step; ++ ++ poly_offset_int limit = idx_len1 + idx_access1 - 1 + idx_access2 - 1; ++ if (!waw_or_war_p) ++ limit += idx_len2; ++ ++ tree utype = unsigned_type_for (TREE_TYPE (min1)); ++ if (!wi::fits_to_tree_p (limit, utype)) ++ return false; ++ ++ poly_offset_int low_offset1 = neg_step ? -idx_len1 : 0; ++ poly_offset_int high_offset2 = neg_step || waw_or_war_p ? 0 : idx_len2; ++ poly_offset_int bias = high_offset2 + idx_access2 - 1 - low_offset1; ++ /* Equivalent to adding IDX_STEP to MIN1. */ ++ if (waw_or_war_p) ++ bias -= wi::to_offset (idx_step); ++ ++ tree subject = fold_build2 (MINUS_EXPR, utype, ++ fold_convert (utype, min2), ++ fold_convert (utype, min1)); ++ subject = fold_build2 (PLUS_EXPR, utype, subject, ++ wide_int_to_tree (utype, bias)); ++ tree part_cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, ++ wide_int_to_tree (utype, limit)); + if (*cond_expr) + *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + *cond_expr, part_cond_expr); + else + *cond_expr = part_cond_expr; + } ++ if (dump_enabled_p ()) ++ { ++ if (waw_or_war_p) ++ dump_printf (MSG_NOTE, "using an index-based WAR/WAW test\n"); ++ else ++ dump_printf (MSG_NOTE, "using an index-based overlap test\n"); ++ } ++ return true; ++} ++ ++/* A subroutine of create_intersect_range_checks, with a subset of the ++ same arguments. Try to optimize cases in which the second access ++ is a write and in which some overlap is valid. */ ++ ++static bool ++create_waw_or_war_checks (tree *cond_expr, ++ const dr_with_seg_len_pair_t &alias_pair) ++{ ++ const dr_with_seg_len& dr_a = alias_pair.first; ++ const dr_with_seg_len& dr_b = alias_pair.second; ++ ++ /* Check for cases in which: ++ ++ (a) DR_B is always a write; ++ (b) the accesses are well-ordered in both the original and new code ++ (see the comment above the DR_ALIAS_* flags for details); and ++ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ ++ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) ++ return false; ++ ++ /* Check for equal (but possibly variable) steps. */ ++ tree step = DR_STEP (dr_a.dr); ++ if (!operand_equal_p (step, DR_STEP (dr_b.dr))) ++ return false; ++ ++ /* Make sure that we can operate on sizetype without loss of precision. 
*/ ++ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); ++ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) ++ return false; ++ ++ /* All addresses involved are known to have a common alignment ALIGN. ++ We can therefore subtract ALIGN from an exclusive endpoint to get ++ an inclusive endpoint. In the best (and common) case, ALIGN is the ++ same as the access sizes of both DRs, and so subtracting ALIGN ++ cancels out the addition of an access size. */ ++ unsigned int align = MIN (dr_a.align, dr_b.align); ++ poly_uint64 last_chunk_a = dr_a.access_size - align; ++ poly_uint64 last_chunk_b = dr_b.access_size - align; ++ ++ /* Get a boolean expression that is true when the step is negative. */ ++ tree indicator = dr_direction_indicator (dr_a.dr); ++ tree neg_step = fold_build2 (LT_EXPR, boolean_type_node, ++ fold_convert (ssizetype, indicator), ++ ssize_int (0)); ++ ++ /* Get lengths in sizetype. */ ++ tree seg_len_a ++ = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); ++ step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); ++ ++ /* Each access has the following pattern: ++ ++ <- |seg_len| -> ++ <--- A: -ve step ---> ++ +-----+-------+-----+-------+-----+ ++ | n-1 | ..... | 0 | ..... | n-1 | ++ +-----+-------+-----+-------+-----+ ++ <--- B: +ve step ---> ++ <- |seg_len| -> ++ | ++ base address ++ ++ where "n" is the number of scalar iterations covered by the segment. ++ ++ A is the range of bytes accessed when the step is negative, ++ B is the range when the step is positive. ++ ++ We know that DR_B is a write. We also know (from checking that ++ DR_A and DR_B are well-ordered) that for each i in [0, n-1], ++ the write performed by access i of DR_B occurs after access numbers ++ j<=i of DR_A in both the original and the new code. Any write or ++ anti dependencies wrt those DR_A accesses are therefore maintained. ++ ++ We just need to make sure that each individual write in DR_B does not ++ overlap any higher-indexed access in DR_A; such DR_A accesses happen ++ after the DR_B access in the original code but happen before it in ++ the new code. ++ ++ We know the steps for both accesses are equal, so by induction, we ++ just need to test whether the first write of DR_B overlaps a later ++ access of DR_A. In other words, we need to move addr_a along by ++ one iteration: ++ ++ addr_a' = addr_a + step ++ ++ and check whether: ++ ++ [addr_b, addr_b + last_chunk_b] ++ ++ overlaps: ++ ++ [addr_a' + low_offset_a, addr_a' + high_offset_a + last_chunk_a] ++ ++ where [low_offset_a, high_offset_a] spans accesses [1, n-1]. I.e.: ++ ++ low_offset_a = +ve step ? 0 : seg_len_a - step ++ high_offset_a = +ve step ? seg_len_a - step : 0 ++ ++ This is equivalent to testing whether: ++ ++ addr_a' + low_offset_a <= addr_b + last_chunk_b ++ && addr_b <= addr_a' + high_offset_a + last_chunk_a ++ ++ Converting this into a single test, there is an overlap if: ++ ++ 0 <= addr_b + last_chunk_b - addr_a' - low_offset_a <= limit ++ ++ where limit = high_offset_a - low_offset_a + last_chunk_a + last_chunk_b ++ ++ If DR_A is performed, limit + |step| - last_chunk_b is known to be ++ less than the size of the object underlying DR_A. We also know ++ that last_chunk_b <= |step|; this is checked elsewhere if it isn't ++ guaranteed at compile time. There can therefore be no overflow if ++ "limit" is calculated in an unsigned type with pointer precision. 
*/ ++ tree addr_a = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a.dr), ++ DR_OFFSET (dr_a.dr)); ++ addr_a = fold_build_pointer_plus (addr_a, DR_INIT (dr_a.dr)); ++ ++ tree addr_b = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b.dr), ++ DR_OFFSET (dr_b.dr)); ++ addr_b = fold_build_pointer_plus (addr_b, DR_INIT (dr_b.dr)); ++ ++ /* Advance ADDR_A by one iteration and adjust the length to compensate. */ ++ addr_a = fold_build_pointer_plus (addr_a, step); ++ tree seg_len_a_minus_step = fold_build2 (MINUS_EXPR, sizetype, ++ seg_len_a, step); ++ if (!CONSTANT_CLASS_P (seg_len_a_minus_step)) ++ seg_len_a_minus_step = build1 (SAVE_EXPR, sizetype, seg_len_a_minus_step); ++ ++ tree low_offset_a = fold_build3 (COND_EXPR, sizetype, neg_step, ++ seg_len_a_minus_step, size_zero_node); ++ if (!CONSTANT_CLASS_P (low_offset_a)) ++ low_offset_a = build1 (SAVE_EXPR, sizetype, low_offset_a); ++ ++ /* We could use COND_EXPR , ++ but it's usually more efficient to reuse the LOW_OFFSET_A result. */ ++ tree high_offset_a = fold_build2 (MINUS_EXPR, sizetype, seg_len_a_minus_step, ++ low_offset_a); ++ ++ /* The amount added to addr_b - addr_a'. */ ++ tree bias = fold_build2 (MINUS_EXPR, sizetype, ++ size_int (last_chunk_b), low_offset_a); ++ ++ tree limit = fold_build2 (MINUS_EXPR, sizetype, high_offset_a, low_offset_a); ++ limit = fold_build2 (PLUS_EXPR, sizetype, limit, ++ size_int (last_chunk_a + last_chunk_b)); ++ ++ tree subject = fold_build2 (POINTER_DIFF_EXPR, ssizetype, addr_b, addr_a); ++ subject = fold_build2 (PLUS_EXPR, sizetype, ++ fold_convert (sizetype, subject), bias); ++ ++ *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); + return true; + } + +@@ -1866,24 +2235,29 @@ get_segment_min_max (const dr_with_seg_len &d, tree *seg_min_out, + *seg_max_out = fold_build_pointer_plus (addr_base, max_reach); + } + +-/* Given two data references and segment lengths described by DR_A and DR_B, +- create expression checking if the two addresses ranges intersect with +- each other: ++/* Generate a runtime condition that is true if ALIAS_PAIR is free of aliases, ++ storing the condition in *COND_EXPR. The fallback is to generate a ++ a test that the two accesses do not overlap: + +- ((DR_A_addr_0 + DR_A_segment_length_0) <= DR_B_addr_0) +- || (DR_B_addr_0 + DER_B_segment_length_0) <= DR_A_addr_0)) */ ++ end_a <= start_b || end_b <= start_a. */ + + static void + create_intersect_range_checks (struct loop *loop, tree *cond_expr, +- const dr_with_seg_len& dr_a, +- const dr_with_seg_len& dr_b) ++ const dr_with_seg_len_pair_t &alias_pair) + { ++ const dr_with_seg_len& dr_a = alias_pair.first; ++ const dr_with_seg_len& dr_b = alias_pair.second; + *cond_expr = NULL_TREE; +- if (create_intersect_range_checks_index (loop, cond_expr, dr_a, dr_b)) ++ if (create_intersect_range_checks_index (loop, cond_expr, alias_pair)) ++ return; ++ ++ if (create_waw_or_war_checks (cond_expr, alias_pair)) + return; + + unsigned HOST_WIDE_INT min_align; + tree_code cmp_code; ++ /* We don't have to check DR_ALIAS_MIXED_STEPS here, since both versions ++ are equivalent. This is just an optimization heuristic. 
*/ + if (TREE_CODE (DR_STEP (dr_a.dr)) == INTEGER_CST + && TREE_CODE (DR_STEP (dr_b.dr)) == INTEGER_CST) + { +@@ -1924,6 +2298,8 @@ create_intersect_range_checks (struct loop *loop, tree *cond_expr, + = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, + fold_build2 (cmp_code, boolean_type_node, seg_a_max, seg_b_min), + fold_build2 (cmp_code, boolean_type_node, seg_b_max, seg_a_min)); ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "using an address-based overlap test\n"); + } + + /* Create a conditional expression that represents the run-time checks for +@@ -1940,18 +2316,19 @@ create_runtime_alias_checks (struct loop *loop, + tree part_cond_expr; + + fold_defer_overflow_warnings (); +- for (size_t i = 0, s = alias_pairs->length (); i < s; ++i) ++ dr_with_seg_len_pair_t *alias_pair; ++ unsigned int i; ++ FOR_EACH_VEC_ELT (*alias_pairs, i, alias_pair) + { +- const dr_with_seg_len& dr_a = (*alias_pairs)[i].first; +- const dr_with_seg_len& dr_b = (*alias_pairs)[i].second; +- ++ gcc_assert (alias_pair->flags); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, + "create runtime check for data references %T and %T\n", +- DR_REF (dr_a.dr), DR_REF (dr_b.dr)); ++ DR_REF (alias_pair->first.dr), ++ DR_REF (alias_pair->second.dr)); + + /* Create condition expression for each pair data references. */ +- create_intersect_range_checks (loop, &part_cond_expr, dr_a, dr_b); ++ create_intersect_range_checks (loop, &part_cond_expr, *alias_pair); + if (*cond_expr) + *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + *cond_expr, part_cond_expr); +diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h +index 70cbb03b49c..9cb48a2ea3e 100644 +--- a/gcc/tree-data-ref.h ++++ b/gcc/tree-data-ref.h +@@ -221,19 +221,113 @@ struct dr_with_seg_len + unsigned int align; + }; + ++/* Flags that describe a potential alias between two dr_with_seg_lens. ++ In general, each pair of dr_with_seg_lens represents a composite of ++ multiple access pairs P, so testing flags like DR_IS_READ on the DRs ++ does not give meaningful information. ++ ++ DR_ALIAS_RAW: ++ There is a pair in P for which the second reference is a read ++ and the first is a write. ++ ++ DR_ALIAS_WAR: ++ There is a pair in P for which the second reference is a write ++ and the first is a read. ++ ++ DR_ALIAS_WAW: ++ There is a pair in P for which both references are writes. ++ ++ DR_ALIAS_ARBITRARY: ++ Either ++ (a) it isn't possible to classify one pair in P as RAW, WAW or WAR; or ++ (b) there is a pair in P that breaks the ordering assumption below. ++ ++ This flag overrides the RAW, WAR and WAW flags above. ++ ++ DR_ALIAS_UNSWAPPED: ++ DR_ALIAS_SWAPPED: ++ Temporary flags that indicate whether there is a pair P whose ++ DRs have or haven't been swapped around. ++ ++ DR_ALIAS_MIXED_STEPS: ++ The DR_STEP for one of the data references in the pair does not ++ accurately describe that reference for all members of P. (Note ++ that the flag does not say anything about whether the DR_STEPs ++ of the two references in the pair are the same.) 
++ ++ The ordering assumption mentioned above is that for every pair ++ (DR_A, DR_B) in P: ++ ++ (1) The original code accesses n elements for DR_A and n elements for DR_B, ++ interleaved as follows: ++ ++ one access of size DR_A.access_size at DR_A.dr ++ one access of size DR_B.access_size at DR_B.dr ++ one access of size DR_A.access_size at DR_A.dr + STEP_A ++ one access of size DR_B.access_size at DR_B.dr + STEP_B ++ one access of size DR_A.access_size at DR_A.dr + STEP_A * 2 ++ one access of size DR_B.access_size at DR_B.dr + STEP_B * 2 ++ ... ++ ++ (2) The new code accesses the same data in exactly two chunks: ++ ++ one group of accesses spanning |DR_A.seg_len| + DR_A.access_size ++ one group of accesses spanning |DR_B.seg_len| + DR_B.access_size ++ ++ A pair might break this assumption if the DR_A and DR_B accesses ++ in the original or the new code are mingled in some way. For example, ++ if DR_A.access_size represents the effect of two individual writes ++ to nearby locations, the pair breaks the assumption if those writes ++ occur either side of the access for DR_B. ++ ++ Note that DR_ALIAS_ARBITRARY describes whether the ordering assumption ++ fails to hold for any individual pair in P. If the assumption *does* ++ hold for every pair in P, it doesn't matter whether it holds for the ++ composite pair or not. In other words, P should represent the complete ++ set of pairs that the composite pair is testing, so only the ordering ++ of two accesses in the same member of P matters. */ ++const unsigned int DR_ALIAS_RAW = 1U << 0; ++const unsigned int DR_ALIAS_WAR = 1U << 1; ++const unsigned int DR_ALIAS_WAW = 1U << 2; ++const unsigned int DR_ALIAS_ARBITRARY = 1U << 3; ++const unsigned int DR_ALIAS_SWAPPED = 1U << 4; ++const unsigned int DR_ALIAS_UNSWAPPED = 1U << 5; ++const unsigned int DR_ALIAS_MIXED_STEPS = 1U << 6; ++ + /* This struct contains two dr_with_seg_len objects with aliasing data + refs. Two comparisons are generated from them. */ + + struct dr_with_seg_len_pair_t + { +- dr_with_seg_len_pair_t (const dr_with_seg_len& d1, +- const dr_with_seg_len& d2) +- : first (d1), second (d2) {} ++ /* WELL_ORDERED indicates that the ordering assumption described above ++ DR_ALIAS_ARBITRARY holds. REORDERED indicates that it doesn't. */ ++ enum sequencing { WELL_ORDERED, REORDERED }; ++ ++ dr_with_seg_len_pair_t (const dr_with_seg_len &, ++ const dr_with_seg_len &, sequencing); + + dr_with_seg_len first; + dr_with_seg_len second; ++ unsigned int flags; + }; + ++inline dr_with_seg_len_pair_t:: ++dr_with_seg_len_pair_t (const dr_with_seg_len &d1, const dr_with_seg_len &d2, ++ sequencing seq) ++ : first (d1), second (d2), flags (0) ++{ ++ if (DR_IS_READ (d1.dr) && DR_IS_WRITE (d2.dr)) ++ flags |= DR_ALIAS_WAR; ++ else if (DR_IS_WRITE (d1.dr) && DR_IS_READ (d2.dr)) ++ flags |= DR_ALIAS_RAW; ++ else if (DR_IS_WRITE (d1.dr) && DR_IS_WRITE (d2.dr)) ++ flags |= DR_ALIAS_WAW; ++ else ++ gcc_unreachable (); ++ if (seq == REORDERED) ++ flags |= DR_ALIAS_ARBITRARY; ++} ++ + enum data_dependence_direction { + dir_positive, + dir_negative, +diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c +index 2780a4b243f..bd946e14eb6 100644 +--- a/gcc/tree-if-conv.c ++++ b/gcc/tree-if-conv.c +@@ -120,6 +120,7 @@ along with GCC; see the file COPYING3. If not see + #include "fold-const.h" + #include "tree-ssa-sccvn.h" + #include "tree-cfgcleanup.h" ++#include "tree-ssa-dse.h" + + /* Only handle PHIs with no more arguments unless we are asked to by + simd pragma. 
*/ +@@ -2884,7 +2885,7 @@ ifcvt_split_critical_edges (struct loop *loop, bool aggressive_if_conv) + loop vectorization. */ + + static void +-ifcvt_local_dce (basic_block bb) ++ifcvt_local_dce (class loop *loop) + { + gimple *stmt; + gimple *stmt1; +@@ -2901,6 +2902,10 @@ ifcvt_local_dce (basic_block bb) + replace_uses_by (name_pair->first, name_pair->second); + redundant_ssa_names.release (); + ++ /* The loop has a single BB only. */ ++ basic_block bb = loop->header; ++ tree latch_vdef = NULL_TREE; ++ + worklist.create (64); + /* Consider all phi as live statements. */ + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) +@@ -2908,6 +2913,8 @@ ifcvt_local_dce (basic_block bb) + phi = gsi_stmt (gsi); + gimple_set_plf (phi, GF_PLF_2, true); + worklist.safe_push (phi); ++ if (virtual_operand_p (gimple_phi_result (phi))) ++ latch_vdef = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); + } + /* Consider load/store statements, CALL and COND as live. */ + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) +@@ -2971,6 +2978,19 @@ ifcvt_local_dce (basic_block bb) + while (!gsi_end_p (gsi)) + { + stmt = gsi_stmt (gsi); ++ if (gimple_store_p (stmt)) ++ { ++ tree lhs = gimple_get_lhs (stmt); ++ ao_ref write; ++ ao_ref_init (&write, lhs); ++ ++ if (dse_classify_store (&write, stmt, false, NULL, NULL, latch_vdef) ++ == DSE_STORE_DEAD) ++ delete_dead_or_redundant_assignment (&gsi, "dead"); ++ gsi_next (&gsi); ++ continue; ++ } ++ + if (gimple_plf (stmt, GF_PLF_2)) + { + gsi_next (&gsi); +@@ -3071,9 +3091,6 @@ tree_if_conversion (struct loop *loop, vec *preds) + on-the-fly. */ + combine_blocks (loop); + +- /* Delete dead predicate computations. */ +- ifcvt_local_dce (loop->header); +- + /* Perform local CSE, this esp. helps the vectorizer analysis if loads + and stores are involved. CSE only the loop body, not the entry + PHIs, those are to be kept in sync with the non-if-converted copy. +@@ -3082,6 +3099,9 @@ tree_if_conversion (struct loop *loop, vec *preds) + bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index); + bitmap_set_bit (exit_bbs, loop->latch->index); + todo |= do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs); ++ ++ /* Delete dead predicate computations. */ ++ ifcvt_local_dce (loop); + BITMAP_FREE (exit_bbs); + + todo |= TODO_cleanup_cfg; +diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c +index d115fcb1a5b..2fbcd6e3e46 100644 +--- a/gcc/tree-inline.c ++++ b/gcc/tree-inline.c +@@ -6201,11 +6201,11 @@ tree_function_versioning (tree old_decl, tree new_decl, + in the debug info that var (whole DECL_ORIGIN is the parm + PARM_DECL) is optimized away, but could be looked up at the + call site as value of D#X there. 
*/ +- tree var = vars, vexpr; ++ tree vexpr; + gimple_stmt_iterator cgsi + = gsi_after_labels (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun))); + gimple *def_temp; +- var = vars; ++ tree var = vars; + i = vec_safe_length (*debug_args); + do + { +diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c +index 8959f52a67b..a002bcd57b2 100644 +--- a/gcc/tree-loop-distribution.c ++++ b/gcc/tree-loop-distribution.c +@@ -2445,12 +2445,6 @@ compute_alias_check_pairs (struct loop *loop, vec *alias_ddrs, + struct data_reference *dr_a = DDR_A (ddr); + struct data_reference *dr_b = DDR_B (ddr); + tree seg_length_a, seg_length_b; +- int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), +- DR_BASE_ADDRESS (dr_b)); +- +- if (comp_res == 0) +- comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); +- gcc_assert (comp_res != 0); + + if (latch_dominated_by_data_ref (loop, dr_a)) + seg_length_a = data_ref_segment_size (dr_a, niters_plus_one); +@@ -2471,11 +2465,9 @@ compute_alias_check_pairs (struct loop *loop, vec *alias_ddrs, + + dr_with_seg_len_pair_t dr_with_seg_len_pair + (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), +- dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b)); +- +- /* Canonicalize pairs by sorting the two DR members. */ +- if (comp_res > 0) +- std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); ++ dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b), ++ /* ??? Would WELL_ORDERED be safe? */ ++ dr_with_seg_len_pair_t::REORDERED); + + comp_alias_pairs->safe_push (dr_with_seg_len_pair); + } +diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c +index dad6e2884db..e841da66db5 100644 +--- a/gcc/tree-parloops.c ++++ b/gcc/tree-parloops.c +@@ -88,7 +88,8 @@ along with GCC; see the file COPYING3. If not see + More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC */ + /* + Reduction handling: +- currently we use vect_force_simple_reduction() to detect reduction patterns. ++ currently we use code inspired by vect_force_simple_reduction to detect ++ reduction patterns. + The code transformation will be introduced by an example. + + +@@ -182,6 +183,717 @@ parloop + + */ + ++/* Error reporting helper for parloops_is_simple_reduction below. GIMPLE ++ statement STMT is printed with a message MSG. */ ++ ++static void ++report_ploop_op (dump_flags_t msg_type, gimple *stmt, const char *msg) ++{ ++ dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); ++} ++ ++/* DEF_STMT_INFO occurs in a loop that contains a potential reduction ++ operation. Return true if the results of DEF_STMT_INFO are something ++ that can be accumulated by such a reduction. */ ++ ++static bool ++parloops_valid_reduction_input_p (stmt_vec_info def_stmt_info) ++{ ++ return (is_gimple_assign (def_stmt_info->stmt) ++ || is_gimple_call (def_stmt_info->stmt) ++ || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def ++ || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI ++ && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def ++ && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt)))); ++} ++ ++/* Detect SLP reduction of the form: ++ ++ #a1 = phi ++ a2 = operation (a1) ++ a3 = operation (a2) ++ a4 = operation (a3) ++ a5 = operation (a4) ++ ++ #a = phi ++ ++ PHI is the reduction phi node (#a1 = phi above) ++ FIRST_STMT is the first reduction stmt in the chain ++ (a2 = operation (a1)). ++ ++ Return TRUE if a reduction chain was detected. 
*/ ++ ++static bool ++parloops_is_slp_reduction (loop_vec_info loop_info, gimple *phi, ++ gimple *first_stmt) ++{ ++ class loop *loop = (gimple_bb (phi))->loop_father; ++ class loop *vect_loop = LOOP_VINFO_LOOP (loop_info); ++ enum tree_code code; ++ gimple *loop_use_stmt = NULL; ++ stmt_vec_info use_stmt_info; ++ tree lhs; ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ int nloop_uses, size = 0, n_out_of_loop_uses; ++ bool found = false; ++ ++ if (loop != vect_loop) ++ return false; ++ ++ auto_vec reduc_chain; ++ lhs = PHI_RESULT (phi); ++ code = gimple_assign_rhs_code (first_stmt); ++ while (1) ++ { ++ nloop_uses = 0; ++ n_out_of_loop_uses = 0; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ ++ /* Check if we got back to the reduction phi. */ ++ if (use_stmt == phi) ++ { ++ loop_use_stmt = use_stmt; ++ found = true; ++ break; ++ } ++ ++ if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) ++ { ++ loop_use_stmt = use_stmt; ++ nloop_uses++; ++ } ++ else ++ n_out_of_loop_uses++; ++ ++ /* There are can be either a single use in the loop or two uses in ++ phi nodes. */ ++ if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) ++ return false; ++ } ++ ++ if (found) ++ break; ++ ++ /* We reached a statement with no loop uses. */ ++ if (nloop_uses == 0) ++ return false; ++ ++ /* This is a loop exit phi, and we haven't reached the reduction phi. */ ++ if (gimple_code (loop_use_stmt) == GIMPLE_PHI) ++ return false; ++ ++ if (!is_gimple_assign (loop_use_stmt) ++ || code != gimple_assign_rhs_code (loop_use_stmt) ++ || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) ++ return false; ++ ++ /* Insert USE_STMT into reduction chain. */ ++ use_stmt_info = loop_info->lookup_stmt (loop_use_stmt); ++ reduc_chain.safe_push (use_stmt_info); ++ ++ lhs = gimple_assign_lhs (loop_use_stmt); ++ size++; ++ } ++ ++ if (!found || loop_use_stmt != phi || size < 2) ++ return false; ++ ++ /* Swap the operands, if needed, to make the reduction operand be the second ++ operand. */ ++ lhs = PHI_RESULT (phi); ++ for (unsigned i = 0; i < reduc_chain.length (); ++i) ++ { ++ gassign *next_stmt = as_a (reduc_chain[i]->stmt); ++ if (gimple_assign_rhs2 (next_stmt) == lhs) ++ { ++ tree op = gimple_assign_rhs1 (next_stmt); ++ stmt_vec_info def_stmt_info = loop_info->lookup_def (op); ++ ++ /* Check that the other def is either defined in the loop ++ ("vect_internal_def"), or it's an induction (defined by a ++ loop-header phi-node). */ ++ if (def_stmt_info ++ && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) ++ && parloops_valid_reduction_input_p (def_stmt_info)) ++ { ++ lhs = gimple_assign_lhs (next_stmt); ++ continue; ++ } ++ ++ return false; ++ } ++ else ++ { ++ tree op = gimple_assign_rhs2 (next_stmt); ++ stmt_vec_info def_stmt_info = loop_info->lookup_def (op); ++ ++ /* Check that the other def is either defined in the loop ++ ("vect_internal_def"), or it's an induction (defined by a ++ loop-header phi-node). 
*/ ++ if (def_stmt_info ++ && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) ++ && parloops_valid_reduction_input_p (def_stmt_info)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G", ++ next_stmt); ++ ++ swap_ssa_operands (next_stmt, ++ gimple_assign_rhs1_ptr (next_stmt), ++ gimple_assign_rhs2_ptr (next_stmt)); ++ update_stmt (next_stmt); ++ ++ if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) ++ LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; ++ } ++ else ++ return false; ++ } ++ ++ lhs = gimple_assign_lhs (next_stmt); ++ } ++ ++ /* Build up the actual chain. */ ++ for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) ++ { ++ REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; ++ REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; ++ } ++ REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; ++ REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; ++ ++ /* Save the chain for further analysis in SLP detection. */ ++ LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); ++ REDUC_GROUP_SIZE (reduc_chain[0]) = size; ++ ++ return true; ++} ++ ++/* Return true if we need an in-order reduction for operation CODE ++ on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer ++ overflow must wrap. */ ++ ++static bool ++parloops_needs_fold_left_reduction_p (tree type, tree_code code, ++ bool need_wrapping_integral_overflow) ++{ ++ /* CHECKME: check for !flag_finite_math_only too? */ ++ if (SCALAR_FLOAT_TYPE_P (type)) ++ switch (code) ++ { ++ case MIN_EXPR: ++ case MAX_EXPR: ++ return false; ++ ++ default: ++ return !flag_associative_math; ++ } ++ ++ if (INTEGRAL_TYPE_P (type)) ++ { ++ if (!operation_no_trapping_overflow (type, code)) ++ return true; ++ if (need_wrapping_integral_overflow ++ && !TYPE_OVERFLOW_WRAPS (type) ++ && operation_can_overflow (code)) ++ return true; ++ return false; ++ } ++ ++ if (SAT_FIXED_POINT_TYPE_P (type)) ++ return true; ++ ++ return false; ++} ++ ++ ++/* Function parloops_is_simple_reduction ++ ++ (1) Detect a cross-iteration def-use cycle that represents a simple ++ reduction computation. We look for the following pattern: ++ ++ loop_header: ++ a1 = phi < a0, a2 > ++ a3 = ... ++ a2 = operation (a3, a1) ++ ++ or ++ ++ a3 = ... ++ loop_header: ++ a1 = phi < a0, a2 > ++ a2 = operation (a3, a1) ++ ++ such that: ++ 1. operation is commutative and associative and it is safe to ++ change the order of the computation ++ 2. no uses for a2 in the loop (a2 is used out of the loop) ++ 3. no uses of a1 in the loop besides the reduction operation ++ 4. no uses of a1 outside the loop. ++ ++ Conditions 1,4 are tested here. ++ Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. ++ ++ (2) Detect a cross-iteration def-use cycle in nested loops, i.e., ++ nested cycles. 
++ ++ (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double ++ reductions: ++ ++ a1 = phi < a0, a2 > ++ inner loop (def of a3) ++ a2 = phi < a3 > ++ ++ (4) Detect condition expressions, ie: ++ for (int i = 0; i < N; i++) ++ if (a[i] < val) ++ ret_val = a[i]; ++ ++*/ ++ ++static stmt_vec_info ++parloops_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, ++ bool *double_reduc, ++ bool need_wrapping_integral_overflow, ++ enum vect_reduction_type *v_reduc_type) ++{ ++ gphi *phi = as_a (phi_info->stmt); ++ class loop *loop = (gimple_bb (phi))->loop_father; ++ class loop *vect_loop = LOOP_VINFO_LOOP (loop_info); ++ bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); ++ gimple *phi_use_stmt = NULL; ++ enum tree_code orig_code, code; ++ tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; ++ tree type; ++ tree name; ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ bool phi_def; ++ ++ *double_reduc = false; ++ *v_reduc_type = TREE_CODE_REDUCTION; ++ ++ tree phi_name = PHI_RESULT (phi); ++ /* ??? If there are no uses of the PHI result the inner loop reduction ++ won't be detected as possibly double-reduction by vectorizable_reduction ++ because that tries to walk the PHI arg from the preheader edge which ++ can be constant. See PR60382. */ ++ if (has_zero_uses (phi_name)) ++ return NULL; ++ unsigned nphi_def_loop_uses = 0; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ ++ if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "intermediate value used outside loop.\n"); ++ ++ return NULL; ++ } ++ ++ nphi_def_loop_uses++; ++ phi_use_stmt = use_stmt; ++ } ++ ++ edge latch_e = loop_latch_edge (loop); ++ tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); ++ if (TREE_CODE (loop_arg) != SSA_NAME) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "reduction: not ssa_name: %T\n", loop_arg); ++ return NULL; ++ } ++ ++ stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); ++ if (!def_stmt_info ++ || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) ++ return NULL; ++ ++ if (gassign *def_stmt = dyn_cast (def_stmt_info->stmt)) ++ { ++ name = gimple_assign_lhs (def_stmt); ++ phi_def = false; ++ } ++ else if (gphi *def_stmt = dyn_cast (def_stmt_info->stmt)) ++ { ++ name = PHI_RESULT (def_stmt); ++ phi_def = true; ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "reduction: unhandled reduction operation: %G", ++ def_stmt_info->stmt); ++ return NULL; ++ } ++ ++ unsigned nlatch_def_loop_uses = 0; ++ auto_vec lcphis; ++ bool inner_loop_of_double_reduc = false; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) ++ nlatch_def_loop_uses++; ++ else ++ { ++ /* We can have more than one loop-closed PHI. */ ++ lcphis.safe_push (as_a (use_stmt)); ++ if (nested_in_vect_loop ++ && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) ++ == vect_double_reduction_def)) ++ inner_loop_of_double_reduc = true; ++ } ++ } ++ ++ /* If this isn't a nested cycle or if the nested cycle reduction value ++ is used ouside of the inner loop we cannot handle uses of the reduction ++ value. 
*/ ++ if ((!nested_in_vect_loop || inner_loop_of_double_reduc) ++ && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "reduction used in loop.\n"); ++ return NULL; ++ } ++ ++ /* If DEF_STMT is a phi node itself, we expect it to have a single argument ++ defined in the inner loop. */ ++ if (phi_def) ++ { ++ gphi *def_stmt = as_a (def_stmt_info->stmt); ++ op1 = PHI_ARG_DEF (def_stmt, 0); ++ ++ if (gimple_phi_num_args (def_stmt) != 1 ++ || TREE_CODE (op1) != SSA_NAME) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "unsupported phi node definition.\n"); ++ ++ return NULL; ++ } ++ ++ gimple *def1 = SSA_NAME_DEF_STMT (op1); ++ if (gimple_bb (def1) ++ && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) ++ && loop->inner ++ && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) ++ && is_gimple_assign (def1) ++ && is_a (phi_use_stmt) ++ && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, ++ "detected double reduction: "); ++ ++ *double_reduc = true; ++ return def_stmt_info; ++ } ++ ++ return NULL; ++ } ++ ++ /* If we are vectorizing an inner reduction we are executing that ++ in the original order only in case we are not dealing with a ++ double reduction. */ ++ bool check_reduction = true; ++ if (flow_loop_nested_p (vect_loop, loop)) ++ { ++ gphi *lcphi; ++ unsigned i; ++ check_reduction = false; ++ FOR_EACH_VEC_ELT (lcphis, i, lcphi) ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) ++ check_reduction = true; ++ } ++ } ++ ++ gassign *def_stmt = as_a (def_stmt_info->stmt); ++ code = orig_code = gimple_assign_rhs_code (def_stmt); ++ ++ if (nested_in_vect_loop && !check_reduction) ++ { ++ /* FIXME: Even for non-reductions code generation is funneled ++ through vectorizable_reduction for the stmt defining the ++ PHI latch value. So we have to artificially restrict ourselves ++ for the supported operations. */ ++ switch (get_gimple_rhs_class (code)) ++ { ++ case GIMPLE_BINARY_RHS: ++ case GIMPLE_TERNARY_RHS: ++ break; ++ default: ++ /* Not supported by vectorizable_reduction. */ ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, ++ "nested cycle: not handled operation: "); ++ return NULL; ++ } ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, "detected nested cycle: "); ++ return def_stmt_info; ++ } ++ ++ /* We can handle "res -= x[i]", which is non-associative by ++ simply rewriting this into "res += -x[i]". Avoid changing ++ gimple instruction for the first simple tests and only do this ++ if we're allowed to change code at all. */ ++ if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) ++ code = PLUS_EXPR; ++ ++ if (code == COND_EXPR) ++ { ++ if (! 
nested_in_vect_loop) ++ *v_reduc_type = COND_REDUCTION; ++ ++ op3 = gimple_assign_rhs1 (def_stmt); ++ if (COMPARISON_CLASS_P (op3)) ++ { ++ op4 = TREE_OPERAND (op3, 1); ++ op3 = TREE_OPERAND (op3, 0); ++ } ++ if (op3 == phi_name || op4 == phi_name) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, ++ "reduction: condition depends on previous" ++ " iteration: "); ++ return NULL; ++ } ++ ++ op1 = gimple_assign_rhs2 (def_stmt); ++ op2 = gimple_assign_rhs3 (def_stmt); ++ } ++ else if (!commutative_tree_code (code) || !associative_tree_code (code)) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, ++ "reduction: not commutative/associative: "); ++ return NULL; ++ } ++ else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) ++ { ++ op1 = gimple_assign_rhs1 (def_stmt); ++ op2 = gimple_assign_rhs2 (def_stmt); ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, ++ "reduction: not handled operation: "); ++ return NULL; ++ } ++ ++ if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, ++ "reduction: both uses not ssa_names: "); ++ ++ return NULL; ++ } ++ ++ type = TREE_TYPE (gimple_assign_lhs (def_stmt)); ++ if ((TREE_CODE (op1) == SSA_NAME ++ && !types_compatible_p (type,TREE_TYPE (op1))) ++ || (TREE_CODE (op2) == SSA_NAME ++ && !types_compatible_p (type, TREE_TYPE (op2))) ++ || (op3 && TREE_CODE (op3) == SSA_NAME ++ && !types_compatible_p (type, TREE_TYPE (op3))) ++ || (op4 && TREE_CODE (op4) == SSA_NAME ++ && !types_compatible_p (type, TREE_TYPE (op4)))) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reduction: multiple types: operation type: " ++ "%T, operands types: %T,%T", ++ type, TREE_TYPE (op1), TREE_TYPE (op2)); ++ if (op3) ++ dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3)); ++ ++ if (op4) ++ dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4)); ++ dump_printf (MSG_NOTE, "\n"); ++ } ++ ++ return NULL; ++ } ++ ++ /* Check whether it's ok to change the order of the computation. ++ Generally, when vectorizing a reduction we change the order of the ++ computation. This may change the behavior of the program in some ++ cases, so we need to check that this is ok. One exception is when ++ vectorizing an outer-loop: the inner-loop is executed sequentially, ++ and therefore vectorizing reductions in the inner-loop during ++ outer-loop vectorization is safe. */ ++ if (check_reduction ++ && *v_reduc_type == TREE_CODE_REDUCTION ++ && parloops_needs_fold_left_reduction_p (type, code, ++ need_wrapping_integral_overflow)) ++ *v_reduc_type = FOLD_LEFT_REDUCTION; ++ ++ /* Reduction is safe. We're dealing with one of the following: ++ 1) integer arithmetic and no trapv ++ 2) floating point arithmetic, and special flags permit this optimization ++ 3) nested cycle (i.e., outer loop vectorization). */ ++ stmt_vec_info def1_info = loop_info->lookup_def (op1); ++ stmt_vec_info def2_info = loop_info->lookup_def (op2); ++ if (code != COND_EXPR && !def1_info && !def2_info) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, ++ "reduction: no defs for operands: "); ++ return NULL; ++ } ++ ++ /* Check that one def is the reduction def, defined by PHI, ++ the other def is either defined in the loop ("vect_internal_def"), ++ or it's an induction (defined by a loop-header phi-node). 
*/ ++ ++ if (def2_info ++ && def2_info->stmt == phi ++ && (code == COND_EXPR ++ || !def1_info ++ || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt)) ++ || parloops_valid_reduction_input_p (def1_info))) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: "); ++ return def_stmt_info; ++ } ++ ++ if (def1_info ++ && def1_info->stmt == phi ++ && (code == COND_EXPR ++ || !def2_info ++ || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt)) ++ || parloops_valid_reduction_input_p (def2_info))) ++ { ++ if (! nested_in_vect_loop && orig_code != MINUS_EXPR) ++ { ++ /* Check if we can swap operands (just for simplicity - so that ++ the rest of the code can assume that the reduction variable ++ is always the last (second) argument). */ ++ if (code == COND_EXPR) ++ { ++ /* Swap cond_expr by inverting the condition. */ ++ tree cond_expr = gimple_assign_rhs1 (def_stmt); ++ enum tree_code invert_code = ERROR_MARK; ++ enum tree_code cond_code = TREE_CODE (cond_expr); ++ ++ if (TREE_CODE_CLASS (cond_code) == tcc_comparison) ++ { ++ bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); ++ invert_code = invert_tree_comparison (cond_code, honor_nans); ++ } ++ if (invert_code != ERROR_MARK) ++ { ++ TREE_SET_CODE (cond_expr, invert_code); ++ swap_ssa_operands (def_stmt, ++ gimple_assign_rhs2_ptr (def_stmt), ++ gimple_assign_rhs3_ptr (def_stmt)); ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, ++ "detected reduction: cannot swap operands " ++ "for cond_expr"); ++ return NULL; ++ } ++ } ++ else ++ swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), ++ gimple_assign_rhs2_ptr (def_stmt)); ++ ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, ++ "detected reduction: need to swap operands: "); ++ ++ if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) ++ LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: "); ++ } ++ ++ return def_stmt_info; ++ } ++ ++ /* Try to find SLP reduction chain. */ ++ if (! nested_in_vect_loop ++ && code != COND_EXPR ++ && orig_code != MINUS_EXPR ++ && parloops_is_slp_reduction (loop_info, phi, def_stmt)) ++ { ++ if (dump_enabled_p ()) ++ report_ploop_op (MSG_NOTE, def_stmt, ++ "reduction: detected reduction chain: "); ++ ++ return def_stmt_info; ++ } ++ ++ /* Look for the expression computing loop_arg from loop PHI result. */ ++ if (check_reduction_path (vect_location, loop, phi, loop_arg, code)) ++ return def_stmt_info; ++ ++ if (dump_enabled_p ()) ++ { ++ report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, ++ "reduction: unknown pattern: "); ++ } ++ ++ return NULL; ++} ++ ++/* Wrapper around vect_is_simple_reduction, which will modify code ++ in-place if it enables detection of more reductions. Arguments ++ as there. 
*/ ++ ++stmt_vec_info ++parloops_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, ++ bool *double_reduc, ++ bool need_wrapping_integral_overflow) ++{ ++ enum vect_reduction_type v_reduc_type; ++ stmt_vec_info def_info ++ = parloops_is_simple_reduction (loop_info, phi_info, double_reduc, ++ need_wrapping_integral_overflow, ++ &v_reduc_type); ++ if (def_info) ++ { ++ STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type; ++ STMT_VINFO_REDUC_DEF (phi_info) = def_info; ++ STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type; ++ STMT_VINFO_REDUC_DEF (def_info) = phi_info; ++ } ++ return def_info; ++} ++ + /* Minimal number of iterations of a loop that should be executed in each + thread. */ + #define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD) +@@ -2614,9 +3326,9 @@ gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list + continue; + + stmt_vec_info reduc_stmt_info +- = vect_force_simple_reduction (simple_loop_info, +- simple_loop_info->lookup_stmt (phi), +- &double_reduc, true); ++ = parloops_force_simple_reduction (simple_loop_info, ++ simple_loop_info->lookup_stmt (phi), ++ &double_reduc, true); + if (!reduc_stmt_info || !valid_reduction_p (reduc_stmt_info)) + continue; + +@@ -2663,9 +3375,9 @@ gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list + stmt_vec_info inner_phi_info + = simple_loop_info->lookup_stmt (inner_phi); + stmt_vec_info inner_reduc_stmt_info +- = vect_force_simple_reduction (simple_loop_info, +- inner_phi_info, +- &double_reduc, true); ++ = parloops_force_simple_reduction (simple_loop_info, ++ inner_phi_info, ++ &double_reduc, true); + gcc_assert (!double_reduc); + if (!inner_reduc_stmt_info + || !valid_reduction_p (inner_reduc_stmt_info)) +diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c +index 01f095382d6..54e8adc8d7c 100644 +--- a/gcc/tree-ssa-alias.c ++++ b/gcc/tree-ssa-alias.c +@@ -2535,13 +2535,36 @@ stmt_kills_ref_p (gimple *stmt, ao_ref *ref) + case BUILT_IN_MEMSET_CHK: + case BUILT_IN_STRNCPY: + case BUILT_IN_STPNCPY: ++ case BUILT_IN_CALLOC: + { + /* For a must-alias check we need to be able to constrain + the access properly. */ + if (!ref->max_size_known_p ()) + return false; +- tree dest = gimple_call_arg (stmt, 0); +- tree len = gimple_call_arg (stmt, 2); ++ tree dest; ++ tree len; ++ ++ /* In execution order a calloc call will never kill ++ anything. However, DSE will (ab)use this interface ++ to ask if a calloc call writes the same memory locations ++ as a later assignment, memset, etc. So handle calloc ++ in the expected way. */ ++ if (DECL_FUNCTION_CODE (callee) == BUILT_IN_CALLOC) ++ { ++ tree arg0 = gimple_call_arg (stmt, 0); ++ tree arg1 = gimple_call_arg (stmt, 1); ++ if (TREE_CODE (arg0) != INTEGER_CST ++ || TREE_CODE (arg1) != INTEGER_CST) ++ return false; ++ ++ dest = gimple_call_lhs (stmt); ++ len = fold_build2 (MULT_EXPR, TREE_TYPE (arg0), arg0, arg1); ++ } ++ else ++ { ++ dest = gimple_call_arg (stmt, 0); ++ len = gimple_call_arg (stmt, 2); ++ } + if (!poly_int_tree_p (len)) + return false; + tree rbase = ref->base; +diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c +index efe5b31cc0a..c20fbe048ed 100644 +--- a/gcc/tree-ssa-dse.c ++++ b/gcc/tree-ssa-dse.c +@@ -1,4 +1,4 @@ +-/* Dead store elimination ++/* Dead and redundant store elimination + Copyright (C) 2004-2019 Free Software Foundation, Inc. + + This file is part of GCC. +@@ -36,17 +36,26 @@ along with GCC; see the file COPYING3. 
If not see + #include "params.h" + #include "alias.h" + #include "tree-ssa-loop.h" ++#include "tree-ssa-dse.h" + + /* This file implements dead store elimination. + + A dead store is a store into a memory location which will later be + overwritten by another store without any intervening loads. In this +- case the earlier store can be deleted. ++ case the earlier store can be deleted or trimmed if the store ++ was partially dead. ++ ++ A redundant store is a store into a memory location which stores ++ the exact same value as a prior store to the same memory location. ++ While this can often be handled by dead store elimination, removing ++ the redundant store is often better than removing or trimming the ++ dead store. + + In our SSA + virtual operand world we use immediate uses of virtual +- operands to detect dead stores. If a store's virtual definition ++ operands to detect these cases. If a store's virtual definition + is used precisely once by a later store to the same location which +- post dominates the first store, then the first store is dead. ++ post dominates the first store, then the first store is dead. If ++ the data stored is the same, then the second store is redundant. + + The single use of the store's virtual definition ensures that + there are no intervening aliased loads and the requirement that +@@ -58,7 +67,9 @@ along with GCC; see the file COPYING3. If not see + the point immediately before the later store. Again, the single + use of the virtual definition and the post-dominance relationship + ensure that such movement would be safe. Clearly if there are +- back to back stores, then the second is redundant. ++ back to back stores, then the second is makes the first dead. If ++ the second store stores the same value, then the second store is ++ redundant. + + Reviewing section 10.7.2 in Morgan's "Building an Optimizing Compiler" + may also help in understanding this code since it discusses the +@@ -66,19 +77,13 @@ along with GCC; see the file COPYING3. If not see + fact, they are the same transformation applied to different views of + the CFG. */ + ++void delete_dead_or_redundant_assignment (gimple_stmt_iterator *, const char *); ++static void delete_dead_or_redundant_call (gimple_stmt_iterator *, const char *); + + /* Bitmap of blocks that have had EH statements cleaned. We should + remove their dead edges eventually. */ + static bitmap need_eh_cleanup; + +-/* Return value from dse_classify_store */ +-enum dse_store_status +-{ +- DSE_STORE_LIVE, +- DSE_STORE_MAYBE_PARTIAL_DEAD, +- DSE_STORE_DEAD +-}; +- + /* STMT is a statement that may write into memory. Analyze it and + initialize WRITE to describe how STMT affects memory. + +@@ -106,6 +111,25 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write) + ao_ref_init_from_ptr_and_size (write, ptr, size); + return true; + } ++ ++ /* A calloc call can never be dead, but it can make ++ subsequent stores redundant if they store 0 into ++ the same memory locations. 
*/ ++ case BUILT_IN_CALLOC: ++ { ++ tree nelem = gimple_call_arg (stmt, 0); ++ tree selem = gimple_call_arg (stmt, 1); ++ if (TREE_CODE (nelem) == INTEGER_CST ++ && TREE_CODE (selem) == INTEGER_CST) ++ { ++ tree lhs = gimple_call_lhs (stmt); ++ tree size = fold_build2 (MULT_EXPR, TREE_TYPE (nelem), ++ nelem, selem); ++ ao_ref_init_from_ptr_and_size (write, lhs, size); ++ return true; ++ } ++ } ++ + default: + break; + } +@@ -551,16 +575,84 @@ check_name (tree, tree *idx, void *data) + return true; + } + ++/* STMT stores the value 0 into one or more memory locations ++ (via memset, empty constructor, calloc call, etc). ++ ++ See if there is a subsequent store of the value 0 to one ++ or more of the same memory location(s). If so, the subsequent ++ store is redundant and can be removed. ++ ++ The subsequent stores could be via memset, empty constructors, ++ simple MEM stores, etc. */ ++ ++static void ++dse_optimize_redundant_stores (gimple *stmt) ++{ ++ int cnt = 0; ++ ++ /* We could do something fairly complex and look through PHIs ++ like DSE_CLASSIFY_STORE, but it doesn't seem to be worth ++ the effort. ++ ++ Look at all the immediate uses of the VDEF (which are obviously ++ dominated by STMT). See if one or more stores 0 into the same ++ memory locations a STMT, if so remove the immediate use statements. */ ++ tree defvar = gimple_vdef (stmt); ++ imm_use_iterator ui; ++ gimple *use_stmt; ++ FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar) ++ { ++ /* Limit stmt walking. */ ++ if (++cnt > PARAM_VALUE (PARAM_DSE_MAX_ALIAS_QUERIES_PER_STORE)) ++ BREAK_FROM_IMM_USE_STMT (ui); ++ ++ /* If USE_STMT stores 0 into one or more of the same locations ++ as STMT and STMT would kill USE_STMT, then we can just remove ++ USE_STMT. */ ++ tree fndecl; ++ if ((is_gimple_assign (use_stmt) ++ && gimple_vdef (use_stmt) ++ && ((gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR ++ && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (use_stmt)) == 0 ++ && !gimple_clobber_p (stmt)) ++ || (gimple_assign_rhs_code (use_stmt) == INTEGER_CST ++ && integer_zerop (gimple_assign_rhs1 (use_stmt))))) ++ || (gimple_call_builtin_p (use_stmt, BUILT_IN_NORMAL) ++ && (fndecl = gimple_call_fndecl (use_stmt)) != NULL ++ && (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET ++ || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET_CHK) ++ && integer_zerop (gimple_call_arg (use_stmt, 1)))) ++ { ++ ao_ref write; ++ ++ if (!initialize_ao_ref_for_dse (use_stmt, &write)) ++ BREAK_FROM_IMM_USE_STMT (ui) ++ ++ if (valid_ao_ref_for_dse (&write) ++ && stmt_kills_ref_p (stmt, &write)) ++ { ++ gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); ++ if (is_gimple_assign (use_stmt)) ++ delete_dead_or_redundant_assignment (&gsi, "redundant"); ++ else if (is_gimple_call (use_stmt)) ++ delete_dead_or_redundant_call (&gsi, "redundant"); ++ else ++ gcc_unreachable (); ++ } ++ } ++ } ++} ++ + /* A helper of dse_optimize_stmt. + Given a GIMPLE_ASSIGN in STMT that writes to REF, classify it + according to downstream uses and defs. Sets *BY_CLOBBER_P to true + if only clobber statements influenced the classification result. + Returns the classification. 
*/ + +-static dse_store_status ++dse_store_status + dse_classify_store (ao_ref *ref, gimple *stmt, + bool byte_tracking_enabled, sbitmap live_bytes, +- bool *by_clobber_p = NULL) ++ bool *by_clobber_p, tree stop_at_vuse) + { + gimple *temp; + int cnt = 0; +@@ -596,6 +688,11 @@ dse_classify_store (ao_ref *ref, gimple *stmt, + } + else + defvar = gimple_vdef (temp); ++ ++ /* If we're instructed to stop walking at region boundary, do so. */ ++ if (defvar == stop_at_vuse) ++ return DSE_STORE_LIVE; ++ + auto_vec defs; + gimple *phi_def = NULL; + FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar) +@@ -763,12 +860,12 @@ private: + + /* Delete a dead call at GSI, which is mem* call of some kind. */ + static void +-delete_dead_call (gimple_stmt_iterator *gsi) ++delete_dead_or_redundant_call (gimple_stmt_iterator *gsi, const char *type) + { + gimple *stmt = gsi_stmt (*gsi); + if (dump_file && (dump_flags & TDF_DETAILS)) + { +- fprintf (dump_file, " Deleted dead call: "); ++ fprintf (dump_file, " Deleted %s call: ", type); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + fprintf (dump_file, "\n"); + } +@@ -796,13 +893,13 @@ delete_dead_call (gimple_stmt_iterator *gsi) + + /* Delete a dead store at GSI, which is a gimple assignment. */ + +-static void +-delete_dead_assignment (gimple_stmt_iterator *gsi) ++void ++delete_dead_or_redundant_assignment (gimple_stmt_iterator *gsi, const char *type) + { + gimple *stmt = gsi_stmt (*gsi); + if (dump_file && (dump_flags & TDF_DETAILS)) + { +- fprintf (dump_file, " Deleted dead store: "); ++ fprintf (dump_file, " Deleted %s store: ", type); + print_gimple_stmt (dump_file, stmt, 0, dump_flags); + fprintf (dump_file, "\n"); + } +@@ -855,7 +952,8 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) + some builtin calls. */ + if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) + { +- switch (DECL_FUNCTION_CODE (gimple_call_fndecl (stmt))) ++ tree fndecl = gimple_call_fndecl (stmt); ++ switch (DECL_FUNCTION_CODE (fndecl)) + { + case BUILT_IN_MEMCPY: + case BUILT_IN_MEMMOVE: +@@ -867,10 +965,18 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) + tree size = gimple_call_arg (stmt, 2); + if (integer_zerop (size)) + { +- delete_dead_call (gsi); ++ delete_dead_or_redundant_call (gsi, "dead"); + return; + } + ++ /* If this is a memset call that initializes an object ++ to zero, it may be redundant with an earlier memset ++ or empty CONSTRUCTOR of a larger object. */ ++ if ((DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET ++ || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET_CHK) ++ && integer_zerop (gimple_call_arg (stmt, 1))) ++ dse_optimize_redundant_stores (stmt); ++ + enum dse_store_status store_status; + m_byte_tracking_enabled + = setup_live_bytes_from_ref (&ref, m_live_bytes); +@@ -887,10 +993,14 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) + } + + if (store_status == DSE_STORE_DEAD) +- delete_dead_call (gsi); ++ delete_dead_or_redundant_call (gsi, "dead"); + return; + } + ++ case BUILT_IN_CALLOC: ++ /* We already know the arguments are integer constants. */ ++ dse_optimize_redundant_stores (stmt); ++ + default: + return; + } +@@ -900,6 +1010,18 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) + { + bool by_clobber_p = false; + ++ /* First see if this store is a CONSTRUCTOR and if there ++ are subsequent CONSTRUCTOR stores which are totally ++ subsumed by this statement. If so remove the subsequent ++ CONSTRUCTOR store. ++ ++ This will tend to make fewer calls into memset with longer ++ arguments. 
*/ ++ if (gimple_assign_rhs_code (stmt) == CONSTRUCTOR ++ && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt)) == 0 ++ && !gimple_clobber_p (stmt)) ++ dse_optimize_redundant_stores (stmt); ++ + /* Self-assignments are zombies. */ + if (operand_equal_p (gimple_assign_rhs1 (stmt), + gimple_assign_lhs (stmt), 0)) +@@ -930,7 +1052,7 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) + && !by_clobber_p) + return; + +- delete_dead_assignment (gsi); ++ delete_dead_or_redundant_assignment (gsi, "dead"); + } + } + +diff --git a/gcc/tree-ssa-dse.h b/gcc/tree-ssa-dse.h +new file mode 100644 +index 00000000000..a5eccbd746d +--- /dev/null ++++ b/gcc/tree-ssa-dse.h +@@ -0,0 +1,36 @@ ++/* Support routines for dead store elimination. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_TREE_SSA_DSE_H ++#define GCC_TREE_SSA_DSE_H ++ ++/* Return value from dse_classify_store */ ++enum dse_store_status ++{ ++ DSE_STORE_LIVE, ++ DSE_STORE_MAYBE_PARTIAL_DEAD, ++ DSE_STORE_DEAD ++}; ++ ++dse_store_status dse_classify_store (ao_ref *, gimple *, bool, sbitmap, ++ bool * = NULL, tree = NULL); ++ ++void delete_dead_or_redundant_assignment (gimple_stmt_iterator *, const char *); ++ ++#endif /* GCC_TREE_SSA_DSE_H */ +diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c +index d241becd481..2d54e13b180 100644 +--- a/gcc/tree-ssa-loop-niter.c ++++ b/gcc/tree-ssa-loop-niter.c +@@ -1928,7 +1928,7 @@ number_of_iterations_cond (struct loop *loop, + + tree + simplify_replace_tree (tree expr, tree old, tree new_tree, +- tree (*valueize) (tree)) ++ tree (*valueize) (tree, void*), void *context) + { + unsigned i, n; + tree ret = NULL_TREE, e, se; +@@ -1944,7 +1944,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree, + { + if (TREE_CODE (expr) == SSA_NAME) + { +- new_tree = valueize (expr); ++ new_tree = valueize (expr, context); + if (new_tree != expr) + return new_tree; + } +@@ -1960,7 +1960,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree, + for (i = 0; i < n; i++) + { + e = TREE_OPERAND (expr, i); +- se = simplify_replace_tree (e, old, new_tree, valueize); ++ se = simplify_replace_tree (e, old, new_tree, valueize, context); + if (e == se) + continue; + +diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h +index dc116489218..fb192d2c250 100644 +--- a/gcc/tree-ssa-loop-niter.h ++++ b/gcc/tree-ssa-loop-niter.h +@@ -53,7 +53,9 @@ extern bool scev_probably_wraps_p (tree, tree, tree, gimple *, + struct loop *, bool); + extern void free_numbers_of_iterations_estimates (struct loop *); + extern void free_numbers_of_iterations_estimates (function *); +-extern tree simplify_replace_tree (tree, tree, tree, tree (*)(tree) = NULL); ++extern tree simplify_replace_tree (tree, tree, ++ tree, tree (*)(tree, void *) = NULL, ++ void * = NULL); + extern void substitute_in_loop_info (struct loop *, tree, tree); + + #endif /* GCC_TREE_SSA_LOOP_NITER_H */ 
+diff --git a/gcc/tree-ssa-loop.c b/gcc/tree-ssa-loop.c +index 00a09508836..551718637f1 100644 +--- a/gcc/tree-ssa-loop.c ++++ b/gcc/tree-ssa-loop.c +@@ -768,9 +768,9 @@ get_lsm_tmp_name (tree ref, unsigned n, const char *suffix) + ns[1] = 0; + lsm_tmp_name_add (ns); + } +- return lsm_tmp_name; + if (suffix != NULL) + lsm_tmp_name_add (suffix); ++ return lsm_tmp_name; + } + + /* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. */ +diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c +index 6794fbde29e..9c1a9a651fe 100644 +--- a/gcc/tree-ssa-reassoc.c ++++ b/gcc/tree-ssa-reassoc.c +@@ -2039,9 +2039,6 @@ optimize_ops_list (enum tree_code opcode, + i++; + } + +- length = ops->length (); +- oelast = ops->last (); +- + if (iterate) + optimize_ops_list (opcode, ops); + } +diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c +index 95fbead7b1e..cd5a3a75eaa 100644 +--- a/gcc/tree-ssa-sccvn.c ++++ b/gcc/tree-ssa-sccvn.c +@@ -309,6 +309,10 @@ static vn_tables_t valid_info; + /* Valueization hook. Valueize NAME if it is an SSA name, otherwise + just return it. */ + tree (*vn_valueize) (tree); ++tree vn_valueize_wrapper (tree t, void* context ATTRIBUTE_UNUSED) ++{ ++ return vn_valueize (t); ++} + + + /* This represents the top of the VN lattice, which is the universal +@@ -6364,7 +6368,7 @@ process_bb (rpo_elim &avail, basic_block bb, + if (bb->loop_father->nb_iterations) + bb->loop_father->nb_iterations + = simplify_replace_tree (bb->loop_father->nb_iterations, +- NULL_TREE, NULL_TREE, vn_valueize); ++ NULL_TREE, NULL_TREE, &vn_valueize_wrapper); + } + + /* Value-number all defs in the basic-block. */ +diff --git a/gcc/tree-ssa-sink.c b/gcc/tree-ssa-sink.c +index 2648b24f7d5..98b6caced03 100644 +--- a/gcc/tree-ssa-sink.c ++++ b/gcc/tree-ssa-sink.c +@@ -433,7 +433,6 @@ statement_sink_location (gimple *stmt, basic_block frombb, + + if (gimple_code (use) != GIMPLE_PHI) + { +- sinkbb = gimple_bb (use); + sinkbb = select_best_block (frombb, gimple_bb (use), stmt); + + if (sinkbb == frombb) +diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c +index c3ea2d680d8..91494d76176 100644 +--- a/gcc/tree-ssa-threadedge.c ++++ b/gcc/tree-ssa-threadedge.c +@@ -1299,7 +1299,6 @@ thread_across_edge (gcond *dummy_cond, + + x = new jump_thread_edge (taken_edge, EDGE_COPY_SRC_JOINER_BLOCK); + path->safe_push (x); +- found = false; + found = thread_around_empty_blocks (taken_edge, + dummy_cond, + avail_exprs_stack, +diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c +index 17a4fc8e279..e822ffc1a01 100644 +--- a/gcc/tree-vect-data-refs.c ++++ b/gcc/tree-vect-data-refs.c +@@ -2863,10 +2863,12 @@ strip_conversion (tree op) + } + + /* Return true if vectorizable_* routines can handle statements STMT1_INFO +- and STMT2_INFO being in a single group. */ ++ and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can ++ be grouped in SLP mode. */ + + static bool +-can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) ++can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, ++ bool allow_slp_p) + { + if (gimple_assign_single_p (stmt1_info->stmt)) + return gimple_assign_single_p (stmt2_info->stmt); +@@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) + like those created by build_mask_conversion. 
*/ + tree mask1 = gimple_call_arg (call1, 2); + tree mask2 = gimple_call_arg (call2, 2); +- if (!operand_equal_p (mask1, mask2, 0)) ++ if (!operand_equal_p (mask1, mask2, 0) ++ && (ifn == IFN_MASK_STORE || !allow_slp_p)) + { + mask1 = strip_conversion (mask1); + if (!mask1) +@@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) + || data_ref_compare_tree (DR_BASE_ADDRESS (dra), + DR_BASE_ADDRESS (drb)) != 0 + || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 +- || !can_group_stmts_p (stmtinfo_a, stmtinfo_b)) ++ || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true)) + break; + + /* Check that the data-refs have the same constant size. */ +@@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) + DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; + lastinfo = stmtinfo_b; + ++ STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) ++ = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false); ++ ++ if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Load suitable for SLP vectorization only.\n"); ++ + if (init_b == init_prev + && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) + && dump_enabled_p ()) +@@ -3446,7 +3456,6 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) + /* First, we collect all data ref pairs for aliasing checks. */ + FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) + { +- int comp_res; + poly_uint64 lower_bound; + tree segment_length_a, segment_length_b; + unsigned HOST_WIDE_INT access_size_a, access_size_b; +@@ -3478,10 +3487,13 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) + dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); + stmt_vec_info stmt_info_b = dr_info_b->stmt; + ++ bool preserves_scalar_order_p ++ = vect_preserves_scalar_order_p (dr_info_a, dr_info_b); ++ + /* Skip the pair if inter-iteration dependencies are irrelevant + and intra-iteration dependencies are guaranteed to be honored. */ + if (ignore_step_p +- && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b) ++ && (preserves_scalar_order_p + || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, + &lower_bound))) + { +@@ -3562,14 +3574,11 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) + align_a = vect_vfa_align (dr_info_a); + align_b = vect_vfa_align (dr_info_b); + +- comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr), +- DR_BASE_ADDRESS (dr_info_b->dr)); +- if (comp_res == 0) +- comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr), +- DR_OFFSET (dr_info_b->dr)); +- + /* See whether the alias is known at compilation time. 
*/ +- if (comp_res == 0 ++ if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr), ++ DR_BASE_ADDRESS (dr_info_b->dr), 0) ++ && operand_equal_p (DR_OFFSET (dr_info_a->dr), ++ DR_OFFSET (dr_info_b->dr), 0) + && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST + && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST + && poly_int_tree_p (segment_length_a) +@@ -3602,15 +3611,21 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) + stmt_info_b->stmt); + } + +- dr_with_seg_len_pair_t dr_with_seg_len_pair +- (dr_with_seg_len (dr_info_a->dr, segment_length_a, +- access_size_a, align_a), +- dr_with_seg_len (dr_info_b->dr, segment_length_b, +- access_size_b, align_b)); ++ dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a, ++ access_size_a, align_a); ++ dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b, ++ access_size_b, align_b); ++ /* Canonicalize the order to be the one that's needed for accurate ++ RAW, WAR and WAW flags, in cases where the data references are ++ well-ordered. The order doesn't really matter otherwise, ++ but we might as well be consistent. */ ++ if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a) ++ std::swap (dr_a, dr_b); + +- /* Canonicalize pairs by sorting the two DR members. */ +- if (comp_res > 0) +- std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); ++ dr_with_seg_len_pair_t dr_with_seg_len_pair ++ (dr_a, dr_b, (preserves_scalar_order_p ++ ? dr_with_seg_len_pair_t::WELL_ORDERED ++ : dr_with_seg_len_pair_t::REORDERED)); + + comp_alias_ddrs.safe_push (dr_with_seg_len_pair); + } +@@ -4123,7 +4138,7 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt, + */ + + opt_result +-vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) ++vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) + { + struct loop *loop = NULL; + unsigned int i; +@@ -4298,7 +4313,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) + /* Set vectype for STMT. */ + scalar_type = TREE_TYPE (DR_REF (dr)); + STMT_VINFO_VECTYPE (stmt_info) +- = get_vectype_for_scalar_type (scalar_type); ++ = get_vectype_for_scalar_type (vinfo, scalar_type); + if (!STMT_VINFO_VECTYPE (stmt_info)) + { + if (dump_enabled_p ()) +@@ -4344,13 +4359,18 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) + if (!vect_check_gather_scatter (stmt_info, + as_a (vinfo), + &gs_info) +- || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) +- return opt_result::failure_at +- (stmt_info->stmt, +- (gatherscatter == GATHER) ? +- "not vectorized: not suitable for gather load %G" : +- "not vectorized: not suitable for scatter store %G", +- stmt_info->stmt); ++ || !get_vectype_for_scalar_type (vinfo, ++ TREE_TYPE (gs_info.offset))) ++ { ++ if (fatal) ++ *fatal = false; ++ return opt_result::failure_at ++ (stmt_info->stmt, ++ (gatherscatter == GATHER) ++ ? 
"not vectorized: not suitable for gather load %G" ++ : "not vectorized: not suitable for scatter store %G", ++ stmt_info->stmt); ++ } + STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; + } + } +diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c +index ad1ea4e7b97..39bc2a82b37 100644 +--- a/gcc/tree-vect-generic.c ++++ b/gcc/tree-vect-generic.c +@@ -694,7 +694,7 @@ expand_vector_divmod (gimple_stmt_iterator *gsi, tree type, tree op0, + tree zero, cst, cond, mask_type; + gimple *stmt; + +- mask_type = build_same_sized_truth_vector_type (type); ++ mask_type = truth_type_for (type); + zero = build_zero_cst (type); + cond = build2 (LT_EXPR, mask_type, op0, zero); + tree_vector_builder vec (type, nunits, 1); +diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c +index b3fae5ba4da..20ede85633b 100644 +--- a/gcc/tree-vect-loop-manip.c ++++ b/gcc/tree-vect-loop-manip.c +@@ -47,6 +47,9 @@ along with GCC; see the file COPYING3. If not see + #include "stor-layout.h" + #include "optabs-query.h" + #include "vec-perm-indices.h" ++#include "insn-config.h" ++#include "rtl.h" ++#include "recog.h" + + /************************************************************************* + Simple Loop Peeling Utilities +@@ -323,13 +326,18 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, + tree src_masktype = src_rgm->mask_type; + tree dest_masktype = dest_rgm->mask_type; + machine_mode src_mode = TYPE_MODE (src_masktype); ++ insn_code icode1, icode2; + if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter +- && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing +- && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing) ++ && (icode1 = optab_handler (vec_unpacku_hi_optab, ++ src_mode)) != CODE_FOR_nothing ++ && (icode2 = optab_handler (vec_unpacku_lo_optab, ++ src_mode)) != CODE_FOR_nothing) + { + /* Unpacking the source masks gives at least as many mask bits as + we need. We can then VIEW_CONVERT any excess bits away. */ +- tree unpack_masktype = vect_halve_mask_nunits (src_masktype); ++ machine_mode dest_mode = insn_data[icode1].operand[0].mode; ++ gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); ++ tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); + for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i) + { + tree src = src_rgm->masks[i / 2]; +@@ -1745,7 +1753,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code) + Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. + CODE and NITERS are as for vect_update_inits_of_dr. */ + +-static void ++void + vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, + tree_code code) + { +@@ -1755,21 +1763,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, + + DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); + +- /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ ++ /* Adjust niters to sizetype. We used to insert the stmts on loop preheader ++ here, but since we might use these niters to update the epilogues niters ++ and data references we can't insert them here as this definition might not ++ always dominate its uses. 
*/ + if (!types_compatible_p (sizetype, TREE_TYPE (niters))) +- { +- gimple_seq seq; +- edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); +- tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters"); +- +- niters = fold_convert (sizetype, niters); +- niters = force_gimple_operand (niters, &seq, false, var); +- if (seq) +- { +- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); +- gcc_assert (!new_bb); +- } +- } ++ niters = fold_convert (sizetype, niters); + + FOR_EACH_VEC_ELT (datarefs, i, dr) + { +@@ -2032,6 +2031,29 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, + *niters_vector_mult_vf_ptr = niters_vector_mult_vf; + } + ++/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, ++ this function searches for the corresponding lcssa phi node in exit ++ bb of LOOP. If it is found, return the phi result; otherwise return ++ NULL. */ ++ ++static tree ++find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED, ++ gphi *lcssa_phi) ++{ ++ gphi_iterator gsi; ++ edge e = single_exit (loop); ++ ++ gcc_assert (single_pred_p (e->dest)); ++ for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gphi *phi = gsi.phi (); ++ if (operand_equal_p (PHI_ARG_DEF (phi, 0), ++ PHI_ARG_DEF (lcssa_phi, 0), 0)) ++ return PHI_RESULT (phi); ++ } ++ return NULL_TREE; ++} ++ + /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND + from SECOND/FIRST and puts it at the original loop's preheader/exit + edge, the two loops are arranged as below: +@@ -2119,6 +2141,29 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, + incoming edge. */ + adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg); + } ++ ++ /* For epilogue peeling we have to make sure to copy all LC PHIs ++ for correct vectorization of live stmts. */ ++ if (loop == first) ++ { ++ basic_block orig_exit = single_exit (second)->dest; ++ for (gsi_orig = gsi_start_phis (orig_exit); ++ !gsi_end_p (gsi_orig); gsi_next (&gsi_orig)) ++ { ++ gphi *orig_phi = gsi_orig.phi (); ++ tree orig_arg = PHI_ARG_DEF (orig_phi, 0); ++ if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg)) ++ continue; ++ ++ /* Already created in the above loop. */ ++ if (find_guard_arg (first, second, orig_phi)) ++ continue; ++ ++ tree new_res = copy_ssa_name (orig_arg); ++ gphi *lcphi = create_phi_node (new_res, between_bb); ++ add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION); ++ } ++ } + } + + /* Function slpeel_add_loop_guard adds guard skipping from the beginning +@@ -2203,29 +2248,6 @@ slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop, + } + } + +-/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, +- this function searches for the corresponding lcssa phi node in exit +- bb of LOOP. If it is found, return the phi result; otherwise return +- NULL. */ +- +-static tree +-find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED, +- gphi *lcssa_phi) +-{ +- gphi_iterator gsi; +- edge e = single_exit (loop); +- +- gcc_assert (single_pred_p (e->dest)); +- for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- gphi *phi = gsi.phi (); +- if (operand_equal_p (PHI_ARG_DEF (phi, 0), +- PHI_ARG_DEF (lcssa_phi, 0), 0)) +- return PHI_RESULT (phi); +- } +- return NULL_TREE; +-} +- + /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied + from LOOP. 
Function slpeel_add_loop_guard adds guard skipping from a + point between the two loops to the end of EPILOG. Edges GUARD_EDGE +@@ -2296,12 +2318,14 @@ slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog, + { + gphi *update_phi = gsi.phi (); + tree old_arg = PHI_ARG_DEF (update_phi, 0); +- /* This loop-closed-phi actually doesn't represent a use out of the +- loop - the phi arg is a constant. */ +- if (TREE_CODE (old_arg) != SSA_NAME) +- continue; + +- tree merge_arg = get_current_def (old_arg); ++ tree merge_arg = NULL_TREE; ++ ++ /* If the old argument is a SSA_NAME use its current_def. */ ++ if (TREE_CODE (old_arg) == SSA_NAME) ++ merge_arg = get_current_def (old_arg); ++ /* If it's a constant or doesn't have a current_def, just use the old ++ argument. */ + if (!merge_arg) + merge_arg = old_arg; + +@@ -2412,7 +2436,22 @@ slpeel_update_phi_nodes_for_lcssa (struct loop *epilog) + + Note this function peels prolog and epilog only if it's necessary, + as well as guards. +- Returns created epilogue or NULL. ++ This function returns the epilogue loop if a decision was made to vectorize ++ it, otherwise NULL. ++ ++ The analysis resulting in this epilogue loop's loop_vec_info was performed ++ in the same vect_analyze_loop call as the main loop's. At that time ++ vect_analyze_loop constructs a list of accepted loop_vec_info's for lower ++ vectorization factors than the main loop. This list is stored in the main ++ loop's loop_vec_info in the 'epilogue_vinfos' member. Everytime we decide to ++ vectorize the epilogue loop for a lower vectorization factor, the ++ loop_vec_info sitting at the top of the epilogue_vinfos list is removed, ++ updated and linked to the epilogue loop. This is later used to vectorize ++ the epilogue. The reason the loop_vec_info needs updating is that it was ++ constructed based on the original main loop, and the epilogue loop is a ++ copy of this loop, so all links pointing to statements in the original loop ++ need updating. Furthermore, these loop_vec_infos share the ++ data_reference's records, which will also need to be updated. + + TODO: Guard for prefer_scalar_loop should be emitted along with + versioning conditions if loop versioning is needed. */ +@@ -2422,7 +2461,8 @@ struct loop * + vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + tree *niters_vector, tree *step_vector, + tree *niters_vector_mult_vf_var, int th, +- bool check_profitability, bool niters_no_overflow) ++ bool check_profitability, bool niters_no_overflow, ++ tree *advance, drs_init_vec &orig_drs_init) + { + edge e, guard_e; + tree type = TREE_TYPE (niters), guard_cond; +@@ -2430,6 +2470,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + profile_probability prob_prolog, prob_vector, prob_epilog; + int estimated_vf; + int prolog_peeling = 0; ++ bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; + /* We currently do not support prolog peeling if the target alignment is not + known at compile time. 'vect_gen_prolog_loop_niters' depends on the + target alignment being constant. 
*/ +@@ -2483,19 +2524,77 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + int bound_prolog = 0; + if (prolog_peeling) + niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, +- &bound_prolog); ++ &bound_prolog); + else + niters_prolog = build_int_cst (type, 0); + ++ loop_vec_info epilogue_vinfo = NULL; ++ if (vect_epilogues) ++ { ++ epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; ++ loop_vinfo->epilogue_vinfos.ordered_remove (0); ++ } ++ ++ tree niters_vector_mult_vf = NULL_TREE; ++ /* Saving NITERs before the loop, as this may be changed by prologue. */ ++ tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); ++ edge update_e = NULL, skip_e = NULL; ++ unsigned int lowest_vf = constant_lower_bound (vf); ++ /* If we know the number of scalar iterations for the main loop we should ++ check whether after the main loop there are enough iterations left over ++ for the epilogue. */ ++ if (vect_epilogues ++ && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ++ && prolog_peeling >= 0 ++ && known_eq (vf, lowest_vf)) ++ { ++ unsigned HOST_WIDE_INT eiters ++ = (LOOP_VINFO_INT_NITERS (loop_vinfo) ++ - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); ++ ++ eiters -= prolog_peeling; ++ eiters ++ = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); ++ ++ unsigned int ratio; ++ unsigned int epilogue_gaps ++ = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); ++ while (!(constant_multiple_p ++ (GET_MODE_SIZE (loop_vinfo->vector_mode), ++ GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio) ++ && eiters >= lowest_vf / ratio + epilogue_gaps)) ++ { ++ delete epilogue_vinfo; ++ epilogue_vinfo = NULL; ++ if (loop_vinfo->epilogue_vinfos.length () == 0) ++ { ++ vect_epilogues = false; ++ break; ++ } ++ epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; ++ loop_vinfo->epilogue_vinfos.ordered_remove (0); ++ epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); ++ } ++ } + /* Prolog loop may be skipped. */ + bool skip_prolog = (prolog_peeling != 0); +- /* Skip to epilog if scalar loop may be preferred. It's only needed +- when we peel for epilog loop and when it hasn't been checked with +- loop versioning. */ ++ /* Skip this loop to epilog when there are not enough iterations to enter this ++ vectorized loop. If true we should perform runtime checks on the NITERS ++ to check whether we should skip the current vectorized loop. If we know ++ the number of scalar iterations we may choose to add a runtime check if ++ this number "maybe" smaller than the number of iterations required ++ when we know the number of scalar iterations may potentially ++ be smaller than the number of iterations required to enter this loop, for ++ this we use the upper bounds on the prolog and epilog peeling. When we ++ don't know the number of iterations and don't require versioning it is ++ because we have asserted that there are enough scalar iterations to enter ++ the main loop, so this skip is not necessary. When we are versioning then ++ we only add such a skip if we have chosen to vectorize the epilogue. */ + bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), + bound_prolog + bound_epilog) +- : !LOOP_REQUIRES_VERSIONING (loop_vinfo)); ++ : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) ++ || vect_epilogues)); + /* Epilog loop must be executed if the number of iterations for epilog + loop is known at compile time, otherwise we need to add a check at + the end of vector loop and skip to the end of epilog loop. 
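The loop above that discards epilogue candidates boils down to simple arithmetic on the known iteration count: subtract the gap and prologue iterations, take the remainder modulo the main VF, and require that the leftover covers the candidate's VF plus its own gap peeling. A standalone worked example (not GCC code; the numbers are invented, and the candidate VFs stand in for the mode-size ratio computed via constant_multiple_p):

  // Standalone worked example (not GCC code) of the "are there enough scalar
  // iterations left over for this epilogue candidate?" test.
  #include <cstdio>

  int main ()
  {
    unsigned niters = 103;        // known scalar iteration count
    unsigned prolog_peeling = 3;  // iterations peeled for alignment
    unsigned main_vf = 16;        // VF of the already-analysed main loop
    unsigned gaps = 0;            // LOOP_VINFO_PEELING_FOR_GAPS

    // Scalar iterations remaining after the prologue and the main vector loop.
    unsigned eiters = (niters - gaps - prolog_peeling) % main_vf + gaps;

    for (unsigned epi_vf : { 8u, 4u })  // candidate epilogue VFs, widest first
      {
        if (eiters >= epi_vf + gaps)
          {
            std::printf ("epilogue VF %u accepted (%u iterations left)\n",
                         epi_vf, eiters);
            break;
          }
        std::printf ("epilogue VF %u rejected (only %u iterations left)\n",
                     epi_vf, eiters);
      }
  }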
*/ +@@ -2525,6 +2624,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + + dump_user_location_t loop_loc = find_loop_location (loop); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); ++ if (vect_epilogues) ++ /* Make sure to set the epilogue's epilogue scalar loop, such that we can ++ use the original scalar loop as remaining epilogue if necessary. */ ++ LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo) ++ = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); ++ + if (prolog_peeling) + { + e = loop_preheader_edge (loop); +@@ -2571,6 +2676,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); + scale_loop_profile (prolog, prob_prolog, bound_prolog); + } ++ ++ /* Save original inits for each data_reference before advancing them with ++ NITERS_PROLOG. */ ++ unsigned int i; ++ struct data_reference *dr; ++ vec datarefs = loop_vinfo->shared->datarefs; ++ FOR_EACH_VEC_ELT (datarefs, i, dr) ++ orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr))); ++ + /* Update init address of DRs. */ + vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); + /* Update niters for vector loop. */ +@@ -2605,8 +2719,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + "loop can't be duplicated to exit edge.\n"); + gcc_unreachable (); + } +- /* Peel epilog and put it on exit edge of loop. */ +- epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e); ++ /* Peel epilog and put it on exit edge of loop. If we are vectorizing ++ said epilog then we should use a copy of the main loop as a starting ++ point. This loop may have already had some preliminary transformations ++ to allow for more optimal vectorization, for example if-conversion. ++ If we are not vectorizing the epilog then we should use the scalar loop ++ as the transformations mentioned above make less or no sense when not ++ vectorizing. */ ++ epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; ++ epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); + if (!epilog) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, +@@ -2635,6 +2756,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + guard_to, guard_bb, + prob_vector.invert (), + irred_flag); ++ skip_e = guard_e; + e = EDGE_PRED (guard_to, 0); + e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); + slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); +@@ -2656,7 +2778,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + } + + basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; +- tree niters_vector_mult_vf; + /* If loop is peeled for non-zero constant times, now niters refers to + orig_niters - prolog_peeling, it won't overflow even the orig_niters + overflows. */ +@@ -2679,7 +2800,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + /* Update IVs of original loop as if they were advanced by + niters_vector_mult_vf steps. */ + gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); +- edge update_e = skip_vector ? e : loop_preheader_edge (epilog); ++ update_e = skip_vector ? 
e : loop_preheader_edge (epilog); + vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, + update_e); + +@@ -2720,10 +2841,75 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, + adjust_vec_debug_stmts (); + scev_reset (); + } ++ ++ if (vect_epilogues) ++ { ++ epilog->aux = epilogue_vinfo; ++ LOOP_VINFO_LOOP (epilogue_vinfo) = epilog; ++ ++ loop_constraint_clear (epilog, LOOP_C_INFINITE); ++ ++ /* We now must calculate the number of NITERS performed by the previous ++ loop and EPILOGUE_NITERS to be performed by the epilogue. */ ++ tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf), ++ niters_prolog, niters_vector_mult_vf); ++ ++ /* If skip_vector we may skip the previous loop, we insert a phi-node to ++ determine whether we are coming from the previous vectorized loop ++ using the update_e edge or the skip_vector basic block using the ++ skip_e edge. */ ++ if (skip_vector) ++ { ++ gcc_assert (update_e != NULL && skip_e != NULL); ++ gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), ++ update_e->dest); ++ tree new_ssa = make_ssa_name (TREE_TYPE (niters)); ++ gimple *stmt = gimple_build_assign (new_ssa, niters); ++ gimple_stmt_iterator gsi; ++ if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME ++ && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL) ++ { ++ gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf)); ++ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); ++ } ++ else ++ { ++ gsi = gsi_last_bb (update_e->src); ++ gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); ++ } ++ ++ niters = new_ssa; ++ add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION); ++ add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, ++ UNKNOWN_LOCATION); ++ niters = PHI_RESULT (new_phi); ++ } ++ ++ /* Subtract the number of iterations performed by the vectorized loop ++ from the number of total iterations. */ ++ tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), ++ before_loop_niters, ++ niters); ++ ++ LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; ++ LOOP_VINFO_NITERSM1 (epilogue_vinfo) ++ = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), ++ epilogue_niters, ++ build_one_cst (TREE_TYPE (epilogue_niters))); ++ ++ /* Set ADVANCE to the number of iterations performed by the previous ++ loop and its prologue. */ ++ *advance = niters; ++ ++ /* Redo the peeling for niter analysis as the NITERs and alignment ++ may have been updated to take the main loop into account. */ ++ determine_peel_for_niter (epilogue_vinfo); ++ } ++ + adjust_vec.release (); + free_original_copy_tables (); + +- return epilog; ++ return vect_epilogues ? epilog : NULL; + } + + /* Function vect_create_cond_for_niters_checks. +@@ -2987,9 +3173,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr) + *COND_EXPR_STMT_LIST. 
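The bookkeeping in the vect_epilogues block above can be summarised with plain integers: the iterations already performed are the prologue iterations plus the scalar iterations covered by the vector loop (or zero when the skip_vector guard bypasses it, which is what the inserted PHI expresses), and the epilogue gets whatever remains. A standalone sketch (not GCC code; all values are invented):

  // Standalone sketch (not GCC code) of the iteration bookkeeping that links
  // the main vector loop and its vectorized epilogue; all numbers are invented.
  #include <cstdio>

  int main ()
  {
    unsigned total_niters = 103;      // niters before any peeling
    unsigned prolog_iters = 3;        // iterations done by the prologue
    unsigned vf = 16;
    bool main_loop_skipped = false;   // whether the skip_vector guard was taken

    // niters_vector_mult_vf: complete vector iterations times the VF.
    unsigned vector_iters_scalar = ((total_niters - prolog_iters) / vf) * vf;

    // The PHI inserted under skip_vector merges this with 0: a skipped main
    // loop contributes no iterations.
    unsigned advance = prolog_iters
                       + (main_loop_skipped ? 0 : vector_iters_scalar);

    unsigned epilogue_niters = total_niters - advance;
    std::printf ("advance = %u, epilogue niters = %u (NITERSM1 = %u)\n",
                 advance, epilogue_niters, epilogue_niters - 1);
    // prints: advance = 99, epilogue niters = 4 (NITERSM1 = 3)
  }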
*/ + + struct loop * +-vect_loop_versioning (loop_vec_info loop_vinfo, +- unsigned int th, bool check_profitability, +- poly_uint64 versioning_threshold) ++vect_loop_versioning (loop_vec_info loop_vinfo) + { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); +@@ -3009,10 +3193,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo, + bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); + bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); + bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); ++ poly_uint64 versioning_threshold ++ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); + tree version_simd_if_cond + = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); ++ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); + +- if (check_profitability) ++ if (th >= vect_vf_for_cost (loop_vinfo) ++ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ++ && !ordered_p (th, versioning_threshold)) + cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, + build_int_cst (TREE_TYPE (scalar_loop_iters), + th - 1)); +diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +index 0308b26b808..6cbdfd1ad1a 100644 +--- a/gcc/tree-vect-loop.c ++++ b/gcc/tree-vect-loop.c +@@ -154,6 +154,8 @@ along with GCC; see the file COPYING3. If not see + */ + + static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); ++static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, ++ bool *); + + /* Subroutine of vect_determine_vf_for_stmt that handles only one + statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE +@@ -325,7 +327,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) + "get vectype for scalar type: %T\n", + scalar_type); + +- vectype = get_vectype_for_scalar_type (scalar_type); ++ vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); + if (!vectype) + return opt_result::failure_at (phi, + "not vectorized: unsupported " +@@ -559,19 +561,19 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) + && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); + + stmt_vec_info reduc_stmt_info +- = vect_force_simple_reduction (loop_vinfo, stmt_vinfo, +- &double_reduc, false); ++ = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc); + if (reduc_stmt_info) + { +- if (double_reduc) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, ++ STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; ++ STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo; ++ if (double_reduc) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, + "Detected double reduction.\n"); + + STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; +- STMT_VINFO_DEF_TYPE (reduc_stmt_info) +- = vect_double_reduction_def; ++ STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; + } + else + { +@@ -582,7 +584,6 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) + "Detected vectorizable nested cycle.\n"); + + STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; +- STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle; + } + else + { +@@ -688,13 +689,16 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) + stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); + while (next) + { +- if (! STMT_VINFO_IN_PATTERN_P (next)) ++ if (! 
STMT_VINFO_IN_PATTERN_P (next) ++ || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1) + break; + next = REDUC_GROUP_NEXT_ELEMENT (next); + } +- /* If not all stmt in the chain are patterns try to handle +- the chain without patterns. */ +- if (! next) ++ /* If not all stmt in the chain are patterns or if we failed ++ to update STMT_VINFO_REDUC_IDX try to handle the chain ++ without patterns. */ ++ if (! next ++ && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1) + { + vect_fixup_reduc_chain (first); + LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] +@@ -730,9 +734,7 @@ vect_get_loop_niters (struct loop *loop, tree *assumptions, + if (!exit) + return cond; + +- niter = chrec_dont_know; + may_be_zero = NULL_TREE; +- niter_assumptions = boolean_true_node; + if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) + || chrec_contains_undetermined (niter_desc.niter)) + return cond; +@@ -826,6 +828,8 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) + ivexpr_map (NULL), + slp_unrolling_factor (1), + single_scalar_iteration_cost (0), ++ vec_outside_cost (0), ++ vec_inside_cost (0), + vectorizable (false), + can_fully_mask_p (true), + fully_masked_p (false), +@@ -885,6 +889,8 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) + } + } + } ++ ++ epilogue_vinfos.create (6); + } + + /* Free all levels of MASKS. */ +@@ -959,6 +965,7 @@ _loop_vec_info::~_loop_vec_info () + + release_vec_loop_masks (&masks); + delete ivexpr_map; ++ epilogue_vinfos.release (); + + loop->aux = NULL; + } +@@ -1431,8 +1438,8 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo) + dump_printf_loc (MSG_NOTE, vect_location, + "Loop contains SLP and non-SLP stmts\n"); + /* Both the vectorization factor and unroll factor have the form +- current_vector_size * X for some rational X, so they must have +- a common multiple. */ ++ GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X, ++ so they must have a common multiple. */ + vectorization_factor + = force_common_multiple (vectorization_factor, + LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); +@@ -1535,12 +1542,18 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) + phi_op = PHI_ARG_DEF (phi, 0); + stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); + if (!op_def_info) +- return opt_result::failure_at (phi, "unsupported phi"); ++ return opt_result::failure_at (phi, "unsupported phi\n"); + + if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer + && (STMT_VINFO_RELEVANT (op_def_info) + != vect_used_in_outer_by_reduction)) +- return opt_result::failure_at (phi, "unsupported phi"); ++ return opt_result::failure_at (phi, "unsupported phi\n"); ++ ++ if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def ++ || (STMT_VINFO_DEF_TYPE (stmt_info) ++ == vect_double_reduction_def)) ++ && !vectorizable_lc_phi (stmt_info, NULL, NULL)) ++ return opt_result::failure_at (phi, "unsupported phi\n"); + } + + continue; +@@ -1564,18 +1577,19 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) + ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, + &cost_vec); + else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def ++ || (STMT_VINFO_DEF_TYPE (stmt_info) ++ == vect_double_reduction_def) + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) + && ! 
PURE_SLP_STMT (stmt_info)) +- ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL, +- &cost_vec); ++ ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec); + } + + /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ + if (ok + && STMT_VINFO_LIVE_P (stmt_info) + && !PURE_SLP_STMT (stmt_info)) +- ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL, +- &cost_vec); ++ ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL, ++ -1, NULL, &cost_vec); + + if (!ok) + return opt_result::failure_at (phi, +@@ -1692,9 +1706,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) + return 0; + } + +- HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); +- if (estimated_niter == -1) +- estimated_niter = likely_max_stmt_executions_int (loop); ++ HOST_WIDE_INT estimated_niter; ++ ++ /* If we are vectorizing an epilogue then we know the maximum number of ++ scalar iterations it will cover is at least one lower than the ++ vectorization factor of the main loop. */ ++ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) ++ estimated_niter ++ = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1; ++ else ++ { ++ estimated_niter = estimated_stmt_executions_int (loop); ++ if (estimated_niter == -1) ++ estimated_niter = likely_max_stmt_executions_int (loop); ++ } + if (estimated_niter != -1 + && ((unsigned HOST_WIDE_INT) estimated_niter + < MAX (th, (unsigned) min_profitable_estimate))) +@@ -1774,6 +1799,101 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, + return opt_result::success (); + } + ++/* Look for SLP-only access groups and turn each individual access into its own ++ group. */ ++static void ++vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) ++{ ++ unsigned int i; ++ struct data_reference *dr; ++ ++ DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); ++ ++ vec datarefs = loop_vinfo->shared->datarefs; ++ FOR_EACH_VEC_ELT (datarefs, i, dr) ++ { ++ gcc_assert (DR_REF (dr)); ++ stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr)); ++ ++ /* Check if the load is a part of an interleaving chain. */ ++ if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) ++ { ++ stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); ++ unsigned int group_size = DR_GROUP_SIZE (first_element); ++ ++ /* Check if SLP-only groups. */ ++ if (!STMT_SLP_TYPE (stmt_info) ++ && STMT_VINFO_SLP_VECT_ONLY (first_element)) ++ { ++ /* Dissolve the group. */ ++ STMT_VINFO_SLP_VECT_ONLY (first_element) = false; ++ ++ stmt_vec_info vinfo = first_element; ++ while (vinfo) ++ { ++ stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); ++ DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; ++ DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; ++ DR_GROUP_SIZE (vinfo) = 1; ++ DR_GROUP_GAP (vinfo) = group_size - 1; ++ vinfo = next; ++ } ++ } ++ } ++ } ++} ++ ++ ++/* Decides whether we need to create an epilogue loop to handle ++ remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */ ++ ++void ++determine_peel_for_niter (loop_vec_info loop_vinfo) ++{ ++ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; ++ ++ unsigned HOST_WIDE_INT const_vf; ++ HOST_WIDE_INT max_niter ++ = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); ++ ++ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); ++ if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) ++ th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO ++ (loop_vinfo)); ++ ++ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) ++ /* The main loop handles all iterations. 
*/ ++ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; ++ else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ++ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) ++ { ++ /* Work out the (constant) number of iterations that need to be ++ peeled for reasons other than niters. */ ++ unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); ++ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) ++ peel_niter += 1; ++ if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, ++ LOOP_VINFO_VECT_FACTOR (loop_vinfo))) ++ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; ++ } ++ else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) ++ /* ??? When peeling for gaps but not alignment, we could ++ try to check whether the (variable) niters is known to be ++ VF * N + 1. That's something of a niche case though. */ ++ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ++ || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) ++ || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) ++ < (unsigned) exact_log2 (const_vf)) ++ /* In case of versioning, check if the maximum number of ++ iterations is greater than th. If they are identical, ++ the epilogue is unnecessary. */ ++ && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) ++ || ((unsigned HOST_WIDE_INT) max_niter ++ > (th / const_vf) * const_vf)))) ++ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; ++} ++ ++ + /* Function vect_analyze_loop_2. + + Apply a set of analyses on LOOP, and create a loop_vec_info struct +@@ -1786,6 +1906,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) + int res; + unsigned int max_vf = MAX_VECTORIZATION_FACTOR; + poly_uint64 min_vf = 2; ++ loop_vec_info orig_loop_vinfo = NULL; ++ ++ /* If we are dealing with an epilogue then orig_loop_vinfo points to the ++ loop_vec_info of the first vectorized loop. */ ++ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) ++ orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); ++ else ++ orig_loop_vinfo = loop_vinfo; ++ gcc_assert (orig_loop_vinfo); + + /* The first group of checks is independent of the vector size. */ + fatal = true; +@@ -1824,7 +1953,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) + /* Analyze the data references and also adjust the minimal + vectorization factor according to the loads and stores. */ + +- ok = vect_analyze_data_refs (loop_vinfo, &min_vf); ++ ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); + if (!ok) + { + if (dump_enabled_p ()) +@@ -1855,7 +1984,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) + + /* Data-flow analysis to detect stmts that do not need to be vectorized. */ + +- ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); ++ ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); + if (!ok) + { + if (dump_enabled_p ()) +@@ -1901,7 +2030,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) + vect_compute_single_scalar_iteration_cost (loop_vinfo); + + poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); +- unsigned th; + + /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ + ok = vect_analyze_slp (loop_vinfo, *n_stmts); +@@ -1941,9 +2069,6 @@ start_over: + LOOP_VINFO_INT_NITERS (loop_vinfo)); + } + +- HOST_WIDE_INT max_niter +- = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); +- + /* Analyze the alignment of the data-refs in the loop. + Fail if a data reference is found that cannot be vectorized. 
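For the compile-time-constant case, the determine_peel_for_niter logic introduced above reduces to a divisibility test after accounting for alignment and gap peeling. A standalone sketch (not GCC code; the helper name and the example numbers are invented):

  // Standalone sketch (not GCC code) of determine_peel_for_niter for the case
  // where the scalar iteration count is a compile-time constant.
  #include <cstdio>

  static bool
  need_epilogue (unsigned niters, unsigned peel_for_alignment,
                 bool peel_for_gaps, unsigned vf)
  {
    unsigned peel_niter = peel_for_alignment + (peel_for_gaps ? 1 : 0);
    // An epilogue is needed unless the remaining iterations are an exact
    // multiple of the vectorization factor.
    return (niters - peel_niter) % vf != 0;
  }

  int main ()
  {
    std::printf ("%d\n", need_epilogue (128, 0, false, 16));  // 0: no epilogue
    std::printf ("%d\n", need_epilogue (103, 3, false, 16));  // 1: 100 % 16 != 0
  }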
*/ + +@@ -1990,6 +2115,9 @@ start_over: + } + } + ++ /* Dissolve SLP-only groups. */ ++ vect_dissolve_slp_only_groups (loop_vinfo); ++ + /* Scan all the remaining operations in the loop that are not subject + to SLP and make sure they are vectorizable. */ + ok = vect_analyze_loop_operations (loop_vinfo); +@@ -2032,6 +2160,16 @@ start_over: + " support peeling for gaps.\n"); + } + ++ /* If we're vectorizing an epilogue loop, we either need a fully-masked ++ loop or a loop that has a lower VF than the main loop. */ ++ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) ++ && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) ++ && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), ++ LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))) ++ return opt_result::failure_at (vect_location, ++ "Vectorization factor too high for" ++ " epilogue loop.\n"); ++ + /* Check the costings of the loop make vectorizing worthwhile. */ + res = vect_analyze_loop_costing (loop_vinfo); + if (res < 0) +@@ -2044,42 +2182,7 @@ start_over: + return opt_result::failure_at (vect_location, + "Loop costings not worthwhile.\n"); + +- /* Decide whether we need to create an epilogue loop to handle +- remaining scalar iterations. */ +- th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); +- +- unsigned HOST_WIDE_INT const_vf; +- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) +- /* The main loop handles all iterations. */ +- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; +- else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) +- && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) +- { +- /* Work out the (constant) number of iterations that need to be +- peeled for reasons other than niters. */ +- unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); +- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) +- peel_niter += 1; +- if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, +- LOOP_VINFO_VECT_FACTOR (loop_vinfo))) +- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; +- } +- else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) +- /* ??? When peeling for gaps but not alignment, we could +- try to check whether the (variable) niters is known to be +- VF * N + 1. That's something of a niche case though. */ +- || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) +- || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) +- || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) +- < (unsigned) exact_log2 (const_vf)) +- /* In case of versioning, check if the maximum number of +- iterations is greater than th. If they are identical, +- the epilogue is unnecessary. */ +- && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) +- || ((unsigned HOST_WIDE_INT) max_niter +- > (th / const_vf) * const_vf)))) +- LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; +- ++ determine_peel_for_niter (loop_vinfo); + /* If an epilogue loop is required make sure we can create one. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) +@@ -2101,10 +2204,21 @@ start_over: + /* During peeling, we need to check if number of loop iterations is + enough for both peeled prolog loop and vector loop. This check + can be merged along with threshold check of loop versioning, so +- increase threshold for this case if necessary. */ +- if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) ++ increase threshold for this case if necessary. ++ ++ If we are analyzing an epilogue we still want to check what its ++ versioning threshold would be. 
If we decide to vectorize the epilogues we ++ will want to use the lowest versioning threshold of all epilogues and main ++ loop. This will enable us to enter a vectorized epilogue even when ++ versioning the loop. We can't simply check whether the epilogue requires ++ versioning though since we may have skipped some versioning checks when ++ analyzing the epilogue. For instance, checks for alias versioning will be ++ skipped when dealing with epilogues as we assume we already checked them ++ for the main loop. So instead we always check the 'orig_loop_vinfo'. */ ++ if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) + { + poly_uint64 niters_th = 0; ++ unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); + + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) + { +@@ -2125,6 +2239,14 @@ start_over: + /* One additional iteration because of peeling for gap. */ + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + niters_th += 1; ++ ++ /* Use the same condition as vect_transform_loop to decide when to use ++ the cost to determine a versioning threshold. */ ++ if (th >= vect_vf_for_cost (loop_vinfo) ++ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ++ && ordered_p (th, niters_th)) ++ niters_th = ordered_max (poly_uint64 (th), niters_th); ++ + LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; + } + +@@ -2240,22 +2362,95 @@ again: + goto start_over; + } + ++/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears ++ to be better than vectorizing it using OLD_LOOP_VINFO. Assume that ++ OLD_LOOP_VINFO is better unless something specifically indicates ++ otherwise. ++ ++ Note that this deliberately isn't a partial order. */ ++ ++static bool ++vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, ++ loop_vec_info old_loop_vinfo) ++{ ++ struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); ++ gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); ++ ++ poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); ++ poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); ++ ++ /* Always prefer a VF of loop->simdlen over any other VF. */ ++ if (loop->simdlen) ++ { ++ bool new_simdlen_p = known_eq (new_vf, loop->simdlen); ++ bool old_simdlen_p = known_eq (old_vf, loop->simdlen); ++ if (new_simdlen_p != old_simdlen_p) ++ return new_simdlen_p; ++ } ++ ++ /* Limit the VFs to what is likely to be the maximum number of iterations, ++ to handle cases in which at least one loop_vinfo is fully-masked. */ ++ HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); ++ if (estimated_max_niter != -1) ++ { ++ if (known_le (estimated_max_niter, new_vf)) ++ new_vf = estimated_max_niter; ++ if (known_le (estimated_max_niter, old_vf)) ++ old_vf = estimated_max_niter; ++ } ++ ++ /* Check whether the (fractional) cost per scalar iteration is lower ++ or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ ++ poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost ++ * poly_widest_int (old_vf)); ++ poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost ++ * poly_widest_int (new_vf)); ++ if (maybe_lt (rel_old, rel_new)) ++ return false; ++ if (known_lt (rel_new, rel_old)) ++ return true; ++ ++ /* If there's nothing to choose between the loop bodies, see whether ++ there's a difference in the prologue and epilogue costs. 
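The comparison in vect_better_loop_vinfo_p above avoids fractions by cross-multiplying: new_inside_cost / new_vf is lower than old_inside_cost / old_vf exactly when new_inside_cost * old_vf is lower than old_inside_cost * new_vf. A standalone sketch with scalar stand-ins for the poly_int quantities (not GCC code; the Attempt struct and numbers are invented):

  // Standalone sketch (not GCC code) of the cost comparison that decides
  // whether a newly analysed vector mode beats the previously chosen one.
  #include <cstdint>
  #include <cstdio>

  struct Attempt { uint64_t vf, inside_cost, outside_cost; };  // invented

  static bool
  better_than (const Attempt &n, const Attempt &o)
  {
    // new_inside / new_vf < old_inside / old_vf
    //   <=>  new_inside * old_vf < old_inside * new_vf
    uint64_t rel_new = n.inside_cost * o.vf;
    uint64_t rel_old = o.inside_cost * n.vf;
    if (rel_new != rel_old)
      return rel_new < rel_old;
    // The loop bodies tie: fall back to the prologue/epilogue cost.
    return n.outside_cost < o.outside_cost;
  }

  int main ()
  {
    Attempt old_mode = { 16, 40, 10 };   // e.g. a wider mode
    Attempt new_mode = {  8, 18, 10 };   // a narrower but relatively cheaper one
    // 18/8 = 2.25 vs 40/16 = 2.5 cost per scalar iteration: the new mode wins.
    std::printf ("new mode better: %d\n", better_than (new_mode, old_mode));
  }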
*/ ++ if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost) ++ return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost; ++ ++ return false; ++} ++ ++/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return ++ true if we should. */ ++ ++static bool ++vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, ++ loop_vec_info old_loop_vinfo) ++{ ++ if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) ++ return false; ++ ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Preferring vector mode %s to vector mode %s\n", ++ GET_MODE_NAME (new_loop_vinfo->vector_mode), ++ GET_MODE_NAME (old_loop_vinfo->vector_mode)); ++ return true; ++} ++ + /* Function vect_analyze_loop. + + Apply a set of analyses on LOOP, and create a loop_vec_info struct + for it. The different analyses will record information in the +- loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must +- be vectorized. */ ++ loop_vec_info struct. */ + opt_loop_vec_info +-vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, +- vec_info_shared *shared) ++vect_analyze_loop (struct loop *loop, vec_info_shared *shared) + { +- auto_vector_sizes vector_sizes; ++ auto_vector_modes vector_modes; + + /* Autodetect first vector size we try. */ +- current_vector_size = 0; +- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); +- unsigned int next_size = 0; ++ unsigned int autovec_flags ++ = targetm.vectorize.autovectorize_vector_modes (&vector_modes, ++ loop->simdlen != 0); ++ unsigned int mode_i = 0; + + DUMP_VECT_SCOPE ("analyze_loop_nest"); + +@@ -2272,58 +2467,221 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, + " loops cannot be vectorized\n"); + + unsigned n_stmts = 0; +- poly_uint64 autodetected_vector_size = 0; ++ machine_mode autodetected_vector_mode = VOIDmode; ++ opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); ++ machine_mode next_vector_mode = VOIDmode; ++ poly_uint64 lowest_th = 0; ++ unsigned vectorized_loops = 0; ++ bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS) ++ && !unlimited_cost_model (loop)); ++ ++ bool vect_epilogues = false; ++ opt_result res = opt_result::success (); ++ unsigned HOST_WIDE_INT simdlen = loop->simdlen; + while (1) + { + /* Check the CFG characteristics of the loop (nesting, entry/exit). */ +- opt_loop_vec_info loop_vinfo +- = vect_analyze_loop_form (loop, shared); ++ opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared); + if (!loop_vinfo) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "bad loop form.\n"); ++ gcc_checking_assert (first_loop_vinfo == NULL); + return loop_vinfo; + } ++ loop_vinfo->vector_mode = next_vector_mode; + + bool fatal = false; + +- if (orig_loop_vinfo) +- LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; ++ /* When pick_lowest_cost_p is true, we should in principle iterate ++ over all the loop_vec_infos that LOOP_VINFO could replace and ++ try to vectorize LOOP_VINFO under the same conditions. ++ E.g. when trying to replace an epilogue loop, we should vectorize ++ LOOP_VINFO as an epilogue loop with the same VF limit. When trying ++ to replace the main loop, we should vectorize LOOP_VINFO as a main ++ loop too. ++ ++ However, autovectorize_vector_modes is usually sorted as follows: ++ ++ - Modes that naturally produce lower VFs usually follow modes that ++ naturally produce higher VFs. 
++ ++ - When modes naturally produce the same VF, maskable modes ++ usually follow unmaskable ones, so that the maskable mode ++ can be used to vectorize the epilogue of the unmaskable mode. ++ ++ This order is preferred because it leads to the maximum ++ epilogue vectorization opportunities. Targets should only use ++ a different order if they want to make wide modes available while ++ disparaging them relative to earlier, smaller modes. The assumption ++ in that case is that the wider modes are more expensive in some ++ way that isn't reflected directly in the costs. ++ ++ There should therefore be few interesting cases in which ++ LOOP_VINFO fails when treated as an epilogue loop, succeeds when ++ treated as a standalone loop, and ends up being genuinely cheaper ++ than FIRST_LOOP_VINFO. */ ++ if (vect_epilogues) ++ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; ++ ++ res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); ++ if (mode_i == 0) ++ autodetected_vector_mode = loop_vinfo->vector_mode; ++ if (dump_enabled_p ()) ++ { ++ if (res) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Analysis succeeded with vector mode %s\n", ++ GET_MODE_NAME (loop_vinfo->vector_mode)); ++ else ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Analysis failed with vector mode %s\n", ++ GET_MODE_NAME (loop_vinfo->vector_mode)); ++ } ++ ++ loop->aux = NULL; ++ ++ if (!fatal) ++ while (mode_i < vector_modes.length () ++ && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i])) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** The result for vector mode %s would" ++ " be the same\n", ++ GET_MODE_NAME (vector_modes[mode_i])); ++ mode_i += 1; ++ } + +- opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); + if (res) + { + LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; ++ vectorized_loops++; + +- return loop_vinfo; +- } +- +- delete loop_vinfo; ++ /* Once we hit the desired simdlen for the first time, ++ discard any previous attempts. */ ++ if (simdlen ++ && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen)) ++ { ++ delete first_loop_vinfo; ++ first_loop_vinfo = opt_loop_vec_info::success (NULL); ++ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; ++ simdlen = 0; ++ } ++ else if (pick_lowest_cost_p && first_loop_vinfo) ++ { ++ /* Keep trying to roll back vectorization attempts while the ++ loop_vec_infos they produced were worse than this one. */ ++ vec &vinfos = first_loop_vinfo->epilogue_vinfos; ++ while (!vinfos.is_empty () ++ && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) ++ { ++ gcc_assert (vect_epilogues); ++ delete vinfos.pop (); ++ } ++ if (vinfos.is_empty () ++ && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) ++ { ++ delete first_loop_vinfo; ++ first_loop_vinfo = opt_loop_vec_info::success (NULL); ++ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; ++ } ++ } + +- if (next_size == 0) +- autodetected_vector_size = current_vector_size; ++ if (first_loop_vinfo == NULL) ++ { ++ first_loop_vinfo = loop_vinfo; ++ lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); ++ } ++ else if (vect_epilogues ++ /* For now only allow one epilogue loop. */ ++ && first_loop_vinfo->epilogue_vinfos.is_empty ()) ++ { ++ first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); ++ poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); ++ gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) ++ || maybe_ne (lowest_th, 0U)); ++ /* Keep track of the known smallest versioning ++ threshold. 
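Tracking the smallest versioning threshold matters because a single runtime guard protects the whole versioned region; if it used the main loop's larger threshold, iteration counts that a vectorized epilogue could still handle would fall through to the scalar copy. A minimal standalone sketch (not GCC code; the numbers are invented):

  // Standalone sketch (not GCC code): the runtime guard of a versioned loop
  // must use the smallest threshold of the main loop and all accepted
  // epilogues, otherwise profitable epilogue cases would be sent to the
  // scalar copy.
  #include <algorithm>
  #include <cstdio>

  int main ()
  {
    unsigned main_loop_th = 16;             // main loop versioning threshold
    unsigned epilogue_ths[] = { 8, 4 };     // thresholds of accepted epilogues

    unsigned lowest_th = main_loop_th;
    for (unsigned th : epilogue_ths)
      lowest_th = std::min (lowest_th, th); // "ordered_min" on the poly_uint64s

    std::printf ("runtime versioning threshold: %u\n", lowest_th);  // 4
  }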
*/ ++ if (ordered_p (lowest_th, th)) ++ lowest_th = ordered_min (lowest_th, th); ++ } ++ else ++ delete loop_vinfo; ++ ++ /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is ++ enabled, SIMDUID is not set, it is the innermost loop and we have ++ either already found the loop's SIMDLEN or there was no SIMDLEN to ++ begin with. ++ TODO: Enable epilogue vectorization for loops with SIMDUID set. */ ++ vect_epilogues = (!simdlen ++ && loop->inner == NULL ++ && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK) ++ && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) ++ && !loop->simduid ++ /* For now only allow one epilogue loop, but allow ++ pick_lowest_cost_p to replace it. */ ++ && (first_loop_vinfo->epilogue_vinfos.is_empty () ++ || pick_lowest_cost_p)); ++ ++ /* Commit to first_loop_vinfo if we have no reason to try ++ alternatives. */ ++ if (!simdlen && !vect_epilogues && !pick_lowest_cost_p) ++ break; ++ } ++ else ++ { ++ delete loop_vinfo; ++ if (fatal) ++ { ++ gcc_checking_assert (first_loop_vinfo == NULL); ++ break; ++ } ++ } + +- if (next_size < vector_sizes.length () +- && known_eq (vector_sizes[next_size], autodetected_vector_size)) +- next_size += 1; ++ if (mode_i < vector_modes.length () ++ && VECTOR_MODE_P (autodetected_vector_mode) ++ && (related_vector_mode (vector_modes[mode_i], ++ GET_MODE_INNER (autodetected_vector_mode)) ++ == autodetected_vector_mode) ++ && (related_vector_mode (autodetected_vector_mode, ++ GET_MODE_INNER (vector_modes[mode_i])) ++ == vector_modes[mode_i])) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Skipping vector mode %s, which would" ++ " repeat the analysis for %s\n", ++ GET_MODE_NAME (vector_modes[mode_i]), ++ GET_MODE_NAME (autodetected_vector_mode)); ++ mode_i += 1; ++ } + +- if (fatal +- || next_size == vector_sizes.length () +- || known_eq (current_vector_size, 0U)) +- return opt_loop_vec_info::propagate_failure (res); ++ if (mode_i == vector_modes.length () ++ || autodetected_vector_mode == VOIDmode) ++ break; + + /* Try the next biggest vector size. */ +- current_vector_size = vector_sizes[next_size++]; ++ next_vector_mode = vector_modes[mode_i++]; + if (dump_enabled_p ()) +- { +- dump_printf_loc (MSG_NOTE, vect_location, +- "***** Re-trying analysis with " +- "vector size "); +- dump_dec (MSG_NOTE, current_vector_size); +- dump_printf (MSG_NOTE, "\n"); +- } ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Re-trying analysis with vector mode %s\n", ++ GET_MODE_NAME (next_vector_mode)); ++ } ++ ++ if (first_loop_vinfo) ++ { ++ loop->aux = (loop_vec_info) first_loop_vinfo; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Choosing vector mode %s\n", ++ GET_MODE_NAME (first_loop_vinfo->vector_mode)); ++ LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th; ++ return first_loop_vinfo; + } ++ ++ return opt_loop_vec_info::propagate_failure (res); + } + + /* Return true if there is an in-order reduction function for CODE, storing +@@ -2397,17 +2755,17 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) + + /* If there is a neutral value X such that SLP reduction NODE would not + be affected by the introduction of additional X elements, return that X, +- otherwise return null. CODE is the code of the reduction. REDUC_CHAIN +- is true if the SLP statements perform a single reduction, false if each +- statement performs an independent reduction. */ ++ otherwise return null. 
CODE is the code of the reduction and VECTOR_TYPE ++ is the vector type that would hold element X. REDUC_CHAIN is true if ++ the SLP statements perform a single reduction, false if each statement ++ performs an independent reduction. */ + + static tree +-neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, +- bool reduc_chain) ++neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, ++ tree_code code, bool reduc_chain) + { + vec stmts = SLP_TREE_SCALAR_STMTS (slp_node); + stmt_vec_info stmt_vinfo = stmts[0]; +- tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); + tree scalar_type = TREE_TYPE (vector_type); + struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; + gcc_assert (loop); +@@ -2453,241 +2811,55 @@ report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) + dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); + } + +-/* DEF_STMT_INFO occurs in a loop that contains a potential reduction +- operation. Return true if the results of DEF_STMT_INFO are something +- that can be accumulated by such a reduction. */ ++/* Return true if we need an in-order reduction for operation CODE ++ on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer ++ overflow must wrap. */ + +-static bool +-vect_valid_reduction_input_p (stmt_vec_info def_stmt_info) ++bool ++needs_fold_left_reduction_p (tree type, tree_code code) + { +- return (is_gimple_assign (def_stmt_info->stmt) +- || is_gimple_call (def_stmt_info->stmt) +- || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def +- || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI +- && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def +- && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt)))); +-} ++ /* CHECKME: check for !flag_finite_math_only too? */ ++ if (SCALAR_FLOAT_TYPE_P (type)) ++ switch (code) ++ { ++ case MIN_EXPR: ++ case MAX_EXPR: ++ return false; + +-/* Detect SLP reduction of the form: ++ default: ++ return !flag_associative_math; ++ } + +- #a1 = phi +- a2 = operation (a1) +- a3 = operation (a2) +- a4 = operation (a3) +- a5 = operation (a4) ++ if (INTEGRAL_TYPE_P (type)) ++ { ++ if (!operation_no_trapping_overflow (type, code)) ++ return true; ++ return false; ++ } + +- #a = phi ++ if (SAT_FIXED_POINT_TYPE_P (type)) ++ return true; + +- PHI is the reduction phi node (#a1 = phi above) +- FIRST_STMT is the first reduction stmt in the chain +- (a2 = operation (a1)). ++ return false; ++} + +- Return TRUE if a reduction chain was detected. */ ++/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and ++ has a handled computation expression. Store the main reduction ++ operation in *CODE. 
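The floating-point branch of needs_fold_left_reduction_p above exists because vectorizing a reduction reorders the additions, and floating-point addition is not associative; unless -fassociative-math permits the reordering, the reduction must stay in order (FOLD_LEFT_REDUCTION). A standalone demonstration of the underlying issue (not GCC code):

  // Standalone demonstration (not GCC code): vectorizing a float sum
  // reassociates the additions, which can change the result, so the reduction
  // has to stay in order unless -fassociative-math allows the reordering.
  #include <cstdio>

  int main ()
  {
    float a = 1.0e8f, b = -1.0e8f, c = 1.0f;
    float in_order = (a + b) + c;    // scalar loop order: 1
    float reordered = a + (b + c);   // a reassociated order: 0
    std::printf ("in order: %g, reassociated: %g\n", in_order, reordered);
  }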
*/ + + static bool +-vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, +- gimple *first_stmt) ++check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, ++ tree loop_arg, enum tree_code *code, ++ vec > &path) + { +- struct loop *loop = (gimple_bb (phi))->loop_father; +- struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); +- enum tree_code code; +- gimple *loop_use_stmt = NULL; +- stmt_vec_info use_stmt_info; +- tree lhs; +- imm_use_iterator imm_iter; +- use_operand_p use_p; +- int nloop_uses, size = 0, n_out_of_loop_uses; +- bool found = false; +- +- if (loop != vect_loop) +- return false; +- +- auto_vec reduc_chain; +- lhs = PHI_RESULT (phi); +- code = gimple_assign_rhs_code (first_stmt); +- while (1) +- { +- nloop_uses = 0; +- n_out_of_loop_uses = 0; +- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) +- { +- gimple *use_stmt = USE_STMT (use_p); +- if (is_gimple_debug (use_stmt)) +- continue; +- +- /* Check if we got back to the reduction phi. */ +- if (use_stmt == phi) +- { +- loop_use_stmt = use_stmt; +- found = true; +- break; +- } +- +- if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) +- { +- loop_use_stmt = use_stmt; +- nloop_uses++; +- } +- else +- n_out_of_loop_uses++; +- +- /* There are can be either a single use in the loop or two uses in +- phi nodes. */ +- if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) +- return false; +- } +- +- if (found) +- break; +- +- /* We reached a statement with no loop uses. */ +- if (nloop_uses == 0) +- return false; +- +- /* This is a loop exit phi, and we haven't reached the reduction phi. */ +- if (gimple_code (loop_use_stmt) == GIMPLE_PHI) +- return false; +- +- if (!is_gimple_assign (loop_use_stmt) +- || code != gimple_assign_rhs_code (loop_use_stmt) +- || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) +- return false; +- +- /* Insert USE_STMT into reduction chain. */ +- use_stmt_info = loop_info->lookup_stmt (loop_use_stmt); +- reduc_chain.safe_push (use_stmt_info); +- +- lhs = gimple_assign_lhs (loop_use_stmt); +- size++; +- } +- +- if (!found || loop_use_stmt != phi || size < 2) +- return false; +- +- /* Swap the operands, if needed, to make the reduction operand be the second +- operand. */ +- lhs = PHI_RESULT (phi); +- for (unsigned i = 0; i < reduc_chain.length (); ++i) +- { +- gassign *next_stmt = as_a (reduc_chain[i]->stmt); +- if (gimple_assign_rhs2 (next_stmt) == lhs) +- { +- tree op = gimple_assign_rhs1 (next_stmt); +- stmt_vec_info def_stmt_info = loop_info->lookup_def (op); +- +- /* Check that the other def is either defined in the loop +- ("vect_internal_def"), or it's an induction (defined by a +- loop-header phi-node). */ +- if (def_stmt_info +- && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) +- && vect_valid_reduction_input_p (def_stmt_info)) +- { +- lhs = gimple_assign_lhs (next_stmt); +- continue; +- } +- +- return false; +- } +- else +- { +- tree op = gimple_assign_rhs2 (next_stmt); +- stmt_vec_info def_stmt_info = loop_info->lookup_def (op); +- +- /* Check that the other def is either defined in the loop +- ("vect_internal_def"), or it's an induction (defined by a +- loop-header phi-node). 
*/ +- if (def_stmt_info +- && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) +- && vect_valid_reduction_input_p (def_stmt_info)) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G", +- next_stmt); +- +- swap_ssa_operands (next_stmt, +- gimple_assign_rhs1_ptr (next_stmt), +- gimple_assign_rhs2_ptr (next_stmt)); +- update_stmt (next_stmt); +- +- if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) +- LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; +- } +- else +- return false; +- } +- +- lhs = gimple_assign_lhs (next_stmt); +- } +- +- /* Build up the actual chain. */ +- for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) +- { +- REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; +- REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; +- } +- REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; +- REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; +- +- /* Save the chain for further analysis in SLP detection. */ +- LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); +- REDUC_GROUP_SIZE (reduc_chain[0]) = size; +- +- return true; +-} +- +-/* Return true if we need an in-order reduction for operation CODE +- on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer +- overflow must wrap. */ +- +-static bool +-needs_fold_left_reduction_p (tree type, tree_code code, +- bool need_wrapping_integral_overflow) +-{ +- /* CHECKME: check for !flag_finite_math_only too? */ +- if (SCALAR_FLOAT_TYPE_P (type)) +- switch (code) +- { +- case MIN_EXPR: +- case MAX_EXPR: +- return false; +- +- default: +- return !flag_associative_math; +- } +- +- if (INTEGRAL_TYPE_P (type)) +- { +- if (!operation_no_trapping_overflow (type, code)) +- return true; +- if (need_wrapping_integral_overflow +- && !TYPE_OVERFLOW_WRAPS (type) +- && operation_can_overflow (code)) +- return true; +- return false; +- } +- +- if (SAT_FIXED_POINT_TYPE_P (type)) +- return true; +- +- return false; +-} +- +-/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and +- reduction operation CODE has a handled computation expression. */ +- +-bool +-check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, +- tree loop_arg, enum tree_code code) +-{ +- auto_vec > path; +- auto_bitmap visited; +- tree lookfor = PHI_RESULT (phi); +- ssa_op_iter curri; +- use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); +- while (USE_FROM_PTR (curr) != loop_arg) +- curr = op_iter_next_use (&curri); +- curri.i = curri.numops; +- do ++ auto_bitmap visited; ++ tree lookfor = PHI_RESULT (phi); ++ ssa_op_iter curri; ++ use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); ++ while (USE_FROM_PTR (curr) != loop_arg) ++ curr = op_iter_next_use (&curri); ++ curri.i = curri.numops; ++ do + { + path.safe_push (std::make_pair (curri, curr)); + tree use = USE_FROM_PTR (curr); +@@ -2747,36 +2919,71 @@ pop: + /* Check whether the reduction path detected is valid. */ + bool fail = path.length () == 0; + bool neg = false; ++ *code = ERROR_MARK; + for (unsigned i = 1; i < path.length (); ++i) + { + gimple *use_stmt = USE_STMT (path[i].second); + tree op = USE_FROM_PTR (path[i].second); +- if (! has_single_use (op) +- || ! is_gimple_assign (use_stmt)) ++ if (! is_gimple_assign (use_stmt) ++ /* The following make sure we can compute the operand index ++ easily plus it mostly disallows chaining via COND_EXPR condition ++ operands. 
*/ ++ || (gimple_assign_rhs1 (use_stmt) != op ++ && gimple_assign_rhs2 (use_stmt) != op ++ && gimple_assign_rhs3 (use_stmt) != op)) + { + fail = true; + break; + } +- if (gimple_assign_rhs_code (use_stmt) != code) ++ /* Check there's only a single stmt the op is used on inside ++ of the loop. */ ++ imm_use_iterator imm_iter; ++ gimple *op_use_stmt; ++ unsigned cnt = 0; ++ FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) ++ if (!is_gimple_debug (op_use_stmt) ++ && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) ++ cnt++; ++ if (cnt != 1) + { +- if (code == PLUS_EXPR +- && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) +- { +- /* Track whether we negate the reduction value each iteration. */ +- if (gimple_assign_rhs2 (use_stmt) == op) +- neg = ! neg; +- } +- else +- { +- fail = true; +- break; +- } ++ fail = true; ++ break; ++ } ++ tree_code use_code = gimple_assign_rhs_code (use_stmt); ++ if (use_code == MINUS_EXPR) ++ { ++ use_code = PLUS_EXPR; ++ /* Track whether we negate the reduction value each iteration. */ ++ if (gimple_assign_rhs2 (use_stmt) == op) ++ neg = ! neg; ++ } ++ if (CONVERT_EXPR_CODE_P (use_code) ++ && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)), ++ TREE_TYPE (gimple_assign_rhs1 (use_stmt)))) ++ ; ++ else if (*code == ERROR_MARK) ++ *code = use_code; ++ else if (use_code != *code) ++ { ++ fail = true; ++ break; + } + } +- return ! fail && ! neg; ++ return ! fail && ! neg && *code != ERROR_MARK; ++} ++ ++bool ++check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, ++ tree loop_arg, enum tree_code code) ++{ ++ auto_vec > path; ++ enum tree_code code_; ++ return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path) ++ && code_ == code); + } + + ++ + /* Function vect_is_simple_reduction + + (1) Detect a cross-iteration def-use cycle that represents a simple +@@ -2823,25 +3030,15 @@ pop: + + static stmt_vec_info + vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, +- bool *double_reduc, +- bool need_wrapping_integral_overflow, +- enum vect_reduction_type *v_reduc_type) ++ bool *double_reduc) + { + gphi *phi = as_a (phi_info->stmt); +- struct loop *loop = (gimple_bb (phi))->loop_father; +- struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); +- bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); + gimple *phi_use_stmt = NULL; +- enum tree_code orig_code, code; +- tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; +- tree type; +- tree name; + imm_use_iterator imm_iter; + use_operand_p use_p; +- bool phi_def; + + *double_reduc = false; +- *v_reduc_type = TREE_CODE_REDUCTION; ++ STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; + + tree phi_name = PHI_RESULT (phi); + /* ??? If there are no uses of the PHI result the inner loop reduction +@@ -2850,6 +3047,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + can be constant. See PR60382. 
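The validation loop above accepts a reduction cycle only when every statement on the path uses one and the same operation, with two relaxations: a subtraction is folded into PLUS (while tracking whether the running value gets negated), and value-preserving conversions are stepped over. A simplified standalone model that omits the single in-loop use (cnt == 1) check (not GCC code; the Op enum is invented):

  // Simplified standalone model (not GCC code) of the path validation: all
  // statements on the cycle must use one operation, "res = res - x" is treated
  // as PLUS, and value-preserving conversions are stepped over.
  #include <cstdio>
  #include <vector>

  enum class Op { Plus, Minus, Mult, NopConvert };

  static bool
  valid_reduction_path (const std::vector<Op> &path, Op *code)
  {
    *code = Op::NopConvert;              // stands in for ERROR_MARK
    for (Op op : path)
      {
        if (op == Op::Minus)
          op = Op::Plus;                 // res -= x behaves like res += -x
        if (op == Op::NopConvert)
          continue;                      // tolerated, not part of the chain
        if (*code == Op::NopConvert)
          *code = op;                    // first real operation seen
        else if (op != *code)
          return false;                  // mixed operations are not handled
      }
    return *code != Op::NopConvert;
  }

  int main ()
  {
    Op code;
    std::vector<Op> ok = { Op::Plus, Op::Minus, Op::NopConvert };
    std::vector<Op> mixed = { Op::Plus, Op::Mult };
    std::printf ("%d %d\n", valid_reduction_path (ok, &code),
                 valid_reduction_path (mixed, &code));   // prints 1 0
  }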
*/ + if (has_zero_uses (phi_name)) + return NULL; ++ class loop *loop = (gimple_bb (phi))->loop_father; + unsigned nphi_def_loop_uses = 0; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) + { +@@ -2870,44 +3068,26 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + phi_use_stmt = use_stmt; + } + +- edge latch_e = loop_latch_edge (loop); +- tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); +- if (TREE_CODE (loop_arg) != SSA_NAME) ++ tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); ++ if (TREE_CODE (latch_def) != SSA_NAME) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "reduction: not ssa_name: %T\n", loop_arg); ++ "reduction: not ssa_name: %T\n", latch_def); + return NULL; + } + +- stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); ++ stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def); + if (!def_stmt_info + || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) + return NULL; + +- if (gassign *def_stmt = dyn_cast (def_stmt_info->stmt)) +- { +- name = gimple_assign_lhs (def_stmt); +- phi_def = false; +- } +- else if (gphi *def_stmt = dyn_cast (def_stmt_info->stmt)) +- { +- name = PHI_RESULT (def_stmt); +- phi_def = true; +- } +- else +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "reduction: unhandled reduction operation: %G", +- def_stmt_info->stmt); +- return NULL; +- } +- ++ bool nested_in_vect_loop ++ = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop); + unsigned nlatch_def_loop_uses = 0; + auto_vec lcphis; + bool inner_loop_of_double_reduc = false; +- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) +@@ -2925,11 +3105,21 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + } + } + ++ /* If we are vectorizing an inner reduction we are executing that ++ in the original order only in case we are not dealing with a ++ double reduction. */ ++ if (nested_in_vect_loop && !inner_loop_of_double_reduc) ++ { ++ if (dump_enabled_p ()) ++ report_vect_op (MSG_NOTE, def_stmt_info->stmt, ++ "detected nested cycle: "); ++ return def_stmt_info; ++ } ++ + /* If this isn't a nested cycle or if the nested cycle reduction value + is used ouside of the inner loop we cannot handle uses of the reduction + value. */ +- if ((!nested_in_vect_loop || inner_loop_of_double_reduc) +- && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)) ++ if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +@@ -2939,11 +3129,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + + /* If DEF_STMT is a phi node itself, we expect it to have a single argument + defined in the inner loop. */ +- if (phi_def) ++ if (gphi *def_stmt = dyn_cast (def_stmt_info->stmt)) + { +- gphi *def_stmt = as_a (def_stmt_info->stmt); +- op1 = PHI_ARG_DEF (def_stmt, 0); +- ++ tree op1 = PHI_ARG_DEF (def_stmt, 0); + if (gimple_phi_num_args (def_stmt) != 1 + || TREE_CODE (op1) != SSA_NAME) + { +@@ -2974,290 +3162,74 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + return NULL; + } + +- /* If we are vectorizing an inner reduction we are executing that +- in the original order only in case we are not dealing with a +- double reduction. 
*/ +- bool check_reduction = true; +- if (flow_loop_nested_p (vect_loop, loop)) +- { +- gphi *lcphi; +- unsigned i; +- check_reduction = false; +- FOR_EACH_VEC_ELT (lcphis, i, lcphi) +- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) +- { +- gimple *use_stmt = USE_STMT (use_p); +- if (is_gimple_debug (use_stmt)) +- continue; +- if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) +- check_reduction = true; +- } +- } +- +- gassign *def_stmt = as_a (def_stmt_info->stmt); +- code = orig_code = gimple_assign_rhs_code (def_stmt); +- +- if (nested_in_vect_loop && !check_reduction) +- { +- /* FIXME: Even for non-reductions code generation is funneled +- through vectorizable_reduction for the stmt defining the +- PHI latch value. So we have to artificially restrict ourselves +- for the supported operations. */ +- switch (get_gimple_rhs_class (code)) +- { +- case GIMPLE_BINARY_RHS: +- case GIMPLE_TERNARY_RHS: +- break; +- default: +- /* Not supported by vectorizable_reduction. */ +- if (dump_enabled_p ()) +- report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, +- "nested cycle: not handled operation: "); +- return NULL; +- } +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: "); +- return def_stmt_info; +- } +- +- /* We can handle "res -= x[i]", which is non-associative by +- simply rewriting this into "res += -x[i]". Avoid changing +- gimple instruction for the first simple tests and only do this +- if we're allowed to change code at all. */ +- if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) +- code = PLUS_EXPR; +- +- if (code == COND_EXPR) ++ /* Look for the expression computing latch_def from then loop PHI result. */ ++ auto_vec > path; ++ enum tree_code code; ++ if (check_reduction_path (vect_location, loop, phi, latch_def, &code, ++ path)) + { +- if (! nested_in_vect_loop) +- *v_reduc_type = COND_REDUCTION; ++ STMT_VINFO_REDUC_CODE (phi_info) = code; ++ if (code == COND_EXPR && !nested_in_vect_loop) ++ STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION; + +- op3 = gimple_assign_rhs1 (def_stmt); +- if (COMPARISON_CLASS_P (op3)) +- { +- op4 = TREE_OPERAND (op3, 1); +- op3 = TREE_OPERAND (op3, 0); +- } +- if (op3 == phi_name || op4 == phi_name) ++ /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP ++ reduction chain for which the additional restriction is that ++ all operations in the chain are the same. */ ++ auto_vec reduc_chain; ++ unsigned i; ++ bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR; ++ for (i = path.length () - 1; i >= 1; --i) + { +- if (dump_enabled_p ()) +- report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, +- "reduction: condition depends on previous" +- " iteration: "); +- return NULL; ++ gimple *stmt = USE_STMT (path[i].second); ++ stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); ++ STMT_VINFO_REDUC_IDX (stmt_info) ++ = path[i].second->use - gimple_assign_rhs1_ptr (stmt); ++ enum tree_code stmt_code = gimple_assign_rhs_code (stmt); ++ bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code) ++ && (i == 1 || i == path.length () - 1)); ++ if ((stmt_code != code && !leading_conversion) ++ /* We can only handle the final value in epilogue ++ generation for reduction chains. */ ++ || (i != 1 && !has_single_use (gimple_assign_lhs (stmt)))) ++ is_slp_reduc = false; ++ /* For reduction chains we support a trailing/leading ++ conversions. We do not store those in the actual chain. 
*/ ++ if (leading_conversion) ++ continue; ++ reduc_chain.safe_push (stmt_info); + } +- +- op1 = gimple_assign_rhs2 (def_stmt); +- op2 = gimple_assign_rhs3 (def_stmt); +- } +- else if (!commutative_tree_code (code) || !associative_tree_code (code)) +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, +- "reduction: not commutative/associative: "); +- return NULL; +- } +- else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) +- { +- op1 = gimple_assign_rhs1 (def_stmt); +- op2 = gimple_assign_rhs2 (def_stmt); +- } +- else +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, +- "reduction: not handled operation: "); +- return NULL; +- } +- +- if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, +- "reduction: both uses not ssa_names: "); +- +- return NULL; +- } +- +- type = TREE_TYPE (gimple_assign_lhs (def_stmt)); +- if ((TREE_CODE (op1) == SSA_NAME +- && !types_compatible_p (type,TREE_TYPE (op1))) +- || (TREE_CODE (op2) == SSA_NAME +- && !types_compatible_p (type, TREE_TYPE (op2))) +- || (op3 && TREE_CODE (op3) == SSA_NAME +- && !types_compatible_p (type, TREE_TYPE (op3))) +- || (op4 && TREE_CODE (op4) == SSA_NAME +- && !types_compatible_p (type, TREE_TYPE (op4)))) +- { +- if (dump_enabled_p ()) +- { +- dump_printf_loc (MSG_NOTE, vect_location, +- "reduction: multiple types: operation type: " +- "%T, operands types: %T,%T", +- type, TREE_TYPE (op1), TREE_TYPE (op2)); +- if (op3) +- dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3)); +- +- if (op4) +- dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4)); +- dump_printf (MSG_NOTE, "\n"); +- } +- +- return NULL; +- } +- +- /* Check whether it's ok to change the order of the computation. +- Generally, when vectorizing a reduction we change the order of the +- computation. This may change the behavior of the program in some +- cases, so we need to check that this is ok. One exception is when +- vectorizing an outer-loop: the inner-loop is executed sequentially, +- and therefore vectorizing reductions in the inner-loop during +- outer-loop vectorization is safe. */ +- if (check_reduction +- && *v_reduc_type == TREE_CODE_REDUCTION +- && needs_fold_left_reduction_p (type, code, +- need_wrapping_integral_overflow)) +- *v_reduc_type = FOLD_LEFT_REDUCTION; +- +- /* Reduction is safe. We're dealing with one of the following: +- 1) integer arithmetic and no trapv +- 2) floating point arithmetic, and special flags permit this optimization +- 3) nested cycle (i.e., outer loop vectorization). */ +- stmt_vec_info def1_info = loop_info->lookup_def (op1); +- stmt_vec_info def2_info = loop_info->lookup_def (op2); +- if (code != COND_EXPR && !def1_info && !def2_info) +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); +- return NULL; +- } +- +- /* Check that one def is the reduction def, defined by PHI, +- the other def is either defined in the loop ("vect_internal_def"), +- or it's an induction (defined by a loop-header phi-node). 
*/ +- +- if (def2_info +- && def2_info->stmt == phi +- && (code == COND_EXPR +- || !def1_info +- || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt)) +- || vect_valid_reduction_input_p (def1_info))) +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); +- return def_stmt_info; +- } +- +- if (def1_info +- && def1_info->stmt == phi +- && (code == COND_EXPR +- || !def2_info +- || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt)) +- || vect_valid_reduction_input_p (def2_info))) +- { +- if (! nested_in_vect_loop && orig_code != MINUS_EXPR) ++ if (is_slp_reduc && reduc_chain.length () > 1) + { +- /* Check if we can swap operands (just for simplicity - so that +- the rest of the code can assume that the reduction variable +- is always the last (second) argument). */ +- if (code == COND_EXPR) ++ for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) + { +- /* Swap cond_expr by inverting the condition. */ +- tree cond_expr = gimple_assign_rhs1 (def_stmt); +- enum tree_code invert_code = ERROR_MARK; +- enum tree_code cond_code = TREE_CODE (cond_expr); +- +- if (TREE_CODE_CLASS (cond_code) == tcc_comparison) +- { +- bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); +- invert_code = invert_tree_comparison (cond_code, honor_nans); +- } +- if (invert_code != ERROR_MARK) +- { +- TREE_SET_CODE (cond_expr, invert_code); +- swap_ssa_operands (def_stmt, +- gimple_assign_rhs2_ptr (def_stmt), +- gimple_assign_rhs3_ptr (def_stmt)); +- } +- else +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, +- "detected reduction: cannot swap operands " +- "for cond_expr"); +- return NULL; +- } ++ REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; ++ REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; + } +- else +- swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), +- gimple_assign_rhs2_ptr (def_stmt)); +- +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, +- "detected reduction: need to swap operands: "); +- +- if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) +- LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; +- } +- else +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); +- } ++ REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; ++ REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; + +- return def_stmt_info; +- } ++ /* Save the chain for further analysis in SLP detection. */ ++ LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); ++ REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); + +- /* Try to find SLP reduction chain. */ +- if (! nested_in_vect_loop +- && code != COND_EXPR +- && orig_code != MINUS_EXPR +- && vect_is_slp_reduction (loop_info, phi, def_stmt)) +- { +- if (dump_enabled_p ()) +- report_vect_op (MSG_NOTE, def_stmt, +- "reduction: detected reduction chain: "); ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reduction: detected reduction chain\n"); ++ } ++ else if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reduction: detected reduction\n"); + + return def_stmt_info; + } + +- /* Look for the expression computing loop_arg from loop PHI result. 
*/ +- if (check_reduction_path (vect_location, loop, phi, loop_arg, code)) +- return def_stmt_info; +- + if (dump_enabled_p ()) +- { +- report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, +- "reduction: unknown pattern: "); +- } ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reduction: unknown pattern\n"); + + return NULL; + } + +-/* Wrapper around vect_is_simple_reduction, which will modify code +- in-place if it enables detection of more reductions. Arguments +- as there. */ +- +-stmt_vec_info +-vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, +- bool *double_reduc, +- bool need_wrapping_integral_overflow) +-{ +- enum vect_reduction_type v_reduc_type; +- stmt_vec_info def_info +- = vect_is_simple_reduction (loop_info, phi_info, double_reduc, +- need_wrapping_integral_overflow, +- &v_reduc_type); +- if (def_info) +- { +- STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type; +- STMT_VINFO_REDUC_DEF (phi_info) = def_info; +- STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type; +- STMT_VINFO_REDUC_DEF (def_info) = phi_info; +- } +- return def_info; +-} +- + /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ + int + vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, +@@ -3601,7 +3573,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, + &vec_inside_cost, &vec_epilogue_cost); + + vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); +- ++ ++ /* Stash the costs so that we can compare two loop_vec_infos. */ ++ loop_vinfo->vec_inside_cost = vec_inside_cost; ++ loop_vinfo->vec_outside_cost = vec_outside_cost; ++ + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); +@@ -3846,6 +3822,7 @@ have_whole_vector_shift (machine_mode mode) + + static void + vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, ++ vect_reduction_type reduction_type, + int ncopies, stmt_vector_for_cost *cost_vec) + { + int prologue_cost = 0, epilogue_cost = 0, inside_cost; +@@ -3860,8 +3837,6 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, + loop = LOOP_VINFO_LOOP (loop_vinfo); + + /* Condition reductions generate two reductions in the loop. */ +- vect_reduction_type reduction_type +- = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); + if (reduction_type == COND_REDUCTION) + ncopies *= 2; + +@@ -4080,15 +4055,15 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies, + + A cost model should help decide between these two schemes. 
*/ + +-tree +-get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, ++static tree ++get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, ++ enum tree_code code, tree init_val, + tree *adjustment_def) + { + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + tree scalar_type = TREE_TYPE (init_val); +- tree vectype = get_vectype_for_scalar_type (scalar_type); +- enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt); ++ tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); + tree def_for_init; + tree init_def; + REAL_VALUE_TYPE real_init_val = dconst0; +@@ -4103,8 +4078,10 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, + gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) + || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); + +- vect_reduction_type reduction_type +- = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); ++ /* ADJUSTMENT_DEF is NULL when called from ++ vect_create_epilog_for_reduction to vectorize double reduction. */ ++ if (adjustment_def) ++ *adjustment_def = NULL; + + switch (code) + { +@@ -4118,11 +4095,6 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, + case MULT_EXPR: + case BIT_AND_EXPR: + { +- /* ADJUSTMENT_DEF is NULL when called from +- vect_create_epilog_for_reduction to vectorize double reduction. */ +- if (adjustment_def) +- *adjustment_def = init_val; +- + if (code == MULT_EXPR) + { + real_init_val = dconst1; +@@ -4137,10 +4109,14 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, + else + def_for_init = build_int_cst (scalar_type, int_init_val); + +- if (adjustment_def) +- /* Option1: the first element is '0' or '1' as well. */ +- init_def = gimple_build_vector_from_val (&stmts, vectype, +- def_for_init); ++ if (adjustment_def || operand_equal_p (def_for_init, init_val, 0)) ++ { ++ /* Option1: the first element is '0' or '1' as well. */ ++ if (!operand_equal_p (def_for_init, init_val, 0)) ++ *adjustment_def = init_val; ++ init_def = gimple_build_vector_from_val (&stmts, vectype, ++ def_for_init); ++ } + else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) + { + /* Option2 (variable length): the first element is INIT_VAL. */ +@@ -4164,16 +4140,6 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, + case MAX_EXPR: + case COND_EXPR: + { +- if (adjustment_def) +- { +- *adjustment_def = NULL_TREE; +- if (reduction_type != COND_REDUCTION +- && reduction_type != EXTRACT_LAST_REDUCTION) +- { +- init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo); +- break; +- } +- } + init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); + init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); + } +@@ -4201,6 +4167,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, + { + vec stmts = SLP_TREE_SCALAR_STMTS (slp_node); + stmt_vec_info stmt_vinfo = stmts[0]; ++ vec_info *vinfo = stmt_vinfo->vinfo; + unsigned HOST_WIDE_INT nunits; + unsigned j, number_of_places_left_in_vector; + tree vector_type; +@@ -4293,7 +4260,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, + { + /* First time round, duplicate ELTS to fill the + required number of vectors. 
*/ +- duplicate_and_interleave (&ctor_seq, vector_type, elts, ++ duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, + number_of_vectors, *vec_oprnds); + break; + } +@@ -4309,42 +4276,47 @@ get_initial_defs_for_reduction (slp_tree slp_node, + gsi_insert_seq_on_edge_immediate (pe, ctor_seq); + } + ++/* For a statement STMT_INFO taking part in a reduction operation return ++ the stmt_vec_info the meta information is stored on. */ + +-/* Function vect_create_epilog_for_reduction +- +- Create code at the loop-epilog to finalize the result of a reduction ++stmt_vec_info ++info_for_reduction (stmt_vec_info stmt_info) ++{ ++ stmt_info = vect_orig_stmt (stmt_info); ++ gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); ++ if (!is_a (stmt_info->stmt)) ++ stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); ++ gphi *phi = as_a (stmt_info->stmt); ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) ++ { ++ if (gimple_phi_num_args (phi) == 1) ++ stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); ++ } ++ else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) ++ { ++ edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father); ++ stmt_vec_info info ++ = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe)); ++ if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) ++ stmt_info = info; ++ } ++ return stmt_info; ++} ++ ++/* Function vect_create_epilog_for_reduction ++ ++ Create code at the loop-epilog to finalize the result of a reduction + computation. + +- VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector +- reduction statements. + STMT_INFO is the scalar reduction stmt that is being vectorized. +- NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the +- number of elements that we can fit in a vectype (nunits). In this case +- we have to generate more than one vector stmt - i.e - we need to "unroll" +- the vector stmt by a factor VF/nunits. For more details see documentation +- in vectorizable_operation. +- REDUC_FN is the internal function for the epilog reduction. +- REDUCTION_PHIS is a list of the phi-nodes that carry the reduction +- computation. +- REDUC_INDEX is the index of the operand in the right hand side of the +- statement that is defined by REDUCTION_PHI. +- DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. + SLP_NODE is an SLP node containing a group of reduction statements. The + first one in this group is STMT_INFO. +- INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case +- when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to +- be smaller than any value of the IV in the loop, for MIN_EXPR larger than +- any value of the IV in the loop. +- INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION. +- NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is +- null if this is not an SLP reduction ++ SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE ++ REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi ++ (counting from 0) + + This function: +- 1. Creates the reduction def-use cycles: sets the arguments for +- REDUCTION_PHIS: +- The loop-entry argument is the vectorized initial-value of the reduction. +- The loop-latch argument is taken from VECT_DEFS - the vector of partial +- sums. ++ 1. Completes the reduction def-use cycles. + 2. 
"Reduces" each vector of partial results VECT_DEFS into a single result, + by calling the function specified by REDUC_FN if available, or by + other means (whole-vector shifts or a scalar loop). +@@ -4354,7 +4326,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, + The flow at the entry to this function: + + loop: +- vec_def = phi # REDUCTION_PHI ++ vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT_INFO + s_loop = scalar_stmt # (scalar) STMT_INFO + loop_exit: +@@ -4379,21 +4351,34 @@ get_initial_defs_for_reduction (slp_tree slp_node, + */ + + static void +-vect_create_epilog_for_reduction (vec vect_defs, +- stmt_vec_info stmt_info, +- gimple *reduc_def_stmt, +- int ncopies, internal_fn reduc_fn, +- vec reduction_phis, +- bool double_reduc, ++vect_create_epilog_for_reduction (stmt_vec_info stmt_info, + slp_tree slp_node, +- slp_instance slp_node_instance, +- tree induc_val, enum tree_code induc_code, +- tree neutral_op) ++ slp_instance slp_node_instance) + { ++ stmt_vec_info reduc_info = info_for_reduction (stmt_info); ++ gcc_assert (reduc_info->is_reduc_info); ++ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ++ /* For double reductions we need to get at the inner loop reduction ++ stmt which has the meta info attached. Our stmt_info is that of the ++ loop-closed PHI of the inner loop which we remember as ++ def for the reduction PHI generation. */ ++ bool double_reduc = false; ++ stmt_vec_info rdef_info = stmt_info; ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) ++ { ++ gcc_assert (!slp_node); ++ double_reduc = true; ++ stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def ++ (stmt_info->stmt, 0)); ++ stmt_info = vect_stmt_to_vectorize (stmt_info); ++ } ++ gphi *reduc_def_stmt ++ = as_a (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt); ++ enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); ++ internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); + stmt_vec_info prev_phi_info; + tree vectype; + machine_mode mode; +- loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; + basic_block exit_bb; + tree scalar_dest; +@@ -4401,32 +4386,24 @@ vect_create_epilog_for_reduction (vec vect_defs, + gimple *new_phi = NULL, *phi; + stmt_vec_info phi_info; + gimple_stmt_iterator exit_gsi; +- tree vec_dest; +- tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; ++ tree new_temp = NULL_TREE, new_name, new_scalar_dest; + gimple *epilog_stmt = NULL; +- enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt); + gimple *exit_phi; + tree bitsize; +- tree adjustment_def = NULL; +- tree vec_initial_def = NULL; +- tree expr, def, initial_def = NULL; ++ tree def; + tree orig_name, scalar_result; + imm_use_iterator imm_iter, phi_imm_iter; + use_operand_p use_p, phi_use_p; + gimple *use_stmt; +- stmt_vec_info reduction_phi_info = NULL; + bool nested_in_vect_loop = false; + auto_vec new_phis; +- auto_vec inner_phis; + int j, i; + auto_vec scalar_results; +- unsigned int group_size = 1, k, ratio; +- auto_vec vec_initial_defs; ++ unsigned int group_size = 1, k; + auto_vec phis; + bool slp_reduc = false; + bool direct_slp_reduc; + tree new_phi_result; +- stmt_vec_info inner_phi = NULL; + tree induction_index = NULL_TREE; + + if (slp_node) +@@ -4439,127 +4416,53 @@ vect_create_epilog_for_reduction (vec vect_defs, + nested_in_vect_loop = true; + gcc_assert (!slp_node); + } ++ gcc_assert (!nested_in_vect_loop || double_reduc); + +- 
vectype = STMT_VINFO_VECTYPE (stmt_info); ++ vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); + gcc_assert (vectype); + mode = TYPE_MODE (vectype); + +- /* 1. Create the reduction def-use cycle: +- Set the arguments of REDUCTION_PHIS, i.e., transform +- +- loop: +- vec_def = phi # REDUCTION_PHI +- VECT_DEF = vector_stmt # vectorized form of STMT +- ... +- +- into: +- +- loop: +- vec_def = phi # REDUCTION_PHI +- VECT_DEF = vector_stmt # vectorized form of STMT +- ... +- +- (in case of SLP, do it for all the phis). */ +- +- /* Get the loop-entry arguments. */ +- enum vect_def_type initial_def_dt = vect_unknown_def_type; ++ tree initial_def = NULL; ++ tree induc_val = NULL_TREE; ++ tree adjustment_def = NULL; + if (slp_node) +- { +- unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); +- vec_initial_defs.reserve (vec_num); +- get_initial_defs_for_reduction (slp_node_instance->reduc_phis, +- &vec_initial_defs, vec_num, +- REDUC_GROUP_FIRST_ELEMENT (stmt_info), +- neutral_op); +- } ++ ; + else + { + /* Get at the scalar def before the loop, that defines the initial value + of the reduction variable. */ + initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, + loop_preheader_edge (loop)); +- /* Optimize: if initial_def is for REDUC_MAX smaller than the base +- and we can't use zero for induc_val, use initial_def. Similarly +- for REDUC_MIN and initial_def larger than the base. */ +- if (TREE_CODE (initial_def) == INTEGER_CST +- && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == INTEGER_INDUC_COND_REDUCTION) +- && !integer_zerop (induc_val) +- && ((induc_code == MAX_EXPR +- && tree_int_cst_lt (initial_def, induc_val)) +- || (induc_code == MIN_EXPR +- && tree_int_cst_lt (induc_val, initial_def)))) +- induc_val = initial_def; +- +- if (double_reduc) +- /* In case of double reduction we only create a vector variable +- to be put in the reduction phi node. The actual statement +- creation is done later in this function. */ +- vec_initial_def = vect_create_destination_var (initial_def, vectype); ++ /* Optimize: for induction condition reduction, if we can't use zero ++ for induc_val, use initial_def. */ ++ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) ++ induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); ++ else if (double_reduc) ++ ; + else if (nested_in_vect_loop) +- { +- /* Do not use an adjustment def as that case is not supported +- correctly if ncopies is not one. */ +- vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt); +- vec_initial_def = vect_get_vec_def_for_operand (initial_def, +- stmt_info); +- } ++ ; + else +- vec_initial_def +- = get_initial_def_for_reduction (stmt_info, initial_def, +- &adjustment_def); +- vec_initial_defs.create (1); +- vec_initial_defs.quick_push (vec_initial_def); ++ adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); + } + +- /* Set phi nodes arguments. */ +- FOR_EACH_VEC_ELT (reduction_phis, i, phi_info) ++ unsigned vec_num; ++ int ncopies; ++ if (slp_node) + { +- tree vec_init_def = vec_initial_defs[i]; +- tree def = vect_defs[i]; +- for (j = 0; j < ncopies; j++) +- { +- if (j != 0) +- { +- phi_info = STMT_VINFO_RELATED_STMT (phi_info); +- if (nested_in_vect_loop) +- vec_init_def +- = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def); +- } +- +- /* Set the loop-entry arg of the reduction-phi. */ +- +- gphi *phi = as_a (phi_info->stmt); +- if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == INTEGER_INDUC_COND_REDUCTION) +- { +- /* Initialise the reduction phi to zero. 
This prevents initial +- values of non-zero interferring with the reduction op. */ +- gcc_assert (ncopies == 1); +- gcc_assert (i == 0); +- +- tree vec_init_def_type = TREE_TYPE (vec_init_def); +- tree induc_val_vec +- = build_vector_from_val (vec_init_def_type, induc_val); +- +- add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop), +- UNKNOWN_LOCATION); +- } +- else +- add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop), +- UNKNOWN_LOCATION); +- +- /* Set the loop-latch arg for the reduction-phi. */ +- if (j > 0) +- def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def); +- +- add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); +- +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "transform reduction: created def-use cycle: %G%G", +- phi, SSA_NAME_DEF_STMT (def)); +- } ++ vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length (); ++ ncopies = 1; ++ } ++ else ++ { ++ vec_num = 1; ++ ncopies = 0; ++ phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt)); ++ do ++ { ++ ncopies++; ++ phi_info = STMT_VINFO_RELATED_STMT (phi_info); ++ } ++ while (phi_info); + } + + /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) +@@ -4569,7 +4472,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + The first match will be a 1 to allow 0 to be used for non-matching + indexes. If there are no matches at all then the vector will be all + zeroes. */ +- if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) ++ if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) + { + tree indx_before_incr, indx_after_incr; + poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); +@@ -4627,11 +4530,17 @@ vect_create_epilog_for_reduction (vec vect_defs, + tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); + + /* Create a conditional, where the condition is taken from vec_stmt +- (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and +- else is the phi (NEW_PHI_TREE). */ +- tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, +- ccompare, indx_before_incr, +- new_phi_tree); ++ (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR: ++ the reduction phi corresponds to NEW_PHI_TREE and the new values ++ correspond to INDEX_BEFORE_INCR. */ ++ gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1); ++ tree index_cond_expr; ++ if (STMT_VINFO_REDUC_IDX (stmt_info) == 2) ++ index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, ++ ccompare, indx_before_incr, new_phi_tree); ++ else ++ index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, ++ ccompare, new_phi_tree, indx_before_incr); + induction_index = make_ssa_name (cr_index_vector_type); + gimple *index_condition = gimple_build_assign (induction_index, + index_cond_expr); +@@ -4674,12 +4583,17 @@ vect_create_epilog_for_reduction (vec vect_defs, + /* 2.1 Create new loop-exit-phis to preserve loop-closed form: + v_out1 = phi + Store them in NEW_PHIS. */ +- ++ if (double_reduc) ++ loop = outer_loop; + exit_bb = single_exit (loop)->dest; + prev_phi_info = NULL; +- new_phis.create (vect_defs.length ()); +- FOR_EACH_VEC_ELT (vect_defs, i, def) ++ new_phis.create (slp_node ? 
vec_num : ncopies); ++ for (unsigned i = 0; i < vec_num; i++) + { ++ if (slp_node) ++ def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt); ++ else ++ def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt); + for (j = 0; j < ncopies; j++) + { + tree new_def = copy_ssa_name (def); +@@ -4698,37 +4612,6 @@ vect_create_epilog_for_reduction (vec vect_defs, + } + } + +- /* The epilogue is created for the outer-loop, i.e., for the loop being +- vectorized. Create exit phis for the outer loop. */ +- if (double_reduc) +- { +- loop = outer_loop; +- exit_bb = single_exit (loop)->dest; +- inner_phis.create (vect_defs.length ()); +- FOR_EACH_VEC_ELT (new_phis, i, phi) +- { +- stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); +- tree new_result = copy_ssa_name (PHI_RESULT (phi)); +- gphi *outer_phi = create_phi_node (new_result, exit_bb); +- SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, +- PHI_RESULT (phi)); +- prev_phi_info = loop_vinfo->add_stmt (outer_phi); +- inner_phis.quick_push (phi_info); +- new_phis[i] = outer_phi; +- while (STMT_VINFO_RELATED_STMT (phi_info)) +- { +- phi_info = STMT_VINFO_RELATED_STMT (phi_info); +- new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt)); +- outer_phi = create_phi_node (new_result, exit_bb); +- SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, +- PHI_RESULT (phi_info->stmt)); +- stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi); +- STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info; +- prev_phi_info = outer_phi_info; +- } +- } +- } +- + exit_gsi = gsi_after_labels (exit_bb); + + /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 +@@ -4747,12 +4630,6 @@ vect_create_epilog_for_reduction (vec vect_defs, + gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); + gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); + } +- +- code = gimple_assign_rhs_code (orig_stmt_info->stmt); +- /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, +- partial results are added and not subtracted. */ +- if (code == MINUS_EXPR) +- code = PLUS_EXPR; + + scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); + scalar_type = TREE_TYPE (scalar_dest); +@@ -4760,15 +4637,6 @@ vect_create_epilog_for_reduction (vec vect_defs, + new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); + bitsize = TYPE_SIZE (scalar_type); + +- /* In case this is a reduction in an inner-loop while vectorizing an outer +- loop - we don't need to extract a single scalar result at the end of the +- inner-loop (unless it is double reduction, i.e., the use of reduction is +- outside the outer-loop). The final vector of partial results will be used +- in the vectorized outer-loop, or reduced to a scalar result at the end of +- the outer-loop. */ +- if (nested_in_vect_loop && !double_reduc) +- goto vect_finalize_reduction; +- + /* SLP reduction without reduction chain, e.g., + # a1 = phi + # b1 = phi +@@ -4791,53 +4659,48 @@ vect_create_epilog_for_reduction (vec vect_defs, + one vector. 
*/ + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) + { ++ gimple_seq stmts = NULL; + tree first_vect = PHI_RESULT (new_phis[0]); +- gassign *new_vec_stmt = NULL; +- vec_dest = vect_create_destination_var (scalar_dest, vectype); ++ first_vect = gimple_convert (&stmts, vectype, first_vect); + for (k = 1; k < new_phis.length (); k++) + { + gimple *next_phi = new_phis[k]; + tree second_vect = PHI_RESULT (next_phi); +- tree tem = make_ssa_name (vec_dest, new_vec_stmt); +- new_vec_stmt = gimple_build_assign (tem, code, +- first_vect, second_vect); +- gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); +- first_vect = tem; ++ second_vect = gimple_convert (&stmts, vectype, second_vect); ++ first_vect = gimple_build (&stmts, code, vectype, ++ first_vect, second_vect); + } ++ gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + + new_phi_result = first_vect; +- if (new_vec_stmt) +- { +- new_phis.truncate (0); +- new_phis.safe_push (new_vec_stmt); +- } ++ new_phis.truncate (0); ++ new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + } + /* Likewise if we couldn't use a single defuse cycle. */ + else if (ncopies > 1) + { + gcc_assert (new_phis.length () == 1); ++ gimple_seq stmts = NULL; + tree first_vect = PHI_RESULT (new_phis[0]); +- gassign *new_vec_stmt = NULL; +- vec_dest = vect_create_destination_var (scalar_dest, vectype); ++ first_vect = gimple_convert (&stmts, vectype, first_vect); + stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); + for (int k = 1; k < ncopies; ++k) + { + next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); + tree second_vect = PHI_RESULT (next_phi_info->stmt); +- tree tem = make_ssa_name (vec_dest, new_vec_stmt); +- new_vec_stmt = gimple_build_assign (tem, code, +- first_vect, second_vect); +- gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); +- first_vect = tem; ++ second_vect = gimple_convert (&stmts, vectype, second_vect); ++ first_vect = gimple_build (&stmts, code, vectype, ++ first_vect, second_vect); + } ++ gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + new_phi_result = first_vect; + new_phis.truncate (0); +- new_phis.safe_push (new_vec_stmt); ++ new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + } + else + new_phi_result = PHI_RESULT (new_phis[0]); + +- if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION ++ if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION + && reduc_fn != IFN_LAST) + { + /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing +@@ -4852,8 +4715,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + tree index_vec_type = TREE_TYPE (induction_index); + gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); + tree index_scalar_type = TREE_TYPE (index_vec_type); +- tree index_vec_cmp_type = build_same_sized_truth_vector_type +- (index_vec_type); ++ tree index_vec_cmp_type = truth_type_for (index_vec_type); + + /* Get an unsigned integer version of the type of the data vector. */ + int scalar_precision +@@ -4946,7 +4808,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + scalar_results.safe_push (new_temp); + } +- else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION ++ else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION + && reduc_fn == IFN_LAST) + { + /* Condition reduction without supported IFN_REDUC_MAX. 
Generate +@@ -4989,7 +4851,6 @@ vect_create_epilog_for_reduction (vec vect_defs, + if (off != 0) + { + tree new_idx_val = idx_val; +- tree new_val = val; + if (off != v_size - el_size) + { + new_idx_val = make_ssa_name (idx_eltype); +@@ -4998,7 +4859,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + old_idx_val); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } +- new_val = make_ssa_name (data_eltype); ++ tree new_val = make_ssa_name (data_eltype); + epilog_stmt = gimple_build_assign (new_val, + COND_EXPR, + build2 (GT_EXPR, +@@ -5060,9 +4921,8 @@ vect_create_epilog_for_reduction (vec vect_defs, + gimple_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + +- if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == INTEGER_INDUC_COND_REDUCTION) +- && !operand_equal_p (initial_def, induc_val, 0)) ++ if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) ++ && induc_val) + { + /* Earlier we set the initial value to be a vector if induc_val + values. Check the result and if it is induc_val then replace +@@ -5100,7 +4960,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + tree index = build_index_vector (vectype, 0, 1); + tree index_type = TREE_TYPE (index); + tree index_elt_type = TREE_TYPE (index_type); +- tree mask_type = build_same_sized_truth_vector_type (index_type); ++ tree mask_type = truth_type_for (index_type); + + /* Create a vector that, for each element, identifies which of + the REDUC_GROUP_SIZE results should use it. */ +@@ -5112,6 +4972,14 @@ vect_create_epilog_for_reduction (vec vect_defs, + scalar value if we have one, otherwise the initial scalar value + is itself a neutral value. */ + tree vector_identity = NULL_TREE; ++ tree neutral_op = NULL_TREE; ++ if (slp_node) ++ { ++ stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); ++ neutral_op ++ = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, ++ vectype, code, first != NULL); ++ } + if (neutral_op) + vector_identity = gimple_build_vector_from_val (&seq, vectype, + neutral_op); +@@ -5161,32 +5029,19 @@ vect_create_epilog_for_reduction (vec vect_defs, + bool reduce_with_shift; + tree vec_temp; + +- /* COND reductions all do the final reduction with MAX_EXPR +- or MIN_EXPR. */ +- if (code == COND_EXPR) +- { +- if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == INTEGER_INDUC_COND_REDUCTION) +- code = induc_code; +- else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == CONST_COND_REDUCTION) +- code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); +- else +- code = MAX_EXPR; +- } +- + /* See if the target wants to do the final (shift) reduction + in a vector mode of smaller size and first reduce upper/lower + halves against each other. 
*/ + enum machine_mode mode1 = mode; +- tree vectype1 = vectype; +- unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); +- unsigned sz1 = sz; ++ tree stype = TREE_TYPE (vectype); ++ unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); ++ unsigned nunits1 = nunits; + if (!slp_reduc + && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) +- sz1 = GET_MODE_SIZE (mode1).to_constant (); ++ nunits1 = GET_MODE_NUNITS (mode1).to_constant (); + +- vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); ++ tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), ++ stype, nunits1); + reduce_with_shift = have_whole_vector_shift (mode1); + if (!VECTOR_MODE_P (mode1)) + reduce_with_shift = false; +@@ -5200,11 +5055,13 @@ vect_create_epilog_for_reduction (vec vect_defs, + /* First reduce the vector to the desired vector size we should + do shift reduction on by combining upper and lower halves. */ + new_temp = new_phi_result; +- while (sz > sz1) ++ while (nunits > nunits1) + { + gcc_assert (!slp_reduc); +- sz /= 2; +- vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); ++ nunits /= 2; ++ vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), ++ stype, nunits); ++ unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); + + /* The target has to make sure we support lowpart/highpart + extraction, either via direct vector extract or through +@@ -5229,15 +5086,14 @@ vect_create_epilog_for_reduction (vec vect_defs, + = gimple_build_assign (dst2, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), +- bitsize_int (sz * BITS_PER_UNIT))); ++ bitsize_int (bitsize))); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } + else + { + /* Extract via punning to appropriately sized integer mode + vector. 
*/ +- tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, +- 1); ++ tree eltype = build_nonstandard_integer_type (bitsize, 1); + tree etype = build_vector_type (eltype, 2); + gcc_assert (convert_optab_handler (vec_extract_optab, + TYPE_MODE (etype), +@@ -5266,7 +5122,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), +- bitsize_int (sz * BITS_PER_UNIT))); ++ bitsize_int (bitsize))); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + dst2 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, +@@ -5307,8 +5163,8 @@ vect_create_epilog_for_reduction (vec vect_defs, + dump_printf_loc (MSG_NOTE, vect_location, + "Reduce using vector shifts\n"); + +- mode1 = TYPE_MODE (vectype1); +- vec_dest = vect_create_destination_var (scalar_dest, vectype1); ++ gimple_seq stmts = NULL; ++ new_temp = gimple_convert (&stmts, vectype1, new_temp); + for (elt_offset = nelements / 2; + elt_offset >= 1; + elt_offset /= 2) +@@ -5316,18 +5172,12 @@ vect_create_epilog_for_reduction (vec vect_defs, + calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); + indices.new_vector (sel, 2, nelements); + tree mask = vect_gen_perm_mask_any (vectype1, indices); +- epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, +- new_temp, zero_vec, mask); +- new_name = make_ssa_name (vec_dest, epilog_stmt); +- gimple_assign_set_lhs (epilog_stmt, new_name); +- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); +- +- epilog_stmt = gimple_build_assign (vec_dest, code, new_name, +- new_temp); +- new_temp = make_ssa_name (vec_dest, epilog_stmt); +- gimple_assign_set_lhs (epilog_stmt, new_temp); +- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); ++ new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1, ++ new_temp, zero_vec, mask); ++ new_temp = gimple_build (&stmts, code, ++ vectype1, new_name, new_temp); + } ++ gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + + /* 2.4 Extract the final scalar result. Create: + s_out3 = extract_field */ +@@ -5439,9 +5289,8 @@ vect_create_epilog_for_reduction (vec vect_defs, + scalar_results.safe_push (new_temp); + } + +- if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == INTEGER_INDUC_COND_REDUCTION) +- && !operand_equal_p (initial_def, induc_val, 0)) ++ if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) ++ && induc_val) + { + /* Earlier we set the initial value to be a vector if induc_val + values. Check the result and if it is induc_val then replace +@@ -5457,12 +5306,7 @@ vect_create_epilog_for_reduction (vec vect_defs, + scalar_results[0] = tmp; + } + } +- +-vect_finalize_reduction: +- +- if (double_reduc) +- loop = loop->inner; +- ++ + /* 2.5 Adjust the final result by the initial value of the reduction + variable. (When such adjustment is not needed, then + 'adjustment_def' is zero). 
For example, if code is PLUS we create: +@@ -5471,25 +5315,26 @@ vect_finalize_reduction: + if (adjustment_def) + { + gcc_assert (!slp_reduc); ++ gimple_seq stmts = NULL; + if (nested_in_vect_loop) + { + new_phi = new_phis[0]; +- gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); +- expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); +- new_dest = vect_create_destination_var (scalar_dest, vectype); ++ gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); ++ adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); ++ new_temp = gimple_build (&stmts, code, vectype, ++ PHI_RESULT (new_phi), adjustment_def); + } + else + { + new_temp = scalar_results[0]; + gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); +- expr = build2 (code, scalar_type, new_temp, adjustment_def); +- new_dest = vect_create_destination_var (scalar_dest, scalar_type); ++ adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def); ++ new_temp = gimple_build (&stmts, code, scalar_type, ++ new_temp, adjustment_def); + } + +- epilog_stmt = gimple_build_assign (new_dest, expr); +- new_temp = make_ssa_name (new_dest, epilog_stmt); +- gimple_assign_set_lhs (epilog_stmt, new_temp); +- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); ++ epilog_stmt = gimple_seq_last_stmt (stmts); ++ gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); + if (nested_in_vect_loop) + { + stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); +@@ -5507,6 +5352,9 @@ vect_finalize_reduction: + new_phis[0] = epilog_stmt; + } + ++ if (double_reduc) ++ loop = loop->inner; ++ + /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit + phis with new adjusted scalar results, i.e., replace use + with use . +@@ -5552,24 +5400,10 @@ vect_finalize_reduction: + correspond to the first vector stmt, etc. + (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ + if (group_size > new_phis.length ()) +- { +- ratio = group_size / new_phis.length (); +- gcc_assert (!(group_size % new_phis.length ())); +- } +- else +- ratio = 1; ++ gcc_assert (!(group_size % new_phis.length ())); + +- stmt_vec_info epilog_stmt_info = NULL; + for (k = 0; k < group_size; k++) + { +- if (k % ratio == 0) +- { +- epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]); +- reduction_phi_info = reduction_phis[k / ratio]; +- if (double_reduc) +- inner_phi = inner_phis[k / ratio]; +- } +- + if (slp_reduc) + { + stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; +@@ -5580,121 +5414,12 @@ vect_finalize_reduction: + scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); + } + +- phis.create (3); +- /* Find the loop-closed-use at the loop exit of the original scalar +- result. (The reduction result is expected to have two immediate uses - +- one at the latch block, and one at the loop exit). */ +- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) +- if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) +- && !is_gimple_debug (USE_STMT (use_p))) +- phis.safe_push (USE_STMT (use_p)); +- +- /* While we expect to have found an exit_phi because of loop-closed-ssa +- form we can end up without one if the scalar cycle is dead. 
*/ +- +- FOR_EACH_VEC_ELT (phis, i, exit_phi) +- { +- if (outer_loop) +- { +- stmt_vec_info exit_phi_vinfo +- = loop_vinfo->lookup_stmt (exit_phi); +- gphi *vect_phi; +- +- if (double_reduc) +- STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; +- else +- STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info; +- if (!double_reduc +- || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) +- != vect_double_reduction_def) +- continue; +- +- /* Handle double reduction: +- +- stmt1: s1 = phi - double reduction phi (outer loop) +- stmt2: s3 = phi - (regular) reduc phi (inner loop) +- stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) +- stmt4: s2 = phi - double reduction stmt (outer loop) +- +- At that point the regular reduction (stmt2 and stmt3) is +- already vectorized, as well as the exit phi node, stmt4. +- Here we vectorize the phi node of double reduction, stmt1, and +- update all relevant statements. */ +- +- /* Go through all the uses of s2 to find double reduction phi +- node, i.e., stmt1 above. */ +- orig_name = PHI_RESULT (exit_phi); +- FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) +- { +- stmt_vec_info use_stmt_vinfo; +- tree vect_phi_init, preheader_arg, vect_phi_res; +- basic_block bb = gimple_bb (use_stmt); +- +- /* Check that USE_STMT is really double reduction phi +- node. */ +- if (gimple_code (use_stmt) != GIMPLE_PHI +- || gimple_phi_num_args (use_stmt) != 2 +- || bb->loop_father != outer_loop) +- continue; +- use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt); +- if (!use_stmt_vinfo +- || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) +- != vect_double_reduction_def) +- continue; +- +- /* Create vector phi node for double reduction: +- vs1 = phi +- vs1 was created previously in this function by a call to +- vect_get_vec_def_for_operand and is stored in +- vec_initial_def; +- vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; +- vs0 is created here. */ +- +- /* Create vector phi node. */ +- vect_phi = create_phi_node (vec_initial_def, bb); +- loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi); +- +- /* Create vs0 - initial def of the double reduction phi. */ +- preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, +- loop_preheader_edge (outer_loop)); +- vect_phi_init = get_initial_def_for_reduction +- (stmt_info, preheader_arg, NULL); +- +- /* Update phi node arguments with vs0 and vs2. */ +- add_phi_arg (vect_phi, vect_phi_init, +- loop_preheader_edge (outer_loop), +- UNKNOWN_LOCATION); +- add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt), +- loop_latch_edge (outer_loop), UNKNOWN_LOCATION); +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "created double reduction phi node: %G", +- vect_phi); +- +- vect_phi_res = PHI_RESULT (vect_phi); +- +- /* Replace the use, i.e., set the correct vs1 in the regular +- reduction phi node. FORNOW, NCOPIES is always 1, so the +- loop is redundant. 
*/ +- stmt_vec_info use_info = reduction_phi_info; +- for (j = 0; j < ncopies; j++) +- { +- edge pr_edge = loop_preheader_edge (loop); +- SET_PHI_ARG_DEF (as_a (use_info->stmt), +- pr_edge->dest_idx, vect_phi_res); +- use_info = STMT_VINFO_RELATED_STMT (use_info); +- } +- } +- } +- } +- +- phis.release (); + if (nested_in_vect_loop) + { + if (double_reduc) + loop = outer_loop; + else +- continue; ++ gcc_unreachable (); + } + + phis.create (3); +@@ -5824,9 +5549,6 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, + gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); + gcc_assert (ncopies == 1); + gcc_assert (TREE_CODE_LENGTH (code) == binary_op); +- gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1)); +- gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- == FOLD_LEFT_REDUCTION); + + if (slp_node) + gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), +@@ -5840,10 +5562,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, + if (slp_node) + { + auto_vec > vec_defs (2); +- auto_vec sops(2); +- sops.quick_push (ops[0]); +- sops.quick_push (ops[1]); +- vect_get_slp_defs (sops, slp_node, &vec_defs); ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]); + vec_defs[0].release (); + vec_defs[1].release (); +@@ -5984,6 +5703,55 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) + <= TYPE_PRECISION (lhs_type)); + } + ++/* Check if masking can be supported by inserting a conditional expression. ++ CODE is the code for the operation. COND_FN is the conditional internal ++ function, if it exists. VECTYPE_IN is the type of the vector input. */ ++static bool ++use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, ++ tree vectype_in) ++{ ++ if (cond_fn != IFN_LAST ++ && direct_internal_fn_supported_p (cond_fn, vectype_in, ++ OPTIMIZE_FOR_SPEED)) ++ return false; ++ ++ switch (code) ++ { ++ case DOT_PROD_EXPR: ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++/* Insert a conditional expression to enable masked vectorization. CODE is the ++ code for the operation. VOP is the array of operands. MASK is the loop ++ mask. GSI is a statement iterator used to place the new conditional ++ expression. */ ++static void ++build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, ++ gimple_stmt_iterator *gsi) ++{ ++ switch (code) ++ { ++ case DOT_PROD_EXPR: ++ { ++ tree vectype = TREE_TYPE (vop[1]); ++ tree zero = build_zero_cst (vectype); ++ tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); ++ gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, ++ mask, vop[1], zero); ++ gsi_insert_before (gsi, select, GSI_SAME_STMT); ++ vop[1] = masked_op1; ++ break; ++ } ++ ++ default: ++ gcc_unreachable (); ++ } ++} ++ + /* Function vectorizable_reduction. + + Check if STMT_INFO performs a reduction operation that can be vectorized. +@@ -6027,182 +5795,163 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) + corresponds to the type of arguments to the reduction stmt, and should *NOT* + be used to create the vectorized stmt. 
The right vectype for the vectorized + stmt is obtained from the type of the result X: +- get_vectype_for_scalar_type (TREE_TYPE (X)) ++ get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) + + This means that, contrary to "regular" reductions (or "regular" stmts in + general), the following equation: +- STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) ++ STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) + does *NOT* necessarily hold for reduction patterns. */ + + bool +-vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, +- stmt_vec_info *vec_stmt, slp_tree slp_node, ++vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node, + slp_instance slp_node_instance, + stmt_vector_for_cost *cost_vec) + { +- tree vec_dest; + tree scalar_dest; +- tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + tree vectype_in = NULL_TREE; + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); +- enum tree_code code, orig_code; +- internal_fn reduc_fn; +- machine_mode vec_mode; +- int op_type; +- optab optab; +- tree new_temp = NULL_TREE; +- enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; ++ enum vect_def_type cond_reduc_dt = vect_unknown_def_type; + stmt_vec_info cond_stmt_vinfo = NULL; +- enum tree_code cond_reduc_op_code = ERROR_MARK; + tree scalar_type; +- bool is_simple_use; + int i; + int ncopies; +- int epilog_copies; +- stmt_vec_info prev_stmt_info, prev_phi_info; + bool single_defuse_cycle = false; +- stmt_vec_info new_stmt_info = NULL; +- int j; +- tree ops[3]; +- enum vect_def_type dts[3]; +- bool nested_cycle = false, found_nested_cycle_def = false; ++ bool nested_cycle = false; + bool double_reduc = false; +- basic_block def_bb; +- struct loop * def_stmt_loop; +- tree def_arg; +- auto_vec vec_oprnds0; +- auto_vec vec_oprnds1; +- auto_vec vec_oprnds2; +- auto_vec vect_defs; +- auto_vec phis; + int vec_num; +- tree def0, tem; ++ tree tem; + tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; + tree cond_reduc_val = NULL_TREE; + + /* Make sure it was already recognized as a reduction computation. */ + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def ++ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) + return false; + +- if (nested_in_vect_loop_p (loop, stmt_info)) ++ /* The stmt we store reduction analysis meta on. */ ++ stmt_vec_info reduc_info = info_for_reduction (stmt_info); ++ reduc_info->is_reduc_info = true; ++ ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) + { +- loop = loop->inner; +- nested_cycle = true; ++ if (is_a (stmt_info->stmt)) ++ { ++ /* Analysis for double-reduction is done on the outer ++ loop PHI, nested cycles have no further restrictions. */ ++ STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; ++ /* For nested cycles we want to let regular vectorizable_* ++ routines handle code-generation. 
*/ ++ if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_double_reduction_def) ++ { ++ stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); ++ STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def; ++ STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (stmt_info)) ++ = vect_internal_def; ++ } ++ } ++ else ++ STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; ++ return true; + } + +- if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) +- gcc_assert (slp_node +- && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); +- +- if (gphi *phi = dyn_cast (stmt_info->stmt)) ++ stmt_vec_info orig_stmt_of_analysis = stmt_info; ++ stmt_vec_info phi_info = stmt_info; ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def ++ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) + { +- tree phi_result = gimple_phi_result (phi); +- /* Analysis is fully done on the reduction stmt invocation. */ +- if (! vec_stmt) ++ if (!is_a (stmt_info->stmt)) + { +- if (slp_node) +- slp_node_instance->reduc_phis = slp_node; +- + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; + } +- +- if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) +- /* Leave the scalar phi in place. Note that checking +- STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works +- for reductions involving a single statement. */ +- return true; +- +- stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); +- reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); +- +- if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info) +- == EXTRACT_LAST_REDUCTION) +- /* Leave the scalar phi in place. */ +- return true; +- +- gassign *reduc_stmt = as_a (reduc_stmt_info->stmt); +- code = gimple_assign_rhs_code (reduc_stmt); +- for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) ++ if (slp_node) + { +- tree op = gimple_op (reduc_stmt, k); +- if (op == phi_result) +- continue; +- if (k == 1 && code == COND_EXPR) +- continue; +- bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt); +- gcc_assert (is_simple_use); +- if (dt == vect_constant_def || dt == vect_external_def) +- continue; +- if (!vectype_in +- || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) +- < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) +- vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); +- break; ++ slp_node_instance->reduc_phis = slp_node; ++ /* ??? We're leaving slp_node to point to the PHIs, we only ++ need it to get at the number of vector stmts which wasn't ++ yet initialized for the instance root. */ + } +- /* For a nested cycle we might end up with an operation like +- phi_result * phi_result. */ +- if (!vectype_in) +- vectype_in = STMT_VINFO_VECTYPE (stmt_info); +- gcc_assert (vectype_in); +- +- if (slp_node) +- ncopies = 1; +- else +- ncopies = vect_get_num_copies (loop_vinfo, vectype_in); +- +- stmt_vec_info use_stmt_info; +- if (ncopies > 1 +- && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live +- && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result)) +- && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info) +- single_defuse_cycle = true; +- +- /* Create the destination vector */ +- scalar_dest = gimple_assign_lhs (reduc_stmt); +- vec_dest = vect_create_destination_var (scalar_dest, vectype_out); +- +- if (slp_node) +- /* The size vect_schedule_slp_instance computes is off for us. 
*/ +- vec_num = vect_get_num_vectors +- (LOOP_VINFO_VECT_FACTOR (loop_vinfo) +- * SLP_TREE_SCALAR_STMTS (slp_node).length (), +- vectype_in); +- else +- vec_num = 1; ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) ++ stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); ++ else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */ ++ { ++ use_operand_p use_p; ++ gimple *use_stmt; ++ bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), ++ &use_p, &use_stmt); ++ gcc_assert (res); ++ phi_info = loop_vinfo->lookup_stmt (use_stmt); ++ stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); ++ } ++ } + +- /* Generate the reduction PHIs upfront. */ +- prev_phi_info = NULL; +- for (j = 0; j < ncopies; j++) ++ /* PHIs should not participate in patterns. */ ++ gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); ++ gphi *reduc_def_phi = as_a (phi_info->stmt); ++ ++ /* Verify following REDUC_IDX from the latch def leads us back to the PHI ++ and compute the reduction chain length. */ ++ tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, ++ loop_latch_edge (loop)); ++ unsigned reduc_chain_length = 0; ++ bool only_slp_reduc_chain = true; ++ stmt_info = NULL; ++ while (reduc_def != PHI_RESULT (reduc_def_phi)) ++ { ++ stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); ++ stmt_vec_info vdef = vect_stmt_to_vectorize (def); ++ if (STMT_VINFO_REDUC_IDX (vdef) == -1) + { +- if (j == 0 || !single_defuse_cycle) ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "reduction chain broken by patterns.\n"); ++ return false; ++ } ++ if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) ++ only_slp_reduc_chain = false; ++ /* ??? For epilogue generation live members of the chain need ++ to point back to the PHI via their original stmt for ++ info_for_reduction to work. */ ++ if (STMT_VINFO_LIVE_P (vdef)) ++ STMT_VINFO_REDUC_DEF (def) = phi_info; ++ if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (vdef->stmt))) ++ { ++ if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (vdef->stmt)), ++ TREE_TYPE (gimple_assign_rhs1 (vdef->stmt)))) + { +- for (i = 0; i < vec_num; i++) +- { +- /* Create the reduction-phi that defines the reduction +- operand. */ +- gimple *new_phi = create_phi_node (vec_dest, loop->header); +- stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); +- +- if (slp_node) +- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); +- else +- { +- if (j == 0) +- STMT_VINFO_VEC_STMT (stmt_info) +- = *vec_stmt = new_phi_info; +- else +- STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; +- prev_phi_info = new_phi_info; +- } +- } ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "conversion in the reduction chain.\n"); ++ return false; + } + } ++ else if (!stmt_info) ++ /* First non-conversion stmt. */ ++ stmt_info = vdef; ++ reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef)); ++ reduc_chain_length++; ++ } ++ /* PHIs should not participate in patterns. */ ++ gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); + +- return true; ++ if (nested_in_vect_loop_p (loop, stmt_info)) ++ { ++ loop = loop->inner; ++ nested_cycle = true; ++ } ++ ++ /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last ++ element. 
*/ ++ if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) ++ { ++ gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); ++ stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); + } ++ if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) ++ gcc_assert (slp_node ++ && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); + + /* 1. Is vectorizable reduction? */ + /* Not supportable if the reduction variable is used in the loop, unless +@@ -6235,37 +5984,13 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + inside the loop body. The last operand is the reduction variable, + which is defined by the loop-header-phi. */ + ++ tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); ++ STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; + gassign *stmt = as_a (stmt_info->stmt); +- +- /* Flatten RHS. */ +- switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) +- { +- case GIMPLE_BINARY_RHS: +- code = gimple_assign_rhs_code (stmt); +- op_type = TREE_CODE_LENGTH (code); +- gcc_assert (op_type == binary_op); +- ops[0] = gimple_assign_rhs1 (stmt); +- ops[1] = gimple_assign_rhs2 (stmt); +- break; +- +- case GIMPLE_TERNARY_RHS: +- code = gimple_assign_rhs_code (stmt); +- op_type = TREE_CODE_LENGTH (code); +- gcc_assert (op_type == ternary_op); +- ops[0] = gimple_assign_rhs1 (stmt); +- ops[1] = gimple_assign_rhs2 (stmt); +- ops[2] = gimple_assign_rhs3 (stmt); +- break; +- +- case GIMPLE_UNARY_RHS: +- return false; +- +- default: +- gcc_unreachable (); +- } +- +- if (code == COND_EXPR && slp_node) +- return false; ++ enum tree_code code = gimple_assign_rhs_code (stmt); ++ bool lane_reduc_code_p ++ = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR); ++ int op_type = TREE_CODE_LENGTH (code); + + scalar_dest = gimple_assign_lhs (stmt); + scalar_type = TREE_TYPE (scalar_dest); +@@ -6277,67 +6002,65 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (!type_has_mode_precision_p (scalar_type)) + return false; + ++ /* For lane-reducing ops we're reducing the number of reduction PHIs ++ which means the only use of that may be in the lane-reducing operation. */ ++ if (lane_reduc_code_p ++ && reduc_chain_length != 1 ++ && !only_slp_reduc_chain) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "lane-reducing reduction with extra stmts.\n"); ++ return false; ++ } ++ + /* All uses but the last are expected to be defined in the loop. + The last use is the reduction variable. In case of nested cycle this + assumption is not true: we use reduc_index to record the index of the + reduction variable. */ +- stmt_vec_info reduc_def_info; +- if (orig_stmt_info) +- reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info); +- else +- reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info); +- gcc_assert (reduc_def_info); +- gphi *reduc_def_phi = as_a (reduc_def_info->stmt); +- tree reduc_def = PHI_RESULT (reduc_def_phi); +- int reduc_index = -1; ++ reduc_def = PHI_RESULT (reduc_def_phi); + for (i = 0; i < op_type; i++) + { ++ tree op = gimple_op (stmt, i + 1); + /* The condition of COND_EXPR is checked in vectorizable_condition(). 
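lane_reduc_code_p above singles out DOT_PROD_EXPR, WIDEN_SUM_EXPR and SAD_EXPR because one such vector statement already folds several narrow input lanes into a single wide accumulator lane, which is why the number of reduction PHIs differs from a plain reduction. A scalar sketch of what one 4-lane DOT_PROD/SAD step computes; the function names are illustrative only:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    /* One lane-reducing vector statement folds N narrow lanes into a single
       wide accumulator lane; model one step with N = 4 (4 x int8 -> int32).  */
    static int32_t
    dot_prod_step (const int8_t a[4], const int8_t b[4], int32_t acc)
    {
      for (int i = 0; i < 4; i++)
        acc += (int32_t) a[i] * (int32_t) b[i];
      return acc;
    }

    static int32_t
    sad_step (const uint8_t a[4], const uint8_t b[4], int32_t acc)
    {
      for (int i = 0; i < 4; i++)
        acc += std::abs ((int32_t) a[i] - (int32_t) b[i]);
      return acc;
    }

    int
    main ()
    {
      int8_t a[4] = { 1, 2, 3, 4 }, b[4] = { 5, 6, 7, 8 };
      uint8_t c[4] = { 10, 2, 30, 4 }, d[4] = { 1, 20, 3, 40 };
      printf ("dot %d, sad %d\n", dot_prod_step (a, b, 0), sad_step (c, d, 0));
      return 0;
    }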
*/ + if (i == 0 && code == COND_EXPR) + continue; + + stmt_vec_info def_stmt_info; +- is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem, +- &def_stmt_info); +- dt = dts[i]; +- gcc_assert (is_simple_use); +- if (dt == vect_reduction_def +- && ops[i] == reduc_def) +- { +- reduc_index = i; +- continue; +- } +- else if (tem) ++ enum vect_def_type dt; ++ if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem, ++ &def_stmt_info)) + { +- /* To properly compute ncopies we are interested in the widest +- input type in case we're looking at a widening accumulation. */ +- if (!vectype_in +- || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) +- < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) +- vectype_in = tem; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "use not simple.\n"); ++ return false; + } ++ if (i == STMT_VINFO_REDUC_IDX (stmt_info)) ++ continue; + +- if (dt != vect_internal_def +- && dt != vect_external_def +- && dt != vect_constant_def +- && dt != vect_induction_def +- && !(dt == vect_nested_cycle && nested_cycle)) ++ /* There should be only one cycle def in the stmt, the one ++ leading to reduc_def. */ ++ if (VECTORIZABLE_CYCLE_DEF (dt)) + return false; + +- if (dt == vect_nested_cycle +- && ops[i] == reduc_def) +- { +- found_nested_cycle_def = true; +- reduc_index = i; +- } ++ /* To properly compute ncopies we are interested in the widest ++ non-reduction input type in case we're looking at a widening ++ accumulation that we later handle in vect_transform_reduction. */ ++ if (lane_reduc_code_p ++ && tem ++ && (!vectype_in ++ || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) ++ < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))) ++ vectype_in = tem; + +- if (i == 1 && code == COND_EXPR) ++ if (code == COND_EXPR) + { +- /* Record how value of COND_EXPR is defined. */ ++ /* Record how the non-reduction-def value of COND_EXPR is defined. */ + if (dt == vect_constant_def) + { + cond_reduc_dt = dt; +- cond_reduc_val = ops[i]; ++ cond_reduc_val = op; + } + if (dt == vect_induction_def + && def_stmt_info +@@ -6348,93 +6071,35 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + } +- + if (!vectype_in) +- vectype_in = vectype_out; +- +- /* When vectorizing a reduction chain w/o SLP the reduction PHI is not +- directy used in stmt. */ +- if (reduc_index == -1) +- { +- if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "in-order reduction chain without SLP.\n"); +- return false; +- } +- } +- +- if (!(reduc_index == -1 +- || dts[reduc_index] == vect_reduction_def +- || dts[reduc_index] == vect_nested_cycle +- || ((dts[reduc_index] == vect_internal_def +- || dts[reduc_index] == vect_external_def +- || dts[reduc_index] == vect_constant_def +- || dts[reduc_index] == vect_induction_def) +- && nested_cycle && found_nested_cycle_def))) +- { +- /* For pattern recognized stmts, orig_stmt might be a reduction, +- but some helper statements for the pattern might not, or +- might be COND_EXPRs with reduction uses in the condition. */ +- gcc_assert (orig_stmt_info); +- return false; +- } +- +- /* PHIs should not participate in patterns. 
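The COND_EXPR branch above records how the non-reduction operand of the condition is defined because a condition reduction keeps the value from the last iteration whose predicate held, and a constant or induction operand enables the cheaper MIN/MAX-based strategies handled further below. A scalar form of the kind of loop being analysed, for illustration only:

    #include <cstdio>

    /* Scalar shape of a condition reduction: keep the value from the last
       iteration whose predicate held.  When that value is the loop index
       (an integer induction), the vectorizer can implement it as a MAX
       reduction over the indices of the matching lanes.  */
    static int
    last_match_index (const int *a, int n, int key)
    {
      int last = -1;
      for (int i = 0; i < n; i++)
        last = (a[i] == key) ? i : last;  /* COND_EXPR feeding the cycle PHI */
      return last;
    }

    int
    main ()
    {
      int a[8] = { 3, 7, 3, 9, 3, 1, 3, 2 };
      printf ("last index of 3: %d\n", last_match_index (a, 8, 3)); /* 6 */
      return 0;
    }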
*/ +- gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info)); +- enum vect_reduction_type v_reduc_type +- = STMT_VINFO_REDUC_TYPE (reduc_def_info); +- stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info); ++ vectype_in = STMT_VINFO_VECTYPE (phi_info); ++ STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; + +- STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; ++ enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); ++ STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; + /* If we have a condition reduction, see if we can simplify it further. */ + if (v_reduc_type == COND_REDUCTION) + { +- /* TODO: We can't yet handle reduction chains, since we need to treat +- each COND_EXPR in the chain specially, not just the last one. +- E.g. for: +- +- x_1 = PHI +- x_2 = a_2 ? ... : x_1; +- x_3 = a_3 ? ... : x_2; ++ if (slp_node) ++ return false; + +- we're interested in the last element in x_3 for which a_2 || a_3 +- is true, whereas the current reduction chain handling would +- vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3 +- as a reduction operation. */ +- if (reduc_index == -1) ++ /* When the condition uses the reduction value in the condition, fail. */ ++ if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "conditional reduction chains not supported\n"); ++ "condition depends on previous iteration\n"); + return false; + } + +- /* vect_is_simple_reduction ensured that operand 2 is the +- loop-carried operand. */ +- gcc_assert (reduc_index == 2); +- +- /* Loop peeling modifies initial value of reduction PHI, which +- makes the reduction stmt to be transformed different to the +- original stmt analyzed. We need to record reduction code for +- CONST_COND_REDUCTION type reduction at analyzing stage, thus +- it can be used directly at transform stage. */ +- if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR +- || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) +- { +- /* Also set the reduction type to CONST_COND_REDUCTION. 
*/ +- gcc_assert (cond_reduc_dt == vect_constant_def); +- STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; +- } +- else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, +- vectype_in, OPTIMIZE_FOR_SPEED)) ++ if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, ++ vectype_in, OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "optimizing condition reduction with" + " FOLD_EXTRACT_LAST.\n"); +- STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; ++ STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; + } + else if (cond_reduc_dt == vect_induction_def) + { +@@ -6445,6 +6110,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + gcc_assert (TREE_CODE (base) == INTEGER_CST + && TREE_CODE (step) == INTEGER_CST); + cond_reduc_val = NULL_TREE; ++ enum tree_code cond_reduc_op_code = ERROR_MARK; + tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); + if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) + ; +@@ -6477,16 +6143,17 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + dump_printf_loc (MSG_NOTE, vect_location, + "condition expression based on " + "integer induction.\n"); +- STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- = INTEGER_INDUC_COND_REDUCTION; ++ STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; ++ STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) ++ = cond_reduc_val; ++ STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; + } + } + else if (cond_reduc_dt == vect_constant_def) + { + enum vect_def_type cond_initial_dt; +- gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); + tree cond_initial_val +- = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); ++ = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); + + gcc_assert (cond_reduc_val != NULL_TREE); + vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); +@@ -6503,25 +6170,15 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + "condition expression based on " + "compile time constant.\n"); + /* Record reduction code at analysis stage. */ +- STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) ++ STMT_VINFO_REDUC_CODE (reduc_info) + = integer_onep (e) ? MAX_EXPR : MIN_EXPR; +- STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) +- = CONST_COND_REDUCTION; ++ STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; + } + } + } + } + +- if (orig_stmt_info) +- gcc_assert (tmp == orig_stmt_info +- || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info); +- else +- /* We changed STMT to be the first stmt in reduction chain, hence we +- check that in this case the first element in the chain is STMT. 
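For the CONST_COND_REDUCTION case above the selected value is a compile-time constant, so the whole reduction collapses to "the constant if any iteration matched, else the initial value"; comparing the two decides whether MAX_EXPR or MIN_EXPR gets recorded as the reduction code. A standalone sketch of that equivalence, under the assumption that the constant is larger than the initial value:

    #include <algorithm>
    #include <cstdio>

    /* With a constant selected value C and known initial value INIT,
       "r = cond ? C : r" over the whole loop yields C if any iteration
       matched and INIT otherwise.  When C > INIT that is a MAX reduction
       (MIN when C < INIT), which vectorizes without a conditional.  */
    static int
    const_cond_reduction (const bool *cond, int n, int init, int c)
    {
      int r = init;
      for (int i = 0; i < n; i++)
        r = std::max (r, cond[i] ? c : init); /* same as r = cond[i] ? c : r */
      return r;
    }

    int
    main ()
    {
      bool cond[4] = { false, true, false, false };
      printf ("%d\n", const_cond_reduction (cond, 4, 0, 5)); /* prints 5 */
      return 0;
    }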
*/ +- gcc_assert (tmp == stmt_info +- || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info); +- +- if (STMT_VINFO_LIVE_P (reduc_def_info)) ++ if (STMT_VINFO_LIVE_P (phi_info)) + return false; + + if (slp_node) +@@ -6531,102 +6188,13 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + + gcc_assert (ncopies >= 1); + +- vec_mode = TYPE_MODE (vectype_in); + poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); + + if (nested_cycle) + { +- def_bb = gimple_bb (reduc_def_phi); +- def_stmt_loop = def_bb->loop_father; +- def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, +- loop_preheader_edge (def_stmt_loop)); +- stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); +- if (def_arg_stmt_info +- && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) +- == vect_double_reduction_def)) +- double_reduc = true; +- } +- +- vect_reduction_type reduction_type +- = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); +- if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) +- && ncopies > 1) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "multiple types in double reduction or condition " +- "reduction.\n"); +- return false; +- } +- +- if (code == COND_EXPR) +- { +- /* Only call during the analysis stage, otherwise we'll lose +- STMT_VINFO_TYPE. */ +- if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL, +- true, NULL, cost_vec)) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "unsupported condition in reduction\n"); +- return false; +- } +- } +- else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR +- || code == LROTATE_EXPR || code == RROTATE_EXPR) +- { +- /* Only call during the analysis stage, otherwise we'll lose +- STMT_VINFO_TYPE. We only support this for nested cycles +- without double reductions at the moment. */ +- if (!nested_cycle +- || double_reduc +- || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL, +- NULL, cost_vec))) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "unsupported shift or rotation in reduction\n"); +- return false; +- } +- } +- else +- { +- /* 4. Supportable by target? */ +- +- /* 4.1. check support for the operation in the loop */ +- optab = optab_for_tree_code (code, vectype_in, optab_default); +- if (!optab) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "no optab.\n"); +- +- return false; +- } +- +- if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) +- { +- if (dump_enabled_p ()) +- dump_printf (MSG_NOTE, "op not supported by target.\n"); +- +- if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) +- || !vect_worthwhile_without_simd_p (loop_vinfo, code)) +- return false; +- +- if (dump_enabled_p ()) +- dump_printf (MSG_NOTE, "proceeding using word mode.\n"); +- } +- +- /* Worthwhile without SIMD support? */ +- if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) +- && !vect_worthwhile_without_simd_p (loop_vinfo, code)) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "not worthwhile without SIMD support.\n"); +- +- return false; +- } ++ gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) ++ == vect_double_reduction_def); ++ double_reduc = true; + } + + /* 4.2. Check support for the epilog operation. +@@ -6664,38 +6232,55 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + (and also the same tree-code) when generating the epilog code and + when generating the code inside the loop. 
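The nested_cycle handling above asserts that an analysed nested cycle is a double reduction: the inner loop performs a reduction whose result is accumulated again by an outer-loop PHI. In scalar form the shape looks like this; plain C, no GCC types involved:

    #include <cstdio>

    int
    main ()
    {
      int a[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };
      int s = 0;                 /* outer reduction PHI (double reduction def) */
      for (int i = 0; i < 2; i++)
        {
          int s2 = s;            /* inner nested-cycle PHI */
          for (int j = 0; j < 4; j++)
            s2 += a[i][j];       /* inner-loop reduction statement */
          s = s2;                /* loop-closed use feeding the outer cycle */
        }
      printf ("%d\n", s);        /* prints 36 */
      return 0;
    }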
*/ + +- if (orig_stmt_info +- && (reduction_type == TREE_CODE_REDUCTION +- || reduction_type == FOLD_LEFT_REDUCTION)) +- { +- /* This is a reduction pattern: get the vectype from the type of the +- reduction variable, and get the tree-code from orig_stmt. */ +- orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt); +- gcc_assert (vectype_out); +- vec_mode = TYPE_MODE (vectype_out); +- } +- else +- { +- /* Regular reduction: use the same vectype and tree-code as used for +- the vector code inside the loop can be used for the epilog code. */ +- orig_code = code; +- +- if (code == MINUS_EXPR) +- orig_code = PLUS_EXPR; ++ enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info); ++ STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; + +- /* For simple condition reductions, replace with the actual expression +- we want to base our reduction around. */ +- if (reduction_type == CONST_COND_REDUCTION) ++ vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); ++ if (reduction_type == TREE_CODE_REDUCTION) ++ { ++ /* Check whether it's ok to change the order of the computation. ++ Generally, when vectorizing a reduction we change the order of the ++ computation. This may change the behavior of the program in some ++ cases, so we need to check that this is ok. One exception is when ++ vectorizing an outer-loop: the inner-loop is executed sequentially, ++ and therefore vectorizing reductions in the inner-loop during ++ outer-loop vectorization is safe. */ ++ if (needs_fold_left_reduction_p (scalar_type, orig_code)) ++ { ++ /* When vectorizing a reduction chain w/o SLP the reduction PHI ++ is not directy used in stmt. */ ++ if (!only_slp_reduc_chain ++ && reduc_chain_length != 1) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "in-order reduction chain without SLP.\n"); ++ return false; ++ } ++ STMT_VINFO_REDUC_TYPE (reduc_info) ++ = reduction_type = FOLD_LEFT_REDUCTION; ++ } ++ else if (!commutative_tree_code (orig_code) ++ || !associative_tree_code (orig_code)) + { +- orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); +- gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "reduction: not commutative/associative"); ++ return false; + } +- else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) +- orig_code = cond_reduc_op_code; + } + +- reduc_fn = IFN_LAST; ++ if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) ++ && ncopies > 1) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "multiple types in double reduction or condition " ++ "reduction or fold-left reduction.\n"); ++ return false; ++ } + ++ internal_fn reduc_fn = IFN_LAST; + if (reduction_type == TREE_CODE_REDUCTION + || reduction_type == FOLD_LEFT_REDUCTION + || reduction_type == INTEGER_INDUC_COND_REDUCTION +@@ -6740,6 +6325,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + OPTIMIZE_FOR_SPEED)) + reduc_fn = IFN_REDUC_MAX; + } ++ STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; + + if (reduction_type != EXTRACT_LAST_REDUCTION + && (!nested_cycle || double_reduc) +@@ -6757,7 +6343,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + tree neutral_op = NULL_TREE; + if (slp_node) + neutral_op = neutral_op_for_slp_reduction +- (slp_node_instance->reduc_phis, code, ++ (slp_node_instance->reduc_phis, vectype_out, orig_code, + REDUC_GROUP_FIRST_ELEMENT (stmt_info) != 
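needs_fold_left_reduction_p in the hunk above forces an in-order (fold-left) reduction when reassociating the operation could change the result, the normal situation for floating-point addition without -ffast-math. A tree reduction keeps independent partial accumulators and combines them at the end, which is exactly the reassociation the scalar semantics may forbid; a small standalone demonstration:

    #include <cstdio>

    int
    main ()
    {
      float a[4] = { 1.0e8f, -1.0e8f, 1.0f, 1.0f };

      float in_order = 0.0f;            /* what the scalar loop computes */
      for (int i = 0; i < 4; i++)
        in_order += a[i];

      /* A 2-lane tree reduction keeps two partial accumulators, i.e. it
         sums lanes {0,2} and {1,3} first and combines them at the end.  */
      float lane0 = a[0] + a[2];
      float lane1 = a[1] + a[3];
      float reassociated = lane0 + lane1;

      printf ("in-order %g, reassociated %g\n", in_order, reassociated);
      return 0;                         /* prints 2 and 0: the orders differ */
    }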
NULL); + + if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) +@@ -6822,10 +6408,11 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + which each SLP statement has its own initial value and in which + that value needs to be repeated for every instance of the + statement within the initial vector. */ +- unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); ++ unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); + scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); + if (!neutral_op +- && !can_duplicate_and_interleave_p (group_size, elt_mode)) ++ && !can_duplicate_and_interleave_p (loop_vinfo, group_size, ++ elt_mode)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +@@ -6848,26 +6435,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + +- /* In case of widenning multiplication by a constant, we update the type +- of the constant to be the type of the other operand. We check that the +- constant fits the type in the pattern recognition pass. */ +- if (code == DOT_PROD_EXPR +- && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) +- { +- if (TREE_CODE (ops[0]) == INTEGER_CST) +- ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); +- else if (TREE_CODE (ops[1]) == INTEGER_CST) +- ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); +- else +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "invalid types in dot-prod\n"); +- +- return false; +- } +- } +- + if (reduction_type == COND_REDUCTION) + { + widest_int ni; +@@ -6925,26 +6492,68 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + This only works when we see both the reduction PHI and its only consumer + in vectorizable_reduction and there are no intermediate stmts + participating. */ +- stmt_vec_info use_stmt_info; +- tree reduc_phi_result = gimple_phi_result (reduc_def_phi); + if (ncopies > 1 + && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) +- && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result)) +- && vect_stmt_to_vectorize (use_stmt_info) == stmt_info) ++ && reduc_chain_length == 1) ++ single_defuse_cycle = true; ++ ++ if (single_defuse_cycle || lane_reduc_code_p) + { +- single_defuse_cycle = true; +- epilog_copies = 1; ++ gcc_assert (code != COND_EXPR); ++ ++ /* 4. Supportable by target? */ ++ bool ok = true; ++ ++ /* 4.1. check support for the operation in the loop */ ++ optab optab = optab_for_tree_code (code, vectype_in, optab_vector); ++ if (!optab) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "no optab.\n"); ++ ok = false; ++ } ++ ++ machine_mode vec_mode = TYPE_MODE (vectype_in); ++ if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing) ++ { ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "op not supported by target.\n"); ++ if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) ++ || !vect_worthwhile_without_simd_p (loop_vinfo, code)) ++ ok = false; ++ else ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "proceeding using word mode.\n"); ++ } ++ ++ /* Worthwhile without SIMD support? 
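neutral_op_for_slp_reduction, called near the top of this hunk, supplies a value that can pad the extra lanes of the initial vector without affecting the result. The identity elements involved are ordinary algebra; the table below is only an illustration and does not mirror GCC's helper, which handles further codes and can fall back to the initial scalar value where no neutral constant exists:

    #include <climits>
    #include <cstdio>

    enum reduc_op { OP_PLUS, OP_MULT, OP_AND, OP_MIN };

    /* Identity element that leaves the reduction result unchanged when used
       to pad unused lanes of the initial vector.  */
    static int
    neutral_element (reduc_op code)
    {
      switch (code)
        {
        case OP_PLUS: return 0;        /* x + 0 == x        */
        case OP_MULT: return 1;        /* x * 1 == x        */
        case OP_AND:  return -1;       /* x & ~0 == x       */
        case OP_MIN:  return INT_MAX;  /* min (x, MAX) == x */
        }
      return 0;
    }

    int
    main ()
    {
      printf ("%d %d %d %d\n", neutral_element (OP_PLUS),
              neutral_element (OP_MULT), neutral_element (OP_AND),
              neutral_element (OP_MIN));
      return 0;
    }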
*/ ++ if (ok ++ && !VECTOR_MODE_P (TYPE_MODE (vectype_in)) ++ && !vect_worthwhile_without_simd_p (loop_vinfo, code)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not worthwhile without SIMD support.\n"); ++ ok = false; ++ } ++ ++ /* lane-reducing operations have to go through vect_transform_reduction. ++ For the other cases try without the single cycle optimization. */ ++ if (!ok) ++ { ++ if (lane_reduc_code_p) ++ return false; ++ else ++ single_defuse_cycle = false; ++ } + } +- else +- epilog_copies = ncopies; ++ STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; + + /* If the reduction stmt is one of the patterns that have lane + reduction embedded we cannot handle the case of ! single_defuse_cycle. */ +- if ((ncopies > 1 +- && ! single_defuse_cycle) +- && (code == DOT_PROD_EXPR +- || code == WIDEN_SUM_EXPR +- || code == SAD_EXPR)) ++ if ((ncopies > 1 && ! single_defuse_cycle) ++ && lane_reduc_code_p) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +@@ -6958,46 +6567,130 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + else + vec_num = 1; + ++ vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies, ++ cost_vec); ++ if (dump_enabled_p () ++ && reduction_type == FOLD_LEFT_REDUCTION) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "using an in-order (fold-left) reduction.\n"); ++ STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; ++ /* All but single defuse-cycle optimized, lane-reducing and fold-left ++ reductions go through their own vectorizable_* routines. */ ++ if (!single_defuse_cycle ++ && code != DOT_PROD_EXPR ++ && code != WIDEN_SUM_EXPR ++ && code != SAD_EXPR ++ && reduction_type != FOLD_LEFT_REDUCTION) ++ { ++ stmt_vec_info tem ++ = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); ++ if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) ++ { ++ gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); ++ tem = REDUC_GROUP_FIRST_ELEMENT (tem); ++ } ++ STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; ++ STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; ++ } ++ else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) ++ { ++ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); ++ internal_fn cond_fn = get_conditional_internal_fn (code); ++ ++ if (reduction_type != FOLD_LEFT_REDUCTION ++ && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in) ++ && (cond_fn == IFN_LAST ++ || !direct_internal_fn_supported_p (cond_fn, vectype_in, ++ OPTIMIZE_FOR_SPEED))) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "can't use a fully-masked loop because no" ++ " conditional operation is available.\n"); ++ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; ++ } ++ else ++ vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, ++ vectype_in, NULL); ++ } ++ return true; ++} ++ ++/* Transform the definition stmt STMT_INFO of a reduction PHI backedge ++ value. 
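STMT_VINFO_FORCE_SINGLE_CYCLE records the single def-use cycle decision made above: when the reduction PHI's only use is the reduction statement itself, the ncopies unrolled copies can all feed one accumulator instead of carrying ncopies separate PHIs that must be combined in the epilogue. A scalar model of the two shapes:

    #include <cstdio>

    int
    main ()
    {
      int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

      /* Without the optimization: two accumulators (two PHIs), combined
         once more in the epilogue.  */
      int acc0 = 0, acc1 = 0;
      for (int i = 0; i < 8; i += 2)
        {
          acc0 += a[i];
          acc1 += a[i + 1];
        }
      int multi = acc0 + acc1;

      /* Single def-use cycle: both copies chain through one accumulator.  */
      int acc = 0;
      for (int i = 0; i < 8; i += 2)
        {
          acc += a[i];
          acc += a[i + 1];
        }

      printf ("%d %d\n", multi, acc);   /* both print 36 */
      return 0;
    }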
*/ ++ ++bool ++vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ++ stmt_vec_info *vec_stmt, slp_tree slp_node) ++{ ++ tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); ++ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ++ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ++ int i; ++ int ncopies; ++ int j; ++ int vec_num; ++ ++ stmt_vec_info reduc_info = info_for_reduction (stmt_info); ++ gcc_assert (reduc_info->is_reduc_info); ++ ++ if (nested_in_vect_loop_p (loop, stmt_info)) ++ { ++ loop = loop->inner; ++ gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def); ++ } ++ ++ gassign *stmt = as_a (stmt_info->stmt); ++ enum tree_code code = gimple_assign_rhs_code (stmt); ++ int op_type = TREE_CODE_LENGTH (code); ++ ++ /* Flatten RHS. */ ++ tree ops[3]; ++ switch (get_gimple_rhs_class (code)) ++ { ++ case GIMPLE_TERNARY_RHS: ++ ops[2] = gimple_assign_rhs3 (stmt); ++ /* Fall thru. */ ++ case GIMPLE_BINARY_RHS: ++ ops[0] = gimple_assign_rhs1 (stmt); ++ ops[1] = gimple_assign_rhs2 (stmt); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ /* All uses but the last are expected to be defined in the loop. ++ The last use is the reduction variable. In case of nested cycle this ++ assumption is not true: we use reduc_index to record the index of the ++ reduction variable. */ ++ stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); ++ gphi *reduc_def_phi = as_a (phi_info->stmt); ++ int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); ++ tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); ++ ++ if (slp_node) ++ { ++ ncopies = 1; ++ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++ } ++ else ++ { ++ ncopies = vect_get_num_copies (loop_vinfo, vectype_in); ++ vec_num = 1; ++ } ++ + internal_fn cond_fn = get_conditional_internal_fn (code); + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); +- +- if (!vec_stmt) /* transformation not required. */ +- { +- vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec); +- if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) +- { +- if (reduction_type != FOLD_LEFT_REDUCTION +- && (cond_fn == IFN_LAST +- || !direct_internal_fn_supported_p (cond_fn, vectype_in, +- OPTIMIZE_FOR_SPEED))) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "can't use a fully-masked loop because no" +- " conditional operation is available.\n"); +- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; +- } +- else if (reduc_index == -1) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "can't use a fully-masked loop for chained" +- " reductions.\n"); +- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; +- } +- else +- vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, +- vectype_in); +- } +- if (dump_enabled_p () +- && reduction_type == FOLD_LEFT_REDUCTION) +- dump_printf_loc (MSG_NOTE, vect_location, +- "using an in-order (fold-left) reduction.\n"); +- STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; +- return true; +- } ++ bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); + + /* Transform. 
*/ ++ stmt_vec_info new_stmt_info = NULL; ++ stmt_vec_info prev_stmt_info; ++ tree new_temp = NULL_TREE; ++ auto_vec vec_oprnds0; ++ auto_vec vec_oprnds1; ++ auto_vec vec_oprnds2; ++ tree def0; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); +@@ -7008,23 +6701,26 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + + bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + ++ vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); + if (reduction_type == FOLD_LEFT_REDUCTION) +- return vectorize_fold_left_reduction +- (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, +- reduc_fn, ops, vectype_in, reduc_index, masks); +- +- if (reduction_type == EXTRACT_LAST_REDUCTION) + { +- gcc_assert (!slp_node); +- return vectorizable_condition (stmt_info, gsi, vec_stmt, +- true, NULL, NULL); ++ internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); ++ return vectorize_fold_left_reduction ++ (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, ++ reduc_fn, ops, vectype_in, reduc_index, masks); + } + ++ bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); ++ gcc_assert (single_defuse_cycle ++ || code == DOT_PROD_EXPR ++ || code == WIDEN_SUM_EXPR ++ || code == SAD_EXPR); ++ + /* Create the destination vector */ +- vec_dest = vect_create_destination_var (scalar_dest, vectype_out); ++ tree scalar_dest = gimple_assign_lhs (stmt); ++ tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); + + prev_stmt_info = NULL; +- prev_phi_info = NULL; + if (!slp_node) + { + vec_oprnds0.create (1); +@@ -7033,32 +6729,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vec_oprnds2.create (1); + } + +- phis.create (vec_num); +- vect_defs.create (vec_num); +- if (!slp_node) +- vect_defs.quick_push (NULL_TREE); +- +- if (slp_node) +- phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); +- else +- phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info)); +- + for (j = 0; j < ncopies; j++) + { +- if (code == COND_EXPR) +- { +- gcc_assert (!slp_node); +- vectorizable_condition (stmt_info, gsi, vec_stmt, +- true, NULL, NULL); +- break; +- } +- if (code == LSHIFT_EXPR +- || code == RSHIFT_EXPR) +- { +- vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL); +- break; +- } +- + /* Handle uses. */ + if (j == 0) + { +@@ -7066,16 +6738,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + { + /* Get vec defs for all the operands except the reduction index, + ensuring the ordering of the ops in the vector is kept. */ +- auto_vec slp_ops; + auto_vec, 3> vec_defs; +- +- slp_ops.quick_push (ops[0]); +- slp_ops.quick_push (ops[1]); +- if (op_type == ternary_op) +- slp_ops.quick_push (ops[2]); +- +- vect_get_slp_defs (slp_ops, slp_node, &vec_defs); +- ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds0.safe_splice (vec_defs[0]); + vec_defs[0].release (); + vec_oprnds1.safe_splice (vec_defs[1]); +@@ -7130,7 +6794,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) + { + tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; +- if (masked_loop_p) ++ if (masked_loop_p && !mask_by_cond_expr) + { + /* Make sure that the reduction accumulator is vop[0]. 
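The mask_by_cond_expr path used in this transform (use_mask_by_cond_expr_p and build_vect_cond_expr) neutralises the inactive lanes with a VEC_COND_EXPR so that a plain, unmasked vector operation can be used inside a fully-masked loop. A per-lane scalar model, assuming a PLUS reduction whose neutral value is 0:

    #include <cstdio>

    /* Per-lane view: inactive lanes are replaced by the operation's neutral
       value, after which the ordinary (unmasked) operation is correct.  */
    static int
    masked_add_lane (int acc, int x, bool lane_active)
    {
      int neutralised = lane_active ? x : 0;  /* VEC_COND_EXPR per lane */
      return acc + neutralised;               /* plain vector add       */
    }

    int
    main ()
    {
      int acc = 10;
      acc = masked_add_lane (acc, 5, true);
      acc = masked_add_lane (acc, 7, false);  /* inactive: contributes nothing */
      printf ("%d\n", acc);                   /* prints 15 */
      return 0;
    }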
*/ + if (reduc_index == 1) +@@ -7154,6 +6818,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (op_type == ternary_op) + vop[2] = vec_oprnds2[i]; + ++ if (masked_loop_p && mask_by_cond_expr) ++ { ++ tree mask = vect_get_loop_mask (gsi, masks, ++ vec_num * ncopies, ++ vectype_in, i * ncopies + j); ++ build_vect_cond_expr (code, vop, mask, gsi); ++ } ++ + gassign *new_stmt = gimple_build_assign (vec_dest, code, + vop[0], vop[1], vop[2]); + new_temp = make_ssa_name (vec_dest, new_stmt); +@@ -7163,15 +6835,10 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + + if (slp_node) +- { +- SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); +- vect_defs.quick_push (new_temp); +- } +- else +- vect_defs[0] = new_temp; ++ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); + } + +- if (slp_node) ++ if (slp_node || single_defuse_cycle) + continue; + + if (j == 0) +@@ -7182,20 +6849,244 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + prev_stmt_info = new_stmt_info; + } + +- /* Finalize the reduction-phi (set its arguments) and create the +- epilog reduction code. */ +- if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) +- vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt); ++ if (single_defuse_cycle && !slp_node) ++ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; ++ ++ return true; ++} ++ ++/* Transform phase of a cycle PHI. */ ++ ++bool ++vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt, ++ slp_tree slp_node, slp_instance slp_node_instance) ++{ ++ tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); ++ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ++ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ++ int i; ++ int ncopies; ++ stmt_vec_info prev_phi_info; ++ int j; ++ bool nested_cycle = false; ++ int vec_num; ++ ++ if (nested_in_vect_loop_p (loop, stmt_info)) ++ { ++ loop = loop->inner; ++ nested_cycle = true; ++ } ++ ++ stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); ++ reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); ++ stmt_vec_info reduc_info = info_for_reduction (stmt_info); ++ gcc_assert (reduc_info->is_reduc_info); ++ ++ if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION ++ || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION) ++ /* Leave the scalar phi in place. */ ++ return true; ++ ++ tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); ++ /* For a nested cycle we do not fill the above. */ ++ if (!vectype_in) ++ vectype_in = STMT_VINFO_VECTYPE (stmt_info); ++ gcc_assert (vectype_in); ++ ++ if (slp_node) ++ { ++ /* The size vect_schedule_slp_instance computes is off for us. */ ++ vec_num = vect_get_num_vectors ++ (LOOP_VINFO_VECT_FACTOR (loop_vinfo) ++ * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in); ++ ncopies = 1; ++ } ++ else ++ { ++ vec_num = 1; ++ ncopies = vect_get_num_copies (loop_vinfo, vectype_in); ++ } ++ ++ /* Check whether we should use a single PHI node and accumulate ++ vectors to one before the backedge. */ ++ if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info)) ++ ncopies = 1; ++ ++ /* Create the destination vector */ ++ gphi *phi = as_a (stmt_info->stmt); ++ tree vec_dest = vect_create_destination_var (gimple_phi_result (phi), ++ vectype_out); ++ ++ /* Get the loop-entry arguments. 
*/ ++ tree vec_initial_def; ++ auto_vec vec_initial_defs; ++ if (slp_node) ++ { ++ vec_initial_defs.reserve (vec_num); ++ gcc_assert (slp_node == slp_node_instance->reduc_phis); ++ stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); ++ tree neutral_op ++ = neutral_op_for_slp_reduction (slp_node, vectype_out, ++ STMT_VINFO_REDUC_CODE (reduc_info), ++ first != NULL); ++ get_initial_defs_for_reduction (slp_node_instance->reduc_phis, ++ &vec_initial_defs, vec_num, ++ first != NULL, neutral_op); ++ } ++ else ++ { ++ /* Get at the scalar def before the loop, that defines the initial ++ value of the reduction variable. */ ++ tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi, ++ loop_preheader_edge (loop)); ++ /* Optimize: if initial_def is for REDUC_MAX smaller than the base ++ and we can't use zero for induc_val, use initial_def. Similarly ++ for REDUC_MIN and initial_def larger than the base. */ ++ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) ++ { ++ tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); ++ if (TREE_CODE (initial_def) == INTEGER_CST ++ && !integer_zerop (induc_val) ++ && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR ++ && tree_int_cst_lt (initial_def, induc_val)) ++ || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR ++ && tree_int_cst_lt (induc_val, initial_def)))) ++ { ++ induc_val = initial_def; ++ /* Communicate we used the initial_def to epilouge ++ generation. */ ++ STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; ++ } ++ vec_initial_def = build_vector_from_val (vectype_out, induc_val); ++ } ++ else if (nested_cycle) ++ { ++ /* Do not use an adjustment def as that case is not supported ++ correctly if ncopies is not one. */ ++ vec_initial_def = vect_get_vec_def_for_operand (initial_def, ++ reduc_stmt_info); ++ } ++ else ++ { ++ tree adjustment_def = NULL_TREE; ++ tree *adjustment_defp = &adjustment_def; ++ enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) ++ adjustment_defp = NULL; ++ vec_initial_def ++ = get_initial_def_for_reduction (reduc_stmt_info, code, ++ initial_def, adjustment_defp); ++ STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def; ++ } ++ vec_initial_defs.create (1); ++ vec_initial_defs.quick_push (vec_initial_def); ++ } ++ ++ /* Generate the reduction PHIs upfront. */ ++ prev_phi_info = NULL; ++ for (i = 0; i < vec_num; i++) ++ { ++ tree vec_init_def = vec_initial_defs[i]; ++ for (j = 0; j < ncopies; j++) ++ { ++ /* Create the reduction-phi that defines the reduction ++ operand. */ ++ gphi *new_phi = create_phi_node (vec_dest, loop->header); ++ stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); ++ ++ /* Set the loop-entry arg of the reduction-phi. */ ++ if (j != 0 && nested_cycle) ++ vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo, ++ vec_init_def); ++ add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop), ++ UNKNOWN_LOCATION); ++ ++ /* The loop-latch arg is set in epilogue processing. */ ++ ++ if (slp_node) ++ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); ++ else ++ { ++ if (j == 0) ++ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info; ++ else ++ STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; ++ prev_phi_info = new_phi_info; ++ } ++ } ++ } ++ ++ return true; ++} ++ ++/* Vectorizes LC PHIs. 
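STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT, set above, implements the usual trick of starting the vector accumulator from the neutral value and folding the scalar initial value in only once, after the final horizontal reduction. A standalone illustration for a sum:

    #include <cstdio>

    int
    main ()
    {
      int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      int init = 100;                  /* scalar initial value of the PHI */

      int vacc[4] = { 0, 0, 0, 0 };    /* vector accumulator starts neutral */
      for (int i = 0; i < 8; i += 4)
        for (int l = 0; l < 4; l++)
          vacc[l] += a[i + l];

      int sum = vacc[0] + vacc[1] + vacc[2] + vacc[3]; /* horizontal reduction */
      sum += init;                     /* apply the adjustment exactly once */
      printf ("%d\n", sum);            /* prints 136 */
      return 0;
    }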
*/ ++ ++bool ++vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt, ++ slp_tree slp_node) ++{ ++ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); ++ if (!loop_vinfo ++ || !is_a (stmt_info->stmt) ++ || gimple_phi_num_args (stmt_info->stmt) != 1) ++ return false; ++ ++ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def ++ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) ++ return false; ++ ++ if (!vec_stmt) /* transformation not required. */ ++ { ++ STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type; ++ return true; ++ } + +- vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi, +- epilog_copies, reduc_fn, phis, +- double_reduc, slp_node, slp_node_instance, +- cond_reduc_val, cond_reduc_op_code, +- neutral_op); ++ tree vectype = STMT_VINFO_VECTYPE (stmt_info); ++ tree scalar_dest = gimple_phi_result (stmt_info->stmt); ++ basic_block bb = gimple_bb (stmt_info->stmt); ++ edge e = single_pred_edge (bb); ++ tree vec_dest = vect_create_destination_var (scalar_dest, vectype); ++ vec vec_oprnds = vNULL; ++ vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE, ++ stmt_info, &vec_oprnds, NULL, slp_node); ++ if (slp_node) ++ { ++ unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++ gcc_assert (vec_oprnds.length () == vec_num); ++ for (unsigned i = 0; i < vec_num; i++) ++ { ++ /* Create the vectorized LC PHI node. */ ++ gphi *new_phi = create_phi_node (vec_dest, bb); ++ add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION); ++ stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); ++ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); ++ } ++ } ++ else ++ { ++ unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype); ++ stmt_vec_info prev_phi_info = NULL; ++ for (unsigned i = 0; i < ncopies; i++) ++ { ++ if (i != 0) ++ vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL); ++ /* Create the vectorized LC PHI node. */ ++ gphi *new_phi = create_phi_node (vec_dest, bb); ++ add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION); ++ stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); ++ if (i == 0) ++ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info; ++ else ++ STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; ++ prev_phi_info = new_phi_info; ++ } ++ } ++ vec_oprnds.release (); + + return true; + } + ++ + /* Function vect_min_worthwhile_factor. + + For a loop where we could vectorize the operation indicated by CODE, +@@ -7789,8 +7680,8 @@ vectorizable_induction (stmt_vec_info stmt_info, + bool + vectorizable_live_operation (stmt_vec_info stmt_info, + gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, +- slp_tree slp_node, int slp_index, +- stmt_vec_info *vec_stmt, ++ slp_tree slp_node, slp_instance slp_node_instance, ++ int slp_index, stmt_vec_info *vec_stmt, + stmt_vector_for_cost *) + { + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); +@@ -7807,8 +7698,33 @@ vectorizable_live_operation (stmt_vec_info stmt_info, + + gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); + +- if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) +- return false; ++ /* If a stmt of a reduction is live, vectorize it via ++ vect_create_epilog_for_reduction. vectorizable_reduction assessed ++ validity so just trigger the transform here. */ ++ if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))) ++ { ++ if (!vec_stmt) ++ return true; ++ if (slp_node) ++ { ++ /* For reduction chains the meta-info is attached to ++ the group leader. 
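For live scalar uses after the loop that are not reductions, vectorizable_live_operation ends up extracting the lane of the last vector value that corresponds to the final scalar iteration; the reduction case added above instead routes through vect_create_epilog_for_reduction. A rough conceptual sketch of the lane extraction; the index arithmetic here is illustrative, not the exact code:

    #include <cstdio>

    int
    main ()
    {
      int v[4] = { 11, 22, 33, 44 };  /* last vector value computed in the loop */
      int vf = 4;                     /* vectorization factor */
      int niters = 12;                /* scalar iteration count */
      int last_lane = (niters - 1) % vf;          /* lane of the final value */
      printf ("live value: %d\n", v[last_lane]);  /* prints 44 */
      return 0;
    }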
*/ ++ if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) ++ stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); ++ /* For SLP reductions we vectorize the epilogue for ++ all involved stmts together. */ ++ else if (slp_index != 0) ++ return true; ++ } ++ stmt_vec_info reduc_info = info_for_reduction (stmt_info); ++ gcc_assert (reduc_info->is_reduc_info); ++ if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION ++ || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION) ++ return true; ++ vect_create_epilog_for_reduction (stmt_info, slp_node, ++ slp_node_instance); ++ return true; ++ } + + /* FORNOW. CHECKME. */ + if (nested_in_vect_loop_p (loop, stmt_info)) +@@ -7892,7 +7808,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info, + gcc_assert (ncopies == 1 && !slp_node); + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), +- 1, vectype); ++ 1, vectype, NULL); + } + } + return true; +@@ -8071,31 +7987,34 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo) + return false; + } + +-/* Return a mask type with half the number of elements as TYPE. */ ++/* Return a mask type with half the number of elements as OLD_TYPE, ++ given that it should have mode NEW_MODE. */ + + tree +-vect_halve_mask_nunits (tree type) ++vect_halve_mask_nunits (tree old_type, machine_mode new_mode) + { +- poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); +- return build_truth_vector_type (nunits, current_vector_size); ++ poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2); ++ return build_truth_vector_type_for_mode (nunits, new_mode); + } + +-/* Return a mask type with twice as many elements as TYPE. */ ++/* Return a mask type with twice as many elements as OLD_TYPE, ++ given that it should have mode NEW_MODE. */ + + tree +-vect_double_mask_nunits (tree type) ++vect_double_mask_nunits (tree old_type, machine_mode new_mode) + { +- poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; +- return build_truth_vector_type (nunits, current_vector_size); ++ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2; ++ return build_truth_vector_type_for_mode (nunits, new_mode); + } + + /* Record that a fully-masked version of LOOP_VINFO would need MASKS to + contain a sequence of NVECTORS masks that each control a vector of type +- VECTYPE. */ ++ VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND ++ these vector masks with the vector version of SCALAR_MASK. 
*/ + + void + vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, +- unsigned int nvectors, tree vectype) ++ unsigned int nvectors, tree vectype, tree scalar_mask) + { + gcc_assert (nvectors != 0); + if (masks->length () < nvectors) +@@ -8106,10 +8025,17 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, + unsigned int nscalars_per_iter + = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); ++ ++ if (scalar_mask) ++ { ++ scalar_cond_masked_key cond (scalar_mask, nvectors); ++ loop_vinfo->scalar_cond_masked_set.add (cond); ++ } ++ + if (rgm->max_nscalars_per_iter < nscalars_per_iter) + { + rgm->max_nscalars_per_iter = nscalars_per_iter; +- rgm->mask_type = build_same_sized_truth_vector_type (vectype); ++ rgm->mask_type = truth_type_for (vectype); + } + } + +@@ -8154,7 +8080,7 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, + gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), + TYPE_VECTOR_SUBPARTS (vectype))); + gimple_seq seq = NULL; +- mask_type = build_same_sized_truth_vector_type (vectype); ++ mask_type = truth_type_for (vectype); + mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); + if (seq) + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); +@@ -8242,6 +8168,186 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, + *seen_store = stmt_info; + } + ++/* Helper function to pass to simplify_replace_tree to enable replacing tree's ++ in the hash_map with its corresponding values. */ ++ ++static tree ++find_in_mapping (tree t, void *context) ++{ ++ hash_map* mapping = (hash_map*) context; ++ ++ tree *value = mapping->get (t); ++ return value ? *value : t; ++} ++ ++/* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the ++ original loop that has now been vectorized. ++ ++ The inits of the data_references need to be advanced with the number of ++ iterations of the main loop. This has been computed in vect_do_peeling and ++ is stored in parameter ADVANCE. We first restore the data_references ++ initial offset with the values recored in ORIG_DRS_INIT. ++ ++ Since the loop_vec_info of this EPILOGUE was constructed for the original ++ loop, its stmt_vec_infos all point to the original statements. These need ++ to be updated to point to their corresponding copies as well as the SSA_NAMES ++ in their PATTERN_DEF_SEQs and RELATED_STMTs. ++ ++ The data_reference's connections also need to be updated. Their ++ corresponding dr_vec_info need to be reconnected to the EPILOGUE's ++ stmt_vec_infos, their statements need to point to their corresponding copy, ++ if they are gather loads or scatter stores then their reference needs to be ++ updated to point to its corresponding copy and finally we set ++ 'base_misaligned' to false as we have already peeled for alignment in the ++ prologue of the main loop. */ ++ ++static void ++update_epilogue_loop_vinfo (class loop *epilogue, tree advance, ++ drs_init_vec &orig_drs_init) ++{ ++ loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue); ++ auto_vec stmt_worklist; ++ hash_map mapping; ++ gimple *orig_stmt, *new_stmt; ++ gimple_stmt_iterator epilogue_gsi; ++ gphi_iterator epilogue_phi_gsi; ++ stmt_vec_info stmt_vinfo = NULL, related_vinfo; ++ basic_block *epilogue_bbs = get_loop_body (epilogue); ++ ++ LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; ++ ++ /* Restore original data_reference's offset, before the previous loop and its ++ prologue. 
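The vect_record_loop_mask hunk above keys each rgroup by how many scalars per iteration its masks control, computed as exact_div (nvectors * nunits, VF). A tiny worked example of that arithmetic:

    #include <cstdio>

    int
    main ()
    {
      unsigned vf = 8;        /* vectorization factor: scalars per iteration */
      unsigned nunits = 4;    /* lanes per vector of this operation's type   */
      unsigned nvectors = 4;  /* vector statements needed per iteration      */
      unsigned nscalars_per_iter = nvectors * nunits / vf;  /* exact_div -> 2 */
      printf ("this rgroup's masks control %u scalars per iteration\n",
              nscalars_per_iter);
      return 0;
    }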
*/ ++ std::pair *dr_init; ++ unsigned i; ++ for (i = 0; orig_drs_init.iterate (i, &dr_init); i++) ++ DR_OFFSET (dr_init->first) = dr_init->second; ++ ++ /* Advance data_reference's with the number of iterations of the previous ++ loop and its prologue. */ ++ vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); ++ ++ ++ /* The EPILOGUE loop is a copy of the original loop so they share the same ++ gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to ++ point to the copied statements. We also create a mapping of all LHS' in ++ the original loop and all the LHS' in the EPILOGUE and create worklists to ++ update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */ ++ for (unsigned i = 0; i < epilogue->num_nodes; ++i) ++ { ++ for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); ++ !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi)) ++ { ++ new_stmt = epilogue_phi_gsi.phi (); ++ ++ gcc_assert (gimple_uid (new_stmt) > 0); ++ stmt_vinfo ++ = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; ++ ++ orig_stmt = STMT_VINFO_STMT (stmt_vinfo); ++ STMT_VINFO_STMT (stmt_vinfo) = new_stmt; ++ ++ mapping.put (gimple_phi_result (orig_stmt), ++ gimple_phi_result (new_stmt)); ++ /* PHI nodes can not have patterns or related statements. */ ++ gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL ++ && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); ++ } ++ ++ for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]); ++ !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi)) ++ { ++ new_stmt = gsi_stmt (epilogue_gsi); ++ ++ gcc_assert (gimple_uid (new_stmt) > 0); ++ stmt_vinfo ++ = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; ++ ++ orig_stmt = STMT_VINFO_STMT (stmt_vinfo); ++ STMT_VINFO_STMT (stmt_vinfo) = new_stmt; ++ ++ if (tree old_lhs = gimple_get_lhs (orig_stmt)) ++ mapping.put (old_lhs, gimple_get_lhs (new_stmt)); ++ ++ if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) ++ { ++ gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); ++ for (gimple_stmt_iterator gsi = gsi_start (seq); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ stmt_worklist.safe_push (gsi_stmt (gsi)); ++ } ++ ++ related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); ++ if (related_vinfo != NULL && related_vinfo != stmt_vinfo) ++ { ++ gimple *stmt = STMT_VINFO_STMT (related_vinfo); ++ stmt_worklist.safe_push (stmt); ++ /* Set BB such that the assert in ++ 'get_initial_def_for_reduction' is able to determine that ++ the BB of the related stmt is inside this loop. */ ++ gimple_set_bb (stmt, ++ gimple_bb (new_stmt)); ++ related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); ++ gcc_assert (related_vinfo == NULL ++ || related_vinfo == stmt_vinfo); ++ } ++ } ++ } ++ ++ /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed ++ using the original main loop and thus need to be updated to refer to the ++ cloned variables used in the epilogue. 
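The original-to-copy LHS mapping built above is later applied through the find_in_mapping callback: every operand is looked up in the hash map and left untouched when it was not cloned. A self-contained model of that lookup-with-fallback, with std::string standing in for tree operands:

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    /* Look T up in the original->copy map, keeping T when it was not cloned;
       this mirrors the fallback behaviour of the callback above.  */
    static std::string
    find_in_mapping (const std::string &t,
                     const std::unordered_map<std::string, std::string> &m)
    {
      auto it = m.find (t);
      return it != m.end () ? it->second : t;
    }

    int
    main ()
    {
      std::unordered_map<std::string, std::string> mapping
        = { { "x_1", "x_7" }, { "y_2", "y_8" } };
      std::vector<std::string> ops = { "x_1", "y_2", "z_3" }; /* z_3 not cloned */
      for (auto &op : ops)
        op = find_in_mapping (op, mapping);
      for (const auto &op : ops)
        printf ("%s ", op.c_str ());
      printf ("\n");  /* prints: x_7 y_8 z_3 */
      return 0;
    }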
*/ ++ for (unsigned i = 0; i < stmt_worklist.length (); ++i) ++ { ++ gimple *stmt = stmt_worklist[i]; ++ tree *new_op; ++ ++ for (unsigned j = 1; j < gimple_num_ops (stmt); ++j) ++ { ++ tree op = gimple_op (stmt, j); ++ if ((new_op = mapping.get(op))) ++ gimple_set_op (stmt, j, *new_op); ++ else ++ { ++ op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, ++ &find_in_mapping, &mapping); ++ gimple_set_op (stmt, j, op); ++ } ++ } ++ } ++ ++ struct data_reference *dr; ++ vec datarefs = epilogue_vinfo->shared->datarefs; ++ FOR_EACH_VEC_ELT (datarefs, i, dr) ++ { ++ orig_stmt = DR_STMT (dr); ++ gcc_assert (gimple_uid (orig_stmt) > 0); ++ stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1]; ++ /* Data references for gather loads and scatter stores do not use the ++ updated offset we set using ADVANCE. Instead we have to make sure the ++ reference in the data references point to the corresponding copy of ++ the original in the epilogue. */ ++ if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)) ++ { ++ DR_REF (dr) ++ = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, ++ &find_in_mapping, &mapping); ++ DR_BASE_ADDRESS (dr) ++ = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, ++ &find_in_mapping, &mapping); ++ } ++ DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); ++ stmt_vinfo->dr_aux.stmt = stmt_vinfo; ++ /* The vector size of the epilogue is smaller than that of the main loop ++ so the alignment is either the same or lower. This means the dr will ++ thus by definition be aligned. */ ++ STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; ++ } ++ ++ epilogue_vinfo->shared->datarefs_copy.release (); ++ epilogue_vinfo->shared->save_datarefs (); ++} ++ + /* Function vect_transform_loop. + + The analysis phase has determined that the loop is vectorizable. +@@ -8279,11 +8385,11 @@ vect_transform_loop (loop_vec_info loop_vinfo) + if (th >= vect_vf_for_cost (loop_vinfo) + && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "Profitability threshold is %d loop iterations.\n", +- th); +- check_profitability = true; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Profitability threshold is %d loop iterations.\n", ++ th); ++ check_profitability = true; + } + + /* Make sure there exists a single-predecessor exit bb. 
Do this before +@@ -8301,18 +8407,8 @@ vect_transform_loop (loop_vec_info loop_vinfo) + + if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + { +- poly_uint64 versioning_threshold +- = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); +- if (check_profitability +- && ordered_p (poly_uint64 (th), versioning_threshold)) +- { +- versioning_threshold = ordered_max (poly_uint64 (th), +- versioning_threshold); +- check_profitability = false; +- } + struct loop *sloop +- = vect_loop_versioning (loop_vinfo, th, check_profitability, +- versioning_threshold); ++ = vect_loop_versioning (loop_vinfo); + sloop->force_vectorize = false; + check_profitability = false; + } +@@ -8337,9 +8433,13 @@ vect_transform_loop (loop_vec_info loop_vinfo) + LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; + tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); + bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); ++ tree advance; ++ drs_init_vec orig_drs_init; ++ + epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, + &step_vector, &niters_vector_mult_vf, th, +- check_profitability, niters_no_overflow); ++ check_profitability, niters_no_overflow, ++ &advance, orig_drs_init); + + if (niters_vector == NULL_TREE) + { +@@ -8413,7 +8513,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) + + if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def +- || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) ++ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def ++ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle ++ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) + && ! PURE_SLP_STMT (stmt_info)) + { + if (dump_enabled_p ()) +@@ -8565,12 +8667,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) + dump_printf (MSG_NOTE, "\n"); + } + else +- { +- dump_printf_loc (MSG_NOTE, vect_location, +- "LOOP EPILOGUE VECTORIZED (VS="); +- dump_dec (MSG_NOTE, current_vector_size); +- dump_printf (MSG_NOTE, ")\n"); +- } ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "LOOP EPILOGUE VECTORIZED (MODE=%s)\n", ++ GET_MODE_NAME (loop_vinfo->vector_mode)); + } + + /* Loops vectorized with a variable factor won't benefit from +@@ -8592,57 +8691,14 @@ vect_transform_loop (loop_vec_info loop_vinfo) + since vectorized loop can have loop-carried dependencies. */ + loop->safelen = 0; + +- /* Don't vectorize epilogue for epilogue. */ +- if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) +- epilogue = NULL; +- +- if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)) +- epilogue = NULL; +- + if (epilogue) + { +- auto_vector_sizes vector_sizes; +- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); +- unsigned int next_size = 0; +- +- /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work +- on niters already ajusted for the iterations of the prologue. 
*/ +- if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) +- && known_eq (vf, lowest_vf)) +- { +- unsigned HOST_WIDE_INT eiters +- = (LOOP_VINFO_INT_NITERS (loop_vinfo) +- - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); +- eiters +- = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); +- epilogue->nb_iterations_upper_bound = eiters - 1; +- epilogue->any_upper_bound = true; +- +- unsigned int ratio; +- while (next_size < vector_sizes.length () +- && !(constant_multiple_p (current_vector_size, +- vector_sizes[next_size], &ratio) +- && eiters >= lowest_vf / ratio)) +- next_size += 1; +- } +- else +- while (next_size < vector_sizes.length () +- && maybe_lt (current_vector_size, vector_sizes[next_size])) +- next_size += 1; +- +- if (next_size == vector_sizes.length ()) +- epilogue = NULL; +- } ++ update_epilogue_loop_vinfo (epilogue, advance, orig_drs_init); + +- if (epilogue) +- { ++ epilogue->simduid = loop->simduid; + epilogue->force_vectorize = loop->force_vectorize; + epilogue->safelen = loop->safelen; + epilogue->dont_vectorize = false; +- +- /* We may need to if-convert epilogue to vectorize it. */ +- if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) +- tree_if_conversion (epilogue); + } + + return epilogue; +diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c +index badf4e7104e..6356ecd692f 100644 +--- a/gcc/tree-vect-patterns.c ++++ b/gcc/tree-vect-patterns.c +@@ -46,6 +46,8 @@ along with GCC; see the file COPYING3. If not see + #include "cgraph.h" + #include "omp-simd-clone.h" + #include "predict.h" ++#include "tree-vector-builder.h" ++#include "vec-perm-indices.h" + + /* Return true if we have a useful VR_RANGE range for VAR, storing it + in *MIN_VALUE and *MAX_VALUE if so. Note the range in the dump files. */ +@@ -185,15 +187,15 @@ vect_get_external_def_edge (vec_info *vinfo, tree var) + is nonnull. */ + + static bool +-vect_supportable_direct_optab_p (tree otype, tree_code code, ++vect_supportable_direct_optab_p (vec_info *vinfo, tree otype, tree_code code, + tree itype, tree *vecotype_out, + tree *vecitype_out = NULL) + { +- tree vecitype = get_vectype_for_scalar_type (itype); ++ tree vecitype = get_vectype_for_scalar_type (vinfo, itype); + if (!vecitype) + return false; + +- tree vecotype = get_vectype_for_scalar_type (otype); ++ tree vecotype = get_vectype_for_scalar_type (vinfo, otype); + if (!vecotype) + return false; + +@@ -632,6 +634,7 @@ static bool + vect_split_statement (stmt_vec_info stmt2_info, tree new_rhs, + gimple *stmt1, tree vectype) + { ++ vec_info *vinfo = stmt2_info->vinfo; + if (is_pattern_stmt_p (stmt2_info)) + { + /* STMT2_INFO is part of a pattern. Get the statement to which +@@ -675,7 +678,7 @@ vect_split_statement (stmt_vec_info stmt2_info, tree new_rhs, + two-statement pattern now. */ + gcc_assert (!STMT_VINFO_RELATED_STMT (stmt2_info)); + tree lhs_type = TREE_TYPE (gimple_get_lhs (stmt2_info->stmt)); +- tree lhs_vectype = get_vectype_for_scalar_type (lhs_type); ++ tree lhs_vectype = get_vectype_for_scalar_type (vinfo, lhs_type); + if (!lhs_vectype) + return false; + +@@ -712,6 +715,8 @@ static tree + vect_convert_input (stmt_vec_info stmt_info, tree type, + vect_unpromoted_value *unprom, tree vectype) + { ++ vec_info *vinfo = stmt_info->vinfo; ++ + /* Check for a no-op conversion. */ + if (types_compatible_p (type, TREE_TYPE (unprom->op))) + return unprom->op; +@@ -749,7 +754,7 @@ vect_convert_input (stmt_vec_info stmt_info, tree type, + unsigned promotion. 
*/ + tree midtype = build_nonstandard_integer_type + (TYPE_PRECISION (type), TYPE_UNSIGNED (unprom->type)); +- tree vec_midtype = get_vectype_for_scalar_type (midtype); ++ tree vec_midtype = get_vectype_for_scalar_type (vinfo, midtype); + if (vec_midtype) + { + input = vect_recog_temp_ssa_var (midtype, NULL); +@@ -830,17 +835,8 @@ vect_convert_output (stmt_vec_info stmt_info, tree type, gimple *pattern_stmt, + /* Return true if STMT_VINFO describes a reduction for which reassociation + is allowed. If STMT_INFO is part of a group, assume that it's part of + a reduction chain and optimistically assume that all statements +- except the last allow reassociation. */ +- +-static bool +-vect_reassociating_reduction_p (stmt_vec_info stmt_vinfo) +-{ +- return (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def +- ? STMT_VINFO_REDUC_TYPE (stmt_vinfo) != FOLD_LEFT_REDUCTION +- : REDUC_GROUP_FIRST_ELEMENT (stmt_vinfo) != NULL); +-} +- +-/* As above, but also require it to have code CODE and to be a reduction ++ except the last allow reassociation. ++ Also require it to have code CODE and to be a reduction + in the outermost loop. When returning true, store the operands in + *OP0_OUT and *OP1_OUT. */ + +@@ -862,11 +858,19 @@ vect_reassociating_reduction_p (stmt_vec_info stmt_info, tree_code code, + if (loop && nested_in_vect_loop_p (loop, stmt_info)) + return false; + +- if (!vect_reassociating_reduction_p (stmt_info)) ++ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) ++ { ++ if (needs_fold_left_reduction_p (TREE_TYPE (gimple_assign_lhs (assign)), ++ code)) ++ return false; ++ } ++ else if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == NULL) + return false; + + *op0_out = gimple_assign_rhs1 (assign); + *op1_out = gimple_assign_rhs2 (assign); ++ if (commutative_tree_code (code) && STMT_VINFO_REDUC_IDX (stmt_info) == 0) ++ std::swap (*op0_out, *op1_out); + return true; + } + +@@ -983,7 +987,7 @@ vect_recog_dot_prod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + vect_pattern_detected ("vect_recog_dot_prod_pattern", last_stmt); + + tree half_vectype; +- if (!vect_supportable_direct_optab_p (type, DOT_PROD_EXPR, half_type, ++ if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type, + type_out, &half_vectype)) + return NULL; + +@@ -1141,7 +1145,7 @@ vect_recog_sad_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + vect_pattern_detected ("vect_recog_sad_pattern", last_stmt); + + tree half_vectype; +- if (!vect_supportable_direct_optab_p (sum_type, SAD_EXPR, half_type, ++ if (!vect_supportable_direct_optab_p (vinfo, sum_type, SAD_EXPR, half_type, + type_out, &half_vectype)) + return NULL; + +@@ -1187,6 +1191,7 @@ vect_recog_widen_op_pattern (stmt_vec_info last_stmt_info, tree *type_out, + tree_code orig_code, tree_code wide_code, + bool shift_p, const char *name) + { ++ vec_info *vinfo = last_stmt_info->vinfo; + gimple *last_stmt = last_stmt_info->stmt; + + vect_unpromoted_value unprom[2]; +@@ -1206,8 +1211,8 @@ vect_recog_widen_op_pattern (stmt_vec_info last_stmt_info, tree *type_out, + TYPE_UNSIGNED (half_type)); + + /* Check target support */ +- tree vectype = get_vectype_for_scalar_type (half_type); +- tree vecitype = get_vectype_for_scalar_type (itype); ++ tree vectype = get_vectype_for_scalar_type (vinfo, half_type); ++ tree vecitype = get_vectype_for_scalar_type (vinfo, itype); + enum tree_code dummy_code; + int dummy_int; + auto_vec dummy_vec; +@@ -1219,7 +1224,7 @@ vect_recog_widen_op_pattern (stmt_vec_info last_stmt_info, tree *type_out, + &dummy_int, 
&dummy_vec)) + return NULL; + +- *type_out = get_vectype_for_scalar_type (type); ++ *type_out = get_vectype_for_scalar_type (vinfo, type); + if (!*type_out) + return NULL; + +@@ -1271,6 +1276,7 @@ vect_recog_widen_mult_pattern (stmt_vec_info last_stmt_info, tree *type_out) + static gimple * + vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { ++ vec_info *vinfo = stmt_vinfo->vinfo; + gimple *last_stmt = stmt_vinfo->stmt; + tree base, exp; + gimple *stmt; +@@ -1339,7 +1345,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + if (node->simd_clones == NULL) + return NULL; + } +- *type_out = get_vectype_for_scalar_type (TREE_TYPE (base)); ++ *type_out = get_vectype_for_scalar_type (vinfo, TREE_TYPE (base)); + if (!*type_out) + return NULL; + tree def = vect_recog_temp_ssa_var (TREE_TYPE (base), NULL); +@@ -1364,7 +1370,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + || (TREE_CODE (exp) == REAL_CST + && real_equal (&TREE_REAL_CST (exp), &dconst2))) + { +- if (!vect_supportable_direct_optab_p (TREE_TYPE (base), MULT_EXPR, ++ if (!vect_supportable_direct_optab_p (vinfo, TREE_TYPE (base), MULT_EXPR, + TREE_TYPE (base), type_out)) + return NULL; + +@@ -1377,7 +1383,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + if (TREE_CODE (exp) == REAL_CST + && real_equal (&TREE_REAL_CST (exp), &dconsthalf)) + { +- *type_out = get_vectype_for_scalar_type (TREE_TYPE (base)); ++ *type_out = get_vectype_for_scalar_type (vinfo, TREE_TYPE (base)); + if (*type_out + && direct_internal_fn_supported_p (IFN_SQRT, *type_out, + OPTIMIZE_FOR_SPEED)) +@@ -1470,8 +1476,8 @@ vect_recog_widen_sum_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + + vect_pattern_detected ("vect_recog_widen_sum_pattern", last_stmt); + +- if (!vect_supportable_direct_optab_p (type, WIDEN_SUM_EXPR, unprom0.type, +- type_out)) ++ if (!vect_supportable_direct_optab_p (vinfo, type, WIDEN_SUM_EXPR, ++ unprom0.type, type_out)) + return NULL; + + var = vect_recog_temp_ssa_var (type, NULL); +@@ -1662,7 +1668,7 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) + + vect_pattern_detected ("vect_recog_over_widening_pattern", last_stmt); + +- *type_out = get_vectype_for_scalar_type (type); ++ *type_out = get_vectype_for_scalar_type (vinfo, type); + if (!*type_out) + return NULL; + +@@ -1683,8 +1689,8 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) + wants to rewrite anyway. If targets have a minimum element size + for some optabs, we should pattern-match smaller ops to larger ops + where beneficial. */ +- tree new_vectype = get_vectype_for_scalar_type (new_type); +- tree op_vectype = get_vectype_for_scalar_type (op_type); ++ tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type); ++ tree op_vectype = get_vectype_for_scalar_type (vinfo, op_type); + if (!new_vectype || !op_vectype) + return NULL; + +@@ -1842,7 +1848,7 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out) + TYPE_UNSIGNED (new_type)); + + /* Check for target support. */ +- tree new_vectype = get_vectype_for_scalar_type (new_type); ++ tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type); + if (!new_vectype + || !direct_internal_fn_supported_p (ifn, new_vectype, + OPTIMIZE_FOR_SPEED)) +@@ -1850,7 +1856,7 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out) + + /* The IR requires a valid vector type for the cast result, even though + it's likely to be discarded. 
*/ +- *type_out = get_vectype_for_scalar_type (type); ++ *type_out = get_vectype_for_scalar_type (vinfo, type); + if (!*type_out) + return NULL; + +@@ -1936,7 +1942,7 @@ vect_recog_cast_forwprop_pattern (stmt_vec_info last_stmt_info, tree *type_out) + the unnecessary widening and narrowing. */ + vect_pattern_detected ("vect_recog_cast_forwprop_pattern", last_stmt); + +- *type_out = get_vectype_for_scalar_type (lhs_type); ++ *type_out = get_vectype_for_scalar_type (vinfo, lhs_type); + if (!*type_out) + return NULL; + +@@ -1996,24 +2002,107 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + enum vect_def_type dt; + optab optab1, optab2; + edge ext_def = NULL; ++ bool bswap16_p = false; + +- if (!is_gimple_assign (last_stmt)) +- return NULL; ++ if (is_gimple_assign (last_stmt)) ++ { ++ rhs_code = gimple_assign_rhs_code (last_stmt); ++ switch (rhs_code) ++ { ++ case LROTATE_EXPR: ++ case RROTATE_EXPR: ++ break; ++ default: ++ return NULL; ++ } + +- rhs_code = gimple_assign_rhs_code (last_stmt); +- switch (rhs_code) ++ lhs = gimple_assign_lhs (last_stmt); ++ oprnd0 = gimple_assign_rhs1 (last_stmt); ++ type = TREE_TYPE (oprnd0); ++ oprnd1 = gimple_assign_rhs2 (last_stmt); ++ } ++ else if (gimple_call_builtin_p (last_stmt, BUILT_IN_BSWAP16)) + { +- case LROTATE_EXPR: +- case RROTATE_EXPR: +- break; +- default: +- return NULL; ++ /* __builtin_bswap16 (x) is another form of x r>> 8. ++ The vectorizer has bswap support, but only if the argument isn't ++ promoted. */ ++ lhs = gimple_call_lhs (last_stmt); ++ oprnd0 = gimple_call_arg (last_stmt, 0); ++ type = TREE_TYPE (oprnd0); ++ if (TYPE_PRECISION (TREE_TYPE (lhs)) != 16 ++ || TYPE_PRECISION (type) <= 16 ++ || TREE_CODE (oprnd0) != SSA_NAME ++ || BITS_PER_UNIT != 8 ++ || !TYPE_UNSIGNED (TREE_TYPE (lhs))) ++ return NULL; ++ ++ stmt_vec_info def_stmt_info; ++ if (!vect_is_simple_use (oprnd0, vinfo, &dt, &def_stmt_info, &def_stmt)) ++ return NULL; ++ ++ if (dt != vect_internal_def) ++ return NULL; ++ ++ if (gimple_assign_cast_p (def_stmt)) ++ { ++ def = gimple_assign_rhs1 (def_stmt); ++ if (INTEGRAL_TYPE_P (TREE_TYPE (def)) ++ && TYPE_PRECISION (TREE_TYPE (def)) == 16) ++ oprnd0 = def; ++ } ++ ++ type = TREE_TYPE (lhs); ++ vectype = get_vectype_for_scalar_type (vinfo, type); ++ if (vectype == NULL_TREE) ++ return NULL; ++ ++ if (tree char_vectype = get_same_sized_vectype (char_type_node, vectype)) ++ { ++ /* The encoding uses one stepped pattern for each byte in the ++ 16-bit word. */ ++ vec_perm_builder elts (TYPE_VECTOR_SUBPARTS (char_vectype), 2, 3); ++ for (unsigned i = 0; i < 3; ++i) ++ for (unsigned j = 0; j < 2; ++j) ++ elts.quick_push ((i + 1) * 2 - j - 1); ++ ++ vec_perm_indices indices (elts, 1, ++ TYPE_VECTOR_SUBPARTS (char_vectype)); ++ if (can_vec_perm_const_p (TYPE_MODE (char_vectype), indices)) ++ { ++ /* vectorizable_bswap can handle the __builtin_bswap16 if we ++ undo the argument promotion. */ ++ if (!useless_type_conversion_p (type, TREE_TYPE (oprnd0))) ++ { ++ def = vect_recog_temp_ssa_var (type, NULL); ++ def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); ++ append_pattern_def_seq (stmt_vinfo, def_stmt); ++ oprnd0 = def; ++ } ++ ++ /* Pattern detected. */ ++ vect_pattern_detected ("vect_recog_rotate_pattern", last_stmt); ++ ++ *type_out = vectype; ++ ++ /* Pattern supported. Create a stmt to be used to replace the ++ pattern, with the unpromoted argument. 
*/ ++ var = vect_recog_temp_ssa_var (type, NULL); ++ pattern_stmt = gimple_build_call (gimple_call_fndecl (last_stmt), ++ 1, oprnd0); ++ gimple_call_set_lhs (pattern_stmt, var); ++ gimple_call_set_fntype (as_a (pattern_stmt), ++ gimple_call_fntype (last_stmt)); ++ return pattern_stmt; ++ } ++ } ++ ++ oprnd1 = build_int_cst (integer_type_node, 8); ++ rhs_code = LROTATE_EXPR; ++ bswap16_p = true; + } ++ else ++ return NULL; + +- lhs = gimple_assign_lhs (last_stmt); +- oprnd0 = gimple_assign_rhs1 (last_stmt); +- type = TREE_TYPE (oprnd0); +- oprnd1 = gimple_assign_rhs2 (last_stmt); + if (TREE_CODE (oprnd0) != SSA_NAME + || TYPE_PRECISION (TREE_TYPE (lhs)) != TYPE_PRECISION (type) + || !INTEGRAL_TYPE_P (type) +@@ -2029,7 +2118,7 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + && dt != vect_external_def) + return NULL; + +- vectype = get_vectype_for_scalar_type (type); ++ vectype = get_vectype_for_scalar_type (vinfo, type); + if (vectype == NULL_TREE) + return NULL; + +@@ -2038,14 +2127,39 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + optab1 = optab_for_tree_code (rhs_code, vectype, optab_vector); + if (optab1 + && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing) +- return NULL; ++ { ++ use_rotate: ++ if (bswap16_p) ++ { ++ if (!useless_type_conversion_p (type, TREE_TYPE (oprnd0))) ++ { ++ def = vect_recog_temp_ssa_var (type, NULL); ++ def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); ++ append_pattern_def_seq (stmt_vinfo, def_stmt); ++ oprnd0 = def; ++ } ++ ++ /* Pattern detected. */ ++ vect_pattern_detected ("vect_recog_rotate_pattern", last_stmt); ++ ++ *type_out = vectype; ++ ++ /* Pattern supported. Create a stmt to be used to replace the ++ pattern. */ ++ var = vect_recog_temp_ssa_var (type, NULL); ++ pattern_stmt = gimple_build_assign (var, LROTATE_EXPR, oprnd0, ++ oprnd1); ++ return pattern_stmt; ++ } ++ return NULL; ++ } + + if (is_a (vinfo) || dt != vect_internal_def) + { + optab2 = optab_for_tree_code (rhs_code, vectype, optab_scalar); + if (optab2 + && optab_handler (optab2, TYPE_MODE (vectype)) != CODE_FOR_nothing) +- return NULL; ++ goto use_rotate; + } + + /* If vector/vector or vector/scalar shifts aren't supported by the target, +@@ -2070,6 +2184,14 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + + *type_out = vectype; + ++ if (bswap16_p && !useless_type_conversion_p (type, TREE_TYPE (oprnd0))) ++ { ++ def = vect_recog_temp_ssa_var (type, NULL); ++ def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); ++ append_pattern_def_seq (stmt_vinfo, def_stmt); ++ oprnd0 = def; ++ } ++ + if (dt == vect_external_def && TREE_CODE (oprnd1) == SSA_NAME) + ext_def = vect_get_external_def_edge (vinfo, oprnd1); + +@@ -2106,7 +2228,7 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + } + else + { +- tree vecstype = get_vectype_for_scalar_type (stype); ++ tree vecstype = get_vectype_for_scalar_type (vinfo, stype); + + if (vecstype == NULL_TREE) + return NULL; +@@ -2235,7 +2357,7 @@ vect_recog_vector_vector_shift_pattern (stmt_vec_info stmt_vinfo, + if (!def_vinfo) + return NULL; + +- *type_out = get_vectype_for_scalar_type (TREE_TYPE (oprnd0)); ++ *type_out = get_vectype_for_scalar_type (vinfo, TREE_TYPE (oprnd0)); + if (*type_out == NULL_TREE) + return NULL; + +@@ -2258,7 +2380,8 @@ vect_recog_vector_vector_shift_pattern (stmt_vec_info stmt_vinfo, + TYPE_PRECISION (TREE_TYPE (oprnd1))); + def = vect_recog_temp_ssa_var (TREE_TYPE (rhs1), NULL); + def_stmt = 
gimple_build_assign (def, BIT_AND_EXPR, rhs1, mask); +- tree vecstype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); ++ tree vecstype = get_vectype_for_scalar_type (vinfo, ++ TREE_TYPE (rhs1)); + append_pattern_def_seq (stmt_vinfo, def_stmt, vecstype); + } + } +@@ -2423,6 +2546,7 @@ static gimple * + vect_synth_mult_by_constant (tree op, tree val, + stmt_vec_info stmt_vinfo) + { ++ vec_info *vinfo = stmt_vinfo->vinfo; + tree itype = TREE_TYPE (op); + machine_mode mode = TYPE_MODE (itype); + struct algorithm alg; +@@ -2441,7 +2565,7 @@ vect_synth_mult_by_constant (tree op, tree val, + + /* Targets that don't support vector shifts but support vector additions + can synthesize shifts that way. */ +- bool synth_shift_p = !vect_supportable_shift (LSHIFT_EXPR, multtype); ++ bool synth_shift_p = !vect_supportable_shift (vinfo, LSHIFT_EXPR, multtype); + + HOST_WIDE_INT hwval = tree_to_shwi (val); + /* Use MAX_COST here as we don't want to limit the sequence on rtx costs. +@@ -2452,7 +2576,7 @@ vect_synth_mult_by_constant (tree op, tree val, + if (!possible) + return NULL; + +- tree vectype = get_vectype_for_scalar_type (multtype); ++ tree vectype = get_vectype_for_scalar_type (vinfo, multtype); + + if (!vectype + || !target_supports_mult_synth_alg (&alg, variant, +@@ -2598,6 +2722,7 @@ vect_synth_mult_by_constant (tree op, tree val, + static gimple * + vect_recog_mult_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { ++ vec_info *vinfo = stmt_vinfo->vinfo; + gimple *last_stmt = stmt_vinfo->stmt; + tree oprnd0, oprnd1, vectype, itype; + gimple *pattern_stmt; +@@ -2618,7 +2743,7 @@ vect_recog_mult_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + || !type_has_mode_precision_p (itype)) + return NULL; + +- vectype = get_vectype_for_scalar_type (itype); ++ vectype = get_vectype_for_scalar_type (vinfo, itype); + if (vectype == NULL_TREE) + return NULL; + +@@ -2686,6 +2811,7 @@ vect_recog_mult_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + static gimple * + vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { ++ vec_info *vinfo = stmt_vinfo->vinfo; + gimple *last_stmt = stmt_vinfo->stmt; + tree oprnd0, oprnd1, vectype, itype, cond; + gimple *pattern_stmt, *def_stmt; +@@ -2718,7 +2844,7 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + return NULL; + + scalar_int_mode itype_mode = SCALAR_INT_TYPE_MODE (itype); +- vectype = get_vectype_for_scalar_type (itype); ++ vectype = get_vectype_for_scalar_type (vinfo, itype); + if (vectype == NULL_TREE) + return NULL; + +@@ -2785,7 +2911,7 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { + tree utype + = build_nonstandard_integer_type (prec, 1); +- tree vecutype = get_vectype_for_scalar_type (utype); ++ tree vecutype = get_vectype_for_scalar_type (vinfo, utype); + tree shift + = build_int_cst (utype, GET_MODE_BITSIZE (itype_mode) + - tree_log2 (oprnd1)); +@@ -3104,6 +3230,7 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + static gimple * + vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { ++ vec_info *vinfo = stmt_vinfo->vinfo; + gimple *last_stmt = stmt_vinfo->stmt; + tree cond_expr, then_clause, else_clause; + tree type, vectype, comp_vectype, itype = NULL_TREE, vecitype; +@@ -3126,7 +3253,7 @@ vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + return NULL; + + comp_scalar_type = TREE_TYPE (TREE_OPERAND (cond_expr, 0)); +- comp_vectype = get_vectype_for_scalar_type (comp_scalar_type); ++ 
comp_vectype = get_vectype_for_scalar_type (vinfo, comp_scalar_type); + if (comp_vectype == NULL_TREE) + return NULL; + +@@ -3174,7 +3301,7 @@ vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + if (GET_MODE_BITSIZE (type_mode) == cmp_mode_size) + return NULL; + +- vectype = get_vectype_for_scalar_type (type); ++ vectype = get_vectype_for_scalar_type (vinfo, type); + if (vectype == NULL_TREE) + return NULL; + +@@ -3189,7 +3316,7 @@ vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + || GET_MODE_BITSIZE (SCALAR_TYPE_MODE (itype)) != cmp_mode_size) + return NULL; + +- vecitype = get_vectype_for_scalar_type (itype); ++ vecitype = get_vectype_for_scalar_type (vinfo, itype); + if (vecitype == NULL_TREE) + return NULL; + +@@ -3283,11 +3410,12 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set &stmts) + if (stmt_could_throw_p (cfun, def_stmt)) + return false; + +- comp_vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); ++ comp_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); + if (comp_vectype == NULL_TREE) + return false; + +- tree mask_type = get_mask_type_for_scalar_type (TREE_TYPE (rhs1)); ++ tree mask_type = get_mask_type_for_scalar_type (vinfo, ++ TREE_TYPE (rhs1)); + if (mask_type + && expand_vec_cmp_expr_p (comp_vectype, mask_type, rhs_code)) + return false; +@@ -3297,7 +3425,7 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set &stmts) + scalar_mode mode = SCALAR_TYPE_MODE (TREE_TYPE (rhs1)); + tree itype + = build_nonstandard_integer_type (GET_MODE_BITSIZE (mode), 1); +- vecitype = get_vectype_for_scalar_type (itype); ++ vecitype = get_vectype_for_scalar_type (vinfo, itype); + if (vecitype == NULL_TREE) + return false; + } +@@ -3326,10 +3454,11 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set &stmts) + static tree + adjust_bool_pattern_cast (tree type, tree var, stmt_vec_info stmt_info) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple *cast_stmt = gimple_build_assign (vect_recog_temp_ssa_var (type, NULL), + NOP_EXPR, var); + append_pattern_def_seq (stmt_info, cast_stmt, +- get_vectype_for_scalar_type (type)); ++ get_vectype_for_scalar_type (vinfo, type)); + return gimple_assign_lhs (cast_stmt); + } + +@@ -3343,6 +3472,7 @@ static void + adjust_bool_pattern (tree var, tree out_type, + stmt_vec_info stmt_info, hash_map &defs) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = SSA_NAME_DEF_STMT (var); + enum tree_code rhs_code, def_rhs_code; + tree itype, cond_expr, rhs1, rhs2, irhs1, irhs2; +@@ -3504,7 +3634,7 @@ adjust_bool_pattern (tree var, tree out_type, + + gimple_set_location (pattern_stmt, loc); + append_pattern_def_seq (stmt_info, pattern_stmt, +- get_vectype_for_scalar_type (itype)); ++ get_vectype_for_scalar_type (vinfo, itype)); + defs.put (var, gimple_assign_lhs (pattern_stmt)); + } + +@@ -3607,14 +3737,14 @@ search_type_for_mask_1 (tree var, vec_info *vinfo, + break; + } + +- comp_vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); ++ comp_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); + if (comp_vectype == NULL_TREE) + { + res = NULL_TREE; + break; + } + +- mask_type = get_mask_type_for_scalar_type (TREE_TYPE (rhs1)); ++ mask_type = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (rhs1)); + if (!mask_type + || !expand_vec_cmp_expr_p (comp_vectype, mask_type, rhs_code)) + { +@@ -3722,7 +3852,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + if (! 
INTEGRAL_TYPE_P (TREE_TYPE (lhs)) + || TYPE_PRECISION (TREE_TYPE (lhs)) == 1) + return NULL; +- vectype = get_vectype_for_scalar_type (TREE_TYPE (lhs)); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); + if (vectype == NULL_TREE) + return NULL; + +@@ -3759,7 +3889,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + + if (!useless_type_conversion_p (type, TREE_TYPE (lhs))) + { +- tree new_vectype = get_vectype_for_scalar_type (type); ++ tree new_vectype = get_vectype_for_scalar_type (vinfo, type); + append_pattern_def_seq (stmt_vinfo, pattern_stmt, new_vectype); + + lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL); +@@ -3775,7 +3905,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + else if (rhs_code == COND_EXPR + && TREE_CODE (var) == SSA_NAME) + { +- vectype = get_vectype_for_scalar_type (TREE_TYPE (lhs)); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); + if (vectype == NULL_TREE) + return NULL; + +@@ -3789,7 +3919,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + tree type + = build_nonstandard_integer_type (prec, + TYPE_UNSIGNED (TREE_TYPE (var))); +- if (get_vectype_for_scalar_type (type) == NULL_TREE) ++ if (get_vectype_for_scalar_type (vinfo, type) == NULL_TREE) + return NULL; + + if (!check_bool_pattern (var, vinfo, bool_stmts)) +@@ -3833,7 +3963,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + + cst0 = build_int_cst (type, 0); + cst1 = build_int_cst (type, 1); +- new_vectype = get_vectype_for_scalar_type (type); ++ new_vectype = get_vectype_for_scalar_type (vinfo, type); + + rhs = vect_recog_temp_ssa_var (type, NULL); + pattern_stmt = gimple_build_assign (rhs, COND_EXPR, var, cst1, cst0); +@@ -3874,7 +4004,7 @@ build_mask_conversion (tree mask, tree vectype, stmt_vec_info stmt_vinfo) + gimple *stmt; + tree masktype, tmp; + +- masktype = build_same_sized_truth_vector_type (vectype); ++ masktype = truth_type_for (vectype); + tmp = vect_recog_temp_ssa_var (TREE_TYPE (masktype), NULL); + stmt = gimple_build_assign (tmp, CONVERT_EXPR, mask); + append_pattern_def_seq (stmt_vinfo, stmt, masktype); +@@ -3934,19 +4064,19 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { + int rhs_index = internal_fn_stored_value_index (ifn); + tree rhs = gimple_call_arg (last_stmt, rhs_index); +- vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs)); ++ vectype1 = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs)); + } + else + { + lhs = gimple_call_lhs (last_stmt); +- vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs)); ++ vectype1 = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); + } + + tree mask_arg = gimple_call_arg (last_stmt, mask_argno); + tree mask_arg_type = search_type_for_mask (mask_arg, vinfo); + if (!mask_arg_type) + return NULL; +- vectype2 = get_mask_type_for_scalar_type (mask_arg_type); ++ vectype2 = get_mask_type_for_scalar_type (vinfo, mask_arg_type); + + if (!vectype1 || !vectype2 + || known_eq (TYPE_VECTOR_SUBPARTS (vectype1), +@@ -3992,7 +4122,7 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + /* Check for cond expression requiring mask conversion. 
*/ + if (rhs_code == COND_EXPR) + { +- vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs)); ++ vectype1 = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); + + if (TREE_CODE (rhs1) == SSA_NAME) + { +@@ -4023,7 +4153,7 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + else + return NULL; + +- vectype2 = get_mask_type_for_scalar_type (rhs1_type); ++ vectype2 = get_mask_type_for_scalar_type (vinfo, rhs1_type); + + if (!vectype1 || !vectype2) + return NULL; +@@ -4058,7 +4188,8 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + tree wide_scalar_type = build_nonstandard_integer_type + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), + TYPE_UNSIGNED (rhs1_type)); +- tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); ++ tree vectype3 = get_vectype_for_scalar_type (vinfo, ++ wide_scalar_type); + if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) + return NULL; + } +@@ -4113,14 +4244,14 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + + if (TYPE_PRECISION (rhs1_type) < TYPE_PRECISION (rhs2_type)) + { +- vectype1 = get_mask_type_for_scalar_type (rhs1_type); ++ vectype1 = get_mask_type_for_scalar_type (vinfo, rhs1_type); + if (!vectype1) + return NULL; + rhs2 = build_mask_conversion (rhs2, vectype1, stmt_vinfo); + } + else + { +- vectype1 = get_mask_type_for_scalar_type (rhs2_type); ++ vectype1 = get_mask_type_for_scalar_type (vinfo, rhs2_type); + if (!vectype1) + return NULL; + rhs1 = build_mask_conversion (rhs1, vectype1, stmt_vinfo); +@@ -4191,7 +4322,7 @@ vect_convert_mask_for_vectype (tree mask, tree vectype, + tree mask_type = search_type_for_mask (mask, vinfo); + if (mask_type) + { +- tree mask_vectype = get_mask_type_for_scalar_type (mask_type); ++ tree mask_vectype = get_mask_type_for_scalar_type (vinfo, mask_type); + if (mask_vectype + && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), + TYPE_VECTOR_SUBPARTS (mask_vectype))) +@@ -4214,10 +4345,11 @@ vect_add_conversion_to_pattern (tree type, tree value, stmt_vec_info stmt_info) + if (useless_type_conversion_p (type, TREE_TYPE (value))) + return value; + ++ vec_info *vinfo = stmt_info->vinfo; + tree new_value = vect_recog_temp_ssa_var (type, NULL); + gassign *conversion = gimple_build_assign (new_value, CONVERT_EXPR, value); + append_pattern_def_seq (stmt_info, conversion, +- get_vectype_for_scalar_type (type)); ++ get_vectype_for_scalar_type (vinfo, type)); + return new_value; + } + +@@ -4253,7 +4385,8 @@ vect_recog_gather_scatter_pattern (stmt_vec_info stmt_info, tree *type_out) + return NULL; + + /* Convert the mask to the right form. 
*/ +- tree gs_vectype = get_vectype_for_scalar_type (gs_info.element_type); ++ tree gs_vectype = get_vectype_for_scalar_type (loop_vinfo, ++ gs_info.element_type); + if (mask) + mask = vect_convert_mask_for_vectype (mask, gs_vectype, stmt_info, + loop_vinfo); +@@ -4731,6 +4864,7 @@ static inline void + vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt, + tree pattern_vectype) + { ++ stmt_vec_info orig_stmt_info_saved = orig_stmt_info; + gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (orig_stmt_info); + + gimple *orig_pattern_stmt = NULL; +@@ -4765,6 +4899,9 @@ vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt, + for (gimple_stmt_iterator si = gsi_start (def_seq); + !gsi_end_p (si); gsi_next (&si)) + { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "extra pattern stmt: %G", gsi_stmt (si)); + stmt_vec_info pattern_stmt_info + = vect_init_pattern_stmt (gsi_stmt (si), + orig_stmt_info, pattern_vectype); +@@ -4790,6 +4927,60 @@ vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt, + } + else + vect_set_pattern_stmt (pattern_stmt, orig_stmt_info, pattern_vectype); ++ ++ /* Transfer reduction path info to the pattern. */ ++ if (STMT_VINFO_REDUC_IDX (orig_stmt_info_saved) != -1) ++ { ++ vec_info *vinfo = orig_stmt_info_saved->vinfo; ++ tree lookfor = gimple_op (orig_stmt_info_saved->stmt, ++ 1 + STMT_VINFO_REDUC_IDX (orig_stmt_info)); ++ /* Search the pattern def sequence and the main pattern stmt. Note ++ we may have inserted all into a containing pattern def sequence ++ so the following is a bit awkward. */ ++ gimple_stmt_iterator si; ++ gimple *s; ++ if (def_seq) ++ { ++ si = gsi_start (def_seq); ++ s = gsi_stmt (si); ++ gsi_next (&si); ++ } ++ else ++ { ++ si = gsi_none (); ++ s = pattern_stmt; ++ } ++ do ++ { ++ bool found = false; ++ for (unsigned i = 1; i < gimple_num_ops (s); ++i) ++ if (gimple_op (s, i) == lookfor) ++ { ++ STMT_VINFO_REDUC_IDX (vinfo->lookup_stmt (s)) = i - 1; ++ lookfor = gimple_get_lhs (s); ++ found = true; ++ break; ++ } ++ if (s == pattern_stmt) ++ { ++ if (!found && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "failed to update reduction index.\n"); ++ break; ++ } ++ if (gsi_end_p (si)) ++ s = pattern_stmt; ++ else ++ { ++ s = gsi_stmt (si); ++ if (s == pattern_stmt) ++ /* Found the end inside a bigger pattern def seq. */ ++ si = gsi_none (); ++ else ++ gsi_next (&si); ++ } ++ } while (1); ++ } + } + + /* Function vect_pattern_recog_1 +diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +index 2abf480270c..0bef35782b5 100644 +--- a/gcc/tree-vect-slp.c ++++ b/gcc/tree-vect-slp.c +@@ -79,6 +79,7 @@ vect_free_slp_tree (slp_tree node, bool final_p) + + SLP_TREE_CHILDREN (node).release (); + SLP_TREE_SCALAR_STMTS (node).release (); ++ SLP_TREE_SCALAR_OPS (node).release (); + SLP_TREE_VEC_STMTS (node).release (); + SLP_TREE_LOAD_PERMUTATION (node).release (); + +@@ -122,6 +123,7 @@ vect_create_new_slp_node (vec scalar_stmts) + + node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (node) = scalar_stmts; ++ SLP_TREE_SCALAR_OPS (node) = vNULL; + SLP_TREE_VEC_STMTS (node).create (0); + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; + SLP_TREE_CHILDREN (node).create (nops); +@@ -138,6 +140,28 @@ vect_create_new_slp_node (vec scalar_stmts) + return node; + } + ++/* Create an SLP node for OPS. 
*/ ++ ++static slp_tree ++vect_create_new_slp_node (vec ops) ++{ ++ slp_tree node; ++ ++ node = XNEW (struct _slp_tree); ++ SLP_TREE_SCALAR_STMTS (node) = vNULL; ++ SLP_TREE_SCALAR_OPS (node) = ops; ++ SLP_TREE_VEC_STMTS (node).create (0); ++ SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; ++ SLP_TREE_CHILDREN (node) = vNULL; ++ SLP_TREE_LOAD_PERMUTATION (node) = vNULL; ++ SLP_TREE_TWO_OPERATORS (node) = false; ++ SLP_TREE_DEF_TYPE (node) = vect_external_def; ++ node->refcnt = 1; ++ node->max_nunits = 1; ++ ++ return node; ++} ++ + + /* This structure is used in creation of an SLP tree. Each instance + corresponds to the same operand in a group of scalar stmts in an SLP +@@ -146,6 +170,8 @@ typedef struct _slp_oprnd_info + { + /* Def-stmts for the operands. */ + vec def_stmts; ++ /* Operands. */ ++ vec ops; + /* Information about the first statement, its vector def-type, type, the + operand itself in case it's constant, and an indication if it's a pattern + stmt. */ +@@ -169,6 +195,7 @@ vect_create_oprnd_info (int nops, int group_size) + { + oprnd_info = XNEW (struct _slp_oprnd_info); + oprnd_info->def_stmts.create (group_size); ++ oprnd_info->ops.create (group_size); + oprnd_info->first_dt = vect_uninitialized_def; + oprnd_info->first_op_type = NULL_TREE; + oprnd_info->any_pattern = false; +@@ -190,6 +217,7 @@ vect_free_oprnd_info (vec &oprnds_info) + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) + { + oprnd_info->def_stmts.release (); ++ oprnd_info->ops.release (); + XDELETE (oprnd_info); + } + +@@ -197,6 +225,19 @@ vect_free_oprnd_info (vec &oprnds_info) + } + + ++/* Return true if STMTS contains a pattern statement. */ ++ ++static bool ++vect_contains_pattern_stmt_p (vec stmts) ++{ ++ stmt_vec_info stmt_info; ++ unsigned int i; ++ FOR_EACH_VEC_ELT (stmts, i, stmt_info) ++ if (is_pattern_stmt_p (stmt_info)) ++ return true; ++ return false; ++} ++ + /* Find the place of the data-ref in STMT_INFO in the interleaving chain + that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part + of the chain. */ +@@ -231,7 +272,8 @@ vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info, + (if nonnull). */ + + bool +-can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode, ++can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count, ++ machine_mode elt_mode, + unsigned int *nvectors_out, + tree *vector_type_out, + tree *permutes) +@@ -243,7 +285,7 @@ can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode, + { + scalar_int_mode int_mode; + poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT; +- if (multiple_p (current_vector_size, elt_bytes, &nelts) ++ if (multiple_p (GET_MODE_SIZE (vinfo->vector_mode), elt_bytes, &nelts) + && int_mode_for_size (elt_bits, 0).exists (&int_mode)) + { + tree int_type = build_nonstandard_integer_type +@@ -322,6 +364,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap, + { + internal_fn ifn = gimple_call_internal_fn (stmt); + commutative_op = first_commutative_argument (ifn); ++ ++ /* Masked load, only look at mask. */ ++ if (ifn == IFN_MASK_LOAD) ++ { ++ number_of_oprnds = 1; ++ /* Mask operand index. */ ++ first_op_idx = 5; ++ } + } + } + else if (gassign *stmt = dyn_cast (stmt_info->stmt)) +@@ -380,6 +430,13 @@ again: + + if (first) + { ++ /* For the swapping logic below force vect_reduction_def ++ for the reduction op in a SLP reduction group. 
*/ ++ if (!STMT_VINFO_DATA_REF (stmt_info) ++ && REDUC_GROUP_FIRST_ELEMENT (stmt_info) ++ && (int)i == STMT_VINFO_REDUC_IDX (stmt_info) ++ && def_stmt_info) ++ dt = vect_reduction_def; + oprnd_info->first_dt = dt; + oprnd_info->first_op_type = TREE_TYPE (oprnd); + } +@@ -389,20 +446,35 @@ again: + the def-stmt/s of the first stmt. Allow different definition + types for reduction chains: the first stmt must be a + vect_reduction_def (a phi node), and the rest +- vect_internal_def. */ ++ end in the reduction chain. */ + tree type = TREE_TYPE (oprnd); + if ((oprnd_info->first_dt != dt + && !(oprnd_info->first_dt == vect_reduction_def +- && dt == vect_internal_def) ++ && !STMT_VINFO_DATA_REF (stmt_info) ++ && REDUC_GROUP_FIRST_ELEMENT (stmt_info) ++ && def_stmt_info ++ && !STMT_VINFO_DATA_REF (def_stmt_info) ++ && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info) ++ == REDUC_GROUP_FIRST_ELEMENT (stmt_info))) + && !((oprnd_info->first_dt == vect_external_def + || oprnd_info->first_dt == vect_constant_def) + && (dt == vect_external_def + || dt == vect_constant_def))) +- || !types_compatible_p (oprnd_info->first_op_type, type)) ++ || !types_compatible_p (oprnd_info->first_op_type, type) ++ || (!STMT_VINFO_DATA_REF (stmt_info) ++ && REDUC_GROUP_FIRST_ELEMENT (stmt_info) ++ && ((!def_stmt_info ++ || STMT_VINFO_DATA_REF (def_stmt_info) ++ || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info) ++ != REDUC_GROUP_FIRST_ELEMENT (stmt_info))) ++ != (oprnd_info->first_dt != vect_reduction_def)))) + { + /* Try swapping operands if we got a mismatch. */ + if (i == commutative_op && !swapped) + { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "trying swapped operands\n"); + swapped = true; + goto again; + } +@@ -415,9 +487,9 @@ again: + } + if ((dt == vect_constant_def + || dt == vect_external_def) +- && !current_vector_size.is_constant () ++ && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () + && (TREE_CODE (type) == BOOLEAN_TYPE +- || !can_duplicate_and_interleave_p (stmts.length (), ++ || !can_duplicate_and_interleave_p (vinfo, stmts.length (), + TYPE_MODE (type)))) + { + if (dump_enabled_p ()) +@@ -431,14 +503,37 @@ again: + /* Check the types of the definitions. */ + switch (dt) + { +- case vect_constant_def: + case vect_external_def: ++ /* Make sure to demote the overall operand to external. */ ++ oprnd_info->first_dt = vect_external_def; ++ /* Fallthru. */ ++ case vect_constant_def: ++ oprnd_info->def_stmts.quick_push (NULL); ++ oprnd_info->ops.quick_push (oprnd); + break; + ++ case vect_internal_def: + case vect_reduction_def: ++ if (oprnd_info->first_dt == vect_reduction_def ++ && !STMT_VINFO_DATA_REF (stmt_info) ++ && REDUC_GROUP_FIRST_ELEMENT (stmt_info) ++ && !STMT_VINFO_DATA_REF (def_stmt_info) ++ && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info) ++ == REDUC_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ /* For a SLP reduction chain we want to duplicate the ++ reduction to each of the chain members. That gets ++ us a sane SLP graph (still the stmts are not 100% ++ correct wrt the initial values). */ ++ gcc_assert (!first); ++ oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]); ++ oprnd_info->ops.quick_push (oprnd_info->ops[0]); ++ break; ++ } ++ /* Fallthru. 
*/ + case vect_induction_def: +- case vect_internal_def: + oprnd_info->def_stmts.quick_push (def_stmt_info); ++ oprnd_info->ops.quick_push (oprnd); + break; + + default: +@@ -468,6 +563,8 @@ again: + + if (first_op_cond) + { ++ /* To get rid of this swapping we have to move the stmt code ++ to the SLP tree as well (and gather it here per stmt). */ + gassign *stmt = as_a (stmt_info->stmt); + tree cond = gimple_assign_rhs1 (stmt); + enum tree_code code = TREE_CODE (cond); +@@ -492,10 +589,8 @@ again: + } + else + { +- unsigned int op = commutative_op + first_op_idx; +- swap_ssa_operands (stmt_info->stmt, +- gimple_op_ptr (stmt_info->stmt, op), +- gimple_op_ptr (stmt_info->stmt, op + 1)); ++ /* Commutative ops need not reflect swapping, ops are in ++ the SLP tree. */ + } + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +@@ -620,7 +715,7 @@ vect_two_operations_perm_ok_p (vec stmts, + is false then this indicates the comparison could not be + carried out or the stmts will never be vectorized by SLP. + +- Note COND_EXPR is possibly ismorphic to another one after swapping its ++ Note COND_EXPR is possibly isomorphic to another one after swapping its + operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to + the first stmt by swapping the two operands of comparison; set SWAP[i] + to 2 if stmt I is isormorphic to the first stmt by inverting the code +@@ -1030,7 +1125,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, + vec stmts, unsigned int group_size, + poly_uint64 *max_nunits, + bool *matches, unsigned *npermutes, unsigned *tree_size, +- unsigned max_tree_size, + scalar_stmts_to_slp_tree_map_t *bst_map); + + static slp_tree +@@ -1038,7 +1132,6 @@ vect_build_slp_tree (vec_info *vinfo, + vec stmts, unsigned int group_size, + poly_uint64 *max_nunits, + bool *matches, unsigned *npermutes, unsigned *tree_size, +- unsigned max_tree_size, + scalar_stmts_to_slp_tree_map_t *bst_map) + { + if (slp_tree *leader = bst_map->get (stmts)) +@@ -1056,8 +1149,7 @@ vect_build_slp_tree (vec_info *vinfo, + poly_uint64 this_max_nunits = 1; + slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, + &this_max_nunits, +- matches, npermutes, tree_size, +- max_tree_size, bst_map); ++ matches, npermutes, tree_size, bst_map); + if (res) + { + res->max_nunits = this_max_nunits; +@@ -1081,7 +1173,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, + vec stmts, unsigned int group_size, + poly_uint64 *max_nunits, + bool *matches, unsigned *npermutes, unsigned *tree_size, +- unsigned max_tree_size, + scalar_stmts_to_slp_tree_map_t *bst_map) + { + unsigned nops, i, this_tree_size = 0; +@@ -1109,7 +1200,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, + if (gphi *stmt = dyn_cast (stmt_info->stmt)) + { + tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); +- tree vectype = get_vectype_for_scalar_type (scalar_type); ++ tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits)) + return NULL; + +@@ -1129,18 +1220,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, + /* Else def types have to match. */ + stmt_vec_info other_info; + FOR_EACH_VEC_ELT (stmts, i, other_info) +- { +- /* But for reduction chains only check on the first stmt. 
*/ +- if (!STMT_VINFO_DATA_REF (other_info) +- && REDUC_GROUP_FIRST_ELEMENT (other_info) +- && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info) +- continue; +- if (STMT_VINFO_DEF_TYPE (other_info) != def_type) +- return NULL; +- } ++ if (STMT_VINFO_DEF_TYPE (other_info) != def_type) ++ return NULL; + } + else + return NULL; ++ (*tree_size)++; + node = vect_create_new_slp_node (stmts); + return node; + } +@@ -1152,13 +1237,23 @@ vect_build_slp_tree_2 (vec_info *vinfo, + &this_max_nunits, matches, &two_operators)) + return NULL; + +- /* If the SLP node is a load, terminate the recursion. */ ++ /* If the SLP node is a load, terminate the recursion unless masked. */ + if (STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) + { +- *max_nunits = this_max_nunits; +- node = vect_create_new_slp_node (stmts); +- return node; ++ if (gcall *stmt = dyn_cast (stmt_info->stmt)) ++ { ++ /* Masked load. */ ++ gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)); ++ nops = 1; ++ } ++ else ++ { ++ *max_nunits = this_max_nunits; ++ (*tree_size)++; ++ node = vect_create_new_slp_node (stmts); ++ return node; ++ } + } + + /* Get at the operands, verifying they are compatible. */ +@@ -1184,9 +1279,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, + + stmt_info = stmts[0]; + +- if (tree_size) +- max_tree_size -= *tree_size; +- + /* Create SLP_TREE nodes for the definition node/s. */ + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) + { +@@ -1194,32 +1286,34 @@ vect_build_slp_tree_2 (vec_info *vinfo, + unsigned old_tree_size = this_tree_size; + unsigned int j; + ++ if (oprnd_info->first_dt == vect_uninitialized_def) ++ { ++ /* COND_EXPR have one too many eventually if the condition ++ is a SSA name. */ ++ gcc_assert (i == 3 && nops == 4); ++ continue; ++ } ++ + if (oprnd_info->first_dt != vect_internal_def + && oprnd_info->first_dt != vect_reduction_def + && oprnd_info->first_dt != vect_induction_def) +- continue; +- +- if (++this_tree_size > max_tree_size) + { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, +- vect_location, +- "Build SLP failed: SLP tree too large\n"); +- FOR_EACH_VEC_ELT (children, j, child) +- vect_free_slp_tree (child, false); +- vect_free_oprnd_info (oprnds_info); +- return NULL; ++ slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); ++ SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; ++ oprnd_info->ops = vNULL; ++ children.safe_push (invnode); ++ continue; + } + + if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, + group_size, &this_max_nunits, + matches, npermutes, +- &this_tree_size, +- max_tree_size, bst_map)) != NULL) ++ &this_tree_size, bst_map)) != NULL) + { + /* If we have all children of child built up from scalars then just + throw that away and build it up this node from scalars. */ +- if (!SLP_TREE_CHILDREN (child).is_empty () ++ if (is_a (vinfo) ++ && !SLP_TREE_CHILDREN (child).is_empty () + /* ??? Rejecting patterns this way doesn't work. We'd have to + do extra work to cancel the pattern so the uses see the + scalar version. 
*/ +@@ -1244,6 +1338,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, + "scalars instead\n"); + oprnd_info->def_stmts = vNULL; + SLP_TREE_DEF_TYPE (child) = vect_external_def; ++ SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops; ++ oprnd_info->ops = vNULL; ++ ++this_tree_size; + children.safe_push (child); + continue; + } +@@ -1273,9 +1370,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Building vector operands from scalars\n"); ++ this_tree_size++; + child = vect_create_new_slp_node (oprnd_info->def_stmts); + SLP_TREE_DEF_TYPE (child) = vect_external_def; ++ SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops; + children.safe_push (child); ++ oprnd_info->ops = vNULL; + oprnd_info->def_stmts = vNULL; + continue; + } +@@ -1355,6 +1455,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, + { + std::swap (oprnds_info[0]->def_stmts[j], + oprnds_info[1]->def_stmts[j]); ++ std::swap (oprnds_info[0]->ops[j], ++ oprnds_info[1]->ops[j]); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "%d ", j); + } +@@ -1365,37 +1467,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, + if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, + group_size, &this_max_nunits, + tem, npermutes, +- &this_tree_size, +- max_tree_size, bst_map)) != NULL) ++ &this_tree_size, bst_map)) != NULL) + { +- /* ... so if successful we can apply the operand swapping +- to the GIMPLE IL. This is necessary because for example +- vect_get_slp_defs uses operand indexes and thus expects +- canonical operand order. This is also necessary even +- if we end up building the operand from scalars as +- we'll continue to process swapped operand two. */ +- for (j = 0; j < group_size; ++j) +- gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false); +- for (j = 0; j < group_size; ++j) +- if (matches[j] == !swap_not_matching) +- { +- gassign *stmt = as_a (stmts[j]->stmt); +- /* Avoid swapping operands twice. */ +- if (gimple_plf (stmt, GF_PLF_1)) +- continue; +- swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt), +- gimple_assign_rhs2_ptr (stmt)); +- gimple_set_plf (stmt, GF_PLF_1, true); +- } +- /* Verify we swap all duplicates or none. */ +- if (flag_checking) +- for (j = 0; j < group_size; ++j) +- gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1) +- == (matches[j] == !swap_not_matching)); +- + /* If we have all children of child built up from scalars then + just throw that away and build it up this node from scalars. */ +- if (!SLP_TREE_CHILDREN (child).is_empty () ++ if (is_a (vinfo) ++ && !SLP_TREE_CHILDREN (child).is_empty () + /* ??? Rejecting patterns this way doesn't work. We'd have + to do extra work to cancel the pattern so the uses see the + scalar version. 
*/ +@@ -1421,6 +1498,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, + "scalars instead\n"); + oprnd_info->def_stmts = vNULL; + SLP_TREE_DEF_TYPE (child) = vect_external_def; ++ SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops; ++ oprnd_info->ops = vNULL; ++ ++this_tree_size; + children.safe_push (child); + continue; + } +@@ -1444,8 +1524,7 @@ fail: + + vect_free_oprnd_info (oprnds_info); + +- if (tree_size) +- *tree_size += this_tree_size; ++ *tree_size += this_tree_size + 1; + *max_nunits = this_max_nunits; + + node = vect_create_new_slp_node (stmts); +@@ -1460,9 +1539,10 @@ static void + vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, + slp_tree node, hash_set &visited) + { +- int i; ++ unsigned i; + stmt_vec_info stmt_info; + slp_tree child; ++ tree op; + + if (visited.add (node)) + return; +@@ -1470,11 +1550,23 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, + dump_metadata_t metadata (dump_kind, loc.get_impl_location ()); + dump_user_location_t user_loc = loc.get_user_location (); + dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n", +- SLP_TREE_DEF_TYPE (node) != vect_internal_def +- ? " (external)" : "", node, ++ SLP_TREE_DEF_TYPE (node) == vect_external_def ++ ? " (external)" ++ : (SLP_TREE_DEF_TYPE (node) == vect_constant_def ++ ? " (constant)" ++ : ""), node, + estimated_poly_value (node->max_nunits)); +- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) +- dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt); ++ if (SLP_TREE_SCALAR_STMTS (node).exists ()) ++ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) ++ dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt); ++ else ++ { ++ dump_printf_loc (metadata, user_loc, "\t{ "); ++ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) ++ dump_printf (metadata, "%T%s ", op, ++ i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? 
"," : ""); ++ dump_printf (metadata, "}\n"); ++ } + if (SLP_TREE_CHILDREN (node).is_empty ()) + return; + dump_printf_loc (metadata, user_loc, "\tchildren"); +@@ -1563,8 +1655,6 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, + vec permutation, + hash_set &visited) + { +- stmt_vec_info stmt_info; +- vec tmp_stmts; + unsigned int i; + slp_tree child; + +@@ -1574,15 +1664,30 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + vect_slp_rearrange_stmts (child, group_size, permutation, visited); + +- gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); +- tmp_stmts.create (group_size); +- tmp_stmts.quick_grow_cleared (group_size); +- +- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) +- tmp_stmts[permutation[i]] = stmt_info; +- +- SLP_TREE_SCALAR_STMTS (node).release (); +- SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; ++ if (SLP_TREE_SCALAR_STMTS (node).exists ()) ++ { ++ gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); ++ vec tmp_stmts; ++ tmp_stmts.create (group_size); ++ tmp_stmts.quick_grow (group_size); ++ stmt_vec_info stmt_info; ++ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) ++ tmp_stmts[permutation[i]] = stmt_info; ++ SLP_TREE_SCALAR_STMTS (node).release (); ++ SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; ++ } ++ if (SLP_TREE_SCALAR_OPS (node).exists ()) ++ { ++ gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ()); ++ vec tmp_ops; ++ tmp_ops.create (group_size); ++ tmp_ops.quick_grow (group_size); ++ tree op; ++ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) ++ tmp_ops[permutation[i]] = op; ++ SLP_TREE_SCALAR_OPS (node).release (); ++ SLP_TREE_SCALAR_OPS (node) = tmp_ops; ++ } + } + + +@@ -1668,9 +1773,10 @@ vect_gather_slp_loads (slp_instance inst, slp_tree node, + + if (SLP_TREE_CHILDREN (node).length () == 0) + { ++ if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) ++ return; + stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; +- if (SLP_TREE_DEF_TYPE (node) == vect_internal_def +- && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ if (STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) + SLP_INSTANCE_LOADS (inst).safe_push (node); + } +@@ -1913,7 +2019,7 @@ vect_analyze_slp_instance (vec_info *vinfo, + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + scalar_type = TREE_TYPE (DR_REF (dr)); +- vectype = get_vectype_for_scalar_type (scalar_type); ++ vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + group_size = DR_GROUP_SIZE (stmt_info); + } + else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) +@@ -1964,7 +2070,8 @@ vect_analyze_slp_instance (vec_info *vinfo, + /* Mark the first element of the reduction chain as reduction to properly + transform the node. In the reduction analysis phase only the last + element of the chain is marked as reduction. 
*/ +- STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def; ++ STMT_VINFO_DEF_TYPE (stmt_info) ++ = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); + STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) + = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); + } +@@ -1982,9 +2089,10 @@ vect_analyze_slp_instance (vec_info *vinfo, + scalar_stmts_to_slp_tree_map_t *bst_map + = new scalar_stmts_to_slp_tree_map_t (); + poly_uint64 max_nunits = nunits; ++ unsigned tree_size = 0; + node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + &max_nunits, matches, &npermutes, +- NULL, max_tree_size, bst_map); ++ &tree_size, bst_map); + /* The map keeps a reference on SLP nodes built, release that. */ + for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); + it != bst_map->end (); ++it) +@@ -1993,6 +2101,34 @@ vect_analyze_slp_instance (vec_info *vinfo, + delete bst_map; + if (node != NULL) + { ++ /* If this is a reduction chain with a conversion in front ++ amend the SLP tree with a node for that. */ ++ if (!dr ++ && REDUC_GROUP_FIRST_ELEMENT (stmt_info) ++ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) ++ { ++ /* Get at the conversion stmt - we know it's the single use ++ of the last stmt of the reduction chain. */ ++ gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt; ++ use_operand_p use_p; ++ gimple *use_stmt; ++ bool r = single_imm_use (gimple_assign_lhs (tem), &use_p, &use_stmt); ++ gcc_assert (r); ++ next_info = vinfo->lookup_stmt (use_stmt); ++ next_info = vect_stmt_to_vectorize (next_info); ++ scalar_stmts = vNULL; ++ scalar_stmts.create (group_size); ++ for (unsigned i = 0; i < group_size; ++i) ++ scalar_stmts.quick_push (next_info); ++ slp_tree conv = vect_create_new_slp_node (scalar_stmts); ++ SLP_TREE_CHILDREN (conv).quick_push (node); ++ node = conv; ++ /* We also have to fake this conversion stmt as SLP reduction group ++ so we don't have to mess with too much code elsewhere. */ ++ REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; ++ REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; ++ } ++ + /* Calculate the unrolling factor based on the smallest type. */ + poly_uint64 unrolling_factor + = calculate_unrolling_factor (max_nunits, group_size); +@@ -2025,6 +2161,10 @@ vect_analyze_slp_instance (vec_info *vinfo, + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_LOADS (new_instance) = vNULL; + vect_gather_slp_loads (new_instance, node); ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "SLP size %u vs. limit %u.\n", ++ tree_size, max_tree_size); + + /* Compute the load permutation. */ + slp_tree load_node; +@@ -2231,8 +2371,11 @@ vect_make_slp_decision (loop_vec_info loop_vinfo) + FOR_EACH_VEC_ELT (slp_instances, i, instance) + { + /* FORNOW: SLP if you can. */ +- /* All unroll factors have the form current_vector_size * X for some +- rational X, so they must have a common multiple. */ ++ /* All unroll factors have the form: ++ ++ GET_MODE_SIZE (vinfo->vector_mode) * X ++ ++ for some rational X, so they must have a common multiple. 
*/ + unrolling_factor + = force_common_multiple (unrolling_factor, + SLP_INSTANCE_UNROLLING_FACTOR (instance)); +@@ -2327,7 +2470,8 @@ vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype, + + if (!only_edge) + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) +- if (SLP_TREE_DEF_TYPE (child) != vect_external_def) ++ if (SLP_TREE_DEF_TYPE (child) != vect_external_def ++ && SLP_TREE_DEF_TYPE (child) != vect_constant_def) + vect_detect_hybrid_slp_stmts (child, i, stype, visited); + } + +@@ -2514,8 +2658,15 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, + VF divided by the number of elements in a vector. */ + if (!STMT_VINFO_GROUPED_ACCESS (stmt_info) + && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) +- SLP_TREE_NUMBER_OF_VEC_STMTS (node) +- = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]); ++ { ++ for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i) ++ if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def) ++ { ++ SLP_TREE_NUMBER_OF_VEC_STMTS (node) ++ = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]); ++ break; ++ } ++ } + else + { + poly_uint64 vf; +@@ -2533,6 +2684,39 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, + return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec); + } + ++/* Try to build NODE from scalars, returning true on success. ++ NODE_INSTANCE is the SLP instance that contains NODE. */ ++ ++static bool ++vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, ++ slp_instance node_instance) ++{ ++ stmt_vec_info stmt_info; ++ unsigned int i; ++ ++ if (!is_a (vinfo) ++ || node == SLP_INSTANCE_TREE (node_instance) ++ || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))) ++ return false; ++ ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Building vector operands from scalars instead\n"); ++ ++ /* Don't remove and free the child nodes here, since they could be ++ referenced by other structures. The analysis and scheduling phases ++ (need to) ignore child nodes of anything that isn't vect_internal_def. */ ++ unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); ++ SLP_TREE_DEF_TYPE (node) = vect_external_def; ++ SLP_TREE_SCALAR_OPS (node).safe_grow (group_size); ++ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) ++ { ++ tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); ++ SLP_TREE_SCALAR_OPS (node)[i] = lhs; ++ } ++ return true; ++} ++ + /* Analyze statements contained in SLP tree NODE after recursively analyzing + the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE. + +@@ -2559,6 +2743,13 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, + { + SLP_TREE_NUMBER_OF_VEC_STMTS (node) + = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader); ++ /* Cope with cases in which we made a late decision to build the ++ node from scalars. 
*/ ++ if (SLP_TREE_DEF_TYPE (*leader) == vect_external_def ++ && vect_slp_convert_to_external (vinfo, node, node_instance)) ++ ; ++ else ++ gcc_assert (SLP_TREE_DEF_TYPE (node) == SLP_TREE_DEF_TYPE (*leader)); + return true; + } + +@@ -2579,25 +2770,31 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, + auto_vec dt; + dt.safe_grow (SLP_TREE_CHILDREN (node).length ()); + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) +- dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]); ++ if (SLP_TREE_SCALAR_STMTS (child).length () != 0) ++ dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]); + + /* Push SLP node def-type to stmt operands. */ + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) +- if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) ++ if (SLP_TREE_DEF_TYPE (child) != vect_internal_def ++ && SLP_TREE_SCALAR_STMTS (child).length () != 0) + STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) + = SLP_TREE_DEF_TYPE (child); + + /* Check everything worked out. */ + bool res = true; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) +- if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) +- { +- if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) +- != SLP_TREE_DEF_TYPE (child)) +- res = false; +- } +- else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j]) +- res = false; ++ if (SLP_TREE_SCALAR_STMTS (child).length () != 0) ++ { ++ if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) ++ { ++ if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) ++ != SLP_TREE_DEF_TYPE (child)) ++ res = false; ++ } ++ else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) ++ != dt[j]) ++ res = false; ++ } + if (!res && dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: same operand with different " +@@ -2609,7 +2806,13 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, + + /* Restore def-types. */ + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) +- STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j]; ++ if (SLP_TREE_SCALAR_STMTS (child).length () != 0) ++ STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j]; ++ ++ /* If this node can't be vectorized, try pruning the tree here rather ++ than felling the whole thing. */ ++ if (!res && vect_slp_convert_to_external (vinfo, node, node_instance)) ++ res = true; + + return res; + } +@@ -2818,19 +3021,17 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) + return true; + } + +-/* Check if the basic block can be vectorized. Returns a bb_vec_info +- if so and sets fatal to true if failure is independent of +- current_vector_size. */ ++/* Check if the region described by BB_VINFO can be vectorized, returning ++ true if so. When returning false, set FATAL to true if the same failure ++ would prevent vectorization at other vector sizes, false if it is still ++ worth trying other sizes. N_STMTS is the number of statements in the ++ region. 
*/ + +-static bb_vec_info +-vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, +- gimple_stmt_iterator region_end, +- vec datarefs, int n_stmts, +- bool &fatal, vec_info_shared *shared) ++static bool ++vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) + { + DUMP_VECT_SCOPE ("vect_slp_analyze_bb"); + +- bb_vec_info bb_vinfo; + slp_instance instance; + int i; + poly_uint64 min_vf = 2; +@@ -2838,34 +3039,15 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + /* The first group of checks is independent of the vector size. */ + fatal = true; + +- if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "not vectorized: too many instructions in " +- "basic block.\n"); +- free_data_refs (datarefs); +- return NULL; +- } +- +- bb_vinfo = new _bb_vec_info (region_begin, region_end, shared); +- if (!bb_vinfo) +- return NULL; +- +- BB_VINFO_DATAREFS (bb_vinfo) = datarefs; +- bb_vinfo->shared->save_datarefs (); +- + /* Analyze the data references. */ + +- if (!vect_analyze_data_refs (bb_vinfo, &min_vf)) ++ if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: unhandled data-ref in basic " + "block.\n"); +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2) +@@ -2874,9 +3056,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: not enough data-refs in " + "basic block.\n"); +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + if (!vect_analyze_data_ref_accesses (bb_vinfo)) +@@ -2885,9 +3065,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: unhandled data access in " + "basic block.\n"); +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + /* If there are no grouped stores in the region there is no need +@@ -2899,9 +3077,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: no grouped stores in " + "basic block.\n"); +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + /* While the rest of the analysis below depends on it in some way. */ +@@ -2921,9 +3097,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + "not vectorized: failed to find SLP opportunities " + "in basic block.\n"); + } +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + vect_record_base_alignments (bb_vinfo); +@@ -2954,19 +3128,14 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + i++; + } + if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) +- { +- delete bb_vinfo; +- return NULL; +- } ++ return false; + + if (!vect_slp_analyze_operations (bb_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: bad operation in basic block.\n"); +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + /* Cost model: check if the vectorization is worthwhile. 
*/ +@@ -2977,80 +3146,61 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vectorization is not " + "profitable.\n"); +- +- delete bb_vinfo; +- return NULL; ++ return false; + } + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Basic block will be vectorized using SLP\n"); +- +- return bb_vinfo; ++ return true; + } + ++/* Subroutine of vect_slp_bb. Try to vectorize the statements between ++ REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true ++ on success. The region has N_STMTS statements and has the datarefs ++ given by DATAREFS. */ + +-/* Main entry for the BB vectorizer. Analyze and transform BB, returns +- true if anything in the basic-block was vectorized. */ +- +-bool +-vect_slp_bb (basic_block bb) ++static bool ++vect_slp_bb_region (gimple_stmt_iterator region_begin, ++ gimple_stmt_iterator region_end, ++ vec datarefs, ++ unsigned int n_stmts) + { + bb_vec_info bb_vinfo; +- gimple_stmt_iterator gsi; +- bool any_vectorized = false; +- auto_vector_sizes vector_sizes; ++ auto_vector_modes vector_modes; + + /* Autodetect first vector size we try. */ +- current_vector_size = 0; +- targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); +- unsigned int next_size = 0; ++ machine_mode next_vector_mode = VOIDmode; ++ targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); ++ unsigned int mode_i = 0; + +- gsi = gsi_start_bb (bb); ++ vec_info_shared shared; + +- poly_uint64 autodetected_vector_size = 0; ++ machine_mode autodetected_vector_mode = VOIDmode; + while (1) + { +- if (gsi_end_p (gsi)) +- break; +- +- gimple_stmt_iterator region_begin = gsi; +- vec datarefs = vNULL; +- int insns = 0; +- +- for (; !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- gimple *stmt = gsi_stmt (gsi); +- if (is_gimple_debug (stmt)) +- continue; +- insns++; +- +- if (gimple_location (stmt) != UNKNOWN_LOCATION) +- vect_location = stmt; +- +- if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs)) +- break; +- } +- +- /* Skip leading unhandled stmts. 
*/ +- if (gsi_stmt (region_begin) == gsi_stmt (gsi)) +- { +- gsi_next (&gsi); +- continue; +- } +- +- gimple_stmt_iterator region_end = gsi; +- + bool vectorized = false; + bool fatal = false; +- vec_info_shared shared; +- bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end, +- datarefs, insns, fatal, &shared); +- if (bb_vinfo ++ bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared); ++ ++ bool first_time_p = shared.datarefs.is_empty (); ++ BB_VINFO_DATAREFS (bb_vinfo) = datarefs; ++ if (first_time_p) ++ bb_vinfo->shared->save_datarefs (); ++ else ++ bb_vinfo->shared->check_datarefs (); ++ bb_vinfo->vector_mode = next_vector_mode; ++ ++ if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal) + && dbg_cnt (vect_slp)) + { + if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Analysis succeeded with vector mode" ++ " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode)); ++ dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); ++ } + + bb_vinfo->shared->check_datarefs (); + vect_schedule_slp (bb_vinfo); +@@ -3058,7 +3208,7 @@ vect_slp_bb (basic_block bb) + unsigned HOST_WIDE_INT bytes; + if (dump_enabled_p ()) + { +- if (current_vector_size.is_constant (&bytes)) ++ if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes)) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, + "basic block part vectorized using %wu byte " + "vectors\n", bytes); +@@ -3070,50 +3220,120 @@ vect_slp_bb (basic_block bb) + + vectorized = true; + } +- delete bb_vinfo; ++ else ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Analysis failed with vector mode %s\n", ++ GET_MODE_NAME (bb_vinfo->vector_mode)); ++ } + +- any_vectorized |= vectorized; ++ if (mode_i == 0) ++ autodetected_vector_mode = bb_vinfo->vector_mode; + +- if (next_size == 0) +- autodetected_vector_size = current_vector_size; ++ if (!fatal) ++ while (mode_i < vector_modes.length () ++ && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i])) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** The result for vector mode %s would" ++ " be the same\n", ++ GET_MODE_NAME (vector_modes[mode_i])); ++ mode_i += 1; ++ } + +- if (next_size < vector_sizes.length () +- && known_eq (vector_sizes[next_size], autodetected_vector_size)) +- next_size += 1; ++ delete bb_vinfo; ++ ++ if (mode_i < vector_modes.length () ++ && VECTOR_MODE_P (autodetected_vector_mode) ++ && (related_vector_mode (vector_modes[mode_i], ++ GET_MODE_INNER (autodetected_vector_mode)) ++ == autodetected_vector_mode) ++ && (related_vector_mode (autodetected_vector_mode, ++ GET_MODE_INNER (vector_modes[mode_i])) ++ == vector_modes[mode_i])) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Skipping vector mode %s, which would" ++ " repeat the analysis for %s\n", ++ GET_MODE_NAME (vector_modes[mode_i]), ++ GET_MODE_NAME (autodetected_vector_mode)); ++ mode_i += 1; ++ } + + if (vectorized +- || next_size == vector_sizes.length () +- || known_eq (current_vector_size, 0U) ++ || mode_i == vector_modes.length () ++ || autodetected_vector_mode == VOIDmode + /* If vect_slp_analyze_bb_1 signaled that analysis for all + vector sizes will fail do not bother iterating. */ + || fatal) ++ return vectorized; ++ ++ /* Try the next biggest vector size. 
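A standalone sketch of the retry loop this hunk builds (the mode names and the analyse/same_result helpers are made up for the example; in the patch they roughly correspond to autovectorize_vector_modes, vect_slp_analyze_bb_1 and vect_chooses_same_modes_p): candidate vector modes are tried in turn, candidates that would only repeat an analysis already done are skipped, and the loop stops on success, on a fatal failure, or when the candidates run out.

#include <cstdio>
#include <string>
#include <vector>

/* Illustrative toy only, not GCC code.  */
static bool
analyse (const std::string &mode, bool &fatal)
{
  fatal = false;
  return mode == "V8HI";       /* pretend only this mode succeeds */
}

static bool
same_result (const std::string &a, const std::string &b)
{
  return a == b;               /* stand-in for "would choose the same modes" */
}

int
main ()
{
  std::vector<std::string> modes = { "V16QI", "V8HI", "V4SI" };
  std::string next_mode = modes[0];   /* stands in for the autodetected mode */
  unsigned mode_i = 0;
  while (true)
    {
      bool fatal = false;
      bool ok = analyse (next_mode, fatal);
      if (!fatal)
	while (mode_i < modes.size () && same_result (next_mode, modes[mode_i]))
	  ++mode_i;            /* that candidate would repeat this analysis */
      if (ok || fatal || mode_i == modes.size ())
	{
	  std::printf ("%s (mode %s)\n",
		       ok ? "vectorized" : "not vectorized", next_mode.c_str ());
	  return 0;
	}
      next_mode = modes[mode_i++];    /* re-try with the next candidate mode */
    }
}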
*/ ++ next_vector_mode = vector_modes[mode_i++]; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "***** Re-trying analysis with vector mode %s\n", ++ GET_MODE_NAME (next_vector_mode)); ++ } ++} ++ ++/* Main entry for the BB vectorizer. Analyze and transform BB, returns ++ true if anything in the basic-block was vectorized. */ ++ ++bool ++vect_slp_bb (basic_block bb) ++{ ++ gimple_stmt_iterator gsi; ++ bool any_vectorized = false; ++ ++ gsi = gsi_start_bb (bb); ++ while (!gsi_end_p (gsi)) ++ { ++ gimple_stmt_iterator region_begin = gsi; ++ vec datarefs = vNULL; ++ int insns = 0; ++ ++ for (; !gsi_end_p (gsi); gsi_next (&gsi)) + { +- if (gsi_end_p (region_end)) ++ gimple *stmt = gsi_stmt (gsi); ++ if (is_gimple_debug (stmt)) ++ continue; ++ insns++; ++ ++ if (gimple_location (stmt) != UNKNOWN_LOCATION) ++ vect_location = stmt; ++ ++ if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs)) + break; ++ } + +- /* Skip the unhandled stmt. */ ++ /* Skip leading unhandled stmts. */ ++ if (gsi_stmt (region_begin) == gsi_stmt (gsi)) ++ { + gsi_next (&gsi); +- +- /* And reset vector sizes. */ +- current_vector_size = 0; +- next_size = 0; ++ continue; + } +- else ++ ++ gimple_stmt_iterator region_end = gsi; ++ ++ if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) + { +- /* Try the next biggest vector size. */ +- current_vector_size = vector_sizes[next_size++]; + if (dump_enabled_p ()) +- { +- dump_printf_loc (MSG_NOTE, vect_location, +- "***** Re-trying analysis with " +- "vector size "); +- dump_dec (MSG_NOTE, current_vector_size); +- dump_printf (MSG_NOTE, "\n"); +- } +- +- /* Start over. */ +- gsi = region_begin; ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: too many instructions in " ++ "basic block.\n"); + } ++ else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns)) ++ any_vectorized = true; ++ ++ if (gsi_end_p (region_end)) ++ break; ++ ++ /* Skip the unhandled stmt. */ ++ gsi_next (&gsi); + } + + return any_vectorized; +@@ -3184,8 +3404,9 @@ vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo) + to cut down on the number of interleaves. */ + + void +-duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec elts, +- unsigned int nresults, vec &results) ++duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type, ++ vec elts, unsigned int nresults, ++ vec &results) + { + unsigned int nelts = elts.length (); + tree element_type = TREE_TYPE (vector_type); +@@ -3194,7 +3415,7 @@ duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec elts, + unsigned int nvectors = 1; + tree new_vector_type; + tree permutes[2]; +- if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type), ++ if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type), + &nvectors, &new_vector_type, + permutes)) + gcc_unreachable (); +@@ -3276,52 +3497,45 @@ duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec elts, + + /* For constant and loop invariant defs of SLP_NODE this function returns + (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. +- OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of +- scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. +- REDUC_INDEX is the index of the reduction operand in the statements, unless +- it is -1. */ ++ OP_NODE determines the node for the operand containing the scalar ++ operands. 
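As a rough model of how the constant/invariant operand vectors get filled (the values and sizes below are invented for illustration): the group of scalar operands is repeated number_of_copies = nunits * number_of_vectors / group_size times, which is greater than one when the SLP group has to be unrolled to fill whole vectors.

#include <cstdio>
#include <vector>

/* Illustrative toy only, not GCC code.  */
int
main ()
{
  std::vector<int> scalar_ops = { 10, 20 };   /* the node's scalar operands */
  unsigned nunits = 4;                        /* lanes per vector */
  unsigned number_of_vectors = 2;             /* vector defs requested */
  unsigned group_size = scalar_ops.size ();
  unsigned number_of_copies = nunits * number_of_vectors / group_size;

  std::printf ("number_of_copies = %u\n", number_of_copies);
  std::vector<std::vector<int>> vectors (number_of_vectors);
  unsigned lane = 0;
  for (unsigned c = 0; c < number_of_copies; ++c)
    for (unsigned i = 0; i < group_size; ++i)
      {
	vectors[lane / nunits].push_back (scalar_ops[i]);
	++lane;
      }
  for (unsigned v = 0; v < number_of_vectors; ++v)
    {
      std::printf ("vector %u: { ", v);
      for (int x : vectors[v])
	std::printf ("%d ", x);
      std::printf ("}\n");
    }
  return 0;
}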
*/ + + static void +-vect_get_constant_vectors (tree op, slp_tree slp_node, +- vec *vec_oprnds, +- unsigned int op_num, unsigned int number_of_vectors) ++vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node, ++ vec *vec_oprnds) + { +- vec stmts = SLP_TREE_SCALAR_STMTS (slp_node); +- stmt_vec_info stmt_vinfo = stmts[0]; +- gimple *stmt = stmt_vinfo->stmt; ++ stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0]; ++ vec_info *vinfo = stmt_vinfo->vinfo; + unsigned HOST_WIDE_INT nunits; + tree vec_cst; + unsigned j, number_of_places_left_in_vector; + tree vector_type; + tree vop; +- int group_size = stmts.length (); ++ int group_size = op_node->ops.length (); + unsigned int vec_num, i; + unsigned number_of_copies = 1; +- vec voprnds; +- voprnds.create (number_of_vectors); +- bool constant_p, is_store; ++ bool constant_p; + tree neutral_op = NULL; +- enum tree_code code = gimple_expr_code (stmt); + gimple_seq ctor_seq = NULL; + auto_vec permute_results; + ++ /* ??? SLP analysis should compute the vector type for the ++ constant / invariant and store it in the SLP node. */ ++ tree op = op_node->ops[0]; + /* Check if vector type is a boolean vector. */ ++ tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo); + if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) + && vect_mask_constant_operand_p (stmt_vinfo)) +- vector_type +- = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo)); +- else +- vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); +- +- if (STMT_VINFO_DATA_REF (stmt_vinfo)) +- { +- is_store = true; +- op = gimple_assign_rhs1 (stmt); +- } ++ vector_type = truth_type_for (stmt_vectype); + else +- is_store = false; ++ vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op)); + +- gcc_assert (op); ++ unsigned int number_of_vectors ++ = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) ++ * TYPE_VECTOR_SUBPARTS (stmt_vectype), ++ vector_type); ++ vec_oprnds->create (number_of_vectors); ++ auto_vec voprnds (number_of_vectors); + + /* NUMBER_OF_COPIES is the number of times we need to use the same values in + created vectors. It is greater than 1 if unrolling is performed. +@@ -3353,56 +3567,8 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, + bool place_after_defs = false; + for (j = 0; j < number_of_copies; j++) + { +- for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--) ++ for (i = group_size - 1; op_node->ops.iterate (i, &op); i--) + { +- stmt = stmt_vinfo->stmt; +- if (is_store) +- op = gimple_assign_rhs1 (stmt); +- else +- { +- switch (code) +- { +- case COND_EXPR: +- { +- tree cond = gimple_assign_rhs1 (stmt); +- if (TREE_CODE (cond) == SSA_NAME) +- op = gimple_op (stmt, op_num + 1); +- else if (op_num == 0 || op_num == 1) +- op = TREE_OPERAND (cond, op_num); +- else +- { +- if (op_num == 2) +- op = gimple_assign_rhs2 (stmt); +- else +- op = gimple_assign_rhs3 (stmt); +- } +- } +- break; +- +- case CALL_EXPR: +- op = gimple_call_arg (stmt, op_num); +- break; +- +- case LSHIFT_EXPR: +- case RSHIFT_EXPR: +- case LROTATE_EXPR: +- case RROTATE_EXPR: +- op = gimple_op (stmt, op_num + 1); +- /* Unlike the other binary operators, shifts/rotates have +- the shift count being int, instead of the same type as +- the lhs, so make sure the scalar is the right type if +- we are dealing with vectors of +- long long/long/short/char. 
*/ +- if (op_num == 1 && TREE_CODE (op) == INTEGER_CST) +- op = fold_convert (TREE_TYPE (vector_type), op); +- break; +- +- default: +- op = gimple_op (stmt, op_num + 1); +- break; +- } +- } +- + /* Create 'vect_ = {op0,op1,...,opn}'. */ + number_of_places_left_in_vector--; + tree orig_op = op; +@@ -3472,9 +3638,9 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, + vec_cst = gimple_build_vector (&ctor_seq, &elts); + else + { +- if (vec_oprnds->is_empty ()) +- duplicate_and_interleave (&ctor_seq, vector_type, elts, +- number_of_vectors, ++ if (permute_results.is_empty ()) ++ duplicate_and_interleave (vinfo, &ctor_seq, vector_type, ++ elts, number_of_vectors, + permute_results); + vec_cst = permute_results[number_of_vectors - j - 1]; + } +@@ -3516,8 +3682,6 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, + vec_oprnds->quick_push (vop); + } + +- voprnds.release (); +- + /* In case that VF is greater than the unrolling factor needed for the SLP + group of stmts, NUMBER_OF_VECTORS to be created is greater than + NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have +@@ -3548,25 +3712,17 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, + static void + vect_get_slp_vect_defs (slp_tree slp_node, vec *vec_oprnds) + { +- tree vec_oprnd; + stmt_vec_info vec_def_stmt_info; + unsigned int i; + + gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ()); + + FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info) +- { +- gcc_assert (vec_def_stmt_info); +- if (gphi *vec_def_phi = dyn_cast (vec_def_stmt_info->stmt)) +- vec_oprnd = gimple_phi_result (vec_def_phi); +- else +- vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt); +- vec_oprnds->quick_push (vec_oprnd); +- } ++ vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt)); + } + + +-/* Get vectorized definitions for SLP_NODE. ++/* Get N vectorized definitions for SLP_NODE. + If the scalar definitions are loop invariants or constants, collect them and + call vect_get_constant_vectors() to create vector stmts. + Otherwise, the def-stmts must be already vectorized and the vectorized stmts +@@ -3574,91 +3730,26 @@ vect_get_slp_vect_defs (slp_tree slp_node, vec *vec_oprnds) + vect_get_slp_vect_defs () to retrieve them. */ + + void +-vect_get_slp_defs (vec ops, slp_tree slp_node, +- vec > *vec_oprnds) ++vect_get_slp_defs (slp_tree slp_node, vec > *vec_oprnds, unsigned n) + { +- int number_of_vects = 0, i; +- unsigned int child_index = 0; +- HOST_WIDE_INT lhs_size_unit, rhs_size_unit; +- slp_tree child = NULL; +- vec vec_defs; +- tree oprnd; +- bool vectorized_defs; ++ if (n == -1U) ++ n = SLP_TREE_CHILDREN (slp_node).length (); + +- stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; +- FOR_EACH_VEC_ELT (ops, i, oprnd) ++ for (unsigned i = 0; i < n; ++i) + { +- /* For each operand we check if it has vectorized definitions in a child +- node or we need to create them (for invariants and constants). We +- check if the LHS of the first stmt of the next child matches OPRND. +- If it does, we found the correct child. Otherwise, we call +- vect_get_constant_vectors (), and not advance CHILD_INDEX in order +- to check this child node for the next operand. */ +- vectorized_defs = false; +- if (SLP_TREE_CHILDREN (slp_node).length () > child_index) +- { +- child = SLP_TREE_CHILDREN (slp_node)[child_index]; +- +- /* We have to check both pattern and original def, if available. 
*/ +- if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) +- { +- stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0]; +- stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info); +- tree first_def_op; +- +- if (gphi *first_def = dyn_cast (first_def_info->stmt)) +- first_def_op = gimple_phi_result (first_def); +- else +- first_def_op = gimple_get_lhs (first_def_info->stmt); +- if (operand_equal_p (oprnd, first_def_op, 0) +- || (related +- && operand_equal_p (oprnd, +- gimple_get_lhs (related->stmt), 0))) +- { +- /* The number of vector defs is determined by the number of +- vector statements in the node from which we get those +- statements. */ +- number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child); +- vectorized_defs = true; +- child_index++; +- } +- } +- else +- child_index++; +- } +- +- if (!vectorized_defs) +- { +- if (i == 0) +- { +- number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); +- /* Number of vector stmts was calculated according to LHS in +- vect_schedule_slp_instance (), fix it by replacing LHS with +- RHS, if necessary. See vect_get_smallest_scalar_type () for +- details. */ +- vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit, +- &rhs_size_unit); +- if (rhs_size_unit != lhs_size_unit) +- { +- number_of_vects *= rhs_size_unit; +- number_of_vects /= lhs_size_unit; +- } +- } +- } ++ slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; + +- /* Allocate memory for vectorized defs. */ +- vec_defs = vNULL; +- vec_defs.create (number_of_vects); ++ vec vec_defs = vNULL; + +- /* For reduction defs we call vect_get_constant_vectors (), since we are +- looking for initial loop invariant values. */ +- if (vectorized_defs) +- /* The defs are already vectorized. */ +- vect_get_slp_vect_defs (child, &vec_defs); ++ /* For each operand we check if it has vectorized definitions in a child ++ node or we need to create them (for invariants and constants). */ ++ if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) ++ { ++ vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child)); ++ vect_get_slp_vect_defs (child, &vec_defs); ++ } + else +- /* Build vectors from scalar defs. */ +- vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i, +- number_of_vects); ++ vect_get_constant_vectors (child, slp_node, &vec_defs); + + vec_oprnds->quick_push (vec_defs); + } +@@ -3939,17 +4030,6 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, + stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); + si = gsi_for_stmt (last_stmt_info->stmt); + +- /* Mark the first element of the reduction chain as reduction to properly +- transform the node. In the analysis phase only the last element of the +- chain is marked as reduction. */ +- if (!STMT_VINFO_GROUPED_ACCESS (stmt_info) +- && REDUC_GROUP_FIRST_ELEMENT (stmt_info) +- && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) +- { +- STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def; +- STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; +- } +- + /* Handle two-operation SLP nodes by vectorizing the group with + both operations and then performing a merge. 
*/ + if (SLP_TREE_TWO_OPERATORS (node)) +diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 74abfbfe56e..5d6da3d9708 100644 +--- a/gcc/tree-vect-stmts.c ++++ b/gcc/tree-vect-stmts.c +@@ -329,13 +329,13 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, + basic_block bb = gimple_bb (USE_STMT (use_p)); + if (!flow_bb_inside_loop_p (loop, bb)) + { ++ if (is_gimple_debug (USE_STMT (use_p))) ++ continue; ++ + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vec_stmt_relevant_p: used out of loop.\n"); + +- if (is_gimple_debug (USE_STMT (use_p))) +- continue; +- + /* We expect all such uses to be in the loop exit phis + (because of loop closed form) */ + gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI); +@@ -456,7 +456,6 @@ process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo, + bool force) + { + stmt_vec_info dstmt_vinfo; +- basic_block bb, def_bb; + enum vect_def_type dt; + + /* case 1: we are only interested in uses that need to be vectorized. Uses +@@ -472,28 +471,8 @@ process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo, + if (!dstmt_vinfo) + return opt_result::success (); + +- def_bb = gimple_bb (dstmt_vinfo->stmt); +- +- /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO). +- DSTMT_VINFO must have already been processed, because this should be the +- only way that STMT, which is a reduction-phi, was put in the worklist, +- as there should be no other uses for DSTMT_VINFO in the loop. So we just +- check that everything is as expected, and we are done. */ +- bb = gimple_bb (stmt_vinfo->stmt); +- if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI +- && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def +- && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI +- && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def +- && bb->loop_father == def_bb->loop_father) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "reduc-stmt defining reduc-phi in the same nest.\n"); +- gcc_assert (STMT_VINFO_RELEVANT (dstmt_vinfo) < vect_used_by_reduction); +- gcc_assert (STMT_VINFO_LIVE_P (dstmt_vinfo) +- || STMT_VINFO_RELEVANT (dstmt_vinfo) > vect_unused_in_scope); +- return opt_result::success (); +- } ++ basic_block def_bb = gimple_bb (dstmt_vinfo->stmt); ++ basic_block bb = gimple_bb (stmt_vinfo->stmt); + + /* case 3a: outer-loop stmt defining an inner-loop stmt: + outer-loop-header-bb: +@@ -607,7 +586,7 @@ process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo, + This pass detects such stmts. 
*/ + + opt_result +-vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo) ++vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal) + { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); +@@ -777,7 +756,11 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo) + = process_use (stmt_vinfo, gs_info.offset, loop_vinfo, relevant, + &worklist, true); + if (!res) +- return res; ++ { ++ if (fatal) ++ *fatal = false; ++ return res; ++ } + } + } /* while worklist */ + +@@ -791,6 +774,7 @@ vect_prologue_cost_for_slp_op (slp_tree node, stmt_vec_info stmt_info, + unsigned opno, enum vect_def_type dt, + stmt_vector_for_cost *cost_vec) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0]->stmt; + tree op = gimple_op (stmt, opno); + unsigned prologue_cost = 0; +@@ -798,7 +782,7 @@ vect_prologue_cost_for_slp_op (slp_tree node, stmt_vec_info stmt_info, + /* Without looking at the actual initializer a vector of + constants can be implemented as load from the constant pool. + When all elements are the same we can use a splat. */ +- tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op)); ++ tree vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op)); + unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length (); + unsigned num_vects_to_check; + unsigned HOST_WIDE_INT const_nunits; +@@ -1603,9 +1587,9 @@ vect_get_vec_def_for_operand (tree op, stmt_vec_info stmt_vinfo, tree vectype) + vector_type = vectype; + else if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) + && VECTOR_BOOLEAN_TYPE_P (stmt_vectype)) +- vector_type = build_same_sized_truth_vector_type (stmt_vectype); ++ vector_type = truth_type_for (stmt_vectype); + else +- vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); ++ vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op)); + + gcc_assert (vector_type); + return vect_init_vector (stmt_vinfo, op, vector_type, NULL); +@@ -1720,16 +1704,8 @@ vect_get_vec_defs (tree op0, tree op1, stmt_vec_info stmt_info, + { + if (slp_node) + { +- int nops = (op1 == NULL_TREE) ? 1 : 2; +- auto_vec ops (nops); +- auto_vec > vec_defs (nops); +- +- ops.quick_push (op0); +- if (op1) +- ops.quick_push (op1); +- +- vect_get_slp_defs (ops, slp_node, &vec_defs); +- ++ auto_vec > vec_defs (SLP_TREE_CHILDREN (slp_node).length ()); ++ vect_get_slp_defs (slp_node, &vec_defs, op1 ? 2 : 1); + *vec_oprnds0 = vec_defs[0]; + if (op1) + *vec_oprnds1 = vec_defs[1]; +@@ -1874,7 +1850,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info, + says how the load or store is going to be implemented and GROUP_SIZE + is the number of load or store statements in the containing group. + If the access is a gather load or scatter store, GS_INFO describes +- its arguments. ++ its arguments. If the load or store is conditional, SCALAR_MASK is the ++ condition under which it occurs. + + Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not + supported, otherwise record the required mask types. */ +@@ -1883,7 +1860,7 @@ static void + check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, + vec_load_store_type vls_type, int group_size, + vect_memory_access_type memory_access_type, +- gather_scatter_info *gs_info) ++ gather_scatter_info *gs_info, tree scalar_mask) + { + /* Invariant loads need no special support. 
*/ + if (memory_access_type == VMAT_INVARIANT) +@@ -1907,7 +1884,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, + return; + } + unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype); +- vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype); ++ vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask); + return; + } + +@@ -1931,7 +1908,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, + return; + } + unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype); +- vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype); ++ vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask); + return; + } + +@@ -1949,9 +1926,8 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, + } + + machine_mode mask_mode; +- if (!(targetm.vectorize.get_mask_mode +- (GET_MODE_NUNITS (vecmode), +- GET_MODE_SIZE (vecmode)).exists (&mask_mode)) ++ if (!VECTOR_MODE_P (vecmode) ++ || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode) + || !can_vec_mask_load_store_p (vecmode, mask_mode, is_load)) + { + if (dump_enabled_p ()) +@@ -1969,7 +1945,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + unsigned int nvectors; + if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors)) +- vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype); ++ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask); + else + gcc_unreachable (); + } +@@ -2311,6 +2287,29 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp, + && gap < (vect_known_alignment_in_bytes (first_dr_info) + / vect_get_scalar_dr_size (first_dr_info))) + overrun_p = false; ++ ++ /* If the gap splits the vector in half and the target ++ can do half-vector operations avoid the epilogue peeling ++ by simply loading half of the vector only. Usually ++ the construction with an upper zero half will be elided. 
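A standalone illustration of the half-vector trick described in the comment above (array contents and sizes are invented): with a group of four elements of which the last two are a gap, loading a full vector for the final group could read past the end of the data, so only the low half is loaded and the upper half is filled with zeros, avoiding an epilogue peeled just for the gap.

#include <array>
#include <cstdio>
#include <cstring>

/* Illustrative toy only, not GCC code.  */
int
main ()
{
  /* Two useful elements per group, four lanes per vector: gap of two.
     The slots after the last group do not exist in memory.  */
  int data[] = { 1, 2, /* gap of group 0 */ 0, 0, 3, 4 };
  std::array<int, 4> vec{};                          /* upper half stays zero */
  std::memcpy (vec.data (), data + 4, 2 * sizeof (int));  /* half-vector load */
  std::printf ("{ %d, %d, %d, %d }\n", vec[0], vec[1], vec[2], vec[3]);
  return 0;
}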
*/ ++ dr_alignment_support alignment_support_scheme; ++ scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); ++ machine_mode vmode; ++ if (overrun_p ++ && !masked_p ++ && (((alignment_support_scheme ++ = vect_supportable_dr_alignment (first_dr_info, false))) ++ == dr_aligned ++ || alignment_support_scheme == dr_unaligned_supported) ++ && known_eq (nunits, (group_size - gap) * 2) ++ && known_eq (nunits, group_size) ++ && related_vector_mode (TYPE_MODE (vectype), elmode, ++ group_size - gap).exists (&vmode) ++ && (convert_optab_handler (vec_init_optab, ++ TYPE_MODE (vectype), vmode) ++ != CODE_FOR_nothing)) ++ overrun_p = false; ++ + if (overrun_p && !can_overrun_p) + { + if (dump_enabled_p ()) +@@ -2536,6 +2535,7 @@ vect_check_load_store_mask (stmt_vec_info stmt_info, tree mask, + vect_def_type *mask_dt_out, + tree *mask_vectype_out) + { ++ vec_info *vinfo = stmt_info->vinfo; + if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (mask))) + { + if (dump_enabled_p ()) +@@ -2564,7 +2564,7 @@ vect_check_load_store_mask (stmt_vec_info stmt_info, tree mask, + + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + if (!mask_vectype) +- mask_vectype = get_mask_type_for_scalar_type (TREE_TYPE (vectype)); ++ mask_vectype = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (vectype)); + + if (!mask_vectype || !VECTOR_BOOLEAN_TYPE_P (mask_vectype)) + { +@@ -2728,7 +2728,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, + || TREE_CODE (masktype) == INTEGER_TYPE + || types_compatible_p (srctype, masktype))); + if (mask && TREE_CODE (masktype) == INTEGER_TYPE) +- masktype = build_same_sized_truth_vector_type (srctype); ++ masktype = truth_type_for (srctype); + + tree mask_halftype = masktype; + tree perm_mask = NULL_TREE; +@@ -2774,8 +2774,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, + mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices); + } + else if (mask) +- mask_halftype +- = build_same_sized_truth_vector_type (gs_info->offset_vectype); ++ mask_halftype = truth_type_for (gs_info->offset_vectype); + } + else + gcc_unreachable (); +@@ -2952,6 +2951,7 @@ vect_get_gather_scatter_ops (struct loop *loop, stmt_vec_info stmt_info, + gather_scatter_info *gs_info, + tree *dataref_ptr, tree *vec_offset) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple_seq stmts = NULL; + *dataref_ptr = force_gimple_operand (gs_info->base, &stmts, true, NULL_TREE); + if (stmts != NULL) +@@ -2962,7 +2962,7 @@ vect_get_gather_scatter_ops (struct loop *loop, stmt_vec_info stmt_info, + gcc_assert (!new_bb); + } + tree offset_type = TREE_TYPE (gs_info->offset); +- tree offset_vectype = get_vectype_for_scalar_type (offset_type); ++ tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type); + *vec_offset = vect_get_vec_def_for_operand (gs_info->offset, stmt_info, + offset_vectype); + } +@@ -2997,7 +2997,7 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info, + /* The offset given in GS_INFO can have pointer type, so use the element + type of the vector instead. */ + tree offset_type = TREE_TYPE (gs_info->offset); +- tree offset_vectype = get_vectype_for_scalar_type (offset_type); ++ tree offset_vectype = get_vectype_for_scalar_type (loop_vinfo, offset_type); + offset_type = TREE_TYPE (offset_vectype); + + /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. 
*/ +@@ -3161,8 +3161,7 @@ simple_integer_narrowing (tree vectype_out, tree vectype_in, + int multi_step_cvt = 0; + auto_vec interm_types; + if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in, +- &code, &multi_step_cvt, +- &interm_types) ++ &code, &multi_step_cvt, &interm_types) + || multi_step_cvt) + return false; + +@@ -3295,10 +3294,10 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + return false; + } + } +- /* If all arguments are external or constant defs use a vector type with +- the same size as the output vector type. */ ++ /* If all arguments are external or constant defs, infer the vector type ++ from the scalar type. */ + if (!vectype_in) +- vectype_in = get_same_sized_vectype (rhs_type, vectype_out); ++ vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type); + if (vec_stmt) + gcc_assert (vectype_in); + if (!vectype_in) +@@ -3309,6 +3308,19 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + + return false; + } ++ /* FORNOW: we don't yet support mixtures of vector sizes for calls, ++ just mixtures of nunits. E.g. DI->SI versions of __builtin_ctz* ++ are traditionally vectorized as two VnDI->VnDI IFN_CTZs followed ++ by a pack of the two vectors into an SI vector. We would need ++ separate code to handle direct VnDI->VnSI IFN_CTZs. */ ++ if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype_out)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "mismatched vector sizes %T and %T\n", ++ vectype_in, vectype_out); ++ return false; ++ } + + /* FORNOW */ + nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); +@@ -3415,7 +3427,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + unsigned int nvectors = (slp_node + ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) + : ncopies); +- vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out); ++ tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno); ++ vect_record_loop_mask (loop_vinfo, masks, nvectors, ++ vectype_out, scalar_mask); + } + return true; + } +@@ -3446,9 +3460,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + auto_vec > vec_defs (nargs); + vec vec_oprnds0; + +- for (i = 0; i < nargs; i++) +- vargs[i] = gimple_call_arg (stmt, i); +- vect_get_slp_defs (vargs, slp_node, &vec_defs); ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds0 = vec_defs[0]; + + /* Arguments are ready. Create the new vector stmt. 
*/ +@@ -3470,8 +3482,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + = gimple_build_call_internal_vec (ifn, vargs); + gimple_call_set_lhs (call, half_res); + gimple_call_set_nothrow (call, true); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, call, gsi); ++ vect_finish_stmt_generation (stmt_info, call, gsi); + if ((i & 1) == 0) + { + prev_res = half_res; +@@ -3523,8 +3534,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (mask_opno >= 0 && !vectypes[mask_opno]) + { + gcc_assert (modifier != WIDEN); +- vectypes[mask_opno] +- = build_same_sized_truth_vector_type (vectype_in); ++ vectypes[mask_opno] = truth_type_for (vectype_in); + } + + for (i = 0; i < nargs; i++) +@@ -3570,8 +3580,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + gcall *call = gimple_build_call_internal_vec (ifn, vargs); + gimple_call_set_lhs (call, half_res); + gimple_call_set_nothrow (call, true); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, call, gsi); ++ vect_finish_stmt_generation (stmt_info, call, gsi); + if ((j & 1) == 0) + { + prev_res = half_res; +@@ -3622,9 +3631,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + auto_vec > vec_defs (nargs); + vec vec_oprnds0; + +- for (i = 0; i < nargs; i++) +- vargs.quick_push (gimple_call_arg (stmt, i)); +- vect_get_slp_defs (vargs, slp_node, &vec_defs); ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds0 = vec_defs[0]; + + /* Arguments are ready. Create the new vector stmt. */ +@@ -4087,9 +4094,8 @@ vectorizable_simd_clone_call (stmt_vec_info stmt_info, + || arginfo[i].dt == vect_external_def) + && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) + { +- arginfo[i].vectype +- = get_vectype_for_scalar_type (TREE_TYPE (gimple_call_arg (stmt, +- i))); ++ tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i)); ++ arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type); + if (arginfo[i].vectype == NULL + || (simd_clone_subparts (arginfo[i].vectype) + > bestn->simdclone->simdlen)) +@@ -4802,10 +4808,10 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + +- /* If op0 is an external or constant defs use a vector type of +- the same size as the output vector type. */ ++ /* If op0 is an external or constant def, infer the vector type ++ from the scalar type. */ + if (!vectype_in) +- vectype_in = get_same_sized_vectype (rhs_type, vectype_out); ++ vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type); + if (vec_stmt) + gcc_assert (vectype_in); + if (!vectype_in) +@@ -4863,7 +4869,9 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + switch (modifier) + { + case NONE: +- if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR) ++ if (code != FIX_TRUNC_EXPR ++ && code != FLOAT_EXPR ++ && !CONVERT_EXPR_CODE_P (code)) + return false; + if (supportable_convert_operation (code, vectype_out, vectype_in, + &decl1, &code1)) +@@ -5452,7 +5460,7 @@ vectorizable_assignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + either as shift by a scalar or by a vector. 
*/ + + bool +-vect_supportable_shift (enum tree_code code, tree scalar_type) ++vect_supportable_shift (vec_info *vinfo, enum tree_code code, tree scalar_type) + { + + machine_mode vec_mode; +@@ -5460,7 +5468,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type) + int icode; + tree vectype; + +- vectype = get_vectype_for_scalar_type (scalar_type); ++ vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + if (!vectype) + return false; + +@@ -5491,7 +5499,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type) + stmt to replace it, put it in VEC_STMT, and insert it at GSI. + Return true if STMT_INFO is vectorizable in this way. */ + +-bool ++static bool + vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + stmt_vec_info *vec_stmt, slp_tree slp_node, + stmt_vector_for_cost *cost_vec) +@@ -5524,6 +5532,7 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + bool scalar_shift_arg = true; + bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); + vec_info *vinfo = stmt_info->vinfo; ++ bool incompatible_op1_vectype_p = false; + + if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) + return false; +@@ -5565,10 +5574,10 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + "use not simple.\n"); + return false; + } +- /* If op0 is an external or constant def use a vector type with +- the same size as the output vector type. */ ++ /* If op0 is an external or constant def, infer the vector type ++ from the scalar type. */ + if (!vectype) +- vectype = get_same_sized_vectype (TREE_TYPE (op0), vectype_out); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0)); + if (vec_stmt) + gcc_assert (vectype); + if (!vectype) +@@ -5666,9 +5675,16 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + "vector/vector shift/rotate found.\n"); + + if (!op1_vectype) +- op1_vectype = get_same_sized_vectype (TREE_TYPE (op1), vectype_out); +- if (op1_vectype == NULL_TREE +- || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype)) ++ op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1)); ++ incompatible_op1_vectype_p ++ = (op1_vectype == NULL_TREE ++ || maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype), ++ TYPE_VECTOR_SUBPARTS (vectype)) ++ || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype)); ++ if (incompatible_op1_vectype_p ++ && (!slp_node ++ || SLP_TREE_DEF_TYPE ++ (SLP_TREE_CHILDREN (slp_node)[1]) != vect_constant_def)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +@@ -5707,7 +5723,10 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + so make sure the scalar is the right type if we are + dealing with vectors of long long/long/short/char. */ + if (dt[1] == vect_constant_def) +- op1 = fold_convert (TREE_TYPE (vectype), op1); ++ { ++ if (!slp_node) ++ op1 = fold_convert (TREE_TYPE (vectype), op1); ++ } + else if (!useless_type_conversion_p (TREE_TYPE (vectype), + TREE_TYPE (op1))) + { +@@ -5818,6 +5837,21 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + } ++ else if (slp_node && incompatible_op1_vectype_p) ++ { ++ /* Convert the scalar constant shift amounts in-place. 
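For context, a tiny self-contained example of why that conversion matters (types and values invented for illustration): in the scalar source the shift amount of x << n has type int, but a per-lane shift of 64-bit elements wants the counts in the element type, so constant amounts are converted up front.

#include <cstdint>
#include <cstdio>

/* Illustrative toy only, not GCC code.  */
int
main ()
{
  int64_t lanes[4] = { 1, 2, 3, 4 };
  int narrow_counts[4] = { 1, 2, 3, 4 };   /* counts as written in the source */
  int64_t counts[4];
  for (int i = 0; i < 4; ++i)
    counts[i] = narrow_counts[i];          /* the up-front conversion */
  for (int i = 0; i < 4; ++i)
    lanes[i] <<= counts[i];                /* one shift amount per lane */
  for (int i = 0; i < 4; ++i)
    std::printf ("%lld ", (long long) lanes[i]);
  std::printf ("\n");
  return 0;
}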
*/ ++ slp_tree shift = SLP_TREE_CHILDREN (slp_node)[1]; ++ gcc_assert (SLP_TREE_DEF_TYPE (shift) == vect_constant_def); ++ for (unsigned i = 0; ++ i < SLP_TREE_SCALAR_OPS (shift).length (); ++i) ++ { ++ SLP_TREE_SCALAR_OPS (shift)[i] ++ = fold_convert (TREE_TYPE (vectype), ++ SLP_TREE_SCALAR_OPS (shift)[i]); ++ gcc_assert ((TREE_CODE (SLP_TREE_SCALAR_OPS (shift)[i]) ++ == INTEGER_CST)); ++ } ++ } + + /* vec_oprnd1 is available if operand 1 should be of a scalar-type + (a special case for certain kind of vector shifts); otherwise, +@@ -5894,7 +5928,7 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + poly_uint64 nunits_in; + poly_uint64 nunits_out; + tree vectype_out; +- int ncopies; ++ int ncopies, vec_num; + int j, i; + vec vec_oprnds0 = vNULL; + vec vec_oprnds1 = vNULL; +@@ -5964,8 +5998,8 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + "use not simple.\n"); + return false; + } +- /* If op0 is an external or constant def use a vector type with +- the same size as the output vector type. */ ++ /* If op0 is an external or constant def, infer the vector type ++ from the scalar type. */ + if (!vectype) + { + /* For boolean type we cannot determine vectype by +@@ -5985,7 +6019,7 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vectype = vectype_out; + } + else +- vectype = get_same_sized_vectype (TREE_TYPE (op0), vectype_out); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0)); + } + if (vec_stmt) + gcc_assert (vectype); +@@ -6031,9 +6065,15 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in + case of SLP. */ + if (slp_node) +- ncopies = 1; ++ { ++ ncopies = 1; ++ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); ++ } + else +- ncopies = vect_get_num_copies (loop_vinfo, vectype); ++ { ++ ncopies = vect_get_num_copies (loop_vinfo, vectype); ++ vec_num = 1; ++ } + + gcc_assert (ncopies >= 1); + +@@ -6086,8 +6126,34 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + return false; + } + ++ int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info); ++ vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL); ++ internal_fn cond_fn = get_conditional_internal_fn (code); ++ + if (!vec_stmt) /* transformation not required. */ + { ++ /* If this operation is part of a reduction, a fully-masked loop ++ should only change the active lanes of the reduction chain, ++ keeping the inactive lanes as-is. 
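A scalar model of the conditional operation such a masked reduction step uses (the arrays, the addition and the cond_add name are illustrative; the real code emits the target's conditional internal function): active lanes perform the operation, inactive lanes simply pass the reduction input through unchanged.

#include <cstdio>

/* Illustrative toy only, not GCC code.  */
static void
cond_add (const bool *mask, const int *reduc_in, const int *other,
	  int *out, int n)
{
  for (int i = 0; i < n; ++i)
    out[i] = mask[i] ? reduc_in[i] + other[i]   /* active lane: do the op */
		     : reduc_in[i];             /* inactive lane: keep chain value */
}

int
main ()
{
  bool mask[4] = { true, true, false, false };  /* e.g. lanes past the trip count */
  int reduc_in[4] = { 5, 5, 5, 5 };
  int other[4] = { 1, 2, 3, 4 };
  int out[4];
  cond_add (mask, reduc_in, other, out, 4);
  for (int i = 0; i < 4; ++i)
    std::printf ("%d ", out[i]);                /* prints 6 7 5 5 */
  std::printf ("\n");
  return 0;
}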
*/ ++ if (loop_vinfo ++ && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) ++ && reduc_idx >= 0) ++ { ++ if (cond_fn == IFN_LAST ++ || !direct_internal_fn_supported_p (cond_fn, vectype, ++ OPTIMIZE_FOR_SPEED)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "can't use a fully-masked loop because no" ++ " conditional operation is available.\n"); ++ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; ++ } ++ else ++ vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, ++ vectype, NULL); ++ } ++ + STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; + DUMP_VECT_SCOPE ("vectorizable_operation"); + vect_model_simple_cost (stmt_info, ncopies, dt, ndts, slp_node, cost_vec); +@@ -6100,6 +6166,8 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + dump_printf_loc (MSG_NOTE, vect_location, + "transform binary/unary operation.\n"); + ++ bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); ++ + /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as + vectors with unsigned elements, but the result is signed. So, we + need to compute the MINUS_EXPR into vectype temporary and +@@ -6180,12 +6248,8 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + { + if (slp_node) + { +- auto_vec ops(3); +- ops.quick_push (op0); +- ops.quick_push (op1); +- ops.quick_push (op2); + auto_vec > vec_defs(3); +- vect_get_slp_defs (ops, slp_node, &vec_defs); ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds0 = vec_defs[0]; + vec_oprnds1 = vec_defs[1]; + vec_oprnds2 = vec_defs[2]; +@@ -6221,22 +6285,41 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + ? vec_oprnds1[i] : NULL_TREE); + vop2 = ((op_type == ternary_op) + ? vec_oprnds2[i] : NULL_TREE); +- gassign *new_stmt = gimple_build_assign (vec_dest, code, +- vop0, vop1, vop2); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_assign_set_lhs (new_stmt, new_temp); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); +- if (vec_cvt_dest) ++ if (masked_loop_p && reduc_idx >= 0) + { +- new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp); +- gassign *new_stmt +- = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR, +- new_temp); +- new_temp = make_ssa_name (vec_cvt_dest, new_stmt); ++ /* Perform the operation on active elements only and take ++ inactive elements from the reduction chain input. */ ++ gcc_assert (!vop2); ++ vop2 = reduc_idx == 1 ? 
vop1 : vop0; ++ tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, ++ vectype, i * ncopies + j); ++ gcall *call = gimple_build_call_internal (cond_fn, 4, mask, ++ vop0, vop1, vop2); ++ new_temp = make_ssa_name (vec_dest, call); ++ gimple_call_set_lhs (call, new_temp); ++ gimple_call_set_nothrow (call, true); ++ new_stmt_info ++ = vect_finish_stmt_generation (stmt_info, call, gsi); ++ } ++ else ++ { ++ gassign *new_stmt = gimple_build_assign (vec_dest, code, ++ vop0, vop1, vop2); ++ new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ if (vec_cvt_dest) ++ { ++ new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp); ++ gassign *new_stmt ++ = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR, ++ new_temp); ++ new_temp = make_ssa_name (vec_cvt_dest, new_stmt); ++ gimple_assign_set_lhs (new_stmt, new_temp); ++ new_stmt_info ++ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ } + } + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); +@@ -6517,7 +6600,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (loop_vinfo + && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) + check_load_store_masking (loop_vinfo, vectype, vls_type, group_size, +- memory_access_type, &gs_info); ++ memory_access_type, &gs_info, mask); + + STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; + vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type, +@@ -6580,8 +6663,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + ncopies *= 2; + + if (mask) +- mask_halfvectype +- = build_same_sized_truth_vector_type (gs_info.offset_vectype); ++ mask_halfvectype = truth_type_for (gs_info.offset_vectype); + } + else + gcc_unreachable (); +@@ -6840,9 +6922,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + of vector elts directly. */ + scalar_mode elmode = SCALAR_TYPE_MODE (elem_type); + machine_mode vmode; +- if (!mode_for_vector (elmode, group_size).exists (&vmode) +- || !VECTOR_MODE_P (vmode) +- || !targetm.vector_mode_supported_p (vmode) ++ if (!related_vector_mode (TYPE_MODE (vectype), elmode, ++ group_size).exists (&vmode) + || (convert_optab_handler (vec_extract_optab, + TYPE_MODE (vectype), vmode) + == CODE_FOR_nothing)) +@@ -6859,9 +6940,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + element extracts from the original vector type and + element size stores. 
*/ + if (int_mode_for_size (lsize, 0).exists (&elmode) +- && mode_for_vector (elmode, lnunits).exists (&vmode) +- && VECTOR_MODE_P (vmode) +- && targetm.vector_mode_supported_p (vmode) ++ && related_vector_mode (TYPE_MODE (vectype), elmode, ++ lnunits).exists (&vmode) + && (convert_optab_handler (vec_extract_optab, + vmode, elmode) + != CODE_FOR_nothing)) +@@ -7624,14 +7704,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (!scalar_dest) + return false; + +- if (slp_node != NULL) +- { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "SLP of masked loads not supported.\n"); +- return false; +- } +- + int mask_index = internal_fn_mask_index (ifn); + if (mask_index >= 0) + { +@@ -7714,6 +7786,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + group_size = DR_GROUP_SIZE (first_stmt_info); + ++ /* Refuse non-SLP vectorization of SLP-only groups. */ ++ if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "cannot vectorize load in non-SLP mode.\n"); ++ return false; ++ } ++ + if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) + slp_perm = true; + +@@ -7767,7 +7848,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (loop_vinfo + && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) + check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size, +- memory_access_type, &gs_info); ++ memory_access_type, &gs_info, mask); + + STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; + vect_model_load_cost (stmt_info, ncopies, memory_access_type, +@@ -7947,9 +8028,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vector elts directly. */ + scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); + machine_mode vmode; +- if (mode_for_vector (elmode, group_size).exists (&vmode) +- && VECTOR_MODE_P (vmode) +- && targetm.vector_mode_supported_p (vmode) ++ if (related_vector_mode (TYPE_MODE (vectype), elmode, ++ group_size).exists (&vmode) + && (convert_optab_handler (vec_init_optab, + TYPE_MODE (vectype), vmode) + != CODE_FOR_nothing)) +@@ -7973,9 +8053,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + /* If we can't construct such a vector fall back to + element loads of the original vector type. */ + if (int_mode_for_size (lsize, 0).exists (&elmode) +- && mode_for_vector (elmode, lnunits).exists (&vmode) +- && VECTOR_MODE_P (vmode) +- && targetm.vector_mode_supported_p (vmode) ++ && related_vector_mode (TYPE_MODE (vectype), elmode, ++ lnunits).exists (&vmode) + && (convert_optab_handler (vec_init_optab, vmode, elmode) + != CODE_FOR_nothing)) + { +@@ -8413,8 +8492,17 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + simd_lane_access_p, + byte_offset, bump); + if (mask) +- vec_mask = vect_get_vec_def_for_operand (mask, stmt_info, +- mask_vectype); ++ { ++ if (slp_node) ++ { ++ auto_vec > vec_defs (1); ++ vect_get_slp_defs (slp_node, &vec_defs); ++ vec_mask = vec_defs[0][0]; ++ } ++ else ++ vec_mask = vect_get_vec_def_for_operand (mask, stmt_info, ++ mask_vectype); ++ } + } + else + { +@@ -8564,8 +8652,25 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + else + { ++ tree ltype = vectype; ++ /* If there's no peeling for gaps but we have a gap ++ with slp loads then load the lower half of the ++ vector only. 
See get_group_load_store_type for ++ when we apply this optimization. */ ++ if (slp ++ && loop_vinfo ++ && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ++ && DR_GROUP_GAP (first_stmt_info) != 0 ++ && known_eq (nunits, ++ (group_size ++ - DR_GROUP_GAP (first_stmt_info)) * 2) ++ && known_eq (nunits, group_size)) ++ ltype = build_vector_type (TREE_TYPE (vectype), ++ (group_size ++ - DR_GROUP_GAP ++ (first_stmt_info))); + data_ref +- = fold_build2 (MEM_REF, vectype, dataref_ptr, ++ = fold_build2 (MEM_REF, ltype, dataref_ptr, + dataref_offset + ? dataref_offset + : build_int_cst (ref_type, 0)); +@@ -8579,6 +8684,23 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + TREE_TYPE (data_ref) + = build_aligned_type (TREE_TYPE (data_ref), + TYPE_ALIGN (elem_type)); ++ if (ltype != vectype) ++ { ++ vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); ++ tree tem = make_ssa_name (ltype); ++ new_stmt = gimple_build_assign (tem, data_ref); ++ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ data_ref = NULL; ++ vec *v; ++ vec_alloc (v, 2); ++ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); ++ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, ++ build_zero_cst (ltype)); ++ new_stmt ++ = gimple_build_assign (vec_dest, ++ build_constructor ++ (vectype, v)); ++ } + } + break; + } +@@ -8864,7 +8986,7 @@ vect_is_simple_cond (tree cond, vec_info *vinfo, + scalar_type = build_nonstandard_integer_type + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), + TYPE_UNSIGNED (scalar_type)); +- *comp_vectype = get_vectype_for_scalar_type (scalar_type); ++ *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + } + + return true; +@@ -8881,9 +9003,9 @@ vect_is_simple_cond (tree cond, vec_info *vinfo, + + Return true if STMT_INFO is vectorizable in this way. */ + +-bool ++static bool + vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, +- stmt_vec_info *vec_stmt, bool for_reduction, ++ stmt_vec_info *vec_stmt, + slp_tree slp_node, stmt_vector_for_cost *cost_vec) + { + vec_info *vinfo = stmt_info->vinfo; +@@ -8913,22 +9035,39 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + tree vec_cmp_type; + bool masked = false; + +- if (for_reduction && STMT_SLP_TYPE (stmt_info)) ++ if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) ++ return false; ++ ++ /* Is vectorizable conditional operation? */ ++ gassign *stmt = dyn_cast (stmt_info->stmt); ++ if (!stmt) ++ return false; ++ ++ code = gimple_assign_rhs_code (stmt); ++ if (code != COND_EXPR) + return false; + +- vect_reduction_type reduction_type +- = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); +- if (reduction_type == TREE_CODE_REDUCTION) ++ stmt_vec_info reduc_info = NULL; ++ int reduc_index = -1; ++ vect_reduction_type reduction_type = TREE_CODE_REDUCTION; ++ bool for_reduction ++ = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL; ++ if (for_reduction) + { +- if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) ++ if (STMT_SLP_TYPE (stmt_info)) + return false; +- +- if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def +- && !(STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle +- && for_reduction)) ++ reduc_info = info_for_reduction (stmt_info); ++ reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); ++ reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); ++ gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION ++ || reduc_index != -1); ++ } ++ else ++ { ++ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def) + return false; + +- /* FORNOW: not yet supported. 
*/ ++ /* FORNOW: only supported as part of a reduction. */ + if (STMT_VINFO_LIVE_P (stmt_info)) + { + if (dump_enabled_p ()) +@@ -8938,16 +9077,6 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + +- /* Is vectorizable conditional operation? */ +- gassign *stmt = dyn_cast (stmt_info->stmt); +- if (!stmt) +- return false; +- +- code = gimple_assign_rhs_code (stmt); +- +- if (code != COND_EXPR) +- return false; +- + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; + +@@ -8981,7 +9110,7 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + return false; + + masked = !COMPARISON_CLASS_P (cond_expr); +- vec_cmp_type = build_same_sized_truth_vector_type (comp_vectype); ++ vec_cmp_type = truth_type_for (comp_vectype); + + if (vec_cmp_type == NULL_TREE) + return false; +@@ -8993,6 +9122,29 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + cond_expr1 = TREE_OPERAND (cond_expr, 1); + } + ++ /* For conditional reductions, the "then" value needs to be the candidate ++ value calculated by this iteration while the "else" value needs to be ++ the result carried over from previous iterations. If the COND_EXPR ++ is the other way around, we need to swap it. */ ++ bool must_invert_cmp_result = false; ++ if (reduction_type == EXTRACT_LAST_REDUCTION && reduc_index == 1) ++ { ++ if (masked) ++ must_invert_cmp_result = true; ++ else ++ { ++ bool honor_nans = HONOR_NANS (TREE_TYPE (cond_expr0)); ++ tree_code new_code = invert_tree_comparison (cond_code, honor_nans); ++ if (new_code == ERROR_MARK) ++ must_invert_cmp_result = true; ++ else ++ cond_code = new_code; ++ } ++ /* Make sure we don't accidentally use the old condition. */ ++ cond_expr = NULL_TREE; ++ std::swap (then_clause, else_clause); ++ } ++ + if (!masked && VECTOR_BOOLEAN_TYPE_P (comp_vectype)) + { + /* Boolean values may have another representation in vectors +@@ -9053,6 +9205,16 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + return false; + } + } ++ if (loop_vinfo ++ && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) ++ && reduction_type == EXTRACT_LAST_REDUCTION) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "can't yet use a fully-masked loop for" ++ " EXTRACT_LAST_REDUCTION.\n"); ++ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; ++ } + if (expand_vec_cond_expr_p (vectype, comp_vectype, + cond_code)) + { +@@ -9082,24 +9244,42 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + /* Handle cond expr. */ + for (j = 0; j < ncopies; j++) + { ++ tree loop_mask = NULL_TREE; ++ bool swap_cond_operands = false; ++ ++ /* See whether another part of the vectorized code applies a loop ++ mask to the condition, or to its inverse. 
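[Editorial illustration, not part of the patch] The arm swap a few hunks above needs the logical inverse of the comparison, and once NaNs must be honoured the textually inverted comparison (e.g. >= for <) is not that inverse, which is why the code falls back to inverting the mask with BIT_NOT_EXPR. A minimal standalone C check of that corner case, with made-up values:

#include <assert.h>
#include <math.h>
#include <stdio.h>

int main (void)
{
  double a = NAN, b = 1.0;
  double then_v = 2.0, else_v = 3.0;

  /* Original form:  a < b ? then_v : else_v  */
  double orig = (a < b) ? then_v : else_v;

  /* Swapping the arms requires the logical inverse of the condition...  */
  double swapped_ok  = !(a < b) ? else_v : then_v;
  /* ...but ">=" is NOT that inverse once NaNs are involved.  */
  double swapped_bad = (a >= b) ? else_v : then_v;

  assert (orig == swapped_ok);
  assert (orig != swapped_bad);   /* demonstrates the NaN pitfall */
  printf ("orig=%g swapped_ok=%g swapped_bad=%g\n",
          orig, swapped_ok, swapped_bad);
  return 0;
}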
*/ ++ ++ if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) ++ { ++ scalar_cond_masked_key cond (cond_expr, ncopies); ++ if (loop_vinfo->scalar_cond_masked_set.contains (cond)) ++ { ++ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); ++ loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j); ++ } ++ else ++ { ++ bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0)); ++ cond.code = invert_tree_comparison (cond.code, honor_nans); ++ if (loop_vinfo->scalar_cond_masked_set.contains (cond)) ++ { ++ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); ++ loop_mask = vect_get_loop_mask (gsi, masks, ncopies, ++ vectype, j); ++ cond_code = cond.code; ++ swap_cond_operands = true; ++ } ++ } ++ } ++ + stmt_vec_info new_stmt_info = NULL; + if (j == 0) + { + if (slp_node) + { +- auto_vec ops; + auto_vec, 4> vec_defs; +- +- if (masked) +- ops.safe_push (cond_expr); +- else +- { +- ops.safe_push (cond_expr0); +- ops.safe_push (cond_expr1); +- } +- ops.safe_push (then_clause); +- ops.safe_push (else_clause); +- vect_get_slp_defs (ops, slp_node, &vec_defs); ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds3 = vec_defs.pop (); + vec_oprnds2 = vec_defs.pop (); + if (!masked) +@@ -9159,6 +9339,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vec_then_clause = vec_oprnds2[i]; + vec_else_clause = vec_oprnds3[i]; + ++ if (swap_cond_operands) ++ std::swap (vec_then_clause, vec_else_clause); ++ + if (masked) + vec_compare = vec_cond_lhs; + else +@@ -9197,6 +9380,50 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + } ++ ++ /* If we decided to apply a loop mask to the result of the vector ++ comparison, AND the comparison with the mask now. Later passes ++ should then be able to reuse the AND results between mulitple ++ vector statements. ++ ++ For example: ++ for (int i = 0; i < 100; ++i) ++ x[i] = y[i] ? z[i] : 10; ++ ++ results in following optimized GIMPLE: ++ ++ mask__35.8_43 = vect__4.7_41 != { 0, ... }; ++ vec_mask_and_46 = loop_mask_40 & mask__35.8_43; ++ _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B]; ++ vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46); ++ vect_iftmp.12_52 = VEC_COND_EXPR ; ++ ++ instead of using a masked and unmasked forms of ++ vec != { 0, ... } (masked in the MASK_LOAD, ++ unmasked in the VEC_COND_EXPR). 
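[Editorial illustration, not part of the patch] A standalone sketch of why ANDing the loop mask into the comparison is safe (scalar emulation with a made-up lane count): lanes disabled by the loop mask never contribute to the final result, and active lanes see the same condition either way, which is what lets later passes share one vec_mask_and_NN between the MASK_LOAD and the VEC_COND_EXPR as the dump fragment in the comment shows.

#include <assert.h>
#include <stdio.h>

#define LANES 4

int main (void)
{
  int y[LANES] = { 5, 0, 7, 0 };
  int z[LANES] = { 1, 2, 3, 4 };
  /* Loop mask: only the first three lanes correspond to real iterations.  */
  int loop_mask[LANES] = { 1, 1, 1, 0 };

  for (int i = 0; i < LANES; i++)
    {
      int cond        = (y[i] != 0);
      int cond_masked = cond & loop_mask[i];

      int unmasked_sel = cond        ? z[i] : 10;
      int masked_sel   = cond_masked ? z[i] : 10;

      /* Active lanes are unaffected by the extra AND.  */
      if (loop_mask[i])
        assert (unmasked_sel == masked_sel);
      printf ("lane %d: unmasked=%d masked=%d\n", i, unmasked_sel, masked_sel);
    }
  return 0;
}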
*/ ++ ++ if (loop_mask) ++ { ++ if (COMPARISON_CLASS_P (vec_compare)) ++ { ++ tree tmp = make_ssa_name (vec_cmp_type); ++ tree op0 = TREE_OPERAND (vec_compare, 0); ++ tree op1 = TREE_OPERAND (vec_compare, 1); ++ gassign *g = gimple_build_assign (tmp, ++ TREE_CODE (vec_compare), ++ op0, op1); ++ vect_finish_stmt_generation (stmt_info, g, gsi); ++ vec_compare = tmp; ++ } ++ ++ tree tmp2 = make_ssa_name (vec_cmp_type); ++ gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, ++ vec_compare, loop_mask); ++ vect_finish_stmt_generation (stmt_info, g, gsi); ++ vec_compare = tmp2; ++ } ++ + if (reduction_type == EXTRACT_LAST_REDUCTION) + { + if (!is_gimple_val (vec_compare)) +@@ -9207,6 +9434,15 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + vec_compare = vec_compare_name; + } ++ if (must_invert_cmp_result) ++ { ++ tree vec_compare_name = make_ssa_name (vec_cmp_type); ++ gassign *new_stmt = gimple_build_assign (vec_compare_name, ++ BIT_NOT_EXPR, ++ vec_compare); ++ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); ++ vec_compare = vec_compare_name; ++ } + gcall *new_stmt = gimple_build_call_internal + (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare, + vec_then_clause); +@@ -9345,7 +9581,7 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + /* Invariant comparison. */ + if (!vectype) + { +- vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); + if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits)) + return false; + } +@@ -9446,12 +9682,8 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + { + if (slp_node) + { +- auto_vec ops; + auto_vec, 2> vec_defs; +- +- ops.safe_push (rhs1); +- ops.safe_push (rhs2); +- vect_get_slp_defs (ops, slp_node, &vec_defs); ++ vect_get_slp_defs (slp_node, &vec_defs); + vec_oprnds1 = vec_defs.pop (); + vec_oprnds0 = vec_defs.pop (); + if (swap_p) +@@ -9544,7 +9776,8 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + + static bool + can_vectorize_live_stmts (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, +- slp_tree slp_node, stmt_vec_info *vec_stmt, ++ slp_tree slp_node, slp_instance slp_node_instance, ++ stmt_vec_info *vec_stmt, + stmt_vector_for_cost *cost_vec) + { + if (slp_node) +@@ -9554,13 +9787,15 @@ can_vectorize_live_stmts (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info) + { + if (STMT_VINFO_LIVE_P (slp_stmt_info) +- && !vectorizable_live_operation (slp_stmt_info, gsi, slp_node, i, ++ && !vectorizable_live_operation (slp_stmt_info, gsi, slp_node, ++ slp_node_instance, i, + vec_stmt, cost_vec)) + return false; + } + } + else if (STMT_VINFO_LIVE_P (stmt_info) +- && !vectorizable_live_operation (stmt_info, gsi, slp_node, -1, ++ && !vectorizable_live_operation (stmt_info, gsi, slp_node, ++ slp_node_instance, -1, + vec_stmt, cost_vec)) + return false; + +@@ -9736,14 +9971,13 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, + || vectorizable_load (stmt_info, NULL, NULL, node, node_instance, + cost_vec) + || vectorizable_store (stmt_info, NULL, NULL, node, cost_vec) +- || vectorizable_reduction (stmt_info, NULL, NULL, node, +- node_instance, cost_vec) ++ || vectorizable_reduction (stmt_info, node, node_instance, cost_vec) + || vectorizable_induction (stmt_info, NULL, NULL, node, cost_vec) + || 
vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec) +- || vectorizable_condition (stmt_info, NULL, NULL, false, node, +- cost_vec) ++ || vectorizable_condition (stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_comparison (stmt_info, NULL, NULL, node, +- cost_vec)); ++ cost_vec) ++ || vectorizable_lc_phi (stmt_info, NULL, node)); + else + { + if (bb_vinfo) +@@ -9759,8 +9993,7 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, + || vectorizable_load (stmt_info, NULL, NULL, node, node_instance, + cost_vec) + || vectorizable_store (stmt_info, NULL, NULL, node, cost_vec) +- || vectorizable_condition (stmt_info, NULL, NULL, false, node, +- cost_vec) ++ || vectorizable_condition (stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_comparison (stmt_info, NULL, NULL, node, + cost_vec)); + } +@@ -9775,7 +10008,9 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, + need extra handling, except for vectorizable reductions. */ + if (!bb_vinfo + && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type +- && !can_vectorize_live_stmts (stmt_info, NULL, node, NULL, cost_vec)) ++ && STMT_VINFO_TYPE (stmt_info) != lc_phi_info_type ++ && !can_vectorize_live_stmts (stmt_info, NULL, node, node_instance, ++ NULL, cost_vec)) + return opt_result::failure_at (stmt_info->stmt, + "not vectorized:" + " live stmt not supported: %G", +@@ -9864,8 +10099,7 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + break; + + case condition_vec_info_type: +- done = vectorizable_condition (stmt_info, gsi, &vec_stmt, false, +- slp_node, NULL); ++ done = vectorizable_condition (stmt_info, gsi, &vec_stmt, slp_node, NULL); + gcc_assert (done); + break; + +@@ -9887,8 +10121,18 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + break; + + case reduc_vec_info_type: +- done = vectorizable_reduction (stmt_info, gsi, &vec_stmt, slp_node, +- slp_node_instance, NULL); ++ done = vect_transform_reduction (stmt_info, gsi, &vec_stmt, slp_node); ++ gcc_assert (done); ++ break; ++ ++ case cycle_phi_info_type: ++ done = vect_transform_cycle_phi (stmt_info, &vec_stmt, slp_node, ++ slp_node_instance); ++ gcc_assert (done); ++ break; ++ ++ case lc_phi_info_type: ++ done = vectorizable_lc_phi (stmt_info, &vec_stmt, slp_node); + gcc_assert (done); + break; + +@@ -9944,19 +10188,66 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + } + } + +- /* Handle stmts whose DEF is used outside the loop-nest that is +- being vectorized. */ +- if (STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type) +- { +- done = can_vectorize_live_stmts (stmt_info, gsi, slp_node, &vec_stmt, +- NULL); +- gcc_assert (done); +- } +- + if (vec_stmt) + STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; + +- return is_store; ++ if (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) ++ return is_store; ++ ++ /* If this stmt defines a value used on a backedge, update the ++ vectorized PHIs. 
*/ ++ stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); ++ stmt_vec_info reduc_info; ++ if (STMT_VINFO_REDUC_DEF (orig_stmt_info) ++ && vect_stmt_to_vectorize (orig_stmt_info) == stmt_info ++ && (reduc_info = info_for_reduction (orig_stmt_info)) ++ && STMT_VINFO_REDUC_TYPE (reduc_info) != FOLD_LEFT_REDUCTION ++ && STMT_VINFO_REDUC_TYPE (reduc_info) != EXTRACT_LAST_REDUCTION) ++ { ++ gphi *phi; ++ if (!slp_node ++ && (phi = dyn_cast ++ (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt)) ++ && dominated_by_p (CDI_DOMINATORS, ++ gimple_bb (orig_stmt_info->stmt), gimple_bb (phi))) ++ { ++ edge e = loop_latch_edge (gimple_bb (phi)->loop_father); ++ stmt_vec_info phi_info ++ = STMT_VINFO_VEC_STMT (STMT_VINFO_REDUC_DEF (orig_stmt_info)); ++ stmt_vec_info vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); ++ do ++ { ++ add_phi_arg (as_a (phi_info->stmt), ++ gimple_get_lhs (vec_stmt->stmt), e, ++ gimple_phi_arg_location (phi, e->dest_idx)); ++ phi_info = STMT_VINFO_RELATED_STMT (phi_info); ++ vec_stmt = STMT_VINFO_RELATED_STMT (vec_stmt); ++ } ++ while (phi_info); ++ gcc_assert (!vec_stmt); ++ } ++ else if (slp_node ++ && slp_node != slp_node_instance->reduc_phis) ++ { ++ slp_tree phi_node = slp_node_instance->reduc_phis; ++ gphi *phi = as_a (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); ++ edge e = loop_latch_edge (gimple_bb (phi)->loop_father); ++ gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () ++ == SLP_TREE_VEC_STMTS (slp_node).length ()); ++ for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i) ++ add_phi_arg (as_a (SLP_TREE_VEC_STMTS (phi_node)[i]->stmt), ++ gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt), ++ e, gimple_phi_arg_location (phi, e->dest_idx)); ++ } ++ } ++ ++ /* Handle stmts whose DEF is used outside the loop-nest that is ++ being vectorized. */ ++ done = can_vectorize_live_stmts (stmt_info, gsi, slp_node, ++ slp_node_instance, &vec_stmt, NULL); ++ gcc_assert (done); ++ ++ return false; + } + + +@@ -9979,18 +10270,28 @@ vect_remove_stores (stmt_vec_info first_stmt_info) + } + } + +-/* Function get_vectype_for_scalar_type_and_size. ++/* If NUNITS is nonzero, return a vector type that contains NUNITS ++ elements of type SCALAR_TYPE, or null if the target doesn't support ++ such a type. + +- Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported +- by the target. */ ++ If NUNITS is zero, return a vector type that contains elements of ++ type SCALAR_TYPE, choosing whichever vector size the target prefers. ++ ++ If PREVAILING_MODE is VOIDmode, we have not yet chosen a vector mode ++ for this vectorization region and want to "autodetect" the best choice. ++ Otherwise, PREVAILING_MODE is a previously-chosen vector TYPE_MODE ++ and we want the new type to be interoperable with it. PREVAILING_MODE ++ in this case can be a scalar integer mode or a vector mode; when it ++ is a vector mode, the function acts like a tree-level version of ++ related_vector_mode. 
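[Editorial illustration, not part of the patch] A rough GNU C analogy for the related-mode idea described in the comment above, using the vector_size extension; the type names and the 16-byte prevailing size are assumptions for the example. On a typical fixed-width SIMD target, leaving NUNITS at zero keeps the prevailing byte size, while an explicit NUNITS keeps the lane count instead.

/* Compile with GCC; relies on the vector_size extension.  */
#include <stdio.h>

typedef int   v4si __attribute__ ((vector_size (16))); /* prevailing: 16 bytes  */
typedef short v8hi __attribute__ ((vector_size (16))); /* same size, more lanes */
typedef short v4hi __attribute__ ((vector_size (8)));  /* same lanes, half size */

int main (void)
{
  /* NUNITS == 0: keep the prevailing 16-byte size, so short gets 8 lanes.  */
  printf ("v8hi: %zu bytes, %zu lanes\n",
          sizeof (v8hi), sizeof (v8hi) / sizeof (short));
  /* Explicit NUNITS == 4: keep the lane count, giving an 8-byte vector.  */
  printf ("v4hi: %zu bytes, %zu lanes\n",
          sizeof (v4hi), sizeof (v4hi) / sizeof (short));
  printf ("prevailing v4si: %zu bytes\n", sizeof (v4si));
  return 0;
}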
*/ + + tree +-get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) ++get_related_vectype_for_scalar_type (machine_mode prevailing_mode, ++ tree scalar_type, poly_uint64 nunits) + { + tree orig_scalar_type = scalar_type; + scalar_mode inner_mode; + machine_mode simd_mode; +- poly_uint64 nunits; + tree vectype; + + if (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode) +@@ -10030,19 +10331,45 @@ get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) + if (scalar_type == NULL_TREE) + return NULL_TREE; + +- /* If no size was supplied use the mode the target prefers. Otherwise +- lookup a vector mode of the specified size. */ +- if (known_eq (size, 0U)) +- simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode); +- else if (!multiple_p (size, nbytes, &nunits) +- || !mode_for_vector (inner_mode, nunits).exists (&simd_mode)) +- return NULL_TREE; +- /* NOTE: nunits == 1 is allowed to support single element vector types. */ +- if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits)) +- return NULL_TREE; ++ /* If no prevailing mode was supplied, use the mode the target prefers. ++ Otherwise lookup a vector mode based on the prevailing mode. */ ++ if (prevailing_mode == VOIDmode) ++ { ++ gcc_assert (known_eq (nunits, 0U)); ++ simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode); ++ if (SCALAR_INT_MODE_P (simd_mode)) ++ { ++ /* Traditional behavior is not to take the integer mode ++ literally, but simply to use it as a way of determining ++ the vector size. It is up to mode_for_vector to decide ++ what the TYPE_MODE should be. ++ ++ Note that nunits == 1 is allowed in order to support single ++ element vector types. */ ++ if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits) ++ || !mode_for_vector (inner_mode, nunits).exists (&simd_mode)) ++ return NULL_TREE; ++ } ++ } ++ else if (SCALAR_INT_MODE_P (prevailing_mode) ++ || !related_vector_mode (prevailing_mode, ++ inner_mode, nunits).exists (&simd_mode)) ++ { ++ /* Fall back to using mode_for_vector, mostly in the hope of being ++ able to use an integer mode. */ ++ if (known_eq (nunits, 0U) ++ && !multiple_p (GET_MODE_SIZE (prevailing_mode), nbytes, &nunits)) ++ return NULL_TREE; + +- vectype = build_vector_type (scalar_type, nunits); ++ if (!mode_for_vector (inner_mode, nunits).exists (&simd_mode)) ++ return NULL_TREE; ++ } ++ ++ vectype = build_vector_type_for_mode (scalar_type, simd_mode); + ++ /* In cases where the mode was chosen by mode_for_vector, check that ++ the target actually supports the chosen mode, or that it at least ++ allows the vector mode to be replaced by a like-sized integer. */ + if (!VECTOR_MODE_P (TYPE_MODE (vectype)) + && !INTEGRAL_MODE_P (TYPE_MODE (vectype))) + return NULL_TREE; +@@ -10056,22 +10383,22 @@ get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) + return vectype; + } + +-poly_uint64 current_vector_size; +- + /* Function get_vectype_for_scalar_type. + + Returns the vector type corresponding to SCALAR_TYPE as supported + by the target. 
*/ + + tree +-get_vectype_for_scalar_type (tree scalar_type) ++get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type) + { +- tree vectype; +- vectype = get_vectype_for_scalar_type_and_size (scalar_type, +- current_vector_size); +- if (vectype +- && known_eq (current_vector_size, 0U)) +- current_vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); ++ tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode, ++ scalar_type); ++ if (vectype && vinfo->vector_mode == VOIDmode) ++ vinfo->vector_mode = TYPE_MODE (vectype); ++ ++ if (vectype) ++ vinfo->used_vector_modes.add (TYPE_MODE (vectype)); ++ + return vectype; + } + +@@ -10081,15 +10408,14 @@ get_vectype_for_scalar_type (tree scalar_type) + of vectors of specified SCALAR_TYPE as supported by target. */ + + tree +-get_mask_type_for_scalar_type (tree scalar_type) ++get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type) + { +- tree vectype = get_vectype_for_scalar_type (scalar_type); ++ tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + + if (!vectype) + return NULL; + +- return build_truth_vector_type (TYPE_VECTOR_SUBPARTS (vectype), +- current_vector_size); ++ return truth_type_for (vectype); + } + + /* Function get_same_sized_vectype +@@ -10101,10 +10427,29 @@ tree + get_same_sized_vectype (tree scalar_type, tree vector_type) + { + if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)) +- return build_same_sized_truth_vector_type (vector_type); ++ return truth_type_for (vector_type); ++ ++ poly_uint64 nunits; ++ if (!multiple_p (GET_MODE_SIZE (TYPE_MODE (vector_type)), ++ GET_MODE_SIZE (TYPE_MODE (scalar_type)), &nunits)) ++ return NULL_TREE; ++ ++ return get_related_vectype_for_scalar_type (TYPE_MODE (vector_type), ++ scalar_type, nunits); ++} ++ ++/* Return true if replacing LOOP_VINFO->vector_mode with VECTOR_MODE ++ would not change the chosen vector modes. */ + +- return get_vectype_for_scalar_type_and_size +- (scalar_type, GET_MODE_SIZE (TYPE_MODE (vector_type))); ++bool ++vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode) ++{ ++ for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin (); ++ i != vinfo->used_vector_modes.end (); ++i) ++ if (!VECTOR_MODE_P (*i) ++ || related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i) ++ return false; ++ return true; + } + + /* Function vect_is_simple_use. 
+@@ -10492,11 +10837,8 @@ supportable_widening_operation (enum tree_code code, stmt_vec_info stmt_info, + { + intermediate_mode = insn_data[icode1].operand[0].mode; + if (VECTOR_BOOLEAN_TYPE_P (prev_type)) +- { +- intermediate_type = vect_halve_mask_nunits (prev_type); +- if (intermediate_mode != TYPE_MODE (intermediate_type)) +- return false; +- } ++ intermediate_type ++ = vect_halve_mask_nunits (prev_type, intermediate_mode); + else + intermediate_type + = lang_hooks.types.type_for_mode (intermediate_mode, +@@ -10680,11 +11022,8 @@ supportable_narrowing_operation (enum tree_code code, + { + intermediate_mode = insn_data[icode1].operand[0].mode; + if (VECTOR_BOOLEAN_TYPE_P (prev_type)) +- { +- intermediate_type = vect_double_mask_nunits (prev_type); +- if (intermediate_mode != TYPE_MODE (intermediate_type)) +- return false; +- } ++ intermediate_type ++ = vect_double_mask_nunits (prev_type, intermediate_mode); + else + intermediate_type + = lang_hooks.types.type_for_mode (intermediate_mode, uns); +@@ -10777,6 +11116,7 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, + tree *stmt_vectype_out, + tree *nunits_vectype_out) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = stmt_info->stmt; + + *stmt_vectype_out = NULL_TREE; +@@ -10810,7 +11150,12 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, + tree vectype; + tree scalar_type = NULL_TREE; + if (STMT_VINFO_VECTYPE (stmt_info)) +- *stmt_vectype_out = vectype = STMT_VINFO_VECTYPE (stmt_info); ++ { ++ *stmt_vectype_out = vectype = STMT_VINFO_VECTYPE (stmt_info); ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "precomputed vectype: %T\n", vectype); ++ } + else + { + gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)); +@@ -10842,8 +11187,8 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +- "get vectype for scalar type: %T\n", scalar_type); +- vectype = get_vectype_for_scalar_type (scalar_type); ++ "get vectype for scalar type: %T\n", scalar_type); ++ vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + if (!vectype) + return opt_result::failure_at (stmt, + "not vectorized:" +@@ -10859,42 +11204,38 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, + + /* Don't try to compute scalar types if the stmt produces a boolean + vector; use the existing vector type instead. */ +- tree nunits_vectype; +- if (VECTOR_BOOLEAN_TYPE_P (vectype)) +- nunits_vectype = vectype; +- else ++ tree nunits_vectype = vectype; ++ if (!VECTOR_BOOLEAN_TYPE_P (vectype) ++ && *stmt_vectype_out != boolean_type_node) + { + /* The number of units is set according to the smallest scalar + type (or the largest vector size, but we only support one + vector size per vectorization). 
*/ +- if (*stmt_vectype_out != boolean_type_node) ++ HOST_WIDE_INT dummy; ++ scalar_type = vect_get_smallest_scalar_type (stmt_info, &dummy, &dummy); ++ if (scalar_type != TREE_TYPE (vectype)) + { +- HOST_WIDE_INT dummy; +- scalar_type = vect_get_smallest_scalar_type (stmt_info, +- &dummy, &dummy); ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "get vectype for smallest scalar type: %T\n", ++ scalar_type); ++ nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type); ++ if (!nunits_vectype) ++ return opt_result::failure_at ++ (stmt, "not vectorized: unsupported data-type %T\n", ++ scalar_type); ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n", ++ nunits_vectype); + } +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "get vectype for scalar type: %T\n", scalar_type); +- nunits_vectype = get_vectype_for_scalar_type (scalar_type); + } +- if (!nunits_vectype) +- return opt_result::failure_at (stmt, +- "not vectorized: unsupported data-type %T\n", +- scalar_type); + +- if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)), +- GET_MODE_SIZE (TYPE_MODE (nunits_vectype)))) +- return opt_result::failure_at (stmt, +- "not vectorized: different sized vector " +- "types in statement, %T and %T\n", +- vectype, nunits_vectype); ++ gcc_assert (*stmt_vectype_out == boolean_type_node ++ || multiple_p (TYPE_VECTOR_SUBPARTS (nunits_vectype), ++ TYPE_VECTOR_SUBPARTS (*stmt_vectype_out))); + + if (dump_enabled_p ()) + { +- dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", +- nunits_vectype); +- + dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); + dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (nunits_vectype)); + dump_printf (MSG_NOTE, "\n"); +@@ -10911,6 +11252,7 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, + opt_tree + vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = stmt_info->stmt; + tree mask_type = NULL; + tree vectype, scalar_type; +@@ -10920,7 +11262,7 @@ vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) + && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) + { + scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); +- mask_type = get_mask_type_for_scalar_type (scalar_type); ++ mask_type = get_mask_type_for_scalar_type (vinfo, scalar_type); + + if (!mask_type) + return opt_tree::failure_at (stmt, +@@ -10968,7 +11310,7 @@ vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) + && !VECTOR_BOOLEAN_TYPE_P (mask_type) + && gimple_code (stmt) == GIMPLE_ASSIGN + && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison) +- mask_type = build_same_sized_truth_vector_type (mask_type); ++ mask_type = truth_type_for (mask_type); + } + + /* No mask_type should mean loop invariant predicate. 
+diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c +index d89ec3b7c76..c2c6377d3f9 100644 +--- a/gcc/tree-vectorizer.c ++++ b/gcc/tree-vectorizer.c +@@ -639,8 +639,11 @@ vec_info::new_stmt_vec_info (gimple *stmt) + STMT_VINFO_TYPE (res) = undef_vec_info_type; + STMT_VINFO_RELEVANT (res) = vect_unused_in_scope; + STMT_VINFO_VECTORIZABLE (res) = true; +- STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION; +- STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK; ++ STMT_VINFO_REDUC_TYPE (res) = TREE_CODE_REDUCTION; ++ STMT_VINFO_REDUC_CODE (res) = ERROR_MARK; ++ STMT_VINFO_REDUC_FN (res) = IFN_LAST; ++ STMT_VINFO_REDUC_IDX (res) = -1; ++ STMT_VINFO_SLP_VECT_ONLY (res) = false; + + if (gimple_code (stmt) == GIMPLE_PHI + && is_loop_header_bb_p (gimple_bb (stmt))) +@@ -862,8 +865,7 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + + static unsigned + try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, +- unsigned *num_vectorized_loops, +- loop_p loop, loop_vec_info orig_loop_vinfo, ++ unsigned *num_vectorized_loops, loop_p loop, + gimple *loop_vectorized_call, + gimple *loop_dist_alias_call) + { +@@ -871,6 +873,7 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, + vec_info_shared shared; + auto_purge_vect_location sentinel; + vect_location = find_loop_location (loop); ++ + if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION + && dump_enabled_p ()) + dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS, +@@ -878,10 +881,17 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, + LOCATION_FILE (vect_location.get_location_t ()), + LOCATION_LINE (vect_location.get_location_t ())); + +- /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */ +- opt_loop_vec_info loop_vinfo +- = vect_analyze_loop (loop, orig_loop_vinfo, &shared); +- loop->aux = loop_vinfo; ++ opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL); ++ /* In the case of epilogue vectorization the loop already has its ++ loop_vec_info set, we do not require to analyze the loop in this case. */ ++ if (loop_vec_info vinfo = loop_vec_info_for_loop (loop)) ++ loop_vinfo = opt_loop_vec_info::success (vinfo); ++ else ++ { ++ /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */ ++ loop_vinfo = vect_analyze_loop (loop, &shared); ++ loop->aux = loop_vinfo; ++ } + + if (!loop_vinfo) + if (dump_enabled_p ()) +@@ -968,7 +978,7 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, + unsigned HOST_WIDE_INT bytes; + if (dump_enabled_p ()) + { +- if (current_vector_size.is_constant (&bytes)) ++ if (GET_MODE_SIZE (loop_vinfo->vector_mode).is_constant (&bytes)) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, + "loop vectorized using %wu byte vectors\n", bytes); + else +@@ -1009,8 +1019,13 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, + + /* Epilogue of vectorized loop must be vectorized too. */ + if (new_loop) +- ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, +- new_loop, loop_vinfo, NULL, NULL); ++ { ++ /* Don't include vectorized epilogues in the "vectorized loops" count. 
++ */ ++ unsigned dont_count = *num_vectorized_loops; ++ ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count, ++ new_loop, NULL, NULL); ++ } + + return ret; + } +@@ -1026,8 +1041,7 @@ try_vectorize_loop (hash_table *&simduid_to_vf_htab, + || loop->force_vectorize)) + return 0; + +- return try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, +- loop, NULL, ++ return try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, loop, + vect_loop_vectorized_call (loop), + vect_loop_dist_alias_call (loop)); + } +@@ -1344,7 +1358,8 @@ get_vec_alignment_for_array_type (tree type) + gcc_assert (TREE_CODE (type) == ARRAY_TYPE); + poly_uint64 array_size, vector_size; + +- tree vectype = get_vectype_for_scalar_type (strip_array_types (type)); ++ tree scalar_type = strip_array_types (type); ++ tree vectype = get_related_vectype_for_scalar_type (VOIDmode, scalar_type); + if (!vectype + || !poly_int_tree_p (TYPE_SIZE (type), &array_size) + || !poly_int_tree_p (TYPE_SIZE (vectype), &vector_size) +@@ -1512,3 +1527,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt) + { + return new pass_ipa_increase_alignment (ctxt); + } ++ ++/* If the condition represented by T is a comparison or the SSA name ++ result of a comparison, extract the comparison's operands. Represent ++ T as NE_EXPR otherwise. */ ++ ++void ++scalar_cond_masked_key::get_cond_ops_from_tree (tree t) ++{ ++ if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison) ++ { ++ this->code = TREE_CODE (t); ++ this->op0 = TREE_OPERAND (t, 0); ++ this->op1 = TREE_OPERAND (t, 1); ++ return; ++ } ++ ++ if (TREE_CODE (t) == SSA_NAME) ++ if (gassign *stmt = dyn_cast (SSA_NAME_DEF_STMT (t))) ++ { ++ tree_code code = gimple_assign_rhs_code (stmt); ++ if (TREE_CODE_CLASS (code) == tcc_comparison) ++ { ++ this->code = code; ++ this->op0 = gimple_assign_rhs1 (stmt); ++ this->op1 = gimple_assign_rhs2 (stmt); ++ return; ++ } ++ } ++ ++ this->code = NE_EXPR; ++ this->op0 = t; ++ this->op1 = build_zero_cst (TREE_TYPE (t)); ++} +diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +index 148b9a7f215..c46e2742c36 100644 +--- a/gcc/tree-vectorizer.h ++++ b/gcc/tree-vectorizer.h +@@ -26,6 +26,7 @@ typedef struct _stmt_vec_info *stmt_vec_info; + #include "tree-data-ref.h" + #include "tree-hash-traits.h" + #include "target.h" ++#include + + /* Used for naming of new temporaries. */ + enum vect_var_kind { +@@ -120,6 +121,8 @@ struct _slp_tree { + vec children; + /* A group of scalar stmts to be vectorized together. */ + vec stmts; ++ /* A group of scalar operands to be vectorized together. */ ++ vec ops; + /* Load permutation relative to the stores, NULL if there is no + permutation. */ + vec load_permutation; +@@ -170,13 +173,82 @@ typedef struct _slp_instance { + + #define SLP_TREE_CHILDREN(S) (S)->children + #define SLP_TREE_SCALAR_STMTS(S) (S)->stmts ++#define SLP_TREE_SCALAR_OPS(S) (S)->ops + #define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts + #define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size + #define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation + #define SLP_TREE_TWO_OPERATORS(S) (S)->two_operators + #define SLP_TREE_DEF_TYPE(S) (S)->def_type + ++/* Key for map that records association between ++ scalar conditions and corresponding loop mask, and ++ is populated by vect_record_loop_mask. 
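[Editorial illustration, not part of the patch] A toy sketch of the canonicalization that get_cond_ops_from_tree performs a few hunks above; the mini expression representation below is invented for illustration. Comparisons contribute their own code and operands, anything else is keyed as "t != 0".

#include <assert.h>
#include <stddef.h>

enum code { NE_EXPR, LT_EXPR, SSA_NAME_KIND };   /* toy subset of tree codes */

struct expr { enum code code; struct expr *op0, *op1; };
struct cond_key { enum code code; struct expr *op0, *op1; };

/* Comparisons keep their own operands; everything else becomes NE (t, 0),
   with the zero operand modelled here as a NULL placeholder.  */
static struct cond_key key_from (struct expr *t)
{
  struct cond_key k;
  if (t->code == LT_EXPR)        /* stand-in for "any comparison code" */
    { k.code = t->code; k.op0 = t->op0; k.op1 = t->op1; }
  else
    { k.code = NE_EXPR; k.op0 = t; k.op1 = NULL; }
  return k;
}

int main (void)
{
  struct expr a = { SSA_NAME_KIND, NULL, NULL };
  struct expr b = { SSA_NAME_KIND, NULL, NULL };
  struct expr cmp = { LT_EXPR, &a, &b };

  assert (key_from (&cmp).code == LT_EXPR && key_from (&cmp).op0 == &a);
  assert (key_from (&a).code == NE_EXPR && key_from (&a).op0 == &a);
  return 0;
}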
*/ + ++struct scalar_cond_masked_key ++{ ++ scalar_cond_masked_key (tree t, unsigned ncopies_) ++ : ncopies (ncopies_) ++ { ++ get_cond_ops_from_tree (t); ++ } ++ ++ void get_cond_ops_from_tree (tree); ++ ++ unsigned ncopies; ++ tree_code code; ++ tree op0; ++ tree op1; ++}; ++ ++template<> ++struct default_hash_traits ++{ ++ typedef scalar_cond_masked_key compare_type; ++ typedef scalar_cond_masked_key value_type; ++ ++ static inline hashval_t ++ hash (value_type v) ++ { ++ inchash::hash h; ++ h.add_int (v.code); ++ inchash::add_expr (v.op0, h, 0); ++ inchash::add_expr (v.op1, h, 0); ++ h.add_int (v.ncopies); ++ return h.end (); ++ } ++ ++ static inline bool ++ equal (value_type existing, value_type candidate) ++ { ++ return (existing.ncopies == candidate.ncopies ++ && existing.code == candidate.code ++ && operand_equal_p (existing.op0, candidate.op0, 0) ++ && operand_equal_p (existing.op1, candidate.op1, 0)); ++ } ++ ++ static inline void ++ mark_empty (value_type &v) ++ { ++ v.ncopies = 0; ++ } ++ ++ static inline bool ++ is_empty (value_type v) ++ { ++ return v.ncopies == 0; ++ } ++ ++ static inline void mark_deleted (value_type &) {} ++ ++ static inline bool is_deleted (const value_type &) ++ { ++ return false; ++ } ++ ++ static inline void remove (value_type &) {} ++}; ++ ++typedef hash_set scalar_cond_masked_set_type; + + /* Describes two objects whose addresses must be unequal for the vectorized + loop to be valid. */ +@@ -217,6 +289,7 @@ struct vec_info_shared { + + /* Vectorizer state common between loop and basic-block vectorization. */ + struct vec_info { ++ typedef hash_set > mode_set; + enum vec_kind { bb, loop }; + + vec_info (vec_kind, void *, vec_info_shared *); +@@ -254,6 +327,14 @@ struct vec_info { + /* Cost data used by the target cost model. */ + void *target_cost_data; + ++ /* The set of vector modes used in the vectorized region. */ ++ mode_set used_vector_modes; ++ ++ /* The argument we should pass to related_vector_mode when looking up ++ the vector mode for a scalar mode, or VOIDmode if we haven't yet ++ made any decisions about which vector modes to use. */ ++ machine_mode vector_mode; ++ + private: + stmt_vec_info new_stmt_vec_info (gimple *stmt); + void set_vinfo_for_stmt (gimple *, stmt_vec_info); +@@ -377,6 +458,8 @@ struct rgroup_masks { + + typedef auto_vec vec_loop_masks; + ++typedef auto_vec > drs_init_vec; ++ + /*-----------------------------------------------------------------*/ + /* Info on vectorized loops. */ + /*-----------------------------------------------------------------*/ +@@ -399,7 +482,7 @@ typedef struct _loop_vec_info : public vec_info { + /* Condition under which this loop is analyzed and versioned. */ + tree num_iters_assumptions; + +- /* Threshold of number of iterations below which vectorzation will not be ++ /* Threshold of number of iterations below which vectorization will not be + performed. It is calculated from MIN_PROFITABLE_ITERS and + PARAM_MIN_VECT_LOOP_BOUND. */ + unsigned int th; +@@ -421,6 +504,9 @@ typedef struct _loop_vec_info : public vec_info { + on inactive scalars. */ + vec_loop_masks masks; + ++ /* Set of scalar conditions that have loop mask applied. */ ++ scalar_cond_masked_set_type scalar_cond_masked_set; ++ + /* If we are using a loop mask to align memory addresses, this variable + contains the number of vector elements that we should skip in the + first iteration of the vector loop (i.e. 
the number of leading +@@ -497,6 +583,13 @@ typedef struct _loop_vec_info : public vec_info { + /* Cost of a single scalar iteration. */ + int single_scalar_iteration_cost; + ++ /* The cost of the vector prologue and epilogue, including peeled ++ iterations and set-up code. */ ++ int vec_outside_cost; ++ ++ /* The cost of the vector loop body. */ ++ int vec_inside_cost; ++ + /* Is the loop vectorizable? */ + bool vectorizable; + +@@ -551,6 +644,10 @@ typedef struct _loop_vec_info : public vec_info { + this points to the original vectorized loop. Otherwise NULL. */ + _loop_vec_info *orig_loop_info; + ++ /* Used to store loop_vec_infos of epilogues of this loop during ++ analysis. */ ++ vec<_loop_vec_info *> epilogue_vinfos; ++ + } *loop_vec_info; + + /* Access Functions. */ +@@ -682,6 +779,8 @@ enum stmt_vec_info_type { + type_promotion_vec_info_type, + type_demotion_vec_info_type, + type_conversion_vec_info_type, ++ cycle_phi_info_type, ++ lc_phi_info_type, + loop_exit_ctrl_vec_info_type + }; + +@@ -917,21 +1016,42 @@ struct _stmt_vec_info { + for loop vectorization. */ + vect_memory_access_type memory_access_type; + +- /* For reduction loops, this is the type of reduction. */ +- enum vect_reduction_type v_reduc_type; ++ /* For INTEGER_INDUC_COND_REDUCTION, the initial value to be used. */ ++ tree induc_cond_initial_val; + +- /* For CONST_COND_REDUCTION, record the reduc code. */ +- enum tree_code const_cond_reduc_code; ++ /* If not NULL the value to be added to compute final reduction value. */ ++ tree reduc_epilogue_adjustment; + + /* On a reduction PHI the reduction type as detected by +- vect_force_simple_reduction. */ ++ vect_is_simple_reduction and vectorizable_reduction. */ + enum vect_reduction_type reduc_type; + ++ /* The original reduction code, to be used in the epilogue. */ ++ enum tree_code reduc_code; ++ /* An internal function we should use in the epilogue. */ ++ internal_fn reduc_fn; ++ ++ /* On a stmt participating in the reduction the index of the operand ++ on the reduction SSA cycle. */ ++ int reduc_idx; ++ + /* On a reduction PHI the def returned by vect_force_simple_reduction. + On the def returned by vect_force_simple_reduction the + corresponding PHI. */ + stmt_vec_info reduc_def; + ++ /* The vector input type relevant for reduction vectorization. */ ++ tree reduc_vectype_in; ++ ++ /* The vector type for performing the actual reduction. */ ++ tree reduc_vectype; ++ ++ /* Whether we force a single cycle PHI during reduction vectorization. */ ++ bool force_single_cycle; ++ ++ /* Whether on this stmt reduction meta is recorded. */ ++ bool is_reduc_info; ++ + /* The number of scalar stmt references from active SLP instances. */ + unsigned int num_slp_uses; + +@@ -949,6 +1069,9 @@ struct _stmt_vec_info { + and OPERATION_BITS without changing the result. */ + unsigned int operation_precision; + signop operation_sign; ++ ++ /* True if this is only suitable for SLP vectorization. */ ++ bool slp_vect_only_p; + }; + + /* Information about a gather/scatter call. 
*/ +@@ -1011,8 +1134,10 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) + #define STMT_VINFO_STRIDED_P(S) (S)->strided_p + #define STMT_VINFO_MEMORY_ACCESS_TYPE(S) (S)->memory_access_type + #define STMT_VINFO_SIMD_LANE_ACCESS_P(S) (S)->simd_lane_access_p +-#define STMT_VINFO_VEC_REDUCTION_TYPE(S) (S)->v_reduc_type +-#define STMT_VINFO_VEC_CONST_COND_REDUC_CODE(S) (S)->const_cond_reduc_code ++#define STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL(S) (S)->induc_cond_initial_val ++#define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment ++#define STMT_VINFO_REDUC_IDX(S) (S)->reduc_idx ++#define STMT_VINFO_FORCE_SINGLE_CYCLE(S) (S)->force_single_cycle + + #define STMT_VINFO_DR_WRT_VEC_LOOP(S) (S)->dr_wrt_vec_loop + #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_wrt_vec_loop.base_address +@@ -1043,7 +1168,12 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) + #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist + #define STMT_VINFO_NUM_SLP_USES(S) (S)->num_slp_uses + #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type ++#define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code ++#define STMT_VINFO_REDUC_FN(S) (S)->reduc_fn + #define STMT_VINFO_REDUC_DEF(S) (S)->reduc_def ++#define STMT_VINFO_REDUC_VECTYPE(S) (S)->reduc_vectype ++#define STMT_VINFO_REDUC_VECTYPE_IN(S) (S)->reduc_vectype_in ++#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p + + #define DR_GROUP_FIRST_ELEMENT(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element) +@@ -1358,7 +1488,7 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype) + static inline void + vect_update_max_nunits (poly_uint64 *max_nunits, poly_uint64 nunits) + { +- /* All unit counts have the form current_vector_size * X for some ++ /* All unit counts have the form vec_info::vector_size * X for some + rational X, so two unit sizes must have a common multiple. + Everything is a multiple of the initial value of 1. */ + *max_nunits = force_common_multiple (*max_nunits, nunits); +@@ -1466,20 +1596,22 @@ extern void vect_set_loop_condition (struct loop *, loop_vec_info, + extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); + struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, + struct loop *, edge); +-struct loop *vect_loop_versioning (loop_vec_info, unsigned int, bool, +- poly_uint64); ++struct loop *vect_loop_versioning (loop_vec_info); + extern struct loop *vect_do_peeling (loop_vec_info, tree, tree, +- tree *, tree *, tree *, int, bool, bool); ++ tree *, tree *, tree *, int, bool, bool, ++ tree *, drs_init_vec &); + extern void vect_prepare_for_masked_peels (loop_vec_info); + extern dump_user_location_t find_loop_location (struct loop *); + extern bool vect_can_advance_ivs_p (loop_vec_info); ++extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code); + + /* In tree-vect-stmts.c. 
*/ +-extern poly_uint64 current_vector_size; +-extern tree get_vectype_for_scalar_type (tree); +-extern tree get_vectype_for_scalar_type_and_size (tree, poly_uint64); +-extern tree get_mask_type_for_scalar_type (tree); ++extern tree get_related_vectype_for_scalar_type (machine_mode, tree, ++ poly_uint64 = 0); ++extern tree get_vectype_for_scalar_type (vec_info *, tree); ++extern tree get_mask_type_for_scalar_type (vec_info *, tree); + extern tree get_same_sized_vectype (tree, tree); ++extern bool vect_chooses_same_modes_p (vec_info *, machine_mode); + extern bool vect_get_loop_mask_type (loop_vec_info); + extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *, + stmt_vec_info * = NULL, gimple ** = NULL); +@@ -1491,15 +1623,15 @@ extern bool supportable_widening_operation (enum tree_code, stmt_vec_info, + enum tree_code *, int *, + vec *); + extern bool supportable_narrowing_operation (enum tree_code, tree, tree, +- enum tree_code *, +- int *, vec *); ++ enum tree_code *, int *, ++ vec *); + extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, + enum vect_cost_for_stmt, stmt_vec_info, + int, enum vect_cost_model_location); + extern stmt_vec_info vect_finish_replace_stmt (stmt_vec_info, gimple *); + extern stmt_vec_info vect_finish_stmt_generation (stmt_vec_info, gimple *, + gimple_stmt_iterator *); +-extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info); ++extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info, bool *); + extern tree vect_get_store_rhs (stmt_vec_info); + extern tree vect_get_vec_def_for_operand_1 (stmt_vec_info, enum vect_def_type); + extern tree vect_get_vec_def_for_operand (tree, stmt_vec_info, tree = NULL); +@@ -1515,19 +1647,13 @@ extern bool vect_transform_stmt (stmt_vec_info, gimple_stmt_iterator *, + extern void vect_remove_stores (stmt_vec_info); + extern opt_result vect_analyze_stmt (stmt_vec_info, bool *, slp_tree, + slp_instance, stmt_vector_for_cost *); +-extern bool vectorizable_condition (stmt_vec_info, gimple_stmt_iterator *, +- stmt_vec_info *, bool, slp_tree, +- stmt_vector_for_cost *); +-extern bool vectorizable_shift (stmt_vec_info, gimple_stmt_iterator *, +- stmt_vec_info *, slp_tree, +- stmt_vector_for_cost *); + extern void vect_get_load_cost (stmt_vec_info, int, bool, + unsigned int *, unsigned int *, + stmt_vector_for_cost *, + stmt_vector_for_cost *, bool); + extern void vect_get_store_cost (stmt_vec_info, int, + unsigned int *, stmt_vector_for_cost *); +-extern bool vect_supportable_shift (enum tree_code, tree); ++extern bool vect_supportable_shift (vec_info *, enum tree_code, tree); + extern tree vect_gen_perm_mask_any (tree, const vec_perm_indices &); + extern tree vect_gen_perm_mask_checked (tree, const vec_perm_indices &); + extern void optimize_mask_stores (struct loop*); +@@ -1557,7 +1683,7 @@ extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info, + gather_scatter_info *); + extern opt_result vect_find_stmt_data_reference (loop_p, gimple *, + vec *); +-extern opt_result vect_analyze_data_refs (vec_info *, poly_uint64 *); ++extern opt_result vect_analyze_data_refs (vec_info *, poly_uint64 *, bool *); + extern void vect_record_base_alignments (vec_info *); + extern tree vect_create_data_ref_ptr (stmt_vec_info, tree, struct loop *, tree, + tree *, gimple_stmt_iterator *, +@@ -1586,40 +1712,43 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *, + tree, tree = NULL_TREE); + + /* In tree-vect-loop.c. */ +-/* FORNOW: Used in tree-parloops.c. 
*/ +-extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info, +- bool *, bool); +-/* Used in gimple-loop-interchange.c. */ ++/* Used in tree-vect-loop-manip.c */ ++extern void determine_peel_for_niter (loop_vec_info); ++/* Used in gimple-loop-interchange.c and tree-parloops.c. */ + extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree, + enum tree_code); ++extern bool needs_fold_left_reduction_p (tree, tree_code); + /* Drive for loop analysis stage. */ +-extern opt_loop_vec_info vect_analyze_loop (struct loop *, +- loop_vec_info, +- vec_info_shared *); ++extern opt_loop_vec_info vect_analyze_loop (struct loop *, vec_info_shared *); + extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL); + extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *, + tree *, bool); +-extern tree vect_halve_mask_nunits (tree); +-extern tree vect_double_mask_nunits (tree); ++extern tree vect_halve_mask_nunits (tree, machine_mode); ++extern tree vect_double_mask_nunits (tree, machine_mode); + extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *, +- unsigned int, tree); ++ unsigned int, tree, tree); + extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *, + unsigned int, tree, unsigned int); ++extern stmt_vec_info info_for_reduction (stmt_vec_info); + + /* Drive for loop transformation stage. */ + extern struct loop *vect_transform_loop (loop_vec_info); + extern opt_loop_vec_info vect_analyze_loop_form (struct loop *, + vec_info_shared *); + extern bool vectorizable_live_operation (stmt_vec_info, gimple_stmt_iterator *, +- slp_tree, int, stmt_vec_info *, ++ slp_tree, slp_instance, int, ++ stmt_vec_info *, + stmt_vector_for_cost *); +-extern bool vectorizable_reduction (stmt_vec_info, gimple_stmt_iterator *, +- stmt_vec_info *, slp_tree, slp_instance, ++extern bool vectorizable_reduction (stmt_vec_info, slp_tree, slp_instance, + stmt_vector_for_cost *); + extern bool vectorizable_induction (stmt_vec_info, gimple_stmt_iterator *, + stmt_vec_info *, slp_tree, + stmt_vector_for_cost *); +-extern tree get_initial_def_for_reduction (stmt_vec_info, tree, tree *); ++extern bool vect_transform_reduction (stmt_vec_info, gimple_stmt_iterator *, ++ stmt_vec_info *, slp_tree); ++extern bool vect_transform_cycle_phi (stmt_vec_info, stmt_vec_info *, ++ slp_tree, slp_instance); ++extern bool vectorizable_lc_phi (stmt_vec_info, stmt_vec_info *, slp_tree); + extern bool vect_worthwhile_without_simd_p (vec_info *, tree_code); + extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, + stmt_vector_for_cost *, +@@ -1637,15 +1766,16 @@ extern void vect_schedule_slp (vec_info *); + extern opt_result vect_analyze_slp (vec_info *, unsigned); + extern bool vect_make_slp_decision (loop_vec_info); + extern void vect_detect_hybrid_slp (loop_vec_info); +-extern void vect_get_slp_defs (vec , slp_tree, vec > *); ++extern void vect_get_slp_defs (slp_tree, vec > *, unsigned n = -1U); + extern bool vect_slp_bb (basic_block); + extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); + extern bool is_simple_and_all_uses_invariant (stmt_vec_info, loop_vec_info); +-extern bool can_duplicate_and_interleave_p (unsigned int, machine_mode, ++extern bool can_duplicate_and_interleave_p (vec_info *, unsigned int, ++ machine_mode, + unsigned int * = NULL, + tree * = NULL, tree * = NULL); +-extern void duplicate_and_interleave (gimple_seq *, tree, vec, +- unsigned int, vec &); ++extern void duplicate_and_interleave (vec_info *, 
gimple_seq *, tree, ++ vec, unsigned int, vec &); + extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info); + + /* In tree-vect-patterns.c. */ +diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c +index 2140101d7d2..fbcd8aa6367 100644 +--- a/gcc/tree-vrp.c ++++ b/gcc/tree-vrp.c +@@ -69,23 +69,20 @@ along with GCC; see the file COPYING3. If not see + #include "builtins.h" + #include "wide-int-range.h" + ++static bool ++ranges_from_anti_range (const value_range_base *ar, ++ value_range_base *vr0, value_range_base *vr1, ++ bool handle_pointers = false); ++ + /* Set of SSA names found live during the RPO traversal of the function + for still active basic-blocks. */ + static sbitmap *live; + +-void +-value_range_base::set (enum value_range_kind kind, tree min, tree max) +-{ +- m_kind = kind; +- m_min = min; +- m_max = max; +- if (flag_checking) +- check (); +-} +- + void + value_range::set_equiv (bitmap equiv) + { ++ if (undefined_p () || varying_p ()) ++ equiv = NULL; + /* Since updating the equivalence set involves deep copying the + bitmaps, only do it if absolutely necessary. + +@@ -261,7 +258,8 @@ value_range_base::constant_p () const + void + value_range_base::set_undefined () + { +- set (VR_UNDEFINED, NULL, NULL); ++ m_kind = VR_UNDEFINED; ++ m_min = m_max = NULL; + } + + void +@@ -273,7 +271,8 @@ value_range::set_undefined () + void + value_range_base::set_varying () + { +- set (VR_VARYING, NULL, NULL); ++ m_kind = VR_VARYING; ++ m_min = m_max = NULL; + } + + void +@@ -335,6 +334,24 @@ value_range::equiv_add (const_tree var, + bool + value_range_base::singleton_p (tree *result) const + { ++ if (m_kind == VR_ANTI_RANGE) ++ { ++ if (nonzero_p ()) ++ { ++ if (TYPE_PRECISION (type ()) == 1) ++ { ++ if (result) ++ *result = m_max; ++ return true; ++ } ++ return false; ++ } ++ ++ value_range_base vr0, vr1; ++ return (ranges_from_anti_range (this, &vr0, &vr1, true) ++ && vr1.undefined_p () ++ && vr0.singleton_p (result)); ++ } + if (m_kind == VR_RANGE + && vrp_operand_equal_p (min (), max ()) + && is_gimple_min_invariant (min ())) +@@ -510,23 +527,28 @@ static assert_locus **asserts_for; + /* Return the maximum value for TYPE. */ + + tree +-vrp_val_max (const_tree type) ++vrp_val_max (const_tree type, bool handle_pointers) + { +- if (!INTEGRAL_TYPE_P (type)) +- return NULL_TREE; +- +- return TYPE_MAX_VALUE (type); ++ if (INTEGRAL_TYPE_P (type)) ++ return TYPE_MAX_VALUE (type); ++ if (POINTER_TYPE_P (type) && handle_pointers) ++ { ++ wide_int max = wi::max_value (TYPE_PRECISION (type), TYPE_SIGN (type)); ++ return wide_int_to_tree (const_cast (type), max); ++ } ++ return NULL_TREE; + } + + /* Return the minimum value for TYPE. */ + + tree +-vrp_val_min (const_tree type) ++vrp_val_min (const_tree type, bool handle_pointers) + { +- if (!INTEGRAL_TYPE_P (type)) +- return NULL_TREE; +- +- return TYPE_MIN_VALUE (type); ++ if (INTEGRAL_TYPE_P (type)) ++ return TYPE_MIN_VALUE (type); ++ if (POINTER_TYPE_P (type) && handle_pointers) ++ return build_zero_cst (const_cast (type)); ++ return NULL_TREE; + } + + /* Return whether VAL is equal to the maximum value of its type. +@@ -637,8 +659,7 @@ intersect_range_with_nonzero_bits (enum value_range_kind vr_type, + extract ranges from var + CST op limit. */ + + void +-value_range_base::set_and_canonicalize (enum value_range_kind kind, +- tree min, tree max) ++value_range_base::set (enum value_range_kind kind, tree min, tree max) + { + /* Use the canonical setters for VR_UNDEFINED and VR_VARYING. 
*/ + if (kind == VR_UNDEFINED) +@@ -652,11 +673,31 @@ value_range_base::set_and_canonicalize (enum value_range_kind kind, + return; + } + ++ /* Convert POLY_INT_CST bounds into worst-case INTEGER_CST bounds. */ ++ if (POLY_INT_CST_P (min)) ++ { ++ tree type_min = vrp_val_min (TREE_TYPE (min), true); ++ widest_int lb ++ = constant_lower_bound_with_limit (wi::to_poly_widest (min), ++ wi::to_widest (type_min)); ++ min = wide_int_to_tree (TREE_TYPE (min), lb); ++ } ++ if (POLY_INT_CST_P (max)) ++ { ++ tree type_max = vrp_val_max (TREE_TYPE (max), true); ++ widest_int ub ++ = constant_upper_bound_with_limit (wi::to_poly_widest (max), ++ wi::to_widest (type_max)); ++ max = wide_int_to_tree (TREE_TYPE (max), ub); ++ } ++ + /* Nothing to canonicalize for symbolic ranges. */ + if (TREE_CODE (min) != INTEGER_CST + || TREE_CODE (max) != INTEGER_CST) + { +- set (kind, min, max); ++ m_kind = kind; ++ m_min = min; ++ m_max = max; + return; + } + +@@ -692,12 +733,13 @@ value_range_base::set_and_canonicalize (enum value_range_kind kind, + kind = kind == VR_RANGE ? VR_ANTI_RANGE : VR_RANGE; + } + ++ tree type = TREE_TYPE (min); ++ + /* Anti-ranges that can be represented as ranges should be so. */ + if (kind == VR_ANTI_RANGE) + { + /* For -fstrict-enums we may receive out-of-range ranges so consider + values < -INF and values > INF as -INF/INF as well. */ +- tree type = TREE_TYPE (min); + bool is_min = (INTEGRAL_TYPE_P (type) + && tree_int_cst_compare (min, TYPE_MIN_VALUE (type)) <= 0); + bool is_max = (INTEGRAL_TYPE_P (type) +@@ -740,22 +782,37 @@ value_range_base::set_and_canonicalize (enum value_range_kind kind, + } + } + ++ /* Normalize [MIN, MAX] into VARYING and ~[MIN, MAX] into UNDEFINED. ++ ++ Avoid using TYPE_{MIN,MAX}_VALUE because -fstrict-enums can ++ restrict those to a subset of what actually fits in the type. ++ Instead use the extremes of the type precision which will allow ++ compare_range_with_value() to check if a value is inside a range, ++ whereas if we used TYPE_*_VAL, said function would just punt ++ upon seeing a VARYING. */ ++ unsigned prec = TYPE_PRECISION (type); ++ signop sign = TYPE_SIGN (type); ++ if (wi::eq_p (wi::to_wide (min), wi::min_value (prec, sign)) ++ && wi::eq_p (wi::to_wide (max), wi::max_value (prec, sign))) ++ { ++ if (kind == VR_RANGE) ++ set_varying (); ++ else if (kind == VR_ANTI_RANGE) ++ set_undefined (); ++ else ++ gcc_unreachable (); ++ return; ++ } ++ + /* Do not drop [-INF(OVF), +INF(OVF)] to varying. (OVF) has to be sticky + to make sure VRP iteration terminates, otherwise we can get into + oscillations. */ + +- set (kind, min, max); +-} +- +-void +-value_range::set_and_canonicalize (enum value_range_kind kind, +- tree min, tree max, bitmap equiv) +-{ +- value_range_base::set_and_canonicalize (kind, min, max); +- if (this->kind () == VR_RANGE || this->kind () == VR_ANTI_RANGE) +- set_equiv (equiv); +- else +- equiv_clear (); ++ m_kind = kind; ++ m_min = min; ++ m_max = max; ++ if (flag_checking) ++ check (); + } + + void +@@ -776,32 +833,19 @@ value_range::set (tree val) + set (VR_RANGE, val, val, NULL); + } + +-/* Set value range VR to a non-NULL range of type TYPE. */ ++/* Set value range VR to a nonzero range of type TYPE. 
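[Editorial illustration, not part of the patch] A compact sketch of the canonicalization rules described in the hunk above, written over plain 32-bit ints with invented names; the real code works on trees and also has to cope with symbolic and poly-int bounds.

#include <assert.h>
#include <limits.h>

enum vr_kind { VR_UNDEF, VR_RANGE, VR_ANTI, VR_VARY };
struct vr { enum vr_kind kind; int min, max; };

static struct vr canon (enum vr_kind kind, int min, int max)
{
  /* Anti-ranges that can be represented as ranges are converted:
     ~[INT_MIN, b] is just [b + 1, INT_MAX], and symmetrically.  */
  if (kind == VR_ANTI && min == INT_MIN && max < INT_MAX)
    { kind = VR_RANGE; min = max + 1; max = INT_MAX; }
  else if (kind == VR_ANTI && max == INT_MAX && min > INT_MIN)
    { kind = VR_RANGE; max = min - 1; min = INT_MIN; }

  /* Full ranges carry no information: [MIN, MAX] becomes VARYING and
     ~[MIN, MAX] becomes UNDEFINED.  */
  if (min == INT_MIN && max == INT_MAX)
    kind = (kind == VR_RANGE) ? VR_VARY : VR_UNDEF;

  struct vr r = { kind, min, max };
  return r;
}

int main (void)
{
  assert (canon (VR_ANTI, INT_MIN, 5).kind == VR_RANGE);
  assert (canon (VR_ANTI, INT_MIN, 5).min == 6);
  assert (canon (VR_RANGE, INT_MIN, INT_MAX).kind == VR_VARY);
  assert (canon (VR_ANTI, INT_MIN, INT_MAX).kind == VR_UNDEF);
  return 0;
}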
*/ + + void +-value_range_base::set_nonnull (tree type) ++value_range_base::set_nonzero (tree type) + { + tree zero = build_int_cst (type, 0); + set (VR_ANTI_RANGE, zero, zero); + } + +-void +-value_range::set_nonnull (tree type) +-{ +- tree zero = build_int_cst (type, 0); +- set (VR_ANTI_RANGE, zero, zero, NULL); +-} +- +-/* Set value range VR to a NULL range of type TYPE. */ ++/* Set value range VR to a ZERO range of type TYPE. */ + + void +-value_range_base::set_null (tree type) +-{ +- set (build_int_cst (type, 0)); +-} +- +-void +-value_range::set_null (tree type) ++value_range_base::set_zero (tree type) + { + set (build_int_cst (type, 0)); + } +@@ -830,22 +874,6 @@ vrp_bitmap_equal_p (const_bitmap b1, const_bitmap b2) + && bitmap_equal_p (b1, b2))); + } + +-/* Return true if VR is [0, 0]. */ +- +-static inline bool +-range_is_null (const value_range_base *vr) +-{ +- return vr->zero_p (); +-} +- +-static inline bool +-range_is_nonnull (const value_range_base *vr) +-{ +- return (vr->kind () == VR_ANTI_RANGE +- && vr->min () == vr->max () +- && integer_zerop (vr->min ())); +-} +- + /* Return true if max and min of VR are INTEGER_CST. It's not necessary + a singleton. */ + +@@ -949,22 +977,17 @@ operand_less_p (tree val, tree val2) + /* LT is folded faster than GE and others. Inline the common case. */ + if (TREE_CODE (val) == INTEGER_CST && TREE_CODE (val2) == INTEGER_CST) + return tree_int_cst_lt (val, val2); ++ else if (TREE_CODE (val) == SSA_NAME && TREE_CODE (val2) == SSA_NAME) ++ return val == val2 ? 0 : -2; + else + { +- tree tcmp; +- +- fold_defer_overflow_warnings (); +- +- tcmp = fold_binary_to_constant (LT_EXPR, boolean_type_node, val, val2); +- +- fold_undefer_and_ignore_overflow_warnings (); +- +- if (!tcmp +- || TREE_CODE (tcmp) != INTEGER_CST) +- return -2; +- +- if (!integer_zerop (tcmp)) ++ int cmp = compare_values (val, val2); ++ if (cmp == -1) + return 1; ++ else if (cmp == 0 || cmp == 1) ++ return 0; ++ else ++ return -2; + } + + return 0; +@@ -998,8 +1021,8 @@ compare_values_warnv (tree val1, tree val2, bool *strict_overflow_p) + + /* Convert the two values into the same type. This is needed because + sizetype causes sign extension even for unsigned types. */ +- val2 = fold_convert (TREE_TYPE (val1), val2); +- STRIP_USELESS_TYPE_CONVERSION (val2); ++ if (!useless_type_conversion_p (TREE_TYPE (val1), TREE_TYPE (val2))) ++ val2 = fold_convert (TREE_TYPE (val1), val2); + + const bool overflow_undefined + = INTEGRAL_TYPE_P (TREE_TYPE (val1)) +@@ -1107,32 +1130,43 @@ compare_values_warnv (tree val1, tree val2, bool *strict_overflow_p) + } + else + { +- tree t; ++ if (TREE_CODE (val1) == INTEGER_CST && TREE_CODE (val2) == INTEGER_CST) ++ { ++ /* We cannot compare overflowed values. */ ++ if (TREE_OVERFLOW (val1) || TREE_OVERFLOW (val2)) ++ return -2; ++ ++ return tree_int_cst_compare (val1, val2); ++ } + + /* First see if VAL1 and VAL2 are not the same. */ +- if (val1 == val2 || operand_equal_p (val1, val2, 0)) ++ if (operand_equal_p (val1, val2, 0)) + return 0; + ++ fold_defer_overflow_warnings (); ++ + /* If VAL1 is a lower address than VAL2, return -1. */ +- if (operand_less_p (val1, val2) == 1) +- return -1; ++ tree t = fold_binary_to_constant (LT_EXPR, boolean_type_node, val1, val2); ++ if (t && integer_onep (t)) ++ { ++ fold_undefer_and_ignore_overflow_warnings (); ++ return -1; ++ } + + /* If VAL1 is a higher address than VAL2, return +1. */ +- if (operand_less_p (val2, val1) == 1) +- return 1; +- +- /* If VAL1 is different than VAL2, return +2. 
+- For integer constants we either have already returned -1 or 1 +- or they are equivalent. We still might succeed in proving +- something about non-trivial operands. */ +- if (TREE_CODE (val1) != INTEGER_CST +- || TREE_CODE (val2) != INTEGER_CST) ++ t = fold_binary_to_constant (LT_EXPR, boolean_type_node, val2, val1); ++ if (t && integer_onep (t)) + { +- t = fold_binary_to_constant (NE_EXPR, boolean_type_node, val1, val2); +- if (t && integer_onep (t)) +- return 2; ++ fold_undefer_and_ignore_overflow_warnings (); ++ return 1; + } + ++ /* If VAL1 is different than VAL2, return +2. */ ++ t = fold_binary_to_constant (NE_EXPR, boolean_type_node, val1, val2); ++ fold_undefer_and_ignore_overflow_warnings (); ++ if (t && integer_onep (t)) ++ return 2; ++ + return -2; + } + } +@@ -1231,7 +1265,8 @@ vrp_set_zero_nonzero_bits (const tree expr_type, + + static bool + ranges_from_anti_range (const value_range_base *ar, +- value_range_base *vr0, value_range_base *vr1) ++ value_range_base *vr0, value_range_base *vr1, ++ bool handle_pointers) + { + tree type = ar->type (); + +@@ -1244,18 +1279,18 @@ ranges_from_anti_range (const value_range_base *ar, + if (ar->kind () != VR_ANTI_RANGE + || TREE_CODE (ar->min ()) != INTEGER_CST + || TREE_CODE (ar->max ()) != INTEGER_CST +- || !vrp_val_min (type) +- || !vrp_val_max (type)) ++ || !vrp_val_min (type, handle_pointers) ++ || !vrp_val_max (type, handle_pointers)) + return false; + +- if (tree_int_cst_lt (vrp_val_min (type), ar->min ())) ++ if (tree_int_cst_lt (vrp_val_min (type, handle_pointers), ar->min ())) + vr0->set (VR_RANGE, +- vrp_val_min (type), ++ vrp_val_min (type, handle_pointers), + wide_int_to_tree (type, wi::to_wide (ar->min ()) - 1)); +- if (tree_int_cst_lt (ar->max (), vrp_val_max (type))) ++ if (tree_int_cst_lt (ar->max (), vrp_val_max (type, handle_pointers))) + vr1->set (VR_RANGE, + wide_int_to_tree (type, wi::to_wide (ar->max ()) + 1), +- vrp_val_max (type)); ++ vrp_val_max (type, handle_pointers)); + if (vr0->undefined_p ()) + { + *vr0 = *vr1; +@@ -1266,21 +1301,20 @@ ranges_from_anti_range (const value_range_base *ar, + } + + /* Extract the components of a value range into a pair of wide ints in +- [WMIN, WMAX]. +- +- If the value range is anything but a VR_*RANGE of constants, the +- resulting wide ints are set to [-MIN, +MAX] for the type. */ ++ [WMIN, WMAX], after having normalized any symbolics from the input. 
*/ + + static void inline +-extract_range_into_wide_ints (const value_range_base *vr, +- signop sign, unsigned prec, +- wide_int &wmin, wide_int &wmax) ++extract_range_into_wide_ints (const value_range_base *vr_, ++ tree type, wide_int &wmin, wide_int &wmax) + { +- gcc_assert (vr->kind () != VR_ANTI_RANGE || vr->symbolic_p ()); +- if (range_int_cst_p (vr)) ++ signop sign = TYPE_SIGN (type); ++ unsigned int prec = TYPE_PRECISION (type); ++ gcc_assert (vr_->kind () != VR_ANTI_RANGE || vr_->symbolic_p ()); ++ value_range vr = vr_->normalize_symbolics (); ++ if (range_int_cst_p (&vr)) + { +- wmin = wi::to_wide (vr->min ()); +- wmax = wi::to_wide (vr->max ()); ++ wmin = wi::to_wide (vr.min ()); ++ wmax = wi::to_wide (vr.max ()); + } + else + { +@@ -1295,7 +1329,7 @@ extract_range_into_wide_ints (const value_range_base *vr, + + static void + extract_range_from_multiplicative_op (value_range_base *vr, +- enum tree_code code, ++ enum tree_code code, tree type, + const value_range_base *vr0, + const value_range_base *vr1) + { +@@ -1307,13 +1341,31 @@ extract_range_from_multiplicative_op (value_range_base *vr, + || code == ROUND_DIV_EXPR + || code == RSHIFT_EXPR + || code == LSHIFT_EXPR); +- gcc_assert (vr0->kind () == VR_RANGE +- && vr0->kind () == vr1->kind ()); ++ if (!range_int_cst_p (vr1)) ++ { ++ vr->set_varying (); ++ return; ++ } ++ ++ /* Even if vr0 is VARYING or otherwise not usable, we can derive ++ useful ranges just from the shift count. E.g. ++ x >> 63 for signed 64-bit x is always [-1, 0]. */ ++ value_range_base tem = vr0->normalize_symbolics (); ++ tree vr0_min, vr0_max; ++ if (tem.kind () == VR_RANGE) ++ { ++ vr0_min = tem.min (); ++ vr0_max = tem.max (); ++ } ++ else ++ { ++ vr0_min = vrp_val_min (type); ++ vr0_max = vrp_val_max (type); ++ } + +- tree type = vr0->type (); + wide_int res_lb, res_ub; +- wide_int vr0_lb = wi::to_wide (vr0->min ()); +- wide_int vr0_ub = wi::to_wide (vr0->max ()); ++ wide_int vr0_lb = wi::to_wide (vr0_min); ++ wide_int vr0_ub = wi::to_wide (vr0_max); + wide_int vr1_lb = wi::to_wide (vr1->min ()); + wide_int vr1_ub = wi::to_wide (vr1->max ()); + bool overflow_undefined = TYPE_OVERFLOW_UNDEFINED (type); +@@ -1323,9 +1375,8 @@ extract_range_from_multiplicative_op (value_range_base *vr, + code, TYPE_SIGN (type), prec, + vr0_lb, vr0_ub, vr1_lb, vr1_ub, + overflow_undefined)) +- vr->set_and_canonicalize (VR_RANGE, +- wide_int_to_tree (type, res_lb), +- wide_int_to_tree (type, res_ub)); ++ vr->set (VR_RANGE, wide_int_to_tree (type, res_lb), ++ wide_int_to_tree (type, res_ub)); + else + vr->set_varying (); + } +@@ -1583,9 +1634,9 @@ extract_range_from_binary_expr (value_range_base *vr, + code is EXACT_DIV_EXPR. We could mask out bits in the resulting + range, but then we also need to hack up vrp_union. It's just + easier to special case when vr0 is ~[0,0] for EXACT_DIV_EXPR. */ +- if (code == EXACT_DIV_EXPR && range_is_nonnull (&vr0)) ++ if (code == EXACT_DIV_EXPR && vr0.nonzero_p ()) + { +- vr->set_nonnull (expr_type); ++ vr->set_nonzero (expr_type); + return; + } + +@@ -1663,9 +1714,9 @@ extract_range_from_binary_expr (value_range_base *vr, + If both are null, then the result is null. Otherwise they + are varying. 
*/ + if (!range_includes_zero_p (&vr0) && !range_includes_zero_p (&vr1)) +- vr->set_nonnull (expr_type); +- else if (range_is_null (&vr0) && range_is_null (&vr1)) +- vr->set_null (expr_type); ++ vr->set_nonzero (expr_type); ++ else if (vr0.zero_p () && vr1.zero_p ()) ++ vr->set_zero (expr_type); + else + vr->set_varying (); + } +@@ -1692,9 +1743,9 @@ extract_range_from_binary_expr (value_range_base *vr, + && (flag_delete_null_pointer_checks + || (range_int_cst_p (&vr1) + && !tree_int_cst_sign_bit (vr1.max ())))) +- vr->set_nonnull (expr_type); +- else if (range_is_null (&vr0) && range_is_null (&vr1)) +- vr->set_null (expr_type); ++ vr->set_nonzero (expr_type); ++ else if (vr0.zero_p () && vr1.zero_p ()) ++ vr->set_zero (expr_type); + else + vr->set_varying (); + } +@@ -1702,8 +1753,8 @@ extract_range_from_binary_expr (value_range_base *vr, + { + /* For pointer types, we are really only interested in asserting + whether the expression evaluates to non-NULL. */ +- if (range_is_null (&vr0) || range_is_null (&vr1)) +- vr->set_null (expr_type); ++ if (vr0.zero_p () || vr1.zero_p ()) ++ vr->set_zero (expr_type); + else + vr->set_varying (); + } +@@ -1717,19 +1768,30 @@ extract_range_from_binary_expr (value_range_base *vr, + range and see what we end up with. */ + if (code == PLUS_EXPR || code == MINUS_EXPR) + { ++ value_range_kind vr0_kind = vr0.kind (), vr1_kind = vr1.kind (); ++ tree vr0_min = vr0.min (), vr0_max = vr0.max (); ++ tree vr1_min = vr1.min (), vr1_max = vr1.max (); + /* This will normalize things such that calculating + [0,0] - VR_VARYING is not dropped to varying, but is + calculated as [MIN+1, MAX]. */ + if (vr0.varying_p ()) +- vr0.set (VR_RANGE, vrp_val_min (expr_type), vrp_val_max (expr_type)); ++ { ++ vr0_kind = VR_RANGE; ++ vr0_min = vrp_val_min (expr_type); ++ vr0_max = vrp_val_max (expr_type); ++ } + if (vr1.varying_p ()) +- vr1.set (VR_RANGE, vrp_val_min (expr_type), vrp_val_max (expr_type)); ++ { ++ vr1_kind = VR_RANGE; ++ vr1_min = vrp_val_min (expr_type); ++ vr1_max = vrp_val_max (expr_type); ++ } + + const bool minus_p = (code == MINUS_EXPR); +- tree min_op0 = vr0.min (); +- tree min_op1 = minus_p ? vr1.max () : vr1.min (); +- tree max_op0 = vr0.max (); +- tree max_op1 = minus_p ? vr1.min () : vr1.max (); ++ tree min_op0 = vr0_min; ++ tree min_op1 = minus_p ? vr1_max : vr1_min; ++ tree max_op0 = vr0_max; ++ tree max_op1 = minus_p ? vr1_min : vr1_max; + tree sym_min_op0 = NULL_TREE; + tree sym_min_op1 = NULL_TREE; + tree sym_max_op0 = NULL_TREE; +@@ -1742,7 +1804,7 @@ extract_range_from_binary_expr (value_range_base *vr, + single-symbolic ranges, try to compute the precise resulting range, + but only if we know that this resulting range will also be constant + or single-symbolic. 
*/ +- if (vr0.kind () == VR_RANGE && vr1.kind () == VR_RANGE ++ if (vr0_kind == VR_RANGE && vr1_kind == VR_RANGE + && (TREE_CODE (min_op0) == INTEGER_CST + || (sym_min_op0 + = get_single_symbol (min_op0, &neg_min_op0, &min_op0))) +@@ -1823,8 +1885,8 @@ extract_range_from_binary_expr (value_range_base *vr, + wide_int wmin, wmax; + wide_int vr0_min, vr0_max; + wide_int vr1_min, vr1_max; +- extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); +- extract_range_into_wide_ints (&vr1, sign, prec, vr1_min, vr1_max); ++ extract_range_into_wide_ints (&vr0, expr_type, vr0_min, vr0_max); ++ extract_range_into_wide_ints (&vr1, expr_type, vr1_min, vr1_max); + if (wide_int_range_min_max (wmin, wmax, code, sign, prec, + vr0_min, vr0_max, vr1_min, vr1_max)) + vr->set (VR_RANGE, wide_int_to_tree (expr_type, wmin), +@@ -1841,7 +1903,7 @@ extract_range_from_binary_expr (value_range_base *vr, + vr->set_varying (); + return; + } +- extract_range_from_multiplicative_op (vr, code, &vr0, &vr1); ++ extract_range_from_multiplicative_op (vr, code, expr_type, &vr0, &vr1); + return; + } + else if (code == RSHIFT_EXPR +@@ -1856,13 +1918,8 @@ extract_range_from_binary_expr (value_range_base *vr, + { + if (code == RSHIFT_EXPR) + { +- /* Even if vr0 is VARYING or otherwise not usable, we can derive +- useful ranges just from the shift count. E.g. +- x >> 63 for signed 64-bit x is always [-1, 0]. */ +- if (vr0.kind () != VR_RANGE || vr0.symbolic_p ()) +- vr0.set (VR_RANGE, vrp_val_min (expr_type), +- vrp_val_max (expr_type)); +- extract_range_from_multiplicative_op (vr, code, &vr0, &vr1); ++ extract_range_from_multiplicative_op (vr, code, expr_type, ++ &vr0, &vr1); + return; + } + else if (code == LSHIFT_EXPR +@@ -1878,7 +1935,7 @@ extract_range_from_binary_expr (value_range_base *vr, + { + min = wide_int_to_tree (expr_type, res_lb); + max = wide_int_to_tree (expr_type, res_ub); +- vr->set_and_canonicalize (VR_RANGE, min, max); ++ vr->set (VR_RANGE, min, max); + return; + } + } +@@ -1897,7 +1954,7 @@ extract_range_from_binary_expr (value_range_base *vr, + bool extra_range_p; + + /* Special case explicit division by zero as undefined. */ +- if (range_is_null (&vr1)) ++ if (vr1.zero_p ()) + { + vr->set_undefined (); + return; +@@ -1910,9 +1967,9 @@ extract_range_from_binary_expr (value_range_base *vr, + NOTE: As a future improvement, we may be able to do better + with mixed symbolic (anti-)ranges like [0, A]. See note in + ranges_from_anti_range. 
*/ +- extract_range_into_wide_ints (&vr0, sign, prec, ++ extract_range_into_wide_ints (&vr0, expr_type, + dividend_min, dividend_max); +- extract_range_into_wide_ints (&vr1, sign, prec, ++ extract_range_into_wide_ints (&vr1, expr_type, + divisor_min, divisor_max); + if (!wide_int_range_div (wmin, wmax, code, sign, prec, + dividend_min, dividend_max, +@@ -1936,15 +1993,15 @@ extract_range_from_binary_expr (value_range_base *vr, + } + else if (code == TRUNC_MOD_EXPR) + { +- if (range_is_null (&vr1)) ++ if (vr1.zero_p ()) + { + vr->set_undefined (); + return; + } + wide_int wmin, wmax, tmp; + wide_int vr0_min, vr0_max, vr1_min, vr1_max; +- extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); +- extract_range_into_wide_ints (&vr1, sign, prec, vr1_min, vr1_max); ++ extract_range_into_wide_ints (&vr0, expr_type, vr0_min, vr0_max); ++ extract_range_into_wide_ints (&vr1, expr_type, vr1_min, vr1_max); + wide_int_range_trunc_mod (wmin, wmax, sign, prec, + vr0_min, vr0_max, vr1_min, vr1_max); + min = wide_int_to_tree (expr_type, wmin); +@@ -1962,8 +2019,8 @@ extract_range_from_binary_expr (value_range_base *vr, + &may_be_nonzero0, &must_be_nonzero0); + vrp_set_zero_nonzero_bits (expr_type, &vr1, + &may_be_nonzero1, &must_be_nonzero1); +- extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); +- extract_range_into_wide_ints (&vr1, sign, prec, vr1_min, vr1_max); ++ extract_range_into_wide_ints (&vr0, expr_type, vr0_min, vr0_max); ++ extract_range_into_wide_ints (&vr1, expr_type, vr1_min, vr1_max); + if (code == BIT_AND_EXPR) + { + if (wide_int_range_bit_and (wmin, wmax, sign, prec, +@@ -2140,9 +2197,9 @@ extract_range_from_unary_expr (value_range_base *vr, + if (POINTER_TYPE_P (type) || POINTER_TYPE_P (op0_type)) + { + if (!range_includes_zero_p (&vr0)) +- vr->set_nonnull (type); +- else if (range_is_null (&vr0)) +- vr->set_null (type); ++ vr->set_nonzero (type); ++ else if (vr0.zero_p ()) ++ vr->set_zero (type); + else + vr->set_varying (); + return; +@@ -2167,8 +2224,7 @@ extract_range_from_unary_expr (value_range_base *vr, + signop outer_sign = TYPE_SIGN (outer_type); + unsigned inner_prec = TYPE_PRECISION (inner_type); + unsigned outer_prec = TYPE_PRECISION (outer_type); +- extract_range_into_wide_ints (&vr0, inner_sign, inner_prec, +- vr0_min, vr0_max); ++ extract_range_into_wide_ints (&vr0, inner_type, vr0_min, vr0_max); + if (wide_int_range_convert (wmin, wmax, + inner_sign, inner_prec, + outer_sign, outer_prec, +@@ -2176,7 +2232,7 @@ extract_range_from_unary_expr (value_range_base *vr, + { + tree min = wide_int_to_tree (outer_type, wmin); + tree max = wide_int_to_tree (outer_type, wmax); +- vr->set_and_canonicalize (VR_RANGE, min, max); ++ vr->set (VR_RANGE, min, max); + } + else + vr->set_varying (); +@@ -2186,7 +2242,7 @@ extract_range_from_unary_expr (value_range_base *vr, + { + wide_int wmin, wmax; + wide_int vr0_min, vr0_max; +- extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); ++ extract_range_into_wide_ints (&vr0, type, vr0_min, vr0_max); + if (wide_int_range_abs (wmin, wmax, sign, prec, vr0_min, vr0_max, + TYPE_OVERFLOW_UNDEFINED (type))) + vr->set (VR_RANGE, wide_int_to_tree (type, wmin), +@@ -2199,7 +2255,8 @@ extract_range_from_unary_expr (value_range_base *vr, + { + wide_int wmin, wmax; + wide_int vr0_min, vr0_max; +- extract_range_into_wide_ints (&vr0, SIGNED, prec, vr0_min, vr0_max); ++ tree signed_type = make_signed_type (TYPE_PRECISION (type)); ++ extract_range_into_wide_ints (&vr0, signed_type, vr0_min, vr0_max); + wide_int_range_absu 
(wmin, wmax, prec, vr0_min, vr0_max); + vr->set (VR_RANGE, wide_int_to_tree (type, wmin), + wide_int_to_tree (type, wmax)); +@@ -5468,8 +5525,10 @@ union_ranges (enum value_range_kind *vr0type, + enum value_range_kind vr1type, + tree vr1min, tree vr1max) + { +- bool mineq = vrp_operand_equal_p (*vr0min, vr1min); +- bool maxeq = vrp_operand_equal_p (*vr0max, vr1max); ++ int cmpmin = compare_values (*vr0min, vr1min); ++ int cmpmax = compare_values (*vr0max, vr1max); ++ bool mineq = cmpmin == 0; ++ bool maxeq = cmpmax == 0; + + /* [] is vr0, () is vr1 in the following classification comments. */ + if (mineq && maxeq) +@@ -5569,8 +5628,8 @@ union_ranges (enum value_range_kind *vr0type, + else + gcc_unreachable (); + } +- else if ((maxeq || operand_less_p (vr1max, *vr0max) == 1) +- && (mineq || operand_less_p (*vr0min, vr1min) == 1)) ++ else if ((maxeq || cmpmax == 1) ++ && (mineq || cmpmin == -1)) + { + /* [ ( ) ] or [( ) ] or [ ( )] */ + if (*vr0type == VR_RANGE +@@ -5603,8 +5662,8 @@ union_ranges (enum value_range_kind *vr0type, + else + gcc_unreachable (); + } +- else if ((maxeq || operand_less_p (*vr0max, vr1max) == 1) +- && (mineq || operand_less_p (vr1min, *vr0min) == 1)) ++ else if ((maxeq || cmpmax == -1) ++ && (mineq || cmpmin == 1)) + { + /* ( [ ] ) or ([ ] ) or ( [ ]) */ + if (*vr0type == VR_RANGE +@@ -5643,10 +5702,10 @@ union_ranges (enum value_range_kind *vr0type, + else + gcc_unreachable (); + } +- else if ((operand_less_p (vr1min, *vr0max) == 1 +- || operand_equal_p (vr1min, *vr0max, 0)) +- && operand_less_p (*vr0min, vr1min) == 1 +- && operand_less_p (*vr0max, vr1max) == 1) ++ else if (cmpmin == -1 ++ && cmpmax == -1 ++ && (operand_less_p (vr1min, *vr0max) == 1 ++ || operand_equal_p (vr1min, *vr0max, 0))) + { + /* [ ( ] ) or [ ]( ) */ + if (*vr0type == VR_RANGE +@@ -5680,10 +5739,10 @@ union_ranges (enum value_range_kind *vr0type, + else + gcc_unreachable (); + } +- else if ((operand_less_p (*vr0min, vr1max) == 1 +- || operand_equal_p (*vr0min, vr1max, 0)) +- && operand_less_p (vr1min, *vr0min) == 1 +- && operand_less_p (vr1max, *vr0max) == 1) ++ else if (cmpmin == 1 ++ && cmpmax == 1 ++ && (operand_less_p (*vr0min, vr1max) == 1 ++ || operand_equal_p (*vr0min, vr1max, 0))) + { + /* ( [ ) ] or ( )[ ] */ + if (*vr0type == VR_RANGE +@@ -6083,7 +6142,7 @@ value_range::intersect_helper (value_range *vr0, const value_range *vr1) + VR_RANGE can still be a VR_RANGE. Work on a temporary so we can + fall back to vr0 when this turns things to varying. */ + value_range tem; +- tem.set_and_canonicalize (vr0type, vr0min, vr0max); ++ tem.set (vr0type, vr0min, vr0max); + /* If that failed, use the saved original VR0. */ + if (tem.varying_p ()) + return; +@@ -6152,8 +6211,8 @@ value_range_base::union_helper (const value_range_base *vr0, + vr1->kind (), vr1->min (), vr1->max ()); + + /* Work on a temporary so we can still use vr0 when union returns varying. */ +- value_range tem; +- tem.set_and_canonicalize (vr0type, vr0min, vr0max); ++ value_range_base tem; ++ tem.set (vr0type, vr0min, vr0max); + + /* Failed to find an efficient meet. 
Before giving up and setting + the result to VARYING, see if we can at least derive a useful +@@ -6162,7 +6221,7 @@ value_range_base::union_helper (const value_range_base *vr0, + && range_includes_zero_p (vr0) == 0 + && range_includes_zero_p (vr1) == 0) + { +- tem.set_nonnull (vr0->type ()); ++ tem.set_nonzero (vr0->type ()); + return tem; + } + +@@ -6233,6 +6292,58 @@ value_range::union_ (const value_range *other) + } + } + ++/* Normalize symbolics into constants. */ ++ ++value_range_base ++value_range_base::normalize_symbolics () const ++{ ++ if (varying_p () || undefined_p ()) ++ return *this; ++ tree ttype = type (); ++ bool min_symbolic = !is_gimple_min_invariant (min ()); ++ bool max_symbolic = !is_gimple_min_invariant (max ()); ++ if (!min_symbolic && !max_symbolic) ++ return *this; ++ ++ // [SYM, SYM] -> VARYING ++ if (min_symbolic && max_symbolic) ++ { ++ value_range_base var; ++ var.set_varying (); ++ return var; ++ } ++ if (kind () == VR_RANGE) ++ { ++ // [SYM, NUM] -> [-MIN, NUM] ++ if (min_symbolic) ++ return value_range_base (VR_RANGE, vrp_val_min (ttype), max ()); ++ // [NUM, SYM] -> [NUM, +MAX] ++ return value_range_base (VR_RANGE, min (), vrp_val_max (ttype)); ++ } ++ gcc_assert (kind () == VR_ANTI_RANGE); ++ // ~[SYM, NUM] -> [NUM + 1, +MAX] ++ if (min_symbolic) ++ { ++ if (!vrp_val_is_max (max ())) ++ { ++ tree n = wide_int_to_tree (ttype, wi::to_wide (max ()) + 1); ++ return value_range_base (VR_RANGE, n, vrp_val_max (ttype)); ++ } ++ value_range_base var; ++ var.set_varying (); ++ return var; ++ } ++ // ~[NUM, SYM] -> [-MIN, NUM - 1] ++ if (!vrp_val_is_min (min ())) ++ { ++ tree n = wide_int_to_tree (ttype, wi::to_wide (min ()) - 1); ++ return value_range_base (VR_RANGE, vrp_val_min (ttype), n); ++ } ++ value_range_base var; ++ var.set_varying (); ++ return var; ++} ++ + /* Visit all arguments for PHI node PHI that flow through executable + edges. If a valid value range can be derived from all the incoming + value ranges, set a new range for the LHS of PHI. */ +diff --git a/gcc/tree-vrp.h b/gcc/tree-vrp.h +index 9d52b428d05..4bcff924b58 100644 +--- a/gcc/tree-vrp.h ++++ b/gcc/tree-vrp.h +@@ -46,8 +46,8 @@ public: + + void set (value_range_kind, tree, tree); + void set (tree); +- void set_nonnull (tree); +- void set_null (tree); ++ void set_nonzero (tree); ++ void set_zero (tree); + + enum value_range_kind kind () const; + tree min () const; +@@ -70,11 +70,13 @@ public: + /* Misc methods. */ + tree type () const; + bool may_contain_p (tree) const; +- void set_and_canonicalize (enum value_range_kind, tree, tree); + bool zero_p () const; ++ bool nonzero_p () const; + bool singleton_p (tree *result = NULL) const; + void dump (FILE *) const; + ++ value_range_base normalize_symbolics () const; ++ + protected: + void check (); + static value_range_base union_helper (const value_range_base *, +@@ -118,8 +120,6 @@ class GTY((user)) value_range : public value_range_base + /* Deep-copies equiv bitmap argument. */ + void set (value_range_kind, tree, tree, bitmap = NULL); + void set (tree); +- void set_nonnull (tree); +- void set_null (tree); + + bool operator== (const value_range &) const /* = delete */; + bool operator!= (const value_range &) const /* = delete */; +@@ -138,7 +138,6 @@ class GTY((user)) value_range : public value_range_base + + /* Misc methods. 
*/ + void deep_copy (const value_range *); +- void set_and_canonicalize (enum value_range_kind, tree, tree, bitmap = NULL); + void dump (FILE *) const; + + private: +@@ -222,6 +221,16 @@ value_range_base::zero_p () const + && integer_zerop (m_max)); + } + ++/* Return TRUE if range is nonzero. */ ++ ++inline bool ++value_range_base::nonzero_p () const ++{ ++ return (m_kind == VR_ANTI_RANGE ++ && integer_zerop (m_min) ++ && integer_zerop (m_max)); ++} ++ + extern void dump_value_range (FILE *, const value_range *); + extern void dump_value_range (FILE *, const value_range_base *); + +@@ -259,8 +268,8 @@ extern bool vrp_val_is_min (const_tree); + extern bool vrp_val_is_max (const_tree); + extern int value_inside_range (tree, tree, tree); + +-extern tree vrp_val_min (const_tree); +-extern tree vrp_val_max (const_tree); ++extern tree vrp_val_min (const_tree, bool handle_pointers = false); ++extern tree vrp_val_max (const_tree, bool handle_pointers = false); + + extern void extract_range_from_unary_expr (value_range_base *vr, + enum tree_code code, +diff --git a/gcc/tree.c b/gcc/tree.c +index 32e94e48132..c4b8eea675f 100644 +--- a/gcc/tree.c ++++ b/gcc/tree.c +@@ -8213,8 +8213,6 @@ build_nonstandard_integer_type (unsigned HOST_WIDE_INT precision, + else + fixup_signed_type (itype); + +- ret = itype; +- + inchash::hash hstate; + inchash::add_expr (TYPE_MAX_VALUE (itype), hstate); + ret = type_hash_canon (hstate.end (), itype); +@@ -11079,44 +11077,44 @@ build_vector_type (tree innertype, poly_int64 nunits) + return make_vector_type (innertype, nunits, VOIDmode); + } + +-/* Build truth vector with specified length and number of units. */ ++/* Build a truth vector with NUNITS units, giving it mode MASK_MODE. */ + + tree +-build_truth_vector_type (poly_uint64 nunits, poly_uint64 vector_size) ++build_truth_vector_type_for_mode (poly_uint64 nunits, machine_mode mask_mode) + { +- machine_mode mask_mode +- = targetm.vectorize.get_mask_mode (nunits, vector_size).else_blk (); +- +- poly_uint64 vsize; +- if (mask_mode == BLKmode) +- vsize = vector_size * BITS_PER_UNIT; +- else +- vsize = GET_MODE_BITSIZE (mask_mode); ++ gcc_assert (mask_mode != BLKmode); + ++ poly_uint64 vsize = GET_MODE_BITSIZE (mask_mode); + unsigned HOST_WIDE_INT esize = vector_element_size (vsize, nunits); +- + tree bool_type = build_nonstandard_boolean_type (esize); + + return make_vector_type (bool_type, nunits, mask_mode); + } + +-/* Returns a vector type corresponding to a comparison of VECTYPE. */ ++/* Build a vector type that holds one boolean result for each element of ++ vector type VECTYPE. The public interface for this operation is ++ truth_type_for. 
*/ + +-tree +-build_same_sized_truth_vector_type (tree vectype) ++static tree ++build_truth_vector_type_for (tree vectype) + { +- if (VECTOR_BOOLEAN_TYPE_P (vectype)) +- return vectype; ++ machine_mode vector_mode = TYPE_MODE (vectype); ++ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + +- poly_uint64 size = GET_MODE_SIZE (TYPE_MODE (vectype)); ++ machine_mode mask_mode; ++ if (VECTOR_MODE_P (vector_mode) ++ && targetm.vectorize.get_mask_mode (vector_mode).exists (&mask_mode)) ++ return build_truth_vector_type_for_mode (nunits, mask_mode); + +- if (known_eq (size, 0U)) +- size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); ++ poly_uint64 vsize = tree_to_poly_uint64 (TYPE_SIZE (vectype)); ++ unsigned HOST_WIDE_INT esize = vector_element_size (vsize, nunits); ++ tree bool_type = build_nonstandard_boolean_type (esize); + +- return build_truth_vector_type (TYPE_VECTOR_SUBPARTS (vectype), size); ++ return make_vector_type (bool_type, nunits, BLKmode); + } + +-/* Similarly, but builds a variant type with TYPE_VECTOR_OPAQUE set. */ ++/* Like build_vector_type, but builds a variant type with TYPE_VECTOR_OPAQUE ++ set. */ + + tree + build_opaque_vector_type (tree innertype, poly_int64 nunits) +@@ -11915,8 +11913,7 @@ truth_type_for (tree type) + { + if (VECTOR_BOOLEAN_TYPE_P (type)) + return type; +- return build_truth_vector_type (TYPE_VECTOR_SUBPARTS (type), +- GET_MODE_SIZE (TYPE_MODE (type))); ++ return build_truth_vector_type_for (type); + } + else + return boolean_type_node; +diff --git a/gcc/tree.h b/gcc/tree.h +index 2f8e37bb356..6f73593faa7 100644 +--- a/gcc/tree.h ++++ b/gcc/tree.h +@@ -4272,8 +4272,7 @@ extern tree build_reference_type_for_mode (tree, machine_mode, bool); + extern tree build_reference_type (tree); + extern tree build_vector_type_for_mode (tree, machine_mode); + extern tree build_vector_type (tree, poly_int64); +-extern tree build_truth_vector_type (poly_uint64, poly_uint64); +-extern tree build_same_sized_truth_vector_type (tree vectype); ++extern tree build_truth_vector_type_for_mode (poly_uint64, machine_mode); + extern tree build_opaque_vector_type (tree, poly_int64); + extern tree build_index_type (tree); + extern tree build_array_type (tree, tree, bool = false); +diff --git a/gcc/vr-values.c b/gcc/vr-values.c +index 0e10aca92bb..02c89ab030a 100644 +--- a/gcc/vr-values.c ++++ b/gcc/vr-values.c +@@ -118,7 +118,10 @@ vr_values::get_value_range (const_tree var) + if (POINTER_TYPE_P (TREE_TYPE (sym)) + && (nonnull_arg_p (sym) + || get_ptr_nonnull (var))) +- vr->set_nonnull (TREE_TYPE (sym)); ++ { ++ vr->set_nonzero (TREE_TYPE (sym)); ++ vr->equiv_clear (); ++ } + else if (INTEGRAL_TYPE_P (TREE_TYPE (sym))) + { + get_range_info (var, *vr); +@@ -130,7 +133,10 @@ vr_values::get_value_range (const_tree var) + } + else if (TREE_CODE (sym) == RESULT_DECL + && DECL_BY_REFERENCE (sym)) +- vr->set_nonnull (TREE_TYPE (sym)); ++ { ++ vr->set_nonzero (TREE_TYPE (sym)); ++ vr->equiv_clear (); ++ } + } + + return vr; +@@ -491,9 +497,9 @@ vr_values::extract_range_for_var_from_comparison_expr (tree var, + vice-versa. Use set_and_canonicalize which does this for + us. 
*/ + if (cond_code == LE_EXPR) +- vr_p->set_and_canonicalize (VR_RANGE, min, max, vr_p->equiv ()); ++ vr_p->set (VR_RANGE, min, max, vr_p->equiv ()); + else if (cond_code == GT_EXPR) +- vr_p->set_and_canonicalize (VR_ANTI_RANGE, min, max, vr_p->equiv ()); ++ vr_p->set (VR_ANTI_RANGE, min, max, vr_p->equiv ()); + else + gcc_unreachable (); + } +@@ -565,7 +571,7 @@ vr_values::extract_range_for_var_from_comparison_expr (tree var, + && vrp_val_is_max (max)) + min = max = limit; + +- vr_p->set_and_canonicalize (VR_ANTI_RANGE, min, max, vr_p->equiv ()); ++ vr_p->set (VR_ANTI_RANGE, min, max, vr_p->equiv ()); + } + else if (cond_code == LE_EXPR || cond_code == LT_EXPR) + { +@@ -858,7 +864,10 @@ vr_values::extract_range_from_binary_expr (value_range *vr, + || (vr1.kind () == VR_ANTI_RANGE + && vr1.min () == op0 + && vr1.min () == vr1.max ()))) +- vr->set_nonnull (expr_type); ++ { ++ vr->set_nonzero (expr_type); ++ vr->equiv_clear (); ++ } + } + + /* Extract range information from a unary expression CODE OP0 based on +@@ -1085,7 +1094,8 @@ vr_values::extract_range_basic (value_range *vr, gimple *stmt) + && TREE_CODE (SSA_NAME_VAR (arg)) == PARM_DECL + && cfun->after_inlining) + { +- vr->set_null (type); ++ vr->set_zero (type); ++ vr->equiv_clear (); + return; + } + break; +@@ -1392,7 +1402,10 @@ vr_values::extract_range_basic (value_range *vr, gimple *stmt) + && gimple_stmt_nonnegative_warnv_p (stmt, &sop)) + set_value_range_to_nonnegative (vr, type); + else if (vrp_stmt_computes_nonzero (stmt)) +- vr->set_nonnull (type); ++ { ++ vr->set_nonzero (type); ++ vr->equiv_clear (); ++ } + else + vr->set_varying (); + } +diff --git a/libgcc/libgcov-driver-system.c b/libgcc/libgcov-driver-system.c +index b5f3e89ebdc..0d106002098 100644 +--- a/libgcc/libgcov-driver-system.c ++++ b/libgcc/libgcov-driver-system.c +@@ -262,10 +262,8 @@ static int + gcov_exit_open_gcda_file (struct gcov_info *gi_ptr, + struct gcov_filename *gf) + { +- const char *fname = gi_ptr->filename; + int append_slash = 0; +- +- fname = gi_ptr->filename; ++ const char *fname = gi_ptr->filename; + + /* Build relocated filename, stripping off leading + directories from the initial filename if requested. */ +diff --git a/libgcc/libgcov-util.c b/libgcc/libgcov-util.c +index ae0dd017204..e672768966b 100644 +--- a/libgcc/libgcov-util.c ++++ b/libgcc/libgcov-util.c +@@ -461,10 +461,9 @@ gcov_read_profile_dir (const char* dir_name, int recompute_summary ATTRIBUTE_UNU + #ifdef HAVE_FTW_H + ftw (".", ftw_read_file, 50); + #endif +- ret = chdir (pwd); ++ chdir (pwd); + free (pwd); + +- + return gcov_info_head;; + } + -- Gitee
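
For readers reviewing the tree-vrp hunks above: the renamed entry points set_nonzero/set_zero and the new nonzero_p/normalize_symbolics helpers all lean on two conventions visible in the diff -- "nonzero" is represented as the anti-range ~[0, 0], and a symbolic (non-constant) bound is widened to the type's extreme before any wide-int arithmetic. The standalone C++ sketch below models only those two conventions; it is illustrative, not GCC code -- toy_range, toy_kind and the int64_t bounds are hypothetical stand-ins for value_range_base, value_range_kind and tree/wide_int.

    /* Minimal standalone model of the range conventions used in the
       tree-vrp hunks above.  Names and types here are hypothetical;
       real GCC ranges are built on trees and wide_int.  */
    #include <cassert>
    #include <cstdint>
    #include <limits>

    enum toy_kind { TOY_RANGE, TOY_ANTI_RANGE, TOY_VARYING };

    struct toy_range
    {
      toy_kind kind;
      int64_t min;
      int64_t max;
      bool min_symbolic;   /* Stands in for a non-constant (SSA) bound.  */
      bool max_symbolic;

      /* ~[0, 0]: every value except zero, as produced by set_nonzero ().  */
      static toy_range nonzero ()
      { return { TOY_ANTI_RANGE, 0, 0, false, false }; }

      /* [0, 0]: exactly zero, as produced by set_zero ().  */
      static toy_range zero ()
      { return { TOY_RANGE, 0, 0, false, false }; }

      bool zero_p () const
      { return kind == TOY_RANGE && min == 0 && max == 0; }

      bool nonzero_p () const
      { return kind == TOY_ANTI_RANGE && min == 0 && max == 0; }

      /* Mirror of normalize_symbolics (): widen a symbolic bound to the
         type extreme; [SYM, SYM] degenerates to VARYING.  */
      toy_range normalize_symbolics () const
      {
        const int64_t tmin = std::numeric_limits<int64_t>::min ();
        const int64_t tmax = std::numeric_limits<int64_t>::max ();
        if (kind == TOY_VARYING || (!min_symbolic && !max_symbolic))
          return *this;
        if (min_symbolic && max_symbolic)
          return { TOY_VARYING, tmin, tmax, false, false };
        if (kind == TOY_RANGE)
          {
            if (min_symbolic)   /* [SYM, NUM] -> [-MIN, NUM]  */
              return { TOY_RANGE, tmin, max, false, false };
            /* [NUM, SYM] -> [NUM, +MAX]  */
            return { TOY_RANGE, min, tmax, false, false };
          }
        /* Anti-range with one symbolic bound: keep only the constant side,
           as in the ~[SYM, NUM] -> [NUM + 1, +MAX] case of the patch.  */
        if (min_symbolic && max != tmax)
          return { TOY_RANGE, max + 1, tmax, false, false };
        if (max_symbolic && min != tmin)
          return { TOY_RANGE, tmin, min - 1, false, false };
        return { TOY_VARYING, tmin, tmax, false, false };
      }
    };

    int main ()
    {
      assert (toy_range::nonzero ().nonzero_p ());
      assert (toy_range::zero ().zero_p ());

      /* [10, SYM] normalizes to [10, +MAX].  */
      toy_range r = { TOY_RANGE, 10, 0, false, true };
      toy_range n = r.normalize_symbolics ();
      assert (n.kind == TOY_RANGE && n.min == 10
              && n.max == std::numeric_limits<int64_t>::max ());

      /* ~[SYM, 5] normalizes to [6, +MAX].  */
      toy_range ar = { TOY_ANTI_RANGE, 0, 5, true, false };
      toy_range an = ar.normalize_symbolics ();
      assert (an.kind == TOY_RANGE && an.min == 6);
      return 0;
    }

Keeping ~[0, 0] as the canonical nonzero form is what lets the EXACT_DIV_EXPR and pointer hunks above test vr0.nonzero_p () and vr0.zero_p () directly instead of going through the removed range_is_nonnull/range_is_null wrappers, and normalizing symbolic bounds up front is what allows extract_range_into_wide_ints and extract_range_from_multiplicative_op to drop their per-caller special cases.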