diff --git a/0097-Improve-non-loop-disambiguation.patch b/0097-Improve-non-loop-disambiguation.patch
new file mode 100644
index 0000000000000000000000000000000000000000..ae609d29474c7a05896508e06227959e405b2ad5
--- /dev/null
+++ b/0097-Improve-non-loop-disambiguation.patch
@@ -0,0 +1,101 @@
+From 6de2e0d400cbe46da482a672810c37b1832c408c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?=
+Date: Thu, 25 Jul 2024 19:45:43 +0800
+Subject: [PATCH] Improve non-loop disambiguation
+
+This optimization is brought from https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=038b077689bb5310386b04d40a2cea234f01e6aa.
+
+When dr_may_alias_p is called without a loop context, it tries
+to use the tree-affine interface to calculate the difference
+between the two addresses and use that difference to check whether
+the gap between the accesses is known at compile time.  However, as the
+example in the PR shows, this doesn't expand SSA_NAMEs and so can easily
+be defeated by things like reassociation.
+
+One fix would have been to use aff_combination_expand to expand the
+SSA_NAMEs, but we'd then need some way of maintaining the associated
+cache.  This patch instead reuses the innermost_loop_behavior fields
+(which exist even when no loop context is provided).
+
+It might still be useful to do the aff_combination_expand thing too,
+if an example turns out to need it.
+---
+ gcc/common.opt                              |  4 ++++
+ gcc/testsuite/gcc.dg/vect/bb-slp-pr106019.c | 16 +++++++++++++++
+ gcc/tree-data-ref.cc                        | 22 +++++++++++++++++++++
+ 3 files changed, 42 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-pr106019.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index b18f0b944..75bf9c9c1 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3217,6 +3217,10 @@ ftree-loop-vectorize
+ Common Var(flag_tree_loop_vectorize) Optimization EnabledBy(ftree-vectorize)
+ Enable loop vectorization on trees.
+ 
++falias-analysis-expand-ssa
++Common Var(flag_alias_analysis_expand_ssa) Init(0)
++Enable expanded SSA name analysis during alias analysis.
++
+ ftree-slp-vectorize
+ Common Var(flag_tree_slp_vectorize) Optimization EnabledBy(ftree-vectorize)
+ Enable basic block vectorization (SLP) on trees.
+diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr106019.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr106019.c
+new file mode 100644
+index 000000000..5ff8a8a62
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr106019.c
+@@ -0,0 +1,16 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-falias-analysis-expand-ssa" } */
++
++void f(double *p, long i)
++{
++  p[i+0] += 1;
++  p[i+1] += 1;
++}
++void g(double *p, long i)
++{
++  double *q = p + i;
++  q[0] += 1;
++  q[1] += 1;
++}
++
++/* { dg-final { scan-tree-dump-not "can't determine dependence" slp2 } } */
+diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
+index e6ae9e847..a05073c51 100644
+--- a/gcc/tree-data-ref.cc
++++ b/gcc/tree-data-ref.cc
+@@ -2993,6 +2993,28 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b,
+      disambiguation.
*/
+   if (!loop_nest)
+     {
++      if (flag_alias_analysis_expand_ssa)
++        {
++          tree tree_size_a = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
++          tree tree_size_b = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
++
++          if (DR_BASE_ADDRESS (a)
++              && DR_BASE_ADDRESS (b)
++              && operand_equal_p (DR_BASE_ADDRESS (a), DR_BASE_ADDRESS (b))
++              && operand_equal_p (DR_OFFSET (a), DR_OFFSET (b))
++              && poly_int_tree_p (tree_size_a)
++              && poly_int_tree_p (tree_size_b)
++              && !ranges_maybe_overlap_p (wi::to_widest (DR_INIT (a)),
++                                          wi::to_widest (tree_size_a),
++                                          wi::to_widest (DR_INIT (b)),
++                                          wi::to_widest (tree_size_b)))
++            {
++              gcc_assert (integer_zerop (DR_STEP (a))
++                          && integer_zerop (DR_STEP (b)));
++              return false;
++            }
++        }
++
+       aff_tree off1, off2;
+       poly_widest_int size1, size2;
+       get_inner_reference_aff (DR_REF (a), &off1, &size1);
+-- 
+2.33.0
+
diff --git a/0098-CHREC-multiplication-and-undefined-overflow.patch b/0098-CHREC-multiplication-and-undefined-overflow.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9f9a6b7410fd4a910d9eb899401a81f62500a797
--- /dev/null
+++ b/0098-CHREC-multiplication-and-undefined-overflow.patch
@@ -0,0 +1,261 @@
+From c4e4fef145c1e402f0558cc35f6c1ed0a08beffb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?=
+Date: Thu, 25 Jul 2024 20:16:52 +0800
+Subject: [PATCH] CHREC multiplication and undefined overflow
+
+This optimization is brought from https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646531.html
+
+When folding a multiply CHRECs are handled like {a, +, b} * c
+is {a*c, +, b*c} but that isn't generally correct when overflow
+invokes undefined behavior.  The following uses unsigned arithmetic
+unless either a is zero or a and b have the same sign.
+
+I've used simple early outs for INTEGER_CSTs and otherwise use
+a range-query since we lack a tree_expr_nonpositive_p and
+get_range_pos_neg isn't a good fit.
+---
+ gcc/common.opt                          |  4 ++
+ gcc/testsuite/gcc.dg/pr68317.c          |  6 +-
+ gcc/testsuite/gcc.dg/torture/pr114074.c | 27 ++++++++++
+ gcc/tree-chrec.cc                       | 81 +++++++++++++++++++++----
+ gcc/tree-chrec.h                        |  2 +-
+ gcc/value-range.cc                      | 12 ++++
+ gcc/value-range.h                       |  2 +
+ 7 files changed, 119 insertions(+), 15 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/torture/pr114074.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index b18f0b944..d3af3ba39 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1771,6 +1771,10 @@ floop-interchange
+ Common Var(flag_loop_interchange) Optimization
+ Enable loop interchange on trees.
+ 
++fchrec-mul-fold-strict-overflow
++Common Var(flag_chrec_mul_fold_strict_overflow) Init(0)
++Enable strict overflow handling during constant folding of multiply CHRECs.
++
+ floop-block
+ Common Alias(floop-nest-optimize)
+ Enable loop nest transforms.  Same as -floop-nest-optimize.
+diff --git a/gcc/testsuite/gcc.dg/pr68317.c b/gcc/testsuite/gcc.dg/pr68317.c
+index bd053a752..671a67d95 100644
+--- a/gcc/testsuite/gcc.dg/pr68317.c
++++ b/gcc/testsuite/gcc.dg/pr68317.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -fdisable-tree-ethread" } */
++/* { dg-options "-O2 -fdisable-tree-ethread -fchrec-mul-fold-strict-overflow" } */
+ 
+ /* Note: Threader will collapse loop.  */
+ 
+@@ -12,8 +12,8 @@ foo ()
+ {
+   int32_t index = 0;
+ 
+-  for (index; index <= 10; index--) // expected warning here
++  for (index; index <= 10; index--) /* { dg-warning "iteration \[0-9\]+ invokes undefined behavior" } */
+     /* Result of the following multiply will overflow
+        when converted to signed int32_t.  */
+-  bar ((0xcafe + index) * 0xdead); /* { dg-warning "iteration \[0-9\]+ invokes undefined behavior" } */
++  bar ((0xcafe + index) * 0xdead);
+ }
+diff --git a/gcc/testsuite/gcc.dg/torture/pr114074.c b/gcc/testsuite/gcc.dg/torture/pr114074.c
+new file mode 100644
+index 000000000..9a383d8fc
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/torture/pr114074.c
+@@ -0,0 +1,27 @@
++/* { dg-do run } */
++/* { dg-options "-fchrec-mul-fold-strict-overflow" } */
++int a, b, d;
++
++__attribute__((noipa)) void
++foo (void)
++{
++  ++d;
++}
++
++int
++main ()
++{
++  for (a = 0; a > -3; a -= 2)
++    {
++      int c = a;
++      b = __INT_MAX__ - 3000;
++      a = ~c * b;
++      foo ();
++      if (!a)
++        break;
++      a = c;
++    }
++  if (d != 2)
++    __builtin_abort ();
++  return 0;
++}
+diff --git a/gcc/tree-chrec.cc b/gcc/tree-chrec.cc
+index c44cea754..3323901bc 100644
+--- a/gcc/tree-chrec.cc
++++ b/gcc/tree-chrec.cc
+@@ -38,6 +38,8 @@ along with GCC; see the file COPYING3.  If not see
+ #include "gimple.h"
+ #include "tree-ssa-loop.h"
+ #include "dumpfile.h"
++#include "value-range.h"
++#include "value-query.h"
+ #include "tree-scalar-evolution.h"
+ 
+ /* Extended folder for chrecs.  */
+@@ -404,6 +406,13 @@ chrec_fold_multiply (tree type,
+       || automatically_generated_chrec_p (op1))
+     return chrec_fold_automatically_generated_operands (op0, op1);
+ 
++  if (flag_chrec_mul_fold_strict_overflow)
++    {
++      if (TREE_CODE (op0) != POLYNOMIAL_CHREC
++          && TREE_CODE (op1) == POLYNOMIAL_CHREC)
++        std::swap (op0, op1);
++    }
++
+   switch (TREE_CODE (op0))
+     {
+     case POLYNOMIAL_CHREC:
+@@ -428,10 +437,53 @@ chrec_fold_multiply (tree type,
+       if (integer_zerop (op1))
+         return build_int_cst (type, 0);
+ 
+-      return build_polynomial_chrec
+-        (CHREC_VARIABLE (op0),
+-         chrec_fold_multiply (type, CHREC_LEFT (op0), op1),
+-         chrec_fold_multiply (type, CHREC_RIGHT (op0), op1));
++      if (flag_chrec_mul_fold_strict_overflow)
++        {
++          /* When overflow is undefined and CHREC_LEFT/RIGHT do not have the
++             same sign or CHREC_LEFT is zero then folding the multiply into
++             the addition does not have the same behavior on overflow.  Use
++             unsigned arithmetic in that case.
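++
++             For instance, in the PR114074 testcase added above, c steps
++             through {0, +, -2}, so ~c is {-1, +, 2}; multiplying by
++             b = __INT_MAX__ - 3000 would fold to {-b, +, 2*b}, whose step
++             2*b already overflows signed int even though the loop only
++             ever computes -b and b.  Doing the folding in the corresponding
++             unsigned type avoids introducing that undefined overflow.  */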
++          value_range rl, rr;
++          if (!ANY_INTEGRAL_TYPE_P (type)
++              || TYPE_OVERFLOW_WRAPS (type)
++              || integer_zerop (CHREC_LEFT (op0))
++              || (TREE_CODE (CHREC_LEFT (op0)) == INTEGER_CST
++                  && TREE_CODE (CHREC_RIGHT (op0)) == INTEGER_CST
++                  && (tree_int_cst_sgn (CHREC_LEFT (op0))
++                      == tree_int_cst_sgn (CHREC_RIGHT (op0))))
++              || (get_range_query (cfun)->range_of_expr (rl, CHREC_LEFT (op0))
++                  && !rl.undefined_p ()
++                  && (rl.nonpositive_p () || rl.nonnegative_p ())
++                  && get_range_query (cfun)->range_of_expr (rr,
++                                                            CHREC_RIGHT (op0))
++                  && !rr.undefined_p ()
++                  && ((rl.nonpositive_p () && rr.nonpositive_p ())
++                      || (rl.nonnegative_p () && rr.nonnegative_p ()))))
++            {
++              tree left = chrec_fold_multiply (type, CHREC_LEFT (op0), op1);
++              tree right = chrec_fold_multiply (type, CHREC_RIGHT (op0), op1);
++              return build_polynomial_chrec (CHREC_VARIABLE (op0), left, right);
++            }
++          else
++            {
++              tree utype = unsigned_type_for (type);
++              tree uop1 = chrec_convert_rhs (utype, op1);
++              tree uleft0 = chrec_convert_rhs (utype, CHREC_LEFT (op0));
++              tree uright0 = chrec_convert_rhs (utype, CHREC_RIGHT (op0));
++              tree left = chrec_fold_multiply (utype, uleft0, uop1);
++              tree right = chrec_fold_multiply (utype, uright0, uop1);
++              tree tem = build_polynomial_chrec (CHREC_VARIABLE (op0),
++                                                 left, right);
++              return chrec_convert_rhs (type, tem);
++            }
++        }
++      else
++        {
++          return build_polynomial_chrec
++            (CHREC_VARIABLE (op0),
++             chrec_fold_multiply (type, CHREC_LEFT (op0), op1),
++             chrec_fold_multiply (type, CHREC_RIGHT (op0), op1));
++        }
+     }
+ 
+     CASE_CONVERT:
+       if (tree_contains_chrecs (op1, NULL))
+@@ -449,13 +501,20 @@ chrec_fold_multiply (tree type,
+   switch (TREE_CODE (op1))
+     {
+     case POLYNOMIAL_CHREC:
+-      gcc_checking_assert
+-        (!chrec_contains_symbols_defined_in_loop (op1,
+-                                                  CHREC_VARIABLE (op1)));
+-      return build_polynomial_chrec
+-        (CHREC_VARIABLE (op1),
+-         chrec_fold_multiply (type, CHREC_LEFT (op1), op0),
+-         chrec_fold_multiply (type, CHREC_RIGHT (op1), op0));
++      if (flag_chrec_mul_fold_strict_overflow)
++        {
++          gcc_unreachable ();
++        }
++      else
++        {
++          gcc_checking_assert
++            (!chrec_contains_symbols_defined_in_loop (op1,
++                                                      CHREC_VARIABLE (op1)));
++          return build_polynomial_chrec
++            (CHREC_VARIABLE (op1),
++             chrec_fold_multiply (type, CHREC_LEFT (op1), op0),
++             chrec_fold_multiply (type, CHREC_RIGHT (op1), op0));
++        }
+ 
+     CASE_CONVERT:
+       if (tree_contains_chrecs (op1, NULL))
+diff --git a/gcc/tree-chrec.h b/gcc/tree-chrec.h
+index fcf41710d..cdc97d5d9 100644
+--- a/gcc/tree-chrec.h
++++ b/gcc/tree-chrec.h
+@@ -63,7 +63,7 @@ extern tree chrec_fold_plus (tree, tree, tree);
+ extern tree chrec_fold_minus (tree, tree, tree);
+ extern tree chrec_fold_multiply (tree, tree, tree);
+ extern tree chrec_convert (tree, tree, gimple *, bool = true, tree = NULL);
+-extern tree chrec_convert_rhs (tree, tree, gimple *);
++extern tree chrec_convert_rhs (tree, tree, gimple * = NULL);
+ extern tree chrec_convert_aggressive (tree, tree, bool *);
+ 
+ /* Operations.  */
+diff --git a/gcc/value-range.cc b/gcc/value-range.cc
+index 000bbcf89..a1dc10a24 100644
+--- a/gcc/value-range.cc
++++ b/gcc/value-range.cc
+@@ -656,6 +656,18 @@ irange::contains_p (tree cst) const
+ 
+   return false;
+ }
++bool
++irange::nonnegative_p () const
++{
++  return wi::ge_p (lower_bound (), 0, TYPE_SIGN (type ()));
++}
++
++bool
++irange::nonpositive_p () const
++{
++  return wi::le_p (upper_bound (), 0, TYPE_SIGN (type ()));
++}
++
+ 
+ 
+ /* Normalize addresses into constants.  */
+diff --git a/gcc/value-range.h b/gcc/value-range.h
+index d4cba22d5..2dc0907de 100644
+--- a/gcc/value-range.h
++++ b/gcc/value-range.h
+@@ -69,6 +69,8 @@ public:
+   bool varying_p () const;
+   bool singleton_p (tree *result = NULL) const;
+   bool contains_p (tree) const;
++  bool nonnegative_p () const;
++  bool nonpositive_p () const;
+ 
+   // In-place operators.
+   void union_ (const irange &);
+-- 
+2.33.0
+
diff --git a/0099-Enable-Transposed-SLP.patch b/0099-Enable-Transposed-SLP.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b4e8b24b669790890da83fa4966a18efb18f90ae
--- /dev/null
+++ b/0099-Enable-Transposed-SLP.patch
@@ -0,0 +1,5624 @@
+From 0dd3b8532f35486bd5db2c71342c8dfed4c0893a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?=
+Date: Thu, 25 Jul 2024 17:25:23 +0800
+Subject: [PATCH] Enable Transposed SLP.
+
+---
+ gcc/common.opt                          |    4 +
+ gcc/testsuite/gcc.dg/vect/transpose-1.c |   53 +
+ gcc/testsuite/gcc.dg/vect/transpose-2.c |   50 +
+ gcc/testsuite/gcc.dg/vect/transpose-3.c |   54 +
+ gcc/testsuite/gcc.dg/vect/transpose-4.c |   53 +
+ gcc/testsuite/gcc.dg/vect/transpose-5.c |   74 ++
+ gcc/testsuite/gcc.dg/vect/transpose-6.c |   67 +
+ gcc/testsuite/gcc.dg/vect/transpose-7.c |   53 +
+ gcc/testsuite/gcc.dg/vect/transpose-8.c |   53 +
+ gcc/testsuite/gcc.dg/vect/vect.exp      |    7 +
+ gcc/tree-loop-distribution.cc           | 1464 ++++++++++++++++++++-
+ gcc/tree-vect-data-refs.cc              |  237 ++++
+ gcc/tree-vect-loop.cc                   |   42 +-
+ gcc/tree-vect-patterns.cc               |    4 +-
+ gcc/tree-vect-slp.cc                    | 1553 ++++++++++++++++++++---
+ gcc/tree-vect-stmts.cc                  |  973 +++++++++++++-
+ gcc/tree-vectorizer.h                   |   96 +-
+ 17 files changed, 4648 insertions(+), 189 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index b18f0b944..5958c4e0b 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3221,6 +3221,10 @@ ftree-slp-vectorize
+ Common Var(flag_tree_slp_vectorize) Optimization EnabledBy(ftree-vectorize)
+ Enable basic block vectorization (SLP) on trees.
+ 
++ftree-slp-transpose-vectorize
++Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
++Enable basic block vectorization (SLP) for transposed stores and loads on trees.
++
+ fvect-cost-model=
+ Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
+ -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap]	Specifies the cost model for vectorization.
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c
+new file mode 100644
+index 000000000..8237a8b9e
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++      c4[i] = pix1[4] - pix2[4];
++      c5[i] = pix1[5] - pix2[5];
++      c6[i] = pix1[6] - pix2[6];
++      c7[i] = pix1[7] - pix2[7];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 16;
++  int i2 = 8;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 2;
++      input2[i] = i;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 1264)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c
+new file mode 100644
+index 000000000..fdf4dbd96
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c
+@@ -0,0 +1,50 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 8
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 5;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 4;
++      input2[i] = i * 2;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 1440)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c
+new file mode 100644
+index 000000000..e492e3717
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c
+@@ -0,0 +1,54 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse -fno-tree-fre" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++      c4[i] = pix1[4] - pix2[4];
++      c5[i] = pix1[5] - pix2[5];
++      c6[i] = pix1[6] - pix2[6];
++      c7[i] = pix1[7] - pix2[7];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned short input1[M];
++  unsigned short input2[M];
++  int i1 = 8;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 4;
++      input2[i] = i;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 1680)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c
+new file mode 100644
+index 000000000..0b4adea9b
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++  for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++      c2[i] = pix1[2] - pix2[2];
++      c3[i] = pix1[3] - pix2[3];
++      c4[i] = pix1[4] - pix2[4];
++      c5[i] = pix1[5] - pix2[5];
++      c6[i] = pix1[6] - pix2[6];
++      c7[i] = pix1[7] - pix2[7];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned input1[M];
++  unsigned input2[M];
++  int i1 = 12;
++  int i2 = 6;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 7;
++      input2[i] = i * 3;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 3616)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c
+new file mode 100644
+index 000000000..040dedf1b
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c
+@@ -0,0 +1,74 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-dse -fno-tree-fre" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include <math.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++#define eps 1e-8
++
++double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  unsigned a0[N];
++  unsigned a1[N];
++  unsigned a2[N];
++  unsigned a3[N];
++
++  int b0[N];
++  int b1[N];
++  int b2[N];
++  int b3[N];
++
++  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16);
++      a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16);
++      a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16);
++      a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16);
++    }
++
++  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]);
++      b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]);
++      b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]);
++      b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]);
++    }
++
++  double sum = 0;
++  for (int i = 0; i < N; i++)
++    {
++      sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 8;
++  int i2 = 3;
++  unsigned char m = 2;
++  unsigned short n = 12;
++  float t = 3.0;
++  double k = 4.2;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 6;
++      input2[i] = i * 3;
++    }
++  double sum = foo (input1, i1, input2, i2);
++  if (fabs (sum - 78648144) > eps)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c
+new file mode 100644
+index 000000000..3e134ac02
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c
+@@ -0,0 +1,67 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_float } */
++#include <stdio.h>
++#include <stdlib.h>
++#include <math.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++#define eps 1e-8
++
++float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  unsigned a0[N];
++  unsigned a1[N];
++  unsigned a2[N];
++  unsigned a3[N];
++
++  float c0[N];
++  float c1[N];
++  float c2[N];
++  float c3[N];
++
++  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
++      a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
++      a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
++      a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
++
++      c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]);
++      c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]);
++      c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]);
++      c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]);
++    }
++
++  float sum = 0;
++  for (int i = 0; i < N; i++)
++    {
++      sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 18;
++  int i2 = 6;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 4;
++      input2[i] = i * 2;
++    }
++  float sum = foo (input1, i1, input2, i2);
++  if (fabs (sum - 106041168) > eps)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c
+new file mode 100644
+index 000000000..8ba1b1b6d
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 16
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned char c0[N], c1[N];
++  for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 6;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 5;
++      input2[i] = i * 2;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 3280)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c
+new file mode 100644
+index 000000000..a154f012a
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 32
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++  int i = 0;
++  int sum = 0;
++  unsigned char c0[N], c1[N];
++  for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++    {
++      c0[i] = pix1[0] - pix2[0];
++      c1[i] = pix1[1] - pix2[1];
++    }
++  for (int i = 0; i < N; i++)
++    {
++      sum += c0[i] + c1[i];
++    }
++  return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++  unsigned char input1[M];
++  unsigned char input2[M];
++  int i1 = 6;
++  int i2 = 4;
++  check_vect ();
++  for (int i = 0; i < M; i++)
++    {
++      input1[i] = i * 5;
++      input2[i] = i * 2;
++    }
++  int sum = foo (input1, i1, input2, i2);
++  if (sum != 7584)
++    {
++      abort ();
++    }
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
+index dcaef1e0a..ae5212411 100644
+--- a/gcc/testsuite/gcc.dg/vect/vect.exp
++++ b/gcc/testsuite/gcc.dg/vect/vect.exp
+@@ -117,6 +117,13 @@ et-dg-runtest dg-runtest [lsort \
+ 	[glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
+ 	"" $DEFAULT_VECTCFLAGS
+ 
++# -ftree-slp-transpose-vectorize SLP tests
++set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
++lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize"
++et-dg-runtest dg-runtest [lsort \
++	[glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \
++	"" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3"
++
+ # -ffast-math tests
+ set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+ lappend DEFAULT_VECTCFLAGS "-ffast-math"
+diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
+index 606eb05e6..8d118e987 100644
+--- a/gcc/tree-loop-distribution.cc
++++ b/gcc/tree-loop-distribution.cc
+@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3.  If not see
+ |D(I) = A(I-1)*E
+ |ENDDO
+ 
++   If an unvectorizable loop has grouped loads, and calculations from grouped
++   loads are isomorphic, build temp arrays using stmts where isomorphic
++   calculations end.  After distribution, the partition built from temp
++   arrays can be vectorized in pass SLP after loop unrolling.  For example,
++
++   |DO I = 1, N
++   |  A = FOO (ARG_1);
++   |  B = FOO (ARG_2);
++   |  C = BAR_0 (A);
++   |  D = BAR_1 (B);
++   |ENDDO
++
++   is transformed to
++
++   |DO I = 1, N
++   |  J = FOO (ARG_1);
++   |  K = FOO (ARG_2);
++   |  X[I] = J;
++   |  Y[I] = K;
++   |  A = X[I];
++   |  B = Y[I];
++   |  C = BAR_0 (A);
++   |  D = BAR_1 (B);
++   |ENDDO
++
++   and is then distributed to
++
++   |DO I = 1, N
++   |  J = FOO (ARG_1);
++   |  K = FOO (ARG_2);
++   |  X[I] = J;
++   |  Y[I] = K;
++   |ENDDO
++
++   |DO I = 1, N
++   |  A = X[I];
++   |  B = Y[I];
++   |  C = BAR_0 (A);
++   |  D = BAR_1 (B);
++   |ENDDO
++
+    Loop distribution is the dual of loop fusion.  It separates statements
+    of a loop (or loop nest) into multiple loops (or loop nests) with the
+    same loop header.  The major goal is to separate statements which may
+@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3.  If not see
+ 
+    1) Seed partitions with specific type statements.  For now we support
+       two types seed statements: statement defining variable used outside
+-      of loop; statement storing to memory.
++      of loop; statement storing to memory.  Moreover, for unvectorizable
++      loops, we try to find isomorphic stmts from grouped load and build
++      temp arrays as new seed statements.
+    2) Build reduced dependence graph (RDG) for loop to be distributed.
+       The vertices (RDG:V) model all statements in the loop and the edges
+       (RDG:E) model flow and control dependencies between statements.
+@@ -90,6 +133,8 @@ along with GCC; see the file COPYING3.  If not see
+    data reuse.  */
+ 
+ #include "config.h"
++#define INCLUDE_MAP
++#define INCLUDE_ALGORITHM
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -115,6 +160,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "tree-vectorizer.h"
+ #include "tree-eh.h"
+ #include "gimple-fold.h"
++#include "optabs-tree.h"
+ #include "tree-affine.h"
+ #include "intl.h"
+ #include "rtl.h"
+@@ -188,6 +234,52 @@ struct rdg_vertex
+ #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
+ #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
+ 
++/* Results of isomorphic group analysis.  */
++#define UNINITIALIZED (0)
++#define ISOMORPHIC (1)
++#define HETEROGENEOUS (1 << 1)
++#define UNCERTAIN (1 << 2)
++
++/* Information of a stmt while analyzing isomorphic use in group.  */
++
++typedef struct _group_info
++{
++  gimple *stmt;
++
++  /* True if stmt can be a cut point.  */
++  bool cut_point;
++
++  /* For use_stmt with two rhses, one of which is the lhs of stmt.
++     If the other is unknown to be isomorphic, mark it uncertain.  */
++  bool uncertain;
++
++  /* Searching of isomorphic stmt reaches heterogeneous groups or reaches
++     MEM stmts.  */
++  bool done;
++
++  _group_info ()
++  {
++    stmt = NULL;
++    cut_point = false;
++    uncertain = false;
++    done = false;
++  }
++} *group_info;
++
++/* PAIR of cut points and corresponding profit.  */
++typedef std::pair<vec<gimple *> *, int> stmts_profit;
++
++/* MAP of vector factor VF and corresponding stmts_profit PAIR.  */
++typedef std::map<unsigned, stmts_profit> vf_stmts_profit_map;
++
++/* PAIR of group_num and iteration_num.  We consider rhses from the same
++   group and iteration are isomorphic.  */
++typedef std::pair<unsigned, unsigned> group_iteration;
++
++/* An isomorphic stmt is determined by lhs of use_stmt, group_num and
++   the iteration_num when we insert this stmt to this map.  */
++typedef std::map<tree, group_iteration> isomer_stmt_lhs;
++
+ /* Data dependence type.  */
+ 
+ enum rdg_dep_type
+ {
+@@ -600,13 +692,14 @@ class loop_distribution
+   /* Returns true when PARTITION1 and PARTITION2 access the same memory
+      object in RDG.  */
+   bool share_memory_accesses (struct graph *rdg,
+-			      partition *partition1, partition *partition2);
++			      partition *partition1, partition *partition2,
++			      hash_set<tree> *excluded_arrays);
+ 
+   /* For each seed statement in STARTING_STMTS, this function builds
+      partition for it by adding depended statements according to RDG.
+      All partitions are recorded in PARTITIONS.  */
+   void rdg_build_partitions (struct graph *rdg,
+-			     vec<gimple *> starting_stmts,
++			     vec<gimple *> *starting_stmts,
+ 			     vec<partition *> *partitions);
+ 
+   /* Compute partition dependence created by the data references in DRS1
+@@ -643,15 +736,50 @@ class loop_distribution
+ 
+   /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
+      ALIAS_DDRS contains ddrs which need runtime alias check.  */
+-  void finalize_partitions (class loop *loop, vec<struct partition *>
+-			    *partitions, vec<ddr_p> *alias_ddrs);
++  void finalize_partitions (class loop *loop,
++			    vec<struct partition *> *partitions,
++			    vec<ddr_p> *alias_ddrs, bitmap producers);
++
++  /* Analyze the loop form and whether it is vectorizable to decide whether
++     we need to insert temp arrays to distribute it.  */
++  bool may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
++			       control_dependences *cd);
++
++  /* Reset gimple_uid of GIMPLE_DEBUG and GIMPLE_LABEL to -1.  */
++  void reset_gimple_uid (loop_p loop);
++
++  bool check_loop_vectorizable (loop_p loop);
++
++  inline void rebuild_rdg (loop_p loop, struct graph *&rdg,
++			   control_dependences *cd);
++
++  /* If loop is not distributed, remove inserted temp arrays.  */
++  void remove_insertion (loop_p loop, struct graph *flow_only_rdg,
++			 bitmap producers, struct partition *partition);
++
++  /* Insert temp arrays if isomorphic computation exists.  Temp arrays will be
++     regarded as SEED_STMTS for building partitions in succeeding processes.  */
++  bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
++			   hash_set<tree> *tmp_array_vars, bitmap producers);
++
++  void build_producers (loop_p loop, bitmap producers,
++			vec<gimple *> &transformed);
++
++  void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv,
++		     bitmap cut_points, hash_set<tree> *tmp_array_vars,
++		     bitmap producers);
++
++  /* Fuse PARTITIONS built from inserted temp arrays into one partition,
++     fuse the rest into another.  */
++  void merge_remaining_partitions (vec<struct partition *> *partitions,
++				   bitmap producers);
+ 
+   /* Distributes the code from LOOP in such a way that producer statements
+      are placed before consumer statements.  Tries to separate only the
+      statements from STMTS into separate loops.  Returns the number of
+      distributed loops.  Set NB_CALLS to number of generated builtin calls.
+      Set *DESTROY_P to whether LOOP needs to be destroyed.  */
+-  int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
++  int distribute_loop (class loop *loop, vec<gimple *> &stmts,
+ 		       control_dependences *cd, int *nb_calls, bool *destroy_p,
+ 		       bool only_patterns_p);
+ 
+@@ -1893,7 +2021,8 @@ loop_distribution::classify_partition (loop_p loop,
+ 
+ bool
+ loop_distribution::share_memory_accesses (struct graph *rdg,
+-		partition *partition1, partition *partition2)
++		partition *partition1, partition *partition2,
++		hash_set<tree> *excluded_arrays)
+ {
+   unsigned i, j;
+   bitmap_iterator bi, bj;
+@@ -1927,7 +2056,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
+ 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
+ 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
+ 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
+-	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
++	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)
++	      /* An exception, if PARTITION1 and PARTITION2 contain the
++		 temp array we inserted, do not merge them.  */
++	      && !excluded_arrays->contains (DR_REF (dr1)))
+ 	    return true;
+ 	}
+     }
+@@ -1941,14 +2073,14 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
+ 
+ void
+ loop_distribution::rdg_build_partitions (struct graph *rdg,
+-					 vec<gimple *> starting_stmts,
++					 vec<gimple *> *starting_stmts,
+ 					 vec<partition *> *partitions)
+ {
+   auto_bitmap processed;
+   int i;
+   gimple *stmt;
+ 
+-  FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
++  FOR_EACH_VEC_ELT (*starting_stmts, i, stmt)
+     {
+       int v = rdg_vertex_for_stmt (rdg, stmt);
+ 
+@@ -2912,13 +3044,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions)
+     }
+ }
+ 
++void
++loop_distribution::merge_remaining_partitions
++		   (vec<struct partition *> *partitions,
++		    bitmap producers)
++{
++  struct partition *partition = NULL;
++  struct partition *p1 = NULL, *p2 = NULL;
++  for (unsigned i = 0; partitions->iterate (i, &partition); i++)
++    {
++      if (bitmap_intersect_p (producers, partition->stmts))
++	{
++	  if (p1 == NULL)
++	    {
++	      p1 = partition;
++	      continue;
++	    }
++	  partition_merge_into (NULL, p1, partition, FUSE_FINALIZE);
++	}
++      else
++	{
++	  if (p2 == NULL)
++	    {
++	      p2 = partition;
++	      continue;
++	    }
++	  partition_merge_into (NULL, p2, partition, FUSE_FINALIZE);
++	}
++      partitions->unordered_remove (i);
++      partition_free (partition);
++      i--;
++    }
++}
++
+ void
+ loop_distribution::finalize_partitions (class loop *loop,
+-					vec<struct partition *> *partitions,
+-					vec<ddr_p> *alias_ddrs)
++					vec<struct partition *> *partitions,
++					vec<ddr_p> *alias_ddrs,
++					bitmap producers)
+ {
+   unsigned i;
+-  struct partition *partition, *a;
++  struct partition *partition;
+ 
+   if (partitions->length () == 1
+       || alias_ddrs->length () > 0)
+@@ -2950,13 +3116,7 @@ loop_distribution::finalize_partitions (class loop *loop,
+       || (loop->inner == NULL
+ 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
+     {
+-      a = (*partitions)[0];
+-      for (i = 1; partitions->iterate (i, &partition); ++i)
+-	{
+-	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
+-	  partition_free (partition);
+-	}
+-      partitions->truncate (1);
++      merge_remaining_partitions (partitions, producers);
+     }
+ 
+   /* Fuse memset builtins if possible.  */
+@@ -2964,6 +3124,1216 @@ loop_distribution::finalize_partitions (class loop *loop,
+     fuse_memset_builtins (partitions);
+ }
+ 
++/* Gimple uids of GIMPLE_DEBUG and GIMPLE_LABEL were changed during function
++   vect_analyze_loop, reset them to -1.  */
++
++void
++loop_distribution::reset_gimple_uid (loop_p loop)
++{
++  basic_block *bbs = get_loop_body_in_custom_order (loop, this,
++						    bb_top_order_cmp_r);
++  for (int i = 0; i < int (loop->num_nodes); i++)
++    {
++      basic_block bb = bbs[i];
++      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
++	   gsi_next (&gsi))
++	{
++	  gimple *stmt = gsi_stmt (gsi);
++	  if (is_gimple_debug (stmt) || gimple_code (stmt) == GIMPLE_LABEL)
++	    gimple_set_uid (stmt, -1);
++	}
++    }
++  free (bbs);
++}
++
++bool
++loop_distribution::check_loop_vectorizable (loop_p loop)
++{
++  vec_info_shared shared;
++  vect_analyze_loop (loop, &shared, true);
++  loop_vec_info vinfo = loop_vec_info_for_loop (loop);
++  reset_gimple_uid (loop);
++  if (vinfo == NULL)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file,
++		 "Loop %d no temp array insertion: bad data access pattern,"
++		 " unable to generate loop_vinfo.\n", loop->num);
++      return false;
++    }
++  if (vinfo->vectorizable)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Loop %d no temp array insertion: original loop"
++			    " can be vectorized without distribution.\n",
++		 loop->num);
++      delete vinfo;
++      loop->aux = NULL;
++      return false;
++    }
++  if (vinfo->grouped_loads.length () == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Loop %d no temp array insertion: original loop"
++			    " has no grouped loads.\n" , loop->num);
++      delete vinfo;
++      loop->aux = NULL;
++      return false;
++    }
++  return true;
++}
++
++inline void
++loop_distribution::rebuild_rdg (loop_p loop, struct graph *&rdg,
++				control_dependences *cd)
++{
++  free_rdg (rdg);
++  rdg = build_rdg (loop, cd);
++  gcc_checking_assert (rdg != NULL);
++}
++
++bool
++loop_distribution::may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
++					   control_dependences *cd)
++{
++  if (!(flag_tree_slp_transpose_vectorize && flag_tree_loop_vectorize))
++    return false;
++
++  /* Only loops with two basic blocks HEADER and LATCH are supported.  HEADER
++     is the main body of a LOOP and LATCH is the basic block that controls the
++     LOOP execution.  The size of a temp array is determined by the loop
++     execution count, so it must be a constant.  */
++  tree loop_extent = number_of_latch_executions (loop);
++  if (loop->inner != NULL || loop->num_nodes > 2
++      || TREE_CODE (loop_extent) != INTEGER_CST)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Loop %d: no temp array insertion: bad loop"
++			    " form.\n", loop->num);
++      return false;
++    }
++
++  if (loop->dont_vectorize)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Loop %d: no temp array insertion: this loop"
++			    " should never be vectorized.\n",
++		 loop->num);
++      return false;
++    }
++
++  /* Do not distribute a LOOP that is able to be vectorized without
++     distribution.  */
++  if (!check_loop_vectorizable (loop))
++    {
++      rebuild_rdg (loop, rdg, cd);
++      return false;
++    }
++
++  rebuild_rdg (loop, rdg, cd);
++  return true;
++}
++
++/* Return the max length of grouped loads if all group lengths satisfy
++   len = 2 ^ n.  Otherwise, return 0.  */
++
++static unsigned
++get_max_vf (loop_vec_info vinfo)
++{
++  unsigned size = 0;
++  unsigned max = 0;
++  stmt_vec_info stmt_info;
++  unsigned i = 0;
++  FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
++    {
++      size = stmt_info->size;
++      if (!pow2p_hwi (size))
++	return 0;
++      max = size > max ? size : max;
++    }
++  return max;
++}
++
++/* Convert grouped_loads from a linked list to vectors of length VF.  Init
++   group_info of each stmt in the same group and put them into a vector.
++   These vectors constitute WORKLISTS.  We will re-analyze a group if it is
++   uncertain, so we regard WORKLISTS as a circular queue.  */
++
++static unsigned
++build_queue (loop_vec_info vinfo, unsigned vf,
++	     vec<vec<group_info> *> &worklists)
++{
++  stmt_vec_info stmt_info;
++  unsigned i = 0;
++  group_info ginfo = NULL;
++  vec<group_info> *worklist = NULL;
++  FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
++    {
++      unsigned group_size = stmt_info->size;
++      stmt_vec_info c_stmt_info = stmt_info;
++      bool succ = true;
++      while (group_size >= vf)
++	{
++	  vec_alloc (worklist, vf);
++	  for (unsigned j = 0; j < vf; ++j)
++	    {
++	      if (c_stmt_info == NULL)
++		{
++		  succ = false;
++		  break;
++		}
++	      ginfo = new _group_info ();
++	      ginfo->stmt = c_stmt_info->stmt;
++	      worklist->safe_push (ginfo);
++	      c_stmt_info = c_stmt_info->next_element;
++	    }
++	  if (!succ)
++	    {
++	      unsigned k = 0;
++	      ginfo = NULL;
++	      FOR_EACH_VEC_ELT (*worklist, k, ginfo)
++		delete ginfo;
++	      vec_free (worklist);
++	      break;
++	    }
++	  worklists.safe_push (worklist);
++	  group_size -= vf;
++	}
++    }
++  return worklists.length ();
++}
++
++static bool
++check_same_oprand_type (tree op1, tree op2)
++{
++  tree type1 = TREE_TYPE (op1);
++  tree type2 = TREE_TYPE (op2);
++  if (TREE_CODE (type1) != INTEGER_TYPE && TREE_CODE (type1) != REAL_TYPE)
++    return false;
++
++  return (TREE_CODE (type1) == TREE_CODE (type2)
++	  && TYPE_UNSIGNED (type1) == TYPE_UNSIGNED (type2)
++	  && TYPE_PRECISION (type1) == TYPE_PRECISION (type2));
++}
++
++static bool
++bit_field_p (gimple *stmt)
++{
++  unsigned i = 0;
++  auto_vec<data_reference_p> datarefs_vec;
++  data_reference_p dr;
++  if (!find_data_references_in_stmt (NULL, stmt, &datarefs_vec))
++    return true;
++
++  FOR_EACH_VEC_ELT (datarefs_vec, i, dr)
++    {
++      if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
++	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
++	return true;
++    }
++  return false;
++}
++
++static inline bool
++shift_operation (enum tree_code op)
++{
++  return op == LSHIFT_EXPR || op == RSHIFT_EXPR || op == LROTATE_EXPR
++	 || op == RROTATE_EXPR;
++}
++
++/* Return relationship between USE_STMT and the first use_stmt of the group.
++   RHS1 is the lhs of stmt recorded in group_info.  If another rhs of use_stmt
++   is not a constant, return UNCERTAIN and re-check it later.  */
++
++static unsigned
++check_isomorphic (gimple *use_stmt, gimple *first,
++		  tree rhs1, vec<tree> &hetero_lhs)
++{
++  /* Check same operation.  */
++  enum tree_code rhs_code_first = gimple_assign_rhs_code (first);
++  enum tree_code rhs_code_current = gimple_assign_rhs_code (use_stmt);
++  if (rhs_code_first != rhs_code_current)
++    return HETEROGENEOUS;
++
++  /* For shift operations, operands should be equal.  */
++  if (shift_operation (rhs_code_current))
++    {
++      tree shift_op_first = gimple_assign_rhs2 (first);
++      tree shift_op_current = gimple_assign_rhs2 (use_stmt);
++      if (!operand_equal_p (shift_op_first, shift_op_current, 0)
++	  || !TREE_CONSTANT (shift_op_first))
++	return HETEROGENEOUS;
++
++      return ISOMORPHIC;
++    }
++  /* Type conversion expr or assignment.  */
++  if (gimple_num_ops (first) == 2)
++    return (rhs_code_first == NOP_EXPR || rhs_code_first == CONVERT_EXPR
++	    || rhs_code_first == SSA_NAME) ? ISOMORPHIC : HETEROGENEOUS;
++
++  /* We find USE_STMT from lhs of a stmt, denote it as rhs1 of USE_STMT and
++     the other one as rhs2.  Check if the define-stmt of current rhs2 is
++     isomorphic with the define-stmt of rhs2 in the first USE_STMT of this
++     group.
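++
++     For instance (an illustrative sketch reusing the SSA names of the
++     example before find_isomorphic_stmts below): if the first use_stmt of
++     the group is _111 = _1 + _11 and the current one is _112 = _2 + _12
++     with RHS1 = _2, then rhs2_first is _11 and rhs2_curr is _12, and the
++     two use_stmts stay isomorphic as long as _11 and _12 are themselves
++     isomorphic.  */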
++  tree rhs2_first = gimple_assign_rhs1 (use_stmt) == rhs1
++		    ? gimple_assign_rhs2 (first) : gimple_assign_rhs1 (first);
++  tree rhs2_curr = gimple_assign_rhs1 (use_stmt) == rhs1
++		   ? gimple_assign_rhs2 (use_stmt) : gimple_assign_rhs1 (use_stmt);
++
++  if (check_same_oprand_type (rhs2_first, rhs2_curr))
++    {
++      if (TREE_CONSTANT (rhs2_curr))
++	return ISOMORPHIC;
++      else if (hetero_lhs.contains (rhs2_curr))
++	return HETEROGENEOUS;
++
++      /* Provisionally set the stmt as uncertain and analyze the whole group
++	 in function CHECK_UNCERTAIN later if all use_stmts are uncertain.  */
++      return UNCERTAIN;
++    }
++  return HETEROGENEOUS;
++}
++
++static bool
++unsupported_operations (gimple *stmt)
++{
++  enum tree_code code = gimple_assign_rhs_code (stmt);
++  return code == COND_EXPR;
++}
++
++/* Check if the single use_stmt of STMT is isomorphic with the first one's
++   use_stmt in current group.  */
++
++static unsigned
++check_use_stmt (group_info elmt, gimple *&first,
++		vec<gimple *> &tmp_stmts, vec<tree> &hetero_lhs)
++{
++  if (gimple_code (elmt->stmt) != GIMPLE_ASSIGN)
++    return HETEROGENEOUS;
++  use_operand_p dummy;
++  tree lhs = gimple_assign_lhs (elmt->stmt);
++  gimple *use_stmt = NULL;
++  single_imm_use (lhs, &dummy, &use_stmt);
++  /* STMTs with three rhs are not supported, e.g., GIMPLE_COND.  */
++  if (use_stmt == NULL || gimple_code (use_stmt) != GIMPLE_ASSIGN
++      || unsupported_operations (use_stmt) || bit_field_p (use_stmt))
++    return HETEROGENEOUS;
++  tmp_stmts.safe_push (use_stmt);
++  if (first == NULL)
++    {
++      first = use_stmt;
++      return UNINITIALIZED;
++    }
++  /* Check if current use_stmt and the first member's use_stmt in the group
++     are of the same type.  */
++  tree first_lhs = gimple_assign_lhs (first);
++  tree curr_lhs = gimple_assign_lhs (use_stmt);
++  if (!check_same_oprand_type (first_lhs, curr_lhs))
++    return HETEROGENEOUS;
++  return check_isomorphic (use_stmt, first, lhs, hetero_lhs);
++}
++
++/* Replace stmt field in group with stmts in TMP_STMTS, and insert their
++   lhs_info to ISOMER_LHS.  */
++
++static void
++update_isomer_lhs (vec<group_info> *group, unsigned group_num,
++		   unsigned iteration, isomer_stmt_lhs &isomer_lhs,
++		   vec<gimple *> &tmp_stmts, int &profit,
++		   vec<unsigned> &merged_groups)
++{
++  group_info elmt = NULL;
++  /* Do not insert temp array if isomorphic stmts from grouped load have
++     only casting operations.  Once an isomorphic calculation has 3 operands,
++     such as a plus operation, this group can be regarded as a cut point.  */
++  bool operated = (gimple_num_ops (tmp_stmts[0]) == 3);
++  /* Do not insert temp arrays if search of isomorphic stmts reaches
++     MEM stmts.  */
++  bool has_vdef = gimple_vdef (tmp_stmts[0]) != NULL;
++  bool merge = false;
++  for (unsigned i = 0; i < group->length (); i++)
++    {
++      elmt = (*group)[i];
++      elmt->stmt = has_vdef ? NULL : tmp_stmts[i];
++      elmt->cut_point = has_vdef ? false : (elmt->cut_point || operated);
++      elmt->uncertain = false;
++      elmt->done = has_vdef;
++      tree lhs = gimple_assign_lhs (tmp_stmts[i]);
++      if (isomer_lhs.find (lhs) != isomer_lhs.end ())
++	{
++	  merge = true;
++	  continue;
++	}
++      isomer_lhs[lhs] = std::make_pair (group_num, iteration);
++    }
++  if (merge)
++    {
++      merged_groups.safe_push (group_num);
++      profit = 0;
++      return;
++    }
++  enum vect_cost_for_stmt kind = scalar_stmt;
++  int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++  profit = (tmp_stmts.length () - 1) * scalar_cost;
++}
++
++/* Try to find rhs2 in ISOMER_LHS; if all rhs2 were found and their group_num
++   and iteration are the same, GROUP is isomorphic.
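++
++   E.g. (an illustrative sketch with made-up SSA names): for use_stmts
++   _21 = _111 + _5 and _22 = _112 + _6, the rhs2 operands are _5 and _6;
++   if ISOMER_LHS maps both to the same (group_num, iteration) pair the
++   result is ISOMORPHIC, if some of them are not recorded yet it is
++   UNCERTAIN, and any mismatch makes the group HETEROGENEOUS.  */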
++
++static unsigned
++check_isomorphic_rhs (vec<group_info> *group, vec<gimple *> &tmp_stmts,
++		      isomer_stmt_lhs &isomer_lhs)
++{
++  group_info elmt = NULL;
++  gimple *stmt = NULL;
++  unsigned j = 0;
++  unsigned group_num = -1u;
++  unsigned iteration = -1u;
++  tree rhs1 = NULL;
++  tree rhs2 = NULL;
++  unsigned status = UNINITIALIZED;
++  FOR_EACH_VEC_ELT (*group, j, elmt)
++    {
++      rhs1 = gimple_assign_lhs (elmt->stmt);
++      stmt = tmp_stmts[j];
++      rhs2 = (rhs1 == gimple_assign_rhs1 (stmt))
++	     ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
++      isomer_stmt_lhs::iterator iter = isomer_lhs.find (rhs2);
++      if (iter != isomer_lhs.end ())
++	{
++	  if (group_num == -1u)
++	    {
++	      group_num = iter->second.first;
++	      iteration = iter->second.second;
++	      status |= ISOMORPHIC;
++	      continue;
++	    }
++	  if (iter->second.first == group_num
++	      && iter->second.second == iteration)
++	    {
++	      status |= ISOMORPHIC;
++	      continue;
++	    }
++	  return HETEROGENEOUS;
++	}
++      else
++	status |= UNCERTAIN;
++    }
++  return status;
++}
++
++/* Update group_info for uncertain groups.  */
++
++static void
++update_uncertain_stmts (vec<group_info> *group, unsigned group_num,
++			unsigned iteration, vec<gimple *> &tmp_stmts)
++{
++  unsigned j = 0;
++  group_info elmt = NULL;
++  FOR_EACH_VEC_ELT (*group, j, elmt)
++    {
++      elmt->uncertain = true;
++      elmt->done = false;
++    }
++}
++
++/* Push stmts in TMP_STMTS into HETERO_LHS.  */
++
++static void
++set_hetero (vec<group_info> *group, vec<tree> &hetero_lhs,
++	    vec<gimple *> &tmp_stmts)
++{
++  group_info elmt = NULL;
++  unsigned i = 0;
++  for (i = 0; i < group->length (); i++)
++    {
++      elmt = (*group)[i];
++      elmt->uncertain = false;
++      elmt->done = true;
++    }
++  gimple *stmt = NULL;
++  FOR_EACH_VEC_ELT (tmp_stmts, i, stmt)
++    if (stmt != NULL)
++      hetero_lhs.safe_push (gimple_assign_lhs (stmt));
++}
++
++/* Given an uncertain group, TMP_STMTS are use_stmts of stmts in GROUP.
++   Rhs1 is the lhs of stmt in GROUP, rhs2 is the other rhs of USE_STMT.
++
++   Try to find rhs2 in ISOMER_LHS; if all found rhs2 have the same group_num
++   and iteration, this uncertain group is isomorphic.
++
++   If no rhs matched, this GROUP remains uncertain; update group_info.
++
++   Otherwise, this GROUP is heterogeneous and return true to end analysis
++   for this group.  */
++
++static bool
++check_uncertain (vec<group_info> *group, unsigned group_num,
++		 unsigned iteration, int &profit,
++		 vec<gimple *> &tmp_stmts, isomer_stmt_lhs &isomer_lhs,
++		 vec<tree> &hetero_lhs, vec<unsigned> &merged_groups)
++{
++  unsigned status = check_isomorphic_rhs (group, tmp_stmts, isomer_lhs);
++  bool done = false;
++  switch (status)
++    {
++      case UNCERTAIN:
++	update_uncertain_stmts (group, group_num, iteration, tmp_stmts);
++	break;
++      case ISOMORPHIC:
++	update_isomer_lhs (group, group_num, iteration, isomer_lhs,
++			   tmp_stmts, profit, merged_groups);
++	break;
++      default:
++	set_hetero (group, hetero_lhs, tmp_stmts);
++	done = true;
++    }
++  return done;
++}
++
++/* Return false if analysis of this group is not finished, e.g., isomorphic or
++   uncertain.  Calculate the profit if vectorized.  */
++
++static bool
++check_group (vec<group_info> *group, unsigned group_num, unsigned iteration,
++	     int &profit, vec<unsigned> &merged_groups,
++	     isomer_stmt_lhs &isomer_lhs, vec<tree> &hetero_lhs)
++{
++  unsigned j = 0;
++  group_info elmt = NULL;
++  gimple *first = NULL;
++  unsigned res = 0;
++  /* Record single use stmts in TMP_STMTS and decide whether to replace stmts
++     in ginfo in succeeding processes.  */
++  auto_vec<gimple *> tmp_stmts;
++  FOR_EACH_VEC_ELT (*group, j, elmt)
++    {
++      if (merged_groups.contains (group_num))
++	return true;
++      res |= check_use_stmt (elmt, first, tmp_stmts, hetero_lhs);
++    }
++
++  /* Update each group member according to RES.  */
++  switch (res)
++    {
++      case ISOMORPHIC:
++	update_isomer_lhs (group, group_num, iteration, isomer_lhs,
++			   tmp_stmts, profit, merged_groups);
++	return false;
++      case UNCERTAIN:
++	return check_uncertain (group, group_num, iteration, profit,
++				tmp_stmts, isomer_lhs, hetero_lhs,
++				merged_groups);
++      default:
++	set_hetero (group, hetero_lhs, tmp_stmts);
++	return true;
++    }
++}
++
++/* Return true if all analyses are done except uncertain groups.  */
++
++static bool
++end_of_search (vec<vec<group_info> *> &circular_queue,
++	       vec<unsigned> &merged_groups)
++{
++  unsigned i = 0;
++  vec<group_info> *group = NULL;
++  group_info elmt = NULL;
++  FOR_EACH_VEC_ELT (circular_queue, i, group)
++    {
++      if (merged_groups.contains (i))
++	continue;
++      elmt = (*group)[0];
++      /* If there are any isomorphic use_stmts, continue analysis of
++	 isomorphic use_stmts.  */
++      if (!elmt->done && !elmt->uncertain)
++	return false;
++    }
++  return true;
++}
++
++/* Push valid stmts to STMTS as cutpoints.  */
++
++static bool
++check_any_cutpoints (vec<vec<group_info> *> &circular_queue,
++		     vec<gimple *> *&stmts, vec<unsigned> &merged_groups)
++{
++  unsigned front = 0;
++  vec<group_info> *group = NULL;
++  group_info elmt = NULL;
++  unsigned max = circular_queue.length () * circular_queue[0]->length ();
++  vec_alloc (stmts, max);
++  while (front < circular_queue.length ())
++    {
++      unsigned i = 0;
++      if (merged_groups.contains (front))
++	{
++	  front++;
++	  continue;
++	}
++      group = circular_queue[front++];
++      FOR_EACH_VEC_ELT (*group, i, elmt)
++	if (elmt->stmt != NULL && elmt->done && elmt->cut_point)
++	  stmts->safe_push (elmt->stmt);
++    }
++  return stmts->length () != 0;
++}
++
++/* Grouped loads are isomorphic.  Make a pair of group number and iteration,
++   and map each load stmt to this pair.  We set iteration 0 here.  */
++
++static void
++init_isomer_lhs (vec<vec<group_info> *> &groups, isomer_stmt_lhs &isomer_lhs)
++{
++  vec<group_info> *group = NULL;
++  group_info elmt = NULL;
++  unsigned i = 0;
++  FOR_EACH_VEC_ELT (groups, i, group)
++    {
++      unsigned j = 0;
++      FOR_EACH_VEC_ELT (*group, j, elmt)
++	isomer_lhs[gimple_assign_lhs (elmt->stmt)] = std::make_pair (i, 0);
++    }
++}
++
++/* It's not a strict analysis of load/store profit.  Assume scalar and vector
++   loads/stores are of the same cost.  The result PROFIT equals the profit
++   from vectorizing the scalar loads/stores minus the cost of the vectorized
++   loads/stores.  */
++
++static int
++load_store_profit (unsigned scalar_mem_ops, unsigned vf, unsigned new_mem_ops)
++{
++  int profit = 0;
++  enum vect_cost_for_stmt kind = scalar_load;
++  int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++  profit += (scalar_mem_ops - (scalar_mem_ops / vf)) * scalar_cost;
++  profit -= new_mem_ops / vf * scalar_cost;
++  kind = scalar_store;
++  scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++  profit -= new_mem_ops / vf * scalar_cost;
++  return profit;
++}
++
++/* Breadth first search the graph consisting of define-use chains starting
++   from the circular queue initialized by function BUILD_QUEUE.  Find the
++   single use of each stmt in a group and check if they are isomorphic.
++   Isomorphic is defined as same rhs type, same operator, and isomorphic
++   calculation of each rhs starting from load.  If another rhs is uncertain
++   to be isomorphic, put it at the end of the circular queue and re-analyze
++   it during the next iteration.
++   If a group shares the same use_stmt with another group, skip one of them
++   in succeeding processes as merged.  Iterate the circular queue until all
++   remaining groups are heterogeneous or the search reaches MEM stmts.  If
++   all other groups have finished the analysis, and the remaining groups are
++   uncertain, return false to avoid an endless loop.  */
++
++bool
++bfs_find_isomer_stmts (vec<vec<group_info> *> &circular_queue,
++		       stmts_profit &profit_pair, unsigned vf,
++		       bool &reach_vdef)
++{
++  isomer_stmt_lhs isomer_lhs;
++  auto_vec<tree> hetero_lhs;
++  auto_vec<unsigned> merged_groups;
++  vec<group_info> *group = NULL;
++  /* True if analysis finishes.  */
++  bool done = false;
++  int profit_sum = 0;
++  vec<gimple *> *stmts = NULL;
++  init_isomer_lhs (circular_queue, isomer_lhs);
++  for (unsigned i = 1; !done; ++i)
++    {
++      unsigned front = 0;
++      /* Re-initialize DONE to TRUE while a new iteration begins.  */
++      done = true;
++      while (front < circular_queue.length ())
++	{
++	  int profit = 0;
++	  group = circular_queue[front];
++	  done &= check_group (group, front, i, profit, merged_groups,
++			       isomer_lhs, hetero_lhs);
++	  profit_sum += profit;
++	  if (profit != 0 && (*group)[0]->stmt == NULL)
++	    {
++	      reach_vdef = true;
++	      return false;
++	    }
++	  ++front;
++	}
++      /* Uncertain result, return.  */
++      if (!done && end_of_search (circular_queue, merged_groups))
++	return false;
++    }
++  if (check_any_cutpoints (circular_queue, stmts, merged_groups))
++    {
++      profit_pair.first = stmts;
++      unsigned loads = circular_queue.length () * circular_queue[0]->length ();
++      profit_pair.second = profit_sum + load_store_profit (loads, vf,
++							   stmts->length ());
++      if (profit_pair.second > 0)
++	return true;
++    }
++  return false;
++}
++
++/* Free memory allocated by ginfo.  */
++
++static void
++free_ginfos (vec<vec<group_info> *> &worklists)
++{
++  vec<group_info> *worklist;
++  unsigned i = 0;
++  while (i < worklists.length ())
++    {
++      worklist = worklists[i++];
++      group_info ginfo;
++      unsigned j = 0;
++      FOR_EACH_VEC_ELT (*worklist, j, ginfo)
++	delete ginfo;
++      vec_free (worklist);
++    }
++}
++
++static void
++release_tmp_stmts (vf_stmts_profit_map &candi_stmts)
++{
++  vf_stmts_profit_map::iterator iter;
++  for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
++    iter->second.first->release ();
++}
++
++/* Choose the group of stmts with maximum profit.  */
++
++static bool
++decide_stmts_by_profit (vf_stmts_profit_map &candi_stmts, vec<gimple *> &stmts)
++{
++  vf_stmts_profit_map::iterator iter;
++  int profit = 0;
++  int max = 0;
++  vec<gimple *> *tmp = NULL;
++  for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
++    {
++      profit = iter->second.second;
++      if (profit > max)
++	{
++	  tmp = iter->second.first;
++	  max = profit;
++	}
++    }
++  if (max == 0)
++    {
++      release_tmp_stmts (candi_stmts);
++      return false;
++    }
++  unsigned i = 0;
++  gimple *stmt = NULL;
++  FOR_EACH_VEC_ELT (*tmp, i, stmt)
++    stmts.safe_push (stmt);
++  release_tmp_stmts (candi_stmts);
++  return stmts.length () != 0;
++}
++
++/* Find isomorphic stmts from grouped loads with vector factor VF.
++
++   Given the following source code, ignoring casts.
++
++     a0 = (a[0] + b[0]) + ((a[4] - b[4]) << 16);
++     a1 = (a[1] + b[1]) + ((a[5] - b[5]) << 16);
++     a2 = (a[2] + b[2]) + ((a[6] - b[6]) << 16);
++     a3 = (a[3] + b[3]) + ((a[7] - b[7]) << 16);
++
++   We get the grouped loads in VINFO as
++
++     GROUP_1            GROUP_2
++     _1 = *a            _11 = *b
++     _2 = *(a + 1)      _12 = *(b + 1)
++     _3 = *(a + 2)      _13 = *(b + 2)
++     _4 = *(a + 3)      _14 = *(b + 3)
++     _5 = *(a + 4)      _15 = *(b + 4)
++     _6 = *(a + 5)      _16 = *(b + 5)
++     _7 = *(a + 6)      _17 = *(b + 6)
++     _8 = *(a + 7)      _18 = *(b + 7)
++
++   First we try VF = 8 and get two worklists:
++
++     WORKLIST_1         WORKLIST_2
++     _1 = *a            _11 = *b
++     _2 = *(a + 1)      _12 = *(b + 1)
++     _3 = *(a + 2)      _13 = *(b + 2)
++     _4 = *(a + 3)      _14 = *(b + 3)
++     _5 = *(a + 4)      _15 = *(b + 4)
++     _6 = *(a + 5)      _16 = *(b + 5)
++     _7 = *(a + 6)      _17 = *(b + 6)
++     _8 = *(a + 7)      _18 = *(b + 7)
++
++   We find that _111 = _1 + _11 and _115 = _5 - _15 are not isomorphic,
++   so we try VF = VF / 2.
++
++     GROUP_1            GROUP_2
++     _1 = *a            _5 = *(a + 4)
++     _2 = *(a + 1)      _6 = *(a + 5)
++     _3 = *(a + 2)      _7 = *(a + 6)
++     _4 = *(a + 3)      _8 = *(a + 7)
++
++     GROUP_3            GROUP_4
++     _11 = *b           _15 = *(b + 4)
++     _12 = *(b + 1)     _16 = *(b + 5)
++     _13 = *(b + 2)     _17 = *(b + 6)
++     _14 = *(b + 3)     _18 = *(b + 7)
++
++   We first analyze group_1 and find that all operations are isomorphic;
++   then we replace the stmts in group_1 with their use_stmts.  Group_2 is
++   handled in the same way.
++
++     GROUP_1            GROUP_2
++     _111 = _1 + _11    _115 = _5 - _15
++     _112 = _2 + _12    _116 = _6 - _16
++     _113 = _3 + _13    _117 = _7 - _17
++     _114 = _4 + _14    _118 = _8 - _18
++
++   When analyzing group_3 and group_4, we find that their use_stmts are
++   the same as those of group_1 and group_2.  So group_3 is regarded as
++   merged into group_1 and group_4 as merged into group_2.  In subsequent
++   processing, we skip group_3 and group_4.
++
++   We repeat such processing until the operations are not isomorphic or
++   the search reaches MEM stmts.  In the given case, the search ends up
++   at a0, a1, a2 and a3.  */
++
++static bool
++find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts)
++{
++  unsigned vf = get_max_vf (vinfo);
++  if (vf == 0)
++    return false;
++  auto_vec<vec<group_info> *> circular_queue;
++  /* Pair of candidate cut-point stmts and their vectorization profit.  */
++  stmts_profit profit_map;
++  /* Map from vector factor to candidate stmts and profit.  */
++  vf_stmts_profit_map candi_stmts;
++  bool reach_vdef = false;
++  while (vf > 2)
++    {
++      if (build_queue (vinfo, vf, circular_queue) == 0)
++        return false;
++      if (!bfs_find_isomer_stmts (circular_queue, profit_map, vf, reach_vdef))
++        {
++          if (reach_vdef)
++            {
++              release_tmp_stmts (candi_stmts);
++              free_ginfos (circular_queue);
++              circular_queue.release ();
++              return false;
++            }
++          vf /= 2;
++          free_ginfos (circular_queue);
++          circular_queue.release ();
++          continue;
++        }
++      candi_stmts[vf] = profit_map;
++      free_ginfos (circular_queue);
++      vf /= 2;
++      circular_queue.release ();
++    }
++  return decide_stmts_by_profit (candi_stmts, stmts);
++}
++
++/* Get the iv from SEED_STMTS.  Make sure each seed_stmt has only one iv
++   as its index and that all indices are the same.
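++   For example (illustrative), seed stmts such as a[i] = x and b[i] = y
++   share the single SSA index I, whereas an access like a[i][j] = z has
++   two SSA indices and makes find_index return NULL.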
*/
++
++static tree
++find_index (vec<gimple *> seed_stmts)
++{
++  if (seed_stmts.length () == 0)
++    return NULL;
++  bool found_index = false;
++  tree index = NULL;
++  unsigned ui = 0;
++  for (ui = 0; ui < seed_stmts.length (); ui++)
++    {
++      if (!gimple_vdef (seed_stmts[ui]))
++        return NULL;
++      tree lhs = gimple_assign_lhs (seed_stmts[ui]);
++      unsigned num_index = 0;
++      while (TREE_CODE (lhs) == ARRAY_REF)
++        {
++          if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME)
++            {
++              num_index++;
++              if (num_index > 1)
++                return NULL;
++              if (index == NULL)
++                {
++                  index = TREE_OPERAND (lhs, 1);
++                  found_index = true;
++                }
++              else if (index != TREE_OPERAND (lhs, 1))
++                return NULL;
++            }
++          lhs = TREE_OPERAND (lhs, 0);
++        }
++      if (!found_index)
++        return NULL;
++    }
++  return index;
++}
++
++/* Check if the expression of a phi is an increment by a constant.  */
++
++static void
++check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc)
++{
++  struct graph_edge *e_phi;
++  for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next)
++    {
++      struct vertex *v_inc = &(rdg->vertices[e_phi->dest]);
++      if (!is_gimple_assign (RDGV_STMT (v_inc))
++          || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR)
++        continue;
++      tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc));
++      tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc));
++      if (!(integer_onep (rhs1) || integer_onep (rhs2)))
++        continue;
++      struct graph_edge *e_inc;
++      /* Find a cycle with only two vertices, inc and phi:
++         inc <--> phi.  */
++      bool found_cycle = false;
++      for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next)
++        {
++          if (e_inc->dest == e_phi->src)
++            {
++              found_cycle = true;
++              break;
++            }
++        }
++      if (!found_cycle)
++        continue;
++      found_inc = true;
++    }
++}
++
++/* Check if the phi satisfies a form like PHI <0, i>.  */
++
++static inline bool
++iv_check_phi_stmt (gimple *phi_stmt)
++{
++  return gimple_phi_num_args (phi_stmt) == 2
++         && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0))
++             || integer_zerop (gimple_phi_arg_def (phi_stmt, 1)));
++}
++
++/* Make sure the iteration variable is a phi.  */
++
++static tree
++get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts)
++{
++  tree index = find_index (seed_stmts);
++  if (index == NULL)
++    return NULL;
++  for (int i = 0; i < flow_only_rdg->n_vertices; i++)
++    {
++      struct vertex *v = &(flow_only_rdg->vertices[i]);
++      if (RDGV_STMT (v) != seed_stmts[0])
++        continue;
++      struct graph_edge *e;
++      bool found_phi = false;
++      for (e = v->pred; e; e = e->pred_next)
++        {
++          struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]);
++          gimple *phi_stmt = RDGV_STMT (v_phi);
++          if (gimple_code (phi_stmt) != GIMPLE_PHI
++              || gimple_phi_result (phi_stmt) != index)
++            continue;
++          if (!iv_check_phi_stmt (phi_stmt))
++            return NULL;
++          /* Find the inc expr among the successors of the phi.  */
++          bool found_inc = false;
++          check_phi_inc (v_phi, flow_only_rdg, found_inc);
++          if (!found_inc)
++            return NULL;
++          found_phi = true;
++          break;
++        }
++      if (!found_phi)
++        return NULL;
++      break;
++    }
++  return index;
++}
++
++/* Do not distribute the loop if vertices in ROOT_MAP have an
++   antidependence within FLOW_ONLY_RDG.
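++   The check is a forward reachability walk: starting from the vertices
++   in ROOT_MAP, reaching another ROOT_MAP vertex along any dependence
++   edge means the cut points depend on each other, so no temp array is
++   inserted.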
*/ ++ ++static bool ++check_no_dependency (struct graph *flow_only_rdg, bitmap root_map) ++{ ++ bitmap_iterator bi; ++ unsigned ui; ++ auto_vec visited_nodes; ++ auto_bitmap visited_map; ++ EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi) ++ visited_nodes.safe_push (ui); ++ for (ui = 0; ui < visited_nodes.length (); ui++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]); ++ struct graph_edge *e; ++ for (e = v->succ; e; e = e->succ_next) ++ { ++ if (bitmap_bit_p (root_map, e->dest)) ++ return false; ++ if (bitmap_bit_p (visited_map, e->dest)) ++ continue; ++ visited_nodes.safe_push (e->dest); ++ bitmap_set_bit (visited_map, e->dest); ++ } ++ } ++ return true; ++} ++ ++/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure ++ there is no dependency among those STMT we found. */ ++ ++static unsigned ++get_cut_points (struct graph *flow_only_rdg, bitmap cut_points, ++ loop_vec_info vinfo) ++{ ++ unsigned n_stmts = 0; ++ ++ /* STMTS that may be CUT_POINTS. */ ++ auto_vec stmts; ++ if (!find_isomorphic_stmts (vinfo, stmts)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "No temp array insertion: no isomorphic stmts" ++ " were found.\n"); ++ return 0; ++ } ++ ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ if (stmts.contains (RDG_STMT (flow_only_rdg, i))) ++ bitmap_set_bit (cut_points, i); ++ } ++ n_stmts = bitmap_count_bits (cut_points); ++ ++ bool succ = check_no_dependency (flow_only_rdg, cut_points); ++ if (!succ) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "No temp array inserted: data dependency" ++ " among isomorphic stmts.\n"); ++ return 0; ++ } ++ return n_stmts; ++} ++ ++static void ++build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi, ++ poly_uint64 array_extent, tree iv, ++ hash_set *tmp_array_vars, vec *transformed) ++{ ++ gimple *stmt = RDGV_STMT (v); ++ tree lhs = gimple_assign_lhs (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "original stmt:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ tree var_ssa = duplicate_ssa_name (lhs, stmt); ++ gimple_assign_set_lhs (stmt, var_ssa); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "changed to:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS); ++ } ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree vect_elt_type = TREE_TYPE (lhs); ++ tree array_type = build_array_type_nelts (vect_elt_type, array_extent); ++ tree array = create_tmp_var (array_type); ++ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa); ++ gimple *store = gimple_build_assign (array_ssa, var_ssa); ++ tree new_vdef = make_ssa_name (gimple_vop (cfun), store); ++ gsi_insert_after (&gsi, store, GSI_NEW_STMT); ++ gimple_set_vdef (store, new_vdef); ++ transformed->safe_push (store); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa2); ++ gimple *load = gimple_build_assign (lhs, array_ssa2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "insert stmt:\t"); ++ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS); ++ fprintf (dump_file, " and stmt:\t"); ++ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ gimple_set_vuse (load, new_vdef); ++ gsi_insert_after (&gsi, load, GSI_NEW_STMT); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++} ++ ++/* 
Set bitmap PRODUCERS based on vec TRANSFORMED. */ ++ ++void ++loop_distribution::build_producers (loop_p loop, bitmap producers, ++ vec &transformed) ++{ ++ auto_vec stmts; ++ stmts_from_loop (loop, &stmts); ++ int i = 0; ++ gimple *stmt = NULL; ++ ++ FOR_EACH_VEC_ELT (stmts, i, stmt) ++ gimple_set_uid (stmt, i); ++ i = 0; ++ FOR_EACH_VEC_ELT (transformed, i, stmt) ++ bitmap_set_bit (producers, stmt->uid); ++} ++ ++/* Transform stmt ++ ++ A = FOO (ARG_1); ++ ++ to ++ ++ STMT_1: A1 = FOO (ARG_1); ++ STMT_2: X[I] = A1; ++ STMT_3: A = X[I]; ++ ++ Producer is STMT_2 who defines the temp array and consumer is ++ STMT_3 who uses the temp array. */ ++ ++void ++loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg, ++ tree iv, bitmap cut_points, ++ hash_set *tmp_array_vars, ++ bitmap producers) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "=== do insertion ===\n"); ++ ++ auto_vec transformed; ++ ++ /* Execution times of loop. */ ++ poly_uint64 array_extent ++ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1; ++ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ ++ /* Find all cut points in bb and transform them. */ ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (cut_points, j)) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars, ++ &transformed); ++ } ++ } ++ } ++ build_producers (loop, producers, transformed); ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* After temp array insertion, given stmts ++ STMT_1: M = FOO (ARG_1); ++ STMT_2: X[I] = M; ++ STMT_3: A = X[I]; ++ STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next. ++ Replace M with A, and remove STMT_2 and STMT_3. */ ++ ++static void ++reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition, ++ gimple_stmt_iterator &gsi, int j) ++{ ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ gimple *stmt = RDGV_STMT (v); ++ gimple *prev = stmt->prev; ++ gimple *next = stmt->next; ++ tree n_lhs = gimple_assign_lhs (next); ++ gimple_assign_set_lhs (prev, n_lhs); ++ unlink_stmt_vdef (stmt); ++ if (partition) ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ gsi_remove (&gsi, true); ++ release_defs (stmt); ++ if (partition) ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ gsi_remove (&gsi, true); ++} ++ ++void ++loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition) ++{ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (producers, j)) ++ reset_gimple_assign (flow_only_rdg, partition, gsi, j); ++ } ++ } ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. 
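++   Each temp array is indexed by the loop IV, so after distribution the
++   producer partition fills X[i] in one loop and the consumer partition
++   reads X[i] back in another.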
*/ ++ ++bool ++loop_distribution::insert_temp_arrays (loop_p loop, vec seed_stmts, ++ hash_set *tmp_array_vars, bitmap producers) ++{ ++ struct graph *flow_only_rdg = build_rdg (loop, NULL); ++ gcc_checking_assert (flow_only_rdg != NULL); ++ tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts); ++ if (iv == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d no temp array insertion: failed to get" ++ " iteration variable.\n", loop->num); ++ free_rdg (flow_only_rdg); ++ return false; ++ } ++ auto_bitmap cut_points; ++ loop_vec_info vinfo = loop_vec_info_for_loop (loop); ++ unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo); ++ delete vinfo; ++ loop->aux = NULL; ++ if (n_cut_points == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d no temp array insertion: no cut points" ++ " found.\n", loop->num); ++ free_rdg (flow_only_rdg); ++ return false; ++ } ++ do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:" ++ " %d temp arrays inserted in Loop %d.\n", ++ n_cut_points, loop->num); ++ } ++ free_rdg (flow_only_rdg); ++ return true; ++} ++ ++static bool find_seed_stmts_for_distribution (class loop *, vec *); ++ + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of +@@ -2972,7 +4342,7 @@ loop_distribution::finalize_partitions (class loop *loop, + + int + loop_distribution::distribute_loop (class loop *loop, +- const vec &stmts, ++ vec &stmts, + control_dependences *cd, int *nb_calls, bool *destroy_p, + bool only_patterns_p) + { +@@ -3021,6 +4391,33 @@ loop_distribution::distribute_loop (class loop *loop, + return 0; + } + ++ /* Try to distribute LOOP if LOOP is simple enough and unable to vectorize. ++ If LOOP has grouped loads, recursively find isomorphic stmts and insert ++ temp arrays, rebuild RDG and call find_seed_stmts_for_distribution ++ to replace STMTS. */ ++ ++ hash_set tmp_array_vars; ++ ++ /* STMTs that define those inserted TMP_ARRAYs. */ ++ auto_bitmap producers; ++ ++ /* New SEED_STMTS after insertion. 
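++     They replace STMTS when the temp array insertion succeeds.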
*/ ++ auto_vec work_list; ++ bool insert_success = false; ++ if (may_insert_temp_arrays (loop, rdg, cd)) ++ { ++ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers)) ++ { ++ if (find_seed_stmts_for_distribution (loop, &work_list)) ++ { ++ insert_success = true; ++ } ++ else ++ remove_insertion (loop, rdg, producers, NULL); ++ rebuild_rdg (loop, rdg, cd); ++ } ++ } ++ + data_reference_p dref; + for (i = 0; datarefs_vec.iterate (i, &dref); ++i) + dref->aux = (void *) (uintptr_t) i; +@@ -3029,7 +4426,10 @@ loop_distribution::distribute_loop (class loop *loop, + dump_rdg (dump_file, rdg); + + auto_vec partitions; +- rdg_build_partitions (rdg, stmts, &partitions); ++ if (work_list.length() > stmts.length()) ++ rdg_build_partitions (rdg, &work_list, &partitions); ++ else ++ rdg_build_partitions (rdg, &stmts, &partitions); + + auto_vec alias_ddrs; + +@@ -3101,7 +4501,7 @@ loop_distribution::distribute_loop (class loop *loop, + for (int j = i + 1; + partitions.iterate (j, &partition); ++j) + { +- if (share_memory_accesses (rdg, into, partition)) ++ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars)) + { + partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); + partitions.unordered_remove (j); +@@ -3151,7 +4551,7 @@ loop_distribution::distribute_loop (class loop *loop, + } + } + +- finalize_partitions (loop, &partitions, &alias_ddrs); ++ finalize_partitions (loop, &partitions, &alias_ddrs, producers); + + /* If there is a reduction in all partitions make sure the last one + is not classified for builtin code generation. */ +@@ -3169,6 +4569,24 @@ loop_distribution::distribute_loop (class loop *loop, + } + + nbp = partitions.length (); ++ ++ /* If we have inserted TMP_ARRAYs but there is only one partition left in ++ the succeeding processes, remove those inserted TMP_ARRAYs back to the ++ original version. */ ++ ++ if (nbp == 1 && insert_success) ++ { ++ struct partition *partition = NULL; ++ partitions.iterate (0, &partition); ++ remove_insertion (loop, rdg, producers, partition); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:" ++ " unable to distribute loop %d.\n", loop->num); ++ } ++ } ++ + if (nbp == 0 + || (nbp == 1 && !partition_builtin_p (partitions[0])) + || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) +diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc +index 04e68f621..aae7f62f3 100644 +--- a/gcc/tree-vect-data-refs.cc ++++ b/gcc/tree-vect-data-refs.cc +@@ -2791,6 +2791,9 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info) + DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; + + DR_GROUP_SIZE (stmt_info) = groupsize; ++ ++ DR_GROUP_SLP_TRANSPOSE (stmt_info) = false; ++ + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, +@@ -2820,6 +2823,20 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info) + DR_GROUP_GAP (stmt_info)); + } + ++ /* SLP: create an SLP data structure for every interleaving group of ++ loads for further analysis in vect_analyse_slp. */ ++ if (DR_IS_READ (dr) && !slp_impossible) ++ { ++ if (loop_vinfo) ++ { ++ LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info); ++ } ++ if (bb_vinfo) ++ { ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info); ++ } ++ } ++ + /* SLP: create an SLP data structure for every interleaving group of + stores for further analysis in vect_analyse_slp. 
*/ + if (DR_IS_WRITE (dr) && !slp_impossible) +@@ -5636,6 +5653,226 @@ vect_permute_store_chain (vec_info *vinfo, vec &dr_chain, + } + } + ++/* Encoding the PERM_MASK_FIRST. */ ++ ++static void ++vect_indices_encoding_first (tree vectype, unsigned int array_num, ++ tree &perm_mask_high_first, ++ tree &perm_mask_low_first) ++{ ++ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); ++ vec_perm_builder sel (nelt, nelt, 1); ++ sel.quick_grow (nelt); ++ unsigned int group_num = nelt / array_num; ++ unsigned int index = 0; ++ unsigned int array = 0; ++ unsigned int group = 0; ++ ++ /* The encoding has 1 pattern in the fisrt stage. */ ++ for (array = 0; array < array_num / 2; array++) ++ { ++ for (group = 0; group < group_num * 2; group++) ++ { ++ sel[index++] = array + array_num * group; ++ } ++ } ++ vec_perm_indices indices (sel, 2, nelt); ++ perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices); ++ ++ index = 0; ++ for (array = array_num / 2; array < array_num; array++) ++ { ++ for (group = 0; group < group_num * 2; group++) ++ { ++ sel[index++] = array + array_num * group; ++ } ++ } ++ indices.new_vector (sel, 2, nelt); ++ perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices); ++} ++ ++/* Encoding the PERM_MASK. */ ++ ++static void ++vect_indices_encoding (tree vectype, unsigned int array_num, ++ tree &perm_mask_high, tree &perm_mask_low) ++{ ++ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); ++ vec_perm_builder sel (nelt, nelt, 1); ++ sel.quick_grow (nelt); ++ unsigned int group_num = nelt / array_num; ++ unsigned int index = 0; ++ unsigned int array = 0; ++ unsigned int group = 0; ++ ++ /* The encoding has 2 patterns in the folllowing stages. */ ++ for (array = 0; array < array_num / 2; array++) ++ { ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = group + group_num * array; ++ } ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = nelt + group + group_num * array; ++ } ++ } ++ vec_perm_indices indices (sel, 2, nelt); ++ perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); ++ ++ index = 0; ++ for (array = array_num / 2; array < array_num; array++) ++ { ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = group + group_num * array; ++ } ++ for (group = 0; group < group_num; group++) ++ { ++ sel[index++] = nelt + group + group_num * array; ++ } ++ } ++ indices.new_vector (sel, 2, nelt); ++ perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); ++} ++ ++/* Function vect_transpose_store_chain. ++ ++ Given a chain of interleaved stores in DR_CHAIN of LENGTH and ARRAY_NUM that ++ must be a power of 2. Generate interleave_high/low stmts to reorder ++ the data correctly for the stores. Return the final references for stores ++ in RESULT_CHAIN. This function is similar to vect_permute_store_chain (), ++ we interleave the contents of the vectors in their order. ++ ++ E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM ++ is 4. That is, the input is 4 vectors each containing 8 elements. ++ And 2 (VF / ARRAY_NUM) of 8 elements come from the same array. we interleave ++ the contents of the four vectors in their order. 
We assign a number to each ++ element, the input sequence is: ++ ++ 1st vec: 0 1 2 3 4 5 6 7 ++ 2nd vec: 8 9 10 11 12 13 14 15 ++ 3rd vec: 16 17 18 19 20 21 22 23 ++ 4th vec: 24 25 26 27 28 29 30 31 ++ ++ The output sequence should be: ++ ++ 1st vec: 0 4 8 12 16 20 24 28 ++ 2nd vec: 1 5 9 13 17 21 25 29 ++ 3rd vec: 2 6 10 14 18 22 26 30 ++ 4th vec: 3 7 11 15 19 23 27 31 ++ ++ In our example, ++ We get 2 (VF / ARRAY_NUM) elements together in every vector. ++ ++ I1: 0 4 1 5 2 6 3 7 ++ I2: 8 12 9 13 10 14 11 15 ++ I3: 16 20 17 21 18 22 19 23 ++ I4: 24 28 25 29 26 30 27 31 ++ ++ Then, we use interleave_high/low instructions to create such output. ++ Every 2 (VF / ARRAY_NUM) elements are regarded as a whole. The permutation ++ is done in log LENGTH stages. ++ ++ I1: interleave_high (1st vec, 3rd vec) ++ I2: interleave_low (1st vec, 3rd vec) ++ I3: interleave_high (2nd vec, 4th vec) ++ I4: interleave_low (2nd vec, 4th vec) ++ ++ The first stage of the sequence should be: ++ ++ I1: 0 4 16 20 1 5 17 21 ++ I2: 2 6 18 22 3 7 19 23 ++ I3: 8 12 24 28 9 13 25 29 ++ I4: 10 14 26 30 11 15 27 31 ++ ++ The following stage sequence should be, i.e. the final result is: ++ ++ I1: 0 4 8 12 16 20 24 28 ++ I2: 1 5 9 13 17 21 25 29 ++ I3: 2 6 10 14 18 22 26 30 ++ I4: 3 7 11 15 19 23 27 31. */ ++ ++void ++vect_transpose_store_chain (vec_info *vinfo, vec dr_chain, ++ unsigned int length, unsigned int array_num, ++ stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ++ vec *result_chain) ++{ ++ gimple *perm_stmt = NULL; ++ tree vectype = STMT_VINFO_VECTYPE (stmt_info); ++ tree perm_mask_low_first = NULL; ++ tree perm_mask_high_first = NULL; ++ tree perm_mask_low = NULL; ++ tree perm_mask_high = NULL; ++ unsigned int log_length = exact_log2 (length); ++ ++ /* Only power of 2 is supported. */ ++ gcc_assert (pow2p_hwi (length)); ++ ++ /* The encoding has 2 types, one for the grouped pattern in the fisrt stage, ++ another for the interleaved patterns in the following stages. */ ++ gcc_assert (array_num != 0); ++ ++ /* Create grouped stmt (in the first stage): ++ group = nelt / array_num; ++ high_first = VEC_PERM_EXPR ++ low_first = VEC_PERM_EXPR */ ++ vect_indices_encoding_first (vectype, array_num, perm_mask_high_first, ++ perm_mask_low_first); ++ ++ /* Create interleaving stmt (in the following stages): ++ high = VEC_PERM_EXPR ++ low = VEC_PERM_EXPR */ ++ vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low); ++ ++ for (unsigned int perm_time = 0; perm_time < log_length; perm_time++) ++ { ++ for (unsigned int index = 0; index < length / 2; index++) ++ { ++ tree vect1 = dr_chain[index]; ++ tree vect2 = dr_chain[index + length / 2]; ++ ++ tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); ++ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2, ++ perm_time == 0 ? perm_mask_high_first ++ : perm_mask_high); ++ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); ++ (*result_chain)[2 * index] = high; ++ ++ tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); ++ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2, ++ perm_time == 0 ? 
perm_mask_low_first ++ : perm_mask_low); ++ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); ++ (*result_chain)[2 * index+1] = low; ++ } ++ memcpy (dr_chain.address (), result_chain->address (), ++ length * sizeof (tree)); ++ } ++} ++ + /* Function vect_setup_realignment + + This function is called when vectorizing an unaligned load using +diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc +index 3435f9378..f296e9415 100644 +--- a/gcc/tree-vect-loop.cc ++++ b/gcc/tree-vect-loop.cc +@@ -2856,7 +2856,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, + loop_vec_info main_loop_vinfo, + const vector_modes &vector_modes, unsigned &mode_i, + machine_mode &autodetected_vector_mode, +- bool &fatal) ++ bool &fatal, bool result_only_p) + { + loop_vec_info loop_vinfo + = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); +@@ -2865,6 +2865,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, + loop_vinfo->vector_mode = vector_mode; + unsigned int suggested_unroll_factor = 1; + ++ /* Loop_vinfo for loop-distribution pass. */ ++ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL); + /* Run the main analysis. */ + opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, + &suggested_unroll_factor); +@@ -2933,7 +2935,21 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, + + if (!res) + { +- delete loop_vinfo; ++ ++ /* If current analysis shows LOOP is unable to vectorize, loop_vinfo ++ will be deleted. If LOOP is under ldist analysis, backup it before ++ it is deleted and return it if all modes are analyzed and still ++ fail to vectorize. */ ++ if (result_only_p && (mode_i == vector_modes.length () ++ || autodetected_vector_mode == VOIDmode)) ++ { ++ fail_loop_vinfo = opt_loop_vec_info::success (loop_vinfo); ++ loop->aux = (loop_vec_info) fail_loop_vinfo; ++ } ++ else ++ { ++ delete loop_vinfo; ++ } + if (fatal) + gcc_checking_assert (main_loop_vinfo == NULL); + return opt_loop_vec_info::propagate_failure (res); +@@ -2946,9 +2962,11 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, + + Apply a set of analyses on LOOP, and create a loop_vec_info struct + for it. The different analyses will record information in the +- loop_vec_info struct. */ ++ loop_vec_info struct. When RESULT_ONLY_P is true, quit analysis ++ if loop is vectorizable, otherwise, do not delete vinfo. */ + opt_loop_vec_info +-vect_analyze_loop (class loop *loop, vec_info_shared *shared) ++vect_analyze_loop (class loop *loop, vec_info_shared *shared, ++ bool result_only_p) + { + DUMP_VECT_SCOPE ("analyze_loop_nest"); + +@@ -2996,6 +3014,12 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + && !unlimited_cost_model (loop)); + machine_mode autodetected_vector_mode = VOIDmode; + opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); ++ /* Loop_vinfo for loop-distribution pass. 
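++     It preserves a failed analysis for that pass to inspect instead of
++     deleting it.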
*/ ++ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL); ++ if (result_only_p) ++ { ++ vect_slp_init (); ++ } + unsigned int mode_i = 0; + unsigned HOST_WIDE_INT simdlen = loop->simdlen; + +@@ -3019,10 +3043,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + opt_loop_vec_info loop_vinfo + = vect_analyze_loop_1 (loop, shared, &loop_form_info, + NULL, vector_modes, mode_i, +- autodetected_vector_mode, fatal); ++ autodetected_vector_mode, fatal, result_only_p); + if (fatal) + break; + ++ if (result_only_p && (mode_i == vector_modes.length () ++ || autodetected_vector_mode == VOIDmode)) ++ { ++ return loop_vinfo; ++ } ++ + if (loop_vinfo) + { + /* Analyzis has been successful so update the VF value. The +@@ -3132,7 +3162,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) + = vect_analyze_loop_1 (loop, shared, &loop_form_info, + first_loop_vinfo, + vector_modes, mode_i, +- autodetected_vector_mode, fatal); ++ autodetected_vector_mode, fatal, result_only_p); + if (fatal) + break; + +diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc +index e1bcab0f7..c0c15773d 100644 +--- a/gcc/tree-vect-patterns.cc ++++ b/gcc/tree-vect-patterns.cc +@@ -5632,8 +5632,8 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = { + internal functions. */ + { vect_recog_gather_scatter_pattern, "gather_scatter" }, + { vect_recog_mask_conversion_pattern, "mask_conversion" }, +- { vect_recog_widen_plus_pattern, "widen_plus" }, +- { vect_recog_widen_minus_pattern, "widen_minus" }, ++ // { vect_recog_widen_plus_pattern, "widen_plus" }, ++ // { vect_recog_widen_minus_pattern, "widen_minus" }, + }; + + const unsigned int NUM_PATTERNS = ARRAY_SIZE (vect_vect_recog_func_ptrs); +diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc +index af477c31a..6cbf8085f 100644 +--- a/gcc/tree-vect-slp.cc ++++ b/gcc/tree-vect-slp.cc +@@ -49,6 +49,8 @@ along with GCC; see the file COPYING3. If not see + #include "tree-eh.h" + #include "tree-cfg.h" + #include "alloc-pool.h" ++#include "print-tree.h" ++#include "gimple-pretty-print.h" + + static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *, + slp_tree, stmt_vector_for_cost *); +@@ -994,6 +996,21 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, + } + + gcc_assert (vectype); ++ if (!STMT_VINFO_VECTYPE (stmt_info)) ++ STMT_VINFO_VECTYPE (stmt_info) = vectype; ++ if (dump_file) ++ { ++ fprintf (dump_file, "vect_build_slp_tree_1: %p\n", stmt_info); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "vect_build_slp_tree_1: vectype="); ++ if (vectype) ++ print_generic_expr (dump_file, vectype); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "internal vectype="); ++ if (STMT_VINFO_VECTYPE (stmt_info)) ++ print_generic_expr (dump_file, STMT_VINFO_VECTYPE (stmt_info)); ++ fprintf (dump_file, "\n"); ++ } + + gcall *call_stmt = dyn_cast (stmt); + if (call_stmt) +@@ -1575,10 +1592,10 @@ vect_build_slp_tree (vec_info *vinfo, + dump_printf_loc (MSG_NOTE, vect_location, + "SLP discovery for node %p succeeded\n", res); + gcc_assert (res_ == res); +- res->max_nunits = this_max_nunits; ++ res_->max_nunits = this_max_nunits; + vect_update_max_nunits (max_nunits, this_max_nunits); + /* Keep a reference for the bst_map use. */ +- SLP_TREE_REF_COUNT (res)++; ++ SLP_TREE_REF_COUNT (res_)++; + } + return res_; + } +@@ -3190,8 +3207,10 @@ vect_build_slp_instance (vec_info *vinfo, + + /* For basic block SLP, try to break the group up into multiples of + a vector size. 
*/ ++ bb_vec_info bb_vinfo = dyn_cast (vinfo); + if (is_a (vinfo) +- && (i > 1 && i < group_size)) ++ && (i > 1 && i < group_size) ++ && !bb_vinfo->transposed) + { + tree scalar_type + = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); +@@ -3301,84 +3320,1034 @@ vect_analyze_slp_instance (vec_info *vinfo, + scalar_stmts.create (DR_GROUP_SIZE (stmt_info)); + while (next_info) + { +- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); +- next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ } ++ } ++ else if (kind == slp_inst_kind_reduc_chain) ++ { ++ /* Collect the reduction stmts and store them in scalar_stmts. */ ++ scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); ++ while (next_info) ++ { ++ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); ++ next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); ++ } ++ /* Mark the first element of the reduction chain as reduction to properly ++ transform the node. In the reduction analysis phase only the last ++ element of the chain is marked as reduction. */ ++ STMT_VINFO_DEF_TYPE (stmt_info) ++ = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); ++ STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) ++ = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); ++ } ++ else if (kind == slp_inst_kind_ctor) ++ { ++ tree rhs = gimple_assign_rhs1 (stmt_info->stmt); ++ tree val; ++ scalar_stmts.create (CONSTRUCTOR_NELTS (rhs)); ++ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val) ++ { ++ stmt_vec_info def_info = vinfo->lookup_def (val); ++ def_info = vect_stmt_to_vectorize (def_info); ++ scalar_stmts.quick_push (def_info); ++ } ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analyzing vectorizable constructor: %G\n", ++ stmt_info->stmt); ++ } ++ else if (kind == slp_inst_kind_reduc_group) ++ { ++ /* Collect reduction statements. */ ++ const vec &reductions ++ = as_a (vinfo)->reductions; ++ scalar_stmts.create (reductions.length ()); ++ for (i = 0; reductions.iterate (i, &next_info); i++) ++ if ((STMT_VINFO_RELEVANT_P (next_info) ++ || STMT_VINFO_LIVE_P (next_info)) ++ /* ??? Make sure we didn't skip a conversion around a reduction ++ path. In that case we'd have to reverse engineer that conversion ++ stmt following the chain using reduc_idx and from the PHI ++ using reduc_def. */ ++ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) ++ scalar_stmts.quick_push (next_info); ++ /* If less than two were relevant/live there's nothing to SLP. */ ++ if (scalar_stmts.length () < 2) ++ return false; ++ } ++ else ++ gcc_unreachable (); ++ ++ vec roots = vNULL; ++ if (kind == slp_inst_kind_ctor) ++ { ++ roots.create (1); ++ roots.quick_push (stmt_info); ++ } ++ /* Build the tree for the SLP instance. */ ++ bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, ++ roots, ++ max_tree_size, limit, bst_map, ++ kind == slp_inst_kind_store ++ ? stmt_info : NULL); ++ if (!res) ++ roots.release (); ++ ++ /* ??? If this is slp_inst_kind_store and the above succeeded here's ++ where we should do store group splitting. 
*/ ++ ++ return res; ++} ++ ++static inline bool ++is_const_assign (stmt_vec_info store_elem) ++{ ++ if (store_elem == NULL) ++ { ++ gcc_unreachable (); ++ } ++ gimple *stmt = store_elem->stmt; ++ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt); ++ return rhs_class == GIMPLE_SINGLE_RHS ++ && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt)); ++} ++ ++/* Push inits to INNERMOST_INITS and check const assign. */ ++ ++static bool ++record_innermost (vec &innermost_inits, ++ vec &innermost_offsets, ++ stmt_vec_info stmt_vinfo) ++{ ++ if (!stmt_vinfo) ++ { ++ return false; ++ } ++ stmt_vec_info next_info = stmt_vinfo; ++ while (next_info) ++ { ++ /* No need to vectorize constant assign in a transposed version. */ ++ if (is_const_assign (next_info)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "no need to vectorize, store is const assign: %G", ++ next_info->stmt); ++ } ++ return false; ++ } ++ innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info)); ++ innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info)); ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ } ++ return true; ++} ++ ++/* Compare inits to INNERMOST_INITS, return FALSE if inits do not match ++ the first grouped_store. And check const assign meanwhile. */ ++ ++static bool ++compare_innermost (const vec &innermost_inits, ++ const vec &innermost_offsets, ++ stmt_vec_info stmt_vinfo) ++{ ++ if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size) ++ { ++ return false; ++ } ++ stmt_vec_info next_info = stmt_vinfo; ++ unsigned int i = 0; ++ while (next_info) ++ { ++ if (is_const_assign (next_info)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "no need to vectorize, store is const " ++ "assign: %G", next_info->stmt); ++ } ++ return false; ++ } ++ if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info) ++ || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info)) ++ { ++ return false; ++ } ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ i++; ++ } ++ return true; ++} ++ ++static bool ++check_same_bb (stmt_vec_info grp1, stmt_vec_info grp2) ++{ ++ if (grp1->stmt->bb->index == grp2->stmt->bb->index) ++ { ++ return true; ++ } ++ return false; ++} ++ ++/* Check if grouped stores are of same type. ++ input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt)) ++ output: 0 if same, 1 or -1 else. */ ++ ++static int ++tree_type_cmp (const tree t1, const tree t2) ++{ ++ gcc_checking_assert (t1 != NULL && t2 != NULL); ++ if (t1 != t2) ++ { ++ if (TREE_CODE (t1) != TREE_CODE (t2)) ++ { ++ return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1; ++ } ++ if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2)) ++ { ++ return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1; ++ } ++ if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2)) ++ { ++ return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1; ++ } ++ } ++ return 0; ++} ++ ++/* Check it if 2 grouped stores are of same type that ++ we can analyze them in a transpose group. */ ++static int ++check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2) ++{ ++ if (grp1 == grp2) ++ { ++ return 0; ++ } ++ if (grp1->size != grp2->size) ++ { ++ return grp1->size > grp2->size ? 1 : -1; ++ } ++ tree lhs1 = gimple_assign_lhs (grp1->stmt); ++ tree lhs2 = gimple_assign_lhs (grp2->stmt); ++ if (TREE_CODE (lhs1) != TREE_CODE (lhs2)) ++ { ++ return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 
1 : -1; ++ } ++ tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt)); ++ tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt)); ++ int cmp = tree_type_cmp (grp_type1, grp_type2); ++ return cmp; ++} ++ ++/* Sort grouped stores according to group_size and store_type. ++ output: 0 if same, 1 if grp1 > grp2, -1 otherwise. */ ++ ++static int ++grouped_store_cmp (const void *grp1_, const void *grp2_) ++{ ++ stmt_vec_info grp1 = *(stmt_vec_info *)const_cast(grp1_); ++ stmt_vec_info grp2 = *(stmt_vec_info *)const_cast(grp2_); ++ return check_same_store_type (grp1, grp2); ++} ++ ++/* Transposing is based on permutation in registers. Permutation requires ++ vector length being power of 2 and satisfying the vector mode. */ ++ ++static inline bool ++check_filling_reg (stmt_vec_info current_element) ++{ ++ if (current_element->size == 0) ++ { ++ return false; ++ } ++ /* If the gimple STMT was already vectorized in vect pass, it's unable to ++ conduct transpose analysis, skip it. */ ++ bool lhs_vectorized ++ = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt))) ++ == VECTOR_TYPE; ++ bool rhs_vectorized ++ = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt))) ++ == VECTOR_TYPE; ++ if (lhs_vectorized || rhs_vectorized) ++ { ++ return false; ++ } ++ unsigned int store_precision ++ = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt))); ++ auto_vector_modes vector_modes; ++ targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); ++ unsigned min_mode_size = -1u; ++ for (unsigned i = 0; i < vector_modes.length (); i++) ++ { ++ unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0]; ++ min_mode_size = mode_bit_size < min_mode_size ++ ? mode_bit_size : min_mode_size; ++ } ++ return store_precision != 0 ++ && pow2p_hwi (current_element->size) ++ && (current_element->size * store_precision % min_mode_size == 0); ++} ++ ++/* Check if previous groups are suitable to transpose, if not, set their ++ group number to -1, reduce grp_num and clear current_groups. ++ Otherwise, just clear current_groups. */ ++ ++static void ++check_and_clear_groups (vec ¤t_groups, ++ unsigned int &grp_num) ++{ ++ stmt_vec_info first_element; ++ if (current_groups.length () == 1 ++ || (current_groups.length () != 0 ++ && !pow2p_hwi (current_groups.length ()))) ++ { ++ while (current_groups.length () != 0) ++ { ++ first_element = current_groups.pop (); ++ first_element->group_number = -1; ++ } ++ grp_num--; ++ } ++ else ++ { ++ while (current_groups.length ()) ++ { ++ current_groups.pop (); ++ } ++ } ++} ++ ++ ++/* Make sure that transpose slp vectorization is conducted only if grouped ++ stores are one dimension array ref. */ ++ ++static bool ++is_store_one_dim_array (gimple *stmt) ++{ ++ tree op = gimple_get_lhs (stmt); ++ if (TREE_CODE (op) != ARRAY_REF) ++ return false; ++ return TREE_OPERAND_LENGTH (op) > 0 ++ && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0; ++} ++ ++/* Set grouped_stores with similar MEM_REF to the same group and mark their ++ grp_num. Groups with same grp_num consist the minimum unit to analyze ++ transpose. Return num of such units. */ ++ ++static unsigned ++vect_prepare_transpose (bb_vec_info bb_vinfo) ++{ ++ stmt_vec_info current_element = NULL; ++ stmt_vec_info first_element = NULL; ++ unsigned int i = 0; ++ unsigned int grp_num = 0; ++ /* Use arrays to record MEM_REF data in different GROUPED_STORES. */ ++ auto_vec innermost_inits; ++ auto_vec innermost_offsets; ++ ++ /* A set of stmt_vec_info with same store type. 
Analyze them if their size ++ is suitable to transpose. */ ++ auto_vec current_groups; ++ ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element) ++ { ++ /* Compare current grouped_store to the first one if first_element exists, ++ push current_element to current_groups if they are similar on innermost ++ behavior of MEM_REF. */ ++ if (first_element != NULL ++ && !check_same_store_type (first_element, current_element) ++ && compare_innermost (innermost_inits, innermost_offsets, ++ current_element) ++ && check_same_bb (first_element, current_element)) ++ { ++ current_groups.safe_push (current_element); ++ current_element->group_number = grp_num; ++ /* If current_element is the last element in grouped_stores, continue ++ will exit the loop and leave the last group unanalyzed. */ ++ if (i == bb_vinfo->grouped_stores.length () - 1) ++ { ++ check_and_clear_groups (current_groups, grp_num); ++ } ++ continue; ++ } ++ check_and_clear_groups (current_groups, grp_num); ++ innermost_inits.release (); ++ innermost_offsets.release (); ++ /* Beginning of a new group to analyze whether they are able to consist ++ a unit to conduct transpose analysis. */ ++ first_element = NULL; ++ if (is_store_one_dim_array (current_element->stmt) ++ && check_filling_reg (current_element) ++ && record_innermost (innermost_inits, innermost_offsets, ++ current_element)) ++ { ++ first_element = current_element; ++ current_groups.safe_push (current_element); ++ current_element->group_number = ++grp_num; ++ if (i == bb_vinfo->grouped_stores.length () - 1) ++ { ++ check_and_clear_groups (current_groups, grp_num); ++ } ++ continue; ++ } ++ current_element->group_number = -1; ++ } ++ return grp_num; ++} ++ ++/* Return a flag to transpose grouped stores before building slp tree. ++ Add bool may_transpose in class vec_info. */ ++ ++static bool ++vect_may_transpose (bb_vec_info bb_vinfo) ++{ ++ if (targetm.vectorize.vec_perm_const == NULL) ++ { ++ return false; ++ } ++ ++ if (bb_vinfo->grouped_stores.length () < 2) ++ { ++ return false; ++ } ++ ++ DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp"); ++ /* Sort grouped_stores according to size and type for function ++ vect_prepare_transpose (). */ ++ bb_vinfo->grouped_stores.qsort (grouped_store_cmp); ++ ++ int groups = vect_prepare_transpose (bb_vinfo); ++ BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "%d groups to analyze transposed slp.\n", groups); ++ return groups != 0; ++} ++ ++/* Get the base address of STMT_INFO. */ ++ ++static tree ++get_op_base_address (stmt_vec_info stmt_info) ++{ ++ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); ++ tree op = DR_BASE_ADDRESS (dr); ++ while (TREE_OPERAND_LENGTH (op) > 0) ++ { ++ op = TREE_OPERAND (op, 0); ++ } ++ return op; ++} ++ ++/* Compare the UID of the two stmt_info STMTINFO_A and STMTINFO_B. ++ Sorting them in ascending order. */ ++ ++static int ++dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_) ++{ ++ stmt_vec_info stmtinfo_a ++ = *(stmt_vec_info *) const_cast (stmtinfo_a_); ++ stmt_vec_info stmtinfo_b ++ = *(stmt_vec_info *) const_cast (stmtinfo_b_); ++ ++ /* Stabilize sort. */ ++ if (stmtinfo_a == stmtinfo_b) ++ { ++ return 0; ++ } ++ return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1; ++} ++ ++/* Find the first elements of the grouped loads which are required to merge. 
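++   Two grouped loads are candidates for merging when they share the same
++   base address and the same DR_GROUP_SIZE; see the example before
++   vect_merge_slp_grouped_loads below.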
*/ ++ ++static void ++vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec &visited, ++ vec &res) ++{ ++ unsigned int i = 0; ++ stmt_vec_info merge_first_element = NULL; ++ stmt_vec_info first_element = NULL; ++ tree opa = NULL; ++ unsigned int grp_size_a = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element) ++ { ++ if (visited[i]) ++ { ++ continue; ++ } ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || !pow2p_hwi (DR_GROUP_SIZE (first_element))) ++ { ++ /* Non-conforming grouped load should be grouped separately. */ ++ if (merge_first_element == NULL) ++ { ++ visited[i] = true; ++ res.safe_push (first_element); ++ return; ++ } ++ } ++ if (merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ opa = get_op_base_address (first_element); ++ grp_size_a = DR_GROUP_SIZE (first_element); ++ res.safe_push (first_element); ++ visited[i] = true; ++ continue; ++ } ++ ++ /* If the two first elements are of the same base address and group size, ++ these two grouped loads need to be merged. */ ++ tree opb = get_op_base_address (first_element); ++ unsigned int grp_size_b = DR_GROUP_SIZE (first_element); ++ if (opa == opb && grp_size_a == grp_size_b) ++ { ++ res.safe_push (first_element); ++ visited[i] = true; ++ } ++ } ++} ++ ++/* Merge the grouped loads that are found from ++ vect_slp_grouped_load_find (). */ ++ ++static stmt_vec_info ++vect_slp_grouped_load_merge (vec &res) ++{ ++ stmt_vec_info stmt_info = res[0]; ++ if (res.length () == 1) ++ { ++ return stmt_info; ++ } ++ unsigned int i = 0; ++ unsigned int size = DR_GROUP_SIZE (res[0]); ++ unsigned int new_group_size = size * res.length (); ++ stmt_vec_info first_element = NULL; ++ stmt_vec_info merge_first_element = NULL; ++ stmt_vec_info last_element = NULL; ++ FOR_EACH_VEC_ELT (res, i, first_element) ++ { ++ if (merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ last_element = merge_first_element; ++ size = DR_GROUP_SIZE (merge_first_element); ++ } ++ ++ if (last_element != first_element ++ && !DR_GROUP_NEXT_ELEMENT (last_element)) ++ { ++ DR_GROUP_NEXT_ELEMENT (last_element) = first_element; ++ /* Store the gap from the previous member of the group. If there is ++ no gap in the access, DR_GROUP_GAP is always 1. */ ++ DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element); ++ DR_GROUP_GAP (first_element) = 1; ++ } ++ for (stmt_info = first_element; stmt_info; ++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element; ++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info) = new_group_size; ++ last_element = stmt_info; ++ } ++ } ++ DR_GROUP_SIZE (merge_first_element) = new_group_size; ++ DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true; ++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL; ++ return merge_first_element; ++} ++ ++/* Merge the grouped loads which have the same base address and group size. ++ For example, for grouped loads (opa_1, opa_2, opb_1, opb_2): ++ opa_1: a0->a1->a2->a3 ++ opa_2: a8->a9->a10->a11 ++ opb_1: b0->b1 ++ opb_2: b16->b17 ++ we can probably get two merged grouped loads: ++ opa: a0->a1->a2->a3->a8->a9->a10->a11 ++ opb: b0->b1->b16->b17. 
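++   Grouped loads whose DR_GROUP_SIZE is not a power of two are grouped
++   separately rather than merged.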
*/ ++ ++static bool ++vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo) ++{ ++ if (bb_vinfo->grouped_loads.length () <= 0) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "The number of grouped loads is 0.\n"); ++ } ++ return false; ++ } ++ bb_vinfo->grouped_loads.qsort (dr_group_cmp); ++ auto_vec visited (bb_vinfo->grouped_loads.length ()); ++ auto_vec grouped_loads_merge; ++ for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++) ++ { ++ visited.safe_push (false); ++ } ++ while (1) ++ { ++ /* Find grouped loads which are required to merge. */ ++ auto_vec res; ++ vect_slp_grouped_load_find (bb_vinfo, visited, res); ++ if (res.is_empty ()) ++ { ++ break; ++ } ++ /* Merge the required grouped loads into one group. */ ++ grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res)); ++ } ++ if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ()) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "No grouped loads need to be merged.\n"); ++ } ++ return false; ++ } ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Merging grouped loads successfully.\n"); ++ } ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).release (); ++ for (unsigned int i = 0; i < grouped_loads_merge.length (); i++) ++ { ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]); ++ } ++ return true; ++} ++ ++/* Find the first elements of the grouped stores ++ which are required to transpose and merge. */ ++ ++static void ++vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec &visited, ++ vec &res) ++{ ++ stmt_vec_info first_element = NULL; ++ stmt_vec_info merge_first_element = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ if (visited[k]) ++ { ++ continue; ++ } ++ /* Non-conforming grouped store should be grouped separately. */ ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || first_element->group_number == -1) ++ { ++ if (merge_first_element == NULL) ++ { ++ visited[k] = true; ++ res.safe_push (first_element); ++ return; ++ } ++ } ++ if (first_element->group_number != -1 ++ && merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ } ++ if (merge_first_element->group_number == first_element->group_number) ++ { ++ visited[k] = true; ++ res.safe_push (first_element); ++ } ++ } ++} ++ ++/* Transpose and merge the grouped stores that are found from ++ vect_slp_grouped_store_find (). */ ++ ++static stmt_vec_info ++vect_slp_grouped_store_transform (vec &res) ++{ ++ stmt_vec_info stmt_info = res[0]; ++ if (res.length () == 1) ++ { ++ return stmt_info; ++ } ++ stmt_vec_info rearrange_first_element = stmt_info; ++ stmt_vec_info last_element = rearrange_first_element; ++ ++ unsigned int size = DR_GROUP_SIZE (rearrange_first_element); ++ unsigned int new_group_size = size * res.length (); ++ for (unsigned int i = 1; i < res.length (); i++) ++ { ++ /* Store the gap from the previous member of the group. If there is no ++ gap in the access, DR_GROUP_GAP is always 1. 
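++      The original gap is saved in DR_GROUP_GAP_TRANS so that it can be
++      restored if the transposition is undone later.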
*/ ++ DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]); ++ DR_GROUP_GAP (res[i]) = 1; ++ } ++ while (!res.is_empty ()) ++ { ++ stmt_info = res[0]; ++ res.ordered_remove (0); ++ if (DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info)); ++ } ++ DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element; ++ DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info; ++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info) = new_group_size; ++ last_element = stmt_info; ++ } ++ ++ DR_GROUP_SIZE (rearrange_first_element) = new_group_size; ++ DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true; ++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL; ++ return rearrange_first_element; ++} ++ ++/* Save the STMT_INFO in the grouped stores to BB_VINFO_SCALAR_STORES for ++ transposing back grouped stores. */ ++ ++static void ++get_scalar_stores (bb_vec_info bb_vinfo) ++{ ++ unsigned int k = 0; ++ stmt_vec_info first_element = NULL; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ /* Filter the grouped store which is unnecessary for transposing. */ ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || first_element->group_number == -1) ++ { ++ continue; ++ } ++ vec tmp_scalar_store; ++ tmp_scalar_store.create (DR_GROUP_SIZE (first_element)); ++ for (stmt_vec_info stmt_info = first_element; stmt_info; ++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ tmp_scalar_store.safe_push (stmt_info); ++ } ++ BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store); ++ } ++} ++ ++/* Transpose and merge the grouped stores which have the same group number. ++ For example, for grouped stores (opa_0, opa_1, opa_2, opa_3): ++ opa_0: a00->a01->a02->a03 ++ opa_1: a10->a11->a12->a13 ++ opa_2: a20->a21->a22->a23 ++ opa_2: a30->a31->a32->a33 ++ we can probably get the merged grouped store: ++ opa: a00->a10->a20->a30 ++ ->a01->a11->a21->a31 ++ ->a02->a12->a22->a32 ++ ->a03->a13->a23->a33. */ ++ ++static bool ++vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo) ++{ ++ if (bb_vinfo->grouped_stores.length () <= 0) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "The number of grouped stores is 0.\n"); ++ } ++ return false; ++ } ++ ++ bb_vinfo->grouped_stores.qsort (dr_group_cmp); ++ auto_vec grouped_stores_merge; ++ auto_vec visited (bb_vinfo->grouped_stores.length ()); ++ unsigned int i = 0; ++ for (i = 0; i < bb_vinfo->grouped_stores.length (); i++) ++ { ++ visited.safe_push (false); ++ } ++ ++ /* Get scalar stores for the following transposition recovery. */ ++ get_scalar_stores (bb_vinfo); ++ ++ while (1) ++ { ++ /* Find grouped stores which are required to transpose and merge. */ ++ auto_vec res; ++ vect_slp_grouped_store_find (bb_vinfo, visited, res); ++ if (res.is_empty ()) ++ { ++ break; ++ } ++ /* Transpose and merge the required grouped stores into one group. */ ++ grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res)); ++ } ++ ++ BB_VINFO_GROUPED_STORES (bb_vinfo).release (); ++ for (i = 0; i < grouped_stores_merge.length (); i++) ++ { ++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]); ++ } ++ ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Transposing grouped stores successfully.\n"); ++ } ++ return true; ++} ++ ++/* A helpful function of vect_transform_back_slp_grouped_stores (). 
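++   It rebuilds the original per-array store groups from the scalar stores
++   saved in BB_VINFO_SCALAR_STORES.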
*/ ++ ++static auto_vec ++vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo, ++ stmt_vec_info first_stmt_info) ++{ ++ auto_vec grouped_stores_split; ++ for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++) ++ { ++ vec scalar_tmp = bb_vinfo->scalar_stores[i]; ++ if (scalar_tmp.length () > 1 ++ && scalar_tmp[0]->group_number != first_stmt_info->group_number) ++ { ++ continue; ++ } ++ stmt_vec_info cur_stmt_info = NULL; ++ stmt_vec_info cur_first_stmt_info = NULL; ++ stmt_vec_info last_stmt_info = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info) ++ { ++ if (k == 0) ++ { ++ cur_first_stmt_info = cur_stmt_info; ++ last_stmt_info = cur_stmt_info; ++ } ++ DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info; ++ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info; ++ last_stmt_info = cur_stmt_info; ++ } ++ DR_GROUP_SIZE (cur_first_stmt_info) = k; ++ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL; ++ if (first_stmt_info != cur_first_stmt_info) ++ { ++ DR_GROUP_GAP (cur_first_stmt_info) ++ = DR_GROUP_GAP_TRANS (cur_first_stmt_info); ++ DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false; ++ DR_GROUP_NUMBER (cur_first_stmt_info) = -1; ++ } ++ grouped_stores_split.safe_push (cur_first_stmt_info); ++ } ++ return grouped_stores_split; ++} ++ ++/* Transform the grouped store back. */ ++ ++void ++vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo, ++ stmt_vec_info first_stmt_info) ++{ ++ if (first_stmt_info->group_number == -1) ++ { ++ return; ++ } ++ /* Transform back. */ ++ auto_vec grouped_stores_split ++ = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info); ++ ++ /* Add the remaining grouped stores to grouped_stores_split. */ ++ stmt_vec_info first_element = NULL; ++ unsigned int i = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element) ++ { ++ if (first_element->group_number != first_stmt_info->group_number) ++ { ++ grouped_stores_split.safe_push (first_element); ++ } ++ } ++ DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false; ++ DR_GROUP_NUMBER (first_stmt_info) = -1; ++ BB_VINFO_GROUPED_STORES (bb_vinfo).release (); ++ for (i = 0; i < grouped_stores_split.length (); i++) ++ { ++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]); ++ } ++} ++ ++/* Function check_for_slp_vectype ++ ++ Restriction for grouped stores by checking their vectype. ++ If the vectype of the grouped store is changed, it need transform back. ++ If all grouped stores need to be transformed back, return FALSE. 
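++   For example (illustrative), with 4-element vectors (nunits == 4), a
++   merged group whose members originally had DR_GROUP_SIZE_TRANS == 2
++   cannot fill a vector register and is transformed back.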
*/ ++ ++static bool ++check_for_slp_vectype (bb_vec_info bb_vinfo) ++{ ++ if (dump_file) ++ fprintf (dump_file, "check_for_slp_vectype: enter\n"); ++ stmt_vec_info first_element = NULL; ++ unsigned int i = 0; ++ int count = 0; ++ auto_vec grouped_stores_check; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element) ++ { ++ grouped_stores_check.safe_push (first_element); ++ } ++ FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element) ++ { ++ if (STMT_VINFO_GROUPED_ACCESS (first_element) ++ && first_element->group_number != -1) ++ { ++ unsigned int group_size_b ++ = DR_GROUP_SIZE_TRANS (first_element); ++ tree vectype = STMT_VINFO_VECTYPE (first_element); ++ gimple *stmt = STMT_VINFO_STMT (first_element); ++ tree lhs = gimple_get_lhs (stmt); ++ tree type = TREE_TYPE (lhs); ++#if 0 ++ if (!vectype && !type) ++ { ++ if (dump_file) ++ fprintf (dump_file, "check_for_slp_vectype: no vectype/stmt type\n"); ++ continue; ++ } ++ ++ if (!vectype) ++ vectype = type; ++#endif ++ if (dump_file) ++ { ++ fprintf (dump_file, "check_for_slp_vectype: %p\n", first_element); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "check_for_slp_vectype: vectype="); ++ if (vectype) ++ print_generic_expr (dump_file, vectype); ++ fprintf (dump_file, "\n"); ++ } ++#if 0 ++ if (!vectype || !VECTOR_TYPE_P (vectype)) ++ continue; ++#endif ++ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); ++ if (nunits.to_constant () > group_size_b) ++ { ++ count++; ++ /* If the vectype is changed, this grouped store need ++ to be transformed back. */ ++ vect_transform_back_slp_grouped_stores (bb_vinfo, first_element); ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "No supported: only supported for" ++ " group_size geq than nunits.\n"); ++ } ++ } ++ } ++ } ++ if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo)) ++ { ++ return false; ++ } ++ if (dump_file) ++ fprintf (dump_file, "check_for_slp_vectype: True\n"); ++ return true; ++} ++ ++/* Function check_for_dr_alignment ++ ++ Check the alignment of the slp instance loads. ++ Return FALSE if a load cannot be vectorized. */ ++ ++static bool ++check_for_dr_alignment (bb_vec_info bb_vinfo, slp_instance instance) ++{ ++ slp_tree node = NULL; ++ unsigned int i = 0; ++ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) ++ { ++ stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; ++ dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); ++ if (dump_file) ++ { ++ fprintf (dump_file, "check_for_dr_alignment: %p\n", first_stmt_info); ++ ++ gimple *stmt = STMT_VINFO_STMT (first_stmt_info); ++ tree lhs = gimple_get_lhs (stmt); ++ tree type = TREE_TYPE (lhs); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ ++ tree vectype = STMT_VINFO_VECTYPE (first_stmt_info); ++ int malign = dr_misalignment (first_dr_info, vectype); ++ enum dr_alignment_support supportable_dr_alignment ++ = vect_supportable_dr_alignment (bb_vinfo, first_dr_info, ++ vectype, malign); ++ if (supportable_dr_alignment == dr_explicit_realign_optimized ++ || supportable_dr_alignment == dr_explicit_realign) ++ { ++ return false; + } + } +- else if (kind == slp_inst_kind_reduc_chain) ++ return true; ++} ++ ++/* Initialize slp_transpose flag before transposing. */ ++ ++static void ++init_stmt_info_slp_transpose (bb_vec_info bb_vinfo) ++{ ++ stmt_vec_info first_element = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) + { +- /* Collect the reduction stmts and store them in scalar_stmts. 
*/ +- scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); +- while (next_info) ++ if (STMT_VINFO_GROUPED_ACCESS (first_element)) + { +- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); +- next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); ++ DR_GROUP_SLP_TRANSPOSE (first_element) = false; + } +- /* Mark the first element of the reduction chain as reduction to properly +- transform the node. In the reduction analysis phase only the last +- element of the chain is marked as reduction. */ +- STMT_VINFO_DEF_TYPE (stmt_info) +- = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); +- STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) +- = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); + } +- else if (kind == slp_inst_kind_ctor) ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element) + { +- tree rhs = gimple_assign_rhs1 (stmt_info->stmt); +- tree val; +- scalar_stmts.create (CONSTRUCTOR_NELTS (rhs)); +- FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val) ++ if (STMT_VINFO_GROUPED_ACCESS (first_element)) + { +- stmt_vec_info def_info = vinfo->lookup_def (val); +- def_info = vect_stmt_to_vectorize (def_info); +- scalar_stmts.quick_push (def_info); ++ DR_GROUP_SLP_TRANSPOSE (first_element) = false; + } +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "Analyzing vectorizable constructor: %G\n", +- stmt_info->stmt); + } +- else if (kind == slp_inst_kind_reduc_group) ++} ++ ++/* Analyze and transpose the stmts before building the SLP tree. */ ++ ++static bool ++vect_analyze_transpose (bb_vec_info bb_vinfo) ++{ ++ DUMP_VECT_SCOPE ("vect_analyze_transpose"); ++ ++ if (!vect_may_transpose (bb_vinfo)) + { +- /* Collect reduction statements. */ +- const vec &reductions +- = as_a (vinfo)->reductions; +- scalar_stmts.create (reductions.length ()); +- for (i = 0; reductions.iterate (i, &next_info); i++) +- if ((STMT_VINFO_RELEVANT_P (next_info) +- || STMT_VINFO_LIVE_P (next_info)) +- /* ??? Make sure we didn't skip a conversion around a reduction +- path. In that case we'd have to reverse engineer that conversion +- stmt following the chain using reduc_idx and from the PHI +- using reduc_def. */ +- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) +- scalar_stmts.quick_push (next_info); +- /* If less than two were relevant/live there's nothing to SLP. */ +- if (scalar_stmts.length () < 2) +- return false; ++ return false; + } +- else +- gcc_unreachable (); + +- vec roots = vNULL; +- if (kind == slp_inst_kind_ctor) ++ /* For basic block SLP, try to merge the grouped stores and loads ++ into one group. */ ++ init_stmt_info_slp_transpose (bb_vinfo); ++ if (vect_transform_slp_grouped_stores (bb_vinfo) ++ && vect_merge_slp_grouped_loads (bb_vinfo)) + { +- roots.create (1); +- roots.quick_push (stmt_info); ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis succeeded with SLP transposed.\n"); ++ } ++ return true; + } +- /* Build the tree for the SLP instance. */ +- bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, +- roots, +- max_tree_size, limit, bst_map, +- kind == slp_inst_kind_store +- ? stmt_info : NULL); +- if (!res) +- roots.release (); +- +- /* ??? If this is slp_inst_kind_store and the above succeeded here's +- where we should do store group splitting. */ +- +- return res; ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis failed with SLP transposed.\n"); ++ } ++ return false; + } + + /* Check if there are stmts in the loop can be vectorized using SLP. 
Build SLP +@@ -4963,7 +5932,7 @@ vect_slp_analyze_operations (vec_info *vinfo) + /* Check we can vectorize the reduction. */ + || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc + && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))) +- { ++ { + slp_tree node = SLP_INSTANCE_TREE (instance); + stmt_vec_info stmt_info; + if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) +@@ -4975,7 +5944,7 @@ vect_slp_analyze_operations (vec_info *vinfo) + "removing SLP instance operations starting from: %G", + stmt_info->stmt); + vect_free_slp_instance (instance); +- vinfo->slp_instances.ordered_remove (i); ++ vinfo->slp_instances.ordered_remove (i); + cost_vec.release (); + while (!visited_vec.is_empty ()) + visited.remove (visited_vec.pop ()); +@@ -5204,7 +6173,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, + gimple *orig_stmt = orig_stmt_info->stmt; + + /* If there is a non-vectorized use of the defs then the scalar +- stmt is kept live in which case we do not account it or any ++ stmt is kept live in which case we do not account it or any + required defs in the SLP children in the scalar cost. This + way we make the vectorization more costly when compared to + the scalar cost. */ +@@ -5481,7 +6450,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, + + vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; + +- if (dump_enabled_p ()) ++ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost; ++ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost; ++ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost; ++ ++ if (!unlimited_cost_model (NULL) && dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Cost model analysis for part in loop %d:\n", sl); +@@ -5819,7 +6792,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) + { + if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: unhandled data-ref in basic " + "block.\n"); + return false; +@@ -5854,6 +6827,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + + vect_pattern_recog (bb_vinfo); + ++ /* Transpose grouped stores and loads for better vectorizable version. */ ++ if (bb_vinfo->transposed) ++ { ++ if (!vect_analyze_transpose (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: unhandled slp transposed in " ++ "basic block.\n"); ++ } ++ return false; ++ } ++ } ++ bb_vinfo->before_slp = true; ++ + /* Update store groups from pattern processing. */ + vect_fixup_store_groups_with_patterns (bb_vinfo); + +@@ -5872,6 +6861,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + return false; + } + ++ /* Check if the vectype is suitable for SLP transposed. */ ++ if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "Failed to SLP transposed in the basic block.\n"); ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: vectype is not suitable for " ++ "SLP transposed in basic block.\n"); ++ } ++ return false; ++ } ++ + /* Optimize permutations. */ + vect_optimize_slp (bb_vinfo); + +@@ -5914,6 +6917,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + if (! 
BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
+     return false;
+
++  /* Check if the alignment is suitable for SLP transposed.  */
++  if (bb_vinfo->transposed)
++    {
++      for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++)
++	{
++	  if (!check_for_dr_alignment (bb_vinfo, instance))
++	    {
++	      if (dump_enabled_p ())
++		{
++		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++				   "Failed to apply SLP transpose in the basic "
++				   "block.\n");
++		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++				   "not vectorized: alignment is not suitable "
++				   "for SLP transpose in basic block.\n");
++		}
++	      return false;
++	    }
++	}
++    }
++
+   if (!vect_slp_analyze_operations (bb_vinfo))
+     {
+       if (dump_enabled_p ())
+@@ -5923,7 +6947,88 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+     }
+
+   vect_bb_partition_graph (bb_vinfo);
++  return true;
++}
++
++static bool
++may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori,
++			   loop_p orig_loop)
++{
++  /* If the flag is false or the SLP analysis was broken before
++     vect_analyze_slp, we don't try to analyze the transposed SLP version.  */
++  if (!flag_tree_slp_transpose_vectorize
++      || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori))
++    {
++      return false;
++    }
++
++  /* If the original bb_vinfo can't be vectorized, try to build a bb_vinfo
++     of the transposed version.  */
++  if (!res_ori)
++    {
++      return true;
++    }
++
++  /* Calculate the cost of the original bb_vinfo.  */
++  if (unlimited_cost_model (NULL))
++    {
++      vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_ori);
++      vect_bb_vectorization_profitable_p (bb_vinfo_ori, instances, orig_loop);
++    }
++  /* If the vector cost and the scalar cost do not differ much (here with a
++     threshold factor of 4), try to build a bb_vinfo of the transposed version.  */
++  if (BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++      < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++	     + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori)))
++    {
++      return true;
++    }
++  return false;
++}
+
++static bool
++may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans,
++			      bb_vec_info bb_vinfo_ori, bool res_ori,
++			      loop_p orig_loop)
++{
++  /* The original bb_vinfo is chosen if the transposed bb_vinfo
++     can't be vectorized.  */
++  if (!res_trans)
++    {
++      return false;
++    }
++  /* Calculate the cost of the transposed bb_vinfo.  */
++  if (unlimited_cost_model (NULL))
++    {
++      vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_trans);
++      vect_bb_vectorization_profitable_p (bb_vinfo_trans, instances,
++					  orig_loop);
++    }
++  int diff_bb_cost = -1;
++  int diff_bb_cost_trans = -1;
++  if (res_ori)
++    {
++      diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++		     - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++		     - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori);
++    }
++  if (res_trans)
++    {
++      diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++			   - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++			   - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans);
++    }
++  /* The original bb_vinfo is chosen when one of the following conditions
++     is satisfied:
++     1) The cost of the original version is better than the transposed one.
++     2) The vector cost is similar to the scalar cost in the transposed version.
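++     E.g. (illustrative numbers): with scalar_cost 20 for both versions and
++     vector costs 12 (original) vs. 17 (transposed), diff_bb_cost = 8 and
++     diff_bb_cost_trans = 3, so condition 1) holds and the original version
++     is kept.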
*/ ++ if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans) ++ || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans) ++ <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans) ++ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans)))) ++ { ++ return false; ++ } + return true; + } + +@@ -5937,6 +7042,7 @@ vect_slp_region (vec bbs, vec datarefs, + loop_p orig_loop) + { + bb_vec_info bb_vinfo; ++ bb_vec_info bb_vinfo_trans = NULL; + auto_vector_modes vector_modes; + + /* Autodetect first vector size we try. */ +@@ -5951,6 +7057,10 @@ vect_slp_region (vec bbs, vec datarefs, + { + bool vectorized = false; + bool fatal = false; ++ bool res_bb_vinfo_ori = false; ++ bool res_bb_vinfo_trans = false; ++ ++ /* New a bb_vinfo of the original version. */ + bb_vinfo = new _bb_vec_info (bbs, &shared); + + bool first_time_p = shared.datarefs.is_empty (); +@@ -5960,8 +7070,113 @@ vect_slp_region (vec bbs, vec datarefs, + else + bb_vinfo->shared->check_datarefs (); + bb_vinfo->vector_mode = next_vector_mode; ++ bb_vinfo->transposed = false; ++ bb_vinfo->before_slp = false; ++ ++ res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, ++ dataref_groups); ++ auto_vec profitable_subgraphs; ++ auto_vec profitable_subgraphs_trans; ++ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo)) ++ { ++ if (instance->subgraph_entries.is_empty ()) ++ continue; ++ ++ vect_location = instance->location (); ++ if (!unlimited_cost_model (NULL) ++ && !vect_bb_vectorization_profitable_p ++ (bb_vinfo, instance->subgraph_entries, orig_loop)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: vectorization is not " ++ "profitable.\n"); ++ continue; ++ } ++ if (res_bb_vinfo_ori) ++ { ++ if (!dbg_cnt (vect_slp)) ++ continue; ++ profitable_subgraphs.safe_push (instance); ++ } ++ } ++ ++ /* Analyze and new a transposed bb_vinfo. 
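++	 The transposed candidate repeats the whole SLP analysis on a fresh
++	 _bb_vec_info with the transposed flag set; its profitable subgraphs
++	 are collected separately so that may_choose_transpose_bbvinfo can
++	 compare the two versions by cost afterwards.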
*/ ++ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori, orig_loop)) ++ { ++ bool fatal_trans = false; ++ bb_vinfo_trans ++ = new _bb_vec_info (bbs, &shared); ++ bool first_time_p = shared.datarefs.is_empty (); ++ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs; ++ if (first_time_p) ++ { ++ bb_vinfo_trans->shared->save_datarefs (); ++ } ++ else ++ { ++ bb_vinfo_trans->shared->check_datarefs (); ++ } ++ bb_vinfo_trans->vector_mode = next_vector_mode; ++ bb_vinfo_trans->transposed = true; ++ bb_vinfo_trans->before_slp = false; ++ ++ res_bb_vinfo_trans ++ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans, ++ dataref_groups); ++ if (res_bb_vinfo_trans) ++ { ++ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo_trans)) ++ { ++ if (instance->subgraph_entries.is_empty ()) ++ continue; ++ ++ vect_location = instance->location (); ++ if (!unlimited_cost_model (NULL) ++ && !vect_bb_vectorization_profitable_p ++ (bb_vinfo_trans, instance->subgraph_entries, orig_loop)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: transpose vectorization is not " ++ "profitable.\n"); ++ res_bb_vinfo_trans = false; ++ continue; ++ } ++ if (res_bb_vinfo_trans) ++ { ++ if (!dbg_cnt (vect_slp)) ++ continue; ++ profitable_subgraphs_trans.safe_push (instance); ++ } ++ } ++ } ++ if (may_choose_transpose_bbvinfo (bb_vinfo_trans, ++ res_bb_vinfo_trans, ++ bb_vinfo, res_bb_vinfo_ori, ++ orig_loop)) ++ { ++ bb_vinfo = bb_vinfo_trans; ++ fatal = fatal_trans; ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "using transposed version.\n"); ++ } ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "\n"); ++ } ++ } ++ } + +- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)) ++ if (res_bb_vinfo_ori || res_bb_vinfo_trans) + { + if (dump_enabled_p ()) + { +@@ -5972,90 +7187,129 @@ vect_slp_region (vec bbs, vec datarefs, + } + + bb_vinfo->shared->check_datarefs (); +- +- auto_vec profitable_subgraphs; +- for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo)) ++ if (!res_bb_vinfo_trans) + { +- if (instance->subgraph_entries.is_empty ()) +- continue; +- +- vect_location = instance->location (); +- if (!unlimited_cost_model (NULL) +- && !vect_bb_vectorization_profitable_p +- (bb_vinfo, instance->subgraph_entries, orig_loop)) ++ /* When we're vectorizing an if-converted loop body make sure ++ we vectorized all if-converted code. */ ++ if (!profitable_subgraphs.is_empty () ++ && orig_loop) + { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "not vectorized: vectorization is not " +- "profitable.\n"); +- continue; ++ gcc_assert (bb_vinfo->bbs.length () == 1); ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ /* The costing above left us with DCEable vectorized scalar ++ stmts having the visited flag set on profitable ++ subgraphs. Do the delayed clearing of the flag here. 
*/ ++ if (gimple_visited_p (gsi_stmt (gsi))) ++ { ++ gimple_set_visited (gsi_stmt (gsi), false); ++ continue; ++ } ++ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) ++ continue; ++ ++ if (gassign *ass = dyn_cast (gsi_stmt (gsi))) ++ if (gimple_assign_rhs_code (ass) == COND_EXPR) ++ { ++ if (!profitable_subgraphs.is_empty () ++ && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "not profitable because of " ++ "unprofitable if-converted scalar " ++ "code\n"); ++ profitable_subgraphs.truncate (0); ++ } ++ } + } + +- if (!dbg_cnt (vect_slp)) +- continue; ++ /* Finally schedule the profitable subgraphs. */ ++ for (slp_instance instance : profitable_subgraphs) ++ { ++ if (!vectorized && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block will be vectorized " ++ "using SLP\n"); ++ vectorized = true; + +- profitable_subgraphs.safe_push (instance); +- } ++ vect_schedule_slp (bb_vinfo, instance->subgraph_entries); + +- /* When we're vectorizing an if-converted loop body make sure +- we vectorized all if-converted code. */ +- if (!profitable_subgraphs.is_empty () +- && orig_loop) ++ unsigned HOST_WIDE_INT bytes; ++ if (dump_enabled_p ()) ++ { ++ if (GET_MODE_SIZE ++ (bb_vinfo->vector_mode).is_constant (&bytes)) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using %wu " ++ "byte vectors\n", bytes); ++ else ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using " ++ "variable length vectors\n"); ++ } ++ } ++ } ++ else + { +- gcc_assert (bb_vinfo->bbs.length () == 1); +- for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); +- !gsi_end_p (gsi); gsi_next (&gsi)) ++ if (!profitable_subgraphs_trans.is_empty () ++ && orig_loop) + { +- /* The costing above left us with DCEable vectorized scalar +- stmts having the visited flag set on profitable +- subgraphs. Do the delayed clearing of the flag here. */ +- if (gimple_visited_p (gsi_stmt (gsi))) ++ gcc_assert (bb_vinfo->bbs.length () == 1); ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); ++ !gsi_end_p (gsi); gsi_next (&gsi)) + { +- gimple_set_visited (gsi_stmt (gsi), false); +- continue; ++ /* The costing above left us with DCEable vectorized scalar ++ stmts having the visited flag set on profitable ++ subgraphs. Do the delayed clearing of the flag here. */ ++ if (gimple_visited_p (gsi_stmt (gsi))) ++ { ++ gimple_set_visited (gsi_stmt (gsi), false); ++ continue; ++ } ++ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) ++ continue; ++ ++ if (gassign *ass = dyn_cast (gsi_stmt (gsi))) ++ if (gimple_assign_rhs_code (ass) == COND_EXPR) ++ { ++ if (!profitable_subgraphs_trans.is_empty () ++ && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "not profitable because of " ++ "unprofitable if-converted scalar " ++ "code\n"); ++ profitable_subgraphs_trans.truncate (0); ++ } + } +- if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) +- continue; +- +- if (gassign *ass = dyn_cast (gsi_stmt (gsi))) +- if (gimple_assign_rhs_code (ass) == COND_EXPR) +- { +- if (!profitable_subgraphs.is_empty () +- && dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "not profitable because of " +- "unprofitable if-converted scalar " +- "code\n"); +- profitable_subgraphs.truncate (0); +- } + } +- } + +- /* Finally schedule the profitable subgraphs. 
*/ +- for (slp_instance instance : profitable_subgraphs) +- { +- if (!vectorized && dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "Basic block will be vectorized " +- "using SLP\n"); +- vectorized = true; ++ /* Finally schedule the profitable subgraphs. */ ++ for (slp_instance instance : profitable_subgraphs_trans) ++ { ++ if (!vectorized && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block will be vectorized " ++ "using SLP\n"); ++ vectorized = true; + +- vect_schedule_slp (bb_vinfo, instance->subgraph_entries); ++ vect_schedule_slp (bb_vinfo, instance->subgraph_entries); + +- unsigned HOST_WIDE_INT bytes; +- if (dump_enabled_p ()) +- { +- if (GET_MODE_SIZE +- (bb_vinfo->vector_mode).is_constant (&bytes)) +- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, +- "basic block part vectorized using %wu " +- "byte vectors\n", bytes); +- else +- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, +- "basic block part vectorized using " +- "variable length vectors\n"); ++ unsigned HOST_WIDE_INT bytes; ++ if (dump_enabled_p ()) ++ { ++ if (GET_MODE_SIZE ++ (bb_vinfo->vector_mode).is_constant (&bytes)) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using %wu " ++ "byte vectors\n", bytes); ++ else ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using " ++ "variable length vectors\n"); ++ } + } + } ++ + } + else + { +@@ -6081,6 +7335,10 @@ vect_slp_region (vec bbs, vec datarefs, + } + + delete bb_vinfo; ++ if (bb_vinfo_trans) ++ { ++ bb_vinfo_trans = NULL; ++ } + + if (mode_i < vector_modes.length () + && VECTOR_MODE_P (autodetected_vector_mode) +@@ -7244,10 +8502,17 @@ vect_schedule_slp_node (vec_info *vinfo, + ready early, vectorized stores go before the last scalar + stmt which is where all uses are ready. 
*/ + stmt_vec_info last_stmt_info = NULL; +- if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) +- last_stmt_info = vect_find_first_scalar_stmt_in_slp (node); +- else /* DR_IS_WRITE */ +- last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); ++ ++ if (DR_GROUP_FIRST_ELEMENT (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); ++ else ++ { ++ if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) ++ last_stmt_info = vect_find_first_scalar_stmt_in_slp (node); ++ else /* DR_IS_WRITE */ ++ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); ++ } + si = gsi_for_stmt (last_stmt_info->stmt); + } + else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type +diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc +index 349200411..3099f6743 100644 +--- a/gcc/tree-vect-stmts.cc ++++ b/gcc/tree-vect-stmts.cc +@@ -1369,10 +1369,10 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies, + + static void + vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt, +- gimple_stmt_iterator *gsi) ++ gimple_stmt_iterator *gsi, bool transpose=false) + { + if (gsi) +- vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi); ++ vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi, transpose); + else + vinfo->insert_on_entry (stmt_vinfo, new_stmt); + +@@ -1393,7 +1393,7 @@ vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt, + + tree + vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, +- gimple_stmt_iterator *gsi) ++ gimple_stmt_iterator *gsi, bool transpose) + { + gimple *init_stmt; + tree new_temp; +@@ -1418,7 +1418,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, + new_temp = make_ssa_name (TREE_TYPE (type)); + init_stmt = gimple_build_assign (new_temp, COND_EXPR, + val, true_val, false_val); +- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi); ++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose); + val = new_temp; + } + } +@@ -1437,7 +1437,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, + { + init_stmt = gsi_stmt (gsi2); + gsi_remove (&gsi2, false); +- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi); ++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose); + } + } + } +@@ -1446,7 +1446,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, + + new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_"); + init_stmt = gimple_build_assign (new_temp, val); +- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi); ++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose); + return new_temp; + } + +@@ -1572,9 +1572,11 @@ vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node, + statement and create and return a stmt_vec_info for it. 
*/ + + static void +-vect_finish_stmt_generation_1 (vec_info *, +- stmt_vec_info stmt_info, gimple *vec_stmt) ++vect_finish_stmt_generation_1 (vec_info *vinfo, ++ stmt_vec_info stmt_info, gimple *vec_stmt, bool transpose=false) + { ++ if (transpose) ++ stmt_vec_info vec_stmt_info = vinfo->add_pattern_stmt (vec_stmt, NULL); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt); + +@@ -1616,7 +1618,7 @@ vect_finish_replace_stmt (vec_info *vinfo, + void + vect_finish_stmt_generation (vec_info *vinfo, + stmt_vec_info stmt_info, gimple *vec_stmt, +- gimple_stmt_iterator *gsi) ++ gimple_stmt_iterator *gsi, bool transpose) + { + gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL); + +@@ -1648,7 +1650,7 @@ vect_finish_stmt_generation (vec_info *vinfo, + } + } + gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT); +- vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt); ++ vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt, transpose); + } + + /* We want to vectorize a call to combined function CFN with function +@@ -2159,6 +2161,173 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype) + return NULL_TREE; + } + ++/* Check succeedor BB, BB without load is regarded as empty BB. Ignore empty ++ BB in DFS. */ ++ ++static unsigned ++mem_refs_in_bb (basic_block bb, vec &stmts) ++{ ++ unsigned num = 0; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ if (is_gimple_debug (stmt)) ++ continue; ++ if (is_gimple_assign (stmt) && gimple_has_mem_ops (stmt) ++ && !gimple_has_volatile_ops (stmt)) ++ { ++ if (gimple_assign_rhs_code (stmt) == MEM_REF ++ || gimple_assign_rhs_code (stmt) == ARRAY_REF) ++ { ++ stmts.safe_push (stmt); ++ num++; ++ } ++ else if (TREE_CODE (gimple_get_lhs (stmt)) == MEM_REF ++ || TREE_CODE (gimple_get_lhs (stmt)) == ARRAY_REF) ++ num++; ++ } ++ } ++ return num; ++} ++ ++static bool ++check_same_base (vec *datarefs, data_reference_p dr) ++{ ++ for (unsigned ui = 0; ui < datarefs->length (); ui++) ++ { ++ tree op1 = TREE_OPERAND (DR_BASE_OBJECT (dr), 0); ++ tree op2 = TREE_OPERAND (DR_BASE_OBJECT ((*datarefs)[ui]), 0); ++ if (TREE_CODE (op1) != TREE_CODE (op2)) ++ continue; ++ if (TREE_CODE (op1) == ADDR_EXPR) ++ { ++ op1 = TREE_OPERAND (op1, 0); ++ op2 = TREE_OPERAND (op2, 0); ++ } ++ enum tree_code code = TREE_CODE (op1); ++ switch (code) ++ { ++ case VAR_DECL: ++ if (DECL_NAME (op1) == DECL_NAME (op2) ++ && DR_IS_READ ((*datarefs)[ui])) ++ return true; ++ break; ++ case SSA_NAME: ++ if (SSA_NAME_VERSION (op1) == SSA_NAME_VERSION (op2) ++ && DR_IS_READ ((*datarefs)[ui])) ++ return true; ++ break; ++ default: ++ break; ++ } ++ } ++ return false; ++} ++ ++/* Iterate all load STMTS, if staisfying same base vectorized stmt, then return, ++ Otherwise, set false to SUCCESS. 
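++   E.g., for a grouped store to a[] a candidate is a vectorized load in the
++   loop header whose block contains a read data-ref with the same base a[]
++   (see check_same_base); if one is found SUCCESS stays true, otherwise it
++   is set to false.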
*/ ++ ++static void ++check_vec_use (loop_vec_info loop_vinfo, vec &stmts, ++ stmt_vec_info stmt_info, bool &success) ++{ ++ if (stmt_info == NULL) ++ { ++ success = false; ++ return; ++ } ++ if (DR_IS_READ (stmt_info->dr_aux.dr)) ++ { ++ success = false; ++ return; ++ } ++ unsigned ui = 0; ++ gimple *candidate = NULL; ++ FOR_EACH_VEC_ELT (stmts, ui, candidate) ++ { ++ if (TREE_CODE (TREE_TYPE (gimple_get_lhs (candidate))) != VECTOR_TYPE) ++ continue; ++ ++ if (candidate->bb != candidate->bb->loop_father->header) ++ { ++ success = false; ++ return; ++ } ++ auto_vec datarefs; ++ tree res = find_data_references_in_bb (candidate->bb->loop_father, ++ candidate->bb, &datarefs); ++ if (res == chrec_dont_know) ++ { ++ success = false; ++ return; ++ } ++ if (check_same_base (&datarefs, stmt_info->dr_aux.dr)) ++ return; ++ } ++ success = false; ++} ++ ++/* Deep first search from present BB. If succeedor has load STMTS, ++ stop further searching. */ ++ ++static void ++dfs_check_bb (loop_vec_info loop_vinfo, basic_block bb, stmt_vec_info stmt_info, ++ bool &success, vec &visited_bbs) ++{ ++ if (bb == cfun->cfg->x_exit_block_ptr) ++ { ++ success = false; ++ return; ++ } ++ if (!success || visited_bbs.contains (bb) || bb == loop_vinfo->loop->latch) ++ return; ++ ++ visited_bbs.safe_push (bb); ++ auto_vec stmts; ++ unsigned num = mem_refs_in_bb (bb, stmts); ++ /* Empty BB. */ ++ if (num == 0) ++ { ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ dfs_check_bb (loop_vinfo, e->dest, stmt_info, success, visited_bbs); ++ if (!success) ++ return; ++ } ++ return; ++ } ++ /* Non-empty BB. */ ++ check_vec_use (loop_vinfo, stmts, stmt_info, success); ++} ++ ++/* For grouped store, if all succeedors of present BB have vectorized load ++ from same base of store. If so, set memory_access_type using ++ VMAT_CONTIGUOUS_PERMUTE instead of VMAT_LOAD_STORE_LANES. */ ++ ++static bool ++conti_perm (stmt_vec_info stmt_vinfo, loop_vec_info loop_vinfo) ++{ ++ gimple *stmt = stmt_vinfo->stmt; ++ if (gimple_code (stmt) != GIMPLE_ASSIGN) ++ return false; ++ ++ if (DR_IS_READ (stmt_vinfo->dr_aux.dr)) ++ return false; ++ ++ basic_block bb = stmt->bb; ++ bool success = true; ++ auto_vec visited_bbs; ++ visited_bbs.safe_push (bb); ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ dfs_check_bb (loop_vinfo, e->dest, stmt_vinfo, success, visited_bbs); ++ return success; ++} ++ + /* A subroutine of get_load_store_type, with a subset of the same + arguments. Handle the case where STMT_INFO is part of a grouped load + or store. +@@ -2373,6 +2542,20 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, + *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; + overrun_p = would_overrun_p; + } ++ ++ if (*memory_access_type == VMAT_LOAD_STORE_LANES ++ && TREE_CODE (loop_vinfo->num_iters) == INTEGER_CST ++ && maybe_eq (tree_to_shwi (loop_vinfo->num_iters), ++ loop_vinfo->vectorization_factor) ++ && conti_perm (stmt_info, loop_vinfo) ++ && (vls_type == VLS_LOAD ++ ? vect_grouped_load_supported (vectype, single_element_p, ++ group_size) ++ : vect_grouped_store_supported (vectype, group_size))) ++ { ++ *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; ++ overrun_p = would_overrun_p; ++ } + } + + /* As a last resort, trying using a gather load or scatter store. +@@ -7456,6 +7639,154 @@ vectorizable_scan_store (vec_info *vinfo, + return true; + } + ++/* Function vect_permute_store_chains ++ ++ Call function vect_permute_store_chain (). 
++ Given a chain of interleaved stores in DR_CHAIN, generate ++ interleave_high/low stmts to reorder the data correctly. ++ Return the final references for stores in RESULT_CHAIN. */ ++ ++static void ++vect_permute_store_chains (vec_info *vinfo, vec dr_chain, ++ unsigned int num_each, stmt_vec_info stmt_info, ++ gimple_stmt_iterator *gsi, vec *result_chain, ++ unsigned int group) ++{ ++ unsigned int k = 0; ++ unsigned int t = 0; ++ ++ /* Divide vectors into GROUP parts. And permute every NUM_EACH vectors ++ together. */ ++ for (k = 0; k < group; k++) ++ { ++ auto_vec dr_chain_transposed (num_each); ++ auto_vec result_chain_transposed (num_each); ++ for (t = k; t < dr_chain.length (); t = t + group) ++ { ++ dr_chain_transposed.quick_push (dr_chain[t]); ++ } ++ vect_permute_store_chain (vinfo, dr_chain_transposed, num_each, ++ stmt_info, gsi, &result_chain_transposed); ++ for (t = 0; t < num_each; t++) ++ { ++ result_chain->quick_push (result_chain_transposed[t]); ++ } ++ } ++} ++ ++/* Function transpose_oprnd_store ++ ++ Calculate the transposed results from VEC_OPRNDS (VEC_STMT) ++ for vectorizable_store. */ ++ ++static void ++transpose_oprnd_store (vec_info *vinfo, vecvec_oprnds, ++ vec *result_chain, unsigned int vec_num, ++ unsigned int const_nunits, unsigned int array_num, ++ stmt_vec_info first_stmt_info, ++ gimple_stmt_iterator *gsi) ++{ ++ unsigned int group_for_transform = 0; ++ unsigned int num_each = 0; ++ ++ /* Transpose back for vec_oprnds. */ ++ /* vec = {vec1, vec2, ...} */ ++ if (array_num < const_nunits ++ && const_nunits % array_num == 0) ++ { ++ vect_transpose_store_chain (vinfo, vec_oprnds, ++ vec_num, array_num, ++ first_stmt_info, ++ gsi, result_chain); ++ } ++ /* vec1 = {vec_part1}, vec2 = {vec_part2}, ... */ ++ else if (array_num >= const_nunits ++ && array_num % const_nunits == 0) ++ { ++ group_for_transform = array_num / const_nunits; ++ num_each = vec_oprnds.length () / group_for_transform; ++ vect_permute_store_chains (vinfo, vec_oprnds, ++ num_each, first_stmt_info, ++ gsi, result_chain, ++ group_for_transform); ++ } ++ else ++ { ++ gcc_unreachable (); ++ } ++} ++ ++static dr_vec_info * ++get_dr_info (stmt_vec_info stmt_info) ++{ ++ dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); ++ if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED) ++ { ++ SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); ++ } ++ return dr_info; ++} ++ ++static unsigned ++dr_align_vect_store (vec_info *vinfo, dr_vec_info *cur_first_dr_info, ++ tree vectype, unsigned HOST_WIDE_INT &align) ++{ ++ unsigned misalign = 0; ++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); ++ if (aligned_access_p (cur_first_dr_info, vectype)) ++ { ++ return misalign; ++ } ++ else if (cur_first_dr_info->misalignment == -1) ++ { ++ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info)); ++ } ++ else ++ { ++ misalign = cur_first_dr_info->misalignment; ++ } ++ return misalign; ++} ++ ++static void ++add_new_stmt_vect_store (vec_info *vinfo, tree vectype, tree dataref_ptr, ++ tree dataref_offset, tree ref_type, ++ dr_vec_info *cur_first_dr_info, tree vec_oprnd, ++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) ++{ ++ /* Data align. */ ++ unsigned HOST_WIDE_INT align; ++ unsigned misalign = dr_align_vect_store (vinfo, cur_first_dr_info, ++ vectype, align); ++ ++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) ++ { ++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); ++ } ++ ++ /* Get data_ref. 
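++     The MEM_REF is typed to reflect what is actually known: nothing
++     special when the access is known to be aligned, a build_aligned_type
++     variant with the derived alignment when the misalignment is unknown
++     (-1), and only the element alignment otherwise.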
*/ ++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); ++ tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset); ++ if (aligned_access_p (cur_first_dr_info, vectype)) ++ { ++ ; ++ } ++ else if (cur_first_dr_info->misalignment == -1) ++ { ++ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), ++ align * BITS_PER_UNIT); ++ } ++ else ++ { ++ tree elem_type = TREE_TYPE (vectype); ++ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), ++ TYPE_ALIGN (elem_type)); ++ } ++ /* Add new stmt. */ ++ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr)); ++ gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd); ++ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true); ++} + + /* Function vectorizable_store. + +@@ -8333,6 +8664,16 @@ vectorizable_store (vec_info *vinfo, + &vec_offsets); + vec_offset = vec_offsets[0]; + } ++ /* If the stmt_info need to be transposed recovery, dataref_ptr ++ will be caculated later. */ ++ else if (memory_access_type == VMAT_CONTIGUOUS ++ && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE ( ++ DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ dataref_ptr = NULL_TREE; ++ } + else + dataref_ptr + = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type, +@@ -8423,6 +8764,75 @@ vectorizable_store (vec_info *vinfo, + } + else + { ++ /* group_size: the size of group after transposing and merging. ++ group_size_b: the size of group before transposing and merging, ++ and only group_size_b >= const_nunits is supported. ++ array_num: the number of arrays. ++ const_nunits: TYPE_VECTOR_SUBPARTS (vectype). ++ ncontinues: group_size_b / const_nunits, it means the number of ++ times an array is stored in memory. */ ++ if (slp && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "vectorizable_store for slp transpose.\n"); ++ } ++ /* Transpose back for grouped stores. */ ++ vect_transform_back_slp_grouped_stores (bb_vinfo, ++ first_stmt_info); ++ ++ result_chain.create (vec_oprnds.length ()); ++ unsigned int const_nunits = nunits.to_constant (); ++ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info); ++ unsigned int array_num = group_size / group_size_b; ++ transpose_oprnd_store (vinfo, vec_oprnds, &result_chain, vec_num, ++ const_nunits, array_num, ++ first_stmt_info, gsi); ++ ++ /* For every store group, not for every vec, because transposing ++ and merging have changed the data reference access. */ ++ gcc_assert (group_size_b >= const_nunits); ++ unsigned int ncontinues = group_size_b / const_nunits; ++ ++ unsigned int k = 0; ++ for (i = 0; i < array_num; i++) ++ { ++ stmt_vec_info first_stmt_b; ++ BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b); ++ bool simd_lane_access_p ++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0; ++ tree ref_type = get_group_alias_ptr_type (first_stmt_b); ++ dataref_ptr = vect_create_data_ref_ptr ( ++ vinfo, first_stmt_b, aggr_type, ++ simd_lane_access_p ? loop : NULL, ++ offset, &dummy, gsi, &ptr_incr, ++ simd_lane_access_p, bump); ++ dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b); ++ for (unsigned int t = 0; t < ncontinues; t++) ++ { ++ vec_oprnd = result_chain[k]; ++ k++; ++ if (t > 0) ++ { ++ /* Bump the vector pointer. 
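++		     For the second and subsequent vectors of the same
++		     array (t > 0), dataref_ptr advances by one vector
++		     width so consecutive vectors are stored back to back.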
*/ ++ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ++ ptr_incr, gsi, ++ first_stmt_b, bump); ++ } ++ add_new_stmt_vect_store (vinfo, vectype, dataref_ptr, ++ dataref_offset, ref_type, ++ cur_first_dr_info, vec_oprnd, ++ gsi, first_stmt_b); ++ } ++ } ++ oprnds.release (); ++ result_chain.release (); ++ vec_oprnds.release (); ++ return true; ++ } + new_stmt = NULL; + if (grouped_store) + { +@@ -8719,6 +9129,451 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop) + return true; + } + ++static tree ++calculate_new_type (tree vectype, unsigned int const_nunits, ++ unsigned int group_size_b, unsigned int &nloads, ++ unsigned int &ncontinues, tree &lvectype) ++{ ++ tree ltype = TREE_TYPE (vectype); ++ /* nloads is the number of ARRAYs in a vector. ++ vectemp = {a[], b[], ...} */ ++ if (group_size_b < const_nunits) ++ { ++ tree ptype; ++ tree vtype ++ = vector_vector_composition_type (vectype, ++ const_nunits / group_size_b, ++ &ptype); ++ if (vtype != NULL_TREE) ++ { ++ nloads = const_nunits / group_size_b; ++ lvectype = vtype; ++ ltype = ptype; ++ ncontinues = 1; ++ } ++ } ++ /* ncontinues is the number of vectors from an ARRAY. ++ vectemp1 = {a[0], a[1], ...} ++ ... ++ vectempm = {a[k], a[k+1], ...} */ ++ else ++ { ++ nloads = 1; ++ ltype = vectype; ++ ncontinues = group_size_b / const_nunits; ++ } ++ ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype))); ++ return ltype; ++} ++ ++static void ++generate_old_load_permutations (slp_tree slp_node, unsigned int group_size, ++ vec &old_load_permutation) ++{ ++ /* Generate the old load permutations from the slp_node. */ ++ unsigned i = 0; ++ unsigned k = 0; ++ ++ /* If SLP_NODE has load_permutation, we copy it to old_load_permutation. ++ Otherwise, we generate a permutation sequentially. */ ++ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) ++ { ++ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k) ++ { ++ old_load_permutation.safe_push (k); ++ } ++ } ++ else ++ { ++ for (unsigned i = 0; i < group_size; i++) ++ { ++ old_load_permutation.safe_push (i); ++ } ++ } ++} ++ ++static void ++generate_new_load_permutation_mapping (unsigned slp_node_length, ++ vec &group_idx, ++ const vec &load_permutation, ++ unsigned int group_size_b, ++ unsigned &new_group_size, ++ vec &group_from) ++{ ++ /* group_num_vec: only stores the group_loads IDs which are caculated from ++ load_permutation. */ ++ auto_vec group_num_vec; ++ ++ /* Caculate which group_loads are the stmts in SLP_NODE from. */ ++ unsigned i = 0; ++ unsigned k = 0; ++ FOR_EACH_VEC_ELT (load_permutation, i, k) ++ { ++ unsigned int t0 = k / group_size_b; ++ if (!group_num_vec.contains (t0)) ++ { ++ group_num_vec.safe_push (t0); ++ } ++ group_from.safe_push (t0); ++ } ++ group_num_vec.qsort (cmp_for_group_num); ++ /* n_groups: the number of group_loads. */ ++ unsigned int n_groups = group_num_vec.length (); ++ new_group_size = n_groups * group_size_b; ++ for (i = 0; i < n_groups; i++) ++ { ++ group_idx.safe_push (group_num_vec[i] * group_size_b); ++ } ++ /* A new mapping from group_ind_vec to group_from. 
++     For example:
++     Origin: group_from = {1,1,3,3,5,5,7,7};
++     After mapping: group_from = {0,0,1,1,2,2,3,3};  */
++  auto_vec<unsigned> group_ind_vec (n_groups);
++  for (k = 0; k < n_groups; k++)
++    {
++      group_ind_vec.safe_push (k);
++    }
++  for (i = 0; i < slp_node_length; i++)
++    {
++      for (k = 0; k < n_groups; k++)
++	{
++	  if (group_from[i] == group_num_vec[k])
++	    {
++	      group_from[i] = group_ind_vec[k];
++	      break;
++	    }
++	}
++    }
++}
++
++static void
++generate_new_load_permutation (vec<unsigned> &new_load_permutation,
++			       const vec<unsigned> &old_load_permutation,
++			       slp_tree slp_node, bool &this_load_permuted,
++			       const vec<unsigned> &group_from,
++			       unsigned int group_size_b)
++{
++  unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++  /* Generate the new load permutation from the new mapping.  */
++  new_load_permutation.create (slp_node_length);
++  unsigned i = 0;
++  unsigned k = 0;
++  FOR_EACH_VEC_ELT (old_load_permutation, i, k)
++    {
++      /* t1 is the new position of element k of the old permutation.
++	 t1 = base_address + offset:
++	 base_address = group_from[i] * group_size_b;
++	 offset = k % group_size_b.  */
++      unsigned int t1
++	= group_from[i] * group_size_b + k % group_size_b;
++      new_load_permutation.safe_push (t1);
++      if (t1 != k)
++	{
++	  this_load_permuted = true;
++	}
++    }
++}
++
++static bool
++is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits,
++	     unsigned int group_size, stmt_vec_info first_stmt_info)
++{
++  /* Calculate the unrolling factor based on the smallest type.  */
++  poly_uint64 unrolling_factor
++    = exact_div (common_multiple (nunits, group_size), group_size);
++  /* The load requires permutation when unrolling exposes
++     a gap either because the group is larger than the SLP
++     group-size or because there is a gap between the groups.  */
++  if (!slp_perm && !this_load_permuted
++      && (known_eq (unrolling_factor, 1U)
++	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
++	      && DR_GROUP_GAP (first_stmt_info) == 0)))
++    {
++      return false;
++    }
++  else
++    {
++      return true;
++    }
++}
++
++static void
++generate_load_permutation (slp_tree slp_node, unsigned &new_group_size,
++			   unsigned int group_size, unsigned int group_size_b,
++			   bool &this_load_permuted, vec<unsigned> &group_idx,
++			   vec<unsigned> &new_load_permutation)
++{
++  /* Generate the old load permutations from SLP_NODE.  */
++  vec<unsigned> old_load_permutation;
++  old_load_permutation.create (group_size);
++  generate_old_load_permutations (slp_node, group_size, old_load_permutation);
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++  unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++  /* group_from: stores the group_loads ID for every stmt in SLP_NODE.  */
++  vec<unsigned> group_from;
++  group_from.create (slp_node_length);
++  generate_new_load_permutation_mapping (slp_node_length, group_idx,
++					 old_load_permutation,
++					 group_size_b, new_group_size,
++					 group_from);
++
++  /* Generate the new load permutation from the new mapping and calculate
++     the this_load_permuted flag.  If this_load_permuted is true, we need to
++     execute the SLP permutation using the new load permutation.
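++     E.g. (worked example): old_load_permutation = {4, 5, 0, 1} with
++     group_size_b = 2 comes from group_loads 2 and 0; after sorting and
++     mapping, group_from = {1, 1, 0, 0} and new_group_size = 4, so
++     t1 = group_from[i] * 2 + k % 2 yields {2, 3, 0, 1} != {4, 5, 0, 1}
++     and this_load_permuted is set.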
*/ ++ generate_new_load_permutation (new_load_permutation, old_load_permutation, ++ slp_node, this_load_permuted, group_from, ++ group_size_b); ++ old_load_permutation.release (); ++ group_from.release (); ++} ++ ++static unsigned int ++dr_align_vect_load (vec_info *vinfo, dr_vec_info *cur_first_dr_info, ++ tree vectype, unsigned HOST_WIDE_INT &align, ++ enum dr_alignment_support alignment_support_scheme) ++{ ++ unsigned int misalign = 0; ++ ++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ gcc_assert (aligned_access_p (cur_first_dr_info, vectype)); ++ } ++ else if (cur_first_dr_info->misalignment == -1) ++ { ++ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info)); ++ } ++ else ++ { ++ misalign = cur_first_dr_info->misalignment; ++ } ++ return misalign; ++} ++ ++static stmt_vec_info ++add_new_stmt_vect_load (vec_info *vinfo, tree vectype, tree dataref_ptr, ++ tree dataref_offset, tree ref_type, tree ltype, ++ gassign *(&new_stmt), dr_vec_info *cur_first_dr_info, ++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) ++{ ++ /* Data align. */ ++ int malign = dr_misalignment (cur_first_dr_info, vectype); ++ enum dr_alignment_support alignment_support_scheme ++ = vect_supportable_dr_alignment (vinfo, cur_first_dr_info, ++ vectype, malign); ++ unsigned HOST_WIDE_INT align; ++ unsigned int misalign = dr_align_vect_load (vinfo, cur_first_dr_info, ++ vectype, align, ++ alignment_support_scheme); ++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) ++ { ++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); ++ } ++ ++ /* Get data_ref. */ ++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); ++ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ ; ++ } ++ else if (cur_first_dr_info->misalignment == -1) ++ { ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT); ++ } ++ else ++ { ++ tree elem_type = TREE_TYPE (vectype); ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type)); ++ } ++ ++ /* Add new stmt. 
*/
++  vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++  new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
++  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++  stmt_vec_info vec_stmt_info = vinfo->lookup_stmt (new_stmt);
++  return vec_stmt_info;
++}
++
++static void
++push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info,
++			   vec<tree> dr_chain, slp_tree slp_node)
++{
++  if (slp_perm)
++    dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt));
++  else
++    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info->stmt);
++}
++
++static stmt_vec_info
++get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info,
++				      unsigned int group_el,
++				      unsigned int group_size)
++{
++  stmt_vec_info last_stmt_info = first_stmt_info;
++  unsigned int count = 0;
++  gcc_assert (group_el < group_size);
++  while (count < group_el)
++    {
++      last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info);
++      count++;
++    }
++  return last_stmt_info;
++}
++
++static stmt_vec_info
++add_new_stmt_for_nloads_greater_than_one (vec_info *vinfo, tree lvectype,
++					  tree vectype,
++					  vec<constructor_elt, va_gc> *v,
++					  stmt_vec_info stmt_info,
++					  gimple_stmt_iterator *gsi)
++{
++  tree vec_inv = build_constructor (lvectype, v);
++  tree new_temp = vect_init_vector (vinfo, stmt_info, vec_inv, lvectype, gsi, true);
++  stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp);
++  if (lvectype != vectype)
++    {
++      gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype),
++					       VIEW_CONVERT_EXPR,
++					       build1 (VIEW_CONVERT_EXPR,
++						       vectype, new_temp));
++      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++      new_stmt_info = vinfo->lookup_stmt (new_stmt);
++    }
++  return new_stmt_info;
++}
++
++/* Function new_vect_stmt_for_nloads.
++
++   Build a VEC_STMT when NLOADS arrays are merged into one vector.
++
++   ncopies is the number of vectors that need to be loaded from memory.
++   nloads is the number of ARRAYs in a vector.
++   vectemp = {a[], b[], ...}  */
++
++static void
++new_vect_stmt_for_nloads (vec_info *vinfo, unsigned int ncopies,
++			  unsigned int nloads, const vec<unsigned> &group_idx,
++			  stmt_vec_info stmt_info, offset_info *offset_info,
++			  vectype_info *vectype_info,
++			  vect_memory_access_type memory_access_type,
++			  bool slp_perm, vec<tree> dr_chain, slp_tree slp_node,
++			  gimple_stmt_iterator *gsi)
++{
++  vec<constructor_elt, va_gc> *v = NULL;
++  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++  stmt_vec_info first_stmt_info_b = NULL;
++  stmt_vec_info new_stmt_info = NULL;
++  tree dataref_ptr = NULL_TREE;
++  tree dummy;
++  gimple *ptr_incr = NULL;
++  unsigned int n = 0;
++  for (unsigned int i = 0; i < ncopies; i++)
++    {
++      vec_alloc (v, nloads);
++      for (unsigned int t = 0; t < nloads; t++)
++	{
++	  first_stmt_info_b = get_first_stmt_info_before_transpose (
++				first_stmt_info, group_idx[n++], group_size);
++	  dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_info_b);
++	  tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
++						   vectype_info->ltype,
++						   memory_access_type);
++	  bool simd_lane_access_p
++	    = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++
++	  /* Create dataref_ptr, which points to init_address.
*/ ++ dataref_ptr = vect_create_data_ref_ptr ( ++ vinfo, first_stmt_info_b, vectype_info->ltype, NULL, ++ offset_info->offset, &dummy, gsi, &ptr_incr, ++ simd_lane_access_p, bump); ++ ++ gassign *new_stmt = NULL; ++ new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr, ++ offset_info->dataref_offset, ++ vectype_info->ref_type, vectype_info->ltype, ++ new_stmt, cur_first_dr_info, gsi, ++ first_stmt_info_b); ++ ++ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt)); ++ } ++ new_stmt_info = add_new_stmt_for_nloads_greater_than_one ( ++ vinfo, vectype_info->lvectype, ++ vectype_info->vectype, v, ++ first_stmt_info_b, gsi); ++ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info, ++ dr_chain, slp_node); ++ } ++} ++ ++/* Function new_vect_stmt_for_ncontinues. ++ ++ New a VEC_STMTs when an Array is divided into several vectors. ++ ++ n_groups is the number of ARRAYs. ++ ncontinues is the number of vectors from an ARRAY. ++ vectemp1 = {a[0], a[1], ...} ++ ... ++ vectempm = {a[k], a[k+1], ...} */ ++ ++static void ++new_vect_stmt_for_ncontinues (vec_info *vinfo, unsigned int ncontinues, ++ const vec &group_idx, ++ stmt_vec_info stmt_info, ++ offset_info* offset_info, ++ vectype_info* vectype_info, ++ vect_memory_access_type memory_access_type, ++ bool slp_perm, vec &dr_chain, ++ slp_tree slp_node, ++ gimple_stmt_iterator *gsi) ++{ ++ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); ++ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info); ++ stmt_vec_info new_stmt_info = NULL; ++ tree dataref_ptr = NULL_TREE; ++ tree dummy; ++ gimple *ptr_incr = NULL; ++ unsigned int n_groups = group_idx.length (); ++ for (unsigned int i = 0; i < n_groups; i++) ++ { ++ stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose ( ++ first_stmt_info, group_idx[i], group_size); ++ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b); ++ tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info, ++ vectype_info->ltype, memory_access_type); ++ bool simd_lane_access_p ++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0; ++ for (unsigned int k = 0; k < ncontinues; k++) ++ { ++ /* Create dataref_ptr which is point to init_address. */ ++ if (k == 0) ++ { ++ dataref_ptr = vect_create_data_ref_ptr ( ++ vinfo, first_stmt_info_b, vectype_info->ltype, NULL, ++ offset_info->offset, &dummy, gsi, &ptr_incr, ++ simd_lane_access_p, bump); ++ } ++ else ++ { ++ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, ++ gsi, first_stmt_info_b, bump); ++ } ++ gassign *new_stmt = NULL; ++ new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr, ++ offset_info->dataref_offset, ++ vectype_info->ref_type, vectype_info->ltype, ++ new_stmt, cur_first_dr_info, gsi, ++ first_stmt_info_b); ++ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info, ++ dr_chain, slp_node); ++ } ++ } ++} ++ + /* vectorizable_load. + + Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure) +@@ -9338,6 +10193,8 @@ vectorizable_load (vec_info *vinfo, + if (bb_vinfo) + first_stmt_info_for_drptr + = vect_find_first_scalar_stmt_in_slp (slp_node); ++ // first_stmt_info_for_drptr = SLP_TREE_SCALAR_STMTS (slp_node)[0]; ++ + + /* Check if the chain of loads is already vectorized. 
*/ + if (STMT_VINFO_VEC_STMTS (first_stmt_info).exists () +@@ -9601,6 +10458,9 @@ vectorizable_load (vec_info *vinfo, + } + tree vec_mask = NULL_TREE; + poly_uint64 group_elt = 0; ++ unsigned new_group_size = 0; ++ vec new_load_permutation; ++ + for (j = 0; j < ncopies; j++) + { + /* 1. Create the vector or array pointer update chain. */ +@@ -9621,6 +10481,15 @@ vectorizable_load (vec_info *vinfo, + dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr)); + dataref_offset = build_int_cst (ref_type, 0); + } ++ /* If the stmt_info need to be transposed recovery, dataref_ptr ++ will be caculated later. */ ++ else if (slp && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE ( ++ DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ dataref_ptr = NULL_TREE; ++ } + else if (diff_first_stmt_info) + { + dataref_ptr +@@ -9731,6 +10600,63 @@ vectorizable_load (vec_info *vinfo, + /* Record that VEC_ARRAY is now dead. */ + vect_clobber_variable (vinfo, stmt_info, gsi, vec_array); + } ++ else if (slp && is_a (vinfo) ++ && STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "vectorizable_load for slp transpose.\n"); ++ } ++ /* group_size: the size of group after merging. ++ group_size_b: the size of group before merging. ++ const_nunits: TYPE_VECTOR_SUBPARTS (vectype), it is the number of ++ elements in a vector. ++ nloads: const_nunits / group_size_b or 1, it means the number ++ of ARRAYs in a vector. ++ ncontinues: group_size_b / const_nunits or 1, it means the number ++ of vectors from an ARRAY. */ ++ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info); ++ unsigned int const_nunits = nunits.to_constant (); ++ unsigned int nloads = const_nunits; ++ unsigned int ncontinues = group_size_b; ++ tree lvectype = vectype; ++ tree ltype = calculate_new_type (vectype, const_nunits, ++ group_size_b, nloads, ++ ncontinues, lvectype); ++ bool this_load_permuted = false; ++ auto_vec group_idx; ++ generate_load_permutation (slp_node, new_group_size, group_size, ++ group_size_b, this_load_permuted, ++ group_idx, new_load_permutation); ++ slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits, ++ group_size, first_stmt_info); ++ ++ /* ncopies: the number of vectors that need to be loaded from ++ memmory. */ ++ unsigned int ncopies = new_group_size / const_nunits; ++ offset_info offset_info = {offset, NULL_TREE, dataref_offset}; ++ vectype_info vectype_info = {vectype, ltype, lvectype, ref_type}; ++ if (slp_perm) ++ { ++ dr_chain.create (ncopies); ++ } ++ if (nloads > 1 && ncontinues == 1) ++ { ++ new_vect_stmt_for_nloads (vinfo, ncopies, nloads, group_idx, ++ stmt_info, &offset_info, &vectype_info, ++ memory_access_type, slp_perm, dr_chain, ++ slp_node, gsi); ++ } ++ else ++ { ++ new_vect_stmt_for_ncontinues (vinfo, ncontinues, group_idx, ++ stmt_info, &offset_info, ++ &vectype_info, memory_access_type, ++ slp_perm, dr_chain, slp_node, gsi); ++ } ++ } + else + { + for (i = 0; i < vec_num; i++) +@@ -10177,7 +11103,32 @@ vectorizable_load (vec_info *vinfo, + if (slp && !slp_perm) + continue; + +- if (slp_perm) ++ /* Using the new load permutation to generate vector permute statements ++ from a list of loads in DR_CHAIN. 
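++	 DR_GROUP_SIZE and SLP_TREE_LOAD_PERMUTATION are temporarily switched
++	 to the transposed values around vect_transform_slp_perm_load and
++	 restored afterwards; e.g. with new_load_permutation = {2, 3, 0, 1}
++	 the generated permute takes the lanes of the second loaded vector
++	 before those of the first.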
++      if (slp && slp_perm && is_a <bb_vec_info> (vinfo)
++	  && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	  && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  unsigned n_perms;
++	  stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0];
++	  unsigned int old_size = DR_GROUP_SIZE (stmt_info);
++	  DR_GROUP_SIZE (stmt_info_) = new_group_size;
++	  vec<unsigned> old_load_permutation
++	    = SLP_TREE_LOAD_PERMUTATION (slp_node);
++	  SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation;
++	  bool perm_load_success = vect_transform_slp_perm_load (
++		vinfo, slp_node, dr_chain, gsi, vf,
++		false, &n_perms);
++	  DR_GROUP_SIZE (stmt_info_) = old_size;
++	  SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation;
++	  new_load_permutation.release ();
++	  if (!perm_load_success)
++	    {
++	      dr_chain.release ();
++	      return false;
++	    }
++	}
++      else if (slp_perm)
+ 	{
+ 	  unsigned n_perms;
+ 	  /* For SLP we know we've seen all possible uses of dr_chain so
+diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+index 642eb0aeb..e13bc6c99 100644
+--- a/gcc/tree-vectorizer.h
++++ b/gcc/tree-vectorizer.h
+@@ -412,6 +412,21 @@ public:
+   vec<ddr_p> ddrs;
+ };
+
++/* Information about offsets in vectorizable_load.  */
++struct offset_info {
++  tree offset;
++  tree byte_offset;
++  tree dataref_offset;
++};
++
++/* Information about vectype in vectorizable_load.  */
++struct vectype_info {
++  tree vectype;
++  tree ltype;
++  tree lvectype;
++  tree ref_type;
++};
++
+ /* Vectorizer state common between loop and basic-block vectorization.  */
+ class vec_info {
+ public:
+@@ -455,6 +470,14 @@ public:
+      stmt in the chain.  */
+   auto_vec<stmt_vec_info> grouped_stores;
+
++  /* All interleaving chains of loads, represented by the first
++     stmt in the chain.  */
++  auto_vec<stmt_vec_info> grouped_loads;
++
++  /* All interleaving chains of stores (before being transposed),
++     represented by all stmts in the chain.  */
++  auto_vec<vec<stmt_vec_info> > scalar_stores;
++
+   /* The set of vector modes used in the vectorized region.  */
+   mode_set used_vector_modes;
+
+@@ -899,6 +922,8 @@ public:
+ #define LOOP_VINFO_CHECK_NONZERO(L)        (L)->check_nonzero
+ #define LOOP_VINFO_LOWER_BOUNDS(L)         (L)->lower_bounds
+ #define LOOP_VINFO_GROUPED_STORES(L)       (L)->grouped_stores
++#define LOOP_VINFO_GROUPED_LOADS(L)        (L)->grouped_loads
++#define LOOP_VINFO_SCALAR_STORES(L)        (L)->scalar_stores
+ #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
+ #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
+ #define LOOP_VINFO_REDUCTIONS(L)           (L)->reductions
+@@ -982,6 +1007,25 @@ public:
+   vec<basic_block> bbs;
+
+   vec<slp_root> roots;
++
++  /* True if bb_vinfo can go to vect_analyze_slp.  */
++  bool before_slp;
++
++  /* True if bb_vinfo is a transposed version.  */
++  bool transposed;
++
++  /* The number of transposed groups.  */
++  int transposed_group;
++
++  /* The cost of the scalar iterations.  */
++  int scalar_cost;
++
++  /* The cost of the vector prologue and epilogue, including peeled
++     iterations and set-up code.  */
++  int vec_outside_cost;
++
++  /* The cost of the vector loop body.  */
++  int vec_inside_cost;
+ } *bb_vec_info;
+
+ #define BB_VINFO_BB(B)               (B)->bb
+@@ -989,6 +1033,14 @@ public:
+ #define BB_VINFO_SLP_INSTANCES(B)    (B)->slp_instances
+ #define BB_VINFO_DATAREFS(B)         (B)->shared->datarefs
+ #define BB_VINFO_DDRS(B)             (B)->shared->ddrs
++#define BB_VINFO_GROUPED_LOADS(B)    (B)->grouped_loads
++#define BB_VINFO_SCALAR_STORES(B)    (B)->scalar_stores
++#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost
++#define BB_VINFO_VEC_INSIDE_COST(B)  (B)->vec_inside_cost
++#define BB_VINFO_SCALAR_COST(B)      (B)->scalar_cost
++#define BB_VINFO_SLP_TRANSPOSED(B)   (B)->transposed
++#define BB_VINFO_BEFORE_SLP(B)       (B)->before_slp
++#define BB_VINFO_TRANS_GROUPS(B)     (B)->transposed_group
+
+ /*-----------------------------------------------------------------*/
+ /* Info on vectorized defs.                                        */
+@@ -1219,6 +1271,17 @@ public:
+   stmt_vec_info next_element;
+   /* The size of the group.  */
+   unsigned int size;
++
++  /* The size of the group before being transposed.  */
++  unsigned int size_before_transpose;
++
++  /* If true, the stmt_info is SLP transposed.  */
++  bool slp_transpose;
++
++  /* Mark the group store number for rebuilding the interleaving chain
++     during the transpose phase.  The value -1 means the group cannot
++     be transposed.  */
++  int group_number;
++
+   /* For stores, number of stores from this group seen. We vectorize the last
+      one.  */
+   unsigned int store_count;
+@@ -1226,6 +1289,9 @@ public:
+      is 1.  */
+   unsigned int gap;
+
++  /* The gap before being transposed.  */
++  unsigned int gap_before_transpose;
++
+   /* The minimum negative dependence distance this stmt participates in
+      or zero if none.  */
+   unsigned int min_neg_dist;
+@@ -1427,6 +1493,12 @@ struct gather_scatter_info {
+ #define STMT_VINFO_SLP_VECT_ONLY(S)         (S)->slp_vect_only_p
+ #define STMT_VINFO_SLP_VECT_ONLY_PATTERN(S) (S)->slp_vect_pattern_only_p
+
++#define DR_GROUP_SLP_TRANSPOSE(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose)
++#define DR_GROUP_SIZE_TRANS(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose)
++#define DR_GROUP_NUMBER(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number)
+ #define DR_GROUP_FIRST_ELEMENT(S) \
+   (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
+ #define DR_GROUP_NEXT_ELEMENT(S) \
+@@ -1437,6 +1509,8 @@ struct gather_scatter_info {
+   (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count)
+ #define DR_GROUP_GAP(S) \
+   (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap)
++#define DR_GROUP_GAP_TRANS(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose)
+
+ #define REDUC_GROUP_FIRST_ELEMENT(S) \
+   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
+@@ -2033,6 +2107,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info)
+   return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr))));
+ }
+
++/* Compare two unsigned ints A and B, sorting them in ascending order.  */
++
++static inline int
++cmp_for_group_num (const void *a_, const void *b_)
++{
++  unsigned int a = *(unsigned int *) const_cast <void *> (a_);
++  unsigned int b = *(unsigned int *) const_cast <void *> (b_);
++  return a < b ? -1 : 1;
++}
++
+ /* Return true if LOOP_VINFO requires a runtime check for whether the
+    vector loop is profitable.  */
+
+@@ -2152,7 +2237,7 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
+
+ extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
+ extern void vect_finish_stmt_generation (vec_info *, stmt_vec_info, gimple *,
+-					 gimple_stmt_iterator *);
++					 gimple_stmt_iterator *, bool transpose = false);
+ extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info, bool *);
+ extern tree vect_get_store_rhs (stmt_vec_info);
+ void vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info, unsigned,
+@@ -2168,7 +2253,7 @@ void vect_get_vec_defs (vec_info *, stmt_vec_info, slp_tree, unsigned,
+ 			tree = NULL, vec<tree> * = NULL, tree = NULL,
+ 			tree = NULL, vec<tree> * = NULL, tree = NULL);
+ extern tree vect_init_vector (vec_info *, stmt_vec_info, tree, tree,
+-			      gimple_stmt_iterator *);
++			      gimple_stmt_iterator *, bool transpose = false);
+ extern tree vect_get_slp_vect_def (slp_tree, unsigned);
+ extern bool vect_transform_stmt (vec_info *, stmt_vec_info,
+ 				 gimple_stmt_iterator *,
+@@ -2235,6 +2320,9 @@ extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+ extern void vect_permute_store_chain (vec_info *, vec<tree> &,
+ 				      unsigned int, stmt_vec_info,
+ 				      gimple_stmt_iterator *, vec<tree> *);
++extern void vect_transpose_store_chain (vec_info *, vec<tree>, unsigned int,
++					unsigned int, stmt_vec_info,
++					gimple_stmt_iterator *, vec<tree> *);
+ extern tree vect_setup_realignment (vec_info *,
+ 				    stmt_vec_info, gimple_stmt_iterator *,
+ 				    tree *, enum dr_alignment_support, tree,
+@@ -2262,7 +2350,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
+ 				  enum tree_code);
+ extern bool needs_fold_left_reduction_p (tree, code_helper);
+ /* Drive for loop analysis stage.  */
+-extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *);
++extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *,
++					    bool result_only_p = false);
+ extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
+ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
+ 					 tree *, bool);
+@@ -2331,6 +2420,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec<tree> &,
+ 					  gimple_stmt_iterator *, poly_uint64,
+ 					  bool, unsigned *,
+ 					  unsigned * = nullptr, bool = false);
++extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info);
+ extern bool vect_slp_analyze_operations (vec_info *);
+ extern void vect_schedule_slp (vec_info *, const vec<slp_instance> &);
+ extern opt_result vect_analyze_slp (vec_info *, unsigned);
+--
+2.33.0
+
diff --git a/0100-Add-hip09-machine-discribtion.patch b/0100-Add-hip09-machine-discribtion.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c2221d937ecabd58db19b1e0f7782a57a83ac638
--- /dev/null
+++ b/0100-Add-hip09-machine-discribtion.patch
@@ -0,0 +1,882 @@
+From d9131757175667d35e74d9ee84689039990af768 Mon Sep 17 00:00:00 2001
+From: xingyushuai
+Date: Fri, 3 Mar 2023 09:31:04 +0800
+Subject: [PATCH 001/157] Add hip09 machine description
+
+Here is the patch introducing the hip09 machine model
+for the scheduler.
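+
+In outline, the model splits the core into three automata: "hip09" for
+the branch and integer pipes, "hip09_ldst" for the load/store pipes and
+"hip09_fsu" for the FP/ASIMD pipes, with per-type insn reservations on
+top of them.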
+--- + gcc/config/aarch64/aarch64-cores.def | 1 + + gcc/config/aarch64/aarch64-cost-tables.h | 104 +++++ + gcc/config/aarch64/aarch64-tune.md | 2 +- + gcc/config/aarch64/aarch64.cc | 109 +++++ + gcc/config/aarch64/aarch64.md | 1 + + gcc/config/aarch64/hip09.md | 558 +++++++++++++++++++++++ + 6 files changed, 774 insertions(+), 1 deletion(-) + create mode 100644 gcc/config/aarch64/hip09.md + +diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def +index 70b11eb80..a854bdb24 100644 +--- a/gcc/config/aarch64/aarch64-cores.def ++++ b/gcc/config/aarch64/aarch64-cores.def +@@ -130,6 +130,7 @@ AARCH64_CORE("a64fx", a64fx, a64fx, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F + + /* HiSilicon ('H') cores. */ + AARCH64_CORE("tsv110", tsv110, tsv110, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110, 0x48, 0xd01, -1) ++AARCH64_CORE("hip09", hip09, hip09, 8_5A, AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_I8MM | AARCH64_FL_F32MM | AARCH64_FL_F64MM | AARCH64_FL_PROFILE | AARCH64_FL_PREDRES, hip09, 0x48, 0xd02, 0x0) + + /* ARMv8.3-A Architecture Processors. */ + +diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h +index 48522606f..fc5a3cbe4 100644 +--- a/gcc/config/aarch64/aarch64-cost-tables.h ++++ b/gcc/config/aarch64/aarch64-cost-tables.h +@@ -668,6 +668,110 @@ const struct cpu_cost_table a64fx_extra_costs = + } + }; + ++const struct cpu_cost_table hip09_extra_costs = ++{ ++ /* ALU */ ++ { ++ 0, /* arith. */ ++ 0, /* logical. */ ++ 0, /* shift. */ ++ 0, /* shift_reg. */ ++ COSTS_N_INSNS (1), /* arith_shift. */ ++ COSTS_N_INSNS (1), /* arith_shift_reg. */ ++ COSTS_N_INSNS (1), /* log_shift. */ ++ COSTS_N_INSNS (1), /* log_shift_reg. */ ++ 0, /* extend. */ ++ COSTS_N_INSNS (1), /* extend_arith. */ ++ 0, /* bfi. */ ++ 0, /* bfx. */ ++ 0, /* clz. */ ++ 0, /* rev. */ ++ 0, /* non_exec. */ ++ true /* non_exec_costs_exec. */ ++ }, ++ ++ { ++ /* MULT SImode */ ++ { ++ COSTS_N_INSNS (2), /* simple. */ ++ COSTS_N_INSNS (2), /* flag_setting. */ ++ COSTS_N_INSNS (2), /* extend. */ ++ COSTS_N_INSNS (2), /* add. */ ++ COSTS_N_INSNS (2), /* extend_add. */ ++ COSTS_N_INSNS (11) /* idiv. */ ++ }, ++ /* MULT DImode */ ++ { ++ COSTS_N_INSNS (3), /* simple. */ ++ 0, /* flag_setting (N/A). */ ++ COSTS_N_INSNS (3), /* extend. */ ++ COSTS_N_INSNS (3), /* add. */ ++ COSTS_N_INSNS (3), /* extend_add. */ ++ COSTS_N_INSNS (19) /* idiv. */ ++ } ++ }, ++ /* LD/ST */ ++ { ++ COSTS_N_INSNS (3), /* load. */ ++ COSTS_N_INSNS (4), /* load_sign_extend. */ ++ COSTS_N_INSNS (3), /* ldrd. */ ++ COSTS_N_INSNS (3), /* ldm_1st. */ ++ 1, /* ldm_regs_per_insn_1st. */ ++ 2, /* ldm_regs_per_insn_subsequent. */ ++ COSTS_N_INSNS (4), /* loadf. */ ++ COSTS_N_INSNS (4), /* loadd. */ ++ COSTS_N_INSNS (4), /* load_unaligned. */ ++ 0, /* store. */ ++ 0, /* strd. */ ++ 0, /* stm_1st. */ ++ 1, /* stm_regs_per_insn_1st. */ ++ 2, /* stm_regs_per_insn_subsequent. */ ++ 0, /* storef. */ ++ 0, /* stored. */ ++ COSTS_N_INSNS (1), /* store_unaligned. */ ++ COSTS_N_INSNS (4), /* loadv. */ ++ COSTS_N_INSNS (4) /* storev. */ ++ }, ++ { ++ /* FP SFmode */ ++ { ++ COSTS_N_INSNS (10), /* div. */ ++ COSTS_N_INSNS (4), /* mult. */ ++ COSTS_N_INSNS (4), /* mult_addsub. */ ++ COSTS_N_INSNS (4), /* fma. */ ++ COSTS_N_INSNS (4), /* addsub. */ ++ COSTS_N_INSNS (1), /* fpconst. */ ++ COSTS_N_INSNS (1), /* neg. */ ++ COSTS_N_INSNS (1), /* compare. */ ++ COSTS_N_INSNS (2), /* widen. */ ++ COSTS_N_INSNS (2), /* narrow. 
*/ ++ COSTS_N_INSNS (2), /* toint. */ ++ COSTS_N_INSNS (1), /* fromint. */ ++ COSTS_N_INSNS (2) /* roundint. */ ++ }, ++ /* FP DFmode */ ++ { ++ COSTS_N_INSNS (17), /* div. */ ++ COSTS_N_INSNS (4), /* mult. */ ++ COSTS_N_INSNS (6), /* mult_addsub. */ ++ COSTS_N_INSNS (6), /* fma. */ ++ COSTS_N_INSNS (3), /* addsub. */ ++ COSTS_N_INSNS (1), /* fpconst. */ ++ COSTS_N_INSNS (1), /* neg. */ ++ COSTS_N_INSNS (1), /* compare. */ ++ COSTS_N_INSNS (2), /* widen. */ ++ COSTS_N_INSNS (2), /* narrow. */ ++ COSTS_N_INSNS (2), /* toint. */ ++ COSTS_N_INSNS (1), /* fromint. */ ++ COSTS_N_INSNS (2) /* roundint. */ ++ } ++ }, ++ /* Vector */ ++ { ++ COSTS_N_INSNS (1) /* alu. */ ++ } ++}; ++ + const struct cpu_cost_table ampere1_extra_costs = + { + /* ALU */ +diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md +index 9dc9adc70..238bb6e31 100644 +--- a/gcc/config/aarch64/aarch64-tune.md ++++ b/gcc/config/aarch64/aarch64-tune.md +@@ -1,5 +1,5 @@ + ;; -*- buffer-read-only: t -*- + ;; Generated automatically by gentune.sh from aarch64-cores.def + (define_attr "tune" +- "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,demeter,neoversev2" ++ "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,hip09,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,demeter,neoversev2" + (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 5537a537c..e9b3980c4 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -465,6 +465,22 @@ static const struct cpu_addrcost_table tsv110_addrcost_table = + 0, /* imm_offset */ + }; + ++static const struct cpu_addrcost_table hip09_addrcost_table = ++{ ++ { ++ 1, /* hi */ ++ 0, /* si */ ++ 0, /* di */ ++ 1, /* ti */ ++ }, ++ 0, /* pre_modify */ ++ 0, /* post_modify */ ++ 0, /* register_offset */ ++ 1, /* register_sextend */ ++ 1, /* register_zextend */ ++ 0, /* imm_offset */ ++}; ++ + static const struct cpu_addrcost_table qdf24xx_addrcost_table = + { + { +@@ -660,6 +676,16 @@ static const struct cpu_regmove_cost a64fx_regmove_cost = + 2 /* FP2FP */ + }; + ++static const struct cpu_regmove_cost hip09_regmove_cost = ++{ ++ 1, /* GP2GP */ ++ /* Avoid the use of slow int<->fp moves for spilling by 
setting ++ their cost higher than memmov_cost. */ ++ 2, /* GP2FP */ ++ 3, /* FP2GP */ ++ 2 /* FP2FP */ ++}; ++ + static const struct cpu_regmove_cost neoversen2_regmove_cost = + { + 1, /* GP2GP */ +@@ -947,6 +973,43 @@ static const struct cpu_vector_cost tsv110_vector_cost = + nullptr /* issue_info */ + }; + ++static const advsimd_vec_cost hip09_advsimd_vector_cost = ++{ ++ 2, /* int_stmt_cost */ ++ 2, /* fp_stmt_cost */ ++ 0, /* ld2_st2_permute_cost */ ++ 0, /* ld3_st3_permute_cost */ ++ 0, /* ld4_st4_permute_cost */ ++ 2, /* permute_cost */ ++ 3, /* reduc_i8_cost */ ++ 3, /* reduc_i16_cost */ ++ 3, /* reduc_i32_cost */ ++ 3, /* reduc_i64_cost */ ++ 3, /* reduc_f16_cost */ ++ 3, /* reduc_f32_cost */ ++ 3, /* reduc_f64_cost */ ++ 3, /* store_elt_extra_cost */ ++ 3, /* vec_to_scalar_cost */ ++ 2, /* scalar_to_vec_cost */ ++ 5, /* align_load_cost */ ++ 5, /* unalign_load_cost */ ++ 1, /* unalign_store_cost */ ++ 1 /* store_cost */ ++}; ++ ++static const struct cpu_vector_cost hip09_vector_cost = ++{ ++ 1, /* scalar_int_stmt_cost */ ++ 1, /* scalar_fp_stmt_cost */ ++ 5, /* scalar_load_cost */ ++ 1, /* scalar_store_cost */ ++ 1, /* cond_taken_branch_cost */ ++ 1, /* cond_not_taken_branch_cost */ ++ &hip09_advsimd_vector_cost, /* advsimd */ ++ nullptr, /* sve */ ++ nullptr /* issue_info */ ++}; ++ + static const advsimd_vec_cost cortexa57_advsimd_vector_cost = + { + 2, /* int_stmt_cost */ +@@ -1293,6 +1356,18 @@ static const cpu_prefetch_tune tsv110_prefetch_tune = + -1 /* default_opt_level */ + }; + ++ ++static const cpu_prefetch_tune hip09_prefetch_tune = ++{ ++ 0, /* num_slots */ ++ 64, /* l1_cache_size */ ++ 64, /* l1_cache_line_size */ ++ 512, /* l2_cache_size */ ++ true, /* prefetch_dynamic_strides */ ++ -1, /* minimum_stride */ ++ -1 /* default_opt_level */ ++}; ++ + static const cpu_prefetch_tune xgene1_prefetch_tune = + { + 8, /* num_slots */ +@@ -1658,6 +1733,40 @@ static const struct tune_params tsv110_tunings = + &tsv110_prefetch_tune + }; + ++static const struct tune_params hip09_tunings = ++{ ++ &hip09_extra_costs, ++ &hip09_addrcost_table, ++ &hip09_regmove_cost, ++ &hip09_vector_cost, ++ &generic_branch_cost, ++ &generic_approx_modes, ++ SVE_256, /* sve_width */ ++ { 4, /* load_int. */ ++ 4, /* store_int. */ ++ 4, /* load_fp. */ ++ 4, /* store_fp. */ ++ 4, /* load_pred. */ ++ 4 /* store_pred. */ ++ }, /* memmov_cost. */ ++ 4, /* issue_rate */ ++ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH ++ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ ++ "16", /* function_align. */ ++ "4", /* jump_align. */ ++ "8", /* loop_align. */ ++ 2, /* int_reassoc_width. */ ++ 4, /* fp_reassoc_width. */ ++ 1, /* vec_reassoc_width. */ ++ 2, /* min_div_recip_mul_sf. */ ++ 2, /* min_div_recip_mul_df. */ ++ 0, /* max_case_values. */ ++ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ ++ (AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS ++ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. 
*/ ++ &hip09_prefetch_tune ++}; ++ + static const struct tune_params xgene1_tunings = + { + &xgene1_extra_costs, +diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +index d24c8afcf..cf699e4c7 100644 +--- a/gcc/config/aarch64/aarch64.md ++++ b/gcc/config/aarch64/aarch64.md +@@ -477,6 +477,7 @@ + (include "thunderx2t99.md") + (include "tsv110.md") + (include "thunderx3t110.md") ++(include "hip09.md") + + ;; ------------------------------------------------------------------- + ;; Jumps and other miscellaneous insns +diff --git a/gcc/config/aarch64/hip09.md b/gcc/config/aarch64/hip09.md +new file mode 100644 +index 000000000..25428de9a +--- /dev/null ++++ b/gcc/config/aarch64/hip09.md +@@ -0,0 +1,558 @@ ++;; hip09 pipeline description ++;; Copyright (C) 2023 Free Software Foundation, Inc. ++;; ++;;Contributed by Yushuai Xing ++;; ++;; This file is part of GCC. ++;; ++;; GCC is free software; you can redistribute it and/or modify it ++;; under the terms of the GNU General Public License as published by ++;; the Free Software Foundation; either version 3, or (at your option) ++;; any later version. ++;; ++;; GCC is distributed in the hope that it will be useful, but ++;; WITHOUT ANY WARRANTY; without even the implied warranty of ++;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++;; General Public License for more details. ++;; ++;; You should have received a copy of the GNU General Public License ++;; along with GCC; see the file COPYING3. If not see ++;; . ++ ++(define_automaton "hip09") ++(define_automaton "hip09_ldst") ++(define_automaton "hip09_fsu") ++ ++(define_attr "hip09_type" ++ "hip09_neon_abs, hip09_neon_fp_arith, hip09_neon_mul, hip09_neon_mla, ++ hip09_neon_dot, hip09_neon_fp_div, hip09_neon_fp_sqrt, ++ hip09_neon_ins, hip09_neon_load1, hip09_neon_load1_lanes, ++ hip09_neon_load2and4, hip09_neon_load3_3reg, ++ hip09_neon_load4_4reg, hip09_neon_store1and2, ++ hip09_neon_store1_1reg, hip09_neon_store1_2reg, ++ hip09_neon_store1_3reg, hip09_neon_store1_4reg, ++ hip09_neon_store3and4_lane, hip09_neon_store3_3reg, ++ hip09_neon_store4_4reg, unknown" ++ (cond [ ++ (eq_attr "type" "neon_abs,neon_abs_q,neon_add,neon_add_q,\ ++ neon_neg,neon_neg_q,neon_sub,neon_sub_q,neon_add_widen,\ ++ neon_sub_widen,neon_qadd,neon_qadd_q,\ ++ neon_add_long,neon_sub_long,\ ++ neon_qabs,neon_qabs_q,neon_qneg,\ ++ neon_qneg_q,neon_qsub,neon_qsub_q,neon_compare,\ ++ neon_compare_q,neon_compare_zero,\ ++ neon_compare_zero_q,neon_logic,neon_logic_q,\ ++ neon_minmax,neon_minmax_q,neon_tst,\ ++ neon_tst_q,neon_bsl,neon_bsl_q,\ ++ neon_cls,neon_cls_q,neon_ext,\ ++ neon_ext_q,neon_rev,neon_rev_q,\ ++ neon_tbl1,neon_tbl1_q,neon_fp_abs_s,\ ++ neon_fp_abs_s_q,neon_fp_abs_d,\ ++ neon_fp_neg_s,neon_fp_neg_s_q,\ ++ neon_fp_neg_d,neon_fp_neg_d_q,\ ++ neon_shift_imm_narrow_q,neon_move,neon_move_q") ++ (const_string "hip09_neon_abs") ++ (eq_attr "type" "neon_abd,neon_abd_q,\ ++ neon_arith_acc,neon_arith_acc_q,\ ++ neon_add_halve,neon_add_halve_q,\ ++ neon_sub_halve,neon_sub_halve_q,\ ++ neon_add_halve_narrow_q,\ ++ neon_sub_halve_narrow_q,neon_reduc_add,\ ++ neon_reduc_add_q,\ ++ neon_sat_mul_b,neon_sat_mul_b_q,\ ++ neon_sat_mul_b_long,neon_mul_b,neon_mul_b_q,\ ++ neon_mul_b_long,neon_mla_b,neon_mla_b_q,\ ++ neon_mla_b_long,neon_sat_mla_b_long,\ ++ neon_sat_shift_imm,\ ++ neon_sat_shift_imm_q,neon_shift_imm_long,\ ++ neon_shift_imm,neon_shift_imm_q,neon_cnt,\ ++ neon_cnt_q,neon_fp_recpe_s,neon_fp_recpe_s_q,\ ++ neon_fp_recpe_d,neon_fp_recpe_d_q,\ ++ 
neon_fp_rsqrte_s,neon_fp_rsqrte_s_q,\ ++ neon_fp_rsqrte_d,neon_fp_rsqrte_d_q,\ ++ neon_fp_recpx_s,neon_fp_recpx_s_q,\ ++ neon_fp_recpx_d,neon_fp_recpx_d_q,\ ++ neon_tbl2,neon_tbl2_q,neon_to_gp,\ ++ neon_to_gp_q,neon_fp_abd_s,neon_fp_abd_s_q,\ ++ neon_fp_abd_d,neon_fp_abd_d_q,\ ++ neon_fp_addsub_s,neon_fp_addsub_s_q,\ ++ neon_fp_addsub_d,neon_fp_addsub_d_q,\ ++ neon_fp_compare_s,neon_fp_compare_s_q,\ ++ neon_fp_compare_d,neon_fp_compare_d_q,\ ++ neon_fp_cvt_widen_s,neon_fp_to_int_s,\ ++ neon_fp_to_int_s_q,neon_fp_to_int_d,\ ++ neon_fp_to_int_d_q,neon_fp_minmax_s,\ ++ neon_fp_minmax_s_q,neon_fp_minmax_d,\ ++ neon_fp_minmax_d_q,neon_fp_round_s,\ ++ neon_fp_round_s_q,neon_fp_cvt_narrow_d_q,\ ++ neon_fp_round_d,neon_fp_round_d_q,\ ++ neon_fp_cvt_narrow_s_q") ++ (const_string "hip09_neon_fp_arith") ++ (eq_attr "type" "neon_sat_mul_h,neon_sat_mul_h_q,\ ++ neon_sat_mul_s,neon_sat_mul_s_q,\ ++ neon_sat_mul_h_scalar,neon_sat_mul_s_scalar,\ ++ neon_sat_mul_h_scalar_q,neon_sat_mul_h_long,\ ++ neon_sat_mul_s_long,neon_sat_mul_h_scalar_long,\ ++ neon_sat_mul_s_scalar_long,neon_mul_h,neon_mul_h_q,\ ++ neon_mul_s,neon_mul_s_q,neon_mul_h_long,\ ++ neon_mul_s_long,neon_mul_h_scalar_long,\ ++ neon_mul_s_scalar_long,neon_mla_h,neon_mla_h_q,\ ++ neon_mla_s,neon_mla_h_scalar,\ ++ neon_mla_h_scalar_q,neon_mla_s_scalar,\ ++ neon_mla_h_long,\ ++ neon_mla_s_long,neon_sat_mla_h_long,\ ++ neon_sat_mla_s_long,neon_sat_mla_h_scalar_long,\ ++ neon_sat_mla_s_scalar_long,neon_mla_s_scalar_long,\ ++ neon_mla_h_scalar_long,neon_mla_s_scalar_q,\ ++ neon_shift_acc,neon_shift_acc_q,neon_shift_reg,\ ++ neon_shift_reg_q,neon_sat_shift_reg,\ ++ neon_sat_shift_reg_q,neon_sat_shift_imm_narrow_q,\ ++ neon_tbl3,neon_tbl3_q,neon_fp_reduc_add_s,\ ++ neon_fp_reduc_add_s_q,neon_fp_reduc_add_d,\ ++ neon_fp_reduc_add_d_q,neon_fp_reduc_minmax_s,\ ++ neon_fp_reduc_minmax_d,neon_fp_reduc_minmax_s_q,\ ++ neon_fp_reduc_minmax_d_q,\ ++ neon_fp_mul_s_q,\ ++ neon_fp_mul_d,neon_fp_mul_d_q,\ ++ neon_fp_mul_d_scalar_q,neon_fp_mul_s_scalar,\ ++ neon_fp_mul_s_scalar_q") ++ (const_string "hip09_neon_mul") ++ (eq_attr "type" "neon_mla_s_q,neon_reduc_minmax,\ ++ neon_reduc_minmax_q,neon_fp_recps_s,\ ++ neon_fp_recps_s_q,neon_fp_recps_d,\ ++ neon_fp_recps_d_q,neon_tbl4,neon_tbl4_q,\ ++ neon_fp_mla_s,\ ++ neon_fp_mla_d,neon_fp_mla_d_q,\ ++ neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\ ++ neon_fp_mla_d_scalar_q") ++ (const_string "hip09_neon_mla") ++ (eq_attr "type" "neon_dot,neon_dot_q") ++ (const_string "hip09_neon_dot") ++ (eq_attr "type" "neon_fp_div_s,neon_fp_div_s_q,\ ++ neon_fp_div_d,neon_fp_div_d_q") ++ (const_string "hip09_neon_fp_div") ++ (eq_attr "type" "neon_fp_sqrt_s,neon_fp_sqrt_s_q,\ ++ neon_fp_sqrt_d,neon_fp_sqrt_d_q") ++ (const_string "hip09_neon_fp_sqrt") ++ (eq_attr "type" "neon_dup,neon_dup_q,\ ++ neon_ins,neon_ins_q") ++ (const_string "hip09_neon_ins") ++ (eq_attr "type" "neon_load1_1reg,neon_load1_1reg_q,\ ++ neon_load1_2reg,neon_load1_2reg_q,\ ++ neon_load1_3reg,neon_load1_3reg_q,\ ++ neon_load1_4reg,neon_load1_4reg_q") ++ (const_string "hip09_neon_load1") ++ (eq_attr "type" "neon_load1_one_lane,\ ++ neon_load1_one_lane_q,\ ++ neon_load1_all_lanes,neon_load1_all_lanes_q") ++ (const_string "hip09_neon_load1_lanes") ++ (eq_attr "type" "neon_load2_all_lanes,\ ++ neon_load2_all_lanes_q,\ ++ neon_load2_one_lane,neon_load2_2reg,\ ++ neon_load2_2reg_q,neon_load3_one_lane,\ ++ neon_load3_all_lanes,neon_load3_all_lanes_q,\ ++ neon_load4_one_lane,neon_load4_all_lanes,\ ++ neon_load4_all_lanes_q") ++ (const_string "hip09_neon_load2and4") ++ 
(eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q") ++ (const_string "hip09_neon_load3_3reg") ++ (eq_attr "type" "neon_load4_4reg,neon_load4_4reg_q") ++ (const_string "hip09_neon_load4_4reg") ++ (eq_attr "type" "neon_store1_one_lane,\ ++ neon_store1_one_lane_q,neon_store2_one_lane,\ ++ neon_store2_one_lane_q,neon_store2_2reg,\ ++ neon_store2_2reg_q") ++ (const_string "hip09_neon_store1and2") ++ (eq_attr "type" "neon_store1_1reg,neon_store1_1reg_q") ++ (const_string "hip09_neon_store1_1reg") ++ (eq_attr "type" "neon_store1_2reg,neon_store1_2reg_q") ++ (const_string "hip09_neon_store1_2reg") ++ (eq_attr "type" "neon_store1_3reg,neon_store1_3reg_q") ++ (const_string "hip09_neon_store1_3reg") ++ (eq_attr "type" "neon_store1_4reg,neon_store1_4reg_q") ++ (const_string "hip09_neon_store1_4reg") ++ (eq_attr "type" "neon_store3_one_lane,\ ++ neon_store3_one_lane_q,neon_store4_one_lane,\ ++ neon_store4_one_lane_q") ++ (const_string "hip09_neon_store3and4_lane") ++ (eq_attr "type" "neon_store3_3reg,\ ++ neon_store3_3reg_q") ++ (const_string "hip09_neon_store3_3reg") ++ (eq_attr "type" "neon_store4_4reg,\ ++ neon_store4_4reg_q") ++ (const_string "hip09_neon_store4_4reg")] ++ (const_string "unknown"))) ++ ++; The hip09 core is modelled as issues pipeline that has ++; the following functional units. ++; 1. Two pipelines for branch micro operations: BRU1, BRU2 ++ ++(define_cpu_unit "hip09_bru0" "hip09") ++(define_cpu_unit "hip09_bru1" "hip09") ++ ++(define_reservation "hip09_bru01" "hip09_bru0|hip09_bru1") ++ ++; 2. Four pipelines for single cycle integer micro operations: ALUs1, ALUs2, ALUs3, ALUs4 ++ ++(define_cpu_unit "hip09_alus0" "hip09") ++(define_cpu_unit "hip09_alus1" "hip09") ++(define_cpu_unit "hip09_alus2" "hip09") ++(define_cpu_unit "hip09_alus3" "hip09") ++ ++(define_reservation "hip09_alus0123" "hip09_alus0|hip09_alus1|hip09_alus2|hip09_alus3") ++(define_reservation "hip09_alus01" "hip09_alus0|hip09_alus1") ++(define_reservation "hip09_alus23" "hip09_alus2|hip09_alus3") ++ ++; 3. Two pipelines for multi cycles integer micro operations: ALUm1, ALUm2 ++ ++(define_cpu_unit "hip09_alum0" "hip09") ++(define_cpu_unit "hip09_alum1" "hip09") ++ ++(define_reservation "hip09_alum01" "hip09_alum0|hip09_alum1") ++ ++; 4. Two pipelines for load micro opetations: Load1, Load2 ++ ++(define_cpu_unit "hip09_load0" "hip09_ldst") ++(define_cpu_unit "hip09_load1" "hip09_ldst") ++ ++(define_reservation "hip09_ld01" "hip09_load0|hip09_load1") ++ ++; 5. Two pipelines for store micro operations: Store1, Store2 ++ ++(define_cpu_unit "hip09_store0" "hip09_ldst") ++(define_cpu_unit "hip09_store1" "hip09_ldst") ++ ++(define_reservation "hip09_st01" "hip09_store0|hip09_store1") ++ ++; 6. Two pipelines for store data micro operations: STD0,STD1 ++ ++(define_cpu_unit "hip09_store_data0" "hip09_ldst") ++(define_cpu_unit "hip09_store_data1" "hip09_ldst") ++ ++(define_reservation "hip09_std01" "hip09_store_data0|hip09_store_data1") ++ ++; 7. Four asymmetric pipelines for Asimd and FP micro operations: FSU1, FSU2, FSU3, FSU4 ++ ++(define_cpu_unit "hip09_fsu0" "hip09_fsu") ++(define_cpu_unit "hip09_fsu1" "hip09_fsu") ++(define_cpu_unit "hip09_fsu2" "hip09_fsu") ++(define_cpu_unit "hip09_fsu3" "hip09_fsu") ++ ++(define_reservation "hip09_fsu0123" "hip09_fsu0|hip09_fsu1|hip09_fsu2|hip09_fsu3") ++(define_reservation "hip09_fsu02" "hip09_fsu0|hip09_fsu2") ++ ++ ++; 8. 
Two pipelines for sve operations but same with fsu1 and fsu3: SVE1, SVE2 ++ ++;; Simple Execution Unit: ++; ++;; Simple ALU without shift ++(define_insn_reservation "hip09_alu" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "alu_imm,logic_imm,\ ++ adc_imm,adc_reg,\ ++ alu_sreg,logic_reg,\ ++ mov_imm,mov_reg,\ ++ csel,rotate_imm,bfm,mov_imm,\ ++ clz,rbit,rev")) ++ "hip09_alus0123") ++ ++(define_insn_reservation "hip09_alus" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "alus_sreg,alus_imm,\ ++ adcs_reg,adcs_imm,\ ++ logics_imm,logics_reg,adr")) ++ "hip09_alus23") ++ ++;; ALU ops with shift and extend ++(define_insn_reservation "hip09_alu_ext_shift" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "alu_ext,alus_ext,\ ++ logics_shift_imm,logics_shift_reg,\ ++ logic_shift_reg,logic_shift_imm,\ ++ ")) ++ "hip09_alum01") ++ ++;; Multiplies instructions ++(define_insn_reservation "hip09_mult" 3 ++ (and (eq_attr "tune" "hip09") ++ (ior (eq_attr "mul32" "yes") ++ (eq_attr "widen_mul64" "yes"))) ++ "hip09_alum01") ++ ++;; Integer divide ++(define_insn_reservation "hip09_div" 10 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "udiv,sdiv")) ++ "hip09_alum0") ++ ++;; Branch execution Unit ++; ++; Branches take two issue slot. ++; No latency as there is no result ++(define_insn_reservation "hip09_branch" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "branch,call")) ++ "hip09_bru01 + hip09_alus23") ++ ++;; Load execution Unit ++; ++; Loads of up to two words. ++(define_insn_reservation "hip09_load1" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "load_4,load_8")) ++ "hip09_ld01") ++ ++; Stores of up to two words. ++(define_insn_reservation "hip09_store1" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "store_4,store_8")) ++ "hip09_st01") ++ ++;; FP data processing instructions. ++ ++(define_insn_reservation "hip09_fp_arith" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "ffariths,ffarithd,fmov,fconsts,fconstd,\ ++ f_mrc")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_cmp" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fcmps,fcmpd")) ++ "hip09_fsu0123+hip09_alus23") ++ ++(define_insn_reservation "hip09_fp_ccmp" 7 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fccmps,fccmpd")) ++ "hip09_alus01+hip09_fsu0123+hip09_alus23") ++ ++(define_insn_reservation "hip09_fp_csel" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fcsel,f_mcr")) ++ "hip09_alus01+hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_divs" 7 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fdivs")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_divd" 10 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fdivd")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_sqrts" 9 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fsqrts")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_sqrtd" 15 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fsqrtd")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_mul" 3 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fmuls,fmuld")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_add" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fadds,faddd,f_minmaxs,f_minmaxd,f_cvt,\ ++ f_rints,f_rintd")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_fp_mac" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "fmacs,fmacd")) ++ "hip09_fsu0123") ++ ++;; FP miscellaneous instructions. 
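++;; f_cvtf2i conversions also reserve an ALU pipe for the integer result,
++;; and f_cvti2f conversions reserve an ALU pipe for the integer source.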
++ ++(define_insn_reservation "hip09_fp_cvt" 5 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "f_cvtf2i")) ++ "hip09_fsu0123+hip09_alus23") ++ ++(define_insn_reservation "hip09_fp_cvt2" 5 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "f_cvti2f")) ++ "hip09_alus01+hip09_fsu0123") ++ ++;; FP Load Instructions ++ ++(define_insn_reservation "hip09_fp_load" 7 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "f_loads,f_loadd")) ++ "hip09_ld01") ++ ++(define_insn_reservation "hip09_fp_load2" 6 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "neon_ldp_q,neon_ldp")) ++ "hip09_ld01+hip09_alus01") ++ ++;; FP store instructions ++ ++(define_insn_reservation "hip09_fp_store" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "f_stores,f_stored")) ++ "hip09_st01+hip09_std01") ++ ++;; ASIMD integer instructions ++ ++(define_insn_reservation "hip09_asimd_base1" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_abs")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_base2" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_fp_arith")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_base3" 3 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_mul")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_base4" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_mla")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_base5" 5 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "neon_fp_mul_s")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_dot" 6 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_dot")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_bfmmla" 9 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "neon_fp_mla_s_q")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_fdiv" 15 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_fp_div")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_fsqrt" 25 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_fp_sqrt")) ++ "hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_pmull" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "crypto_pmull")) ++ "hip09_fsu2") ++ ++(define_insn_reservation "hip09_asimd_dup" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_ins")) ++ "hip09_alus01+hip09_fsu0123") ++ ++;; ASIMD load instructions ++ ++(define_insn_reservation "hip09_asimd_ld1_reg" 6 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_load1")) ++ "hip09_ld01") ++ ++(define_insn_reservation "hip09_asimd_ld1_lane" 7 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_load1_lanes")) ++ "hip09_ld01+hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_ld23" 8 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_load2and4")) ++"hip09_ld01+hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_ld3_mtp" 9 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_load3_3reg")) ++ "hip09_ld01+hip09_fsu0123") ++ ++(define_insn_reservation "hip09_asimd_ld4_mtp" 13 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_load4_4reg")) ++ "hip09_ld01+hip09_fsu0123") ++ ++;; ASIMD store instructions ++ ++(define_insn_reservation "hip09_asimd_st12" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store1and2")) ++ "hip09_st01+hip09_std01") ++ ++(define_insn_reservation 
"hip09_asimd_st1_1reg" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store1_1reg")) ++ "hip09_st01+hip09_std01") ++ ++(define_insn_reservation "hip09_asimd_st1_2reg" 3 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store1_2reg")) ++ "hip09_st01+hip09_std01") ++ ++(define_insn_reservation "hip09_asimd_st1_3reg" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store1_3reg")) ++ "hip09_st01+hip09_std01") ++ ++(define_insn_reservation "hip09_asimd_st1_4reg" 5 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store1_4reg")) ++ "hip09_st01+hip09_std01") ++ ++(define_insn_reservation "hip09_asimd_st34_lane" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store3and4_lane")) ++ "hip09_fsu0123+hip09_st01+hip09_std01") ++ ++(define_insn_reservation "hip09_asimd_st3_mtp" 7 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store3_3reg")) ++ "hip09_fsu0123+hip09_st01+hip09_std01") ++ ++(define_insn_reservation "hip09_asimd_st4_mtp" 10 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "hip09_type" "hip09_neon_store4_4reg")) ++ "hip09_fsu0123+hip09_st01+hip09_std01") ++ ++;; Cryptography extensions ++ ++(define_insn_reservation "hip09_asimd_aes" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "crypto_aese,crypto_aesmc")) ++ "hip09_fsu02") ++ ++(define_insn_reservation "hip09_asimd_sha3" 1 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "crypto_sha3")) ++ "hip09_fsu2") ++ ++(define_insn_reservation "hip09_asimd_sha1" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "crypto_sha1_fast,crypto_sha1_xor,\ ++ crypto_sha256_fast,crypto_sha512,\ ++ crypto_sm3")) ++ "hip09_fsu2") ++ ++(define_insn_reservation "hip09_asimd_sha1_and256" 4 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "crypto_sha1_slow,crypto_sha256_slow,\ ++ crypto_sm4")) ++ "hip09_fsu2") ++ ++;; CRC extension. ++ ++(define_insn_reservation "hip09_crc" 2 ++ (and (eq_attr "tune" "hip09") ++ (eq_attr "type" "crc")) ++ "hip09_alum01") +-- +2.33.0 + diff --git a/0101-Add-hip11-CPU-pipeline-scheduling.patch b/0101-Add-hip11-CPU-pipeline-scheduling.patch new file mode 100644 index 0000000000000000000000000000000000000000..7b89890555fa6f0ccb95e7d657c0ba87ebb745d2 --- /dev/null +++ b/0101-Add-hip11-CPU-pipeline-scheduling.patch @@ -0,0 +1,755 @@ +From 824fccdab1d3c5e87fb88b31f0eeb7abd1b35c1f Mon Sep 17 00:00:00 2001 +From: XingYuShuai <1150775134@qq.com> +Date: Mon, 26 Feb 2024 20:34:06 +0800 +Subject: [PATCH 002/157] Add hip11 CPU pipeline scheduling + +This patch adds an mcpu: hip11. It has been tested on aarch64 +and no regressions from this patch. 
+--- + gcc/config/aarch64/aarch64-cores.def | 1 + + gcc/config/aarch64/aarch64-cost-tables.h | 104 ++++++ + gcc/config/aarch64/aarch64-tune.md | 2 +- + gcc/config/aarch64/aarch64.cc | 108 ++++++ + gcc/config/aarch64/aarch64.md | 1 + + gcc/config/aarch64/hip11.md | 418 +++++++++++++++++++++++ + gcc/doc/invoke.texi | 2 +- + 7 files changed, 634 insertions(+), 2 deletions(-) + create mode 100644 gcc/config/aarch64/hip11.md + +diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def +index a854bdb24..601b72abb 100644 +--- a/gcc/config/aarch64/aarch64-cores.def ++++ b/gcc/config/aarch64/aarch64-cores.def +@@ -173,6 +173,7 @@ AARCH64_CORE("cortex-a710", cortexa710, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | + AARCH64_CORE("cortex-x2", cortexx2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd48, -1) + + AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversen2, 0x41, 0xd49, -1) ++AARCH64_CORE("hip11", hip11, hip11, 8_5A, AARCH64_FL_FOR_ARCH8_5| AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_F16, hip11, 0x48, 0xd22, -1) + + AARCH64_CORE("demeter", demeter, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversev2, 0x41, 0xd4f, -1) + AARCH64_CORE("neoverse-v2", neoversev2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversev2, 0x41, 0xd4f, -1) +diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h +index fc5a3cbe4..0ee427b61 100644 +--- a/gcc/config/aarch64/aarch64-cost-tables.h ++++ b/gcc/config/aarch64/aarch64-cost-tables.h +@@ -561,6 +561,110 @@ const struct cpu_cost_table tsv110_extra_costs = + } + }; + ++const struct cpu_cost_table hip11_extra_costs = ++{ ++ /* ALU */ ++ { ++ 0, /* arith. */ ++ 0, /* logical. */ ++ 0, /* shift. */ ++ 0, /* shift_reg. */ ++ COSTS_N_INSNS (1), /* arith_shift. */ ++ COSTS_N_INSNS (1), /* arith_shift_reg. */ ++ COSTS_N_INSNS (1), /* log_shift. */ ++ COSTS_N_INSNS (1), /* log_shift_reg. */ ++ 0, /* extend. */ ++ COSTS_N_INSNS (1), /* extend_arith. */ ++ 0, /* bfi. */ ++ 0, /* bfx. */ ++ 0, /* clz. */ ++ 0, /* rev. */ ++ 0, /* non_exec. */ ++ true /* non_exec_costs_exec. */ ++ }, ++ ++ { ++ /* MULT SImode */ ++ { ++ COSTS_N_INSNS (2), /* simple. */ ++ COSTS_N_INSNS (2), /* flag_setting. */ ++ COSTS_N_INSNS (2), /* extend. */ ++ COSTS_N_INSNS (2), /* add. */ ++ COSTS_N_INSNS (2), /* extend_add. */ ++ COSTS_N_INSNS (11) /* idiv. */ ++ }, ++ /* MULT DImode */ ++ { ++ COSTS_N_INSNS (3), /* simple. */ ++ 0, /* flag_setting (N/A). */ ++ COSTS_N_INSNS (3), /* extend. */ ++ COSTS_N_INSNS (3), /* add. */ ++ COSTS_N_INSNS (3), /* extend_add. */ ++ COSTS_N_INSNS (19) /* idiv. */ ++ } ++ }, ++ /* LD/ST */ ++ { ++ COSTS_N_INSNS (3), /* load. */ ++ COSTS_N_INSNS (4), /* load_sign_extend. */ ++ COSTS_N_INSNS (3), /* ldrd. */ ++ COSTS_N_INSNS (3), /* ldm_1st. */ ++ 1, /* ldm_regs_per_insn_1st. */ ++ 2, /* ldm_regs_per_insn_subsequent. */ ++ COSTS_N_INSNS (4), /* loadf. */ ++ COSTS_N_INSNS (4), /* loadd. */ ++ COSTS_N_INSNS (4), /* load_unaligned. */ ++ 0, /* store. */ ++ 0, /* strd. */ ++ 0, /* stm_1st. */ ++ 1, /* stm_regs_per_insn_1st. 
*/ ++ 2, /* stm_regs_per_insn_subsequent. */ ++ 0, /* storef. */ ++ 0, /* stored. */ ++ COSTS_N_INSNS (1), /* store_unaligned. */ ++ COSTS_N_INSNS (4), /* loadv. */ ++ COSTS_N_INSNS (4) /* storev. */ ++ }, ++ { ++ /* FP SFmode */ ++ { ++ COSTS_N_INSNS (10), /* div. */ ++ COSTS_N_INSNS (4), /* mult. */ ++ COSTS_N_INSNS (4), /* mult_addsub. */ ++ COSTS_N_INSNS (4), /* fma. */ ++ COSTS_N_INSNS (4), /* addsub. */ ++ COSTS_N_INSNS (1), /* fpconst. */ ++ COSTS_N_INSNS (1), /* neg. */ ++ COSTS_N_INSNS (1), /* compare. */ ++ COSTS_N_INSNS (2), /* widen. */ ++ COSTS_N_INSNS (2), /* narrow. */ ++ COSTS_N_INSNS (2), /* toint. */ ++ COSTS_N_INSNS (1), /* fromint. */ ++ COSTS_N_INSNS (2) /* roundint. */ ++ }, ++ /* FP DFmode */ ++ { ++ COSTS_N_INSNS (17), /* div. */ ++ COSTS_N_INSNS (4), /* mult. */ ++ COSTS_N_INSNS (6), /* mult_addsub. */ ++ COSTS_N_INSNS (6), /* fma. */ ++ COSTS_N_INSNS (3), /* addsub. */ ++ COSTS_N_INSNS (1), /* fpconst. */ ++ COSTS_N_INSNS (1), /* neg. */ ++ COSTS_N_INSNS (1), /* compare. */ ++ COSTS_N_INSNS (2), /* widen. */ ++ COSTS_N_INSNS (2), /* narrow. */ ++ COSTS_N_INSNS (2), /* toint. */ ++ COSTS_N_INSNS (1), /* fromint. */ ++ COSTS_N_INSNS (2) /* roundint. */ ++ } ++ }, ++ /* Vector */ ++ { ++ COSTS_N_INSNS (1) /* alu. */ ++ } ++}; ++ + const struct cpu_cost_table a64fx_extra_costs = + { + /* ALU */ +diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md +index 238bb6e31..511422081 100644 +--- a/gcc/config/aarch64/aarch64-tune.md ++++ b/gcc/config/aarch64/aarch64-tune.md +@@ -1,5 +1,5 @@ + ;; -*- buffer-read-only: t -*- + ;; Generated automatically by gentune.sh from aarch64-cores.def + (define_attr "tune" +- "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,hip09,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,demeter,neoversev2" ++ "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,hip09,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,hip11,demeter,neoversev2" + (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e9b3980c4..7c62ddb2a 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -481,6 +481,22 @@ static const struct cpu_addrcost_table hip09_addrcost_table = + 0, /* imm_offset */ + }; + ++static const struct 
cpu_addrcost_table hip11_addrcost_table = ++{ ++ { ++ 1, /* hi */ ++ 0, /* si */ ++ 0, /* di */ ++ 1, /* ti */ ++ }, ++ 0, /* pre_modify */ ++ 0, /* post_modify */ ++ 0, /* register_offset */ ++ 1, /* register_sextend */ ++ 1, /* register_zextend */ ++ 0, /* imm_offset */ ++}; ++ + static const struct cpu_addrcost_table qdf24xx_addrcost_table = + { + { +@@ -666,6 +682,16 @@ static const struct cpu_regmove_cost tsv110_regmove_cost = + 2 /* FP2FP */ + }; + ++static const struct cpu_regmove_cost hip11_regmove_cost = ++{ ++ 1, /* GP2GP */ ++ /* Avoid the use of slow int<->fp moves for spilling by setting ++ their cost higher than memmov_cost. */ ++ 2, /* GP2FP */ ++ 3, /* FP2GP */ ++ 2 /* FP2FP */ ++}; ++ + static const struct cpu_regmove_cost a64fx_regmove_cost = + { + 1, /* GP2GP */ +@@ -1010,6 +1036,43 @@ static const struct cpu_vector_cost hip09_vector_cost = + nullptr /* issue_info */ + }; + ++static const advsimd_vec_cost hip11_advsimd_vector_cost = ++{ ++ 2, /* int_stmt_cost */ ++ 2, /* fp_stmt_cost */ ++ 0, /* ld2_st2_permute_cost */ ++ 0, /* ld3_st3_permute_cost */ ++ 0, /* ld4_st4_permute_cost */ ++ 2, /* permute_cost */ ++ 3, /* reduc_i8_cost */ ++ 3, /* reduc_i16_cost */ ++ 3, /* reduc_i32_cost */ ++ 3, /* reduc_i64_cost */ ++ 3, /* reduc_f16_cost */ ++ 3, /* reduc_f32_cost */ ++ 3, /* reduc_f64_cost */ ++ 3, /* store_elt_extra_cost */ ++ 5, /* vec_to_scalar_cost */ ++ 5, /* scalar_to_vec_cost */ ++ 5, /* align_load_cost */ ++ 5, /* unalign_load_cost */ ++ 1, /* unalign_store_cost */ ++ 1 /* store_cost */ ++}; ++ ++static const struct cpu_vector_cost hip11_vector_cost = ++{ ++ 1, /* scalar_int_stmt_cost */ ++ 1, /* scalar_fp_stmt_cost */ ++ 5, /* scalar_load_cost */ ++ 1, /* scalar_store_cost */ ++ 1, /* cond_taken_branch_cost */ ++ 1, /* cond_not_taken_branch_cost */ ++ &hip11_advsimd_vector_cost, /* advsimd */ ++ nullptr, /* sve */ ++ nullptr /* issue_info */ ++}; ++ + static const advsimd_vec_cost cortexa57_advsimd_vector_cost = + { + 2, /* int_stmt_cost */ +@@ -1368,6 +1431,17 @@ static const cpu_prefetch_tune hip09_prefetch_tune = + -1 /* default_opt_level */ + }; + ++static const cpu_prefetch_tune hip11_prefetch_tune = ++{ ++ 0, /* num_slots */ ++ 64, /* l1_cache_size */ ++ 64, /* l1_cache_line_size */ ++ 512, /* l2_cache_size */ ++ true, /* prefetch_dynamic_strides */ ++ -1, /* minimum_stride */ ++ -1 /* default_opt_level */ ++}; ++ + static const cpu_prefetch_tune xgene1_prefetch_tune = + { + 8, /* num_slots */ +@@ -1767,6 +1841,40 @@ static const struct tune_params hip09_tunings = + &hip09_prefetch_tune + }; + ++static const struct tune_params hip11_tunings = ++{ ++ &hip11_extra_costs, ++ &hip11_addrcost_table, ++ &hip11_regmove_cost, ++ &hip11_vector_cost, ++ &generic_branch_cost, ++ &generic_approx_modes, ++ SVE_512, /* sve_width */ ++ { 4, /* load_int. */ ++ 4, /* store_int. */ ++ 4, /* load_fp. */ ++ 4, /* store_fp. */ ++ 4, /* load_pred. */ ++ 4 /* store_pred. */ ++ }, /* memmov_cost. */ ++ 4, /* issue_rate */ ++ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH ++ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ ++ "16", /* function_align. */ ++ "4", /* jump_align. */ ++ "8", /* loop_align. */ ++ 2, /* int_reassoc_width. */ ++ 4, /* fp_reassoc_width. */ ++ 1, /* vec_reassoc_width. */ ++ 2, /* min_div_recip_mul_sf. */ ++ 2, /* min_div_recip_mul_df. */ ++ 0, /* max_case_values. */ ++ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ ++ (AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS ++ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. 
*/
++  &hip11_prefetch_tune
++};
++
+ static const struct tune_params xgene1_tunings =
+ {
+   &xgene1_extra_costs,
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index cf699e4c7..c0c64a798 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -478,6 +478,7 @@
+ (include "tsv110.md")
+ (include "thunderx3t110.md")
+ (include "hip09.md")
++(include "hip11.md")
+
+ ;; -------------------------------------------------------------------
+ ;; Jumps and other miscellaneous insns
+diff --git a/gcc/config/aarch64/hip11.md b/gcc/config/aarch64/hip11.md
+new file mode 100644
+index 000000000..45f91e65b
+--- /dev/null
++++ b/gcc/config/aarch64/hip11.md
+@@ -0,0 +1,418 @@
++;; hip11 pipeline description
++;; Copyright (C) 2018-2024 Free Software Foundation, Inc.
++;;
++;; This file is part of GCC.
++;;
++;; GCC is free software; you can redistribute it and/or modify it
++;; under the terms of the GNU General Public License as published by
++;; the Free Software Foundation; either version 3, or (at your option)
++;; any later version.
++;;
++;; GCC is distributed in the hope that it will be useful, but
++;; WITHOUT ANY WARRANTY; without even the implied warranty of
++;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++;; General Public License for more details.
++;;
++;; You should have received a copy of the GNU General Public License
++;; along with GCC; see the file COPYING3.  If not see
++;; <http://www.gnu.org/licenses/>.
++
++(define_automaton "hip11")
++
++;; The hip11 core is modelled as an issue pipeline that has
++;; the following functional units.
++;; 1. Three pipelines for integer operations: ALU1, ALU2, ALU3
++
++(define_cpu_unit "hip11_alu1_issue" "hip11")
++(define_reservation "hip11_alu1" "hip11_alu1_issue")
++
++(define_cpu_unit "hip11_alu2_issue" "hip11")
++(define_reservation "hip11_alu2" "hip11_alu2_issue")
++
++(define_cpu_unit "hip11_alu3_issue" "hip11")
++(define_reservation "hip11_alu3" "hip11_alu3_issue")
++
++(define_reservation "hip11alu" "hip11_alu1|hip11_alu2|hip11_alu3")
++
++;; 2. One pipeline for complex integer operations: MDU
++
++(define_cpu_unit "hip11_mdu_issue" "hip11")
++(define_reservation "hip11_mdu" "hip11_mdu_issue")
++
++;; 3. Two asymmetric pipelines for Asimd and FP operations: FSU1, FSU2
++(define_automaton "hip11_fsu")
++
++(define_cpu_unit "hip11_fsu1_issue"
++		 "hip11_fsu")
++(define_cpu_unit "hip11_fsu2_issue"
++		 "hip11_fsu")
++
++(define_reservation "hip11_fsu1" "hip11_fsu1_issue")
++(define_reservation "hip11_fsu2" "hip11_fsu2_issue")
++(define_reservation "hip11_fsu_pipe" "hip11_fsu1|hip11_fsu2")
++
++;; 4. Two pipelines for branch operations, shared with ALU2 and ALU3:
++;;    BRU1, BRU2
++
++;; 5. Two pipelines for load and store operations: LS1, LS2.
++
++(define_cpu_unit "hip11_ls1_issue" "hip11")
++(define_cpu_unit "hip11_ls2_issue" "hip11")
++(define_reservation "hip11_ls1" "hip11_ls1_issue")
++(define_reservation "hip11_ls2" "hip11_ls2_issue")
++
++;; Block all issue queues.
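++;; The "hip11_block" reservation defined below is used by "block"-type
++;; insns, which occupy every issue queue for one cycle.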
++ ++(define_reservation "hip11_block" "hip11_fsu1_issue + hip11_fsu2_issue ++ + hip11_mdu_issue + hip11_alu1_issue ++ + hip11_alu2_issue + hip11_alu3_issue + hip11_ls1_issue + hip11_ls2_issue") ++ ++;; Branch execution Unit ++;; ++(define_insn_reservation "hip11_branch" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "branch")) ++ "hip11_alu2|hip11_alu3") ++ ++(define_insn_reservation "hip11_return_from_subroutine" 6 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "branch") ++ (eq_attr "sls_length" "retbr")) ++ "hip11_mdu,(hip11_alu2|hip11_alu3)") ++ ++ ;; Simple Execution Unit: ++;; ++;; Simple ALU without shift ++(define_insn_reservation "hip11_alu" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "alu_imm,logic_imm,\ ++ alu_sreg,logic_reg,\ ++ adc_imm,adc_reg,\ ++ adr,bfm,clz,rbit,rev,\ ++ shift_imm,shift_reg,\ ++ mov_imm,mov_reg,\ ++ mvn_imm,mvn_reg,\ ++ mrs,multiple,csel,\ ++ rotate_imm")) ++ "hip11_alu1|hip11_alu2|hip11_alu3") ++ ++(define_insn_reservation "hip11_alus" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "alus_imm,logics_imm,\ ++ alus_sreg,logics_reg,\ ++ adcs_imm,adcs_reg")) ++ "hip11_alu2|hip11_alu3") ++ ++;; ALU ops with shift ++(define_insn_reservation "hip11_alu_shift" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "extend,\ ++ alu_shift_imm_lsl_1to4,alu_shift_imm_other,alu_shift_reg,\ ++ crc,logic_shift_imm,logic_shift_reg,\ ++ mov_shift,mvn_shift,\ ++ mov_shift_reg,mvn_shift_reg")) ++ "hip11_mdu") ++ ++(define_insn_reservation "hip11_alus_shift" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "alus_shift_imm,alus_shift_reg,\ ++ logics_shift_imm,logics_shift_reg")) ++ "hip11_alu2|hip11_alu3") ++ ++;; Multiplies instructions ++(define_insn_reservation "hip11_mult" 3 ++ (and (eq_attr "tune" "hip11") ++ (ior (eq_attr "mul32" "yes") ++ (eq_attr "widen_mul64" "yes"))) ++ "hip11_mdu") ++ ++;; Integer divide ++(define_insn_reservation "hip11_div" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "udiv,sdiv")) ++ "hip11_mdu") ++ ++(define_insn_reservation "hip11_mla" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "mla,smlal,umlal,smull,umull")) ++ "hip11_mdu") ++ ++;; Block all issue pipes for a cycle ++(define_insn_reservation "hip11_block" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "block")) ++ "hip11_block") ++ ++;; Load-store execution Unit ++;; ++(define_insn_reservation "hip11_load1" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "load_4,load_8,load_16")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_fp_load" 5 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "f_loads,f_loadd")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_single" 7 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q,\ ++ neon_load1_all_lanes,neon_load1_all_lanes_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld1_1reg" 5 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_1reg,neon_load1_1reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_2reg" 6 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_2reg,neon_load1_2reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_3reg" 7 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_3reg,neon_load1_3reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_4reg" 8 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_4reg,neon_load1_4reg_q")) 
++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld2" 8 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load2_one_lane,neon_load2_one_lane_q,\ ++ neon_load2_all_lanes,neon_load2_all_lanes_q,\ ++ neon_load2_2reg,neon_load2_2reg_q,\ ++ neon_load2_4reg,neon_load2_4reg_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld3_single" 9 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load3_one_lane,neon_load3_one_lane_q,\ ++ neon_load3_all_lanes,neon_load3_all_lanes_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld3_multiple" 13 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld4_single" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load4_one_lane,neon_load4_one_lane_q,\ ++ neon_load4_all_lanes,neon_load4_all_lanes_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld4_multiple" 11 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load4_4reg,neon_load4_4reg_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++;; Stores of up to two words. ++(define_insn_reservation "hip11_store1" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "store_4,store_8,store_16,\ ++ f_stored,f_stores")) ++ "hip11_ls1|hip11_ls2") ++ ++;; Floating-Point Operations. ++(define_insn_reservation "hip11_fp_arith" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "ffariths,ffarithd,f_minmaxs,\ ++ f_minmaxd,fadds,faddd,neon_fcadd")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_mul" 3 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_mul_d,neon_fp_mul_d_q,\ ++ neon_fp_mul_s_scalar,neon_fp_mul_s_scalar_q,\ ++ neon_fp_mul_d_scalar_q,fmuld,fmuls")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_cmp" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fccmpd,fccmps")) ++ "hip11alu,hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_csel" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fcsel")) ++ "hip11alu,hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_fcmp" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fcmpd,fcmps")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_divs" 7 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fdivs")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_divd" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fdivd")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_sqrts" 9 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fsqrts")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_sqrtd" 15 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fsqrtd")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_mac" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fmacs,ffmas,fmacd,ffmad")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_mov" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fmov,neon_dup,neon_dup_q,\ ++ neon_from_gp,neon_from_gp_q,\ ++ neon_ins,neon_ins_q,\ ++ neon_to_gp,neon_to_gp_q,\ ++ neon_move,neon_move_q,\ ++ neon_rev,neon_rev_q,\ ++ neon_permute,neon_permute_q,\ ++ neon_shift_imm_narrow_q,\ ++ neon_ext,neon_ext_q,\ ++ neon_rbit,\ ++ crypto_sha3,neon_tbl1,neon_tbl1_q,\ ++ neon_tbl2_q,f_mcr,neon_tst,neon_tst_q,\ ++ neon_move_narrow_q")) ++ "hip11_fsu1") ++ ++;; ASIMD instructions ++(define_insn_reservation "hip11_asimd_simple_arithmetic" 2 ++ (and (eq_attr 
"tune" "hip11") ++ (eq_attr "type" "neon_abs,neon_abs_q,neon_neg,neon_neg_q,\ ++ neon_abd,neon_abd_q,\ ++ neon_add_long,neon_sub_long,neon_sub_widen,neon_add_widen,\ ++ neon_add_halve_narrow_q,neon_sub_halve_narrow_q,\ ++ neon_arith_acc,neon_arith_acc_q,\ ++ neon_compare,neon_compare_q,\ ++ neon_compare_zero,neon_compare_zero_q,\ ++ neon_minmax,neon_minmax_q,\ ++ neon_logic,neon_logic_q,\ ++ neon_reduc_add,neon_reduc_add_q,\ ++ neon_reduc_minmax,neon_reduc_minmax_q,\ ++ neon_fp_to_int_s,neon_fp_to_int_s_q,\ ++ neon_fp_to_int_d,neon_fp_to_int_d_q,\ ++ neon_fp_cvt_widen_s,\ ++ neon_fp_cvt_narrow_d_q,\ ++ neon_cls,neon_cls_q,\ ++ neon_cnt,neon_cnt_q,\ ++ f_rints,f_rintd,f_cvtf2i,f_cvt,\ ++ neon_tbl3,neon_fp_round_s,neon_fp_round_s_q,\ ++ neon_fp_round_d,neon_fp_round_d_q,\ ++ neon_int_to_fp_s,neon_fp_recpe_s,neon_fp_recpe_s_q,\ ++ neon_fp_recpe_d,neon_fp_recpe_d_q,\ ++ neon_fp_cvt_narrow_s_q,\ ++ crypto_aese,crypto_aesmc,\ ++ crypto_sha1_fast,crypto_sha1_xor,\ ++ crypto_sha1_slow,\ ++ crypto_sha256_fast,\ ++ crypto_sha512,crypto_sm3,\ ++ neon_qabs,neon_qabs_q,\ ++ neon_qneg,neon_qneg_q,\ ++ neon_qadd,neon_qadd_q,\ ++ neon_qsub,neon_qsub_q,\ ++ neon_add_halve,neon_add_halve_q,\ ++ neon_sub_halve,neon_sub_halve_q,\ ++ neon_fp_reduc_minmax_s,neon_fp_reduc_minmax_s_q,\ ++ neon_fp_reduc_minmax_d,neon_fp_reduc_minmax_d_q,\ ++ neon_fp_rsqrte_s,neon_fp_rsqrte_s_q,\ ++ neon_fp_rsqrte_d,neon_fp_rsqrte_d_q")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_complex_arithmetic" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_mul_b,neon_mul_b_q,\ ++ neon_mul_h,neon_mul_h_q,\ ++ neon_mul_s,neon_mul_s_q,\ ++ neon_mla_b,neon_mla_b_q,\ ++ neon_mla_h,neon_mla_h_q,\ ++ neon_mla_s,\ ++ neon_mla_h_scalar,neon_mla_h_scalar_q,\ ++ neon_mla_s_scalar,neon_mla_s_scalar_q,\ ++ neon_sat_mul_h_scalar,neon_sat_mul_h_scalar_q,\ ++ neon_sat_mul_s_scalar,neon_sat_mul_s_scalar_q,\ ++ neon_sat_mul_b,neon_sat_mul_b_q,\ ++ neon_sat_mul_h,neon_sat_mul_h_q,\ ++ neon_sat_mul_s,neon_sat_mul_s_q,\ ++ neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,\ ++ neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,\ ++ neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,\ ++ neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long,\ ++ neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,\ ++ neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long,\ ++ crypto_pmull,\ ++ neon_sat_shift_reg,neon_sat_shift_reg_q,\ ++ neon_shift_reg,neon_shift_reg_q,\ ++ neon_shift_imm,neon_shift_imm_q,\ ++ neon_shift_imm_long,\ ++ neon_sat_shift_imm,neon_sat_shift_imm_q,\ ++ neon_sat_shift_imm_narrow_q,\ ++ neon_shift_acc,neon_shift_acc_q,\ ++ crypto_sha256_slow")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_fp_compare" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_abs_s,neon_fp_abs_s_q,\ ++ neon_fp_abs_d,neon_fp_abs_d_q,\ ++ neon_fp_neg_s,neon_fp_neg_s_q,\ ++ neon_fp_neg_d,neon_fp_neg_d_q,\ ++ neon_fp_compare_s,neon_fp_compare_s_q,\ ++ neon_fp_compare_d,neon_fp_compare_d_q,\ ++ neon_fp_minmax_s,neon_fp_minmax_s_q,\ ++ neon_fp_minmax_d,neon_fp_minmax_d_q,\ ++ neon_fp_addsub_s,neon_fp_addsub_s_q,\ ++ neon_fp_addsub_d,neon_fp_addsub_d_q,\ ++ neon_fp_reduc_add_s,neon_fp_reduc_add_s_q,\ ++ neon_fp_reduc_add_d,neon_fp_reduc_add_d_q,\ ++ neon_fp_abd_s,neon_fp_abd_s_q,\ ++ neon_fp_abd_d,neon_fp_abd_d_q")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_asimd_fdiv" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_div_s,neon_fp_div_s_q,\ ++ neon_fp_div_d,neon_fp_div_d_q")) ++ 
"hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_fsqrt" 15 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_sqrt_s,neon_fp_sqrt_s_q,\ ++ neon_fp_sqrt_d,neon_fp_sqrt_d_q")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_fp_multiply_add" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_mla_s,neon_fp_mla_s_q,\ ++ neon_fp_mla_d,neon_fp_mla_d_q,\ ++ neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\ ++ neon_fp_mul_s,neon_fp_mul_s_q,neon_fcmla,\ ++ neon_fp_recps_s,neon_fp_recps_s_q,\ ++ neon_fp_recps_d,neon_fp_recps_d_q,\ ++ neon_fp_rsqrts_s,neon_fp_rsqrts_s_q,\ ++ neon_fp_rsqrts_d,neon_fp_rsqrts_d_q")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_asimd_frecpx" 3 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_recpx_s,neon_fp_recpx_s_q,\ ++ neon_fp_recpx_d,neon_fp_recpx_d_q,neon_tbl4,\ ++ neon_dot,neon_dot_q")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_mmla" 6 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_mla_s_q")) ++ "hip11_fsu1") +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 7ca60dd64..17d9e4126 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -19212,7 +19212,7 @@ performance of the code. Permissible values for this option are: + @samp{octeontx2}, @samp{octeontx2t98}, @samp{octeontx2t96} + @samp{octeontx2t93}, @samp{octeontx2f95}, @samp{octeontx2f95n}, + @samp{octeontx2f95mm}, +-@samp{a64fx}, ++@samp{a64fx},@samp{hip11} + @samp{thunderx}, @samp{thunderxt88}, + @samp{thunderxt88p1}, @samp{thunderxt81}, @samp{tsv110}, + @samp{thunderxt83}, @samp{thunderx2t99}, @samp{thunderx3t110}, @samp{zeus}, +-- +2.33.0 + diff --git a/0102-Add-Crc32-Optimization-in-Gzip-For-crc32-algorithm-i.patch b/0102-Add-Crc32-Optimization-in-Gzip-For-crc32-algorithm-i.patch new file mode 100644 index 0000000000000000000000000000000000000000..6fb0cef7ac11ea7483c88f6e86c2936dd0dac366 --- /dev/null +++ b/0102-Add-Crc32-Optimization-in-Gzip-For-crc32-algorithm-i.patch @@ -0,0 +1,2164 @@ +From 8fa9788ac64a9ea5dc92c61c8f2ec11075cd17ec Mon Sep 17 00:00:00 2001 +From: XingYushuai +Date: Thu, 15 Dec 2022 14:34:16 +0800 +Subject: [PATCH 003/157] Add Crc32 Optimization in Gzip For crc32 algorithm in + APBC int_gzip. + +Match crc32 lookup table algorithm. 
An example of a crc32 lookup-table loop, e.g.:
+```c
+do {
+  c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8);
+} while (--n);
+```
+
+Usage: `gcc -O3 -march=armv8.1-a -floop-crc yourfile.c`
+Note: the CPU must support the CRC32 instructions.
+---
+ gcc/Makefile.in | 1 +
+ gcc/common.opt | 4 +
+ gcc/config/aarch64/aarch64-builtins.cc | 30 +
+ gcc/config/aarch64/aarch64-protos.h | 1 +
+ gcc/config/aarch64/aarch64.cc | 12 +
+ gcc/doc/invoke.texi | 6 +-
+ gcc/doc/tm.texi | 9 +
+ gcc/doc/tm.texi.in | 2 +
+ gcc/match.pd | 23 +
+ gcc/passes.def | 1 +
+ gcc/target.def | 14 +
+ .../tree-ssa/loop-crc-loop-condition-fail.c | 85 ++
+ .../tree-ssa/loop-crc-loop-form-fail-2.c | 90 ++
+ .../gcc.dg/tree-ssa/loop-crc-loop-form-fail.c | 112 ++
+ .../gcc.dg/tree-ssa/loop-crc-sucess.c | 83 +
+ .../tree-ssa/loop-crc-table-check-fail.c | 114 ++
+ gcc/timevar.def | 1 +
+ gcc/tree-pass.h | 1 +
+ gcc/tree-ssa-loop-crc.cc | 1333 +++++++++++++++++
+ 19 files changed, 1921 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c
+ create mode 100644 gcc/tree-ssa-loop-crc.cc
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 5cd838270..2b9f025dc 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1649,6 +1649,7 @@ OBJS = \
+ tree-ssa-ifcombine.o \
+ tree-ssa-live.o \
+ tree-ssa-loop-ch.o \
++ tree-ssa-loop-crc.o \
+ tree-ssa-loop-im.o \
+ tree-ssa-loop-ivcanon.o \
+ tree-ssa-loop-ivopts.o \
+diff --git a/gcc/common.opt b/gcc/common.opt
+index b18f0b944..42fb2fc19 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1119,6 +1119,10 @@ fcrypto-accel-aes
+ Common Var(flag_crypto_accel_aes) Init(0) Optimization
+ Perform crypto acceleration AES pattern matching.
+
++floop-crc
++Common Var(flag_loop_crc) Optimization
++Convert matched CRC loops into hardware CRC32 instructions.
++
+ fauto-inc-dec
+ Common Var(flag_auto_inc_dec) Init(1) Optimization
+ Generate auto-inc/dec instructions.
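For context, the transformation this commit message describes replaces the byte-at-a-time table lookup with the AArch64 ACLE CRC32 intrinsics from `<arm_acle.h>`. Below is a minimal self-contained sketch of the before/after shapes, as an illustration only, not the pass's generated code: the function names are invented here, and loading through `__builtin_memcpy` is an assumption for alignment safety (the converted form shown later in this patch casts the pointer directly).

```c
#include <arm_acle.h>   /* __crc32b/__crc32h/__crc32w; needs a CRC-capable -march.  */
#include <stdint.h>

extern const uint32_t crc_32_tab[256];  /* the standard CRC-32 table, as in gzip */

/* Byte-at-a-time loop of the shape -floop-crc matches.  */
uint32_t crc_bytewise (uint32_t c, const unsigned char *s, unsigned n)
{
  if (n)
    do
      c = crc_32_tab[((int) c ^ (*s++)) & 0xff] ^ (c >> 8);
    while (--n);
  return c;
}

/* Word-at-a-time equivalent of what the conversion produces:
   4 bytes per __crc32w, then the 2- and 1-byte tails.  */
uint32_t crc_hw (uint32_t c, const unsigned char *s, unsigned n)
{
  for (unsigned nn = n / 4; nn; --nn, s += 4)
    {
      uint32_t w;
      __builtin_memcpy (&w, s, 4);  /* byte copy avoids unaligned loads */
      c = __crc32w (c, w);
    }
  if (n & 2)
    {
      uint16_t h;
      __builtin_memcpy (&h, s, 2);
      c = __crc32h (c, h);
      s += 2;
    }
  if (n & 1)
    c = __crc32b (c, *s);
  return c;
}
```

Compiling with `-march=armv8.1-a` (or any `-march` that includes the `crc` extension) makes these intrinsics available, which is why the usage line above requires it.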
+diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
+index 42276e7ca..3b952ef39 100644
+--- a/gcc/config/aarch64/aarch64-builtins.cc
++++ b/gcc/config/aarch64/aarch64-builtins.cc
+@@ -551,6 +551,12 @@ typedef struct
+ #define VAR1(T, N, MAP, FLAG, A) \
+ AARCH64_SIMD_BUILTIN_##T##_##N##A,
+
++enum aarch64_crc_builtins {
++  AARCH64_BUILTIN_CRC32B,
++  AARCH64_BUILTIN_CRC32H,
++  AARCH64_BUILTIN_CRC32W,
++};
++
+ enum aarch64_builtins
+ {
+ AARCH64_BUILTIN_MIN,
+@@ -1812,6 +1818,30 @@ aarch64_general_builtin_decl (unsigned code, bool)
+ return aarch64_builtin_decls[code];
+ }
+
++/* Implement TARGET_GET_CRC_BUILTIN_CODE.  */
++unsigned
++get_crc_builtin_code (unsigned code, bool)
++{
++  if (code > AARCH64_BUILTIN_CRC32W)
++    return AARCH64_BUILTIN_MIN;
++
++  unsigned res = AARCH64_BUILTIN_MIN;
++  switch (code) {
++    case AARCH64_BUILTIN_CRC32B:
++      res = AARCH64_BUILTIN_crc32b;
++      break;
++    case AARCH64_BUILTIN_CRC32H:
++      res = AARCH64_BUILTIN_crc32h;
++      break;
++    case AARCH64_BUILTIN_CRC32W:
++      res = AARCH64_BUILTIN_crc32w;
++      break;
++    default:
++      break;
++  }
++  return res;
++}
++
+ typedef enum
+ {
+ SIMD_ARG_COPY_TO_REG,
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index 475d174dd..853197ee9 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -994,6 +994,7 @@ gimple *aarch64_general_gimple_fold_builtin (unsigned int, gcall *,
+ gimple_stmt_iterator *);
+ rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, int);
+ tree aarch64_general_builtin_decl (unsigned, bool);
++unsigned get_crc_builtin_code (unsigned, bool);
+ tree aarch64_general_builtin_rsqrt (unsigned int);
+ tree aarch64_builtin_vectorized_function (unsigned int, tree, tree);
+ void handle_arm_acle_h (void);
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 5537a537c..280e0b618 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -15210,6 +15210,15 @@ aarch64_builtin_decl (unsigned int code, bool initialize_p)
+ gcc_unreachable ();
+ }
+
++/* Implement TARGET_GET_CRC_BUILTIN_CODE.  */
++static unsigned
++aarch64_get_crc_builtin_code (unsigned code, bool initialize_p)
++{
++  unsigned subcode = get_crc_builtin_code (code, initialize_p);
++  unsigned res = subcode << AARCH64_BUILTIN_SHIFT;
++  return res;
++}
++
+ /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
+ to optimize 1.0/sqrt. */
+
+@@ -27677,6 +27686,9 @@ aarch64_get_v16qi_mode ()
+ #undef TARGET_BUILTIN_DECL
+ #define TARGET_BUILTIN_DECL aarch64_builtin_decl
+
++#undef TARGET_GET_CRC_BUILTIN_CODE
++#define TARGET_GET_CRC_BUILTIN_CODE aarch64_get_crc_builtin_code
++
+ #undef TARGET_BUILTIN_RECIPROCAL
+ #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
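The backend changes above only translate a generic CRC builtin index into aarch64's real builtin function code (packed with `AARCH64_BUILTIN_SHIFT` into the general-builtin encoding). A hedged sketch of how target-independent code, such as the new pass, might consume the two hooks together; `crc_arg`, `data_arg`, `new_crc` and `gsi` are hypothetical placeholders, not names from this patch:

```c
/* Sketch (assumption, not patch code): resolve the generic index to the
   backend builtin decl and emit  new_crc = __crc32w (crc_arg, data_arg)
   before the statement at GSI.  */
unsigned fcode = targetm.get_crc_builtin_code (AARCH64_BUILTIN_CRC32W,
					       /*initialize_p=*/true);
tree fndecl = targetm.builtin_decl (fcode, /*initialize_p=*/true);
gcall *call = gimple_build_call (fndecl, 2, crc_arg, data_arg);
gimple_call_set_lhs (call, new_crc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
```

Splitting the lookup this way keeps the pass free of a hard dependency on aarch64 internals: the pass only knows the small generic enum, and the backend decides which real builtin (if any) it maps to.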
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 7ca60dd64..c3ce148b0 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -537,7 +537,7 @@ Objective-C and Objective-C++ Dialects}.
+ -fisolate-erroneous-paths-dereference -fisolate-erroneous-paths-attribute @gol
+ -fivopts -fkeep-inline-functions -fkeep-static-functions @gol
+ -fkeep-static-consts -flimit-function-alignment -flive-range-shrinkage @gol
+--floop-block -floop-interchange -floop-strip-mine @gol
++-floop-block -floop-crc -floop-interchange -floop-strip-mine @gol
+ -floop-unroll-and-jam -floop-nest-optimize @gol
+ -floop-parallelize-all -flra-remat -flto -flto-compression-level @gol
+ -flto-partition=@var{alg} -fmerge-all-constants @gol
+@@ -12159,6 +12159,10 @@ GIMPLE -> GRAPHITE -> GIMPLE transformation. Some minimal optimizations
+ are also performed by the code generator isl, like index splitting and
+ dead code elimination in loops.
+
++@item -floop-crc
++@opindex floop-crc
++Convert matched CRC lookup-table loops into hardware CRC32 instructions.
++
+ @item -floop-nest-optimize
+ @opindex floop-nest-optimize
+ Enable the isl based loop nest optimizer. This is a generic loop nest
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index 851d31c18..5a1e0fe43 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -11658,6 +11658,15 @@ If @var{code} is out of range the function should return
+ @code{error_mark_node}.
+ @end deftypefn
+
++@deftypefn {Target Hook} unsigned TARGET_GET_CRC_BUILTIN_CODE (unsigned @var{code}, bool @var{initialize_p})
++Define this hook to map a generic CRC32 builtin index to the target's
++builtin function code.  It should return the target-specific code for
++the CRC32 builtin identified by @var{code}, creating the builtin first
++if @var{initialize_p} is true.  If @var{code} does not identify a CRC32
++builtin, the function should return a code that corresponds to no
++builtin, so that the caller can detect the failure.
++@end deftypefn
++
+ @deftypefn {Target Hook} rtx TARGET_EXPAND_BUILTIN (tree @var{exp}, rtx @var{target}, rtx @var{subtarget}, machine_mode @var{mode}, int @var{ignore})
+
+ Expand a call to a machine specific built-in function that was set up by
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index ac95cdf7a..6ff0eff66 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -7704,6 +7704,8 @@ to by @var{ce_info}.
+
+ @hook TARGET_BUILTIN_DECL
+
++@hook TARGET_GET_CRC_BUILTIN_CODE
++
+ @hook TARGET_EXPAND_BUILTIN
+
+ @hook TARGET_RESOLVE_OVERLOADED_BUILTIN
+diff --git a/gcc/match.pd b/gcc/match.pd
+index aee58e47b..1f42090a2 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -4409,6 +4409,29 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
+ )
+ #endif
+
++#if GIMPLE
++/* Try to match
++   _4 = (int) _3;    NOP_EXPR (SSA_NAME @2)
++   _5 = _4 ^ c_10;   BIT_XOR_EXPR (SSA_NAME@1, SSA_NAME)
++   _6 = _5 & 255;    BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3)
++*/
++(match (crc_match_index @1 @2 @3)
++ (bit_and (bit_xor (nop SSA_NAME@2) SSA_NAME@1) INTEGER_CST@3)
++ (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi (@3) == 255))
++)
++#endif
++
++#if GIMPLE
++/* Try to match
++   _8 = c_12 >> 8;   RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2)
++   c_19 = _7 ^ _8;   BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME)
++*/
++(match (crc_match_res @1 @2 @3)
++ (bit_xor SSA_NAME@3 (rshift SSA_NAME@1 INTEGER_CST@2))
++ (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi (@2) == 8))
++)
++#endif
++
+ /* Simplification moved from fold_cond_expr_with_comparison. It may also
+ be extended. */
+ /* This pattern implements two kinds simplification:
+diff --git a/gcc/passes.def b/gcc/passes.def
+index cdc600298..89d6889e5 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3. If not see
+ NEXT_PASS (pass_cd_dce, false /* update_address_taken_p */);
+ NEXT_PASS (pass_phiopt, true /* early_p */);
+ NEXT_PASS (pass_array_widen_compare);
++ NEXT_PASS (pass_loop_crc);
+ NEXT_PASS (pass_tail_recursion);
+ NEXT_PASS (pass_if_to_switch);
+ NEXT_PASS (pass_convert_switch);
+diff --git a/gcc/target.def b/gcc/target.def
+index c9bb2b4c2..8abf49f0a 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2413,6 +2413,20 @@ If @var{code} is out of range the function should return\n\
+ @code{error_mark_node}.",
+ tree, (unsigned code, bool initialize_p), NULL)
+
++/* Initialize (if INITIALIZE_P is true) and return the real code of
++   the target-specific CRC32 built-in function for the generic index
++   CODE.  Return a code that corresponds to no built-in if CODE is
++   outside of the range of valid crc32 codes.  */
++DEFHOOK
++(get_crc_builtin_code,
++ "Define this hook to map a generic CRC32 builtin index to the target's\n\
++builtin function code.  It should return the target-specific code for\n\
++the CRC32 builtin identified by @var{code}, creating the builtin first\n\
++if @var{initialize_p} is true.  If @var{code} does not identify a CRC32\n\
++builtin, the function should return a code that corresponds to no\n\
++builtin, so that the caller can detect the failure.",
++ unsigned, (unsigned code, bool initialize_p), NULL)
++
+ /* Expand a target-specific builtin. */
+ DEFHOOK
+ (expand_builtin,
+diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c
+new file mode 100644
+index 000000000..3620e92f7
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c
+@@ -0,0 +1,85 @@
++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */
++/* { dg-options "-O3 -march=armv8.1-a -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */
++
++#include
++#include
++typedef unsigned long ulg;
++typedef unsigned char uch;
++
++static const ulg crc_32_tab[] = {
++  0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
++  0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
++  0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
++  0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
++  0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
++  0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
++  0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
++  0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
++  0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
++  0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
++  0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
++  0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
++  0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
++  0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
++  0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
++  0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
++  0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
++  0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
++  0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
++  0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
++  0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
++  0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
++  0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
++  0xc90c2086L, 0x5768b525L, 0x206f85b3L,
0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++ulg updcrc (s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n || c != 0) ; ++ } ++ crc = c; ++exit1: ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 
1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +new file mode 100644 +index 000000000..fac759c67 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +@@ -0,0 +1,90 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -march=armv8.1-a -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 
0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++int test[5] = {0}; ++ ++ulg updcrc (s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n) ; ++ } ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ test[c%5] = c; ++ } while (--n) ; ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 1 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "Wrong crc table for crc matching." 1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +new file mode 100644 +index 000000000..ba9e5bb95 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +@@ -0,0 +1,112 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -march=armv8.1-a -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 
0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
++  0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
++  0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
++  0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
++  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
++  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
++  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
++  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
++  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
++  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
++  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
++  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
++  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
++  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
++  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
++  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
++  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
++  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
++  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
++  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
++  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
++  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
++  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
++  0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
++  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
++  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
++  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
++  0x2d02ef8dL
++};
++
++/* check when the loop has an inner loop, should fail. */
++ulg updcrc (s, n)
++  uch *s;      /* pointer to bytes to pump through */
++  unsigned n;  /* number of bytes in s[] */
++{
++  register ulg c;  /* temporary variable */
++
++  static ulg crc = (ulg)0xffffffffL;  /* shift register contents */
++
++  if (s == NULL) {
++    c = 0xffffffffL;
++  } else {
++    c = crc;
++    if (n)
++      do {
++        c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++        for (int i = 0; i < 5; i++) {
++          c++;
++        }
++
++      } while (--n);
++  }
++  crc = c;
++  return c ^ 0xffffffffL;  /* (instead of ~c for 64-bit machines) */
++}
++
++/* check when the loop has a second backedge, should fail. */
++ulg updcrc1(s, n)
++  uch *s;      /* pointer to bytes to pump through */
++  unsigned n;  /* number of bytes in s[] */
++{
++  register ulg c;  /* temporary variable */
++
++  static ulg crc = (ulg)0xffffffffL;  /* shift register contents */
++
++  if (s == NULL) {
++    c = 0xffffffffL;
++  } else {
++    c = crc;
++    if (n)
++      do {
++        c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++      } while (--n || c != 0) ;
++  }
++  crc = c;
++  return c ^ 0xffffffffL;  /* (instead of ~c for 64-bit machines) */
++}
++/* { dg-final { scan-tree-dump-times "Wrong crc table for crc matching." 1 "loop_crc"} } */
++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read."
1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +new file mode 100644 +index 000000000..dad7bdbfc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +@@ -0,0 +1,83 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -march=armv8.1-a -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 
0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++ulg updcrc (s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n); ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "The 1th loop form is success matched,and the loop can be optimized." 1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +new file mode 100644 +index 000000000..523a7740c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +@@ -0,0 +1,114 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -march=armv8.1-a -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 
0x0d6d6a3eL, 0x7a6a5aa8L,
++  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
++  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
++  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
++  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
++  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
++  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
++  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
++  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
++  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
++  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
++  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
++  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
++  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
++  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
++  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
++  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
++  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
++  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
++  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
++  0x37d83bf1L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
++  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
++  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
++  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
++  0x2d02ef8dL
++};
++int test[5] = {0};
++
++/* check when the loop is doing more than one array read or writing an array, both should fail. */
++ulg updcrc (s, n)
++  uch *s;      /* pointer to bytes to pump through */
++  unsigned n;  /* number of bytes in s[] */
++{
++  register ulg c;  /* temporary variable */
++
++  static ulg crc = (ulg)0xffffffffL;  /* shift register contents */
++
++  if (s == NULL) {
++    c = 0xffffffffL;
++  } else {
++    c = crc;
++    if (n)
++      do {
++        c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) * test[c%5];
++      } while (--n) ;
++  }
++  do {
++    c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++    test[c%5] = c;
++  } while (--n) ;
++  crc = c;
++  return c ^ 0xffffffffL;  /* (instead of ~c for 64-bit machines) */
++}
++
++/* check when the loop is not working on a correct crc_table; should fail. */
++ulg updcrc1(s, n)
++  uch *s;      /* pointer to bytes to pump through */
++  unsigned n;  /* number of bytes in s[] */
++{
++  register ulg c;  /* temporary variable */
++
++  static ulg crc = (ulg)0xffffffffL;  /* shift register contents */
++
++  if (s == NULL) {
++    c = 0xffffffffL;
++  } else {
++    c = crc;
++    if (n)
++      do {
++        c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++      } while (--n) ;
++  }
++  crc = c;
++  return c ^ 0xffffffffL;  /* (instead of ~c for 64-bit machines) */
++}
++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 2 "loop_crc"} } */
++/* { dg-final { scan-tree-dump-times "Wrong crc table for crc matching." 3 "loop_crc"} } */
++/* { dg-final { scan-tree-dump-times "Table check fail. Table not matching."
1 "loop_crc"} } */ +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 8e7510eb3..8341b9ffd 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -220,6 +220,7 @@ DEFTIMEVAR (TV_TREE_COPY_RENAME , "tree rename SSA copies") + DEFTIMEVAR (TV_TREE_SSA_VERIFY , "tree SSA verifier") + DEFTIMEVAR (TV_TREE_STMT_VERIFY , "tree STMT verifier") + DEFTIMEVAR (TV_TREE_ARRAY_WIDEN_COMPARE, "tree array widen compare") ++DEFTIMEVAR (TV_TREE_LOOP_CRC , "tree loop crc") + DEFTIMEVAR (TV_TREE_SWITCH_CONVERSION, "tree switch conversion") + DEFTIMEVAR (TV_TREE_SWITCH_LOWERING, "tree switch lowering") + DEFTIMEVAR (TV_TREE_RECIP , "gimple CSE reciprocals") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 34e60bc38..6cd679e10 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -454,6 +454,7 @@ extern gimple_opt_pass *make_pass_phiopt (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_forwprop (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_phiprop (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_array_widen_compare (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_loop_crc (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_ifcombine (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_dse (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_nrv (gcc::context *ctxt); +diff --git a/gcc/tree-ssa-loop-crc.cc b/gcc/tree-ssa-loop-crc.cc +new file mode 100644 +index 000000000..b9c2f71ca +--- /dev/null ++++ b/gcc/tree-ssa-loop-crc.cc +@@ -0,0 +1,1333 @@ ++/* This pass converts special loops where do CRC algorithms to ++ simple CRC instructions in AArch64. ++ Copyright (C) 2023-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. 
++
++#include "config.h"
++#include "system.h"
++#include "coretypes.h"
++#include "backend.h"
++#include "target.h"
++#include "tree.h"
++#include "gimple.h"
++#include "tree-pass.h"
++#include "gimple-ssa.h"
++#include "tree-pretty-print.h"
++#include "fold-const.h"
++#include "gimplify.h"
++#include "gimple-iterator.h"
++#include "tree-ssa-loop-manip.h"
++#include "tree-ssa-loop.h"
++#include "ssa.h"
++#include "tree-into-ssa.h"
++#include "cfganal.h"
++#include "cfgloop.h"
++#include "gimple-pretty-print.h"
++#include "tree-cfg.h"
++#include "cgraph.h"
++#include "print-tree.h"
++#include "cfghooks.h"
++#include "gimple-fold.h"
++#include "diagnostic-core.h"
++
++/* This pass handles scenarios similar to the following:
++ulg updcrc (s, n)
++  uch *s;
++  unsigned n;
++{
++  register ulg c;
++
++  static ulg crc = (ulg)0xffffffffL;
++
++  if (s == NULL)
++    {
++      c = 0xffffffffL;
++    }
++  else
++    {
++      c = crc;
++      if (n)
++	do
++	  {
++	    c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8);
++	  } while (--n);
++    }
++  crc = c;
++  return c ^ 0xffffffffL;
++}
++
++If the hardware supports the crc instruction, then the pass completes the
++conversion of the above scenario into:
++
++#define SIZE_U32 sizeof(uint32_t)
++unsigned long updcrc(s, n)
++  unsigned char *s;
++  unsigned n;
++{
++  register unsigned long c;
++
++  static unsigned long crc = (unsigned long)0xffffffffL;
++
++  if (s == NULL)
++    {
++      c = 0xffffffffL;
++    }
++  else
++    {
++      c = crc;
++      if (n)
++	{
++	  uint32_t nn = n/SIZE_U32;
++	  do
++	    {
++	      c = __crc32w (c,*((uint32_t *)s));
++	      s += SIZE_U32;
++	    } while(--nn);
++	}
++    }
++  if (n & sizeof (uint16_t))
++    {
++      c = __crc32h (c, *((uint16_t *)s));
++      s += sizeof (uint16_t);
++    }
++  if (n & sizeof (uint8_t))
++    c = __crc32b (c, *s);
++  crc = c;
++  return c ^ 0xffffffffL;
++}
++
++Inside the compiler, the pass completes the conversion of such
++scenarios in four parts:
++1) match_crc_loop: The function screens for such
++   scenarios;
++2) convert_to_new_loop: The function converts
++   origin_loop to the new loops, and removes origin_loop;
++3) origin_loop_info: The structure records important
++   information about origin_loop, such as the loop exit and the
++   initial values of the induction variables;
++4) create_new_loops: The function does the core work of the pass,
++   creating the new loops.  */
++
++extern bool gimple_crc_match_index (tree, tree *, tree (*)(tree));
++extern bool gimple_crc_match_res (tree, tree *, tree (*)(tree));
++
++static gimple *crc_table_read_stmt = NULL;
++
++static gphi *phi_s = NULL;
++static gphi *phi_c = NULL;
++static tree nn_tree = NULL;
++
++enum aarch64_crc_builtins
++{
++  AARCH64_BUILTIN_CRC32B,
++  AARCH64_BUILTIN_CRC32H,
++  AARCH64_BUILTIN_CRC32W,
++};
++
++/* Useful information about the origin loop. */
++struct origin_loop_info
++{
++  tree limit;		/* The limit index of the array in the old loop. */
++  tree base_n;		/* The initial value of n in the old loop. */
++  tree base_s;		/* The initial value of s in the old loop. */
++  tree base_c;		/* The initial value of c in the old loop. */
++  edge entry_edge;	/* The edge into the old loop. */
++  edge exit_edge;	/* The edge out of the old loop. */
++  basic_block exit_bb;
++};
++
++typedef struct origin_loop_info origin_loop_info;
++
++static origin_loop_info origin_loop;
++hash_map n_map;
++hash_map nn_map;
++hash_map s_map;
++hash_map c_map;
++hash_map crc_map;
++
++/* Initialize the origin_loop structure. */
++static void
++init_origin_loop_structure ()
++{
++  origin_loop.entry_edge = NULL;
++  origin_loop.exit_edge = NULL;
++  origin_loop.exit_bb = NULL;
++  origin_loop.limit = NULL;
++  origin_loop.base_n = NULL;
++  origin_loop.base_s = NULL;
++  origin_loop.base_c = NULL;
++}
++
++/* Get the edge through which the loop is first entered. */
++static edge
++get_loop_preheader_edge (class loop *loop)
++{
++  edge e;
++  edge_iterator ei;
++
++  FOR_EACH_EDGE (e, ei, loop->header->preds)
++    if (e->src != loop->latch)
++      break;
++
++  return e;
++}
++
++/* Return true if T is an SSA_NAME with an associated user variable. */
++
++static bool
++ssa_name_var_p (tree t)
++{
++  if (!t || TREE_CODE (t) != SSA_NAME)
++    return false;
++  if (SSA_NAME_VAR (t))
++    return true;
++  return false;
++}
++
++/* Return true if T1 and T2 are SSA_NAMEs that belong to the same variable. */
++
++static bool
++same_ssa_name_var_p (tree t1, tree t2)
++{
++  if (!ssa_name_var_p (t1) || !ssa_name_var_p (t2))
++    return false;
++  if (SSA_NAME_VAR (t1) == SSA_NAME_VAR (t2))
++    return true;
++  return false;
++}
++
++/* Get the origin loop induction variable upper bound. */
++
++static bool
++get_iv_upper_bound (gimple *stmt)
++{
++  if (origin_loop.limit != NULL || origin_loop.base_n != NULL)
++    return false;
++
++  tree lhs = gimple_cond_lhs (stmt);
++  tree rhs = gimple_cond_rhs (stmt);
++
++  if (TREE_CODE (TREE_TYPE (lhs)) != INTEGER_TYPE
++      || TREE_CODE (TREE_TYPE (rhs)) != INTEGER_TYPE)
++    return false;
++
++  /* TODO: Currently, the input restrictions on lhs and rhs are implemented
++     through PARM_DECL.  We may consider relaxing the restrictions later;
++     we would need to consider the overall adaptation scenario and add test
++     cases. */
++  if (ssa_name_var_p (lhs) && TREE_CODE (SSA_NAME_VAR (lhs)) == PARM_DECL)
++    {
++      origin_loop.limit = rhs;
++      origin_loop.base_n = lhs;
++    }
++  else
++    return false;
++
++  if (origin_loop.limit != NULL && origin_loop.base_n != NULL)
++    return true;
++
++  return false;
++}
++
++/* Get origin loop info. */
++static bool
++get_origin_loop_info (class loop *loop)
++{
++  auto_vec<edge> edges = get_loop_exit_edges (loop);
++  origin_loop.exit_edge = edges[0];
++  origin_loop.exit_bb = origin_loop.exit_edge->dest;
++  origin_loop.entry_edge = get_loop_preheader_edge (loop);
++  origin_loop.base_s = PHI_ARG_DEF_FROM_EDGE (phi_s, origin_loop.entry_edge);
++  origin_loop.base_c = PHI_ARG_DEF_FROM_EDGE (phi_c, origin_loop.entry_edge);
++
++  basic_block preheader_bb = origin_loop.entry_edge->src;
++
++  if (preheader_bb->preds->length () != 1)
++    return false;
++
++  edge entry_pre_bb_edge = EDGE_PRED (preheader_bb, 0);
++
++  basic_block pre_preheader_bb = entry_pre_bb_edge->src;
++
++  gimple_stmt_iterator gsi;
++  gimple *stmt;
++  bool get_upper_bound = false;
++  for (gsi = gsi_start_bb (pre_preheader_bb); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      stmt = gsi_stmt (gsi);
++      if (stmt && gimple_code (stmt) == GIMPLE_COND
++	  && get_iv_upper_bound (stmt))
++	{
++	  get_upper_bound = true;
++	  break;
++	}
++    }
++
++  return get_upper_bound;
++}
++
++/* The loop form check examines the entire loop control flow.
++   It should be a loop that:
++   1. is a do-while loop with header and latch only, with no other control
++      flow inside the loop;
++   2. has only one exiting edge;
++   3. has only one back edge and one entry edge.
++*/
++static bool
++crc_loop_form_check (class loop *loop)
++{
++  if (loop->num_nodes > 2 || loop->inner)
++    return false;
++  // Should only have 1 exit edge
++  auto_vec<edge> edges = get_loop_exit_edges (loop);
++  if (edges.length () != 1)
++    return false;
++
++  // The header should have only 2 incoming edges
++  // One of them is the preheader edge and the other is the backedge from the
++  // latch
++  if (EDGE_COUNT (loop->header->preds) != 2)
++    return false;
++  edge e1 = EDGE_PRED (loop->header, 0);
++  edge e2 = EDGE_PRED (loop->header, 1);
++
++  if ((e1->src == loop->latch && e2->src->loop_father != loop)
++      || (e2->src == loop->latch && e1->src->loop_father != loop))
++    return true;
++
++  return false;
++}
++
++/* Check that only one array is read in the loop.
++   Return the only array as CRC_TABLE. */
++static bool
++only_one_array_read (class loop *loop, tree &crc_table)
++{
++  gimple_stmt_iterator gsi;
++  gimple *stmt;
++  bool res = false;
++  for (gsi = gsi_start_bb (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      stmt = gsi_stmt (gsi);
++      if (stmt == NULL)
++	return false;
++
++      if (gimple_code (stmt) == GIMPLE_ASSIGN
++	  && TREE_CODE (gimple_assign_lhs (stmt)) == ARRAY_REF)
++	return false;
++
++      /* Only one-dimensional integer arrays meet the condition. */
++      if (gimple_code (stmt) == GIMPLE_ASSIGN
++	  && TREE_CODE (gimple_assign_rhs1 (stmt)) == ARRAY_REF
++	  && TREE_CODE (TREE_OPERAND (gimple_assign_rhs1 (stmt), 0)) == VAR_DECL
++	  && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) == INTEGER_TYPE)
++	{
++	  if (crc_table == NULL
++	      && TREE_READONLY (gimple_assign_rhs1 (stmt)))
++	    {
++	      crc_table = gimple_assign_rhs1 (stmt);
++	      crc_table_read_stmt = stmt;
++	      res = true;
++	    }
++	  else
++	    return false;
++	}
++    }
++  return res;
++}
++
++static const unsigned HOST_WIDE_INT crc_32_tab[] = {
++  0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
++  0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
++  0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
++  0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
++  0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
++  0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
++  0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
++  0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
++  0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
++  0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
++  0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
++  0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
++  0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
++  0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
++  0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
++  0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
++  0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
++  0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
++  0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
++  0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
++  0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
++  0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
++  0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
++  0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
++  0x5edef90eL, 0x29d9c998L, 0xb0d09822L,
0xc7d7a8b4L, 0x59b33d17L,
++  0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
++  0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
++  0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
++  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
++  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
++  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
++  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
++  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
++  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
++  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
++  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
++  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
++  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
++  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
++  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
++  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
++  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
++  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
++  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
++  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
++  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
++  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
++  0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
++  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
++  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
++  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
++  0x2d02ef8dL
++};
++
++/* Check the content of the array. */
++static bool
++match_crc_table (tree crc_table)
++{
++  const unsigned LOW_BOUND = 0;
++  const unsigned UP_BOUND = 255;
++  const unsigned ELEMENT_SIZE = 8;
++  tree low_bound = array_ref_low_bound (crc_table);
++  tree up_bound = array_ref_up_bound (crc_table);
++  tree element_size = array_ref_element_size (crc_table);
++  if (!tree_fits_uhwi_p (low_bound) || !tree_fits_uhwi_p (up_bound)
++      || !tree_fits_uhwi_p (element_size))
++    return false;
++  unsigned HOST_WIDE_INT lb = tree_to_uhwi (low_bound);
++  unsigned HOST_WIDE_INT ub = tree_to_uhwi (up_bound);
++  unsigned HOST_WIDE_INT es = tree_to_uhwi (element_size);
++  if (lb != LOW_BOUND || ub != UP_BOUND || es != ELEMENT_SIZE)
++    return false;
++
++  tree decl = TREE_OPERAND (crc_table, 0);
++  tree ctor = ctor_for_folding (decl);
++  for (unsigned HOST_WIDE_INT i = lb; i <= ub; i++)
++    {
++      unsigned HOST_WIDE_INT val = tree_to_uhwi (CONSTRUCTOR_ELT (ctor,
++								  i)->value);
++      if (crc_32_tab[i] != val)
++	return false;
++    }
++  return true;
++}
++
++/* Check the crc table.  The loop should have only one data reference.
++   And match the data reference with the predefined array. */
++static bool
++crc_table_check (class loop *loop)
++{
++  tree crc_table = NULL;
++  if (!only_one_array_read (loop, crc_table))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nTable check fail. not only single array "
++			    "is read.\n");
++      return false;
++    }
++  if (!match_crc_table (crc_table))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nTable check fail. Table not matching.\n");
		 "\nTable check failed: table does not match.\n");
++      return false;
++    }
++  return true;
++}
++
++/* Check whether the evolution pattern of PHI is phi = SSA_NAME + TARGET.  */
++static bool
++evolution_pattern_plus_with_p (class loop *loop, gphi *phi,
++			       unsigned HOST_WIDE_INT target)
++{
++  edge backedge = find_edge (loop->latch, loop->header);
++  if (backedge == NULL)
++    return false;
++  tree evolution_node = PHI_ARG_DEF_FROM_EDGE (phi, backedge);
++  if (TREE_CODE (evolution_node) != SSA_NAME)
++    return false;
++  gimple *evolution_expr = SSA_NAME_DEF_STMT (evolution_node);
++
++  if (evolution_expr && is_gimple_assign (evolution_expr)
++      && (gimple_assign_rhs_code (evolution_expr) == PLUS_EXPR
++	  || gimple_assign_rhs_code (evolution_expr) == POINTER_PLUS_EXPR))
++    {
++      tree rhs1 = gimple_assign_rhs1 (evolution_expr);
++      tree rhs2 = gimple_assign_rhs2 (evolution_expr);
++      if (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == INTEGER_CST
++	  && tree_to_uhwi (rhs2) == target)
++	return true;
++    }
++  return false;
++}
++
++/* Check whether there are exactly 3 phi nodes in the header block.
++   Return the 3 phi nodes in CAPTURE.  */
++static bool
++check_num_of_phi (basic_block header, gphi *capture[])
++{
++  gphi *phi;
++  gphi_iterator gsi;
++  int num_of_phi = 0;
++
++  for (gsi = gsi_start_phis (header); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      phi = gsi.phi ();
++      if (phi)
++	num_of_phi++;
++      if (num_of_phi > 3)
++	return false;
++      capture[num_of_phi - 1] = phi;
++    }
++  /* There should be exactly 3 phi nodes.  */
++  return num_of_phi == 3;
++}
++
++/* Check the evolution pattern of the three phi nodes.  One of the nodes
++   should be incremented by 1 every iteration (s), one should be decremented
++   by 1 every iteration (n), and the third should be neither (c).  Return
++   the 3 phi nodes in CAPTURE in the order s, n, c.  */
++static bool
++check_evolution_pattern (class loop *loop, gphi *capture[])
++{
++  gphi *s = NULL;
++  gphi *n = NULL;
++  gphi *c = NULL;
++
++  for (int i = 0; i < 3; i++)
++    {
++      if (evolution_pattern_plus_with_p (loop, capture[i], 1))
++	{
++	  if (s != NULL)
++	    return false;
++	  s = capture[i];
++	  phi_s = s;
++	}
++      /* 4294967295 is (uint32_t) -1, i.e. a decrement by 1.  */
++      else if (evolution_pattern_plus_with_p (loop, capture[i], 4294967295))
++	{
++	  if (n != NULL)
++	    return false;
++	  n = capture[i];
++	}
++      else
++	{
++	  if (c != NULL)
++	    return false;
++	  c = capture[i];
++	  phi_c = c;
++	}
++    }
++
++  /* Some evolution pattern could not be found.  */
++  if (!n || !s || !c)
++    return false;
++
++  capture[0] = s;
++  capture[1] = n;
++  capture[2] = c;
++  return true;
++}
++
++/* Check the calculation pattern before and after the crc_table array read
++   stmt:
++     _7 = crc_32_tab[_6];
++   The calculation of the index _6 should be the result of a sequence of
++   calculations on s and c.
++   The result of the array read _7 should be used to calculate the new c.
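++
++   Taken together, the matched statements compute the scalar CRC step
++   shown in full further below (illustrative):
++     c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);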
++   */
++static bool
++check_calculation_pattern (class loop *loop, gphi *capture[])
++{
++  gphi *s = capture[0];
++  gphi *c = capture[2];
++  tree res_ops[3];
++  tree index = TREE_OPERAND (gimple_assign_rhs1 (crc_table_read_stmt), 1);
++
++  /* Try to match
++       _4 = (int) _3;   // NOP_EXPR (SSA_NAME @2)
++       _5 = _4 ^ c_10;  // BIT_XOR_EXPR (SSA_NAME, PHI @1)
++       _6 = _5 & 255;   // BIT_AND_EXPR (SSA_NAME, INTEGER_CST @3)
++  */
++  if (!gimple_crc_match_index (index, res_ops, NULL))
++    return false;
++  gimple *s_res_stmt = SSA_NAME_DEF_STMT (res_ops[0]);
++  if (!s_res_stmt)
++    return false;
++  gimple *s_def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (s_res_stmt));
++  if (!s_def_stmt)
++    return false;
++  tree s_res = TREE_OPERAND (gimple_assign_rhs1 (s_def_stmt), 0);
++  if (res_ops[1] != gimple_phi_result (c) || s_res != gimple_phi_result (s))
++    return false;
++
++  /* Try to match
++       _8 = c_12 >> 8;  // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2)
++       c_19 = _7 ^ _8;  // BIT_XOR_EXPR (SSA_NAME @3, SSA_NAME)
++  */
++  edge backedge = find_edge (loop->latch, loop->header);
++  tree updated_c = PHI_ARG_DEF_FROM_EDGE (c, backedge);
++  if (!gimple_crc_match_res (updated_c, res_ops, NULL))
++    return false;
++  if (res_ops[0] != gimple_phi_result (c)
++      || res_ops[2] != gimple_assign_lhs (crc_table_read_stmt))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n gimple_crc_match_res pattern check failed.\n");
++      return false;
++    }
++
++  return true;
++}
++
++/* Check that the exit condition is n != 0.  */
++static bool
++check_exit_condition (class loop *loop, gphi *n)
++{
++  edge backedge = find_edge (loop->latch, loop->header);
++  gimple *cond_stmt = gsi_stmt (gsi_last_bb (loop->header));
++  if (!cond_stmt || gimple_code (cond_stmt) != GIMPLE_COND
++      || gimple_cond_code (cond_stmt) != NE_EXPR
++      || gimple_cond_lhs (cond_stmt) != PHI_ARG_DEF_FROM_EDGE (n, backedge)
++      || !integer_zerop (gimple_cond_rhs (cond_stmt)))
++    return false;
++
++  return true;
++}
++
++/* Check the loop body.  The loop body we are trying to match is
++
++# s_10 = PHI
++# n_11 = PHI
++# c_12 = PHI
++_1 = (int) c_12;
++s_18 = s_10 + 1;
++_3 = *s_10;
++_4 = (int) _3;
++_5 = _1 ^ _4;
++_6 = _5 & 255;
++_7 = crc_32_tab[_6];
++_8 = c_12 >> 8;
++c_19 = _7 ^ _8;
++n_20 = n_11 + 4294967295;
++if (n_20 != 0)
++  goto ; [INV]
++else
++  goto ; [INV]
++
++which is doing the very simple calculation
++do {
++  c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++} while (--n);
++
++In this case, we don't want the loop to contain any other operations, so
++the matching conditions are:
++1. There are only 3 loop variants in each iteration, namely s, c and n,
++   which is enforced by requiring the loop to have exactly 3 phi nodes.
++2. The 3 loop variants should have an evolution pattern in which one node
++   is increased by 1 every iteration, one node is decreased by 1 every
++   iteration, and the third is neither.  These three SSA values are
++   captured for the later arithmetic pattern matching.
++3. Pattern matching for the index of crc_table.
++4. Pattern matching for the result of the c calculation after the read
++   from crc_table.
++5. The exit condition matching.
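++
++Once all five conditions hold, the loop is known to compute a CRC-32 one
++byte at a time, and (as sketched in the create_* functions below) it can
++be replaced by a word-at-a-time loop using the target CRC32 builtins.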
++*/
++static bool
++crc_loop_body_check (class loop *loop)
++{
++  basic_block header = loop->header;
++  gphi *capture[3];
++  if (!check_num_of_phi (header, capture))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n number of phi nodes check failed.\n");
++      return false;
++    }
++  if (!check_evolution_pattern (loop, capture))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n evolution pattern check failed.\n");
++      return false;
++    }
++  if (!check_calculation_pattern (loop, capture))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n calculation pattern check failed.\n");
++      return false;
++    }
++  if (!check_exit_condition (loop, capture[1]))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n exit condition check failed.\n");
++      return false;
++    }
++  return true;
++}
++
++static bool
++check_prev_bb (basic_block prev_bb, enum tree_code code)
++{
++  gimple_stmt_iterator gsi;
++  gimple *stmt;
++  for (gsi = gsi_start_bb (prev_bb); !gsi_end_p (gsi);
++       gsi_next (&gsi))
++    {
++      stmt = gsi_stmt (gsi);
++      if (stmt == NULL)
++	return false;
++
++      if (gimple_code (stmt) == GIMPLE_COND
++	  && gimple_cond_code (stmt) == code
++	  && TREE_CODE (gimple_cond_rhs (stmt)) == INTEGER_CST
++	  && tree_int_cst_sgn (gimple_cond_rhs (stmt)) == 0)
++	return true;
++    }
++  return false;
++}
++
++/* Check the basic blocks before the loop header.  The blocks we are
++   trying to match are
++
++c_15 = crc;
++if (n_16 (D) != 0)
++  goto ; [INV]
++else
++  goto ; [INV]
++
++   Here we must be sure that n is not zero, so the match condition is:
++   1. n is not zero.
++
++ :
++if (s_13 (D) == 0B)
++  goto ; [INV]
++else
++  goto ; [INV]
++
++   Here we must be sure that s is not NULL, so the match condition is:
++   1. s is not NULL.
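++
++   In C terms, the guards being matched look roughly like (illustrative):
++     if (s == NULL || n == 0)
++       return;  // the loop is never entered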
++*/
++static bool
++crc_prev_bb_of_loop_header_check (class loop *loop)
++{
++  basic_block header = loop->header;
++  basic_block prev_header_bb = header->prev_bb;
++  if (prev_header_bb == NULL)
++    return false;
++
++  basic_block prev_prev_header_bb = prev_header_bb->prev_bb;
++  if (prev_prev_header_bb == NULL)
++    return false;
++
++  if (!check_prev_bb (prev_prev_header_bb, NE_EXPR))
++    return false;
++
++  basic_block first_bb = prev_prev_header_bb->prev_bb;
++  if (first_bb == NULL)
++    return false;
++
++  if (!check_prev_bb (first_bb, EQ_EXPR))
++    return false;
++
++  return true;
++}
++
++static bool
++match_crc_loop (class loop *loop)
++{
++  if (!crc_loop_form_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong loop form for crc matching.\n");
++      return false;
++    }
++  if (!crc_table_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong crc table for crc matching.\n");
++      return false;
++    }
++  if (!crc_loop_body_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong loop body for crc matching.\n");
++      return false;
++    }
++  if (!crc_prev_bb_of_loop_header_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong preceding basic blocks of loop header"
++			    " for crc matching.\n");
++      return false;
++    }
++
++  init_origin_loop_structure ();
++  if (!get_origin_loop_info (loop))
++    return false;
++
++  return true;
++}
++
++static void
++create_new_bb (basic_block &new_bb, basic_block after_bb,
++	       basic_block dominator_bb, class loop *outer)
++{
++  new_bb = create_empty_bb (after_bb);
++  add_bb_to_loop (new_bb, outer);
++  set_immediate_dominator (CDI_DOMINATORS, new_bb, dominator_bb);
++}
++
++/* Emit nn = n >> 2 (the number of whole 4-byte words) at the end of the
++   preheader.  */
++static void
++change_preheader_bb (edge entry_edge)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple *g;
++  tree lhs1;
++
++  lhs1 = create_tmp_var (TREE_TYPE (origin_loop.base_n), "nn");
++  lhs1 = make_ssa_name (lhs1);
++  gsi = gsi_last_bb (entry_edge->src);
++  g = gimple_build_assign (lhs1, RSHIFT_EXPR, origin_loop.base_n,
++			   build_int_cst (TREE_TYPE (origin_loop.base_n), 2));
++  gimple_seq_add_stmt (&stmts, g);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++  nn_tree = lhs1;
++  set_current_def (nn_tree, lhs1);
++  nn_map.put (entry_edge->src, lhs1);
++}
++
++static gphi *
++create_phi_node_for_bb (tree old_name, basic_block bb)
++{
++  gphi *phi = create_phi_node (NULL_TREE, bb);
++  create_new_def_for (old_name, phi, gimple_phi_result_ptr (phi));
++  return phi;
++}
++
++static gimple *
++call_builtin_fun (int code, tree &lhs, tree arg1, tree arg2)
++{
++  unsigned int builtin_code = targetm.get_crc_builtin_code (code, true);
++  // Get the decl of the CRC builtin, e.g. __builtin_aarch64_crc32w.
++  tree fn = targetm.builtin_decl (builtin_code, true);
++  if (!fn || fn == error_mark_node)
++    fatal_error (input_location,
++		 "target specific builtin not available");
++  gimple *call_builtin = gimple_build_call (fn, 2, arg1, arg2);
++  lhs = make_ssa_name (unsigned_type_node);
++  gimple_call_set_lhs (call_builtin, lhs);
++
++  return call_builtin;
++}
++
++/* Create loop_header and loop_latch for the new loop.
++ :
++  # s_14 = PHI
++  # c_16 = PHI
++  # nn_19 = PHI
++  _1 = (unsigned int) c_16;
++  _2 = MEM[(uint32_t *)s_14];
++  _40 = __builtin_aarch64_crc32w (_1, _2);
++  c_29 = (long unsigned int) _40;
++  s_30 = s_14 + 4;
++  nn_31 = nn_19 + 4294967295;
++  if (nn_31 != 0)
++  The IR of the bb is as above.
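++
++  Roughly, the block above corresponds to this C (illustrative; nn is
++  n >> 2, computed in the preheader):
++    do {
++      c = __builtin_aarch64_crc32w (c, *(uint32_t *) s);
++      s += 4;
++    } while (--nn != 0);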
++  */
++static void
++create_loop_bb (basic_block &loop_bb, basic_block after_bb,
++		basic_block dominator_bb, class loop *outer, edge entry_edge)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple *g;
++  gphi *phi_s_loop;
++  gphi *phi_c_loop;
++  gphi *phi_nn_loop;
++
++  create_new_bb (loop_bb, after_bb, dominator_bb, outer);
++  redirect_edge_and_branch (entry_edge, loop_bb);
++  gsi = gsi_last_bb (loop_bb);
++  tree entry_nn = get_current_def (nn_tree);
++  phi_s_loop = create_phi_node_for_bb (origin_loop.base_s, loop_bb);
++  phi_c_loop = create_phi_node_for_bb (origin_loop.base_c, loop_bb);
++  phi_nn_loop = create_phi_node_for_bb (entry_nn, loop_bb);
++
++  tree res_s = gimple_phi_result (phi_s_loop);
++  tree res_nn = gimple_phi_result (phi_nn_loop);
++  tree lhs1 = gimple_build (&stmts, NOP_EXPR, unsigned_type_node,
++			    gimple_phi_result (phi_c_loop));
++  g = gimple_build_assign (make_ssa_name (unsigned_type_node),
++			   fold_build2 (MEM_REF, unsigned_type_node, res_s,
++					build_int_cst (build_pointer_type
++						       (unsigned_type_node),
++						       0)));
++  gimple_seq_add_stmt (&stmts, g);
++  tree lhs2 = gimple_assign_lhs (g); // _2 = MEM[(uint32_t *)s_14];
++  unsigned int code = AARCH64_BUILTIN_CRC32W;
++  tree lhs3;
++  gimple *build_crc32w = call_builtin_fun (code, lhs3, lhs1, lhs2);
++  crc_map.put (loop_bb, lhs3);
++  gimple_seq_add_stmt (&stmts, build_crc32w);
++
++  tree lhs4 = copy_ssa_name (origin_loop.base_c);
++  g = gimple_build_assign (lhs4, NOP_EXPR, lhs3);
++  gimple_seq_add_stmt (&stmts, g);
++  c_map.put (loop_bb, lhs4);
++
++  tree lhs5 = copy_ssa_name (origin_loop.base_s);
++  g = gimple_build_assign (lhs5, POINTER_PLUS_EXPR, res_s,
++			   build_int_cst (sizetype, 4));
++  gimple_seq_add_stmt (&stmts, g);
++  s_map.put (loop_bb, lhs5);
++
++  /* nn_31 = nn_19 - 1, expressed as an unsigned addition.  */
++  tree lhs6 = copy_ssa_name (nn_tree);
++  g = gimple_build_assign (lhs6, PLUS_EXPR, res_nn,
++			   build_int_cst (TREE_TYPE (res_nn), 4294967295));
++  gimple_seq_add_stmt (&stmts, g);
++  nn_map.put (loop_bb, lhs6);
++
++  gcond *cond_stmt = gimple_build_cond (NE_EXPR, lhs6, origin_loop.limit,
++					NULL_TREE, NULL_TREE);
++  gimple_seq_add_stmt (&stmts, cond_stmt);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++}
++
++/* :
++  # c_6 = PHI
++  # s_46 = PHI
++  _44 = n_26(D) & 2;
++  if (_44 != 0)
++  The IR of the bb is as above.
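++
++  That is, after the word loop this block tests n & 2 to see whether at
++  least two trailing bytes remain; if so they are handled by crc32h in
++  the cond_true block.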
++  */
++static void
++create_cond_bb (basic_block &cond_bb, basic_block after_bb,
++		basic_block dominator_bb, class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gphi *phi_s_loop;
++  gphi *phi_c_loop;
++
++  create_new_bb (cond_bb, after_bb, dominator_bb, outer);
++  gsi = gsi_last_bb (cond_bb);
++  tree entry_nn = get_current_def (nn_tree);
++  phi_s_loop = create_phi_node_for_bb (origin_loop.base_s, cond_bb);
++  phi_c_loop = create_phi_node_for_bb (origin_loop.base_c, cond_bb);
++  tree res_s = gimple_phi_result (phi_s_loop);
++  set_current_def (origin_loop.base_s, res_s);
++  s_map.put (cond_bb, res_s);
++  tree res_c = gimple_phi_result (phi_c_loop);
++  set_current_def (origin_loop.base_c, res_c);
++  c_map.put (cond_bb, res_c);
++
++  tree lhs1 = gimple_build (&stmts, BIT_AND_EXPR,
++			    TREE_TYPE (origin_loop.base_n),
++			    origin_loop.base_n,
++			    build_int_cst (TREE_TYPE (origin_loop.base_n), 2));
++  gcond *cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit,
++					NULL_TREE, NULL_TREE);
++  gimple_seq_add_stmt (&stmts, cond_stmt);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++}
++
++/* :
++  _7 = MEM[(uint16_t *)s_46];
++  _41 = __builtin_aarch64_crc32h (_8, _7);
++  c_33 = (long unsigned int) _41;
++  s_34 = s_30 + 2;
++  The IR of the bb is as above.  */
++static void
++create_cond_true_bb (basic_block &cond_true_bb, basic_block after_bb,
++		     basic_block dominator_bb, class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple *g;
++  gimple_stmt_iterator gsi;
++
++  create_new_bb (cond_true_bb, after_bb, dominator_bb, outer);
++  gsi = gsi_last_bb (cond_true_bb);
++  tree s_46 = *(s_map.get (after_bb));
++  tree type = build_pointer_type (short_unsigned_type_node);
++  g = gimple_build_assign (make_ssa_name (short_unsigned_type_node),
++			   fold_build2 (MEM_REF, short_unsigned_type_node,
++					s_46, build_int_cst (type, 0)));
++  gimple_seq_add_stmt (&stmts, g);
++  tree lhs1 = gimple_assign_lhs (g); // _7 = MEM[(uint16_t *)s_46];
++  unsigned int code = AARCH64_BUILTIN_CRC32H;
++  tree lhs2;
++  gimple *call_builtin
++    = call_builtin_fun (code, lhs2,
++			*(crc_map.get (cond_true_bb->prev_bb->prev_bb)),
++			lhs1);
++  crc_map.put (cond_true_bb, lhs2);
++  gimple_seq_add_stmt (&stmts, call_builtin);
++
++  tree lhs3 = copy_ssa_name (origin_loop.base_c);
++  g = gimple_build_assign (lhs3, NOP_EXPR, lhs2);
++  gimple_seq_add_stmt (&stmts, g);
++  c_map.put (cond_true_bb, lhs3);
++
++  tree lhs5 = copy_ssa_name (s_46);
++  g = gimple_build_assign (lhs5, POINTER_PLUS_EXPR, s_46,
++			   build_int_cst (sizetype, 2)); // s_30 + 2;
++  gimple_seq_add_stmt (&stmts, g);
++  s_map.put (cond_true_bb, lhs5);
++
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++}
++
++/* :
++  # s_15 = PHI
++  # c_17 = PHI
++  _3 = n_26(D) & 1;
++  if (_3 != 0)
++  The IR of the bb is as above.
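++
++  That is, this block tests n & 1 to see whether one final byte remains;
++  if so it is handled by crc32b in the last block.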
++  */
++static void
++create_cond_false_bb (basic_block &cond_false_bb, basic_block after_bb,
++		      basic_block dominator_bb, class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gphi *phi_s_cond_false_bb;
++  gphi *phi_c_cond_false_bb;
++
++  create_new_bb (cond_false_bb, after_bb, dominator_bb, outer);
++  make_single_succ_edge (after_bb, cond_false_bb, EDGE_FALLTHRU);
++
++  tree entry_s = get_current_def (origin_loop.base_s);
++  phi_s_cond_false_bb = create_phi_node_for_bb (entry_s, cond_false_bb);
++  tree entry_c = get_current_def (origin_loop.base_c);
++  phi_c_cond_false_bb = create_phi_node_for_bb (entry_c, cond_false_bb);
++  tree res_s = gimple_phi_result (phi_s_cond_false_bb);
++  set_current_def (origin_loop.base_s, res_s);
++  s_map.put (cond_false_bb, res_s);
++  tree res_c = gimple_phi_result (phi_c_cond_false_bb);
++  set_current_def (origin_loop.base_c, res_c);
++  c_map.put (cond_false_bb, res_c);
++
++  gsi = gsi_last_bb (cond_false_bb);
++  tree lhs1 = gimple_build (&stmts, BIT_AND_EXPR,
++			    TREE_TYPE (origin_loop.base_n),
++			    origin_loop.base_n,
++			    build_int_cst (TREE_TYPE (origin_loop.base_n), 1));
++  gcond *cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit,
++					NULL_TREE, NULL_TREE);
++  gimple_seq_add_stmt (&stmts, cond_stmt);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++}
++
++/* :
++  _11 = (unsigned int) c_17;
++  _12 = *s_15;
++  _42 = __builtin_aarch64_crc32b (_11, _12);
++  c_36 = (long unsigned int) _42;
++  The IR of the bb is as above.  */
++static void
++create_lastcond_true_bb (basic_block &new_bb, basic_block after_bb,
++			 basic_block dominator_bb, class loop *outer)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple *g;
++
++  create_new_bb (new_bb, after_bb, dominator_bb, outer);
++  gsi = gsi_last_bb (new_bb);
++
++  tree lhs1 = gimple_build (&stmts, NOP_EXPR, unsigned_type_node,
++			    get_current_def (origin_loop.base_c));
++  tree lhs2;
++  tree s_15 = get_current_def (origin_loop.base_s);
++  g = gimple_build_assign (make_ssa_name (unsigned_char_type_node),
++			   fold_build2 (MEM_REF, unsigned_char_type_node,
++					s_15,
++					build_int_cst (TREE_TYPE (s_15), 0)));
++  gimple_seq_add_stmt (&stmts, g);
++  lhs2 = gimple_assign_lhs (g);
++
++  unsigned int code = AARCH64_BUILTIN_CRC32B;
++  tree lhs3;
++  gimple *call_builtin = call_builtin_fun (code, lhs3, lhs1, lhs2);
++  crc_map.put (new_bb, lhs3);
++  gimple_seq_add_stmt (&stmts, call_builtin);
++
++  tree lhs4 = copy_ssa_name (origin_loop.base_c);
++  g = gimple_build_assign (lhs4, NOP_EXPR, lhs3);
++  gimple_seq_add_stmt (&stmts, g);
++  c_map.put (new_bb, lhs4);
++
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++}
++
++static bool
++optional_add_phi_arg (gphi *phi, tree phi_res, tree phi_arg, edge e)
++{
++  location_t loc;
++  if (same_ssa_name_var_p (phi_arg, phi_res))
++    {
++      if (virtual_operand_p (phi_arg))
++	loc = UNKNOWN_LOCATION;
++      else
++	loc = gimple_location (SSA_NAME_DEF_STMT (phi_arg));
++      add_phi_arg (phi, phi_arg, e, loc);
++
++      return true;
++    }
++
++  return false;
++}
++
++/* Add phi args for the phi nodes in BB.
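++   For each incoming edge that does not yet have an argument, use the
++   latest definition of c, nn or s recorded for the source block, falling
++   back to the original base values when nothing was recorded.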
++   */
++static void
++update_phi_nodes (basic_block bb)
++{
++  edge e;
++  edge_iterator ei;
++  gphi *phi;
++  gphi_iterator gsi;
++  tree res;
++
++  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      phi = gsi.phi ();
++      res = gimple_phi_result (phi);
++
++      FOR_EACH_EDGE (e, ei, bb->preds)
++	{
++	  if (PHI_ARG_DEF_FROM_EDGE (phi, e))
++	    continue;
++	  tree var_c;
++	  tree *ptr_var_c = c_map.get (e->src);
++	  if (ptr_var_c == NULL)
++	    var_c = origin_loop.base_c;
++	  else
++	    var_c = *ptr_var_c;
++	  if (optional_add_phi_arg (phi, res, var_c, e))
++	    continue;
++
++	  tree var_nn;
++	  tree *ptr_var_nn = nn_map.get (e->src);
++	  if (ptr_var_nn == NULL)
++	    var_nn = nn_tree;
++	  else
++	    var_nn = *ptr_var_nn;
++	  if (optional_add_phi_arg (phi, res, var_nn, e))
++	    continue;
++
++	  tree var_s;
++	  tree *ptr_var_s = s_map.get (e->src);
++	  if (ptr_var_s == NULL)
++	    var_s = origin_loop.base_s;
++	  else
++	    var_s = *ptr_var_s;
++	  if (optional_add_phi_arg (phi, res, var_s, e))
++	    continue;
++	}
++    }
++}
++
++static void
++create_new_loops (edge entry_edge)
++{
++  class loop *new_loop = NULL;
++  basic_block loop_bb, cond_bb, cond_true_bb, cond_false_bb, lastcond_true_bb;
++  class loop *outer = entry_edge->src->loop_father;
++  change_preheader_bb (entry_edge);
++
++  create_loop_bb (loop_bb, entry_edge->src, entry_edge->src, outer,
++		  entry_edge);
++  create_cond_bb (cond_bb, loop_bb, loop_bb, outer);
++  make_edge (loop_bb, loop_bb, EDGE_TRUE_VALUE);
++  make_edge (loop_bb, cond_bb, EDGE_FALSE_VALUE);
++  update_phi_nodes (loop_bb);
++
++  new_loop = alloc_loop ();
++  new_loop->header = loop_bb;
++  new_loop->latch = loop_bb;
++  add_loop (new_loop, outer);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "\nPrint new loop %d:\n", new_loop->num);
++      flow_loop_dump (new_loop, dump_file, NULL, 1);
++      fprintf (dump_file, "\n\n");
++    }
++
++  create_cond_true_bb (cond_true_bb, cond_bb, cond_bb, outer);
++  make_edge (cond_bb, cond_true_bb, EDGE_TRUE_VALUE);
++  create_cond_false_bb (cond_false_bb, cond_true_bb, cond_bb, outer);
++  make_edge (cond_bb, cond_false_bb, EDGE_FALSE_VALUE);
++  update_phi_nodes (cond_bb);
++  update_phi_nodes (cond_false_bb);
++  create_lastcond_true_bb (lastcond_true_bb, cond_false_bb,
++			   cond_false_bb, outer);
++  make_edge (cond_false_bb, lastcond_true_bb, EDGE_TRUE_VALUE);
++  make_edge (cond_false_bb, origin_loop.exit_bb, EDGE_FALSE_VALUE);
++  make_single_succ_edge (lastcond_true_bb, origin_loop.exit_bb,
++			 EDGE_FALLTHRU);
++
++  update_phi_nodes (origin_loop.exit_bb);
++  remove_edge (origin_loop.exit_edge);
++}
++
++/* Delete the original loop and its basic blocks.  */
++static void
++remove_origin_loop (class loop *loop)
++{
++  basic_block *body = get_loop_body_in_dom_order (loop);
++  unsigned n = loop->num_nodes;
++  for (unsigned i = 0; i < n; ++i)
++    delete_basic_block (body[i]);
++  free (body);
++  delete_loop (loop);
++}
++
++/* Make sure that the dominance information for the newly inserted cfg
++   nodes is up to date.  */
++static void
++update_loop_dominator (cdi_direction dir)
++{
++  gcc_assert (dom_info_available_p (dir));
++
++  basic_block bb;
++  FOR_EACH_BB_FN (bb, cfun)
++    {
++      basic_block imm_bb = get_immediate_dominator (dir, bb);
++      if (!imm_bb || bb == origin_loop.exit_bb)
++	{
++	  set_immediate_dominator (dir, bb,
++				   recompute_dominator (dir, bb));
++	  continue;
++	}
++    }
++}
++
++/* Perform the conversion of origin_loop to the new loop.
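++   This creates the new loop nest, deletes the original loop, repairs the
++   dominator tree and finally rewrites the IL into valid SSA form.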
++   */
++static void
++convert_to_new_loop (class loop *loop)
++{
++  create_new_loops (origin_loop.entry_edge);
++  remove_origin_loop (loop);
++  update_loop_dominator (CDI_DOMINATORS);
++  update_ssa (TODO_update_ssa);
++}
++
++/* The main entry of the loop crc optimization pass.  */
++static unsigned int
++tree_ssa_loop_crc ()
++{
++  if (!TARGET_CRC32)
++    {
++      warning (OPT____, "The loop-crc optimization is not working. "
++	       "You should make sure that the specified architecture "
++	       "supports crc: -march=armv8.1-a");
++      return 0;
++    }
++  unsigned int todo = 0;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      flow_loops_dump (dump_file, NULL, 1);
++      fprintf (dump_file, "\nStarting the loop_crc pass\n");
++    }
++
++  enum li_flags LI = LI_FROM_INNERMOST;
++  for (auto loop : loops_list (cfun, LI))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "======================================\n");
++	  fprintf (dump_file, "Processing loop %d:\n", loop->num);
++	  fprintf (dump_file, "======================================\n");
++	  flow_loop_dump (loop, dump_file, NULL, 1);
++	  fprintf (dump_file, "\n\n");
++	}
++
++      if (match_crc_loop (loop))
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "The form of loop %d was successfully "
++		     "matched, and the loop can be optimized.\n",
++		     loop->num);
++
++	  convert_to_new_loop (loop);
++	  todo |= (TODO_update_ssa);
++	}
++    }
++  return todo;
++}
++
++/* Loop crc.  */
++
++namespace {
++const pass_data pass_data_tree_loop_crc =
++{
++  GIMPLE_PASS,
++  "loop_crc",
++  OPTGROUP_LOOP,
++  TV_TREE_LOOP_CRC,
++  (PROP_cfg | PROP_ssa),
++  0,
++  0,
++  0,
++  (TODO_update_ssa | TODO_verify_all)
++};
++
++class pass_loop_crc : public gimple_opt_pass
++{
++public:
++  pass_loop_crc (gcc::context *ctxt)
++    : gimple_opt_pass (pass_data_tree_loop_crc, ctxt)
++  {}
++
++  /* Opt_pass methods: */
++  virtual bool gate (function *);
++  virtual unsigned int execute (function *);
++}; // class pass_loop_crc
++
++bool
++pass_loop_crc::gate (function *)
++{
++  return (flag_loop_crc > 0 && optimize >= 3);
++}
++
++unsigned int
++pass_loop_crc::execute (function *fun)
++{
++  if (number_of_loops (fun) <= 1)
++    return 0;
++
++  /* Only the LP64 data model is supported.  */
++  if (TYPE_PRECISION (long_integer_type_node) != 64
++      || POINTER_SIZE != 64 || TYPE_PRECISION (integer_type_node) != 32)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "The current data model is not supported; "
++		 "only the LP64 data model is supported.\n");
++      return 0;
++    }
++
++  return tree_ssa_loop_crc ();
++}
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_loop_crc (gcc::context *ctxt)
++{
++  return new pass_loop_crc (ctxt);
++}
+-- 
+2.33.0
+
diff --git a/0103-SME-Remove-hip09-and-hip11-in-aarch64-cores.def-to-b.patch b/0103-SME-Remove-hip09-and-hip11-in-aarch64-cores.def-to-b.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5589f82dbb1c1b19cf8b1a12816e2f7e0e2b5cf3
--- /dev/null
+++ b/0103-SME-Remove-hip09-and-hip11-in-aarch64-cores.def-to-b.patch
@@ -0,0 +1,34 @@
+From 72c48ade495ef99ef032a6c44365eb102b74888e Mon Sep 17 00:00:00 2001
+From: xiezhiheng
+Date: Fri, 23 Aug 2024 15:14:04 +0800
+Subject: [PATCH 004/157] [SME] Remove hip09 and hip11 in aarch64-cores.def to
+ backport SME
+
+Will apply it in the end.
+--- + gcc/config/aarch64/aarch64-cores.def | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def +index 601b72abb..70b11eb80 100644 +--- a/gcc/config/aarch64/aarch64-cores.def ++++ b/gcc/config/aarch64/aarch64-cores.def +@@ -130,7 +130,6 @@ AARCH64_CORE("a64fx", a64fx, a64fx, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F + + /* HiSilicon ('H') cores. */ + AARCH64_CORE("tsv110", tsv110, tsv110, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110, 0x48, 0xd01, -1) +-AARCH64_CORE("hip09", hip09, hip09, 8_5A, AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_SVE | AARCH64_FL_I8MM | AARCH64_FL_F32MM | AARCH64_FL_F64MM | AARCH64_FL_PROFILE | AARCH64_FL_PREDRES, hip09, 0x48, 0xd02, 0x0) + + /* ARMv8.3-A Architecture Processors. */ + +@@ -173,7 +172,6 @@ AARCH64_CORE("cortex-a710", cortexa710, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | + AARCH64_CORE("cortex-x2", cortexx2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_MEMTAG | AARCH64_FL_I8MM | AARCH64_FL_BF16, neoversen2, 0x41, 0xd48, -1) + + AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversen2, 0x41, 0xd49, -1) +-AARCH64_CORE("hip11", hip11, hip11, 8_5A, AARCH64_FL_FOR_ARCH8_5| AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_F16, hip11, 0x48, 0xd22, -1) + + AARCH64_CORE("demeter", demeter, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversev2, 0x41, 0xd4f, -1) + AARCH64_CORE("neoverse-v2", neoversev2, cortexa57, 9A, AARCH64_FL_FOR_ARCH9 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG | AARCH64_FL_PROFILE, neoversev2, 0x41, 0xd4f, -1) +-- +2.33.0 + diff --git a/0097-Backport-SME-AArch64-Cleanup-CPU-option-processing-c.patch b/0104-Backport-SME-AArch64-Cleanup-CPU-option-processing-c.patch similarity index 100% rename from 0097-Backport-SME-AArch64-Cleanup-CPU-option-processing-c.patch rename to 0104-Backport-SME-AArch64-Cleanup-CPU-option-processing-c.patch diff --git a/0098-Backport-SME-AArch64-Cleanup-option-processing-code.patch b/0105-Backport-SME-AArch64-Cleanup-option-processing-code.patch similarity index 100% rename from 0098-Backport-SME-AArch64-Cleanup-option-processing-code.patch rename to 0105-Backport-SME-AArch64-Cleanup-option-processing-code.patch diff --git a/0099-Backport-SME-aarch64-Add-march-support-for-Armv9.1-A.patch b/0106-Backport-SME-aarch64-Add-march-support-for-Armv9.1-A.patch similarity index 100% rename from 0099-Backport-SME-aarch64-Add-march-support-for-Armv9.1-A.patch rename to 0106-Backport-SME-aarch64-Add-march-support-for-Armv9.1-A.patch diff --git a/0100-Backport-SME-Revert-aarch64-Define-__ARM_FEATURE_RCP.patch b/0107-Backport-SME-Revert-aarch64-Define-__ARM_FEATURE_RCP.patch similarity index 100% rename from 0100-Backport-SME-Revert-aarch64-Define-__ARM_FEATURE_RCP.patch rename to 0107-Backport-SME-Revert-aarch64-Define-__ARM_FEATURE_RCP.patch diff --git a/0101-Backport-SME-Revert-Ampere-1-and-Ampere-1A-core-defi.patch b/0108-Backport-SME-Revert-Ampere-1-and-Ampere-1A-core-defi.patch similarity index 100% rename from 0101-Backport-SME-Revert-Ampere-1-and-Ampere-1A-core-defi.patch rename to 0108-Backport-SME-Revert-Ampere-1-and-Ampere-1A-core-defi.patch diff 
--git a/0102-Backport-SME-aarch64-Rename-AARCH64_ISA-architecture.patch b/0109-Backport-SME-aarch64-Rename-AARCH64_ISA-architecture.patch similarity index 100% rename from 0102-Backport-SME-aarch64-Rename-AARCH64_ISA-architecture.patch rename to 0109-Backport-SME-aarch64-Rename-AARCH64_ISA-architecture.patch diff --git a/0103-Backport-SME-aarch64-Rename-AARCH64_FL-architecture-.patch b/0110-Backport-SME-aarch64-Rename-AARCH64_FL-architecture-.patch similarity index 100% rename from 0103-Backport-SME-aarch64-Rename-AARCH64_FL-architecture-.patch rename to 0110-Backport-SME-aarch64-Rename-AARCH64_FL-architecture-.patch diff --git a/0104-Backport-SME-aarch64-Rename-AARCH64_FL_FOR_ARCH-macr.patch b/0111-Backport-SME-aarch64-Rename-AARCH64_FL_FOR_ARCH-macr.patch similarity index 100% rename from 0104-Backport-SME-aarch64-Rename-AARCH64_FL_FOR_ARCH-macr.patch rename to 0111-Backport-SME-aarch64-Rename-AARCH64_FL_FOR_ARCH-macr.patch diff --git a/0105-Backport-SME-aarch64-Add-V-to-aarch64-arches.def-nam.patch b/0112-Backport-SME-aarch64-Add-V-to-aarch64-arches.def-nam.patch similarity index 100% rename from 0105-Backport-SME-aarch64-Add-V-to-aarch64-arches.def-nam.patch rename to 0112-Backport-SME-aarch64-Add-V-to-aarch64-arches.def-nam.patch diff --git a/0106-Backport-SME-aarch64-Small-config.gcc-cleanups.patch b/0113-Backport-SME-aarch64-Small-config.gcc-cleanups.patch similarity index 100% rename from 0106-Backport-SME-aarch64-Small-config.gcc-cleanups.patch rename to 0113-Backport-SME-aarch64-Small-config.gcc-cleanups.patch diff --git a/0107-Backport-SME-aarch64-Avoid-redundancy-in-aarch64-cor.patch b/0114-Backport-SME-aarch64-Avoid-redundancy-in-aarch64-cor.patch similarity index 100% rename from 0107-Backport-SME-aarch64-Avoid-redundancy-in-aarch64-cor.patch rename to 0114-Backport-SME-aarch64-Avoid-redundancy-in-aarch64-cor.patch diff --git a/0108-Backport-SME-aarch64-Remove-AARCH64_FL_RCPC8_4-PR107.patch b/0115-Backport-SME-aarch64-Remove-AARCH64_FL_RCPC8_4-PR107.patch similarity index 100% rename from 0108-Backport-SME-aarch64-Remove-AARCH64_FL_RCPC8_4-PR107.patch rename to 0115-Backport-SME-aarch64-Remove-AARCH64_FL_RCPC8_4-PR107.patch diff --git a/0109-Backport-SME-aarch64-Fix-transitive-closure-of-featu.patch b/0116-Backport-SME-aarch64-Fix-transitive-closure-of-featu.patch similarity index 100% rename from 0109-Backport-SME-aarch64-Fix-transitive-closure-of-featu.patch rename to 0116-Backport-SME-aarch64-Fix-transitive-closure-of-featu.patch diff --git a/0110-Backport-SME-aarch64-Reorder-an-entry-in-aarch64-opt.patch b/0117-Backport-SME-aarch64-Reorder-an-entry-in-aarch64-opt.patch similarity index 100% rename from 0110-Backport-SME-aarch64-Reorder-an-entry-in-aarch64-opt.patch rename to 0117-Backport-SME-aarch64-Reorder-an-entry-in-aarch64-opt.patch diff --git a/0111-Backport-SME-aarch64-Simplify-feature-definitions.patch b/0118-Backport-SME-aarch64-Simplify-feature-definitions.patch similarity index 100% rename from 0111-Backport-SME-aarch64-Simplify-feature-definitions.patch rename to 0118-Backport-SME-aarch64-Simplify-feature-definitions.patch diff --git a/0112-Backport-SME-aarch64-Simplify-generation-of-.arch-st.patch b/0119-Backport-SME-aarch64-Simplify-generation-of-.arch-st.patch similarity index 100% rename from 0112-Backport-SME-aarch64-Simplify-generation-of-.arch-st.patch rename to 0119-Backport-SME-aarch64-Simplify-generation-of-.arch-st.patch diff --git a/0113-Backport-SME-aarch64-Avoid-std-string-in-static-data.patch 
b/0120-Backport-SME-aarch64-Avoid-std-string-in-static-data.patch similarity index 100% rename from 0113-Backport-SME-aarch64-Avoid-std-string-in-static-data.patch rename to 0120-Backport-SME-aarch64-Avoid-std-string-in-static-data.patch diff --git a/0114-Backport-SME-aarch64-Tweak-constness-of-option-relat.patch b/0121-Backport-SME-aarch64-Tweak-constness-of-option-relat.patch similarity index 100% rename from 0114-Backport-SME-aarch64-Tweak-constness-of-option-relat.patch rename to 0121-Backport-SME-aarch64-Tweak-constness-of-option-relat.patch diff --git a/0115-Backport-SME-aarch64-Make-more-use-of-aarch64_featur.patch b/0122-Backport-SME-aarch64-Make-more-use-of-aarch64_featur.patch similarity index 100% rename from 0115-Backport-SME-aarch64-Make-more-use-of-aarch64_featur.patch rename to 0122-Backport-SME-aarch64-Make-more-use-of-aarch64_featur.patch diff --git a/0116-Backport-SME-aarch64-Tweak-contents-of-flags_on-off-.patch b/0123-Backport-SME-aarch64-Tweak-contents-of-flags_on-off-.patch similarity index 100% rename from 0116-Backport-SME-aarch64-Tweak-contents-of-flags_on-off-.patch rename to 0123-Backport-SME-aarch64-Tweak-contents-of-flags_on-off-.patch diff --git a/0117-Backport-SME-aarch64-Tweak-handling-of-mgeneral-regs.patch b/0124-Backport-SME-aarch64-Tweak-handling-of-mgeneral-regs.patch similarity index 100% rename from 0117-Backport-SME-aarch64-Tweak-handling-of-mgeneral-regs.patch rename to 0124-Backport-SME-aarch64-Tweak-handling-of-mgeneral-regs.patch diff --git a/0118-Backport-SME-aarch64-Remove-redundant-TARGET_-checks.patch b/0125-Backport-SME-aarch64-Remove-redundant-TARGET_-checks.patch similarity index 100% rename from 0118-Backport-SME-aarch64-Remove-redundant-TARGET_-checks.patch rename to 0125-Backport-SME-aarch64-Remove-redundant-TARGET_-checks.patch diff --git a/0119-Backport-SME-aarch64-Define-__ARM_FEATURE_RCPC.patch b/0126-Backport-SME-aarch64-Define-__ARM_FEATURE_RCPC.patch similarity index 100% rename from 0119-Backport-SME-aarch64-Define-__ARM_FEATURE_RCPC.patch rename to 0126-Backport-SME-aarch64-Define-__ARM_FEATURE_RCPC.patch diff --git a/0120-Backport-SME-Add-Ampere-1-and-Ampere-1A-core-definit.patch b/0127-Backport-SME-Add-Ampere-1-and-Ampere-1A-core-definit.patch similarity index 100% rename from 0120-Backport-SME-Add-Ampere-1-and-Ampere-1A-core-definit.patch rename to 0127-Backport-SME-Add-Ampere-1-and-Ampere-1A-core-definit.patch diff --git a/0121-Backport-SME-aarch64-Fix-nosimd-handling-of-FPR-move.patch b/0128-Backport-SME-aarch64-Fix-nosimd-handling-of-FPR-move.patch similarity index 100% rename from 0121-Backport-SME-aarch64-Fix-nosimd-handling-of-FPR-move.patch rename to 0128-Backport-SME-aarch64-Fix-nosimd-handling-of-FPR-move.patch diff --git a/0122-Backport-SME-aarch64-Commonise-some-folding-code.patch b/0129-Backport-SME-aarch64-Commonise-some-folding-code.patch similarity index 100% rename from 0122-Backport-SME-aarch64-Commonise-some-folding-code.patch rename to 0129-Backport-SME-aarch64-Commonise-some-folding-code.patch diff --git a/0123-Backport-SME-aarch64-Add-a-Z-operand-modifier-for-SV.patch b/0130-Backport-SME-aarch64-Add-a-Z-operand-modifier-for-SV.patch similarity index 100% rename from 0123-Backport-SME-aarch64-Add-a-Z-operand-modifier-for-SV.patch rename to 0130-Backport-SME-aarch64-Add-a-Z-operand-modifier-for-SV.patch diff --git a/0124-Backport-SME-mode-switching-Remove-unused-bbnum-fiel.patch b/0131-Backport-SME-mode-switching-Remove-unused-bbnum-fiel.patch similarity index 100% rename from 
0124-Backport-SME-mode-switching-Remove-unused-bbnum-fiel.patch rename to 0131-Backport-SME-mode-switching-Remove-unused-bbnum-fiel.patch diff --git a/0125-Backport-SME-mode-switching-Tweak-the-macro-hook-doc.patch b/0132-Backport-SME-mode-switching-Tweak-the-macro-hook-doc.patch similarity index 100% rename from 0125-Backport-SME-mode-switching-Tweak-the-macro-hook-doc.patch rename to 0132-Backport-SME-mode-switching-Tweak-the-macro-hook-doc.patch diff --git a/0126-Backport-SME-mode-switching-Add-note-problem.patch b/0133-Backport-SME-mode-switching-Add-note-problem.patch similarity index 100% rename from 0126-Backport-SME-mode-switching-Add-note-problem.patch rename to 0133-Backport-SME-mode-switching-Add-note-problem.patch diff --git a/0127-Backport-SME-mode-switching-Avoid-quadractic-list-op.patch b/0134-Backport-SME-mode-switching-Avoid-quadractic-list-op.patch similarity index 100% rename from 0127-Backport-SME-mode-switching-Avoid-quadractic-list-op.patch rename to 0134-Backport-SME-mode-switching-Avoid-quadractic-list-op.patch diff --git a/0128-Backport-SME-mode-switching-Fix-the-mode-passed-to-t.patch b/0135-Backport-SME-mode-switching-Fix-the-mode-passed-to-t.patch similarity index 100% rename from 0128-Backport-SME-mode-switching-Fix-the-mode-passed-to-t.patch rename to 0135-Backport-SME-mode-switching-Fix-the-mode-passed-to-t.patch diff --git a/0129-Backport-SME-mode-switching-Simplify-recording-of-tr.patch b/0136-Backport-SME-mode-switching-Simplify-recording-of-tr.patch similarity index 100% rename from 0129-Backport-SME-mode-switching-Simplify-recording-of-tr.patch rename to 0136-Backport-SME-mode-switching-Simplify-recording-of-tr.patch diff --git a/0130-Backport-SME-mode-switching-Tweak-entry-exit-handlin.patch b/0137-Backport-SME-mode-switching-Tweak-entry-exit-handlin.patch similarity index 100% rename from 0130-Backport-SME-mode-switching-Tweak-entry-exit-handlin.patch rename to 0137-Backport-SME-mode-switching-Tweak-entry-exit-handlin.patch diff --git a/0131-Backport-SME-mode-switching-Allow-targets-to-set-the.patch b/0138-Backport-SME-mode-switching-Allow-targets-to-set-the.patch similarity index 100% rename from 0131-Backport-SME-mode-switching-Allow-targets-to-set-the.patch rename to 0138-Backport-SME-mode-switching-Allow-targets-to-set-the.patch diff --git a/0132-Backport-SME-mode-switching-Pass-set-of-live-registe.patch b/0139-Backport-SME-mode-switching-Pass-set-of-live-registe.patch similarity index 100% rename from 0132-Backport-SME-mode-switching-Pass-set-of-live-registe.patch rename to 0139-Backport-SME-mode-switching-Pass-set-of-live-registe.patch diff --git a/0133-Backport-SME-mode-switching-Pass-the-set-of-live-reg.patch b/0140-Backport-SME-mode-switching-Pass-the-set-of-live-reg.patch similarity index 100% rename from 0133-Backport-SME-mode-switching-Pass-the-set-of-live-reg.patch rename to 0140-Backport-SME-mode-switching-Pass-the-set-of-live-reg.patch diff --git a/0134-Backport-SME-mode-switching-Use-1-based-edge-aux-fie.patch b/0141-Backport-SME-mode-switching-Use-1-based-edge-aux-fie.patch similarity index 100% rename from 0134-Backport-SME-mode-switching-Use-1-based-edge-aux-fie.patch rename to 0141-Backport-SME-mode-switching-Use-1-based-edge-aux-fie.patch diff --git a/0135-Backport-SME-mode-switching-Add-a-target-configurabl.patch b/0142-Backport-SME-mode-switching-Add-a-target-configurabl.patch similarity index 100% rename from 0135-Backport-SME-mode-switching-Add-a-target-configurabl.patch rename to 
0142-Backport-SME-mode-switching-Add-a-target-configurabl.patch diff --git a/0136-Backport-SME-mode-switching-Add-a-backprop-hook.patch b/0143-Backport-SME-mode-switching-Add-a-backprop-hook.patch similarity index 100% rename from 0136-Backport-SME-mode-switching-Add-a-backprop-hook.patch rename to 0143-Backport-SME-mode-switching-Add-a-backprop-hook.patch diff --git a/0137-Backport-SME-aarch64-Add-a-result_mode-helper-functi.patch b/0144-Backport-SME-aarch64-Add-a-result_mode-helper-functi.patch similarity index 100% rename from 0137-Backport-SME-aarch64-Add-a-result_mode-helper-functi.patch rename to 0144-Backport-SME-aarch64-Add-a-result_mode-helper-functi.patch diff --git a/0138-Backport-SME-rtl-Try-to-remove-EH-edges-after-pro-ep.patch b/0145-Backport-SME-rtl-Try-to-remove-EH-edges-after-pro-ep.patch similarity index 100% rename from 0138-Backport-SME-rtl-Try-to-remove-EH-edges-after-pro-ep.patch rename to 0145-Backport-SME-rtl-Try-to-remove-EH-edges-after-pro-ep.patch diff --git a/0139-Backport-SME-Fix-PR-middle-end-107705-ICE-after-recl.patch b/0146-Backport-SME-Fix-PR-middle-end-107705-ICE-after-recl.patch similarity index 100% rename from 0139-Backport-SME-Fix-PR-middle-end-107705-ICE-after-recl.patch rename to 0146-Backport-SME-Fix-PR-middle-end-107705-ICE-after-recl.patch diff --git a/0140-Backport-SME-function-Change-return-type-of-predicat.patch b/0147-Backport-SME-function-Change-return-type-of-predicat.patch similarity index 100% rename from 0140-Backport-SME-function-Change-return-type-of-predicat.patch rename to 0147-Backport-SME-function-Change-return-type-of-predicat.patch diff --git a/0141-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch b/0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch similarity index 100% rename from 0141-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch rename to 0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch diff --git a/0142-Backport-SME-Add-a-target-hook-for-sibcall-epilogues.patch b/0149-Backport-SME-Add-a-target-hook-for-sibcall-epilogues.patch similarity index 100% rename from 0142-Backport-SME-Add-a-target-hook-for-sibcall-epilogues.patch rename to 0149-Backport-SME-Add-a-target-hook-for-sibcall-epilogues.patch diff --git a/0143-Backport-SME-Add-a-new-target-hook-TARGET_START_CALL.patch b/0150-Backport-SME-Add-a-new-target-hook-TARGET_START_CALL.patch similarity index 100% rename from 0143-Backport-SME-Add-a-new-target-hook-TARGET_START_CALL.patch rename to 0150-Backport-SME-Add-a-new-target-hook-TARGET_START_CALL.patch diff --git a/0144-Backport-SME-Allow-targets-to-add-USEs-to-asms.patch b/0151-Backport-SME-Allow-targets-to-add-USEs-to-asms.patch similarity index 100% rename from 0144-Backport-SME-Allow-targets-to-add-USEs-to-asms.patch rename to 0151-Backport-SME-Allow-targets-to-add-USEs-to-asms.patch diff --git a/0145-Backport-SME-New-compact-syntax-for-insn-and-insn_sp.patch b/0152-Backport-SME-New-compact-syntax-for-insn-and-insn_sp.patch similarity index 100% rename from 0145-Backport-SME-New-compact-syntax-for-insn-and-insn_sp.patch rename to 0152-Backport-SME-New-compact-syntax-for-insn-and-insn_sp.patch diff --git a/0146-Backport-SME-recog-Improve-parser-for-pattern-new-co.patch b/0153-Backport-SME-recog-Improve-parser-for-pattern-new-co.patch similarity index 100% rename from 0146-Backport-SME-recog-Improve-parser-for-pattern-new-co.patch rename to 0153-Backport-SME-recog-Improve-parser-for-pattern-new-co.patch diff --git a/0147-Backport-SME-recog-Support-space-in-cons.patch 
b/0154-Backport-SME-recog-Support-space-in-cons.patch similarity index 100% rename from 0147-Backport-SME-recog-Support-space-in-cons.patch rename to 0154-Backport-SME-recog-Support-space-in-cons.patch diff --git a/0148-Backport-SME-aarch64-Generalise-require_immediate_la.patch b/0155-Backport-SME-aarch64-Generalise-require_immediate_la.patch similarity index 100% rename from 0148-Backport-SME-aarch64-Generalise-require_immediate_la.patch rename to 0155-Backport-SME-aarch64-Generalise-require_immediate_la.patch diff --git a/0149-Backport-SME-aarch64-Add-backend-support-for-DFP.patch b/0156-Backport-SME-aarch64-Add-backend-support-for-DFP.patch similarity index 100% rename from 0149-Backport-SME-aarch64-Add-backend-support-for-DFP.patch rename to 0156-Backport-SME-aarch64-Add-backend-support-for-DFP.patch diff --git a/0150-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch b/0157-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch similarity index 100% rename from 0150-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch rename to 0157-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch diff --git a/0151-Backport-SME-aarch64-Simplify-output-template-emissi.patch b/0158-Backport-SME-aarch64-Simplify-output-template-emissi.patch similarity index 100% rename from 0151-Backport-SME-aarch64-Simplify-output-template-emissi.patch rename to 0158-Backport-SME-aarch64-Simplify-output-template-emissi.patch diff --git a/0152-Backport-SME-Improve-immediate-expansion-PR106583.patch b/0159-Backport-SME-Improve-immediate-expansion-PR106583.patch similarity index 100% rename from 0152-Backport-SME-Improve-immediate-expansion-PR106583.patch rename to 0159-Backport-SME-Improve-immediate-expansion-PR106583.patch diff --git a/0153-Backport-SME-AArch64-Cleanup-move-immediate-code.patch b/0160-Backport-SME-AArch64-Cleanup-move-immediate-code.patch similarity index 100% rename from 0153-Backport-SME-AArch64-Cleanup-move-immediate-code.patch rename to 0160-Backport-SME-AArch64-Cleanup-move-immediate-code.patch diff --git a/0154-Backport-SME-AArch64-convert-some-patterns-to-compac.patch b/0161-Backport-SME-AArch64-convert-some-patterns-to-compac.patch similarity index 100% rename from 0154-Backport-SME-AArch64-convert-some-patterns-to-compac.patch rename to 0161-Backport-SME-AArch64-convert-some-patterns-to-compac.patch diff --git a/0155-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch b/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch similarity index 100% rename from 0155-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch rename to 0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch diff --git a/0156-Backport-SME-aarch64-Make-AARCH64_FL_SVE-requirement.patch b/0163-Backport-SME-aarch64-Make-AARCH64_FL_SVE-requirement.patch similarity index 100% rename from 0156-Backport-SME-aarch64-Make-AARCH64_FL_SVE-requirement.patch rename to 0163-Backport-SME-aarch64-Make-AARCH64_FL_SVE-requirement.patch diff --git a/0157-Backport-SME-aarch64-Add-group-suffixes-to-SVE-intri.patch b/0164-Backport-SME-aarch64-Add-group-suffixes-to-SVE-intri.patch similarity index 100% rename from 0157-Backport-SME-aarch64-Add-group-suffixes-to-SVE-intri.patch rename to 0164-Backport-SME-aarch64-Add-group-suffixes-to-SVE-intri.patch diff --git a/0158-Backport-SME-aarch64-Add-sve_type-to-SVE-builtins-co.patch b/0165-Backport-SME-aarch64-Add-sve_type-to-SVE-builtins-co.patch similarity index 100% rename from 0158-Backport-SME-aarch64-Add-sve_type-to-SVE-builtins-co.patch rename to 
0165-Backport-SME-aarch64-Add-sve_type-to-SVE-builtins-co.patch diff --git a/0159-Backport-SME-aarch64-Generalise-some-SVE-ACLE-error-.patch b/0166-Backport-SME-aarch64-Generalise-some-SVE-ACLE-error-.patch similarity index 100% rename from 0159-Backport-SME-aarch64-Generalise-some-SVE-ACLE-error-.patch rename to 0166-Backport-SME-aarch64-Generalise-some-SVE-ACLE-error-.patch diff --git a/0160-Backport-SME-aarch64-Replace-vague-previous-argument.patch b/0167-Backport-SME-aarch64-Replace-vague-previous-argument.patch similarity index 100% rename from 0160-Backport-SME-aarch64-Replace-vague-previous-argument.patch rename to 0167-Backport-SME-aarch64-Replace-vague-previous-argument.patch diff --git a/0161-Backport-SME-aarch64-Make-more-use-of-sve_type-in-AC.patch b/0168-Backport-SME-aarch64-Make-more-use-of-sve_type-in-AC.patch similarity index 100% rename from 0161-Backport-SME-aarch64-Make-more-use-of-sve_type-in-AC.patch rename to 0168-Backport-SME-aarch64-Make-more-use-of-sve_type-in-AC.patch diff --git a/0162-Backport-SME-aarch64-Tweak-error-message-for-tuple-v.patch b/0169-Backport-SME-aarch64-Tweak-error-message-for-tuple-v.patch similarity index 100% rename from 0162-Backport-SME-aarch64-Tweak-error-message-for-tuple-v.patch rename to 0169-Backport-SME-aarch64-Tweak-error-message-for-tuple-v.patch diff --git a/0163-Backport-SME-aarch64-Add-tuple-forms-of-svreinterpre.patch b/0170-Backport-SME-aarch64-Add-tuple-forms-of-svreinterpre.patch similarity index 100% rename from 0163-Backport-SME-aarch64-Add-tuple-forms-of-svreinterpre.patch rename to 0170-Backport-SME-aarch64-Add-tuple-forms-of-svreinterpre.patch diff --git a/0164-Backport-SME-attribs-Use-existing-traits-for-excl_ha.patch b/0171-Backport-SME-attribs-Use-existing-traits-for-excl_ha.patch similarity index 100% rename from 0164-Backport-SME-attribs-Use-existing-traits-for-excl_ha.patch rename to 0171-Backport-SME-attribs-Use-existing-traits-for-excl_ha.patch diff --git a/0165-Backport-SME-Allow-target-attributes-in-non-gnu-name.patch b/0172-Backport-SME-Allow-target-attributes-in-non-gnu-name.patch similarity index 100% rename from 0165-Backport-SME-Allow-target-attributes-in-non-gnu-name.patch rename to 0172-Backport-SME-Allow-target-attributes-in-non-gnu-name.patch diff --git a/0166-Backport-SME-aarch64-Fix-plugin-header-install.patch b/0173-Backport-SME-aarch64-Fix-plugin-header-install.patch similarity index 100% rename from 0166-Backport-SME-aarch64-Fix-plugin-header-install.patch rename to 0173-Backport-SME-aarch64-Fix-plugin-header-install.patch diff --git a/0167-Backport-SME-aarch64-Add-arm_streaming-_compatible-a.patch b/0174-Backport-SME-aarch64-Add-arm_streaming-_compatible-a.patch similarity index 100% rename from 0167-Backport-SME-aarch64-Add-arm_streaming-_compatible-a.patch rename to 0174-Backport-SME-aarch64-Add-arm_streaming-_compatible-a.patch diff --git a/0168-Backport-SME-aarch64-Add-sme.patch b/0175-Backport-SME-aarch64-Add-sme.patch similarity index 100% rename from 0168-Backport-SME-aarch64-Add-sme.patch rename to 0175-Backport-SME-aarch64-Add-sme.patch diff --git a/0169-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch b/0176-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch similarity index 100% rename from 0169-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch rename to 0176-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch diff --git a/0170-Backport-SME-AArch64-Rewrite-simd-move-immediate-pat.patch b/0177-Backport-SME-AArch64-Rewrite-simd-move-immediate-pat.patch 
similarity index 100% rename from 0170-Backport-SME-AArch64-Rewrite-simd-move-immediate-pat.patch rename to 0177-Backport-SME-AArch64-Rewrite-simd-move-immediate-pat.patch diff --git a/0171-Backport-SME-AArch64-remove-test-comment-from-mov-mo.patch b/0178-Backport-SME-AArch64-remove-test-comment-from-mov-mo.patch similarity index 100% rename from 0171-Backport-SME-AArch64-remove-test-comment-from-mov-mo.patch rename to 0178-Backport-SME-AArch64-remove-test-comment-from-mov-mo.patch diff --git a/0172-Backport-SME-aarch64-Distinguish-streaming-compatibl.patch b/0179-Backport-SME-aarch64-Distinguish-streaming-compatibl.patch similarity index 100% rename from 0172-Backport-SME-aarch64-Distinguish-streaming-compatibl.patch rename to 0179-Backport-SME-aarch64-Distinguish-streaming-compatibl.patch diff --git a/0173-Backport-SME-aarch64-Mark-relevant-SVE-instructions-.patch b/0180-Backport-SME-aarch64-Mark-relevant-SVE-instructions-.patch similarity index 100% rename from 0173-Backport-SME-aarch64-Mark-relevant-SVE-instructions-.patch rename to 0180-Backport-SME-aarch64-Mark-relevant-SVE-instructions-.patch diff --git a/0174-Backport-SME-AArch64-Support-new-tbranch-optab.patch b/0181-Backport-SME-AArch64-Support-new-tbranch-optab.patch similarity index 100% rename from 0174-Backport-SME-AArch64-Support-new-tbranch-optab.patch rename to 0181-Backport-SME-AArch64-Support-new-tbranch-optab.patch diff --git a/0175-Backport-SME-aarch64-Use-local-frame-vars-in-shrink-.patch b/0182-Backport-SME-aarch64-Use-local-frame-vars-in-shrink-.patch similarity index 100% rename from 0175-Backport-SME-aarch64-Use-local-frame-vars-in-shrink-.patch rename to 0182-Backport-SME-aarch64-Use-local-frame-vars-in-shrink-.patch diff --git a/0176-Backport-SME-aarch64-Avoid-a-use-of-callee_offset.patch b/0183-Backport-SME-aarch64-Avoid-a-use-of-callee_offset.patch similarity index 100% rename from 0176-Backport-SME-aarch64-Avoid-a-use-of-callee_offset.patch rename to 0183-Backport-SME-aarch64-Avoid-a-use-of-callee_offset.patch diff --git a/0177-Backport-SME-aarch64-Explicitly-handle-frames-with-n.patch b/0184-Backport-SME-aarch64-Explicitly-handle-frames-with-n.patch similarity index 100% rename from 0177-Backport-SME-aarch64-Explicitly-handle-frames-with-n.patch rename to 0184-Backport-SME-aarch64-Explicitly-handle-frames-with-n.patch diff --git a/0178-Backport-SME-aarch64-Add-bytes_below_saved_regs-to-f.patch b/0185-Backport-SME-aarch64-Add-bytes_below_saved_regs-to-f.patch similarity index 100% rename from 0178-Backport-SME-aarch64-Add-bytes_below_saved_regs-to-f.patch rename to 0185-Backport-SME-aarch64-Add-bytes_below_saved_regs-to-f.patch diff --git a/0179-Backport-SME-aarch64-Add-bytes_below_hard_fp-to-fram.patch b/0186-Backport-SME-aarch64-Add-bytes_below_hard_fp-to-fram.patch similarity index 100% rename from 0179-Backport-SME-aarch64-Add-bytes_below_hard_fp-to-fram.patch rename to 0186-Backport-SME-aarch64-Add-bytes_below_hard_fp-to-fram.patch diff --git a/0180-Backport-SME-aarch64-Robustify-stack-tie-handling.patch b/0187-Backport-SME-aarch64-Robustify-stack-tie-handling.patch similarity index 100% rename from 0180-Backport-SME-aarch64-Robustify-stack-tie-handling.patch rename to 0187-Backport-SME-aarch64-Robustify-stack-tie-handling.patch diff --git a/0181-Backport-SME-aarch64-Tweak-aarch64_save-restore_call.patch b/0188-Backport-SME-aarch64-Tweak-aarch64_save-restore_call.patch similarity index 100% rename from 0181-Backport-SME-aarch64-Tweak-aarch64_save-restore_call.patch rename to 
0188-Backport-SME-aarch64-Tweak-aarch64_save-restore_call.patch diff --git a/0182-Backport-SME-aarch64-Only-calculate-chain_offset-if-.patch b/0189-Backport-SME-aarch64-Only-calculate-chain_offset-if-.patch similarity index 100% rename from 0182-Backport-SME-aarch64-Only-calculate-chain_offset-if-.patch rename to 0189-Backport-SME-aarch64-Only-calculate-chain_offset-if-.patch diff --git a/0183-Backport-SME-aarch64-Rename-locals_offset-to-bytes_a.patch b/0190-Backport-SME-aarch64-Rename-locals_offset-to-bytes_a.patch similarity index 100% rename from 0183-Backport-SME-aarch64-Rename-locals_offset-to-bytes_a.patch rename to 0190-Backport-SME-aarch64-Rename-locals_offset-to-bytes_a.patch diff --git a/0184-Backport-SME-aarch64-Rename-hard_fp_offset-to-bytes_.patch b/0191-Backport-SME-aarch64-Rename-hard_fp_offset-to-bytes_.patch similarity index 100% rename from 0184-Backport-SME-aarch64-Rename-hard_fp_offset-to-bytes_.patch rename to 0191-Backport-SME-aarch64-Rename-hard_fp_offset-to-bytes_.patch diff --git a/0185-Backport-SME-aarch64-Tweak-frame_size-comment.patch b/0192-Backport-SME-aarch64-Tweak-frame_size-comment.patch similarity index 100% rename from 0185-Backport-SME-aarch64-Tweak-frame_size-comment.patch rename to 0192-Backport-SME-aarch64-Tweak-frame_size-comment.patch diff --git a/0186-Backport-SME-aarch64-Measure-reg_offset-from-the-bot.patch b/0193-Backport-SME-aarch64-Measure-reg_offset-from-the-bot.patch similarity index 100% rename from 0186-Backport-SME-aarch64-Measure-reg_offset-from-the-bot.patch rename to 0193-Backport-SME-aarch64-Measure-reg_offset-from-the-bot.patch diff --git a/0187-Backport-SME-aarch64-Simplify-top-of-frame-allocatio.patch b/0194-Backport-SME-aarch64-Simplify-top-of-frame-allocatio.patch similarity index 100% rename from 0187-Backport-SME-aarch64-Simplify-top-of-frame-allocatio.patch rename to 0194-Backport-SME-aarch64-Simplify-top-of-frame-allocatio.patch diff --git a/0188-Backport-SME-aarch64-Minor-initial-adjustment-tweak.patch b/0195-Backport-SME-aarch64-Minor-initial-adjustment-tweak.patch similarity index 100% rename from 0188-Backport-SME-aarch64-Minor-initial-adjustment-tweak.patch rename to 0195-Backport-SME-aarch64-Minor-initial-adjustment-tweak.patch diff --git a/0189-Backport-SME-aarch64-Tweak-stack-clash-boundary-cond.patch b/0196-Backport-SME-aarch64-Tweak-stack-clash-boundary-cond.patch similarity index 100% rename from 0189-Backport-SME-aarch64-Tweak-stack-clash-boundary-cond.patch rename to 0196-Backport-SME-aarch64-Tweak-stack-clash-boundary-cond.patch diff --git a/0190-Backport-SME-aarch64-Put-LR-save-probe-in-first-16-b.patch b/0197-Backport-SME-aarch64-Put-LR-save-probe-in-first-16-b.patch similarity index 100% rename from 0190-Backport-SME-aarch64-Put-LR-save-probe-in-first-16-b.patch rename to 0197-Backport-SME-aarch64-Put-LR-save-probe-in-first-16-b.patch diff --git a/0191-Backport-SME-aarch64-Simplify-probe-of-final-frame-a.patch b/0198-Backport-SME-aarch64-Simplify-probe-of-final-frame-a.patch similarity index 100% rename from 0191-Backport-SME-aarch64-Simplify-probe-of-final-frame-a.patch rename to 0198-Backport-SME-aarch64-Simplify-probe-of-final-frame-a.patch diff --git a/0192-Backport-SME-aarch64-Explicitly-record-probe-registe.patch b/0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch similarity index 100% rename from 0192-Backport-SME-aarch64-Explicitly-record-probe-registe.patch rename to 0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch diff --git 
a/0193-Backport-SME-aarch64-Remove-below_hard_fp_saved_regs.patch b/0200-Backport-SME-aarch64-Remove-below_hard_fp_saved_regs.patch similarity index 100% rename from 0193-Backport-SME-aarch64-Remove-below_hard_fp_saved_regs.patch rename to 0200-Backport-SME-aarch64-Remove-below_hard_fp_saved_regs.patch diff --git a/0194-Backport-SME-aarch64-Make-stack-smash-canary-protect.patch b/0201-Backport-SME-aarch64-Make-stack-smash-canary-protect.patch similarity index 100% rename from 0194-Backport-SME-aarch64-Make-stack-smash-canary-protect.patch rename to 0201-Backport-SME-aarch64-Make-stack-smash-canary-protect.patch diff --git a/0195-Backport-SME-Handle-epilogues-that-contain-jumps.patch b/0202-Backport-SME-Handle-epilogues-that-contain-jumps.patch similarity index 100% rename from 0195-Backport-SME-Handle-epilogues-that-contain-jumps.patch rename to 0202-Backport-SME-Handle-epilogues-that-contain-jumps.patch diff --git a/0196-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch b/0203-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch similarity index 100% rename from 0196-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch rename to 0203-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch diff --git a/0197-Backport-SME-aarch64-Put-LR-save-slot-first-in-more-.patch b/0204-Backport-SME-aarch64-Put-LR-save-slot-first-in-more-.patch similarity index 100% rename from 0197-Backport-SME-aarch64-Put-LR-save-slot-first-in-more-.patch rename to 0204-Backport-SME-aarch64-Put-LR-save-slot-first-in-more-.patch diff --git a/0198-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch b/0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch similarity index 100% rename from 0198-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch rename to 0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch diff --git a/0199-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch b/0206-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch similarity index 100% rename from 0199-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch rename to 0206-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch diff --git a/0200-Backport-SME-aarch64-Add-a-register-class-for-w12-w1.patch b/0207-Backport-SME-aarch64-Add-a-register-class-for-w12-w1.patch similarity index 100% rename from 0200-Backport-SME-aarch64-Add-a-register-class-for-w12-w1.patch rename to 0207-Backport-SME-aarch64-Add-a-register-class-for-w12-w1.patch diff --git a/0201-Backport-SME-aarch64-Add-a-VNx1TI-mode.patch b/0208-Backport-SME-aarch64-Add-a-VNx1TI-mode.patch similarity index 100% rename from 0201-Backport-SME-aarch64-Add-a-VNx1TI-mode.patch rename to 0208-Backport-SME-aarch64-Add-a-VNx1TI-mode.patch diff --git a/0202-Backport-SME-aarch64-Generalise-unspec_based_functio.patch b/0209-Backport-SME-aarch64-Generalise-unspec_based_functio.patch similarity index 100% rename from 0202-Backport-SME-aarch64-Generalise-unspec_based_functio.patch rename to 0209-Backport-SME-aarch64-Generalise-unspec_based_functio.patch diff --git a/0203-Backport-SME-aarch64-Generalise-_m-rules-for-SVE-int.patch b/0210-Backport-SME-aarch64-Generalise-_m-rules-for-SVE-int.patch similarity index 100% rename from 0203-Backport-SME-aarch64-Generalise-_m-rules-for-SVE-int.patch rename to 0210-Backport-SME-aarch64-Generalise-_m-rules-for-SVE-int.patch diff --git a/0204-Backport-SME-aarch64-Add-support-for-arm_sme.h.patch b/0211-Backport-SME-aarch64-Add-support-for-arm_sme.h.patch similarity index 100% rename from 
0204-Backport-SME-aarch64-Add-support-for-arm_sme.h.patch rename to 0211-Backport-SME-aarch64-Add-support-for-arm_sme.h.patch diff --git a/0205-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch similarity index 100% rename from 0205-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch rename to 0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch diff --git a/0206-Backport-SME-aarch64-Handle-PSTATE.SM-across-abnorma.patch b/0213-Backport-SME-aarch64-Handle-PSTATE.SM-across-abnorma.patch similarity index 100% rename from 0206-Backport-SME-aarch64-Handle-PSTATE.SM-across-abnorma.patch rename to 0213-Backport-SME-aarch64-Handle-PSTATE.SM-across-abnorma.patch diff --git a/0207-Backport-SME-aarch64-Enforce-inlining-restrictions-f.patch b/0214-Backport-SME-aarch64-Enforce-inlining-restrictions-f.patch similarity index 100% rename from 0207-Backport-SME-aarch64-Enforce-inlining-restrictions-f.patch rename to 0214-Backport-SME-aarch64-Enforce-inlining-restrictions-f.patch diff --git a/0208-Backport-SME-aarch64-Update-sibcall-handling-for-SME.patch b/0215-Backport-SME-aarch64-Update-sibcall-handling-for-SME.patch similarity index 100% rename from 0208-Backport-SME-aarch64-Update-sibcall-handling-for-SME.patch rename to 0215-Backport-SME-aarch64-Update-sibcall-handling-for-SME.patch diff --git a/0209-Backport-SME-libgcc-aarch64-Configure-check-for-.var.patch b/0216-Backport-SME-libgcc-aarch64-Configure-check-for-.var.patch similarity index 100% rename from 0209-Backport-SME-libgcc-aarch64-Configure-check-for-.var.patch rename to 0216-Backport-SME-libgcc-aarch64-Configure-check-for-.var.patch diff --git a/0210-Backport-SME-libgcc-aarch64-Configure-check-for-__ge.patch b/0217-Backport-SME-libgcc-aarch64-Configure-check-for-__ge.patch similarity index 100% rename from 0210-Backport-SME-libgcc-aarch64-Configure-check-for-__ge.patch rename to 0217-Backport-SME-libgcc-aarch64-Configure-check-for-__ge.patch diff --git a/0211-Backport-SME-libgcc-aarch64-Add-SME-runtime-support.patch b/0218-Backport-SME-libgcc-aarch64-Add-SME-runtime-support.patch similarity index 100% rename from 0211-Backport-SME-libgcc-aarch64-Add-SME-runtime-support.patch rename to 0218-Backport-SME-libgcc-aarch64-Add-SME-runtime-support.patch diff --git a/0212-Backport-SME-libgcc-aarch64-Add-SME-unwinder-support.patch b/0219-Backport-SME-libgcc-aarch64-Add-SME-unwinder-support.patch similarity index 100% rename from 0212-Backport-SME-libgcc-aarch64-Add-SME-unwinder-support.patch rename to 0219-Backport-SME-libgcc-aarch64-Add-SME-unwinder-support.patch diff --git a/0213-Backport-SME-libgcc-Fix-config.in.patch b/0220-Backport-SME-libgcc-Fix-config.in.patch similarity index 100% rename from 0213-Backport-SME-libgcc-Fix-config.in.patch rename to 0220-Backport-SME-libgcc-Fix-config.in.patch diff --git a/0214-Backport-SME-aarch64-Add-funwind-tables-to-some-test.patch b/0221-Backport-SME-aarch64-Add-funwind-tables-to-some-test.patch similarity index 100% rename from 0214-Backport-SME-aarch64-Add-funwind-tables-to-some-test.patch rename to 0221-Backport-SME-aarch64-Add-funwind-tables-to-some-test.patch diff --git a/0215-Backport-SME-aarch64-Skip-some-SME-register-save-tes.patch b/0222-Backport-SME-aarch64-Skip-some-SME-register-save-tes.patch similarity index 100% rename from 0215-Backport-SME-aarch64-Skip-some-SME-register-save-tes.patch rename to 0222-Backport-SME-aarch64-Skip-some-SME-register-save-tes.patch diff --git 
a/0216-Backport-SME-Add-OPTIONS_H_EXTRA-to-GTFILES.patch b/0223-Backport-SME-Add-OPTIONS_H_EXTRA-to-GTFILES.patch similarity index 100% rename from 0216-Backport-SME-Add-OPTIONS_H_EXTRA-to-GTFILES.patch rename to 0223-Backport-SME-Add-OPTIONS_H_EXTRA-to-GTFILES.patch diff --git a/0217-Backport-SME-aarch64-Add-V1DI-mode.patch b/0224-Backport-SME-aarch64-Add-V1DI-mode.patch similarity index 100% rename from 0217-Backport-SME-aarch64-Add-V1DI-mode.patch rename to 0224-Backport-SME-aarch64-Add-V1DI-mode.patch diff --git a/0218-Backport-SME-Allow-md-iterators-to-include-other-ite.patch b/0225-Backport-SME-Allow-md-iterators-to-include-other-ite.patch similarity index 100% rename from 0218-Backport-SME-Allow-md-iterators-to-include-other-ite.patch rename to 0225-Backport-SME-Allow-md-iterators-to-include-other-ite.patch diff --git a/0219-Backport-SME-riscv-Add-support-for-strlen-inline-exp.patch b/0226-Backport-SME-riscv-Add-support-for-strlen-inline-exp.patch similarity index 100% rename from 0219-Backport-SME-riscv-Add-support-for-strlen-inline-exp.patch rename to 0226-Backport-SME-riscv-Add-support-for-strlen-inline-exp.patch diff --git a/0220-Backport-SME-attribs-Add-overloads-with-namespace-na.patch b/0227-Backport-SME-attribs-Add-overloads-with-namespace-na.patch similarity index 100% rename from 0220-Backport-SME-attribs-Add-overloads-with-namespace-na.patch rename to 0227-Backport-SME-attribs-Add-overloads-with-namespace-na.patch diff --git a/0221-Backport-SME-vec-Add-array_slice-constructors-from-n.patch b/0228-Backport-SME-vec-Add-array_slice-constructors-from-n.patch similarity index 100% rename from 0221-Backport-SME-vec-Add-array_slice-constructors-from-n.patch rename to 0228-Backport-SME-vec-Add-array_slice-constructors-from-n.patch diff --git a/0222-Backport-SME-A-couple-of-va_gc_atomic-tweaks.patch b/0229-Backport-SME-A-couple-of-va_gc_atomic-tweaks.patch similarity index 100% rename from 0222-Backport-SME-A-couple-of-va_gc_atomic-tweaks.patch rename to 0229-Backport-SME-A-couple-of-va_gc_atomic-tweaks.patch diff --git a/0223-Backport-SME-middle-end-Fix-issue-of-poly_uint16-1-1.patch b/0230-Backport-SME-middle-end-Fix-issue-of-poly_uint16-1-1.patch similarity index 100% rename from 0223-Backport-SME-middle-end-Fix-issue-of-poly_uint16-1-1.patch rename to 0230-Backport-SME-middle-end-Fix-issue-of-poly_uint16-1-1.patch diff --git a/0224-SME-Add-missing-header-file-in-aarch64.cc.patch b/0231-SME-Add-missing-header-file-in-aarch64.cc.patch similarity index 100% rename from 0224-SME-Add-missing-header-file-in-aarch64.cc.patch rename to 0231-SME-Add-missing-header-file-in-aarch64.cc.patch diff --git a/0225-Backport-SME-c-Add-support-for-__extension__.patch b/0232-Backport-SME-c-Add-support-for-__extension__.patch similarity index 100% rename from 0225-Backport-SME-c-Add-support-for-__extension__.patch rename to 0232-Backport-SME-c-Add-support-for-__extension__.patch diff --git a/0226-Backport-SME-lra-Updates-of-biggest-mode-for-hard-re.patch b/0233-Backport-SME-lra-Updates-of-biggest-mode-for-hard-re.patch similarity index 100% rename from 0226-Backport-SME-lra-Updates-of-biggest-mode-for-hard-re.patch rename to 0233-Backport-SME-lra-Updates-of-biggest-mode-for-hard-re.patch diff --git a/0227-Backport-SME-c-Support-C2x-empty-initializer-braces.patch b/0234-Backport-SME-c-Support-C2x-empty-initializer-braces.patch similarity index 100% rename from 0227-Backport-SME-c-Support-C2x-empty-initializer-braces.patch rename to 0234-Backport-SME-c-Support-C2x-empty-initializer-braces.patch diff 
--git a/0228-Backport-SME-aarch64-Update-sizeless-tests-for-recen.patch b/0235-Backport-SME-aarch64-Update-sizeless-tests-for-recen.patch similarity index 100% rename from 0228-Backport-SME-aarch64-Update-sizeless-tests-for-recen.patch rename to 0235-Backport-SME-aarch64-Update-sizeless-tests-for-recen.patch diff --git a/0229-Backport-SME-attribs-Namespace-aware-lookup_attribut.patch b/0236-Backport-SME-attribs-Namespace-aware-lookup_attribut.patch similarity index 100% rename from 0229-Backport-SME-attribs-Namespace-aware-lookup_attribut.patch rename to 0236-Backport-SME-attribs-Namespace-aware-lookup_attribut.patch diff --git a/0230-Backport-SME-c-family-ICE-with-gnu-nocf_check-PR1069.patch b/0237-Backport-SME-c-family-ICE-with-gnu-nocf_check-PR1069.patch similarity index 100% rename from 0230-Backport-SME-c-family-ICE-with-gnu-nocf_check-PR1069.patch rename to 0237-Backport-SME-c-family-ICE-with-gnu-nocf_check-PR1069.patch diff --git a/0231-Backport-SME-AArch64-Fix-assert-in-aarch64_move_imm-.patch b/0238-Backport-SME-AArch64-Fix-assert-in-aarch64_move_imm-.patch similarity index 100% rename from 0231-Backport-SME-AArch64-Fix-assert-in-aarch64_move_imm-.patch rename to 0238-Backport-SME-AArch64-Fix-assert-in-aarch64_move_imm-.patch diff --git a/0232-Backport-SME-testsuite-Only-run-fcf-protection-test-.patch b/0239-Backport-SME-testsuite-Only-run-fcf-protection-test-.patch similarity index 100% rename from 0232-Backport-SME-testsuite-Only-run-fcf-protection-test-.patch rename to 0239-Backport-SME-testsuite-Only-run-fcf-protection-test-.patch diff --git a/0233-Backport-SME-Fix-PRs-106764-106765-and-107307-all-IC.patch b/0240-Backport-SME-Fix-PRs-106764-106765-and-107307-all-IC.patch similarity index 100% rename from 0233-Backport-SME-Fix-PRs-106764-106765-and-107307-all-IC.patch rename to 0240-Backport-SME-Fix-PRs-106764-106765-and-107307-all-IC.patch diff --git a/0234-Backport-SME-aarch64-Remove-expected-error-for-compo.patch b/0241-Backport-SME-aarch64-Remove-expected-error-for-compo.patch similarity index 100% rename from 0234-Backport-SME-aarch64-Remove-expected-error-for-compo.patch rename to 0241-Backport-SME-aarch64-Remove-expected-error-for-compo.patch diff --git a/0235-Backport-SME-aarch64-Remove-redundant-builtins-code.patch b/0242-Backport-SME-aarch64-Remove-redundant-builtins-code.patch similarity index 100% rename from 0235-Backport-SME-aarch64-Remove-redundant-builtins-code.patch rename to 0242-Backport-SME-aarch64-Remove-redundant-builtins-code.patch diff --git a/0236-Backport-SME-AArch64-Fix-Armv9-a-warnings-that-get-e.patch b/0243-Backport-SME-AArch64-Fix-Armv9-a-warnings-that-get-e.patch similarity index 100% rename from 0236-Backport-SME-AArch64-Fix-Armv9-a-warnings-that-get-e.patch rename to 0243-Backport-SME-AArch64-Fix-Armv9-a-warnings-that-get-e.patch diff --git a/0237-Backport-SME-Canonicalize-X-Y-as-X-Y-in-match.pd-whe.patch b/0244-Backport-SME-Canonicalize-X-Y-as-X-Y-in-match.pd-whe.patch similarity index 100% rename from 0237-Backport-SME-Canonicalize-X-Y-as-X-Y-in-match.pd-whe.patch rename to 0244-Backport-SME-Canonicalize-X-Y-as-X-Y-in-match.pd-whe.patch diff --git a/0238-Backport-SME-middle-end-Add-new-tbranch-optab-to-add.patch b/0245-Backport-SME-middle-end-Add-new-tbranch-optab-to-add.patch similarity index 100% rename from 0238-Backport-SME-middle-end-Add-new-tbranch-optab-to-add.patch rename to 0245-Backport-SME-middle-end-Add-new-tbranch-optab-to-add.patch diff --git a/0239-Backport-SME-explow-Allow-dynamic-allocations-after-.patch 
b/0246-Backport-SME-explow-Allow-dynamic-allocations-after-.patch similarity index 100% rename from 0239-Backport-SME-explow-Allow-dynamic-allocations-after-.patch rename to 0246-Backport-SME-explow-Allow-dynamic-allocations-after-.patch diff --git a/0240-Backport-SME-PR105169-Fix-references-to-discarded-se.patch b/0247-Backport-SME-PR105169-Fix-references-to-discarded-se.patch similarity index 100% rename from 0240-Backport-SME-PR105169-Fix-references-to-discarded-se.patch rename to 0247-Backport-SME-PR105169-Fix-references-to-discarded-se.patch diff --git a/0241-Backport-SME-RISC-V-autovec-Verify-that-GET_MODE_NUN.patch b/0248-Backport-SME-RISC-V-autovec-Verify-that-GET_MODE_NUN.patch similarity index 100% rename from 0241-Backport-SME-RISC-V-autovec-Verify-that-GET_MODE_NUN.patch rename to 0248-Backport-SME-RISC-V-autovec-Verify-that-GET_MODE_NUN.patch diff --git a/0242-Backport-SME-Add-operator-to-gimple_stmt_iterator-an.patch b/0249-Backport-SME-Add-operator-to-gimple_stmt_iterator-an.patch similarity index 100% rename from 0242-Backport-SME-Add-operator-to-gimple_stmt_iterator-an.patch rename to 0249-Backport-SME-Add-operator-to-gimple_stmt_iterator-an.patch diff --git a/0243-Backport-SME-tree-optimization-110221-SLP-and-loop-m.patch b/0250-Backport-SME-tree-optimization-110221-SLP-and-loop-m.patch similarity index 100% rename from 0243-Backport-SME-tree-optimization-110221-SLP-and-loop-m.patch rename to 0250-Backport-SME-tree-optimization-110221-SLP-and-loop-m.patch diff --git a/0244-SME-Adapt-some-testsuites.patch b/0251-SME-Adapt-some-testsuites.patch similarity index 100% rename from 0244-SME-Adapt-some-testsuites.patch rename to 0251-SME-Adapt-some-testsuites.patch diff --git a/0245-SME-Fix-error-by-backported-patches-and-IPA-prefetch.patch b/0252-SME-Fix-error-by-backported-patches-and-IPA-prefetch.patch similarity index 100% rename from 0245-SME-Fix-error-by-backported-patches-and-IPA-prefetch.patch rename to 0252-SME-Fix-error-by-backported-patches-and-IPA-prefetch.patch diff --git a/0246-aarch64-Fix-return-register-handling-in-untyped_call.patch b/0253-aarch64-Fix-return-register-handling-in-untyped_call.patch similarity index 100% rename from 0246-aarch64-Fix-return-register-handling-in-untyped_call.patch rename to 0253-aarch64-Fix-return-register-handling-in-untyped_call.patch diff --git a/0247-aarch64-Fix-loose-ldpstp-check.patch b/0254-aarch64-Fix-loose-ldpstp-check.patch similarity index 100% rename from 0247-aarch64-Fix-loose-ldpstp-check.patch rename to 0254-aarch64-Fix-loose-ldpstp-check.patch diff --git a/0248-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch b/0255-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch similarity index 100% rename from 0248-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch rename to 0255-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch diff --git a/0249-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch b/0256-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch similarity index 100% rename from 0249-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch rename to 0256-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch diff --git a/0250-Make-option-mvzeroupper-independent-of-optimization-.patch b/0257-Make-option-mvzeroupper-independent-of-optimization-.patch similarity index 100% rename from 0250-Make-option-mvzeroupper-independent-of-optimization-.patch rename to 0257-Make-option-mvzeroupper-independent-of-optimization-.patch diff --git 
a/0251-i386-Sync-tune_string-with-arch_string-for-target-at.patch b/0258-i386-Sync-tune_string-with-arch_string-for-target-at.patch similarity index 100% rename from 0251-i386-Sync-tune_string-with-arch_string-for-target-at.patch rename to 0258-i386-Sync-tune_string-with-arch_string-for-target-at.patch diff --git a/0252-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch b/0259-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch similarity index 100% rename from 0252-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch rename to 0259-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch diff --git a/0253-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch b/0260-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch similarity index 100% rename from 0253-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch rename to 0260-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch diff --git a/0254-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch b/0261-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch similarity index 100% rename from 0254-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch rename to 0261-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch diff --git a/0255-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch b/0262-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch similarity index 100% rename from 0255-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch rename to 0262-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch diff --git a/0256-Software-mitigation-Disable-gather-generation-in-vec.patch b/0263-Software-mitigation-Disable-gather-generation-in-vec.patch similarity index 100% rename from 0256-Software-mitigation-Disable-gather-generation-in-vec.patch rename to 0263-Software-mitigation-Disable-gather-generation-in-vec.patch diff --git a/0257-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch b/0264-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch similarity index 100% rename from 0257-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch rename to 0264-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch diff --git a/0258-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch b/0265-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch similarity index 100% rename from 0258-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch rename to 0265-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch diff --git a/0259-Disparage-slightly-for-the-alternative-which-move-DF.patch b/0266-Disparage-slightly-for-the-alternative-which-move-DF.patch similarity index 100% rename from 0259-Disparage-slightly-for-the-alternative-which-move-DF.patch rename to 0266-Disparage-slightly-for-the-alternative-which-move-DF.patch diff --git a/0260-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch b/0267-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch similarity index 100% rename from 0260-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch rename to 0267-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch diff --git a/0261-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch b/0268-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch similarity index 100% rename from 0261-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch rename to 0268-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch diff --git a/0262-Disable-FMADD-in-chains-for-Zen4-and-generic.patch b/0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch similarity index 100% rename from 
0262-Disable-FMADD-in-chains-for-Zen4-and-generic.patch rename to 0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch diff --git a/0263-Initial-Raptorlake-Support.patch b/0270-Initial-Raptorlake-Support.patch similarity index 100% rename from 0263-Initial-Raptorlake-Support.patch rename to 0270-Initial-Raptorlake-Support.patch diff --git a/0264-Initial-Meteorlake-Support.patch b/0271-Initial-Meteorlake-Support.patch similarity index 100% rename from 0264-Initial-Meteorlake-Support.patch rename to 0271-Initial-Meteorlake-Support.patch diff --git a/0265-Support-Intel-AMX-FP16-ISA.patch b/0272-Support-Intel-AMX-FP16-ISA.patch similarity index 100% rename from 0265-Support-Intel-AMX-FP16-ISA.patch rename to 0272-Support-Intel-AMX-FP16-ISA.patch diff --git a/0266-Support-Intel-prefetchit0-t1.patch b/0273-Support-Intel-prefetchit0-t1.patch similarity index 100% rename from 0266-Support-Intel-prefetchit0-t1.patch rename to 0273-Support-Intel-prefetchit0-t1.patch diff --git a/0267-Initial-Granite-Rapids-Support.patch b/0274-Initial-Granite-Rapids-Support.patch similarity index 100% rename from 0267-Initial-Granite-Rapids-Support.patch rename to 0274-Initial-Granite-Rapids-Support.patch diff --git a/0268-Support-Intel-AMX-COMPLEX.patch b/0275-Support-Intel-AMX-COMPLEX.patch similarity index 100% rename from 0268-Support-Intel-AMX-COMPLEX.patch rename to 0275-Support-Intel-AMX-COMPLEX.patch diff --git a/0269-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch b/0276-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch similarity index 100% rename from 0269-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch rename to 0276-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch diff --git a/0270-Initial-Granite-Rapids-D-Support.patch b/0277-Initial-Granite-Rapids-D-Support.patch similarity index 100% rename from 0270-Initial-Granite-Rapids-D-Support.patch rename to 0277-Initial-Granite-Rapids-D-Support.patch diff --git a/0271-Correct-Granite-Rapids-D-documentation.patch b/0278-Correct-Granite-Rapids-D-documentation.patch similarity index 100% rename from 0271-Correct-Granite-Rapids-D-documentation.patch rename to 0278-Correct-Granite-Rapids-D-documentation.patch diff --git a/0272-i386-Remove-Meteorlake-s-family_model.patch b/0279-i386-Remove-Meteorlake-s-family_model.patch similarity index 100% rename from 0272-i386-Remove-Meteorlake-s-family_model.patch rename to 0279-i386-Remove-Meteorlake-s-family_model.patch diff --git a/0273-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch b/0280-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch similarity index 100% rename from 0273-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch rename to 0280-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch diff --git a/0274-x86-Update-model-values-for-Raptorlake.patch b/0281-x86-Update-model-values-for-Raptorlake.patch similarity index 100% rename from 0274-x86-Update-model-values-for-Raptorlake.patch rename to 0281-x86-Update-model-values-for-Raptorlake.patch diff --git a/0275-Fix-target_clone-arch-graniterapids-d.patch b/0282-Fix-target_clone-arch-graniterapids-d.patch similarity index 100% rename from 0275-Fix-target_clone-arch-graniterapids-d.patch rename to 0282-Fix-target_clone-arch-graniterapids-d.patch diff --git a/0276-i386-Change-prefetchi-output-template.patch b/0283-i386-Change-prefetchi-output-template.patch similarity index 100% rename from 0276-i386-Change-prefetchi-output-template.patch rename to 0283-i386-Change-prefetchi-output-template.patch diff --git 
a/0277-i386-Add-non-optimize-prefetchi-intrins.patch b/0284-i386-Add-non-optimize-prefetchi-intrins.patch similarity index 100% rename from 0277-i386-Add-non-optimize-prefetchi-intrins.patch rename to 0284-i386-Add-non-optimize-prefetchi-intrins.patch diff --git a/0285-SME-Recover-hip09-and-hip11-in-aarch64-cores.def.patch b/0285-SME-Recover-hip09-and-hip11-in-aarch64-cores.def.patch new file mode 100644 index 0000000000000000000000000000000000000000..24dacd1eb01c3309f74cfa17af6c94d333dcbcef --- /dev/null +++ b/0285-SME-Recover-hip09-and-hip11-in-aarch64-cores.def.patch @@ -0,0 +1,32 @@ +From 239f0637307ff2f6afb1473e99d0bb0eaf8946b2 Mon Sep 17 00:00:00 2001 +From: xiezhiheng +Date: Fri, 23 Aug 2024 15:37:17 +0800 +Subject: [PATCH 154/157] [SME] Recover hip09 and hip11 in aarch64-cores.def + +--- + gcc/config/aarch64/aarch64-cores.def | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def +index f069c81cf..3337fd1a0 100644 +--- a/gcc/config/aarch64/aarch64-cores.def ++++ b/gcc/config/aarch64/aarch64-cores.def +@@ -130,6 +130,7 @@ AARCH64_CORE("a64fx", a64fx, a64fx, V8_2A, (F16, SVE), a64fx, 0x46, 0x001, -1) + + /* HiSilicon ('H') cores. */ + AARCH64_CORE("tsv110", tsv110, tsv110, V8_2A, (CRYPTO, F16), tsv110, 0x48, 0xd01, -1) ++AARCH64_CORE("hip09", hip09, hip09, V8_5A, (SVE, I8MM, F32MM, F64MM, PROFILE, PREDRES), hip09, 0x48, 0xd02, 0x0) + + /* ARMv8.3-A Architecture Processors. */ + +@@ -171,6 +172,7 @@ AARCH64_CORE("cortex-a710", cortexa710, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, + AARCH64_CORE("cortex-x2", cortexx2, cortexa57, V9A, (SVE2_BITPERM, MEMTAG, I8MM, BF16), neoversen2, 0x41, 0xd48, -1) + + AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversen2, 0x41, 0xd49, -1) ++AARCH64_CORE("hip11", hip11, hip11, V8_5A, (SVE, SVE2, F16), hip11, 0x48, 0xd22, -1) + + AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1) + AARCH64_CORE("neoverse-v2", neoversev2, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1) +-- +2.33.0 + diff --git a/0286-Try-to-use-AI-model-to-guide-optimization.patch b/0286-Try-to-use-AI-model-to-guide-optimization.patch new file mode 100644 index 0000000000000000000000000000000000000000..a697dcc7815d3697b9a09a95881de29f6e1ae30e --- /dev/null +++ b/0286-Try-to-use-AI-model-to-guide-optimization.patch @@ -0,0 +1,671 @@ +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index fcfa54697..f42aeb8e8 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1449,6 +1449,7 @@ OBJS = \ + inchash.o \ + incpath.o \ + init-regs.o \ ++ ipa-hardware-detection.o \ + internal-fn.o \ + ipa-struct-reorg/ipa-struct-reorg.o \ + ipa-cp.o \ +diff --git a/gcc/common.opt b/gcc/common.opt +index fd98382fa..99e626641 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -185,6 +185,9 @@ const char *main_input_basename + Variable + int main_input_baselength + ++Variable ++bool optimize_maximum ++ + ; The base name used for auxiliary output files. + ; dump_base_name minus dump_base_ext. + +@@ -469,6 +472,10 @@ Ofast + Common Optimization + Optimize for speed disregarding exact standards compliance. + ++Om ++Common Optimization ++Optimize for maximizing radical optimization. ++ + Og + Common Optimization + Optimize for debugging experience rather than speed or size. 
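The -Om option added above is consumed in two places later in this patch: handle_machine_option in opts-common.cc only consults the AI model when both -Om and -mcpu=hip09 are given (on a positive prediction it appends -flto=8, plus -flto-partition=one for the gcc driver), and pass_ipa_hardware_detection then guards the specialised code at run time. A rough C rendition of the GIMPLE that the pass emits at the start of main() (a sketch for orientation only, not code from the patch):

    /* Sketch of the runtime CPU check built by create_detection_bb ().
       MIDR_EL1 holds the implementer in bits [31:24] and the part
       number in bits [15:4]; 0x48 is the HiSilicon implementer code.  */
    unsigned long midr;
    __asm__ ("mrs %0, MIDR_EL1" : "=r" (midr));
    unsigned cpuid = (unsigned) midr;
    unsigned implementer = cpuid >> 24;
    unsigned part = (cpuid >> 4) & 0xfff;
    /* 4294963967 is -0xd01 modulo 2^32, so the unsigned comparison
       accepts part numbers 0xd01..0xd03 only.  */
    if (implementer != 0x48 || part + 4294963967u > 2u)
      __builtin_abort ();
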
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 309ecc3d9..ad853af9a 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -18637,6 +18637,134 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, + return stmt_cost; + } + ++/* Check whether in C language or LTO with only C language. */ ++extern bool lang_c_p (void); ++ ++static void ++override_C_optimize_options (struct gcc_options *opts) ++{ ++ opts->x_flag_ipa_reorder_fields = 1; ++ opts->x_flag_ipa_struct_reorg = 6; ++ opts->x_struct_layout_optimize_level = 6; ++ opts->x_flag_gnu89_inline = 1; ++ opts->x_flag_ccmp2 = 1; ++ opts->x_flag_array_widen_compare = 1; ++ opts->x_flag_convert_minmax = 1; ++ opts->x_flag_tree_slp_transpose_vectorize = 1; ++ opts->x_param_max_inline_insns_auto = 64; ++ opts->x_param_inline_unit_growth = 96; ++} ++ ++/* Check whether in CPP language or LTO with only CPP language. */ ++static bool ++lang_cpp_p (void) ++{ ++ const char *language_string = lang_hooks.name; ++ if (!language_string) ++ { ++ return false; ++ } ++ if (lang_GNU_CXX ()) ++ { ++ return true; ++ } ++ else if (strcmp (language_string, "GNU GIMPLE") == 0) // for LTO check ++ { ++ unsigned i = 0; ++ tree t = NULL_TREE; ++ FOR_EACH_VEC_SAFE_ELT (all_translation_units, i, t) ++ { ++ language_string = TRANSLATION_UNIT_LANGUAGE (t); ++ if (language_string == NULL ++ || strncmp (lang_hooks.name, "GNU C++", 7)) ++ { ++ return false; ++ } ++ } ++ return true; ++ } ++ return false; ++} ++ ++static void ++override_CPP_optimize_options (struct gcc_options *opts) ++{ ++ opts->x_flag_finite_loops = 1; ++ opts->x_flag_omit_frame_pointer = 1; ++ opts->x_flag_sized_deallocation = 0; ++ opts->x_flag_loop_elim = 1; ++ opts->x_flag_convert_minmax = 1; ++ opts->x_param_early_inlining_insns = 256; ++ opts->x_param_max_inline_insns_auto = 128; ++ opts->x_param_inline_unit_growth = 256; ++ opts->x_flag_cmlt_arith = 1; ++} ++ ++static void ++override_optimize_options_1 (struct gcc_options *opts) ++{ ++ opts->x_flag_split_ldp_stp = 1; ++ opts->x_flag_if_conversion_gimple = 1; ++ opts->x_flag_ifcvt_allow_complicated_cmps = 1; ++ opts->x_param_ifcvt_allow_register_renaming = 2; ++ opts->x_param_max_rtl_if_conversion_unpredictable_cost = 48; ++ opts->x_param_max_rtl_if_conversion_predictable_cost = 48; ++} ++ ++static void ++override_Fortran_optimize_options (struct gcc_options *opts) ++{ ++ opts->x_flag_unroll_loops = 1; ++ opts->x_flag_unconstrained_commons = 1; ++ opts->x_param_ipa_cp_eval_threshold = 1; ++ opts->x_param_ipa_cp_unit_growth = 80; ++ opts->x_param_ipa_cp_max_recursive_depth = 8; ++ opts->x_param_large_unit_insns = 30000; ++ opts->x_flag_ira_loop_pressure = 1; ++ opts->x_flag_inline_functions_called_once = 0; ++ opts->x_flag_ira_algorithm = IRA_ALGORITHM_PRIORITY; ++ opts->x_flag_delayed_branch = 1; ++ opts->x_flag_gcse_las = 1; ++ opts->x_flag_gcse_sm = 1; ++ opts->x_flag_ipa_pta = 1; ++ opts->x_flag_reorder_blocks_and_partition = 1; ++ opts->x_flag_reorder_blocks = 1; ++ opts->x_flag_crypto_accel_aes = 1; ++ opts->x_param_flexible_seg_len = 1; ++} ++ ++/* Reset the optimize option. ++ After checking the model result, this function can ++ reset the more appropriate options. 
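++ The reset fires only when -Om has set optimize_maximum, the tune string selects hip09 and the driver-side model has exported AI_INFER_LEVEL; the language checks below then apply the C, C++ or Fortran override set on top of the common one.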
*/ ++static void ++reset_machine_option (struct gcc_options *opts) ++{ ++ if (!(opts->x_optimize_maximum) ++ || strstr (opts->x_aarch64_tune_string, "hip09") == NULL) ++ { ++ return; ++ } ++ ++ const char *ai_infer_level = getenv ("AI_INFER_LEVEL"); ++ if (ai_infer_level) ++ { ++ override_optimize_options_1 (opts); ++ if (lang_c_p ()) ++ { ++ override_C_optimize_options (opts); ++ } ++ else if (lang_cpp_p ()) ++ { ++ override_CPP_optimize_options (opts); ++ } ++ else if (lang_GNU_Fortran ()) ++ { ++ override_Fortran_optimize_options (opts); ++ } ++ } ++} ++ ++ + /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND + and which when vectorized would operate on vector type VECTYPE. Add the + cost of any embedded operations. */ +@@ -20089,6 +20217,7 @@ aarch64_override_options_internal (struct gcc_options *opts) + && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level) + opts->x_flag_prefetch_loop_arrays = 1; + ++ reset_machine_option (opts); + aarch64_override_options_after_change_1 (opts); + } + +diff --git a/gcc/ipa-hardware-detection.cc b/gcc/ipa-hardware-detection.cc +new file mode 100644 +index 000000000..8085a8c65 +--- /dev/null ++++ b/gcc/ipa-hardware-detection.cc +@@ -0,0 +1,243 @@ ++/* Hardware Detection. ++ Copyright (C) 2024-2024 Free Software Foundation, Inc. ++This file is part of GCC. ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "tree.h" ++#include "gimple.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfganal.h" ++#include "cfgloop.h" ++#include "gimple-pretty-print.h" ++#include "tree-cfg.h" ++#include "cgraph.h" ++#include "print-tree.h" ++#include "cfghooks.h" ++#include "gimple-fold.h" ++#include "gimplify-me.h" ++ ++namespace { ++ ++/* Build a binary operation and gimplify it. Emit code before GSI. ++ Return the gimple_val holding the result. 
*/ ++ ++static tree ++gimplify_build2 (gimple_stmt_iterator *gsi, enum tree_code code, ++ tree type, tree a, tree b) ++{ ++ tree ret; ++ ++ ret = fold_build2_loc (gimple_location (gsi_stmt (*gsi)), code, type, a, b); ++ return force_gimple_operand_gsi (gsi, ret, true, NULL, true, ++ GSI_SAME_STMT); ++} ++ ++static basic_block ++create_abort_bb (basic_block last_bb) ++{ ++ basic_block bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ gimple_stmt_iterator gsi = gsi_last_bb (bb); ++ tree fn = builtin_decl_implicit (BUILT_IN_ABORT); ++ gimple *g = gimple_build_call (fn, 0); ++ gsi_insert_after (&gsi, g, GSI_NEW_STMT); ++ return bb; ++} ++ ++static basic_block ++create_part_bb (basic_block last_bb, tree part_base) ++{ ++ basic_block bb = create_empty_bb (last_bb); ++ if (last_bb->loop_father != NULL) ++ { ++ add_bb_to_loop (bb, last_bb->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ gimple_stmt_iterator gsi = gsi_last_bb (bb); ++ gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT); ++ /* This number is used to efficiently identify the supported part range. */ ++ tree part_cond = gimplify_build2 ( ++ &gsi, PLUS_EXPR, unsigned_type_node, part_base, ++ build_int_cst (unsigned_type_node, 4294963967)); ++ gcond *cond = gimple_build_cond (LE_EXPR, part_cond, ++ build_int_cst (unsigned_type_node, 2), ++ NULL_TREE, NULL_TREE); ++ gimple_set_location (cond, input_location); ++ gsi_insert_before (&gsi, cond, GSI_SAME_STMT); ++ gsi_remove (&gsi, true); ++ return bb; ++} ++ ++static void ++create_detection_bb () ++{ ++ edge old_e = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); ++ basic_block ret_bb = old_e->dest; ++ ++ basic_block detection_bb = create_empty_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun)); ++ if (ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father != NULL) ++ { ++ add_bb_to_loop (detection_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father); ++ loops_state_set (LOOPS_NEED_FIXUP); ++ } ++ tree cpuid_decl = build_decl (input_location, VAR_DECL, ++ get_identifier ("cpuid"), unsigned_type_node); ++ add_local_decl (cfun, cpuid_decl); ++ ++ gimple_stmt_iterator gsi = gsi_last_bb (detection_bb); ++ vec *outputs = NULL; ++ tree purpose = build_string (strlen ("=r"), "=r"); ++ tree output = build_tree_list ( ++ build_tree_list (NULL_TREE, purpose), cpuid_decl); ++ vec_safe_push (outputs, output); ++ gasm *asm_stmt = gimple_build_asm_vec ( ++ "mrs %0, MIDR_EL1", NULL, outputs, NULL, NULL); ++ gsi_insert_after (&gsi, asm_stmt, GSI_NEW_STMT); ++ gsi_insert_after (&gsi, gimple_build_nop (), GSI_NEW_STMT); ++ ++ tree implementer = gimplify_build2 ( ++ &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl, ++ build_int_cst (unsigned_type_node, 24)); ++ tree part_base = gimplify_build2 ( ++ &gsi, RSHIFT_EXPR, unsigned_type_node, cpuid_decl, ++ build_int_cst (unsigned_type_node, 4)); ++ tree part = gimplify_build2 ( ++ &gsi, BIT_AND_EXPR, unsigned_type_node, part_base, ++ build_int_cst (unsigned_type_node, 4095)); ++ gcond *implementer_cond = gimple_build_cond ( ++ EQ_EXPR, implementer, ++ build_int_cst (unsigned_type_node, 72), ++ NULL_TREE, NULL_TREE); ++ gimple_set_location (implementer_cond, input_location); ++ gsi_insert_before (&gsi, implementer_cond, GSI_SAME_STMT); ++ gsi_remove (&gsi, true); ++ ++ basic_block part_bb = create_part_bb (detection_bb, part); ++ basic_block abort_bb = create_abort_bb (part_bb); ++ ++ remove_edge_raw (old_e); ++ make_single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN 
(cfun), ++ detection_bb, EDGE_FALLTHRU); ++ edge etrue = make_edge (detection_bb, part_bb, EDGE_TRUE_VALUE); ++ etrue->probability = profile_probability::likely (); ++ edge efalse = make_edge (detection_bb, abort_bb, EDGE_FALSE_VALUE); ++ efalse->probability = profile_probability::unlikely (); ++ edge part_true = make_edge (part_bb, ret_bb, EDGE_TRUE_VALUE); ++ part_true->probability = profile_probability::likely (); ++ edge part_false = make_edge (part_bb, abort_bb, EDGE_FALSE_VALUE); ++ part_false->probability = profile_probability::unlikely (); ++ make_single_succ_edge (abort_bb, ret_bb, EDGE_FALLTHRU); ++ if (dom_info_available_p (CDI_DOMINATORS)) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, part_bb, detection_bb); ++ set_immediate_dominator (CDI_DOMINATORS, ret_bb, detection_bb); ++ set_immediate_dominator (CDI_DOMINATORS, abort_bb, detection_bb); ++ } ++} ++ ++const pass_data pass_data_ipa_hardware_detection = ++{ ++ SIMPLE_IPA_PASS, ++ "hardware_detection", ++ OPTGROUP_NONE, ++ TV_IPA_HARDWARE_DETECTION, ++ (PROP_cfg | PROP_ssa), ++ 0, ++ 0, ++ 0, ++ (TODO_update_ssa | TODO_verify_all) ++}; ++ ++class pass_ipa_hardware_detection : public simple_ipa_opt_pass ++{ ++public: ++ pass_ipa_hardware_detection (gcc::context *ctxt) ++ : simple_ipa_opt_pass (pass_data_ipa_hardware_detection, ctxt) ++ {} ++ ++ virtual bool gate (function *); ++ virtual unsigned int execute (function *); ++}; // class pass_ipa_hardware_detection ++ ++bool ++pass_ipa_hardware_detection::gate (function *) ++{ ++ const char *ai_infer_level = getenv ("AI_INFER_LEVEL"); ++ return (ai_infer_level ++ && optimize_maximum > 0 ++ /* Only enable in lto or whole_program. */ ++ && (in_lto_p || flag_whole_program)); ++} ++ ++unsigned int ++pass_ipa_hardware_detection::execute (function *) ++{ ++ unsigned int ret = 0; ++ cgraph_node *cnode; ++ FOR_EACH_FUNCTION (cnode) ++ { ++ if (!cnode->real_symbol_p ()) ++ { ++ continue; ++ } ++ if (cnode->definition) ++ { ++ if (!cnode->has_gimple_body_p () || cnode->inlined_to) ++ continue; ++ ++ cnode->get_body (); ++ function *fn = DECL_STRUCT_FUNCTION (cnode->decl); ++ if (!fn) ++ continue; ++ ++ if (DECL_NAME (cnode->decl) ++ && MAIN_NAME_P (DECL_NAME (cnode->decl))) ++ { ++ push_cfun (fn); ++ calculate_dominance_info (CDI_DOMINATORS); ++ ++ create_detection_bb (); ++ ++ cgraph_edge::rebuild_edges (); ++ free_dominance_info (CDI_DOMINATORS); ++ pop_cfun (); ++ } ++ } ++ } ++ return ret; ++} ++} // anon namespace ++ ++simple_ipa_opt_pass * ++make_pass_ipa_hardware_detection (gcc::context *ctxt) ++{ ++ return new pass_ipa_hardware_detection (ctxt); ++} +diff --git a/gcc/opts-common.cc b/gcc/opts-common.cc +index 489a6e02a..12c3f7299 100644 +--- a/gcc/opts-common.cc ++++ b/gcc/opts-common.cc +@@ -992,6 +992,158 @@ opts_concat (const char *first, ...) + return newstr; + } + ++typedef int64_t (*run_ai_model_func)(int, const char **, ++ const char *, int, int64_t *); ++#define PTR_UNION_TYPE(TOTYPE) union { void *_q; TOTYPE _nq; } ++#define PTR_UNION_AS_VOID_PTR(NAME) (NAME._q) ++#define PTR_UNION_AS_CAST_PTR(NAME) (NAME._nq) ++ ++static int64_t ++ai_infer_optimization (int argc, const char **argv, ++ const char *mcpu_option, ++ int argc_hw, int64_t *argv_hw) ++{ ++ /* Load dependent AI-framework libraries. 
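++ libonnxruntime.so and libONNXRunner.so are loaded with dlopen, the runONNXModelOptimizer entry point is resolved with dlsym, and a prediction of 1 exports AI_INFER_LEVEL=1 for the rest of the compilation; any failure along the way returns -1 and leaves the command line unchanged.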
*/ ++ void *onnxruntime_lib_handle = NULL; ++ const char *onnxruntime_lib_path = "libonnxruntime.so"; ++ ++ onnxruntime_lib_handle = dlopen (onnxruntime_lib_path, ++ RTLD_LAZY | RTLD_GLOBAL); ++ if (!onnxruntime_lib_handle) ++ { ++ return -1; ++ } ++ ++ void *ai4c_lib_handle = NULL; ++ const char *ai4c_lib_path = "libONNXRunner.so"; ++ ++ ai4c_lib_handle = dlopen (ai4c_lib_path, RTLD_LAZY | RTLD_GLOBAL); ++ if (!ai4c_lib_handle) ++ { ++ return -1; ++ } ++ ++ /* Clear any existing error. */ ++ dlerror (); ++ ++ /* Run AI4Compiler model. */ ++ if (ai4c_lib_handle == NULL || onnxruntime_lib_handle == NULL) ++ { ++ return -1; ++ } ++ ++ run_ai_model_func run_ai_model; ++ PTR_UNION_TYPE (run_ai_model_func) run_ai_model_func_union; ++ PTR_UNION_AS_VOID_PTR (run_ai_model_func_union) ++ = dlsym (ai4c_lib_handle, "runONNXModelOptimizer"); ++ run_ai_model = PTR_UNION_AS_CAST_PTR (run_ai_model_func_union); ++ if (!run_ai_model) ++ { ++ dlclose (ai4c_lib_handle); ++ dlclose (onnxruntime_lib_handle); ++ return -1; ++ } ++ int64_t model_pred = (*run_ai_model) (argc, argv, ++ mcpu_option, argc_hw, argv_hw); ++ ++ if (ai4c_lib_handle) ++ dlclose (ai4c_lib_handle); ++ ++ if (onnxruntime_lib_handle) ++ dlclose (onnxruntime_lib_handle); ++ ++ if (model_pred == 1) ++ putenv ("AI_INFER_LEVEL=1"); ++ return model_pred; ++} ++ ++static int ++handle_lto_option (unsigned int lang_mask, ++ unsigned int num_decoded_options, ++ unsigned int argc, ++ const char **argv, ++ struct cl_decoded_option *&opt_array) ++{ ++ int ret = 0; ++ char *lan = ""; ++ char *compiler = xstrdup (argv[0]); ++ lan = strrchr (compiler, '/'); ++ if (lan != NULL) ++ lan ++; ++ else ++ lan = compiler; ++ if (strstr (lan, "gcc") != NULL) ++ { ++ opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 2); ++ const char* lto_flag = "-flto=8"; ++ decode_cmdline_option (<o_flag, lang_mask, ++ &opt_array[num_decoded_options]); ++ ret++; ++ const char* ltopartition_flag = "-flto-partition=one"; ++ decode_cmdline_option (<opartition_flag, lang_mask, ++ &opt_array[num_decoded_options + 1]); ++ ret++; ++ } ++ else if (strstr (lan, "g++") != NULL ++ || strstr (lan, "gfortran") != NULL) ++ { ++ opt_array = XRESIZEVEC (struct cl_decoded_option, opt_array, argc + 1); ++ const char* lto_flag = "-flto=8"; ++ decode_cmdline_option (<o_flag, lang_mask, ++ &opt_array[num_decoded_options]); ++ ret++; ++ } ++ if (compiler) ++ free (compiler); ++ return ret; ++} ++ ++static int ++handle_machine_option (unsigned int lang_mask, ++ unsigned int num_decoded_options, ++ unsigned int argc, ++ const char **argv, ++ struct cl_decoded_option *&opt_array) ++{ ++ int ret = 0; ++ bool flag_Om = false; ++ bool flag_hip09 = false; ++ for (unsigned i = 1; i < argc; i ++) ++ { ++ if (strcmp (argv[i], "-Om") == 0) ++ flag_Om = true; ++ if (strstr (argv[i], "mcpu=hip09") != NULL) ++ flag_hip09 = true; ++ } ++ if (!flag_hip09 || !flag_Om) ++ { ++ return ret; ++ } ++ ++ const char *ai_infer_level = getenv ("AI_INFER_LEVEL"); ++ if (ai_infer_level) ++ { ++ return ret; ++ } ++ int argc_hw = 6; ++ int64_t argv_hw[argc_hw] = { ++ global_options.x_param_simultaneous_prefetches, ++ global_options.x_param_l1_cache_size, ++ global_options.x_param_l1_cache_line_size, ++ global_options.x_param_l2_cache_size, ++ global_options.x_param_prefetch_latency, ++ global_options.x_param_ipa_prefetch_distance_factor}; ++ int64_t output_pred = ai_infer_optimization ( ++ argc, argv, "hip09", argc_hw, argv_hw); ++ if (output_pred != 1) ++ { ++ return ret; ++ } ++ ++ return handle_lto_option 
(lang_mask, num_decoded_options, ++ argc, argv, opt_array); ++} ++ + /* Decode command-line options (ARGC and ARGV being the arguments of + main) into an array, setting *DECODED_OPTIONS to a pointer to that + array and *DECODED_OPTIONS_COUNT to the number of entries in the +@@ -1090,6 +1242,9 @@ decode_cmdline_options_to_array (unsigned int argc, const char **argv, + num_decoded_options++; + } + ++ num_decoded_options += handle_machine_option (lang_mask, num_decoded_options, ++ argc, argv, opt_array); ++ + *decoded_options = opt_array; + *decoded_options_count = num_decoded_options; + prune_options (decoded_options, decoded_options_count, lang_mask); +diff --git a/gcc/opts.cc b/gcc/opts.cc +index e34e5ee8e..d97f6079f 100644 +--- a/gcc/opts.cc ++++ b/gcc/opts.cc +@@ -780,6 +780,14 @@ default_options_optimization (struct gcc_options *opts, + opts->x_optimize_debug = 1; + break; + ++ case OPT_Om: ++ /* -Om adds flags to -O3. */ ++ opts->x_optimize_size = 0; ++ opts->x_optimize = 3; ++ opts->x_optimize_maximum = true; ++ opts->x_optimize_debug = 0; ++ break; ++ + case OPT_fopenacc: + if (opt->value) + openacc_mode = true; +@@ -2733,6 +2741,8 @@ common_handle_option (struct gcc_options *opts, + &= ~(SANITIZE_UNDEFINED | SANITIZE_UNDEFINED_NONDEFAULT); + break; + ++ case OPT_Om: ++ break; + case OPT_O: + case OPT_Os: + case OPT_Ofast: +diff --git a/gcc/passes.def b/gcc/passes.def +index 8797f166f..690d344c0 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -179,6 +179,7 @@ along with GCC; see the file COPYING3. If not see + passes are executed after partitioning and thus see just parts of the + compiled unit. */ + INSERT_PASSES_AFTER (all_late_ipa_passes) ++ NEXT_PASS (pass_ipa_hardware_detection); + NEXT_PASS (pass_ipa_pta); + /* FIXME: this should be a normal IP pass. 
*/ + NEXT_PASS (pass_ipa_struct_reorg); +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 8e7510eb3..bd8c9a4f7 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -81,6 +81,7 @@ DEFTIMEVAR (TV_IPA_CONSTANT_PROP , "ipa cp") + DEFTIMEVAR (TV_IPA_INLINING , "ipa inlining heuristics") + DEFTIMEVAR (TV_IPA_FNSPLIT , "ipa function splitting") + DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") ++DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") + DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 1c983ef71..ee873f0b2 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -528,6 +528,8 @@ extern ipa_opt_pass_d *make_pass_ipa_icp (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * ++ ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); diff --git a/0287-Add-dynamic-memory-access-checks.patch b/0287-Add-dynamic-memory-access-checks.patch new file mode 100644 index 0000000000000000000000000000000000000000..e23d8f64c0d87f6c3d65f9d5ead4bdace2fdca5d --- /dev/null +++ b/0287-Add-dynamic-memory-access-checks.patch @@ -0,0 +1,774 @@ +From 08fb60d0a0707af4004b20358f4a921e4ae6cca6 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Thu, 22 Aug 2024 15:23:36 +0800 +Subject: [PATCH 156/157] Add dynamic memory access checks + +Signed-off-by: Diachkov Ilia +--- + gcc/ipa-prefetch.cc | 622 +++++++++++++++++++++++++++++++++++++------- + gcc/params.opt | 4 + + 2 files changed, 525 insertions(+), 101 deletions(-) + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index 94290ea9c..b000d4d75 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -368,6 +368,7 @@ typedef std::map memref_tree_map; + typedef std::set stmt_set; + typedef std::set tree_set; + typedef std::map tree_map; ++typedef std::map tree_poly_offset_map; + + tree_memref_map *tm_map; + funct_mrs_map *fmrs_map; +@@ -710,6 +711,20 @@ get_mem_ref_address_ssa_name (tree mem, tree base) + return NULL_TREE; + } + ++static void ++dump_base_addr (tree base_addr) ++{ ++ if (base_addr) ++ { ++ fprintf (dump_file, "Base addr (%s): ", ++ get_tree_code_name (TREE_CODE (base_addr))); ++ print_generic_expr (dump_file, base_addr); ++ } ++ else ++ fprintf (dump_file, "Base addr (%s): ", "null"); ++ fprintf (dump_file, "\n"); ++} ++ + static void + analyse_mem_ref (gimple *stmt, tree mem, memref_t* mr) + { +@@ -736,14 +751,7 @@ analyse_mem_ref (gimple *stmt, tree mem, memref_t* mr) + { + tree base_addr = get_mem_ref_address_ssa_name (mem, base); + if (dump_file) +- { +- fprintf (dump_file, "Base addr (%s): ", +- base_addr ? 
get_tree_code_name (TREE_CODE (base_addr)) +- : "null"); +- if (base_addr) +- print_generic_expr (dump_file, base_addr); +- fprintf (dump_file, "\n"); +- } ++ dump_base_addr (base_addr); + if (base_addr) + { + mr->base = analyse_addr_eval (base_addr, mr); +@@ -1187,7 +1195,7 @@ reduce_memref_set (memref_set *set, vec &vec) + } + + static void +-find_nearest_common_dominator (memref_t *mr, basic_block &dom) ++find_nearest_common_post_dominator (memref_t *mr, basic_block &dom) + { + for (unsigned int i = 0; i < mr->stmts.length (); i++) + { +@@ -1196,7 +1204,7 @@ find_nearest_common_dominator (memref_t *mr, basic_block &dom) + if (dom == bb) + continue; + if (dom) +- dom = nearest_common_dominator (CDI_DOMINATORS, dom, bb); ++ dom = nearest_common_dominator (CDI_POST_DOMINATORS, dom, bb); + else + dom = bb; + } +@@ -1495,10 +1503,13 @@ gimple_copy_and_remap (gimple *stmt) + + static gimple * + gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts, +- int last_idx, stmt_set &processed) ++ int first_idx, int last_idx, ++ stmt_set &processed) + { + gimple *last_stmt = NULL; +- for (int i = mr->stmts.length () - 1; i >= last_idx ; i--) ++ if (first_idx == 0) ++ first_idx = mr->stmts.length () - 1; ++ for (int i = first_idx; i >= last_idx; i--) + { + if (processed.count (mr->stmts[i])) + continue; +@@ -1515,6 +1526,436 @@ gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts, + return last_stmt; + } + ++/* Check if prefetch insertion may be always unsafe in this case. For now ++ reject cases with access to arrays with no domain or with no elements. */ ++ ++static bool ++check_prefetch_safety (vec &mrs, memref_t *cmr) ++{ ++ for (unsigned int i = 0; i < mrs.length (); i++) ++ { ++ memref_t *mr = mrs[i]; ++ if (mr == cmr || mr->used_mrs.empty ()) ++ continue; ++ bool is_store; ++ tree *mem = simple_mem_ref_in_stmt (mr->stmts[0], &is_store); ++ if (mem == NULL || TREE_CODE (*mem) != ARRAY_REF) ++ continue; ++ tree array = TREE_OPERAND (*mem, 0); ++ tree atype = TREE_TYPE (array); ++ gcc_assert (atype); ++ tree domain = TYPE_DOMAIN (atype); ++ if (!domain || !tree_fits_uhwi_p (TYPE_MIN_VALUE (domain)) ++ || !tree_fits_uhwi_p (TYPE_MAX_VALUE (domain))) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Unsupported array type: "); ++ print_generic_expr (dump_file, atype); ++ fprintf (dump_file, "\n"); ++ } ++ return false; ++ } ++ unsigned HOST_WIDE_INT min_val = tree_to_uhwi (TYPE_MIN_VALUE (domain)); ++ unsigned HOST_WIDE_INT max_val = tree_to_uhwi (TYPE_MAX_VALUE (domain)); ++ if (min_val == 0 && max_val == 0) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Unsupported array type's bounds: "); ++ print_generic_expr (dump_file, atype); ++ fprintf (dump_file, "\n"); ++ } ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Collect base addresses which we need to check. 
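++ For every memref except the incrementing candidate, record the MEM_REF base SSA name together with the most extreme constant offset seen in the direction of the step, so that a single page check per base address covers all of the accesses copied for it.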
*/ ++ ++static void ++collect_base_addresses (vec &used_mr_vec, HOST_WIDE_INT dist_val, ++ memref_t *comp_mr, tree_poly_offset_map &offset_map) ++{ ++ if (dump_file) ++ fprintf (dump_file, "Collect base addresses which we need to check.\n"); ++ for (unsigned int i = 0; i < used_mr_vec.length (); i++) ++ { ++ memref_t *mr = used_mr_vec[i]; ++ if (mr == comp_mr || mr->used_mrs.empty ()) ++ continue; ++ bool is_store; ++ tree *mem = simple_mem_ref_in_stmt (mr->stmts[0], &is_store); ++ if (mem == NULL || TREE_CODE (*mem) != MEM_REF) ++ continue; ++ tree base = get_base_address (*mem); ++ tree base_addr = get_mem_ref_address_ssa_name (*mem, base); ++ if (!base_addr) ++ continue; ++ if (dump_file) ++ { ++ dump_base_addr (base_addr); ++ if (base) ++ { ++ fprintf (dump_file, "Base:"); ++ print_generic_expr (dump_file, base); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ if (!TREE_OPERAND (base, 1)) ++ continue; ++ poly_offset_int curr_offset = mem_ref_offset (base); ++ poly_offset_int saved_offset = 0; ++ if (offset_map.count (base_addr)) ++ { ++ saved_offset = offset_map[base_addr]; ++ if ((dist_val > 0 && known_gt (curr_offset, saved_offset)) ++ || (dist_val < 0 && known_lt (curr_offset, saved_offset))) ++ offset_map[base_addr] = curr_offset; ++ else if (dump_file) ++ fprintf (dump_file, "Off: step=%ld gt=%d lt=%d\n", dist_val, ++ known_gt (curr_offset, saved_offset), ++ known_lt (curr_offset, saved_offset)); ++ } ++ else ++ offset_map[base_addr] = curr_offset; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Final list of base addresses:\n"); ++ for (tree_poly_offset_map::iterator it1 = offset_map.begin (); ++ it1 != offset_map.end (); ++it1) ++ { ++ tree base_addr = it1->first; ++ poly_offset_int off = it1->second; ++ fprintf (dump_file, "Base:"); ++ print_generic_expr (dump_file, base_addr); ++ HOST_WIDE_INT val = estimated_poly_value (off.force_shwi (), ++ POLY_VALUE_LIKELY); ++ fprintf (dump_file, "\nOff: %ld\n", val); ++ } ++ fprintf (dump_file, "Finish collecting base addresses.\n"); ++ } ++} ++ ++/* Return true if we need page check to access memory at this address. */ ++ ++static bool ++need_page_check (tree base_addr, tree_set &checked_base_addrs) ++{ ++ if (dump_file) ++ dump_base_addr (base_addr); ++ if (base_addr == NULL) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Base address not found\n"); ++ return false; ++ } ++ if (checked_base_addrs.count (base_addr)) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Base address is already checked\n"); ++ return false; ++ } ++ return true; ++} ++ ++/* Insert instructions to check the original address and newly evaluated ++ adress for prefetch correspond the same page. 
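++ Both addresses are masked with ~(param_ipa_prefetch_pagesize - 1) and compared for equality, so the prefetch is only issued when it stays on the same page as the original access.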
*/ ++ ++static gimple * ++insert_page_check (tree addr, tree_poly_offset_map &offset_map, ++ gimple_seq &stmts) ++{ ++ poly_offset_int offset = 0; ++ if (offset_map.count (addr)) ++ offset = offset_map[addr]; ++ tree addr_type = TREE_TYPE (addr); ++ tree utype = unsigned_type_for (addr_type); ++ tree new_addr = build_int_cst (addr_type, 0); ++ if (decl_map->count (addr)) ++ new_addr = (*decl_map)[addr]; ++ tree t1 = make_ssa_name (utype); ++ tree t2 = make_ssa_name (utype); ++ unsigned long long pmask = ~(param_ipa_prefetch_pagesize - 1); ++ tree pmask_cst = build_int_cst (utype, pmask); ++ tree off_tree = wide_int_to_tree (sizetype, offset); ++ gcc_assert (TREE_CODE (addr_type) == POINTER_TYPE); ++ tree addr_with_offset = gimple_build (&stmts, POINTER_PLUS_EXPR, ++ addr_type, addr, off_tree); ++ tree conv_addr = make_ssa_name (utype); ++ tree conv_new_addr = make_ssa_name (utype); ++ gimple *conv1 = gimple_build_assign (conv_addr, ++ fold_convert (utype, addr_with_offset)); ++ gimple *conv2 = gimple_build_assign (conv_new_addr, ++ fold_convert (utype, new_addr)); ++ gimple *paddr = gimple_build_assign (t1, BIT_AND_EXPR, ++ conv_addr, pmask_cst); ++ gimple *new_paddr = gimple_build_assign (t2, BIT_AND_EXPR, ++ conv_new_addr, pmask_cst); ++ gcond *cond = gimple_build_cond (EQ_EXPR, t1, t2, NULL, NULL); ++ gimple_seq_add_stmt (&stmts, conv1); ++ gimple_seq_add_stmt (&stmts, paddr); ++ gimple_seq_add_stmt (&stmts, conv2); ++ gimple_seq_add_stmt (&stmts, new_paddr); ++ gimple_seq_add_stmt (&stmts, cond); ++ return cond; ++} ++ ++/* Check if this array access needs dynamic address verification. Support only ++ arrays with 1-d indexing. */ ++ ++static bool ++need_array_index_check (tree mem) ++{ ++ /* Check pattern: t1 = (type) t0; ld/st array[t1]. If any index of type (t0) ++ does not go beyond the bounds of the array, we don't need the check. */ ++ tree array = TREE_OPERAND (mem, 0); ++ tree atype = TREE_TYPE (array); ++ tree index = TREE_OPERAND (mem, 1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Array ind: "); ++ print_generic_expr (dump_file, index); ++ fprintf (dump_file, "\nMem: "); ++ print_generic_expr (dump_file, array); ++ fprintf (dump_file, "\nInd type: "); ++ print_generic_expr (dump_file, TREE_TYPE (index)); ++ fprintf (dump_file, "\nMem type: "); ++ print_generic_expr (dump_file, atype); ++ fprintf (dump_file, "\n"); ++ } ++ tree domain = TYPE_DOMAIN (atype); ++ if (!domain || !tree_fits_uhwi_p (TYPE_MIN_VALUE (domain)) ++ || !tree_fits_uhwi_p (TYPE_MAX_VALUE (domain))) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Unsupported array type domain.\n"); ++ return true; ++ } ++ unsigned HOST_WIDE_INT min_val = tree_to_uhwi (TYPE_MIN_VALUE (domain)); ++ unsigned HOST_WIDE_INT max_val = tree_to_uhwi (TYPE_MAX_VALUE (domain)); ++ if (dump_file) ++ fprintf (dump_file, "Array bounds (%ld, %ld)\n", min_val, max_val); ++ if (TREE_CODE (index) != SSA_NAME) ++ return true; ++ ++ gimple *stmt = SSA_NAME_DEF_STMT (index); ++ if (!is_gimple_assign (stmt)) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Is not assign, stop analysis: "); ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ } ++ return true; ++ } ++ tree *lhs = gimple_assign_lhs_ptr (stmt); ++ tree *rhs = gimple_assign_rhs1_ptr (stmt); ++ tree lhs_type = TREE_TYPE (*lhs); ++ tree rhs_type = TREE_TYPE (*rhs); ++ tree ind_type = (TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)) ++ ? 
lhs_type : rhs_type; ++ if (!ind_type || !tree_fits_uhwi_p (TYPE_MIN_VALUE (ind_type)) ++ || !tree_fits_uhwi_p (TYPE_MAX_VALUE (ind_type))) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Unsupported index type.\n"); ++ return true; ++ } ++ int prec = tree_to_uhwi (TYPE_SIZE (ind_type)); ++ unsigned HOST_WIDE_INT t_max_val = tree_to_uhwi (TYPE_MAX_VALUE (ind_type)); ++ unsigned HOST_WIDE_INT t_min_val = tree_to_uhwi (TYPE_MIN_VALUE (ind_type)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Index type (%d, %ld, %ld): ", prec, ++ t_min_val, t_max_val); ++ print_generic_expr (dump_file, ind_type); ++ fprintf (dump_file, "\n"); ++ } ++ return !((t_max_val <= max_val) && (t_min_val >= min_val)); ++} ++ ++/* Insert instructions to check that the new index is within the array bounds. */ ++ ++static gimple * ++insert_index_check (tree mem, gimple_seq &stmts) ++{ ++ if (dump_file) ++ fprintf (dump_file, "Insert array index check\n"); ++ tree atype = TREE_TYPE (TREE_OPERAND (mem, 0)); ++ tree ind = TREE_OPERAND (mem, 1); ++ if (decl_map->count (ind)) ++ ind = (*decl_map)[ind]; ++ tree domain = TYPE_DOMAIN (atype); ++ gcc_assert (domain && tree_fits_uhwi_p (TYPE_MIN_VALUE (domain)) ++ && tree_fits_uhwi_p (TYPE_MAX_VALUE (domain))); ++ ++ tree ind_min_val = TYPE_MIN_VALUE (domain); ++ tree ind_max_val = TYPE_MAX_VALUE (domain); ++ tree t1 = make_ssa_name (boolean_type_node); ++ tree t2 = make_ssa_name (boolean_type_node); ++ tree t3 = make_ssa_name (boolean_type_node); ++ t1 = fold_build2 (LE_EXPR, boolean_type_node, ind, ind_max_val); ++ t2 = fold_build2 (GE_EXPR, boolean_type_node, ind, ind_min_val); ++ t3 = fold_build2 (TRUTH_ANDIF_EXPR, boolean_type_node, t1, t2); ++ gcond *cond = gimple_build_cond (EQ_EXPR, t3, boolean_true_node, NULL, NULL); ++ gimple_seq_add_stmt (&stmts, cond); ++ return cond; ++} ++ ++/* Insert safety checks for memory access stmts newly created to evaluate ++ prefetch addresses. */ ++ ++static void ++process_used_mr (memref_t *mr, tree_poly_offset_map &offset_map, ++ tree_set &checked_base_addrs, gimple_seq &stmts, ++ vec<gimple *> &bbends) ++{ ++ bool is_store; ++ tree *mem = simple_mem_ref_in_stmt (mr->stmts[0], &is_store); ++ if (mem == NULL) ++ return; ++ if (dump_file) ++ { ++ fprintf (dump_file, "MR (%d) may need an address check: ", ++ mr->mr_id); ++ print_generic_expr (dump_file, *mem); ++ fprintf (dump_file, "\n"); ++ } ++ gimple *bbend = NULL; ++ if (TREE_CODE (*mem) == MEM_REF) ++ { ++ tree base = get_base_address (*mem); ++ tree base_addr = get_mem_ref_address_ssa_name (*mem, base); ++ if (!need_page_check (base_addr, checked_base_addrs)) ++ return; ++ bbend = insert_page_check (base_addr, offset_map, stmts); ++ checked_base_addrs.insert (base_addr); ++ } ++ else if (TREE_CODE (*mem) == ARRAY_REF && need_array_index_check (*mem)) ++ bbend = insert_index_check (*mem, stmts); ++ if (bbend) ++ bbends.safe_push (bbend); ++} ++ ++/* Create new variables and insert new stmts to evaluate prefetch addresses. */ ++ ++static void ++create_stmts_for_used_mrs (vec<memref_t *> &used_mr_vec, vec<gimple *> &bbends, ++ gimple_seq &stmts, stmt_set &processed_stmts, ++ HOST_WIDE_INT dist_val, memref_t *comp_mr) ++{ ++ tree_poly_offset_map offset_map; ++ collect_base_addresses (used_mr_vec, dist_val, comp_mr, offset_map); ++ ++ /* Insert stmts to evaluate prefetch addresses.
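++ Roughly, for each used MR except comp_mr itself: first remap its address ++ computation up to the last stmt, then, if other MRs hang off it, guard the ++ newly built access with a page or array-index check (process_used_mr), and ++ only then copy the remaining stmt.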
*/ ++ tree_set checked_base_addrs; ++ for (unsigned int i = 0; i < used_mr_vec.length (); i++) ++ { ++ memref_t *mr = used_mr_vec[i]; ++ if (mr == comp_mr) ++ continue; ++ gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, 1, ++ processed_stmts); ++ if (last_stmt && dump_file) ++ { ++ fprintf (dump_file, "MR (%d) new mem: ", mr->mr_id); ++ print_generic_expr (dump_file, gimple_assign_lhs (last_stmt)); ++ fprintf (dump_file, "\n"); ++ } ++ if (!mr->used_mrs.empty ()) ++ process_used_mr (mr, offset_map, checked_base_addrs, stmts, bbends); ++ last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, 0, ++ processed_stmts); ++ } ++} ++ ++/* Insert prefetch instructions. */ ++ ++static void ++insert_prefetch_stmts (vec<gimple *> &pcalls, gimple_seq &stmts, ++ gimple *&last_pref, vec<memref_t *> &vmrs, ++ stmt_set &processed_stmts) ++{ ++ if (dump_file) ++ fprintf (dump_file, "Evaluate addresses and insert prefetch insns.\n"); ++ ++ tree local; ++ switch (param_ipa_prefetch_locality) ++ { ++ case 0: ++ local = integer_zero_node; ++ break; ++ case 1: ++ local = integer_one_node; ++ break; ++ case 2: ++ local = build_int_cst (integer_type_node, 2); ++ break; ++ default: ++ case 3: ++ local = integer_three_node; ++ break; ++ } ++ tree_set prefetched_addrs; ++ for (unsigned int i = 0; i < vmrs.length (); i++) ++ { ++ memref_t *mr = vmrs[i]; ++ /* Don't need to copy the last stmt, since we insert prefetch insn ++ instead of it. */ ++ gimple_copy_and_remap_memref_stmts (mr, stmts, 0, 1, processed_stmts); ++ gimple *last_stmt = mr->stmts[0]; ++ gcc_assert (last_stmt); ++ ++ tree old_addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE); ++ tree new_addr = old_addr; ++ if (decl_map->count (old_addr)) ++ new_addr = (*decl_map)[old_addr]; ++ if (prefetched_addrs.count (new_addr)) ++ continue; ++ /* Insert prefetch intrinsic call. */ ++ tree write_p = mr->is_store ? integer_one_node : integer_zero_node; ++ last_pref = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, new_addr, write_p, local); ++ pcalls.safe_push (last_pref); ++ gimple_seq_add_stmt (&stmts, last_pref); ++ prefetched_addrs.insert (new_addr); ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "Insert %d prefetch stmt:\n", i); ++ print_gimple_stmt (dump_file, last_pref, 0); ++ } ++ } ++} ++ ++/* Split bbs after condition stmts and fix control flow graph. */ ++ ++static void ++correct_cfg (vec<gimple *> &bbends, gimple *last_pref, basic_block &dom_bb) ++{ ++ edge e_last = split_block (dom_bb, last_pref); ++ if (!bbends.length () || last_pref == NULL) ++ return; ++ for (int i = bbends.length () - 1; i >= 0; i--) ++ { ++ gimple *bbend = bbends[i]; ++ if (dump_file) ++ { ++ fprintf (dump_file, "Split dom_bb after condition stmts:\n"); ++ print_gimple_stmt (dump_file, bbend, 0); ++ } ++ basic_block last_bb = e_last->dest; ++ edge e = split_block (dom_bb, bbend); ++ e->flags &= ~EDGE_FALLTHRU; ++ e->flags |= EDGE_TRUE_VALUE; ++ edge e_false = make_edge (dom_bb, last_bb, EDGE_FALSE_VALUE); ++ e_false->probability = profile_probability::never (); ++ } ++} ++ + static void + create_cgraph_edge (cgraph_node *n, gimple *stmt) + { +@@ -1529,6 +1970,17 @@ create_cgraph_edge (cgraph_node *n, gimple *stmt) + ipa_call_summaries->get_create (e); + } + ++/* Modify cgraph inserting calls to prefetch intrinsics.
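++ (A sketch, assuming the usual cgraph API: this amounts to ++ n->create_edge (cgraph_node::get_create (callee_decl), call, count) ++ for each generated call, followed by ipa_update_overall_fn_summary (n), ++ so the new __builtin_prefetch calls stay visible to later IPA passes.)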
*/ ++ ++static void ++modify_ipa_info (cgraph_node *n, vec<gimple *> &pcalls) ++{ ++ for (unsigned i = 0; i < pcalls.length (); i++) ++ create_cgraph_edge (n, pcalls[i]); ++ ipa_update_overall_fn_summary (n); ++ renumber_gimple_stmt_uids (DECL_STRUCT_FUNCTION (n->decl)); ++} ++ + /* Insert prefetch intrinsics in this function, return nonzero on success. */ + + static int +@@ -1607,6 +2059,18 @@ optimize_function (cgraph_node *n, function *fn) + return 0; + } + ++ vec<memref_t *> used_mr_vec = vNULL; ++ for (memref_set::const_iterator it = used_mrs.begin (); ++ it != used_mrs.end (); it++) ++ used_mr_vec.safe_push (*it); ++ used_mr_vec.qsort (memref_id_cmp); ++ if (!check_prefetch_safety (used_mr_vec, comp_mr)) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Prefetching may be unsafe. Skip the case.\n"); ++ return 0; ++ } ++ + /* Filter out memrefs with the same memory references. + TODO: maybe do the same with used mrs. */ + vec<memref_t *> vmrs = vNULL; +@@ -1616,18 +2080,18 @@ optimize_function (cgraph_node *n, function *fn) + /* TODO: maybe it is useful to process also used_mrs. */ + basic_block dom_bb = NULL; + for (unsigned int i = 0; i < vmrs.length (); i++) +- find_nearest_common_dominator (vmrs[i], dom_bb); ++ find_nearest_common_post_dominator (vmrs[i], dom_bb); + + if (!dom_bb) + { + if (dump_file) +- fprintf (dump_file, "Dominator bb for MRs is not found. " ++ fprintf (dump_file, "Post dominator bb for MRs is not found. " + "Skip the case.\n"); + return 0; + } + else if (dump_file) + { +- fprintf (dump_file, "Dominator bb %d for MRs:\n", dom_bb->index); ++ fprintf (dump_file, "Post dominator bb %d for MRs:\n", dom_bb->index); + gimple_dump_bb (dump_file, dom_bb, 0, dump_flags); + fprintf (dump_file, "\n"); + } +@@ -1636,19 +2100,33 @@ optimize_function (cgraph_node *n, function *fn) + gimple *last_used = NULL; + for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); + gsi_prev (&si)) +- if (comp_mr->stmts[0] == gsi_stmt (si)) +- { +- last_used = gsi_stmt (si); +- if (dump_file) ++ { ++ bool found = false; ++ for (unsigned int i = 0; i < vmrs.length (); i++) ++ /* TODO: take into account only those MRs whose memory should be ++ checked. */ ++ if (vmrs[i]->stmts[0] == gsi_stmt (si)) + { +- fprintf (dump_file, "Last used stmt in dominator bb:\n"); +- print_gimple_stmt (dump_file, last_used, 0); ++ found = true; ++ break; + } +- break; +- } ++ if (found || comp_mr->stmts[0] == gsi_stmt (si)) ++ { ++ last_used = gsi_stmt (si); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Last used stmt in post dominator bb:\n"); ++ print_gimple_stmt (dump_file, last_used, 0); ++ } ++ break; ++ } ++ } + +- split_block (dom_bb, last_used); +- gimple_stmt_iterator gsi = gsi_last_bb (dom_bb); ++ gimple_stmt_iterator gsi; ++ if (last_used) ++ gsi = gsi_for_stmt (last_used); ++ else ++ gsi = gsi_last_bb (dom_bb); + + /* Create new inc var. Insert new_var = old_var + step * factor. */ + decl_map = new tree_map; +@@ -1660,7 +2138,7 @@ optimize_function (cgraph_node *n, function *fn) + stmt_set processed_stmts; + if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts[0]))) + { +- gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, ++ gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, 0, + processed_stmts); + inc_var = gimple_assign_lhs (tmp); + } +@@ -1683,86 +2161,26 @@ optimize_function (cgraph_node *n, function *fn) + fprintf (dump_file, "\n"); + } + +- /* Create other new vars. Insert new stmts.
*/ +- vec<memref_t *> used_mr_vec = vNULL; +- for (memref_set::const_iterator it = used_mrs.begin (); +- it != used_mrs.end (); it++) +- used_mr_vec.safe_push (*it); +- used_mr_vec.qsort (memref_id_cmp); +- +- for (unsigned int j = 0; j < used_mr_vec.length (); j++) +- { +- memref_t *mr = used_mr_vec[j]; +- if (mr == comp_mr) +- continue; +- gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, +- processed_stmts); +- gcc_assert (last_stmt); +- if (dump_file) +- { +- fprintf (dump_file, "MR (%d) new mem: ", mr->mr_id); +- print_generic_expr (dump_file, gimple_assign_lhs (last_stmt)); +- fprintf (dump_file, "\n"); +- } +- } +- /* On new load check page fault. */ +- /* Insert prefetch instructions. */ +- if (dump_file) +- fprintf (dump_file, "Evaluate addresses and insert prefetch insn.\n"); ++ vec<gimple *> bbends = vNULL; ++ create_stmts_for_used_mrs (used_mr_vec, bbends, stmts, processed_stmts, ++ dist_val, comp_mr); + + vec<gimple *> pcalls = vNULL; +- tree local; +- switch (param_ipa_prefetch_locality) +- { +- case 0: +- local = integer_zero_node; +- break; +- case 1: +- local = integer_one_node; +- break; +- case 2: +- local = build_int_cst (integer_type_node, 2); +- break; +- default: +- case 3: +- local = integer_three_node; +- break; +- } +- tree_set prefetched_addrs; +- for (unsigned int j = 0; j < vmrs.length (); j++) +- { +- memref_t *mr = vmrs[j]; +- /* Don't need to copy the last stmt, since we insert prefetch insn +- instead of it. */ +- gimple_copy_and_remap_memref_stmts (mr, stmts, 1, processed_stmts); +- gimple *last_stmt = mr->stmts[0]; +- gcc_assert (last_stmt); +- tree write_p = mr->is_store ? integer_one_node : integer_zero_node; +- tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE); +- if (decl_map->count (addr)) +- addr = (*decl_map)[addr]; +- if (prefetched_addrs.count (addr)) +- continue; +- last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), +- 3, addr, write_p, local); +- pcalls.safe_push (last_stmt); +- gimple_seq_add_stmt (&stmts, last_stmt); +- prefetched_addrs.insert (addr); +- if (dump_file) +- { +- fprintf (dump_file, "Insert %d prefetch stmt:\n", j); +- print_gimple_stmt (dump_file, last_stmt, 0); +- } +- } +- ++ gimple *last_pref = NULL; ++ insert_prefetch_stmts (pcalls, stmts, last_pref, vmrs, processed_stmts); + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ ++ correct_cfg (bbends, last_pref, dom_bb); ++ + delete decl_map; + +- /* Modify cgraph inserting calls to prefetch intrinsics.
*/ +- for (unsigned i = 0; i < pcalls.length (); i++) +- create_cgraph_edge (n, pcalls[i]); +- ipa_update_overall_fn_summary (n); +- renumber_gimple_stmt_uids (DECL_STRUCT_FUNCTION (n->decl)); ++ modify_ipa_info (n, pcalls); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "After optimization:\n"); ++ dump_function_to_file (cfun->decl, dump_file, (dump_flags_t)0); ++ } + + return 1; + } +@@ -1781,8 +2199,10 @@ insert_prefetch () + fprintf (dump_file, "Optimize function %s\n", n->dump_name ()); + push_cfun (DECL_STRUCT_FUNCTION (n->decl)); + calculate_dominance_info (CDI_DOMINATORS); ++ calculate_dominance_info (CDI_POST_DOMINATORS); + res |= optimize_function (n, fn); + free_dominance_info (CDI_DOMINATORS); ++ free_dominance_info (CDI_POST_DOMINATORS); + pop_cfun (); + } + return res; +diff --git a/gcc/params.opt b/gcc/params.opt +index 747d0f829..fc700ab79 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -317,6 +317,10 @@ The factor represents the number of inductive variable incrementations to evalua + Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) IntegerRange(0, 3) Param Optimization + The flag represents temporal locality value between 0 and 3, the higher value means the higher temporal locality in the data. + ++-param=ipa-prefetch-pagesize= ++Common Joined UInteger Var(param_ipa_prefetch_pagesize) Init(4096) Param Optimization ++The flag represents the current page size for runtime checks of memory access addresses. ++ + -param=ira-loop-reserved-regs= + Common Joined UInteger Var(param_ira_loop_reserved_regs) Init(2) Param Optimization + The number of registers in each class kept unused by loop invariant motion. +-- +2.33.0 + diff --git a/0288-Enable-macro-use-commandline.patch b/0288-Enable-macro-use-commandline.patch new file mode 100644 index 0000000000000000000000000000000000000000..cafe01b5c6851ce46b31cb2e97b778be3ddb029e --- /dev/null +++ b/0288-Enable-macro-use-commandline.patch @@ -0,0 +1,207 @@ +From 7a578a8725f8fd7d92fcbbac14841ea7e8d0870f Mon Sep 17 00:00:00 2001 +From: zhangxiaohua +Date: Sun, 25 Aug 2024 23:08:53 +0800 +Subject: [PATCH 157/157] Enable macro-use-commandline + +Signed-off-by: zhangxiaohua +--- + gcc/c-family/c-opts.cc | 4 +++ + gcc/c-family/c.opt | 4 +++ + gcc/doc/cppopts.texi | 4 +++ + gcc/doc/invoke.texi | 1 + + .../gcc.dg/cpp/macro-use-cmdline-1.c | 26 ++++++++++++++ + .../gcc.dg/cpp/macro-use-cmdline-2.c | 34 +++++++++++++++++++ + libcpp/include/cpplib.h | 3 ++ + libcpp/init.cc | 1 + + libcpp/macro.cc | 16 ++++++++- + 9 files changed, 92 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-1.c + create mode 100644 gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-2.c + +diff --git a/gcc/c-family/c-opts.cc b/gcc/c-family/c-opts.cc +index 5134f6128..744b54dc3 100644 +--- a/gcc/c-family/c-opts.cc ++++ b/gcc/c-family/c-opts.cc +@@ -527,6 +527,10 @@ c_common_handle_option (size_t scode, const char *arg, HOST_WIDE_INT value, + cpp_opts->track_macro_expansion = 2; + break; + ++ case OPT_fmacro_use_commandline: ++ cpp_opts->macro_use_commandline = 1; ++ break; ++ + case OPT_fexec_charset_: + cpp_opts->narrow_charset = arg; + break; +diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt +index 07da40ef4..a36c27f07 100644 +--- a/gcc/c-family/c.opt ++++ b/gcc/c-family/c.opt +@@ -2012,6 +2012,10 @@ ftrack-macro-expansion= + C ObjC C++ ObjC++ JoinedOrMissing RejectNegative UInteger + -ftrack-macro-expansion=<0|1|2> Track locations of tokens coming from macro expansion and display them
in error messages. + ++fmacro-use-commandline ++C ObjC C++ ObjC++ JoinedOrMissing RejectNegative UInteger ++Preferentially use options from the commandline. ++ + fpretty-templates + C++ ObjC++ Var(flag_pretty_templates) Init(1) + Do not pretty-print template specializations as the template signature followed by the arguments. +diff --git a/gcc/doc/cppopts.texi b/gcc/doc/cppopts.texi +index c0a92b370..8c8a81eac 100644 +--- a/gcc/doc/cppopts.texi ++++ b/gcc/doc/cppopts.texi +@@ -277,6 +277,10 @@ correct column numbers in warnings or errors, even if tabs appear on the + line. If the value is less than 1 or greater than 100, the option is + ignored. The default is 8. + ++@item -fmacro-use-commandline ++@opindex fmacro-use-commandline ++Preferentially use options from the command line. ++ + @item -ftrack-macro-expansion@r{[}=@var{level}@r{]} + @opindex ftrack-macro-expansion + Track locations of tokens across macro expansions. This allows the +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index bdd8b9429..2ff7d860d 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -630,6 +630,7 @@ Objective-C and Objective-C++ Dialects}. + -fexec-charset=@var{charset} -fextended-identifiers @gol + -finput-charset=@var{charset} -flarge-source-files @gol + -fmacro-prefix-map=@var{old}=@var{new} -fmax-include-depth=@var{depth} @gol ++-fmacro-use-commandline @gol + -fno-canonical-system-headers -fpch-deps -fpch-preprocess @gol + -fpreprocessed -ftabstop=@var{width} -ftrack-macro-expansion @gol + -fwide-exec-charset=@var{charset} -fworking-directory @gol +diff --git a/gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-1.c b/gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-1.c +new file mode 100644 +index 000000000..f85d9c268 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-1.c +@@ -0,0 +1,26 @@ ++/* ++ { dg-options "-fmacro-use-commandline -DTEST_MACRO=1 -DTEST_MACRO=20" } ++ { dg-do compile } ++ { dg-do run } ++*/ ++ ++/* { dg-warning "-:redefined" "redef TEST_MACRO" { target *-*-* } 0 } ++ { dg-message "-:previous" "prev def TEST_MACRO" { target *-*-* } 0 } ++*/ ++ ++#if DEBUG ++extern int puts (const char *); ++#else ++#define puts(X) ++#endif ++extern void abort (void); ++ ++#define err(str) do { puts(str); abort(); } while (0) ++ ++int main (int argc, char *argv[]) ++{ ++ int macroValue = TEST_MACRO; ++ if (macroValue != 20) ++ err("macroValue"); ++ return 0; ++} +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-2.c b/gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-2.c +new file mode 100644 +index 000000000..99d92d1e4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/cpp/macro-use-cmdline-2.c +@@ -0,0 +1,34 @@ ++/* ++ { dg-options "-fmacro-use-commandline -DTEST_MACRO=1" } ++ { dg-do compile } ++ { dg-do run } ++*/ ++ ++#define TEST_MACRO 300 ++#define TEST_MACRO_1 400 ++/* ++ { dg-warning "-:redefined" "redef TEST_MACRO" { target *-*-* } 7 } ++ { dg-message "-:previous" "prev def TEST_MACRO" { target *-*-* } 0 } ++*/ ++ ++#if DEBUG ++extern int puts (const char *); ++#else ++#define puts(X) ++#endif ++ ++extern void abort (void); ++ ++#define err(str) do { puts(str); abort(); } while (0) ++ ++int main (int argc, char *argv[]) ++{ ++ int macroValue = TEST_MACRO; ++ if (macroValue != 1) ++ err("macroValue"); ++ ++ int macroValue1 = TEST_MACRO_1; ++ if (macroValue1 != 400) ++ err("macroValue1"); ++ return 0; ++} +\ No newline at end of file +diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h +index 3eba6f74b..c6101ca01 100644 +--- 
a/libcpp/include/cpplib.h ++++ b/libcpp/include/cpplib.h +@@ -471,6 +471,9 @@ struct cpp_options + consumes the highest amount of memory. */ + unsigned char track_macro_expansion; + ++ /* Use the options on the command line first. */ ++ unsigned char macro_use_commandline; ++ + /* Nonzero means handle C++ alternate operator names. */ + unsigned char operator_names; + +diff --git a/libcpp/init.cc b/libcpp/init.cc +index f4ab83d21..47be60a36 100644 +--- a/libcpp/init.cc ++++ b/libcpp/init.cc +@@ -215,6 +215,7 @@ cpp_create_reader (enum c_lang lang, cpp_hash_table *table, + cpp_options::track_macro_expansion to learn about the other + values. */ + CPP_OPTION (pfile, track_macro_expansion) = 2; ++ CPP_OPTION (pfile, macro_use_commandline) = 0; + CPP_OPTION (pfile, warn_normalize) = normalized_C; + CPP_OPTION (pfile, warn_literal_suffix) = 1; + CPP_OPTION (pfile, canonical_system_headers) +diff --git a/libcpp/macro.cc b/libcpp/macro.cc +index 8ebf360c0..aa9e4ffa6 100644 +--- a/libcpp/macro.cc ++++ b/libcpp/macro.cc +@@ -3852,7 +3852,21 @@ _cpp_create_definition (cpp_reader *pfile, cpp_hashnode *node) + node->value.macro->line, 0, + "this is the location of the previous definition"); + } +- _cpp_free_definition (node); ++#define LOCATION_FROM_LINEMAP 0 ++#define MIN_LINE_OF_MACRO_BEEN_OVERRIDDEN 96 ++#define MAX_LINE_OF_MACRO_BEEN_OVERRIDDEN 128 ++ if (CPP_OPTION (pfile, macro_use_commandline) ++ && node->value.macro->line >= MIN_LINE_OF_MACRO_BEEN_OVERRIDDEN ++ && node->value.macro->line <= MAX_LINE_OF_MACRO_BEEN_OVERRIDDEN ++ && pfile->forced_token_location == LOCATION_FROM_LINEMAP) ++ { ++ cpp_pedwarning_with_line (pfile, CPP_W_NONE, ++ node->value.macro->line, 0, ++ "use the previous definition from the command line"); ++ return false; ++ } ++ else ++ _cpp_free_definition (node); + } + + /* Enter definition in hash table. */ +-- +2.33.0 + diff --git a/0289-tree-ssa-loop-crc.cc-TARGET_CRC32-may-be-not-defined.patch b/0289-tree-ssa-loop-crc.cc-TARGET_CRC32-may-be-not-defined.patch new file mode 100644 index 0000000000000000000000000000000000000000..05818083d7a37e65e0e4e43ca980d3f49391cc39 --- /dev/null +++ b/0289-tree-ssa-loop-crc.cc-TARGET_CRC32-may-be-not-defined.patch @@ -0,0 +1,35 @@ +From 63f99f46e851aecc070496a0e688a0d118c820a4 Mon Sep 17 00:00:00 2001 +From: YunQiang Su +Date: Mon, 2 Sep 2024 17:57:52 +0800 +Subject: [PATCH] tree-ssa-loop-crc.cc: TARGET_CRC32 may be not defined + +TARGET_CRC32 may not be defined on some architectures; RISC-V is one example. +--- + gcc/tree-ssa-loop-crc.cc | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/gcc/tree-ssa-loop-crc.cc b/gcc/tree-ssa-loop-crc.cc +index b9c2f71ca..7eee9446d 100644 +--- a/gcc/tree-ssa-loop-crc.cc ++++ b/gcc/tree-ssa-loop-crc.cc +@@ -1227,6 +1227,9 @@ convert_to_new_loop (class loop *loop) + static unsigned int + tree_ssa_loop_crc () + { ++#ifndef TARGET_CRC32 ++ return 0; ++#else + if (TARGET_CRC32 == false) + { + warning (OPT____,"The loop-crc optimization is not working." \ +@@ -1269,6 +1272,7 @@ tree_ssa_loop_crc () + } + } + return todo; ++#endif + } + + /* Loop crc.
*/ +-- +2.33.0 + diff --git a/0290-Add-ipa-prefetch-test-for-gcc-s-case.patch b/0290-Add-ipa-prefetch-test-for-gcc-s-case.patch new file mode 100644 index 0000000000000000000000000000000000000000..4545420167bc764595b22b12d7ce486786325429 --- /dev/null +++ b/0290-Add-ipa-prefetch-test-for-gcc-s-case.patch @@ -0,0 +1,209 @@ +From 0534ae05fc313c0d449b48ffe3e01642b644e6d2 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Fri, 6 Sep 2024 10:40:50 +0800 +Subject: [PATCH 1/2] Add ipa-prefetch test for gcc's case + +--- + gcc/ipa-prefetch.cc | 4 +- + gcc/testsuite/gcc.dg/ipa/ipa-prefetch-gcc.c | 167 ++++++++++++++++++++ + 2 files changed, 170 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-prefetch-gcc.c + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index b000d4d75..8e628390b 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -1668,6 +1668,8 @@ static gimple * + insert_page_check (tree addr, tree_poly_offset_map &offset_map, + gimple_seq &stmts) + { ++ if (dump_file) ++ fprintf (dump_file, "Insert page check.\n"); + poly_offset_int offset = 0; + if (offset_map.count (addr)) + offset = offset_map[addr]; +@@ -1783,7 +1785,7 @@ static gimple * + insert_index_check (tree mem, gimple_seq &stmts) + { + if (dump_file) +- fprintf (dump_file, "Insert array index check\n"); ++ fprintf (dump_file, "Insert array index check.\n"); + tree atype = TREE_TYPE (TREE_OPERAND (mem, 0)); + tree ind = TREE_OPERAND (mem, 1); + if (decl_map->count (ind)) +diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-gcc.c b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-gcc.c +new file mode 100644 +index 000000000..f1001c350 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-gcc.c +@@ -0,0 +1,167 @@ ++/* { dg-do link } */ ++/* { dg-options "-O3 -fipa-prefetch -flto -flto-partition=one -fdump-ipa-ipa_prefetch" } */ ++/* { dg-require-effective-target lto } */ ++ ++/* Based on open source GCC code. */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <stdbool.h> ++ ++#define SPARSESET_ELT_TYPE unsigned int ++#define ALLOCNO_NUM(A) ((A)->num) ++ ++typedef struct sparseset_def ++{ ++ SPARSESET_ELT_TYPE *dense; /* Dense array. */ ++ SPARSESET_ELT_TYPE *sparse; /* Sparse array. */ ++ SPARSESET_ELT_TYPE members; /* Number of elements. */ ++ SPARSESET_ELT_TYPE size; /* Maximum number of elements. */ ++ SPARSESET_ELT_TYPE iter; /* Iterator index. */ ++ unsigned char iter_inc; /* Iteration increment amount. */ ++ bool iterating; ++ SPARSESET_ELT_TYPE elms[2]; /* Combined dense and sparse arrays. */ ++} *sparseset; ++ ++struct ira_allocno ++{ ++ /* The allocno order number starting with 0. Each allocno has an ++ unique number and the number is never changed for the ++ allocno. */ ++ int num; ++ /* Regno for allocno or cap. */ ++ int regno; ++ /*...*/ ++}; ++ ++typedef struct ira_allocno_live_range *allocno_live_range_t; ++typedef struct ira_allocno *ira_allocno_t; ++ ++struct ira_allocno_live_range ++{ ++ /* Allocno whose live range is described by given structure. */ ++ ira_allocno_t allocno; ++ /* Program point range. */ ++ int start, finish; ++ /* Next structure describing program points where the allocno ++ lives. */ ++ allocno_live_range_t next; ++ /* Pointer to structures with the same start/finish.
*/ ++ allocno_live_range_t start_next, finish_next; ++}; ++ ++bool ++sparseset_bit_p (sparseset s, SPARSESET_ELT_TYPE e) ++{ ++ SPARSESET_ELT_TYPE idx; ++ ++ idx = s->sparse[e]; ++ ++ return idx < s->members && s->dense[idx] == e; ++} ++ ++bool new_pseudos_p; ++int ira_max_point, ira_allocnos_num; ++allocno_live_range_t *ira_finish_point_ranges; ++ ++static inline void ++sparseset_clear (sparseset s) ++{ ++ s->members = 0; ++ s->iterating = false; ++} ++ ++sparseset ++sparseset_alloc (SPARSESET_ELT_TYPE n_elms) ++{ ++ unsigned int n_bytes = sizeof (struct sparseset_def) ++ + ((n_elms - 1) * 2 * sizeof (SPARSESET_ELT_TYPE)); ++ ++ /* We use xcalloc rather than xmalloc to silence some valgrind uninitialized ++ read errors when accessing set->sparse[n] when "n" is not, and never has ++ been, in the set. These uninitialized reads are expected, by design and ++ harmless. If this turns into a performance problem due to some future ++ additional users of sparseset, we can revisit this decision. */ ++ sparseset set = (sparseset) calloc (1, n_bytes); ++ set->dense = &(set->elms[0]); ++ set->sparse = &(set->elms[n_elms]); ++ set->size = n_elms; ++ sparseset_clear (set); ++ return set; ++} ++ ++void ++sparseset_insert_bit (sparseset s, SPARSESET_ELT_TYPE e, SPARSESET_ELT_TYPE idx) ++{ ++ s->sparse[e] = idx; ++ s->dense[idx] = e; ++} ++ ++void ++sparseset_swap (sparseset s, SPARSESET_ELT_TYPE idx1, SPARSESET_ELT_TYPE idx2) ++{ ++ SPARSESET_ELT_TYPE tmp = s->dense[idx2]; ++ sparseset_insert_bit (s, s->dense[idx1], idx2); ++ sparseset_insert_bit (s, tmp, idx1); ++} ++ ++void __attribute__ ((noinline)) ++sparseset_clear_bit (sparseset s, SPARSESET_ELT_TYPE e) ++{ ++ if (sparseset_bit_p (s, e)) ++ { ++ SPARSESET_ELT_TYPE idx = s->sparse[e]; ++ SPARSESET_ELT_TYPE iter = s->iter; ++ SPARSESET_ELT_TYPE mem = s->members - 1; ++ ++ /* If we are iterating over this set and we want to delete a ++ member we've already visited, then we swap the element we ++ want to delete with the element at the current iteration ++ index so that it plays well together with the code below ++ that actually removes the element. */ ++ if (s->iterating && idx <= iter) ++ { ++ if (idx < iter) ++ { ++ sparseset_swap (s, idx, iter); ++ idx = iter; ++ } ++ s->iter_inc = 0; ++ } ++ ++ /* Replace the element we want to delete with the last element ++ in the dense array and then decrement s->members, effectively ++ removing the element we want to delete. 
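++ A small worked example: for the set {5, 7} we have dense = [5, 7], ++ sparse[5] = 0, sparse[7] = 1 and members == 2. Deleting 5 copies the last ++ dense element over it (dense = [7, 7], sparse[7] = 0) and drops members ++ to 1, so the stale trailing entry is simply never visited again.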
*/ ++ sparseset_insert_bit (s, s->dense[mem], idx); ++ s->members = mem; ++ } ++} ++ ++allocno_live_range_t r; ++sparseset allocnos_live; ++ ++void ++ira_flattening () ++{ ++ int i; ++ ++ if (new_pseudos_p) ++ { ++ allocnos_live = sparseset_alloc (ira_allocnos_num); ++ for (i = 0; i < ira_max_point; i++) ++ { ++ for (r = ira_finish_point_ranges[i]; r != NULL; r = r->finish_next) ++ sparseset_clear_bit (allocnos_live, ALLOCNO_NUM (r->allocno)); ++ } ++ } ++} ++ ++int main() ++{ ++ ira_flattening (); ++ return 0; ++} ++ ++/* { dg-final { scan-wpa-ipa-dump-times "Insert page check" 1 "ipa_prefetch"} } */ ++/* { dg-final { scan-wpa-ipa-dump-times "Insert 0 prefetch stmt:" 1 "ipa_prefetch"} } */ ++/* { dg-final { scan-wpa-ipa-dump-times "Split dom_bb after condition stmts:" 1 "ipa_prefetch"} } */ +-- +2.33.0 + diff --git a/0291-Fix-settings-for-wide-operations-tests.patch b/0291-Fix-settings-for-wide-operations-tests.patch new file mode 100644 index 0000000000000000000000000000000000000000..1e368b6d4a9ee7dc54af81a9af071f73f3c96ad5 --- /dev/null +++ b/0291-Fix-settings-for-wide-operations-tests.patch @@ -0,0 +1,73 @@ +From 411792b0bbb63715d8e90d46eb4f0d9c810ce8ba Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 +Date: Tue, 3 Sep 2024 21:26:03 +0800 +Subject: [PATCH 2/2] Fix settings for wide operations tests + +Signed-off-by: lin-houzhong +--- + gcc/testsuite/gcc.dg/double_sized_mul-1.c | 8 +++++--- + gcc/testsuite/gcc.dg/double_sized_mul-2.c | 9 +++++---- + gcc/testsuite/gcc.dg/uaddsub.c | 6 ++++-- + 3 files changed, 14 insertions(+), 9 deletions(-) + +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +index d32a25223..b848e02de 100644 +--- a/gcc/testsuite/gcc.dg/double_sized_mul-1.c ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +@@ -1,7 +1,8 @@ +-/* { dg-do compile } */ ++/* { dg-do compile { target aarch64*-*-* x86_64*-*-*} } */ + /* fif-conversion-gimple and fuaddsub-overflow-match-all are required for + proper overflow detection in some cases. */ +-/* { dg-options "-O2 -fif-conversion-gimple -march=armv8.2-a -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++/* { dg-additional-options "-march=armv8.2-a" { target aarch64*-*-* } } */ + #include <stdint.h> + + typedef unsigned __int128 uint128_t; +@@ -138,4 +139,5 @@ uint128_t mul128_perm (uint64_t a, uint64_t b) + return res; + } + +-/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */ ++/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" { target aarch64*-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 4 "widening_mul" { target x86_64*-*-* } } } */ +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c +index ff35902b7..cf8f0aedd 100644 +--- a/gcc/testsuite/gcc.dg/double_sized_mul-2.c ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c +@@ -1,7 +1,8 @@ +-/* { dg-do compile } */ +-/* fif-conversion-gimple is required for proper overflow detection +- in some cases. */ ++/* { dg-do compile { target aarch64*-*-* x86_64*-*-*} } */ ++/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for ++ proper overflow detection in some cases.
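++ As a sketch, the shape being detected is the schoolbook expansion of ++ uint128_t res = (uint128_t) a * b; ++ for 64-bit a and b, which widening_mul should contract into a single ++ double-size multiply (counted by the "double sized mul optimized" stat).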
*/ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++/* { dg-additional-options "-march=armv8.2-a" { target aarch64*-*-* } } */ + #include <stdint.h> + + typedef unsigned __int128 uint128_t; +diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c +index 96c26d308..dcb587fc8 100644 +--- a/gcc/testsuite/gcc.dg/uaddsub.c ++++ b/gcc/testsuite/gcc.dg/uaddsub.c +@@ -1,5 +1,6 @@ +-/* { dg-do compile } */ ++/* { dg-do compile { target aarch64*-*-* x86_64-*-* } } */ + /* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */ ++/* { dg-additional-options "-march=armv8.2-a" { target aarch64*-*-* } } */ + #include <stdint.h> + + typedef unsigned __int128 uint128_t; +@@ -140,4 +141,5 @@ uint256_t sub256 (uint128_t a, uint128_t b) + } + + /* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */ +-/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" { target aarch64*-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 4 "optimized" { target x86_64*-*-* } } } */ +-- +2.33.0 + diff --git a/0292-Fix-errors-in-ipa-prefetch-IAORPF-and-IAOSJ0.patch b/0292-Fix-errors-in-ipa-prefetch-IAORPF-and-IAOSJ0.patch new file mode 100644 index 0000000000000000000000000000000000000000..13341df6672c078b4bc6e3cfba77b18e2c763634 --- /dev/null +++ b/0292-Fix-errors-in-ipa-prefetch-IAORPF-and-IAOSJ0.patch @@ -0,0 +1,42 @@ +From 808294bf0f32aaff1cc7e56a756b246d328b3402 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Fri, 6 Sep 2024 11:10:03 +0800 +Subject: [PATCH 2/3] Fix errors in ipa-prefetch (IAORPF and IAOSJ0) + +Signed-off-by: Diachkov Ilia +--- + gcc/ipa-prefetch.cc | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index b000d4d75..74af55af0 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -1681,7 +1681,8 @@ insert_page_check (tree addr, tree_poly_offset_map &offset_map, + unsigned long long pmask = ~(param_ipa_prefetch_pagesize - 1); + tree pmask_cst = build_int_cst (utype, pmask); + tree off_tree = wide_int_to_tree (sizetype, offset); +- gcc_assert (TREE_CODE (addr_type) == POINTER_TYPE); ++ gcc_assert (TREE_CODE (addr_type) == POINTER_TYPE ++ || TREE_CODE (addr_type) == REFERENCE_TYPE); + tree addr_with_offset = gimple_build (&stmts, POINTER_PLUS_EXPR, + addr_type, addr, off_tree); + tree conv_addr = make_ssa_name (utype); +@@ -2082,11 +2083,11 @@ optimize_function (cgraph_node *n, function *fn) + for (unsigned int i = 0; i < vmrs.length (); i++) + find_nearest_common_post_dominator (vmrs[i], dom_bb); + +- if (!dom_bb) ++ if (!dom_bb || dom_bb->index == ENTRY_BLOCK || dom_bb->index == EXIT_BLOCK) + { + if (dump_file) +- fprintf (dump_file, "Post dominator bb for MRs is not found. " +- "Skip the case.\n"); ++ fprintf (dump_file, "Post dominator bb for MRs is not found or " ++ "it's an entry/exit block.
Skip the case.\n"); + return 0; + } + else if (dump_file) +-- +2.33.0 + diff --git a/0293-Fix-error-with-stmts-insertion-in-ipa-prefetch-for-I.patch b/0293-Fix-error-with-stmts-insertion-in-ipa-prefetch-for-I.patch new file mode 100644 index 0000000000000000000000000000000000000000..3c9ec2575784c8c42d58935fa61dd66807e1686f --- /dev/null +++ b/0293-Fix-error-with-stmts-insertion-in-ipa-prefetch-for-I.patch @@ -0,0 +1,51 @@ +From bfb77997f423ffe3bdcbd8bb8d7f739fe51ce4f5 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Fri, 6 Sep 2024 11:36:11 +0800 +Subject: [PATCH 3/3] Fix error with stmts insertion in ipa-prefetch (for + IAO6R3) + +Signed-off-by: Diachkov Ilia +--- + gcc/ipa-prefetch.cc | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index b000d4d75..6190c2ebb 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -2096,7 +2096,7 @@ optimize_function (cgraph_node *n, function *fn) + fprintf (dump_file, "\n"); + } + +- /* Try to find comp_mr's stmt in the dominator bb. */ ++ /* Try to find comp_mr's stmt in the post dominator bb. */ + gimple *last_used = NULL; + for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); + gsi_prev (&si)) +@@ -2168,7 +2168,22 @@ optimize_function (cgraph_node *n, function *fn) + vec<gimple *> pcalls = vNULL; + gimple *last_pref = NULL; + insert_prefetch_stmts (pcalls, stmts, last_pref, vmrs, processed_stmts); +- gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ ++ gimple *gstmt = gsi_stmt (gsi); ++ bool insert_after = last_used || gstmt == NULL || !is_ctrl_stmt (gstmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Insert prefetch sequence %s stmt:\n", ++ insert_after ? "after": "before"); ++ if (gstmt) ++ print_gimple_stmt (dump_file, gstmt, 0); ++ else ++ fprintf (dump_file, "(no stmts)\n"); ++ } ++ if (insert_after) ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ else ++ gsi_insert_seq_before (&gsi, stmts, GSI_NEW_STMT); + + correct_cfg (bbends, last_pref, dom_bb); + +-- +2.33.0 + diff --git a/0294-Fix-errors-in-ipa-prefetch-IAO50J-and-IAO5H7.patch b/0294-Fix-errors-in-ipa-prefetch-IAO50J-and-IAO5H7.patch new file mode 100644 index 0000000000000000000000000000000000000000..43a88b8a4f5c5dd482deb6f23f77e4d47885141d --- /dev/null +++ b/0294-Fix-errors-in-ipa-prefetch-IAO50J-and-IAO5H7.patch @@ -0,0 +1,80 @@ +From cd79fc29d2cdb73836f8699355113e94b833e0e0 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Wed, 11 Sep 2024 17:18:58 +0800 +Subject: [PATCH 2/2] Fix errors in ipa-prefetch (IAO50J and IAO5H7) + +Signed-off-by: Diachkov Ilia +--- + gcc/ipa-prefetch.cc | 35 ++++++++++++++++++++++++++++++----- + 1 file changed, 30 insertions(+), 5 deletions(-) + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index 5184687aa..685f9c267 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -2099,6 +2099,18 @@ optimize_function (cgraph_node *n, function *fn) + fprintf (dump_file, "\n"); + } + ++ /* Check that all used MRs dominate the found post dominator bb. This case ++ may be supported later by copying MR evaluation to the bb. */ ++ for (unsigned int i = 0; i < used_mr_vec.length (); i++) ++ if (!dominated_by_p (CDI_DOMINATORS, dom_bb, ++ gimple_bb (used_mr_vec[i]->stmts[0]))) ++ { ++ if (dump_file) ++ fprintf (dump_file, "MR's (%d) bb does not dominate the found bb %d. " ++ "Skip the case.\n", used_mr_vec[i]->mr_id, dom_bb->index); ++ return 0; ++ } ++ + /* Try to find comp_mr's stmt in the post dominator bb.
*/ + gimple *last_used = NULL; + for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); + gsi_prev (&si)) +@@ -2133,17 +2145,29 @@ optimize_function (cgraph_node *n, function *fn) + + /* Create new inc var. Insert new_var = old_var + step * factor. */ + decl_map = new tree_map; +- gcc_assert (comp_mr->stmts[0] && gimple_assign_single_p (comp_mr->stmts[0])); +- tree inc_var = gimple_assign_lhs (comp_mr->stmts[0]); ++ gimple *old_inc_stmt = comp_mr->stmts[0]; ++ gcc_assert (old_inc_stmt && gimple_assign_single_p (old_inc_stmt)); ++ tree inc_var = gimple_assign_lhs (old_inc_stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Old inc stmt: "); ++ print_gimple_stmt (dump_file, old_inc_stmt, 0); ++ } + /* If old_var definition dominates the current use, just use it, otherwise + evaluate it just before new inc var evaluation. */ + gimple_seq stmts = NULL; + stmt_set processed_stmts; +- if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts[0]))) ++ tree local_inc_var = inc_var; ++ if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (old_inc_stmt))) + { + gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, 0, + processed_stmts); +- inc_var = gimple_assign_lhs (tmp); ++ local_inc_var = gimple_assign_lhs (tmp); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Localized old inc stmt: "); ++ print_gimple_stmt (dump_file, tmp, 0); ++ } + } + tree var_type = TREE_TYPE (inc_var); + enum tree_code inc_code; +@@ -2155,7 +2179,8 @@ optimize_function (cgraph_node *n, function *fn) + HOST_WIDE_INT dist_val = tree_to_shwi (step) + * param_ipa_prefetch_distance_factor; + tree dist = build_int_cst (TREE_TYPE (step), dist_val); +- tree new_inc_var = gimple_build (&stmts, inc_code, var_type, inc_var, dist); ++ tree new_inc_var = gimple_build (&stmts, inc_code, var_type, local_inc_var, ++ dist); + (*decl_map)[inc_var] = new_inc_var; + if (dump_file) + { +-- +2.33.0 + diff --git a/0295-Fix-error-with-grouped_load-merge-in-slp-transpose-v.patch b/0295-Fix-error-with-grouped_load-merge-in-slp-transpose-v.patch new file mode 100644 index 0000000000000000000000000000000000000000..8540cd4aca03f8077c98480d442cc194836ea137 --- /dev/null +++ b/0295-Fix-error-with-grouped_load-merge-in-slp-transpose-v.patch @@ -0,0 +1,30 @@ +From 7b4cce4896cefefedba9545a9633585e086b7621 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= +Date: Wed, 11 Sep 2024 18:26:22 +0800 +Subject: [PATCH 1/2] Fix error with grouped_load merge in + slp-transpose-vectorize (for IALR8B) + +--- + gcc/tree-vect-slp.cc | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc +index e3e246977..d4870de43 100644 +--- a/gcc/tree-vect-slp.cc ++++ b/gcc/tree-vect-slp.cc +@@ -3807,7 +3807,11 @@ vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited, + these two grouped loads need to be merged.
*/ + tree opb = get_op_base_address (first_element); + unsigned int grp_size_b = DR_GROUP_SIZE (first_element); +- if (opa == opb && grp_size_a == grp_size_b) ++ /* Ensure that the elements merged into the load group meet the alignment condition (dr_misalignment). */ ++ HOST_WIDE_INT diff = 0; ++ diff = (TREE_INT_CST_LOW (DR_INIT (first_element->dr_aux.dr)) ++ - TREE_INT_CST_LOW (DR_INIT (merge_first_element->dr_aux.dr))); ++ if (opa == opb && grp_size_a == grp_size_b && diff >= 0) + { + res.safe_push (first_element); + visited[i] = true; +-- +2.33.0 + diff --git a/0296-Fix-error-in-slp-transpose-vectorize-for-IAQFM3.patch b/0296-Fix-error-in-slp-transpose-vectorize-for-IAQFM3.patch new file mode 100644 index 0000000000000000000000000000000000000000..34862f283b12674816bd1fd597c62a6101312055 --- /dev/null +++ b/0296-Fix-error-in-slp-transpose-vectorize-for-IAQFM3.patch @@ -0,0 +1,28 @@ +From b3a6a170bf1dc0e460e98a7fd02c92e6b036784a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= +Date: Fri, 13 Sep 2024 14:13:07 +0800 +Subject: [PATCH 2/2] Fix error in slp-transpose-vectorize (for IAQFM3) + +--- + gcc/tree-vect-slp.cc | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc +index d4870de43..d7e198dff 100644 +--- a/gcc/tree-vect-slp.cc ++++ b/gcc/tree-vect-slp.cc +@@ -3811,7 +3811,10 @@ vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited, + HOST_WIDE_INT diff = 0; + diff = (TREE_INT_CST_LOW (DR_INIT (first_element->dr_aux.dr)) + - TREE_INT_CST_LOW (DR_INIT (merge_first_element->dr_aux.dr))); +- if (opa == opb && grp_size_a == grp_size_b && diff >= 0) ++ if (opa == opb ++ && grp_size_a == grp_size_b ++ && diff >= 0 ++ && check_same_bb (first_element, merge_first_element)) + { + res.safe_push (first_element); + visited[i] = true; +-- +2.33.0 + diff --git a/0297-Fix-grouped-load-merging-error-in-SLP-transpose-vectorization.patch b/0297-Fix-grouped-load-merging-error-in-SLP-transpose-vectorization.patch new file mode 100644 index 0000000000000000000000000000000000000000..21a24c0f4bcaa8f6dce6f75c5be868871a9c1ea0 --- /dev/null +++ b/0297-Fix-grouped-load-merging-error-in-SLP-transpose-vectorization.patch @@ -0,0 +1,26 @@ +From 8b30d71f881e15bfbc514f9b65fee178610e1536 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= +Date: Wed, 18 Sep 2024 10:48:55 +0800 +Subject: [PATCH] Fix error in slp-transpose-vectorize (for IARHFM) + +--- + gcc/tree-vect-slp.cc | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc +index d7e198dff..fbd638333 100644 +--- a/gcc/tree-vect-slp.cc ++++ b/gcc/tree-vect-slp.cc +@@ -3814,7 +3814,8 @@ vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited, + if (opa == opb + && grp_size_a == grp_size_b + && diff >= 0 +- && check_same_bb (first_element, merge_first_element)) ++ && check_same_bb (first_element, merge_first_element) ++ && DR_PTR_INFO (first_element->dr_aux.dr) != DR_PTR_INFO (merge_first_element->dr_aux.dr)) + { + res.safe_push (first_element); + visited[i] = true; +-- +2.33.0 + diff --git a/0298-Mark-prefetch-builtin-as-willreturn.patch b/0298-Mark-prefetch-builtin-as-willreturn.patch new file mode 100644 index 0000000000000000000000000000000000000000..7a489a5d9b0d7e1c7454a66a02ef59ac805532a6 --- /dev/null +++ b/0298-Mark-prefetch-builtin-as-willreturn.patch @@ -0,0 +1,99 @@ +From a252bbd11d22481a1e719ed36d800e2192abb369 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander +Date: Thu, 31 Oct
2024 15:49:27 +0800 +Subject: [PATCH 1/6] Mark prefetch builtin as willreturn + +Signed-off-by: Pronin Alexander +--- + gcc/common.opt | 4 ++++ + gcc/gimple.cc | 30 ++++++++++++++++++++++++++++++ + gcc/gimple.h | 1 + + gcc/tree-ssa-pre.cc | 4 +--- + 4 files changed, 36 insertions(+), 3 deletions(-) + +diff --git a/gcc/common.opt b/gcc/common.opt +index 688d65e4d..be5fcc681 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1313,6 +1313,10 @@ fdelete-null-pointer-checks + Common Var(flag_delete_null_pointer_checks) Init(-1) Optimization + Delete useless null pointer checks. + ++fbuiltin-will-return ++Common Var(flag_builtin_will_return) Optimization ++Consider some of the builtins as definitely returning. ++ + fdevirtualize-at-ltrans + Common Var(flag_ltrans_devirtualize) + Stream extra data to support more aggressive devirtualization in LTO local transformation mode. +diff --git a/gcc/gimple.cc b/gcc/gimple.cc +index 9e62da426..04ca9f161 100644 +--- a/gcc/gimple.cc ++++ b/gcc/gimple.cc +@@ -2998,6 +2998,36 @@ nonbarrier_call_p (gimple *call) + return false; + } + ++static inline bool ++will_return_builtin_p (gimple *call) ++{ ++ if (!flag_builtin_will_return) ++ return false; ++ ++ if (!gimple_call_builtin_p (call, BUILT_IN_NORMAL)) ++ return false; ++ ++ switch (DECL_FUNCTION_CODE (gimple_call_fndecl (call))) ++ { ++ case BUILT_IN_PREFETCH: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++bool ++will_return_call_p (gimple *call, function *fun) ++{ ++ int flags = gimple_call_flags (call); ++ if (!(flags & (ECF_CONST|ECF_PURE)) ++ || (flags & ECF_LOOPING_CONST_OR_PURE) ++ || stmt_can_throw_external (fun, call)) ++ return will_return_builtin_p (call); ++ ++ return true; ++} ++ + /* Callback for walk_stmt_load_store_ops. + + Return TRUE if OP will dereference the tree stored in DATA, FALSE +diff --git a/gcc/gimple.h b/gcc/gimple.h +index 77a5a07e9..bb05a7664 100644 +--- a/gcc/gimple.h ++++ b/gcc/gimple.h +@@ -1628,6 +1628,7 @@ extern bool gimple_asm_clobbers_memory_p (const gasm *); + extern void dump_decl_set (FILE *, bitmap); + extern bool nonfreeing_call_p (gimple *); + extern bool nonbarrier_call_p (gimple *); ++extern bool will_return_call_p (gimple *, function *); + extern bool infer_nonnull_range (gimple *, tree); + extern bool infer_nonnull_range_by_dereference (gimple *, tree); + extern bool infer_nonnull_range_by_attribute (gimple *, tree); +diff --git a/gcc/tree-ssa-pre.cc b/gcc/tree-ssa-pre.cc +index 98134b5d3..b5264133a 100644 +--- a/gcc/tree-ssa-pre.cc ++++ b/gcc/tree-ssa-pre.cc +@@ -3988,9 +3988,7 @@ compute_avail (function *fun) + that forbids hoisting possibly trapping expressions + before it. */ + int flags = gimple_call_flags (stmt); +- if (!(flags & (ECF_CONST|ECF_PURE)) +- || (flags & ECF_LOOPING_CONST_OR_PURE) +- || stmt_can_throw_external (fun, stmt)) ++ if (!will_return_call_p (stmt, fun)) + /* Defer setting of BB_MAY_NOTRETURN to avoid it + influencing the processing of the call itself. 
*/ + set_bb_may_notreturn = true; +-- +2.33.0 + diff --git a/0299-Backport-Disallow-pointer-operands-for-and-partly-PR.patch b/0299-Backport-Disallow-pointer-operands-for-and-partly-PR.patch new file mode 100644 index 0000000000000000000000000000000000000000..c0a733c1ad49595a8e148a4f51f1df371a84eb46 --- /dev/null +++ b/0299-Backport-Disallow-pointer-operands-for-and-partly-PR.patch @@ -0,0 +1,156 @@ +From 3b109376d057342a31267ea4c9bd422d940874cb Mon Sep 17 00:00:00 2001 +From: Jakub Jelinek +Date: Thu, 31 Oct 2024 16:09:43 +0800 +Subject: [PATCH 2/6] [Backport]Disallow pointer operands for |,^ and partly + &[PR106878] + +Signed-off-by: Jakub Jelinek +--- + gcc/match.pd | 6 ++++- + .../gcc.c-torture/compile/pr106878.c | 15 +++++++++++++ + gcc/tree-cfg.cc | 22 ++++++++++++++++--- + gcc/tree-ssa-reassoc.cc | 16 +++++++++++++- + 4 files changed, 54 insertions(+), 5 deletions(-) + create mode 100644 gcc/testsuite/gcc.c-torture/compile/pr106878.c + +diff --git a/gcc/match.pd b/gcc/match.pd +index 8f41c292f..822e065e8 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -1655,6 +1655,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + && (int_fits_type_p (@1, TREE_TYPE (@0)) + || tree_nop_conversion_p (TREE_TYPE (@0), type))) + || types_match (@0, @1)) ++ && !POINTER_TYPE_P (TREE_TYPE (@0)) ++ && TREE_CODE (TREE_TYPE (@0)) != OFFSET_TYPE + /* ??? This transform conflicts with fold-const.cc doing + Convert (T)(x & c) into (T)x & (T)c, if c is an integer + constants (if x has signed type, the sign bit cannot be set +@@ -1691,7 +1693,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + (if (GIMPLE + && TREE_CODE (@1) != INTEGER_CST + && tree_nop_conversion_p (type, TREE_TYPE (@2)) +- && types_match (type, @0)) ++ && types_match (type, @0) ++ && !POINTER_TYPE_P (TREE_TYPE (@0)) ++ && TREE_CODE (TREE_TYPE (@0)) != OFFSET_TYPE) + (bitop @0 (convert @1))))) + + (for bitop (bit_and bit_ior) +diff --git a/gcc/testsuite/gcc.c-torture/compile/pr106878.c b/gcc/testsuite/gcc.c-torture/compile/pr106878.c +new file mode 100644 +index 000000000..c84571894 +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/compile/pr106878.c +@@ -0,0 +1,15 @@ ++/* PR tree-optimization/106878 */ ++ ++typedef __INTPTR_TYPE__ intptr_t; ++typedef __UINTPTR_TYPE__ uintptr_t; ++int a; ++ ++int ++foo (const int *c) ++{ ++ uintptr_t d = ((intptr_t) c | (intptr_t) &a) & 65535 << 16; ++ intptr_t e = (intptr_t) c; ++ if (d != (e & 65535 << 16)) ++ return 1; ++ return 0; ++} +diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc +index 48b52f785..d33aaec8c 100644 +--- a/gcc/tree-cfg.cc ++++ b/gcc/tree-cfg.cc +@@ -4163,7 +4163,9 @@ verify_gimple_assign_binary (gassign *stmt) + case ROUND_MOD_EXPR: + case RDIV_EXPR: + case EXACT_DIV_EXPR: +- /* Disallow pointer and offset types for many of the binary gimple. */ ++ case BIT_IOR_EXPR: ++ case BIT_XOR_EXPR: ++ /* Disallow pointer and offset types for many of the binary gimple. */ + if (POINTER_TYPE_P (lhs_type) + || TREE_CODE (lhs_type) == OFFSET_TYPE) + { +@@ -4178,9 +4180,23 @@ verify_gimple_assign_binary (gassign *stmt) + + case MIN_EXPR: + case MAX_EXPR: +- case BIT_IOR_EXPR: +- case BIT_XOR_EXPR: ++ /* Continue with generic binary expression handling. */ ++ break; ++ + case BIT_AND_EXPR: ++ if (POINTER_TYPE_P (lhs_type) ++ && TREE_CODE (rhs2) == INTEGER_CST) ++ break; ++ /* Disallow pointer and offset types for many of the binary gimple. 
*/ ++ if (POINTER_TYPE_P (lhs_type) ++ || TREE_CODE (lhs_type) == OFFSET_TYPE) ++ { ++ error ("invalid types for %qs", code_name); ++ debug_generic_expr (lhs_type); ++ debug_generic_expr (rhs1_type); ++ debug_generic_expr (rhs2_type); ++ return true; ++ } + /* Continue with generic binary expression handling. */ + break; + +diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc +index e3d521e32..6baef4764 100644 +--- a/gcc/tree-ssa-reassoc.cc ++++ b/gcc/tree-ssa-reassoc.cc +@@ -3617,10 +3617,14 @@ optimize_range_tests_cmp_bitwise (enum tree_code opcode, int first, int length, + tree type2 = NULL_TREE; + bool strict_overflow_p = false; + candidates.truncate (0); ++ if (POINTER_TYPE_P (type1)) ++ type1 = pointer_sized_int_node; + for (j = i; j; j = chains[j - 1]) + { + tree type = TREE_TYPE (ranges[j - 1].exp); + strict_overflow_p |= ranges[j - 1].strict_overflow_p; ++ if (POINTER_TYPE_P (type)) ++ type = pointer_sized_int_node; + if ((b % 4) == 3) + { + /* For the signed < 0 cases, the types should be +@@ -3651,6 +3655,8 @@ optimize_range_tests_cmp_bitwise (enum tree_code opcode, int first, int length, + tree type = TREE_TYPE (ranges[j - 1].exp); + if (j == k) + continue; ++ if (POINTER_TYPE_P (type)) ++ type = pointer_sized_int_node; + if ((b % 4) == 3) + { + if (!useless_type_conversion_p (type1, type)) +@@ -3680,7 +3686,7 @@ optimize_range_tests_cmp_bitwise (enum tree_code opcode, int first, int length, + op = r->exp; + continue; + } +- if (id == l) ++ if (id == l || POINTER_TYPE_P (TREE_TYPE (op))) + { + code = (b % 4) == 3 ? BIT_NOT_EXPR : NOP_EXPR; + g = gimple_build_assign (make_ssa_name (type1), code, op); +@@ -3704,6 +3710,14 @@ optimize_range_tests_cmp_bitwise (enum tree_code opcode, int first, int length, + gimple_seq_add_stmt_without_update (&seq, g); + op = gimple_assign_lhs (g); + } ++ type1 = TREE_TYPE (ranges[k - 1].exp); ++ if (POINTER_TYPE_P (type1)) ++ { ++ gimple *g ++ = gimple_build_assign (make_ssa_name (type1), NOP_EXPR, op); ++ gimple_seq_add_stmt_without_update (&seq, g); ++ op = gimple_assign_lhs (g); ++ } + candidates.pop (); + if (update_range_test (&ranges[k - 1], NULL, candidates.address (), + candidates.length (), opcode, ops, op, +-- +2.33.0 + diff --git a/0300-Remove-erroneous-pattern-from-gimple-ifcvt.patch b/0300-Remove-erroneous-pattern-from-gimple-ifcvt.patch new file mode 100644 index 0000000000000000000000000000000000000000..0eca175156e2190e6135a0e1fb80b979df4f8a7b --- /dev/null +++ b/0300-Remove-erroneous-pattern-from-gimple-ifcvt.patch @@ -0,0 +1,55 @@ +From 91ef8899a80e493042fd2687ad89064c9f90cf17 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander +Date: Thu, 31 Oct 2024 16:14:34 +0800 +Subject: [PATCH 3/6] Remove erroneous pattern from gimple ifcvt + +Signed-off-by: Pronin Alexander +--- + gcc/match.pd | 2 +- + gcc/testsuite/gcc.dg/ifcvt-gimple-1.c | 21 +++++++++++++++++++++ + 2 files changed, 22 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple-1.c + +diff --git a/gcc/match.pd b/gcc/match.pd +index 8f41c292f..2dd6581d1 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -4276,7 +4276,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + ) + + (if (flag_if_conversion_gimple) +- (for simple_op (plus minus bit_and bit_ior bit_xor) ++ (for simple_op (plus minus bit_ior bit_xor) + (simplify + (cond @0 (simple_op @1 INTEGER_CST@2) @1) + (switch +diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple-1.c b/gcc/testsuite/gcc.dg/ifcvt-gimple-1.c +new file mode 100644 +index 000000000..381a4ad51 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.dg/ifcvt-gimple-1.c +@@ -0,0 +1,21 @@ ++/* { dg-do run } */ ++/* { dg-options "-O2 -fno-inline -fif-conversion-gimple" } */ ++ ++#include <stdlib.h> ++ ++void foo(int a, int *p) { ++ *p = a; ++} ++ ++void verify (int a) { ++ if (a != 3) ++ abort (); ++} ++ ++int main() { ++ int a = 0; ++ foo (3, &a); ++ int tmp = (a > 7) ? a & 1 : a; ++ verify (tmp); ++ return 0; ++} +-- +2.33.0 + diff --git a/0301-Add-required-check-for-iteration-through-uses.patch b/0301-Add-required-check-for-iteration-through-uses.patch new file mode 100644 index 0000000000000000000000000000000000000000..105f4f75616777a5f8e3437645f41c75bc7b5d2b --- /dev/null +++ b/0301-Add-required-check-for-iteration-through-uses.patch @@ -0,0 +1,33 @@ +From ca24d352e98e357f4f7b8f0d262201765705a08a Mon Sep 17 00:00:00 2001 +From: Pronin Alexander +Date: Thu, 31 Oct 2024 16:31:33 +0800 +Subject: [PATCH 4/6] Add required check for iteration through uses + +Signed-off-by: Pronin Alexander +--- + gcc/tree-ssa-math-opts.cc | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc +index 2c06b8a60..80c06fa01 100644 +--- a/gcc/tree-ssa-math-opts.cc ++++ b/gcc/tree-ssa-math-opts.cc +@@ -4938,8 +4938,13 @@ convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt) + + /* Find the mult low part getter. */ + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3]) +- if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR) +- break; ++ { ++ if (!is_gimple_assign (use_stmt)) ++ continue; ++ ++ if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR) ++ break; ++ } + + /* Create high and low (if needed) parts extractors. */ + /* Low part. */ +-- +2.33.0 + diff --git a/0302-Added-param-for-optimization-for-merging-bb-s-with-c.patch b/0302-Added-param-for-optimization-for-merging-bb-s-with-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..da25a9e25c950f625e6e27963f2ab5c54f33d32f --- /dev/null +++ b/0302-Added-param-for-optimization-for-merging-bb-s-with-c.patch @@ -0,0 +1,158 @@ +From 210147e28d542a03588ba3c3fa473301a03bb687 Mon Sep 17 00:00:00 2001 +From: Gmyrikov Konstantin +Date: Thu, 31 Oct 2024 16:45:15 +0800 +Subject: [PATCH 6/6] Added param for optimization for merging bb's with cheap + insns. Zero means the optimization is turned off (default implementation), + one means it is turned on + +Signed-off-by: Gmyrikov Konstantin +--- + gcc/params.opt | 4 +++ + gcc/testsuite/gcc.dg/if_comb1.c | 13 +++++++++ + gcc/testsuite/gcc.dg/if_comb2.c | 13 +++++++++ + gcc/testsuite/gcc.dg/if_comb3.c | 12 +++++++++ + gcc/tree-ssa-ifcombine.cc | 47 ++++++++++++++++++++++++++++++--- + 5 files changed, 86 insertions(+), 3 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/if_comb1.c + create mode 100644 gcc/testsuite/gcc.dg/if_comb2.c + create mode 100644 gcc/testsuite/gcc.dg/if_comb3.c + +diff --git a/gcc/params.opt b/gcc/params.opt +index fc700ab79..3ddfaf5b2 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -789,6 +789,10 @@ Maximum number of VALUEs handled during a single find_base_term call. + Common Joined UInteger Var(param_max_vrp_switch_assertions) Init(10) Param Optimization + Maximum number of assertions to add along the default edge of a switch statement during VRP. + ++-param=merge-assign-stmts-ifcombine= ++Common Joined UInteger Var(param_merge_assign_stmts_ifcombine) Init(0) IntegerRange(0, 1) Param Optimization ++Whether bb's with cheap gimple_assign stmts should be merged in the ifcombine pass.
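++; Illustrative usage (assuming a C translation unit foo.c): ++; gcc -O2 --param=merge-assign-stmts-ifcombine=1 -fdump-tree-ifcombine foo.c ++; as exercised by the if_comb*.c tests added below.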
++ + -param=min-crossjump-insns= + Common Joined UInteger Var(param_min_crossjump_insns) Init(5) IntegerRange(1, 65536) Param Optimization + The minimum number of matching instructions to consider for crossjumping. +diff --git a/gcc/testsuite/gcc.dg/if_comb1.c b/gcc/testsuite/gcc.dg/if_comb1.c +new file mode 100644 +index 000000000..e00adc37d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/if_comb1.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-options "-Ofast -S --param=merge-assign-stmts-ifcombine=1 -fdump-tree-ifcombine" } */ ++ ++int foo (double a, double b, int c) ++{ ++ if (c < 10 || a - b > 1.0) ++ return 0; ++ else ++ return 1; ++} ++ ++/* { dg-final { scan-tree-dump "optimizing two comparisons" "ifcombine"} } */ ++/* { dg-final { scan-tree-dump "Merging blocks" "ifcombine"} } */ +diff --git a/gcc/testsuite/gcc.dg/if_comb2.c b/gcc/testsuite/gcc.dg/if_comb2.c +new file mode 100644 +index 000000000..176e7e726 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/if_comb2.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-options "-Ofast -S --param=merge-assign-stmts-ifcombine=1 -fdump-tree-ifcombine" } */ ++ ++int foo (int a, int b, int c) ++{ ++ if (a > 1 || b * c < 10) ++ return 0; ++ else ++ return 1; ++} ++ ++/* { dg-final { scan-tree-dump "optimizing two comparisons" "ifcombine"} } */ ++/* { dg-final { scan-tree-dump "Merging blocks" "ifcombine"} } */ +diff --git a/gcc/testsuite/gcc.dg/if_comb3.c b/gcc/testsuite/gcc.dg/if_comb3.c +new file mode 100644 +index 000000000..aa2e4510c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/if_comb3.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-options "-Ofast -S --param=merge-assign-stmts-ifcombine=1 -fdump-tree-ifcombine" } */ ++ ++int foo (int a, int b, int c) ++{ ++ if (a > 1 && b + c < 10) ++ a++; ++ return a; ++} ++ ++/* { dg-final { scan-tree-dump "optimizing two comparisons" "ifcombine"} } */ ++/* { dg-final { scan-tree-dump "Merging blocks" "ifcombine"} } */ +diff --git a/gcc/tree-ssa-ifcombine.cc b/gcc/tree-ssa-ifcombine.cc +index ce9bbebf9..264a8bcae 100644 +--- a/gcc/tree-ssa-ifcombine.cc ++++ b/gcc/tree-ssa-ifcombine.cc +@@ -110,6 +110,18 @@ recognize_if_then_else (basic_block cond_bb, + return true; + } + ++/* Verify whether a gimple insn is cheap for the ++ param=merge-assign-stmts-ifcombine optimization. */ ++ ++bool is_insn_cheap (enum tree_code t) ++{ ++ static enum tree_code cheap_insns[] = {MULT_EXPR, PLUS_EXPR, MINUS_EXPR}; ++ for (size_t i = 0; i < sizeof (cheap_insns) / sizeof (cheap_insns[0]); i++) ++ if (t == cheap_insns[i]) ++ return true; ++ return false; ++} ++ + /* Verify if the basic block BB does not have side-effects. Return + true in this case, else false. */ + +@@ -572,9 +584,38 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool inner_inv, + = param_logical_op_non_short_circuit; + if (!logical_op_non_short_circuit || sanitize_coverage_p ()) + return false; +- /* Only do this optimization if the inner bb contains only the conditional. 
*/ +- if (!gsi_one_before_end_p (gsi_start_nondebug_after_labels_bb (inner_cond_bb))) +- return false; ++ if (param_merge_assign_stmts_ifcombine) ++ { ++ int number_cheap_insns = 0; ++ int number_conds = 0; ++ for (auto i = gsi_start_nondebug_after_labels_bb ++ (outer_cond_bb); !gsi_end_p (i); gsi_next_nondebug (&i)) ++ if (gimple_code (gsi_stmt (i)) == GIMPLE_ASSIGN ++ && is_insn_cheap (gimple_assign_rhs_code (gsi_stmt (i)))) ++ number_cheap_insns++; ++ else if (gimple_code (gsi_stmt (i)) == GIMPLE_COND) ++ number_conds++; ++ for (auto i = gsi_start_nondebug_after_labels_bb ++ (inner_cond_bb); !gsi_end_p (i); gsi_next_nondebug (&i)) ++ if (gimple_code (gsi_stmt (i)) == GIMPLE_ASSIGN ++ && is_insn_cheap (gimple_assign_rhs_code (gsi_stmt (i)))) ++ number_cheap_insns++; ++ else if (gimple_code (gsi_stmt (i)) == GIMPLE_COND) ++ number_conds++; ++ if (!(number_cheap_insns == 1 && number_conds == 2) ++ && !gsi_one_before_end_p (gsi_start_nondebug_after_labels_bb ++ (inner_cond_bb))) ++ return false; ++ } ++ else ++ { ++ /* Only do this optimization if the inner bb contains ++ only the conditional. */ ++ if (!gsi_one_before_end_p (gsi_start_nondebug_after_labels_bb ++ (inner_cond_bb))) ++ return false; ++ } ++ + t1 = fold_build2_loc (gimple_location (inner_cond), + inner_cond_code, + boolean_type_node, +-- +2.33.0 + diff --git a/gcc.spec b/gcc.spec index a49e701513400ce5e6ff9a139ae57e5fe108f2ec..449ea3ba4e26e54261920249341f2bc80a5f2633 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 37 +%global gcc_release 39 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -203,187 +203,212 @@ Patch93: 0093-fix-bugs-within-pointer-compression-and-DFE.patch Patch94: 0094-BUGFIX-AutoBOLT-function-miss-bind-type.patch Patch95: 0095-STABS-remove-gstabs-and-gxcoff-functionality.patch Patch96: 0096-Bugfix-Autofdo-use-PMU-sampling-set-num-eauals-den.patch -Patch97: 0097-Backport-SME-AArch64-Cleanup-CPU-option-processing-c.patch -Patch98: 0098-Backport-SME-AArch64-Cleanup-option-processing-code.patch -Patch99: 0099-Backport-SME-aarch64-Add-march-support-for-Armv9.1-A.patch -Patch100: 0100-Backport-SME-Revert-aarch64-Define-__ARM_FEATURE_RCP.patch -Patch101: 0101-Backport-SME-Revert-Ampere-1-and-Ampere-1A-core-defi.patch -Patch102: 0102-Backport-SME-aarch64-Rename-AARCH64_ISA-architecture.patch -Patch103: 0103-Backport-SME-aarch64-Rename-AARCH64_FL-architecture-.patch -Patch104: 0104-Backport-SME-aarch64-Rename-AARCH64_FL_FOR_ARCH-macr.patch -Patch105: 0105-Backport-SME-aarch64-Add-V-to-aarch64-arches.def-nam.patch -Patch106: 0106-Backport-SME-aarch64-Small-config.gcc-cleanups.patch -Patch107: 0107-Backport-SME-aarch64-Avoid-redundancy-in-aarch64-cor.patch -Patch108: 0108-Backport-SME-aarch64-Remove-AARCH64_FL_RCPC8_4-PR107.patch -Patch109: 0109-Backport-SME-aarch64-Fix-transitive-closure-of-featu.patch -Patch110: 0110-Backport-SME-aarch64-Reorder-an-entry-in-aarch64-opt.patch -Patch111: 0111-Backport-SME-aarch64-Simplify-feature-definitions.patch -Patch112: 0112-Backport-SME-aarch64-Simplify-generation-of-.arch-st.patch -Patch113: 0113-Backport-SME-aarch64-Avoid-std-string-in-static-data.patch -Patch114: 0114-Backport-SME-aarch64-Tweak-constness-of-option-relat.patch -Patch115: 0115-Backport-SME-aarch64-Make-more-use-of-aarch64_featur.patch -Patch116: 0116-Backport-SME-aarch64-Tweak-contents-of-flags_on-off-.patch 
-Patch117: 0117-Backport-SME-aarch64-Tweak-handling-of-mgeneral-regs.patch -Patch118: 0118-Backport-SME-aarch64-Remove-redundant-TARGET_-checks.patch -Patch119: 0119-Backport-SME-aarch64-Define-__ARM_FEATURE_RCPC.patch -Patch120: 0120-Backport-SME-Add-Ampere-1-and-Ampere-1A-core-definit.patch -Patch121: 0121-Backport-SME-aarch64-Fix-nosimd-handling-of-FPR-move.patch -Patch122: 0122-Backport-SME-aarch64-Commonise-some-folding-code.patch -Patch123: 0123-Backport-SME-aarch64-Add-a-Z-operand-modifier-for-SV.patch -Patch124: 0124-Backport-SME-mode-switching-Remove-unused-bbnum-fiel.patch -Patch125: 0125-Backport-SME-mode-switching-Tweak-the-macro-hook-doc.patch -Patch126: 0126-Backport-SME-mode-switching-Add-note-problem.patch -Patch127: 0127-Backport-SME-mode-switching-Avoid-quadractic-list-op.patch -Patch128: 0128-Backport-SME-mode-switching-Fix-the-mode-passed-to-t.patch -Patch129: 0129-Backport-SME-mode-switching-Simplify-recording-of-tr.patch -Patch130: 0130-Backport-SME-mode-switching-Tweak-entry-exit-handlin.patch -Patch131: 0131-Backport-SME-mode-switching-Allow-targets-to-set-the.patch -Patch132: 0132-Backport-SME-mode-switching-Pass-set-of-live-registe.patch -Patch133: 0133-Backport-SME-mode-switching-Pass-the-set-of-live-reg.patch -Patch134: 0134-Backport-SME-mode-switching-Use-1-based-edge-aux-fie.patch -Patch135: 0135-Backport-SME-mode-switching-Add-a-target-configurabl.patch -Patch136: 0136-Backport-SME-mode-switching-Add-a-backprop-hook.patch -Patch137: 0137-Backport-SME-aarch64-Add-a-result_mode-helper-functi.patch -Patch138: 0138-Backport-SME-rtl-Try-to-remove-EH-edges-after-pro-ep.patch -Patch139: 0139-Backport-SME-Fix-PR-middle-end-107705-ICE-after-recl.patch -Patch140: 0140-Backport-SME-function-Change-return-type-of-predicat.patch -Patch141: 0141-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch -Patch142: 0142-Backport-SME-Add-a-target-hook-for-sibcall-epilogues.patch -Patch143: 0143-Backport-SME-Add-a-new-target-hook-TARGET_START_CALL.patch -Patch144: 0144-Backport-SME-Allow-targets-to-add-USEs-to-asms.patch -Patch145: 0145-Backport-SME-New-compact-syntax-for-insn-and-insn_sp.patch -Patch146: 0146-Backport-SME-recog-Improve-parser-for-pattern-new-co.patch -Patch147: 0147-Backport-SME-recog-Support-space-in-cons.patch -Patch148: 0148-Backport-SME-aarch64-Generalise-require_immediate_la.patch -Patch149: 0149-Backport-SME-aarch64-Add-backend-support-for-DFP.patch -Patch150: 0150-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch -Patch151: 0151-Backport-SME-aarch64-Simplify-output-template-emissi.patch -Patch152: 0152-Backport-SME-Improve-immediate-expansion-PR106583.patch -Patch153: 0153-Backport-SME-AArch64-Cleanup-move-immediate-code.patch -Patch154: 0154-Backport-SME-AArch64-convert-some-patterns-to-compac.patch -Patch155: 0155-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch -Patch156: 0156-Backport-SME-aarch64-Make-AARCH64_FL_SVE-requirement.patch -Patch157: 0157-Backport-SME-aarch64-Add-group-suffixes-to-SVE-intri.patch -Patch158: 0158-Backport-SME-aarch64-Add-sve_type-to-SVE-builtins-co.patch -Patch159: 0159-Backport-SME-aarch64-Generalise-some-SVE-ACLE-error-.patch -Patch160: 0160-Backport-SME-aarch64-Replace-vague-previous-argument.patch -Patch161: 0161-Backport-SME-aarch64-Make-more-use-of-sve_type-in-AC.patch -Patch162: 0162-Backport-SME-aarch64-Tweak-error-message-for-tuple-v.patch -Patch163: 0163-Backport-SME-aarch64-Add-tuple-forms-of-svreinterpre.patch -Patch164: 0164-Backport-SME-attribs-Use-existing-traits-for-excl_ha.patch -Patch165: 
0165-Backport-SME-Allow-target-attributes-in-non-gnu-name.patch -Patch166: 0166-Backport-SME-aarch64-Fix-plugin-header-install.patch -Patch167: 0167-Backport-SME-aarch64-Add-arm_streaming-_compatible-a.patch -Patch168: 0168-Backport-SME-aarch64-Add-sme.patch -Patch169: 0169-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch -Patch170: 0170-Backport-SME-AArch64-Rewrite-simd-move-immediate-pat.patch -Patch171: 0171-Backport-SME-AArch64-remove-test-comment-from-mov-mo.patch -Patch172: 0172-Backport-SME-aarch64-Distinguish-streaming-compatibl.patch -Patch173: 0173-Backport-SME-aarch64-Mark-relevant-SVE-instructions-.patch -Patch174: 0174-Backport-SME-AArch64-Support-new-tbranch-optab.patch -Patch175: 0175-Backport-SME-aarch64-Use-local-frame-vars-in-shrink-.patch -Patch176: 0176-Backport-SME-aarch64-Avoid-a-use-of-callee_offset.patch -Patch177: 0177-Backport-SME-aarch64-Explicitly-handle-frames-with-n.patch -Patch178: 0178-Backport-SME-aarch64-Add-bytes_below_saved_regs-to-f.patch -Patch179: 0179-Backport-SME-aarch64-Add-bytes_below_hard_fp-to-fram.patch -Patch180: 0180-Backport-SME-aarch64-Robustify-stack-tie-handling.patch -Patch181: 0181-Backport-SME-aarch64-Tweak-aarch64_save-restore_call.patch -Patch182: 0182-Backport-SME-aarch64-Only-calculate-chain_offset-if-.patch -Patch183: 0183-Backport-SME-aarch64-Rename-locals_offset-to-bytes_a.patch -Patch184: 0184-Backport-SME-aarch64-Rename-hard_fp_offset-to-bytes_.patch -Patch185: 0185-Backport-SME-aarch64-Tweak-frame_size-comment.patch -Patch186: 0186-Backport-SME-aarch64-Measure-reg_offset-from-the-bot.patch -Patch187: 0187-Backport-SME-aarch64-Simplify-top-of-frame-allocatio.patch -Patch188: 0188-Backport-SME-aarch64-Minor-initial-adjustment-tweak.patch -Patch189: 0189-Backport-SME-aarch64-Tweak-stack-clash-boundary-cond.patch -Patch190: 0190-Backport-SME-aarch64-Put-LR-save-probe-in-first-16-b.patch -Patch191: 0191-Backport-SME-aarch64-Simplify-probe-of-final-frame-a.patch -Patch192: 0192-Backport-SME-aarch64-Explicitly-record-probe-registe.patch -Patch193: 0193-Backport-SME-aarch64-Remove-below_hard_fp_saved_regs.patch -Patch194: 0194-Backport-SME-aarch64-Make-stack-smash-canary-protect.patch -Patch195: 0195-Backport-SME-Handle-epilogues-that-contain-jumps.patch -Patch196: 0196-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch -Patch197: 0197-Backport-SME-aarch64-Put-LR-save-slot-first-in-more-.patch -Patch198: 0198-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch -Patch199: 0199-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch -Patch200: 0200-Backport-SME-aarch64-Add-a-register-class-for-w12-w1.patch -Patch201: 0201-Backport-SME-aarch64-Add-a-VNx1TI-mode.patch -Patch202: 0202-Backport-SME-aarch64-Generalise-unspec_based_functio.patch -Patch203: 0203-Backport-SME-aarch64-Generalise-_m-rules-for-SVE-int.patch -Patch204: 0204-Backport-SME-aarch64-Add-support-for-arm_sme.h.patch -Patch205: 0205-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch -Patch206: 0206-Backport-SME-aarch64-Handle-PSTATE.SM-across-abnorma.patch -Patch207: 0207-Backport-SME-aarch64-Enforce-inlining-restrictions-f.patch -Patch208: 0208-Backport-SME-aarch64-Update-sibcall-handling-for-SME.patch -Patch209: 0209-Backport-SME-libgcc-aarch64-Configure-check-for-.var.patch -Patch210: 0210-Backport-SME-libgcc-aarch64-Configure-check-for-__ge.patch -Patch211: 0211-Backport-SME-libgcc-aarch64-Add-SME-runtime-support.patch -Patch212: 0212-Backport-SME-libgcc-aarch64-Add-SME-unwinder-support.patch -Patch213: 
0213-Backport-SME-libgcc-Fix-config.in.patch -Patch214: 0214-Backport-SME-aarch64-Add-funwind-tables-to-some-test.patch -Patch215: 0215-Backport-SME-aarch64-Skip-some-SME-register-save-tes.patch -Patch216: 0216-Backport-SME-Add-OPTIONS_H_EXTRA-to-GTFILES.patch -Patch217: 0217-Backport-SME-aarch64-Add-V1DI-mode.patch -Patch218: 0218-Backport-SME-Allow-md-iterators-to-include-other-ite.patch -Patch219: 0219-Backport-SME-riscv-Add-support-for-strlen-inline-exp.patch -Patch220: 0220-Backport-SME-attribs-Add-overloads-with-namespace-na.patch -Patch221: 0221-Backport-SME-vec-Add-array_slice-constructors-from-n.patch -Patch222: 0222-Backport-SME-A-couple-of-va_gc_atomic-tweaks.patch -Patch223: 0223-Backport-SME-middle-end-Fix-issue-of-poly_uint16-1-1.patch -Patch224: 0224-SME-Add-missing-header-file-in-aarch64.cc.patch -Patch225: 0225-Backport-SME-c-Add-support-for-__extension__.patch -Patch226: 0226-Backport-SME-lra-Updates-of-biggest-mode-for-hard-re.patch -Patch227: 0227-Backport-SME-c-Support-C2x-empty-initializer-braces.patch -Patch228: 0228-Backport-SME-aarch64-Update-sizeless-tests-for-recen.patch -Patch229: 0229-Backport-SME-attribs-Namespace-aware-lookup_attribut.patch -Patch230: 0230-Backport-SME-c-family-ICE-with-gnu-nocf_check-PR1069.patch -Patch231: 0231-Backport-SME-AArch64-Fix-assert-in-aarch64_move_imm-.patch -Patch232: 0232-Backport-SME-testsuite-Only-run-fcf-protection-test-.patch -Patch233: 0233-Backport-SME-Fix-PRs-106764-106765-and-107307-all-IC.patch -Patch234: 0234-Backport-SME-aarch64-Remove-expected-error-for-compo.patch -Patch235: 0235-Backport-SME-aarch64-Remove-redundant-builtins-code.patch -Patch236: 0236-Backport-SME-AArch64-Fix-Armv9-a-warnings-that-get-e.patch -Patch237: 0237-Backport-SME-Canonicalize-X-Y-as-X-Y-in-match.pd-whe.patch -Patch238: 0238-Backport-SME-middle-end-Add-new-tbranch-optab-to-add.patch -Patch239: 0239-Backport-SME-explow-Allow-dynamic-allocations-after-.patch -Patch240: 0240-Backport-SME-PR105169-Fix-references-to-discarded-se.patch -Patch241: 0241-Backport-SME-RISC-V-autovec-Verify-that-GET_MODE_NUN.patch -Patch242: 0242-Backport-SME-Add-operator-to-gimple_stmt_iterator-an.patch -Patch243: 0243-Backport-SME-tree-optimization-110221-SLP-and-loop-m.patch -Patch244: 0244-SME-Adapt-some-testsuites.patch -Patch245: 0245-SME-Fix-error-by-backported-patches-and-IPA-prefetch.patch -Patch246: 0246-aarch64-Fix-return-register-handling-in-untyped_call.patch -Patch247: 0247-aarch64-Fix-loose-ldpstp-check.patch -Patch248: 0248-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch -Patch249: 0249-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch -Patch250: 0250-Make-option-mvzeroupper-independent-of-optimization-.patch -Patch251: 0251-i386-Sync-tune_string-with-arch_string-for-target-at.patch -Patch252: 0252-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch -Patch253: 0253-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch -Patch254: 0254-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch -Patch255: 0255-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch -Patch256: 0256-Software-mitigation-Disable-gather-generation-in-vec.patch -Patch257: 0257-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch -Patch258: 0258-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch -Patch259: 0259-Disparage-slightly-for-the-alternative-which-move-DF.patch -Patch260: 0260-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch -Patch261: 0261-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch -Patch262: 
0262-Disable-FMADD-in-chains-for-Zen4-and-generic.patch -Patch263: 0263-Initial-Raptorlake-Support.patch -Patch264: 0264-Initial-Meteorlake-Support.patch -Patch265: 0265-Support-Intel-AMX-FP16-ISA.patch -Patch266: 0266-Support-Intel-prefetchit0-t1.patch -Patch267: 0267-Initial-Granite-Rapids-Support.patch -Patch268: 0268-Support-Intel-AMX-COMPLEX.patch -Patch269: 0269-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch -Patch270: 0270-Initial-Granite-Rapids-D-Support.patch -Patch271: 0271-Correct-Granite-Rapids-D-documentation.patch -Patch272: 0272-i386-Remove-Meteorlake-s-family_model.patch -Patch273: 0273-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch -Patch274: 0274-x86-Update-model-values-for-Raptorlake.patch -Patch275: 0275-Fix-target_clone-arch-graniterapids-d.patch -Patch276: 0276-i386-Change-prefetchi-output-template.patch -Patch277: 0277-i386-Add-non-optimize-prefetchi-intrins.patch +Patch97: 0097-Improve-non-loop-disambiguation.patch +Patch98: 0098-CHREC-multiplication-and-undefined-overflow.patch +Patch99: 0099-Enable-Transposed-SLP.patch +Patch100: 0100-Add-hip09-machine-discribtion.patch +Patch101: 0101-Add-hip11-CPU-pipeline-scheduling.patch +Patch102: 0102-Add-Crc32-Optimization-in-Gzip-For-crc32-algorithm-i.patch +Patch103: 0103-SME-Remove-hip09-and-hip11-in-aarch64-cores.def-to-b.patch +Patch104: 0104-Backport-SME-AArch64-Cleanup-CPU-option-processing-c.patch +Patch105: 0105-Backport-SME-AArch64-Cleanup-option-processing-code.patch +Patch106: 0106-Backport-SME-aarch64-Add-march-support-for-Armv9.1-A.patch +Patch107: 0107-Backport-SME-Revert-aarch64-Define-__ARM_FEATURE_RCP.patch +Patch108: 0108-Backport-SME-Revert-Ampere-1-and-Ampere-1A-core-defi.patch +Patch109: 0109-Backport-SME-aarch64-Rename-AARCH64_ISA-architecture.patch +Patch110: 0110-Backport-SME-aarch64-Rename-AARCH64_FL-architecture-.patch +Patch111: 0111-Backport-SME-aarch64-Rename-AARCH64_FL_FOR_ARCH-macr.patch +Patch112: 0112-Backport-SME-aarch64-Add-V-to-aarch64-arches.def-nam.patch +Patch113: 0113-Backport-SME-aarch64-Small-config.gcc-cleanups.patch +Patch114: 0114-Backport-SME-aarch64-Avoid-redundancy-in-aarch64-cor.patch +Patch115: 0115-Backport-SME-aarch64-Remove-AARCH64_FL_RCPC8_4-PR107.patch +Patch116: 0116-Backport-SME-aarch64-Fix-transitive-closure-of-featu.patch +Patch117: 0117-Backport-SME-aarch64-Reorder-an-entry-in-aarch64-opt.patch +Patch118: 0118-Backport-SME-aarch64-Simplify-feature-definitions.patch +Patch119: 0119-Backport-SME-aarch64-Simplify-generation-of-.arch-st.patch +Patch120: 0120-Backport-SME-aarch64-Avoid-std-string-in-static-data.patch +Patch121: 0121-Backport-SME-aarch64-Tweak-constness-of-option-relat.patch +Patch122: 0122-Backport-SME-aarch64-Make-more-use-of-aarch64_featur.patch +Patch123: 0123-Backport-SME-aarch64-Tweak-contents-of-flags_on-off-.patch +Patch124: 0124-Backport-SME-aarch64-Tweak-handling-of-mgeneral-regs.patch +Patch125: 0125-Backport-SME-aarch64-Remove-redundant-TARGET_-checks.patch +Patch126: 0126-Backport-SME-aarch64-Define-__ARM_FEATURE_RCPC.patch +Patch127: 0127-Backport-SME-Add-Ampere-1-and-Ampere-1A-core-definit.patch +Patch128: 0128-Backport-SME-aarch64-Fix-nosimd-handling-of-FPR-move.patch +Patch129: 0129-Backport-SME-aarch64-Commonise-some-folding-code.patch +Patch130: 0130-Backport-SME-aarch64-Add-a-Z-operand-modifier-for-SV.patch +Patch131: 0131-Backport-SME-mode-switching-Remove-unused-bbnum-fiel.patch +Patch132: 0132-Backport-SME-mode-switching-Tweak-the-macro-hook-doc.patch +Patch133: 0133-Backport-SME-mode-switching-Add-note-problem.patch 
+Patch134: 0134-Backport-SME-mode-switching-Avoid-quadractic-list-op.patch +Patch135: 0135-Backport-SME-mode-switching-Fix-the-mode-passed-to-t.patch +Patch136: 0136-Backport-SME-mode-switching-Simplify-recording-of-tr.patch +Patch137: 0137-Backport-SME-mode-switching-Tweak-entry-exit-handlin.patch +Patch138: 0138-Backport-SME-mode-switching-Allow-targets-to-set-the.patch +Patch139: 0139-Backport-SME-mode-switching-Pass-set-of-live-registe.patch +Patch140: 0140-Backport-SME-mode-switching-Pass-the-set-of-live-reg.patch +Patch141: 0141-Backport-SME-mode-switching-Use-1-based-edge-aux-fie.patch +Patch142: 0142-Backport-SME-mode-switching-Add-a-target-configurabl.patch +Patch143: 0143-Backport-SME-mode-switching-Add-a-backprop-hook.patch +Patch144: 0144-Backport-SME-aarch64-Add-a-result_mode-helper-functi.patch +Patch145: 0145-Backport-SME-rtl-Try-to-remove-EH-edges-after-pro-ep.patch +Patch146: 0146-Backport-SME-Fix-PR-middle-end-107705-ICE-after-recl.patch +Patch147: 0147-Backport-SME-function-Change-return-type-of-predicat.patch +Patch148: 0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch +Patch149: 0149-Backport-SME-Add-a-target-hook-for-sibcall-epilogues.patch +Patch150: 0150-Backport-SME-Add-a-new-target-hook-TARGET_START_CALL.patch +Patch151: 0151-Backport-SME-Allow-targets-to-add-USEs-to-asms.patch +Patch152: 0152-Backport-SME-New-compact-syntax-for-insn-and-insn_sp.patch +Patch153: 0153-Backport-SME-recog-Improve-parser-for-pattern-new-co.patch +Patch154: 0154-Backport-SME-recog-Support-space-in-cons.patch +Patch155: 0155-Backport-SME-aarch64-Generalise-require_immediate_la.patch +Patch156: 0156-Backport-SME-aarch64-Add-backend-support-for-DFP.patch +Patch157: 0157-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch +Patch158: 0158-Backport-SME-aarch64-Simplify-output-template-emissi.patch +Patch159: 0159-Backport-SME-Improve-immediate-expansion-PR106583.patch +Patch160: 0160-Backport-SME-AArch64-Cleanup-move-immediate-code.patch +Patch161: 0161-Backport-SME-AArch64-convert-some-patterns-to-compac.patch +Patch162: 0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch +Patch163: 0163-Backport-SME-aarch64-Make-AARCH64_FL_SVE-requirement.patch +Patch164: 0164-Backport-SME-aarch64-Add-group-suffixes-to-SVE-intri.patch +Patch165: 0165-Backport-SME-aarch64-Add-sve_type-to-SVE-builtins-co.patch +Patch166: 0166-Backport-SME-aarch64-Generalise-some-SVE-ACLE-error-.patch +Patch167: 0167-Backport-SME-aarch64-Replace-vague-previous-argument.patch +Patch168: 0168-Backport-SME-aarch64-Make-more-use-of-sve_type-in-AC.patch +Patch169: 0169-Backport-SME-aarch64-Tweak-error-message-for-tuple-v.patch +Patch170: 0170-Backport-SME-aarch64-Add-tuple-forms-of-svreinterpre.patch +Patch171: 0171-Backport-SME-attribs-Use-existing-traits-for-excl_ha.patch +Patch172: 0172-Backport-SME-Allow-target-attributes-in-non-gnu-name.patch +Patch173: 0173-Backport-SME-aarch64-Fix-plugin-header-install.patch +Patch174: 0174-Backport-SME-aarch64-Add-arm_streaming-_compatible-a.patch +Patch175: 0175-Backport-SME-aarch64-Add-sme.patch +Patch176: 0176-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch +Patch177: 0177-Backport-SME-AArch64-Rewrite-simd-move-immediate-pat.patch +Patch178: 0178-Backport-SME-AArch64-remove-test-comment-from-mov-mo.patch +Patch179: 0179-Backport-SME-aarch64-Distinguish-streaming-compatibl.patch +Patch180: 0180-Backport-SME-aarch64-Mark-relevant-SVE-instructions-.patch +Patch181: 0181-Backport-SME-AArch64-Support-new-tbranch-optab.patch +Patch182: 
0182-Backport-SME-aarch64-Use-local-frame-vars-in-shrink-.patch +Patch183: 0183-Backport-SME-aarch64-Avoid-a-use-of-callee_offset.patch +Patch184: 0184-Backport-SME-aarch64-Explicitly-handle-frames-with-n.patch +Patch185: 0185-Backport-SME-aarch64-Add-bytes_below_saved_regs-to-f.patch +Patch186: 0186-Backport-SME-aarch64-Add-bytes_below_hard_fp-to-fram.patch +Patch187: 0187-Backport-SME-aarch64-Robustify-stack-tie-handling.patch +Patch188: 0188-Backport-SME-aarch64-Tweak-aarch64_save-restore_call.patch +Patch189: 0189-Backport-SME-aarch64-Only-calculate-chain_offset-if-.patch +Patch190: 0190-Backport-SME-aarch64-Rename-locals_offset-to-bytes_a.patch +Patch191: 0191-Backport-SME-aarch64-Rename-hard_fp_offset-to-bytes_.patch +Patch192: 0192-Backport-SME-aarch64-Tweak-frame_size-comment.patch +Patch193: 0193-Backport-SME-aarch64-Measure-reg_offset-from-the-bot.patch +Patch194: 0194-Backport-SME-aarch64-Simplify-top-of-frame-allocatio.patch +Patch195: 0195-Backport-SME-aarch64-Minor-initial-adjustment-tweak.patch +Patch196: 0196-Backport-SME-aarch64-Tweak-stack-clash-boundary-cond.patch +Patch197: 0197-Backport-SME-aarch64-Put-LR-save-probe-in-first-16-b.patch +Patch198: 0198-Backport-SME-aarch64-Simplify-probe-of-final-frame-a.patch +Patch199: 0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch +Patch200: 0200-Backport-SME-aarch64-Remove-below_hard_fp_saved_regs.patch +Patch201: 0201-Backport-SME-aarch64-Make-stack-smash-canary-protect.patch +Patch202: 0202-Backport-SME-Handle-epilogues-that-contain-jumps.patch +Patch203: 0203-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch +Patch204: 0204-Backport-SME-aarch64-Put-LR-save-slot-first-in-more-.patch +Patch205: 0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch +Patch206: 0206-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch +Patch207: 0207-Backport-SME-aarch64-Add-a-register-class-for-w12-w1.patch +Patch208: 0208-Backport-SME-aarch64-Add-a-VNx1TI-mode.patch +Patch209: 0209-Backport-SME-aarch64-Generalise-unspec_based_functio.patch +Patch210: 0210-Backport-SME-aarch64-Generalise-_m-rules-for-SVE-int.patch +Patch211: 0211-Backport-SME-aarch64-Add-support-for-arm_sme.h.patch +Patch212: 0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch +Patch213: 0213-Backport-SME-aarch64-Handle-PSTATE.SM-across-abnorma.patch +Patch214: 0214-Backport-SME-aarch64-Enforce-inlining-restrictions-f.patch +Patch215: 0215-Backport-SME-aarch64-Update-sibcall-handling-for-SME.patch +Patch216: 0216-Backport-SME-libgcc-aarch64-Configure-check-for-.var.patch +Patch217: 0217-Backport-SME-libgcc-aarch64-Configure-check-for-__ge.patch +Patch218: 0218-Backport-SME-libgcc-aarch64-Add-SME-runtime-support.patch +Patch219: 0219-Backport-SME-libgcc-aarch64-Add-SME-unwinder-support.patch +Patch220: 0220-Backport-SME-libgcc-Fix-config.in.patch +Patch221: 0221-Backport-SME-aarch64-Add-funwind-tables-to-some-test.patch +Patch222: 0222-Backport-SME-aarch64-Skip-some-SME-register-save-tes.patch +Patch223: 0223-Backport-SME-Add-OPTIONS_H_EXTRA-to-GTFILES.patch +Patch224: 0224-Backport-SME-aarch64-Add-V1DI-mode.patch +Patch225: 0225-Backport-SME-Allow-md-iterators-to-include-other-ite.patch +Patch226: 0226-Backport-SME-riscv-Add-support-for-strlen-inline-exp.patch +Patch227: 0227-Backport-SME-attribs-Add-overloads-with-namespace-na.patch +Patch228: 0228-Backport-SME-vec-Add-array_slice-constructors-from-n.patch +Patch229: 0229-Backport-SME-A-couple-of-va_gc_atomic-tweaks.patch +Patch230: 
0230-Backport-SME-middle-end-Fix-issue-of-poly_uint16-1-1.patch +Patch231: 0231-SME-Add-missing-header-file-in-aarch64.cc.patch +Patch232: 0232-Backport-SME-c-Add-support-for-__extension__.patch +Patch233: 0233-Backport-SME-lra-Updates-of-biggest-mode-for-hard-re.patch +Patch234: 0234-Backport-SME-c-Support-C2x-empty-initializer-braces.patch +Patch235: 0235-Backport-SME-aarch64-Update-sizeless-tests-for-recen.patch +Patch236: 0236-Backport-SME-attribs-Namespace-aware-lookup_attribut.patch +Patch237: 0237-Backport-SME-c-family-ICE-with-gnu-nocf_check-PR1069.patch +Patch238: 0238-Backport-SME-AArch64-Fix-assert-in-aarch64_move_imm-.patch +Patch239: 0239-Backport-SME-testsuite-Only-run-fcf-protection-test-.patch +Patch240: 0240-Backport-SME-Fix-PRs-106764-106765-and-107307-all-IC.patch +Patch241: 0241-Backport-SME-aarch64-Remove-expected-error-for-compo.patch +Patch242: 0242-Backport-SME-aarch64-Remove-redundant-builtins-code.patch +Patch243: 0243-Backport-SME-AArch64-Fix-Armv9-a-warnings-that-get-e.patch +Patch244: 0244-Backport-SME-Canonicalize-X-Y-as-X-Y-in-match.pd-whe.patch +Patch245: 0245-Backport-SME-middle-end-Add-new-tbranch-optab-to-add.patch +Patch246: 0246-Backport-SME-explow-Allow-dynamic-allocations-after-.patch +Patch247: 0247-Backport-SME-PR105169-Fix-references-to-discarded-se.patch +Patch248: 0248-Backport-SME-RISC-V-autovec-Verify-that-GET_MODE_NUN.patch +Patch249: 0249-Backport-SME-Add-operator-to-gimple_stmt_iterator-an.patch +Patch250: 0250-Backport-SME-tree-optimization-110221-SLP-and-loop-m.patch +Patch251: 0251-SME-Adapt-some-testsuites.patch +Patch252: 0252-SME-Fix-error-by-backported-patches-and-IPA-prefetch.patch +Patch253: 0253-aarch64-Fix-return-register-handling-in-untyped_call.patch +Patch254: 0254-aarch64-Fix-loose-ldpstp-check.patch +Patch255: 0255-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch +Patch256: 0256-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch +Patch257: 0257-Make-option-mvzeroupper-independent-of-optimization-.patch +Patch258: 0258-i386-Sync-tune_string-with-arch_string-for-target-at.patch +Patch259: 0259-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch +Patch260: 0260-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch +Patch261: 0261-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch +Patch262: 0262-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch +Patch263: 0263-Software-mitigation-Disable-gather-generation-in-vec.patch +Patch264: 0264-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch +Patch265: 0265-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch +Patch266: 0266-Disparage-slightly-for-the-alternative-which-move-DF.patch +Patch267: 0267-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch +Patch268: 0268-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch +Patch269: 0269-Disable-FMADD-in-chains-for-Zen4-and-generic.patch +Patch270: 0270-Initial-Raptorlake-Support.patch +Patch271: 0271-Initial-Meteorlake-Support.patch +Patch272: 0272-Support-Intel-AMX-FP16-ISA.patch +Patch273: 0273-Support-Intel-prefetchit0-t1.patch +Patch274: 0274-Initial-Granite-Rapids-Support.patch +Patch275: 0275-Support-Intel-AMX-COMPLEX.patch +Patch276: 0276-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch +Patch277: 0277-Initial-Granite-Rapids-D-Support.patch +Patch278: 0278-Correct-Granite-Rapids-D-documentation.patch +Patch279: 0279-i386-Remove-Meteorlake-s-family_model.patch +Patch280: 0280-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch +Patch281: 
0281-x86-Update-model-values-for-Raptorlake.patch +Patch282: 0282-Fix-target_clone-arch-graniterapids-d.patch +Patch283: 0283-i386-Change-prefetchi-output-template.patch +Patch284: 0284-i386-Add-non-optimize-prefetchi-intrins.patch +Patch285: 0285-SME-Recover-hip09-and-hip11-in-aarch64-cores.def.patch +Patch286: 0286-Try-to-use-AI-model-to-guide-optimization.patch +Patch287: 0287-Add-dynamic-memory-access-checks.patch +Patch288: 0288-Enable-macro-use-commandline.patch +Patch289: 0289-tree-ssa-loop-crc.cc-TARGET_CRC32-may-be-not-defined.patch +Patch290: 0290-Add-ipa-prefetch-test-for-gcc-s-case.patch +Patch291: 0291-Fix-settings-for-wide-operations-tests.patch +Patch292: 0292-Fix-errors-in-ipa-prefetch-IAORPF-and-IAOSJ0.patch +Patch293: 0293-Fix-error-with-stmts-insertion-in-ipa-prefetch-for-I.patch +Patch294: 0294-Fix-errors-in-ipa-prefetch-IAO50J-and-IAO5H7.patch +Patch295: 0295-Fix-error-with-grouped_load-merge-in-slp-transpose-v.patch +Patch296: 0296-Fix-error-in-slp-transpose-vectorize-for-IAQFM3.patch +Patch297: 0297-Fix-grouped-load-merging-error-in-SLP-transpose-vectorization.patch +Patch298: 0298-Mark-prefetch-builtin-as-willreturn.patch +Patch299: 0299-Backport-Disallow-pointer-operands-for-and-partly-PR.patch +Patch300: 0300-Remove-erroneous-pattern-from-gimple-ifcvt.patch +Patch301: 0301-Add-required-check-for-iteration-through-uses.patch +Patch302: 0302-Added-param-for-optimization-for-merging-bb-s-with-c.patch # Part 3000 ~ 4999 @@ -1425,6 +1450,31 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch275 -p1 %patch276 -p1 %patch277 -p1 +%patch278 -p1 +%patch279 -p1 +%patch280 -p1 +%patch281 -p1 +%patch282 -p1 +%patch283 -p1 +%patch284 -p1 +%patch285 -p1 +%patch286 -p1 +%patch287 -p1 +%patch288 -p1 +%patch289 -p1 +%patch290 -p1 +%patch291 -p1 +%patch292 -p1 +%patch293 -p1 +%patch294 -p1 +%patch295 -p1 +%patch296 -p1 +%patch297 -p1 +%patch298 -p1 +%patch299 -p1 +%patch300 -p1 +%patch301 -p1 +%patch302 -p1 %ifarch loongarch64 @@ -4014,6 +4064,18 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Thu Nov 21 2024 huangzifeng - 12.3.1-39 +- Type:Sync +- ID:NA +- SUG:NA +- DESC:Sync patches from openeuler/gcc + +* Thu Nov 21 2024 huangzifeng - 12.3.1-38 +- Type:Sync +- ID:NA +- SUG:NA +- DESC:Sync patches from branch openEuler-24.09 + * Wed Nov 20 2024 Hu,Lin1 - 12.3.1-37 - Type:Sync - ID:NA