From c5a8a72bcec0338c515455122aa1cbbcc35c1a6f Mon Sep 17 00:00:00 2001 From: "Cui,Lili" Date: Fri, 11 Aug 2023 10:21:14 +0800 Subject: [PATCH 1/3] [Sync] Add attribute hot judgement for INLINE_HINT_known_hot hint. We set up INLINE_HINT_known_hot hint only when we have profile feedback, now add function attribute judgement for it, when both caller and callee have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint for it. With this patch applied, ADL Multi-copy: 538.imagic_r 16.7% ICX Multi-copy: 538.imagic_r 15.2% CLX Multi-copy: 538.imagic_r 12.7% Znver3 Multi-copy: 538.imagic_r 10.6% Arm Multi-copy: 538.imagic_r 13.4% gcc/ChangeLog * ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute judgement for INLINE_HINT_known_hot hint. gcc/testsuite/ChangeLog: * gcc.dg/ipa/inlinehint-6.c: New test. (cherry picked from commit 8cf4a6a8c34172f371c1d9e6e375970b361f7007) --- ...t-judgement-for-INLINE_HINT_known_ho.patch | 124 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch diff --git a/0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch b/0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch new file mode 100644 index 0000000..34d0165 --- /dev/null +++ b/0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch @@ -0,0 +1,124 @@ +From 355eb8e20327242442d139fb052d3a3befde3dd7 Mon Sep 17 00:00:00 2001 +From: "Cui,Lili" +Date: Tue, 1 Nov 2022 09:16:49 +0800 +Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot + hint. + +We set up INLINE_HINT_known_hot hint only when we have profile feedback, +now add function attribute judgement for it, when both caller and callee +have __attribute__((hot)), we will also set up INLINE_HINT_known_hot hint +for it. 
+ +With this patch applied, +ADL Multi-copy: 538.imagic_r 16.7% +ICX Multi-copy: 538.imagic_r 15.2% +CLX Multi-copy: 538.imagic_r 12.7% +Znver3 Multi-copy: 538.imagic_r 10.6% +Arm Multi-copy: 538.imagic_r 13.4% + +gcc/ChangeLog + + * ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute + judgement for INLINE_HINT_known_hot hint. + +gcc/testsuite/ChangeLog: + + * gcc.dg/ipa/inlinehint-6.c: New test. +--- + gcc/ipa-inline-analysis.cc | 13 ++++--- + gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++ + 2 files changed, 56 insertions(+), 4 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c + +diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc +index 11d8d09ee..16ac24cfc 100644 +--- a/gcc/ipa-inline-analysis.cc ++++ b/gcc/ipa-inline-analysis.cc +@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see + #include "ipa-utils.h" + #include "cfgexpand.h" + #include "gimplify.h" ++#include "attribs.h" + + /* Cached node/edge growths. */ + fast_call_summary *edge_growth_cache = NULL; +@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time) + hints = estimates.hints; + } + +- /* When we have profile feedback, we can quite safely identify hot +- edges and for those we disable size limits. Don't do that when +- probability that caller will call the callee is low however, since it ++ /* When we have profile feedback or function attribute, we can quite safely ++ identify hot edges and for those we disable size limits. Don't do that ++ when probability that caller will call the callee is low however, since it + may hurt optimization of the caller's hot path. */ +- if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p () ++ if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p () + && (edge->count.ipa ().apply_scale (2, 1) + > (edge->caller->inlined_to + ? 
edge->caller->inlined_to->count.ipa () + : edge->caller->count.ipa ()))) ++ || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl)) ++ != NULL ++ && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl)) ++ != NULL)) + hints |= INLINE_HINT_known_hot; + + gcc_checking_assert (size >= 0); +diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c +new file mode 100644 +index 000000000..1f3be641c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c +@@ -0,0 +1,47 @@ ++/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp" } */ ++/* { dg-add-options bind_pic_locally } */ ++ ++#define size_t long long int ++ ++struct A ++{ ++ size_t f1, f2, f3, f4; ++}; ++struct C ++{ ++ struct A a; ++ size_t b; ++}; ++struct C x; ++ ++__attribute__((hot)) struct C callee (struct A *a, struct C *c) ++{ ++ c->a=(*a); ++ ++ if((c->b + 7) & 17) ++ { ++ c->a.f1 = c->a.f2 + c->a.f1; ++ c->a.f2 = c->a.f3 - c->a.f2; ++ c->a.f3 = c->a.f2 + c->a.f3; ++ c->a.f4 = c->a.f2 - c->a.f4; ++ c->b = c->a.f2; ++ ++ } ++ return *c; ++} ++ ++__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c) ++{ ++ struct A a; ++ a.f1 = 1 + d; ++ a.f2 = e; ++ a.f3 = 12 + f; ++ a.f4 = 68 + g; ++ if (c->b > 0) ++ return callee (&a, c); ++ else ++ return *c; ++} ++ ++/* { dg-final { scan-ipa-dump "known_hot" "inline" } } */ ++ +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 15da1b7..ee134e3 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. 
-%global gcc_release 4 +%global gcc_release 5 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -139,6 +139,7 @@ Provides: gcc(major) = %{gcc_major} Patch0: 0000-Version-Set-version-to-12.3.1.patch Patch1: 0001-CONFIG-Regenerate-configure-file.patch Patch2: 0002-libquadmath-Enable-libquadmath-on-kunpeng.patch +Patch3: 0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -609,6 +610,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch0 -p1 %patch1 -p1 %patch2 -p1 +%patch3 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2712,6 +2714,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Fri Aug 11 2023 Cui,Lili 12.3.1-5 +- Type:Sync +- Add attribute hot judgement for INLINE_HINT_known_hot hint. + * Mon Jul 17 2023 huangxiaoquan 12.3.1-4 - Type:SPEC - DESC:Enable libquadmath on kunpeng -- Gitee From 0322d6d1d91c5fcf42c8a7384ddeb6f6e0cf9849 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Fri, 11 Aug 2023 10:24:40 +0800 Subject: [PATCH 2/3] [Sync] Enable small loop unrolling for O2 Modern processors has multiple way instruction decoders For x86, icelake/zen3 has 5 uops, so for small loop with <= 4 instructions (usually has 3 uops with a cmp/jmp pair that can be macro-fused), the decoder would have 2 uops bubble for each iteration and the pipeline could not be fully utilized. Therefore, this patch enables loop unrolling for small size loop at O2 to fullfill the decoder as much as possible. It turns on rtl loop unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only. In x86 backend the default behavior is to unroll small loops with less than 4 insns by 1 time. This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with 0.9% codesize increment. For other benchmarks the variants are minor and overall codesize increased by 0.2%. 
The kernel image size increased by 0.06%, with no impact on eembc. gcc/ChangeLog: * common/config/i386/i386-common.cc (ix86_option_optimization_table): Enable small loop unroll at O2 by default. * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll factor if -munroll-only-small-loops enabled and -funroll-loops/ -funroll-all-loops are disabled. * config/i386/i386.h (struct processor_costs): Add 2 fields small_unroll_ninsns and small_unroll_factor. * config/i386/i386.opt: Add -munroll-only-small-loops. * doc/invoke.texi: Document -munroll-only-small-loops. * loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl loop unrolling for -O2-speed and above if target hook loop_unroll_adjust exists. (pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag when target hook loop_unroll_adjust exists. * config/i386/x86-tune-costs.h: Update all processor costs with small_unroll_ninsns = 4 and small_unroll_factor = 2. gcc/testsuite/ChangeLog: * gcc.dg/guality/loop-1.c: Add additional option -mno-unroll-only-small-loops. * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops. * gcc.target/i386/pr93002.c: Likewise. 
(cherry picked from commit 79c6c64bfe6cd9e6def74b1486b6978bfb922d4f) --- 0004-Enable-small-loop-unrolling-for-O2.patch | 490 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 497 insertions(+), 1 deletion(-) create mode 100644 0004-Enable-small-loop-unrolling-for-O2.patch diff --git a/0004-Enable-small-loop-unrolling-for-O2.patch b/0004-Enable-small-loop-unrolling-for-O2.patch new file mode 100644 index 0000000..3913fcf --- /dev/null +++ b/0004-Enable-small-loop-unrolling-for-O2.patch @@ -0,0 +1,490 @@ +From 1070bc24f53e851cae55320e26715cc594efcd2f Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Thu, 8 Sep 2022 16:52:02 +0800 +Subject: [PATCH] Enable small loop unrolling for O2 + +Modern processors has multiple way instruction decoders +For x86, icelake/zen3 has 5 uops, so for small loop with <= 4 +instructions (usually has 3 uops with a cmp/jmp pair that can be +macro-fused), the decoder would have 2 uops bubble for each iteration +and the pipeline could not be fully utilized. + +Therefore, this patch enables loop unrolling for small size loop at O2 +to fullfill the decoder as much as possible. It turns on rtl loop +unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only. +In x86 backend the default behavior is to unroll small loops with less +than 4 insns by 1 time. + +This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with +0.9% codesize increment. For other benchmarks the variants are minor +and overall codesize increased by 0.2%. + +The kernel image size increased by 0.06%, and no impact on eembc. + +gcc/ChangeLog: + + * common/config/i386/i386-common.cc (ix86_optimization_table): + Enable small loop unroll at O2 by default. + * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll + factor if -munroll-only-small-loops enabled and -funroll-loops/ + -funroll-all-loops are disabled. + * config/i386/i386.h (struct processor_costs): Add 2 field + small_unroll_ninsns and small_unroll_factor. 
+ * config/i386/i386.opt: Add -munroll-only-small-loops. + * doc/invoke.texi: Document -munroll-only-small-loops. + * loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl + loop unrolling for -O2-speed and above if target hook + loop_unroll_adjust exists. + (pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag + when target hook loop_unroll_adjust exists. + * config/i386/x86-tune-costs.h: Update all processor costs + with small_unroll_ninsns = 4 and small_unroll_factor = 2. + +gcc/testsuite/ChangeLog: + + * gcc.dg/guality/loop-1.c: Add additional option + -mno-unroll-only-small-loops. + * gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops. + * gcc.target/i386/pr93002.c: Likewise. +--- + gcc/common/config/i386/i386-common.cc | 1 + + gcc/config/i386/i386.cc | 18 ++++++++ + gcc/config/i386/i386.h | 5 +++ + gcc/config/i386/i386.opt | 4 ++ + gcc/config/i386/x86-tune-costs.h | 58 +++++++++++++++++++++++++ + gcc/doc/invoke.texi | 11 ++++- + gcc/loop-init.cc | 10 +++-- + gcc/testsuite/gcc.dg/guality/loop-1.c | 2 + + gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- + gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- + 10 files changed, 107 insertions(+), 6 deletions(-) + +diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc +index e2594cae4..cdd5caa55 100644 +--- a/gcc/common/config/i386/i386-common.cc ++++ b/gcc/common/config/i386/i386-common.cc +@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] = + /* The STC algorithm produces the smallest code at -Os, for x86. */ + { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, + REORDER_BLOCKS_ALGORITHM_STC }, ++ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, + /* Turn off -fschedule-insns by default. It tends to make the + problem with not enough registers even worse. 
*/ + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 9a9ff3b34..e56004300 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -23570,6 +23570,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) + unsigned i; + unsigned mem_count = 0; + ++ /* Unroll small size loop when unroll factor is not explicitly ++ specified. */ ++ if (!(flag_unroll_loops ++ || flag_unroll_all_loops ++ || loop->unroll)) ++ { ++ nunroll = 1; ++ ++ /* Any explicit -f{no-}unroll-{all-}loops turns off ++ -munroll-only-small-loops. */ ++ if (ix86_unroll_only_small_loops ++ && !OPTION_SET_P (flag_unroll_loops) ++ && loop->ninsns <= ix86_cost->small_unroll_ninsns) ++ nunroll = ix86_cost->small_unroll_factor; ++ ++ return nunroll; ++ } ++ + if (!TARGET_ADJUST_UNROLL) + return nunroll; + +diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h +index fce0b3564..688aaabd3 100644 +--- a/gcc/config/i386/i386.h ++++ b/gcc/config/i386/i386.h +@@ -219,6 +219,11 @@ struct processor_costs { + const char *const align_jump; /* Jump alignment. */ + const char *const align_label; /* Label alignment. */ + const char *const align_func; /* Function alignment. */ ++ ++ const unsigned small_unroll_ninsns; /* Insn count limit for small loop ++ to be unrolled. */ ++ const unsigned small_unroll_factor; /* Unroll factor for small loop to ++ be unrolled. */ + }; + + extern const struct processor_costs *ix86_cost; +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index a3675e515..fc1b944ac 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols. + -param=x86-stlf-window-ninsns= + Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param + Instructions number above which STFL stall penalty can be compensated. 
++ ++munroll-only-small-loops ++Target Var(ix86_unroll_only_small_loops) Init(0) Save ++Enable conservative small loop unrolling. +diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h +index f105d57ca..db4c2da34 100644 +--- a/gcc/config/i386/x86-tune-costs.h ++++ b/gcc/config/i386/x86-tune-costs.h +@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* Processor costs (relative to an add) */ +@@ -244,6 +246,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */ + "4", /* Jump alignment. */ + NULL, /* Label alignment. */ + "4", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs i486_memcpy[2] = { +@@ -354,6 +358,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */ + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs pentium_memcpy[2] = { +@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static const +@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes +@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. 
*/ + }; + + static stringop_algs geode_memcpy[2] = { +@@ -786,6 +798,8 @@ struct processor_costs geode_cost = { + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs k6_memcpy[2] = { +@@ -896,6 +910,8 @@ struct processor_costs k6_cost = { + "32:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "32", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* For some reason, Athlon deals better with REP prefix (relative to loops) +@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* K8 has optimized REP instruction for medium sized blocks, but for very +@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for +@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = { + "32:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "32", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* BDVER has optimized REP instruction for medium sized blocks, but for +@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "11", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + +@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. 
*/ + }; + + /* ZNVER2 has optimized REP instruction for medium sized blocks, but for +@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + struct processor_costs znver3_cost = { +@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* This table currently replicates znver3_cost table. */ +@@ -1952,6 +1982,8 @@ struct processor_costs znver4_cost = { + "16", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* skylake_cost should produce code tuned for Skylake familly of CPUs. */ +@@ -2076,6 +2108,8 @@ struct processor_costs skylake_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* icelake_cost should produce code tuned for Icelake family of CPUs. +@@ -2202,6 +2236,8 @@ struct processor_costs icelake_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* alderlake_cost should produce code tuned for alderlake family of CPUs. */ +@@ -2322,6 +2358,8 @@ struct processor_costs alderlake_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* BTVER1 has optimized REP instruction for medium sized blocks, but for +@@ -2435,6 +2473,8 @@ const struct processor_costs btver1_cost = { + "16:8:8", /* Jump alignment. 
*/ + "0:0:8", /* Label alignment. */ + "11", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs btver2_memcpy[2] = { +@@ -2545,6 +2585,8 @@ const struct processor_costs btver2_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "11", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs pentium4_memcpy[2] = { +@@ -2654,6 +2696,8 @@ struct processor_costs pentium4_cost = { + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs nocona_memcpy[2] = { +@@ -2766,6 +2810,8 @@ struct processor_costs nocona_cost = { + NULL, /* Jump alignment. */ + NULL, /* Label alignment. */ + NULL, /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs atom_memcpy[2] = { +@@ -2876,6 +2922,8 @@ struct processor_costs atom_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs slm_memcpy[2] = { +@@ -2986,6 +3034,8 @@ struct processor_costs slm_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs tremont_memcpy[2] = { +@@ -3110,6 +3160,8 @@ struct processor_costs tremont_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + static stringop_algs intel_memcpy[2] = { +@@ -3220,6 +3272,8 @@ struct processor_costs intel_cost = { + "16:8:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. 
*/ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* Generic should produce code tuned for Core-i7 (and newer chips) +@@ -3339,6 +3393,8 @@ struct processor_costs generic_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + + /* core_cost should produce code tuned for Core familly of CPUs. */ +@@ -3465,5 +3521,7 @@ struct processor_costs core_cost = { + "16:11:8", /* Jump alignment. */ + "0:0:8", /* Label alignment. */ + "16", /* Func alignment. */ ++ 4, /* Small unroll limit. */ ++ 2, /* Small unroll factor. */ + }; + +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index ff8cd032f..16f4b367e 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -1449,7 +1449,8 @@ See RS/6000 and PowerPC Options. + -mgeneral-regs-only -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol + -mindirect-branch=@var{choice} -mfunction-return=@var{choice} @gol + -mindirect-branch-register -mharden-sls=@var{choice} @gol +--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access} ++-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol ++-munroll-only-small-loops} + + @emph{x86 Windows Options} + @gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol +@@ -33183,6 +33184,14 @@ treat access to protected symbols as local symbols. The default is + @option{-mno-direct-extern-access} and executable compiled with + @option{-mdirect-extern-access} may not be binary compatible if + protected symbols are used in shared libraries and executable. ++ ++@item -munroll-only-small-loops ++@opindex munroll-only-small-loops ++@opindex mno-unroll-only-small-loops ++Controls conservative small loop unrolling. It is default enabled by ++O2, and unrolls loop with less than 4 insns by 1 time. Explicit ++-f[no-]unroll-[all-]loops would disable this flag to avoid any ++unintended unrolling behavior that user does not want. 
+ @end table + + @node x86 Windows Options +diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc +index 1e4f6cfd7..f1c717041 100644 +--- a/gcc/loop-init.cc ++++ b/gcc/loop-init.cc +@@ -565,9 +565,12 @@ public: + {} + + /* opt_pass methods: */ +- virtual bool gate (function *) ++ virtual bool gate (function *fun) + { +- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll); ++ return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll ++ || (targetm.loop_unroll_adjust ++ && optimize >= 2 ++ && optimize_function_for_speed_p (fun))); + } + + virtual unsigned int execute (function *); +@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun) + if (dump_file) + df_dump (dump_file); + +- if (flag_unroll_loops) ++ if (flag_unroll_loops ++ || targetm.loop_unroll_adjust) + flags |= UAP_UNROLL; + if (flag_unroll_all_loops) + flags |= UAP_UNROLL_ALL; +diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c +index 1b1f6d322..a32ea445a 100644 +--- a/gcc/testsuite/gcc.dg/guality/loop-1.c ++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c +@@ -1,5 +1,7 @@ + /* { dg-do run } */ + /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ ++/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ ++ + + #include "../nop.h" + +diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c +index 81841ef5b..cbc9fbb04 100644 +--- a/gcc/testsuite/gcc.target/i386/pr86270.c ++++ b/gcc/testsuite/gcc.target/i386/pr86270.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2" } */ ++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ + + int *a; + long len; +diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c +index 0248fcc00..f75a847f7 100644 +--- a/gcc/testsuite/gcc.target/i386/pr93002.c ++++ b/gcc/testsuite/gcc.target/i386/pr93002.c +@@ -1,6 +1,6 @@ + /* PR target/93002 */ + /* { dg-do compile } */ +-/* 
{ dg-options "-O2" } */ ++/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ + /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ + + volatile int sink; +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index ee134e3..95314d0 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 5 +%global gcc_release 6 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -140,6 +140,7 @@ Patch0: 0000-Version-Set-version-to-12.3.1.patch Patch1: 0001-CONFIG-Regenerate-configure-file.patch Patch2: 0002-libquadmath-Enable-libquadmath-on-kunpeng.patch Patch3: 0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch +Patch4: 0004-Enable-small-loop-unrolling-for-O2.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -611,6 +612,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch1 -p1 %patch2 -p1 %patch3 -p1 +%patch4 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2714,6 +2716,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Fri Aug 11 2023 Hongyu Wang 12.3.1-6 +- Type:Sync +- Enable small loop unrolling for O2. + * Fri Aug 11 2023 Cui,Lili 12.3.1-5 - Type:Sync - Add attribute hot judgement for INLINE_HINT_known_hot hint. -- Gitee From 24eb7b6504792bcb15d67bd1be238aadaf066789 Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Fri, 11 Aug 2023 10:26:25 +0800 Subject: [PATCH 3/3] [Sync] i386: Only enable small loop unrolling in backend [PR 107692] Followed by the discussion in pr107692, -munroll-only-small-loops Does not turns on/off -funroll-loops, and current check in pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take effect. 
Revert the change about targetm.loop_unroll_adjust and apply the backend option change to strictly follow the rule that -funroll-loops takes full control of loop unrolling, and -munroll-only-small-loops just changes its behavior to unroll small size loops. gcc/ChangeLog: PR target/107692 * common/config/i386/i386-common.cc (ix86_option_optimization_table): Enable loop unroll at O2, disable -fweb and -frename-registers by default. * config/i386/i386-options.cc (ix86_override_options_after_change): Disable small loop unroll when funroll-loops enabled, reset cunroll_grow_size when it is not explicitly enabled. (ix86_option_override_internal): Call ix86_override_options_after_change instead of calling ix86_recompute_optlev_based_flags and ix86_default_align separately. * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll factor if -munroll-only-small-loops enabled. * loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable loop unrolling for -O2-speed. (pass_rtl_unroll_loops::execute): Remove targetm.loop_unroll_adjust check. gcc/testsuite/ChangeLog: PR target/107692 * gcc.dg/guality/loop-1.c: Remove additional option for ia32. * gcc.target/i386/pr86270.c: Add -fno-unroll-loops. * gcc.target/i386/pr93002.c: Likewise. 
(cherry picked from commit ad4ee2e6e9bc79e159a22ef422bf3be74060e47d) --- ...-small-loop-unrolling-in-backend-PR-.patch | 230 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch diff --git a/0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch b/0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch new file mode 100644 index 0000000..9e89306 --- /dev/null +++ b/0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch @@ -0,0 +1,230 @@ +From 96898a9cd8c159625848247bd2f3a09e5c12fcfa Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Sat, 19 Nov 2022 09:38:00 +0800 +Subject: [PATCH] i386: Only enable small loop unrolling in backend [PR + 107692] + +Followed by the discussion in pr107692, -munroll-only-small-loops +Does not turns on/off -funroll-loops, and current check in +pass_rtl_unroll_loops::gate would cause -fno-unroll-loops do not take +effect. Revert the change about targetm.loop_unroll_adjust and apply +the backend option change to strictly follow the rule that +-funroll-loops takes full control of loop unrolling, and +munroll-only-small-loops just change its behavior to unroll small size +loops. + +gcc/ChangeLog: + + PR target/107692 + * common/config/i386/i386-common.cc (ix86_optimization_table): + Enable loop unroll O2, disable -fweb and -frename-registers + by default. + * config/i386/i386-options.cc + (ix86_override_options_after_change): + Disable small loop unroll when funroll-loops enabled, reset + cunroll_grow_size when it is not explicitly enabled. + (ix86_option_override_internal): Call + ix86_override_options_after_change instead of calling + ix86_recompute_optlev_based_flags and ix86_default_align + separately. + * config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll + factor if -munroll-only-small-loops enabled. 
+ * loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable + loop unrolling for -O2-speed. + (pass_rtl_unroll_loops::execute): Rmove + targetm.loop_unroll_adjust check. + +gcc/testsuite/ChangeLog: + + PR target/107692 + * gcc.dg/guality/loop-1.c: Remove additional option for ia32. + * gcc.target/i386/pr86270.c: Add -fno-unroll-loops. + * gcc.target/i386/pr93002.c: Likewise. +--- + gcc/common/config/i386/i386-common.cc | 8 ++++++ + gcc/config/i386/i386-options.cc | 34 ++++++++++++++++++++++--- + gcc/config/i386/i386.cc | 18 ++++--------- + gcc/loop-init.cc | 10 +++----- + gcc/testsuite/gcc.dg/guality/loop-1.c | 2 -- + gcc/testsuite/gcc.target/i386/pr86270.c | 2 +- + gcc/testsuite/gcc.target/i386/pr93002.c | 2 +- + 7 files changed, 48 insertions(+), 28 deletions(-) + +diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc +index cdd5caa55..f650e255f 100644 +--- a/gcc/common/config/i386/i386-common.cc ++++ b/gcc/common/config/i386/i386-common.cc +@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] = + /* The STC algorithm produces the smallest code at -Os, for x86. */ + { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL, + REORDER_BLOCKS_ALGORITHM_STC }, ++ ++ /* Turn on -funroll-loops with -munroll-only-small-loops to enable small ++ loop unrolling at -O2. */ ++ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 }, + { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 }, ++ /* Turns off -frename-registers and -fweb which are enabled by ++ funroll-loops. */ ++ { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 }, ++ { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 }, + /* Turn off -fschedule-insns by default. It tends to make the + problem with not enough registers even worse. 
*/ + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 099cec4b6..ff44ad4e0 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts, + void + ix86_override_options_after_change (void) + { ++ /* Default align_* from the processor table. */ + ix86_default_align (&global_options); ++ + ix86_recompute_optlev_based_flags (&global_options, &global_options_set); ++ ++ /* Disable unrolling small loops when there's explicit ++ -f{,no}unroll-loop. */ ++ if ((OPTION_SET_P (flag_unroll_loops)) ++ || (OPTION_SET_P (flag_unroll_all_loops) ++ && flag_unroll_all_loops)) ++ { ++ if (!OPTION_SET_P (ix86_unroll_only_small_loops)) ++ ix86_unroll_only_small_loops = 0; ++ /* Re-enable -frename-registers and -fweb if funroll-loops ++ enabled. */ ++ if (!OPTION_SET_P (flag_web)) ++ flag_web = flag_unroll_loops; ++ if (!OPTION_SET_P (flag_rename_registers)) ++ flag_rename_registers = flag_unroll_loops; ++ /* -fcunroll-grow-size default follws -f[no]-unroll-loops. */ ++ if (!OPTION_SET_P (flag_cunroll_grow_size)) ++ flag_cunroll_grow_size = flag_unroll_loops ++ || flag_peel_loops ++ || optimize >= 3; ++ } ++ else ++ { ++ if (!OPTION_SET_P (flag_cunroll_grow_size)) ++ flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; ++ } ++ + } + + /* Clear stack slot assignments remembered from previous functions. 
+@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p, + + set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes); + +- ix86_recompute_optlev_based_flags (opts, opts_set); ++ ix86_override_options_after_change (); + + ix86_tune_cost = processor_cost_table[ix86_tune]; + /* TODO: ix86_cost should be chosen at instruction or function granuality +@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p, + || TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_regparm = REGPARM_MAX; + +- /* Default align_* from the processor table. */ +- ix86_default_align (opts); +- + /* Provide default for -mbranch-cost= value. */ + SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost, + ix86_tune_cost->branch_cost); +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index e56004300..462dce10e 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -23572,20 +23572,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop) + + /* Unroll small size loop when unroll factor is not explicitly + specified. */ +- if (!(flag_unroll_loops +- || flag_unroll_all_loops +- || loop->unroll)) ++ if (ix86_unroll_only_small_loops && !loop->unroll) + { +- nunroll = 1; +- +- /* Any explicit -f{no-}unroll-{all-}loops turns off +- -munroll-only-small-loops. 
*/ +- if (ix86_unroll_only_small_loops +- && !OPTION_SET_P (flag_unroll_loops) +- && loop->ninsns <= ix86_cost->small_unroll_ninsns) +- nunroll = ix86_cost->small_unroll_factor; +- +- return nunroll; ++ if (loop->ninsns <= ix86_cost->small_unroll_ninsns) ++ return MIN (nunroll, ix86_cost->small_unroll_factor); ++ else ++ return 1; + } + + if (!TARGET_ADJUST_UNROLL) +diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc +index f1c717041..1e4f6cfd7 100644 +--- a/gcc/loop-init.cc ++++ b/gcc/loop-init.cc +@@ -565,12 +565,9 @@ public: + {} + + /* opt_pass methods: */ +- virtual bool gate (function *fun) ++ virtual bool gate (function *) + { +- return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll +- || (targetm.loop_unroll_adjust +- && optimize >= 2 +- && optimize_function_for_speed_p (fun))); ++ return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll); + } + + virtual unsigned int execute (function *); +@@ -586,8 +583,7 @@ pass_rtl_unroll_loops::execute (function *fun) + if (dump_file) + df_dump (dump_file); + +- if (flag_unroll_loops +- || targetm.loop_unroll_adjust) ++ if (flag_unroll_loops) + flags |= UAP_UNROLL; + if (flag_unroll_all_loops) + flags |= UAP_UNROLL_ALL; +diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c +index a32ea445a..1b1f6d322 100644 +--- a/gcc/testsuite/gcc.dg/guality/loop-1.c ++++ b/gcc/testsuite/gcc.dg/guality/loop-1.c +@@ -1,7 +1,5 @@ + /* { dg-do run } */ + /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */ +-/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */ +- + + #include "../nop.h" + +diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c +index cbc9fbb04..98b012caf 100644 +--- a/gcc/testsuite/gcc.target/i386/pr86270.c ++++ b/gcc/testsuite/gcc.target/i386/pr86270.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ ++/* { dg-options "-O2 
-fno-unroll-loops" } */ + + int *a; + long len; +diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c +index f75a847f7..7e2d869e1 100644 +--- a/gcc/testsuite/gcc.target/i386/pr93002.c ++++ b/gcc/testsuite/gcc.target/i386/pr93002.c +@@ -1,6 +1,6 @@ + /* PR target/93002 */ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */ ++/* { dg-options "-O2 -fno-unroll-loops" } */ + /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */ + + volatile int sink; +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 95314d0..aae0ee3 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 6 +%global gcc_release 7 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -141,6 +141,7 @@ Patch1: 0001-CONFIG-Regenerate-configure-file.patch Patch2: 0002-libquadmath-Enable-libquadmath-on-kunpeng.patch Patch3: 0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch Patch4: 0004-Enable-small-loop-unrolling-for-O2.patch +Patch5: 0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -613,6 +614,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch2 -p1 %patch3 -p1 %patch4 -p1 +%patch5 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2716,6 +2718,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Fri Aug 11 2023 Hongyu Wang 12.3.1-7 +- Type:Sync +- i386: Only enable small loop unrolling in backend [PR 107692]. + * Fri Aug 11 2023 Hongyu Wang 12.3.1-6 - Type:Sync - Enable small loop unrolling for O2. -- Gitee