From 05a64cf3e6deabc46005161d30c9b3bdfb9e4e54 Mon Sep 17 00:00:00 2001 From: liuf9 Date: Mon, 11 Dec 2023 17:51:07 +0800 Subject: [PATCH] [Sync] Sync patch from openeuler/gcc --- 0152-Add-LLC-Allocation-Pass.patch | 4905 ++++++++++++++++++++++++++++ gcc.spec | 10 +- 2 files changed, 4914 insertions(+), 1 deletion(-) create mode 100644 0152-Add-LLC-Allocation-Pass.patch diff --git a/0152-Add-LLC-Allocation-Pass.patch b/0152-Add-LLC-Allocation-Pass.patch new file mode 100644 index 0000000..0a19214 --- /dev/null +++ b/0152-Add-LLC-Allocation-Pass.patch @@ -0,0 +1,4905 @@ +From e0e139bf642398d1e1b8cfd803ee6ce276404991 Mon Sep 17 00:00:00 2001 +From: huangxiaoquan +Date: Wed, 6 Dec 2023 17:51:11 +0800 +Subject: [PATCH] Add LLC-Allocation Pass LLC allocation allows the compiler to + identify frequently-used data in the program and strengthens the ability to + prefetch and distribute it to the last level cache (LLC) through memory + accesses of the corresponding data variables. Add flag -fllc-allocate to + enable LLC allocation. + +--- + gcc/Makefile.in | 1 + + gcc/cfgloop.h | 3 + + gcc/common.opt | 4 + + gcc/config/aarch64/aarch64-sve.md | 48 +- + gcc/config/aarch64/aarch64.c | 18 + + gcc/doc/tm.texi | 21 + + gcc/doc/tm.texi.in | 6 + + gcc/internal-fn.c | 115 + + gcc/internal-fn.def | 4 + + gcc/optabs.def | 2 + + gcc/params.opt | 53 + + gcc/passes.def | 1 + + gcc/target.def | 31 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-issue-builtin-prefetch.c | 48 + + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + + .../llc-allocate/llc-tool-insertion-1.c | 48 + + .../llc-allocate/llc-tool-insertion-2.c | 48 + + .../llc-allocate/llc-tool-insertion-3.c | 48 + + .../llc-allocate/llc-tool-insertion-4.c | 47 + + .../llc-allocate/llc-tool-insertion-5.c | 48 + + .../llc-allocate/llc-tool-insertion-6.c | 47 + + .../llc-tool-insertion-7-null-var-name.c | 52 + + .../llc-tool-insertion-8-tmp-var-name.c | 54 + + .../gfortran.dg/llc-allocate/llc-3.f90 | 213 ++ + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + + .../llc-trace-multiple-base-var.f90 | 63 + + .../llc-unknown-type-size-unit.f90 | 58 + + gcc/timevar.def | 1 + + gcc/tree-cfg.c | 11 + + gcc/tree-cfg.h | 1 + + gcc/tree-pass.h | 1 + + gcc/tree-scalar-evolution.c | 8 +- + gcc/tree-scalar-evolution.h | 3 +- + gcc/tree-ssa-llc-allocate.c | 2898 +++++++++++++++++ + gcc/tree-ssa-loop-niter.c | 38 +- + gcc/tree-ssa-loop-niter.h | 3 +- + 40 files changed, 4297 insertions(+), 31 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c + create 
mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 + create mode 100644 gcc/tree-ssa-llc-allocate.c + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 2a59acfbe..31bf2cde2 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1594,6 +1594,7 @@ OBJS = \ + tree-ssa-loop-array-widen-compare.o \ + tree-ssa-loop-crc.o \ + tree-ssa-loop-prefetch.o \ ++ tree-ssa-llc-allocate.o \ + tree-ssa-loop-split.o \ + tree-ssa-loop-unswitch.o \ + tree-ssa-loop.o \ +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index 18b404e29..e3ecf5076 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -272,6 +272,9 @@ public: + the basic-block from being collected but its index can still be + reused. */ + basic_block former_header; ++ ++ /* Number of latch executions from vectorization. */ ++ tree vec_nb_iterations; + }; + + /* Set if the loop is known to be infinite. */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 4db061b44..2dde0f673 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2233,6 +2233,10 @@ Common Joined RejectNegative UInteger Var(prefetch_level) Init(0) IntegerRange(0 + Generate prefetch instructions, if available, for arrays in loops. The prefetch + level can control the optimize level to array prefetch. + ++fllc-allocate ++Common Report Var(flag_llc_allocate) Init(-1) Optimization ++Generate LLC hint instructions. ++ + fprofile + Common Report Var(profile_flag) + Enable basic program profiling code. 
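The new option is exercised the same way the testcases added below exercise it: compile an indirect-access kernel with SVE enabled and request the pass dump. A minimal usage sketch follows (the file name and kernel are illustrative; the compile options are taken from the dg-options of the new tests):

    /* llc-example.c: x[idx[i]] is the indirect access that the pass tries
       to keep resident in the last level cache.
       Compile (assuming an aarch64 toolchain) with:
         gcc -O3 -march=armv8.2-a+sve -fllc-allocate \
             -fdump-tree-llc_allocate-details-lineno -c llc-example.c
       then inspect the generated llc-example.c.*.llc_allocate dump.  */
    #define N 100000
    double data[N], x[N];
    int idx[N];

    double
    sum_indirect (int n)
    {
      double s = 0.0;
      for (int i = 0; i < n; i++)
        s += data[i] * x[idx[i]];
      return s;
    }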
+diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
+index d17a77706..c5b99b6c4 100644
+--- a/gcc/config/aarch64/aarch64-sve.md
++++ b/gcc/config/aarch64/aarch64-sve.md
+@@ -1940,7 +1940,7 @@
+ (define_insn "@aarch64_sve_prefetch<mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:<VPRED> 0 "register_operand" "Upl")
+-		(match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP<Vesize>")
++		(match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP<Vesize>")
+ 		(match_operand:DI 2 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH)
+ 	     (match_operand:DI 3 "const_int_operand")
+@@ -1973,14 +1973,14 @@
+ ;; 6: the prefetch operator (an svprfop)
+ ;; 7: the normal RTL prefetch rw flag
+ ;; 8: the normal RTL prefetch locality value
+-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx4SI_ONLY:mode>"
++(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx4SI_ONLY:mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+-		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg<SVE_FULL_I:Vesize>, rk, rk, rk, rk")
++		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg<SVE_FULL:Vesize>, rk, rk, rk, rk")
+ 		(match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w")
+ 		(match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -1988,12 +1988,12 @@
+   "TARGET_SVE"
+   {
+     static const char *const insns[][2] = {
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.s]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.s, #%1]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.s]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.s, #%1]",
+       "prfb", "%0, [%1, %2.s, sxtw]",
+       "prfb", "%0, [%1, %2.s, uxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2002,14 +2002,14 @@
+ 
+ ;; Predicated gather prefetches for 64-bit elements.  The value of operand 3
+ ;; doesn't matter in this case.
+-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>"
++(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl")
+-		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg<SVE_FULL_I:Vesize>, rk, rk")
++		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg<SVE_FULL:Vesize>, rk, rk")
+ 		(match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w")
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2017,10 +2017,10 @@
+   "TARGET_SVE"
+   {
+     static const char *const insns[][2] = {
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.d]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.d, #%1]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.d]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.d, #%1]",
+       "prfb", "%0, [%1, %2.d]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, lsl %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, lsl %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2028,7 +2028,7 @@
+ )
+ 
+ ;; Likewise, but with the offset being sign-extended from 32 bits.
+-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_sxtw"
++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_sxtw"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
+ 		(match_operand:DI 1 "register_operand" "rk, rk")
+@@ -2039,8 +2039,8 @@
+ 		       (match_operand:VNx2DI 2 "register_operand" "w, w")))]
+ 		  UNSPEC_PRED_X)
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2049,7 +2049,7 @@
+   {
+     static const char *const insns[][2] = {
+       "prfb", "%0, [%1, %2.d, sxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2061,7 +2061,7 @@
+ )
+ 
+ ;; Likewise, but with the offset being zero-extended from 32 bits.
+-(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_uxtw"
++(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_uxtw"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
+ 		(match_operand:DI 1 "register_operand" "rk, rk")
+@@ -2069,8 +2069,8 @@
+ 		  (match_operand:VNx2DI 2 "register_operand" "w, w")
+ 		  (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate"))
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2079,7 +2079,7 @@
+   {
+     static const char *const insns[][2] = {
+       "prfb", "%0, [%1, %2.d, uxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index dbdc6dffb..aa077ec0a 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -2367,6 +2367,13 @@ aarch64_sve_data_mode_p (machine_mode mode)
+   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
+ }
+ 
++/* Return true if MODE is a full SVE data vector mode.  */
++static bool
++aarch64_full_sve_data_mode_p (machine_mode mode)
++{
++  return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA;
++}
++
+ /* Return the number of defined bytes in one constituent vector of
+    SVE mode MODE, which has vector flags VEC_FLAGS.  */
+ static poly_int64
+@@ -24370,6 +24377,17 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_ASM_FUNCTION_EPILOGUE
+ #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
+ 
++#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH
++#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch
++
++#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
++#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \
++  code_for_aarch64_sve_gather_prefetch
++
++#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
++#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \
++  aarch64_full_sve_data_mode_p
++
+ struct gcc_target targetm = TARGET_INITIALIZER;
+ 
+ #include "gt-aarch64.h"
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index b46418d0b..ef3566510 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -6122,6 +6122,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
+ stores.
+ @end deftypefn
+ 
++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg})
++This hook should return the decl of a function that implements the
++vectorized variant of the function with the @code{combined_fn} code
++@var{code} or @code{NULL_TREE} if such a function is not available.
++The return type of the vectorized function shall be of vector type
++@var{vec_type_out} and the argument types should be @var{vec_type_in}.
++@end deftypefn
++
++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_form})
++This hook should return the decl of a function that implements the
++vectorized variant of the function with the @code{combined_fn} code
++@var{code} or @code{NULL_TREE} if such a function is not available.
++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg}) ++This hook should return true if the target hardware architecture ++supports a full SVE data vector mode. ++@end deftypefn ++ + @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}) + This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float} + fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index 2663547c7..945d0f696 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -4195,6 +4195,12 @@ address; but often a machine-dependent strategy can generate better code. + + @hook TARGET_VECTORIZE_BUILTIN_SCATTER + ++@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH ++ ++@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++ ++@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++ + @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN + + @hook TARGET_SIMD_CLONE_ADJUST +diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c +index 644f234e0..e8a3bb654 100644 +--- a/gcc/internal-fn.c ++++ b/gcc/internal-fn.c +@@ -102,10 +102,12 @@ init_internal_fns () + direct_internal_fn. */ + #define not_direct { -2, -2, false } + #define mask_load_direct { -1, 2, false } ++#define mask_prefetch_direct { -1, 2, false } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { 3, 1, false } + #define mask_store_direct { 3, 2, false } ++#define gather_prefetch_direct { 3, 1, false } + #define store_lanes_direct { 0, 0, false } + #define mask_store_lanes_direct { 0, 0, false } + #define vec_cond_mask_direct { 0, 0, false } +@@ -2520,6 +2522,53 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + + #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + ++/* Expand MASK_PREFETCH call STMT using optab OPTAB. ++ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); ++ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); ++*/ ++ ++static void ++expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ tree base = gimple_call_arg (stmt, 0); ++ if (base == NULL_TREE) ++ return; ++ ++ tree maskt = gimple_call_arg (stmt, 2); ++ tree target = gimple_call_arg (stmt, 3); ++ tree prfop = gimple_call_arg (stmt, 4); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); ++ ++ rtx mask = expand_normal (maskt); ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. 
*/
++  if (ptr_mode == SImode)
++    base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode);
++
++  unsigned i = 0;
++  class expand_operand ops[5];
++  create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt)));
++  create_address_operand (&ops[i++], base_rtx);
++  create_integer_operand (&ops[i++], prfop_int);
++  create_integer_operand (&ops[i++], access);
++  create_integer_operand (&ops[i++], locality);
++  expand_insn (icode, i, ops);
++}
++
+ /* Expand MASK_STORE{,_LANES} call STMT using optab OPTAB.  */
+ 
+ static void
+@@ -2920,6 +2969,70 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
+   emit_move_insn (lhs_rtx, ops[0].value);
+ }
+ 
++/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB.
++   vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87);
++   .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4);
++*/
++
++static void
++expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
++{
++  if (targetm.vectorize.code_for_gather_prefetch == NULL
++      || targetm.vectorize.prefetch_handleable_mode_p == NULL)
++    return;
++
++  /* Extract the tree nodes; only expand for a scalar base and a vector
++     index.  */
++  tree base = gimple_call_arg (stmt, 0);
++  if (VECTOR_TYPE_P (TREE_TYPE (base)))
++    return;
++  tree offset = gimple_call_arg (stmt, 1);
++  if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false)
++    return;
++
++  tree scale = gimple_call_arg (stmt, 2);
++  tree mask = gimple_call_arg (stmt, 4);
++  tree target = gimple_call_arg (stmt, 5);
++  tree prfop = gimple_call_arg (stmt, 6);
++
++  /* Convert the trees to rtx nodes.  */
++  rtx base_rtx = expand_normal (base);
++  /* Convert ptr_mode value X to Pmode.  */
++  if (ptr_mode == SImode)
++    base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode);
++  rtx offset_rtx = expand_normal (offset);
++  rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target)));
++  rtx mask_rtx = expand_normal (mask);
++  HOST_WIDE_INT scale_int = tree_to_shwi (scale);
++  HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop);
++  /* Bit 3 of the prfop selects stores over loads.  */
++  HOST_WIDE_INT access = prfop_int & 8;
++  /* Bits 1 and 2 specify the locality; 0-based for svprfop but
++     1-based for PREFETCH.  */
++  HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1;
++
++  /* Add the operands.  */
++  unsigned int i = 0;
++  class expand_operand ops[9];
++  create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
++  create_address_operand (&ops[i++], base_rtx);
++  create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
++  /* Record whether the index is unsigned.  */
++  create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
++  create_integer_operand (&ops[i++], scale_int);
++  create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx));
++  create_integer_operand (&ops[i++], prfop_int);
++  create_integer_operand (&ops[i++], access);
++  create_integer_operand (&ops[i++], locality);
++
++  machine_mode reg_mode = GET_MODE (offset_rtx);
++  machine_mode m_mode = TYPE_MODE (TREE_TYPE (target));
++  if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode))
++    return;
++  insn_code icode = targetm.vectorize.code_for_gather_prefetch
++		    (m_mode, reg_mode);
++  expand_insn (icode, i, ops);
++}
++
+ /* Expand DIVMOD() using:
+    a) optab handler for udivmod/sdivmod if it is available.
+    b) If optab_handler doesn't exist, generate call to
+@@ -3210,9 +3323,11 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
+ #define direct_cond_binary_optab_supported_p direct_optab_supported_p
+ #define direct_cond_ternary_optab_supported_p direct_optab_supported_p
+ #define direct_mask_load_optab_supported_p direct_optab_supported_p
++#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p
+ #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
+ #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
+ #define direct_gather_load_optab_supported_p convert_optab_supported_p
++#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p
+ #define direct_mask_store_optab_supported_p direct_optab_supported_p
+ #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
+ #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
+diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
+index 0c6fc3711..cc0f42b98 100644
+--- a/gcc/internal-fn.def
++++ b/gcc/internal-fn.def
+@@ -119,6 +119,8 @@ along with GCC; see the file COPYING3.  If not see
+ #endif
+ 
+ DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load)
++DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF,
++		       maskprefetch, mask_prefetch)
+ DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes)
+ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
+ 		       vec_mask_load_lanes, mask_load_lanes)
+@@ -126,6 +128,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
+ DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
+ DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
+ 		       mask_gather_load, gather_load)
++DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF,
++		       mask_gather_prefetch, gather_prefetch)
+ 
+ DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
+ DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
+diff --git a/gcc/optabs.def b/gcc/optabs.def
+index 0c64eb52a..ee25bc3f7 100644
+--- a/gcc/optabs.def
++++ b/gcc/optabs.def
+@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b")
+ OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
+ OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b")
+ OPTAB_CD(maskload_optab, "maskload$a$b")
++OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b")
+ OPTAB_CD(maskstore_optab, "maskstore$a$b")
+ OPTAB_CD(gather_load_optab, "gather_load$a$b")
+ OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
++OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b")
+ OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
+ OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b")
+ OPTAB_CD(vec_extract_optab, "vec_extract$a$b")
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 2044524a3..c429359e3 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -1005,4 +1005,57 @@ Target size of compressed pointer, which should be 8, 16 or 32.
+ Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization
+ Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 .
+ 
++-param=mem-access-ratio=
++Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization
++Memory access ratio (in percent).
++
++-param=mem-access-num=
++Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization
++Number of memory accesses.
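How the two parameters above combine is visible in the new testcases, which scan the pass dump for lines of the form "ref_count = N, ninsns = M, mem_to_insn_ratio = R". A sketch of the implied gating predicate follows; the helper name is hypothetical, not a function defined by this patch:

    /* Illustrative only: a loop body plausibly counts as a dense memory
       kernel when it contains at least param_mem_access_num memory
       references and its reference-to-instruction ratio reaches
       param_mem_access_ratio (given in percent).  */
    static bool
    dense_memory_kernel_p (unsigned ref_count, unsigned ninsns)
    {
      if (ninsns == 0 || ref_count < param_mem_access_num)
        return false;
      double mem_to_insn_ratio = (double) ref_count / ninsns;
      return mem_to_insn_ratio >= (double) param_mem_access_ratio / 100;
    }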
++
++-param=prefetch-offset=
++Common Joined UInteger Var(param_prefetch_offset) Init(1024)
++IntegerRange(1, 999999) Param Optimization
++Prefetch offset, usually a power of two related to the cache line size.
++
++-param=branch-prob-threshold=
++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100)
++Param Optimization
++Branch probability threshold (in percent) above which a branch is considered frequently executed.
++
++-param=issue-topn=
++Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization
++Issue LLC hints for the top N memory references.
++
++-param=force-issue=
++Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param
++Force issuing the top-N LLC mem_ref hints, without generating dynamic multi-branches.
++
++-param=llc-capacity-per-core=
++Common Joined UInteger Var(param_llc_capacity_per_core) Init(114) IntegerRange(0, 999999) Param
++LLC capacity per core.
++
++-param=target-variables=
++Common Joined Var(param_target_variables) Init("") Param Optimization
++--param=target-variables=[var1,var2,...] Target variables for prefetching, separated by commas
++without spaces.  The representation of a variable can be complex and contain spaces; surround
++it with quotation marks and escape special characters on Linux.  The input length should be no
++more than 512 characters.
++
++-param=use-ref-group-index=
++Common Joined UInteger Var(param_use_ref_group_index) Init(0) IntegerRange(0, 1) Param Optimization
++Prefetch the target variables by their indices in the sorted ref_groups; use together with the
++target-variables parameter.
++
++-param=mem-ref-index=
++Common Joined Var(param_mem_ref_index) Init("") Param Optimization
++--param=mem-ref-index=[index1,index2,...] Prefetch the target variables at the memory reference
++locations with the given indices (custom order), separated by commas without spaces.  The input
++length should be no more than 512 characters.
++
++-param=filter-kernels=
++Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param
++Allow the LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks
++through edges whose branch probability is no less than param_branch_prob_threshold.
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/passes.def b/gcc/passes.def
+index df7d65733..ea59fc8ca 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -303,6 +303,7 @@ along with GCC; see the file COPYING3.  If not see
+       /* Run IVOPTs after the last pass that uses data-reference analysis
+ 	 as that doesn't handle TARGET_MEM_REFs.  */
+       NEXT_PASS (pass_iv_optimize);
++      NEXT_PASS (pass_llc_allocate);
+       NEXT_PASS (pass_lim);
+       NEXT_PASS (pass_tree_loop_done);
+   POP_INSERT_PASSES ()
+diff --git a/gcc/target.def b/gcc/target.def
+index 34d3561bd..351c94c37 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2072,6 +2072,37 @@ DEFHOOK
+  (void *data),
+  default_destroy_cost_data)
+ 
++/* Function for vector prefetch operation.  */
++DEFHOOK
++(code_for_prefetch,
++ "This hook should return the decl of a function that implements the\n\
++vectorized variant of the function with the @code{combined_fn} code\n\
++@var{code} or @code{NULL_TREE} if such a function is not available.\n\
++The return type of the vectorized function shall be of vector type\n\
++@var{vec_type_out} and the argument types should be @var{vec_type_in}.",
++ insn_code, (machine_mode arg),
++ NULL)
++
++/* Function for vector gather prefetch operation.
*/ ++DEFHOOK ++(code_for_gather_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode mode_to, machine_mode mode_form), ++ NULL) ++ ++/* Function to check whether the target hardware architecture supports ++ a full SVE data vector mode. */ ++DEFHOOK ++(prefetch_handleable_mode_p, ++ "This hook should return true if the target hardware architecture\n\ ++supports a full SVE data vector mode.", ++ bool, (machine_mode arg), ++ NULL) ++ + HOOK_VECTOR_END (vectorize) + + #undef HOOK_PREFIX +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +new file mode 100644 +index 000000000..a4828eaab +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 100000 ++ ++int A_i[N]; ++int A_j[N]; ++double A_data[N]; ++double x_data[N]; ++double y_data[N]; ++int num_rows = N; ++ ++void ++MatMult (int *A_i, int *A_j, double *A_data, double *x_data, ++ int num_rows, double *y_data) ++{ ++ int i = 0; ++ int j = 0; ++ double temp = 0; ++ for (i = 0; i < num_rows; i++) ++ { ++ temp = y_data[i]; ++ for (j = A_i[i]; j < A_i[i+1]; j++) ++ temp += A_data[j] * x_data[A_j[j]]; ++ y_data[i] = temp; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; i++) ++ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git 
a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..4f34e722f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib gcc-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c +new file mode 100644 +index 000000000..2a58c501f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=uPtr" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +new file mode 100644 +index 000000000..27cd574cf +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++#include ++ ++#define N 1000 ++ ++long a[N] = {0}; ++long b[N] = {0}; ++long c[N] = {0}; ++ ++double ++referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) ++{ ++ double sum; ++ for (int cell = 0; cell < nCells; cell++) ++ { ++ // Multi-layer pointer ++ sum += psiPtr[lPtr[cell]]; ++ psiPtr[uPtr[cell]] = sum; ++ ++ // Outer pointer, inner array ++ sum += psiPtr[b[cell]]; ++ psiPtr[a[cell]] = sum; ++ ++ // Multi-layer array, currently failed tracing at b[cell] and a[cell] ++ sum += a[b[cell]]; ++ c[a[cell]] = sum; ++ ++ // Outer array, inner pointer, currently failed tracing at lPtr[cell] ++ sum += a[lPtr[cell]]; ++ c[lPtr[cell]] = sum; ++ } ++ return sum; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ double *psiPtr = NULL; ++ int *lPtr = NULL; ++ int *uPtr = NULL; ++ psiPtr = (double *) calloc (N, sizeof(double)); ++ lPtr = (int *) calloc (N, sizeof(int)); ++ uPtr = (int *) calloc (N, sizeof(int)); ++ ++ for (int i = 0; i < testIter; i++) ++ referenceTrace (psiPtr, lPtr, uPtr, N); ++ ++ free (psiPtr); ++ free (lPtr); ++ free (uPtr); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 16 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing failed" 8 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c +new file mode 100644 +index 000000000..276781c4f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double 
psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cellaux\"" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" ++ "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c +new file mode 100644 +index 000000000..09a525ce1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c +@@ -0,0 +1,54 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=tmp_var_0" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. */ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" ++ "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \"" ++ " bb_16(D)->aux \"" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +new file mode 100644 +index 000000000..ec918e144 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -0,0 +1,213 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! 
{ dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50" } ++ ++program main ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt ++ ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts ++ ++ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, t0, smdiv ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch,iter ++ ++ LOGICAL :: non_hydrostatic ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*36/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 3 ++ rk_order = 1 ++ dts = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ step = 1 ++ non_hydrostatic = .true. ++ ++ call random_number(random1) ++ interval = random1*100 ++ interval=1 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(alt) ++ call random_number(c2a) ++ call random_number(ph) ++ call random_number(pm1) ++ call random_number(mu) ++ call random_number(muts) ++ call random_number(dnw) ++ call random_number(rdnw) ++ call random_number(znu) ++ ++ do iter=1,2 ++ call calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ enddo ++ ++end program ++ ++ ++SUBROUTINE calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ IMPLICIT NONE ! religion first ++ !asb ++! declarations for the stuff coming in ++ ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & ++ p ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & ++ t_2, & ++ t_1, & ++ c2a ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 ++ ++ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & ++ muts ++ ++ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & ++ rdnw, & ++ znu ++ ++ REAL, INTENT(IN ) :: t0, smdiv ++ ++ LOGICAL, INTENT(IN ) :: non_hydrostatic ++ ++! local variables ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ REAL :: ptmp ++ ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = min(kte,kde-1) ++ ++ IF (non_hydrostatic) THEN ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ++! al computation is all dry, so ok with moisture ++ ++ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & ++ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) ++ ++! 
this is temporally linearized p, no moisture correction needed ++ ++ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ ELSE ! hydrostatic calculation ++ ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ p(i,k,j)=mu(i,j)*znu(k) ++ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) ++ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & ++ +mu(i,j)*alt(i,k,j)) ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ END IF ++ ++! divergence damping setup ++ ++ IF (step == 0) then ! we're initializing small timesteps ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ pm1(i,k,j)=p(i,k,j) ++ ENDDO ++ ENDDO ++ ENDDO ++ ELSE ! we're in the small timesteps ++ DO j=j_start, j_end ! and adding div damping component ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ptmp = p(i,k,j) ++ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) ++ pm1(i,k,j) = ptmp ++ ENDDO ++ ENDDO ++ ENDDO ++ END IF ++ ++END SUBROUTINE calc_p_rho ++ ++! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing succeeded" 48 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 3 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times ", size: 0\.000000" 28 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d p \\(0.000000, 3, 0\\) : 8" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d pm1 \\(0.000000, 2, 0\\) : 5" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d ph \\(0.000000, 2, 0\\) : 4" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d al \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d alt \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d t_1 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d t_2 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d c2a \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d mu \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d muts \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..068341784 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,29 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++load_lib gfortran-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Main loop. ++gfortran-dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +new file mode 100644 +index 000000000..23e360540 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +@@ -0,0 +1,63 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } ++ ++MODULE INPUT ++ IMPLICIT NONE ++ ++ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 ++ ++ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 ++ REAL(wp), DIMENSION(jpi, jpj) :: e12t ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n ++ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta ++ ++END MODULE INPUT ++ ++PROGRAM MAIN ++ USE INPUT ++ ++ IMPLICIT NONE ++ ++ INTEGER :: EPOCH ++ ++! Initialize arrays ++ ++ e12t = 1 ++ fse3t_n = 1 ++ pta = 1 ++! ++ ++ DO EPOCH=1,2 ++ CALL tra_ldf_iso ++ ENDDO ++ ++END PROGRAM MAIN ++ ++SUBROUTINE tra_ldf_iso ++ USE INPUT ++ ++ IMPLICIT NONE ++ ! ++ INTEGER :: ji, jj, jk, jn ! dummy loop indices ++ REAL(wp) :: zbtr, ztra ! - - ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw ++ ++ DO jn = 1, kjpt ++ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 ++ ++ DO jk = 1, jpkm1 ++ DO jj = 2, jpjm1 ++ DO ji = fs_2, fs_jpim1 ! vector opt. ++ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) ++ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr ++ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra ++ END DO ++ END DO ++ END DO ++ ! ++ END DO ++ ! ++END SUBROUTINE tra_ldf_iso ++ ++! { dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing unusual number or occurrences of base variables. Choose ztfw." 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +new file mode 100644 +index 000000000..d76c75b5b +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -0,0 +1,58 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! 
{ dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } ++ ++Module module_domain ++ IMPLICIT NONE ++ ++ REAL, PARAMETER :: g = 9.8 ++ TYPE :: grid_type ++ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) ++ REAL, POINTER :: fnm(:), fnp(:) ++ END TYPE ++END Module ++ ++SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) ++ ++ USE module_domain ++ !USE module_model_constants ++ ++ IMPLICIT NONE ++ ++ ++ !TYPE (domain), INTENT(IN) :: grid ++ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy ++ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w ++ ++ ++ INTEGER :: k ++ REAL :: z0, z1, z2, w1, w2 ++ REAL, DIMENSION(k_start:k_end) :: z_at_w ++ REAL, DIMENSION(k_start:k_end-1) :: z ++ TYPE (grid_type), POINTER :: grid ++ ++ ++ DO k = k_start, k_end ++ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g ++ END DO ++ ++ DO k = k_start, k_end-1 ++ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) ++ END DO ++ ++ DO k = k_start+1, k_end-1 ++ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & ++ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) ++ END DO ++ ++ z0 = z_at_w(k_start) ++ z1 = z(k_start) ++ z2 = z(k_start+1) ++ w1 = (z0 - z2)/(z1 - z2) ++ w2 = 1. - w1 ++ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & ++ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) ++ ++END SUBROUTINE calc_p8w ++ ++! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +diff --git a/gcc/timevar.def b/gcc/timevar.def +index ba86a1b7b..4b643538f 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -207,6 +207,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") + DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") + DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") + DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") ++DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") + DEFTIMEVAR (TV_PREDCOM , "predictive commoning") + DEFTIMEVAR (TV_TREE_CH , "tree copy headers") + DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") +diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c +index d82fe23d8..9eb173d69 100644 +--- a/gcc/tree-cfg.c ++++ b/gcc/tree-cfg.c +@@ -8365,6 +8365,17 @@ print_loops (FILE *file, int verbosity) + print_loop_and_siblings (file, bb->loop_father, 0, verbosity); + } + ++/* Dump a loop to file. */ ++ ++void ++loop_dump (FILE *file, class loop *loop) ++{ ++ print_loop (file, loop, 0, 0); ++ fprintf (file, "vec_niter = "); ++ print_generic_expr (file, loop->vec_nb_iterations); ++ fprintf (file, "\n"); ++} ++ + /* Dump a loop. 
*/ + + DEBUG_FUNCTION void +diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h +index beb4997a6..dad0ca0a6 100644 +--- a/gcc/tree-cfg.h ++++ b/gcc/tree-cfg.h +@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); + extern void debug_function (tree, dump_flags_t); + extern void print_loops_bb (FILE *, basic_block, int, int); + extern void print_loops (FILE *, int); ++extern void loop_dump (FILE *file, class loop *loop); + extern void debug (class loop &ref); + extern void debug (class loop *ptr); + extern void debug_verbose (class loop &ref); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 027f8992d..a1e215901 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -383,6 +383,7 @@ extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); +diff --git a/gcc/tree-scalar-evolution.c b/gcc/tree-scalar-evolution.c +index edab77827..73ffa0759 100644 +--- a/gcc/tree-scalar-evolution.c ++++ b/gcc/tree-scalar-evolution.c +@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) + the loop body has been executed 6 times. */ + + tree +-number_of_latch_executions (class loop *loop) ++number_of_latch_executions (class loop *loop, bool guarantee) + { + edge exit; + class tree_niter_desc niter_desc; +@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) + res = chrec_dont_know; + exit = single_exit (loop); + +- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) ++ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, ++ true, NULL, guarantee)) + { + may_be_zero = niter_desc.may_be_zero; + res = niter_desc.niter; +@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) + fprintf (dump_file, "))\n"); + } + +- loop->nb_iterations = res; ++ if (guarantee) ++ loop->nb_iterations = res; + return res; + } + +diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h +index e2fbfb55b..218155650 100644 +--- a/gcc/tree-scalar-evolution.h ++++ b/gcc/tree-scalar-evolution.h +@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. If not see + #ifndef GCC_TREE_SCALAR_EVOLUTION_H + #define GCC_TREE_SCALAR_EVOLUTION_H + +-extern tree number_of_latch_executions (class loop *); ++extern tree number_of_latch_executions (class loop *, ++ bool guarantee = true); + extern gcond *get_loop_exit_condition (const class loop *); + + extern void scev_initialize (void); +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +new file mode 100644 +index 000000000..746a1cf95 +--- /dev/null ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -0,0 +1,2898 @@ ++/* LLC allocate. ++ Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. 
++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++<http://www.gnu.org/licenses/>. */ ++ ++#include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_VECTOR ++#define INCLUDE_LIST ++#define INCLUDE_ALGORITHM ++#define INCLUDE_STRING ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "gimple.h" ++#include "predict.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "optabs-query.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "stor-layout.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "gimplify-me.h" ++#include "tree-ssa-loop-ivopts.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop-niter.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfgloop.h" ++#include "tree-scalar-evolution.h" ++#include "langhooks.h" ++#include "tree-inline.h" ++#include "tree-data-ref.h" ++#include "diagnostic-core.h" ++#include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "internal-fn.h" ++#include "tree-cfg.h" ++#include "profile-count.h" ++ ++/* Number of parallel cores. */ ++const unsigned int PARALLEL_NUM = 288; ++ ++/* Indirect access weight. */ ++const unsigned int INDIRECT_ACCESS_VALUE = 2; ++ ++/* Write memory weight. */ ++const unsigned int WRITE_COST = 2; ++ ++/* Prefetch tool input max length. */ ++#ifndef PREFETCH_TOOL_INPUT_MAX_LEN ++#define PREFETCH_TOOL_INPUT_MAX_LEN 512 ++#endif ++ ++/* Prefetch tool number max length. */ ++#ifndef PREFETCH_TOOL_NUM_MAX_LEN ++#define PREFETCH_TOOL_NUM_MAX_LEN 9 ++#endif ++ ++namespace { ++ ++using namespace std; ++ ++/* Loop bound info of the loop where the memory reference is located. */ ++struct loop_bound ++{ ++ /* iv tree_node. */ ++ tree iv; ++ ++ /* Defining stmt of the iv. */ ++ gimple *def_stmt; ++ ++ /* Loop where the stmt is located. */ ++ class loop *loop; ++ ++ /* Loop unroll factor. */ ++ unsigned int unroll; ++ ++ /* Number of iterations of the loop. */ ++ tree niters; ++ ++ loop_bound (tree t, gimple *stmt) ++ { ++ iv = t; ++ def_stmt = stmt; ++ loop = loop_containing_stmt (stmt); ++ unroll = 1; ++ niters = chrec_dont_know; ++ } ++}; ++ ++/* Method of calculating the data size. */ ++ ++enum calc_type ++{ ++ UNHANDLE_CALC = 0, ++ RUNTIME_CALC, ++ STATIC_CALC ++}; ++ ++/* Describes the info of a memory reference. */ ++ ++struct data_ref ++{ ++ /* The memory reference. */ ++ tree ref; ++ ++ /* Statement where the ref is located. */ ++ gimple *stmt; ++ ++ /* var_decl or param_decl, used for the ref_group. */ ++ tree var; ++ ++ /* Base of the reference. */ ++ tree base; ++ ++ /* Constant offset of the reference. */ ++ tree offset; ++ ++ /* Index of the reference. */ ++ tree index; ++ ++ /* Constant step of the reference. */ ++ tree step; ++ ++ /* Loop boundary info of each dimension. */ ++ vector<loop_bound> loop_bounds; ++ ++ /* Memory data size, unit: MB. */ ++ double data_size; ++ ++ /* Method of calculating the data size. */ ++ calc_type calc_by; ++ ++ /* True if the info of the ref has been traced and recorded. */ ++ unsigned int trace_status_p : 1; ++ ++ /* True if the loop is vectorized. */ ++ unsigned int vectorize_p : 1; ++ ++ /* True if the memory reference is shared. 
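++ That is, it may be accessed by multiple parallel workers; such refs are weighted by PARALLEL_NUM when phase 5 computes the reuse level.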
*/ ++ unsigned int parallel_p : 1; ++ ++ /* True if the memory reference is regular. */ ++ unsigned int regular_p : 1; ++ ++ /* True if the memory reference is read. */ ++ unsigned int read_p : 1; ++ ++ data_ref () ++ { ++ ref = NULL_TREE; ++ stmt = NULL; ++ var = NULL_TREE; ++ base = NULL_TREE; ++ offset = NULL_TREE; ++ index = NULL_TREE; ++ step = NULL_TREE; ++ data_size = 0; ++ calc_by = UNHANDLE_CALC; ++ trace_status_p = false; ++ vectorize_p = false; ++ parallel_p = false; ++ regular_p = true; ++ read_p = true; ++ } ++}; ++ ++/* ================ phase 1 get_dense_memory_kernels ================ */ ++ ++/* Add ref node and print. */ ++ ++void ++add_ref (vector &references, tree op, gimple *stmt, ++ bool vectorize_p, bool read_p) ++{ ++ data_ref ref; ++ ref.ref = op; ++ ref.stmt = stmt; ++ ref.vectorize_p = vectorize_p; ++ ref.read_p = read_p; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ references.push_back (ref); ++} ++ ++/* Get the references from the simple call (vectorization type). */ ++ ++void ++get_references_in_gimple_call (gimple *stmt, vector &references) ++{ ++ if (gimple_code (stmt) != GIMPLE_CALL) ++ return; ++ ++ if (gimple_call_internal_p (stmt)) ++ { ++ bool read_p = false; ++ switch (gimple_call_internal_fn (stmt)) ++ { ++ case IFN_MASK_GATHER_LOAD: ++ case IFN_MASK_LOAD: ++ { ++ if (gimple_call_lhs (stmt) == NULL_TREE) ++ return; ++ read_p = true; ++ // FALLTHRU ++ } ++ case IFN_MASK_STORE: ++ { ++ /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); ++ ++ _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); ++ ++ _1 = (sizetype) a_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, ++ { 0.0, ... }, loop_mask_5); ++ */ ++ tree op1 = gimple_call_arg (stmt, 0); ++ if (TREE_CODE (op1) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "get_references_in_gimple_call: "); ++ fprintf (dump_file, "find base that not ssa_name: "); ++ print_generic_expr (dump_file, op1, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ gimple *op1_def = SSA_NAME_DEF_STMT (op1); ++ if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN) ++ { ++ /* &MEM[base: xx] */ ++ tree rhs1 = gimple_assign_rhs1 (op1_def); ++ /* If the definition stmt of the operation is memory ++ reference type, read it directly. */ ++ if (TREE_CODE (rhs1) == ADDR_EXPR ++ && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF) ++ op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */ ++ } ++ ++ add_ref (references, op1, stmt, true, read_p); ++ return; ++ } ++ default: ++ return; ++ } ++ } ++} ++ ++/* Stores the locations of memory references in STMT to REFERENCES. 
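++ E.g. _1 = b[i] is recorded as a read (read_p = true) and a[i] = _2 as a write (read_p = false).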
*/ ++ ++void ++get_references_in_stmt (gimple *stmt, vector<data_ref> &references) ++{ ++ if (!gimple_vuse (stmt)) ++ return; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "gimple_vuse: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ tree op0 = gimple_assign_lhs (stmt); ++ tree op1 = gimple_assign_rhs1 (stmt); ++ tree base = NULL_TREE; ++ ++ /* _1 = MEM[base: a, index: i, step: 8, offset: 0B]; */ ++ if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1)) ++ && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base)) ++ add_ref (references, op1, stmt, false, true); ++ ++ if (REFERENCE_CLASS_P (op0) && get_base_address (op0)) ++ add_ref (references, op0, stmt, false, false); ++ } ++ else if (gimple_code (stmt) == GIMPLE_CALL) ++ get_references_in_gimple_call (stmt, references); ++ ++ return; ++} ++ ++/* Flags recording why a loop is filtered out. */ ++ ++struct loop_filter_out_flag ++{ ++ /* Use external gimple. */ ++ bool use_ext_gimple; ++ ++ /* Use external call. */ ++ bool use_ext_call; ++ ++ /* Use external node. */ ++ bool use_ext_node; ++ ++ /* Use loop defined in macros. */ ++ bool use_macro_loop; ++ ++ /* Use conditional functions (VEC_COND_EXPR/MIN_EXPR/MAX_EXPR). */ ++ bool use_cond_func; ++}; ++ ++/* Check whether an external node is used. */ ++ ++bool use_ext_node_p (const vector<data_ref> &references, ++ unsigned int &start) ++{ ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ ++ unsigned i = start; ++ start = references.size (); ++ for (; i < references.size (); i++) ++ { ++ data_ref ref = references[i]; ++ expanded_location xloc = expand_location (ref.stmt->location); ++ if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "use_ext_node\n\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Determine whether to filter out loops by stmt. */ ++ ++bool ++filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, ++ const vector<data_ref> &references, unsigned int &start) ++{ ++ /* check use_ext_gimple. */ ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ expanded_location xloc = expand_location (stmt->location); ++ if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_ext_gimple: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_ext_gimple = true; ++ return true; ++ } ++ ++ /* check use_ext_call. */ ++ if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_ext_call: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_ext_call = true; ++ return true; ++ } ++ ++ /* check use_macro_loop. */ ++ if (xloc.file && xloc.column != 1) ++ loop_filter.use_macro_loop = false; ++ ++ /* check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ enum tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR ++ || rhs_code == MAX_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_cond_func: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_cond_func = true; ++ return true; ++ } ++ } ++ ++ /* check use_ext_node. 
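++ That is, check whether any reference recorded so far comes from a different source file than the current function (see use_ext_node_p above).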
*/ ++ if (use_ext_node_p (references, start)) ++ { ++ loop_filter.use_ext_node = true; ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Dump which flag caused the loop to be filtered out. */ ++ ++void ++dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) ++{ ++ if (loop_filter.use_ext_gimple) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_gimple\n"); ++ } ++ if (loop_filter.use_ext_call) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_call\n"); ++ } ++ ++ if (loop_filter.use_ext_node) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_node\n"); ++ } ++ ++ if (loop_filter.use_macro_loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); ++ } ++ ++ if (loop_filter.use_cond_func) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_cond_func\n"); ++ } ++} ++ ++/* Get references in loop. */ ++ ++bool ++get_references_in_loop (vector<data_ref> &references, ++ loop_filter_out_flag &loop_filter, ++ class loop *loop) ++{ ++ unsigned int start = 0; ++ bool filter_out_loop = true; ++ ++ /* Analyze each bb in the loop. */ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ gimple_stmt_iterator bsi; ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ gimple *stmt = gsi_stmt (bsi); ++ get_references_in_stmt (stmt, references); ++ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, ++ references, start); ++ if (filter_out_loop) ++ break; ++ } ++ if (filter_out_loop) ++ break; ++ } ++ free (body); ++ return !filter_out_loop; ++} ++ ++/* Determine whether the loop is a single path. */ ++ ++bool ++single_path_p (class loop *loop, basic_block bb) ++{ ++ if (bb == NULL) ++ return false; ++ if (bb == loop->latch) ++ return true; ++ ++ gimple *stmt = last_stmt (bb); ++ bool res = false; ++ ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ gcc_assert (EDGE_COUNT (bb->succs) == 2); ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ /* Return false if a branch occurs. */ ++ if (true_edge->dest->loop_father == loop ++ && false_edge->dest->loop_father == loop) ++ return false; ++ ++ if (true_edge->dest->loop_father == loop) ++ res = single_path_p (loop, true_edge->dest); ++ else ++ res = single_path_p (loop, false_edge->dest); ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ res = single_path_p (loop, e->dest); ++ } ++ return res; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. ++ Assume that the HPC data reading and calculation process does not involve ++ adding branches in loops. Therefore, all bbs of loops are directly used for ++ calculation (excluding embedded loops) without considering branch weighting. 
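++ The count feeds dense_memory_p, where mem_to_insn_ratio = ref_count / ninsns is compared against param_mem_access_ratio (a percentage); e.g. with --param mem-access-ratio=5, a 40-insn loop body is memory dense once it holds at least 2 references (2/40 = 5%) and also satisfies ref_count >= param_mem_access_num.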
++*/ ++ ++unsigned ++estimate_loop_insns (class loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body (loop); ++ gimple_stmt_iterator gsi; ++ unsigned size = 0, i; ++ ++ for (i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ size += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ free (body); ++ ++ return size; ++} ++ ++/* Check whether the memory access is dense. */ ++ ++bool ++dense_memory_p (const vector<data_ref> &references, class loop *loop) ++{ ++ int ref_count = references.size (); ++ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); ++ float mem_to_insn_ratio = (float)ref_count / (float)ninsns; ++ ++ /* The number of cores to be run and DDR bandwidth information can be ++ transferred to flexibly adjust the threshold. */ ++ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) ++ && ref_count >= param_mem_access_num); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ ++ /* Dump dense memory source code location. */ ++ if (ref_count && references[0].stmt->location) ++ { ++ expanded_location xloc = expand_location ++ (references[0].stmt->location); ++ int fn_start = 0; ++ if (DECL_SOURCE_LOCATION (current_function_decl)) ++ fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = fn_start; ++ if (cfun->function_end_locus) ++ fn_end = expand_location (cfun->function_end_locus).line; ++ if (xloc.file) ++ fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ", ++ xloc.file, fn_name, fn_start, fn_end, ++ xloc.line, xloc.column); ++ } ++ ++ /* Dump memory dense information. */ ++ if (dense_mem) ++ fprintf (dump_file, "dense memory access: "); ++ else ++ fprintf (dump_file, "non-dense mem access: "); ++ fprintf (dump_file, ++ "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n", ++ ref_count, ninsns, mem_to_insn_ratio); ++ } ++ ++ return dense_mem; ++} ++ ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ ++bool ++get_dense_memory_kernels (vector<class loop *> &kernels, ++ map<class loop *, vector<data_ref> > &kernels_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); ++ class loop *loop = NULL; ++ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) ++ { ++ number_of_latch_executions (loop); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n========== Processing loop %d: ==========\n", ++ loop->num); ++ loop_dump (dump_file, loop); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "loop unroll: %d\n", loop->unroll); ++ } ++ ++ if (get_loop_exit_edges (loop).length () != 1 ++ || !single_path_p (loop, loop->header)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: loop_branching\n"); ++ continue; ++ } ++ ++ vector<data_ref> references; ++ loop_filter_out_flag loop_filter = {false, false, false, true, false}; ++ ++ if (!get_references_in_loop (references, loop_filter, loop)) ++ { ++ dump_loop_filter_out_flag (loop_filter); ++ continue; ++ } ++ ++ if (dense_memory_p (references, loop)) ++ { ++ kernels_refs[loop] = references; ++ kernels.push_back (loop); ++ } ++ } ++ return kernels.size () > 0; ++} ++ ++/* ================ phase 2 trace_data_refs_info ================ */ ++ ++/* Determine whether the declaration is a non-vectorized 
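++ declaration, i.e. a VAR_DECL, PARM_DECL or COMPONENT_REF whose type chain never reaches a VECTOR_TYPE.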
*/ ++ ++bool ++generic_decl_p (tree expr) ++{ ++ if (expr == NULL_TREE) ++ return false; ++ enum tree_code expr_code = TREE_CODE (expr); ++ if (expr_code != VAR_DECL && expr_code != PARM_DECL ++ && expr_code != COMPONENT_REF) ++ return false; ++ ++ tree type = TREE_TYPE (expr); ++ while (type) ++ { ++ if (TREE_CODE (type) != VECTOR_TYPE) ++ /* TREE_TYPE (NODE) ( ++ CONTAINS_STRUCT_CHECK (NODE, TS_TYPED)->typed.type) */ ++ type = CONTAINS_STRUCT_CHECK (type, TS_TYPED) ? TREE_TYPE (type) : NULL; ++ else ++ return false; ++ } ++ return true; ++} ++ ++/* Initial worklist preparation for source variable tracing. ++ Add different initial node based on different gimple statements. */ ++ ++void ++add_worklist (vector &worklist, set &walked, gimple *def_stmt) ++{ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) ++ { ++ tree node = gimple_phi_arg_def (def_stmt, i); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR ++ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME ++ || rhs_code == COMPONENT_REF) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ node = gimple_assign_rhs2 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else ++ { ++ /* unhandled assign rhs_code: _219 = _17 * _70; ++ _17 = *grid_56(D).sst.span; ++ _70 = *grid_56(D).sst.dim[0].stride; ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled assign rhs_code: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unsupported tracing stmt: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++ ++/* Tracing source variables: ++ vectp.1 = a_2(D) + _3; ++ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; ++ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); ++ ++ _1 = (sizetype) b_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, ++ loop_mask_5); ++ ... ++ Due to previous pass optimizations, the current tracing method can find ++ several source variable candidates. We decide to record them in a map and ++ later filter out the true base variable by some criteria. ++*/ ++ ++void ++trace_base_var_helper (tree arg, set &walked, ++ map& base_var_candid) ++{ ++ if (arg == NULL) ++ return; ++ ++ /* Array type. */ ++ tree op0 = NULL; ++ if (TREE_CODE (arg) == ADDR_EXPR ++ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "array type\n"); ++ base_var_candid[op0] += 1; ++ return; ++ } ++ ++ /* Pointer type. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "pointer type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* SSA_NAME type. 
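++ The base was copied through an SSA temporary; use its SSA_NAME_VAR when that is a pointer declaration, otherwise keep walking the definition chain via add_worklist.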
*/ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return; ++ ++ tree tmp_var = SSA_NAME_VAR (arg); ++ if (tmp_var && generic_decl_p (tmp_var) ++ && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ssa pointer type\n"); ++ base_var_candid[tmp_var] += 1; ++ return; ++ } ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ vector worklist; ++ add_worklist (worklist, walked, def_stmt); ++ for (unsigned i = 0; i < worklist.size (); ++i) ++ trace_base_var_helper (worklist[i], walked, base_var_candid); ++} ++ ++/* Identify the base variable traced from base address of memory reference. ++ We recognize that current method could detect several base variable ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1. The number of base variable candidates is 1; ++ 2. The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ ++ ++bool ++trace_base_var (tree &var, tree arg, set &walked) ++{ ++ map base_var_candid; ++ trace_base_var_helper (arg, walked, base_var_candid); ++ bool is_tracing_unusual = false; ++ if (base_var_candid.size () == 1) ++ var = base_var_candid.begin ()->first; ++ else ++ { ++ is_tracing_unusual = true; ++ for (const pair& base_var_count : base_var_candid) ++ if (base_var_count.second == 1) ++ var = base_var_count.first; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Traced variables at "); ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, ":\n"); ++ for (const pair& base_var_count : base_var_candid) ++ fprintf (dump_file, "%s:%d, ", get_name (base_var_count.first), ++ base_var_count.second); ++ fprintf (dump_file, "\n"); ++ ++ if (var == NULL_TREE) ++ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); ++ else if (is_tracing_unusual && var != NULL_TREE) ++ fprintf (dump_file, "Tracing unusual number or occurrences of base " ++ "variables. Choose %s.\n", get_name (var)); ++ } ++ return var != NULL_TREE; ++} ++ ++/* Tracing direct memory reference information. */ ++ ++bool ++trace_direct_mem_ref (data_ref &mem_ref, set &traced_ref_stmt) ++{ ++ if (TREE_CODE (mem_ref.ref) != TARGET_MEM_REF) ++ return false; ++ ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF. */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ return false; ++ ++ traced_ref_stmt.insert (mem_ref.stmt); ++ return true; ++} ++ ++/* Recursively trace and check whether the definition stmt of the ++ index operand is a recorded stmt in direct access tracing. ++ If true, it is an indirect access. 
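++ E.g. in _4 = c[_1] the index _1 is defined by an already-recorded MEM load, so c[_1] is classified as indirect.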
*/ ++ ++bool ++trace_indirect_operand (tree arg, set &traced_ref_stmt) ++{ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return false; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ if (traced_ref_stmt.count (def_stmt)) ++ return true; ++ ++ if (!def_stmt || !is_gimple_assign (def_stmt)) ++ return false; ++ ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array ++ type indirect memory access. Please check examples before function ++ trace_indirect_ptr and trace_indirect_array. */ ++ if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR ++ && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR ++ && rhs_code != ARRAY_REF) ++ return false; ++ ++ tree op = NULL_TREE; ++ ssa_op_iter iter; ++ FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) ++ { ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ return true; ++ } ++ return false; ++} ++ ++/* Trace the pointer of the indirect memory access: ++ 1) obtain the base address of the indirect memory access. ++ 2) ensure that the index has been traced in the direct memory access. ++ ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in ++ direct access ++ _4 = (long unsigned int) _1; ++ _5 = _4 * 8; ++ _6 = p(D) + _5; // get base ++ _7 = *_6; // start tracing ++*/ ++ ++bool ++trace_indirect_ptr (tree &base, tree &index, tree arg, ++ set traced_ref_stmt) ++{ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ if (!def_stmt || !is_gimple_assign (def_stmt)) ++ return false; ++ ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code != POINTER_PLUS_EXPR) ++ return false; ++ ++ /* POINTER_PLUS_EXPR, The first operand is always a pointer/reference type. ++ The second operand is always an unsigned integer type compatible with ++ sizetype. */ ++ base = gimple_assign_rhs1 (def_stmt); ++ index = gimple_assign_rhs2 (def_stmt); ++ ++ return trace_indirect_operand (index, traced_ref_stmt); ++} ++ ++/* Trace the array of the indirect memory access: ++ 1) obtain the base address of the indirect memory access. ++ 2) ensure that the index has been traced in the direct memory access. ++ ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in ++ direct access ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (tree &base, tree &index, ++ set traced_ref_stmt, tree ref) ++{ ++ if (TREE_CODE (ref) != ARRAY_REF) ++ return false; ++ base = TREE_OPERAND (ref, 0); ++ index = TREE_OPERAND (ref, 1); ++ return trace_indirect_operand (index, traced_ref_stmt); ++} ++ ++/* Tracing indirect memory reference information. ++ Include tracing of base addresses and source variable. ++ _x(ssa name) -> a_2(base addr) -> a(src var) */ ++ ++bool ++trace_indirect_mem_ref (data_ref &mem_ref, ++ set &traced_ref_stmt) ++{ ++ /* Processing of vectorization types. */ ++ if (mem_ref.vectorize_p) ++ { ++ tree op = gimple_call_arg (mem_ref.stmt, 1); ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ { ++ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.regular_p = false; ++ set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ return false; ++ return true; ++ } ++ return false; ++ } ++ ++ /* Processing of non-vectorized types. 
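++ Scan each SSA use operand of the statement and try both the array form (trace_indirect_array) and the pointer form (trace_indirect_ptr) shown above.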
*/ ++ tree op = NULL_TREE; ++ ssa_op_iter iter; ++ FOR_EACH_SSA_TREE_OPERAND (op, mem_ref.stmt, iter, SSA_OP_USE) ++ { ++ ++ /* Array type: ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = c[_1]; ++ ++ Pointer type: ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (long unsigned int) _1; ++ _5 = _4 * 8; ++ _6 = p(D) + _5; ++ _7 = *_6; ++ */ ++ tree base = NULL_TREE; ++ tree index = NULL_TREE; ++ if (trace_indirect_array (base, index, traced_ref_stmt, mem_ref.ref) ++ || trace_indirect_ptr (base, index, op, traced_ref_stmt)) ++ { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. */ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; ++ set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ return false; ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Trace references base info: ++ 1) Parallel analysis ++ 2) Memory access rule analysis ++ 3) Tracing base address and source variable of memory references ++ We will extend parallel analysis later. ++*/ ++ ++void ++trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) ++{ ++ enum tree_code ref_code = TREE_CODE (mem_ref.ref); ++ if (/* Vectorized and non-vectorized direct access. */ ++ ref_code != TARGET_MEM_REF ++ /* non-vectorized indirect memory access. */ ++ && ref_code != MEM_REF && ref_code != ARRAY_REF ++ /* vectorized indirect memory access. */ ++ && ref_code != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref is another tree-code: "); ++ fprintf (dump_file, "stmt: "); ++ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "ref: "); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ ++ /* 1) Direct and indirect access traces and traces source variables. */ ++ if (!trace_direct_mem_ref (mem_ref, traced_ref_stmt) ++ && !trace_indirect_mem_ref (mem_ref, traced_ref_stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++void ++trace_data_refs_info (vector &kernels, ++ map > &loop_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); ++ ++ set traced_ref_stmt; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop* loop = kernels[i]; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", j); ++ print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (loop_refs[loop][j], traced_ref_stmt); ++ } ++ } ++} ++ ++/* ================ phase 3 analyze_nested_kernels ================ */ ++ ++/* Return the inner most type for arrays and pointers of TYPE. */ ++ ++tree ++inner_type (tree type) ++{ ++ while (POINTER_TYPE_P (type) ++ || TREE_CODE (type) == ARRAY_TYPE) ++ type = TREE_TYPE (type); ++ return type; ++} ++ ++/* Check whether the input iv is the loop dimension boundary. 
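++ A boundary iv is defined by a two-argument PHI whose inputs come from the current loop and from the immediately enclosing loop, e.g. # i_1 = PHI <i_7(latch bb), i_0(outer bb)>.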
*/ ++ ++bool ++loop_bound_iv_p (tree t, tree &outer_loop_t) ++{ ++ if (t == NULL || TREE_CODE (t) != SSA_NAME ++ || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE) ++ return false; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ if (gimple_code (def_stmt) != GIMPLE_PHI) ++ return false; ++ ++ /* Filter scenarios with only two phi inputs. */ ++ if (gimple_phi_num_args (def_stmt) != 2) ++ return false; ++ ++ gphi *phi_stmt = as_a (def_stmt); ++ basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src; ++ basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src; ++ ++ class loop *loop = loop_containing_stmt (def_stmt); ++ bool res = false; ++ /* Two phi inputs, one from the current loop and one from the outer loop. */ ++ if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop))) ++ { ++ outer_loop_t = gimple_phi_arg_def (def_stmt, 1); ++ res = true; ++ } ++ else if ((src1->loop_father == loop) ++ && (src0->loop_father == loop_outer (loop))) ++ { ++ outer_loop_t = gimple_phi_arg_def (def_stmt, 0); ++ res = true; ++ } ++ ++ if (res) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "===> "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ return true; ++ } ++ return false; ++} ++ ++/* add worklist and walked list. */ ++ ++void ++add_worklist_walked (vector &worklist, set &walked, tree node) ++{ ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ /* Avoid phi node cycle introduction, which makes the worklist unable ++ to end. */ ++ walked.insert (node); ++ } ++} ++ ++/* check bound iv and add worklist. */ ++ ++void ++check_bound_iv_and_add_worklist (vector &worklist, set &walked, ++ tree t, data_ref &mem_ref) ++{ ++ if (TREE_CODE (t) != SSA_NAME) ++ return; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, t, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ tree out_loop_t = NULL_TREE; ++ if (loop_bound_iv_p (t, out_loop_t)) ++ { ++ mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); ++ add_worklist_walked (worklist, walked, out_loop_t); ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ ++ /* unary. */ ++ if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ else if (rhs_code == POINTER_PLUS_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ ++ /* binary. */ ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR ++ || rhs_code == MULT_EXPR) ++ { ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ } ++ } ++} ++ ++/* DFS trace the loop bound of iv. */ ++ ++bool ++trace_loop_bound_iv (data_ref &mem_ref) ++{ ++ /* Indirect memory access, the size cannot be determined based on the loop ++ boundary. */ ++ if (!mem_ref.regular_p) ++ return false; ++ ++ /* Determine and record the boundary iv of the current index, ++ but do not trace it. 
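++ Then DFS the base-address chain, pushing every boundary iv found into mem_ref.loop_bounds, one entry per access dimension.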
*/ ++ tree outer_loop_t = NULL_TREE; ++ if (loop_bound_iv_p (mem_ref.index, outer_loop_t)) ++ mem_ref.loop_bounds.push_back ( ++ loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); ++ ++ vector worklist; ++ worklist.push_back (mem_ref.base); ++ set walked; ++ ++ while (worklist.size ()) ++ { ++ tree t = worklist.back (); ++ worklist.pop_back (); ++ ++ /* add worklist. */ ++ check_bound_iv_and_add_worklist (worklist, walked, t, mem_ref); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nmem_ref access dimension: %ld\n", ++ mem_ref.loop_bounds.size ()); ++ ++ return mem_ref.loop_bounds.size () > 0; ++} ++ ++/* dump loop bound. */ ++ ++void ++loop_bound_dump (FILE *file, loop_bound &lb) ++{ ++ class loop *loop = lb.loop; ++ fprintf (file, "loop_bound: loop_%d (", loop->num); ++ if (loop->header) ++ fprintf (file, "header = %d", loop->header->index); ++ else ++ { ++ fprintf (file, "deleted)\n"); ++ return; ++ } ++ if (loop->latch) ++ fprintf (file, ", latch = %d", loop->latch->index); ++ fprintf (file, ", lb_niters = "); ++ print_generic_expr (file, lb.niters); ++ fprintf (file, ")\n"); ++} ++ ++/* static calculate data size. */ ++ ++void ++static_calculate_data_size (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nstatic_calculate_data_size\n"); ++ ++ tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ HOST_WIDE_INT est_niter = tree_to_uhwi (mem_ref.loop_bounds[i].niters); ++ unsigned int unroll = mem_ref.loop_bounds[i].unroll; ++ if (i == 0) ++ { ++ /* The unit conversion between byte, kilobytes, and megabytes is ++ 1024. */ ++ mem_ref.data_size = double (type_size ++ * est_niter * unroll) / 1024 / 1024; ++ } ++ else ++ mem_ref.data_size *= est_niter * unroll; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size); ++ } ++} ++ ++/* Recursive tracing and creating of dominant nodes. 
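++ Rebuild EXPR from values whose definitions dominate the header of OUTERMOST, re-expanding unary and binary assignments recursively, so the resulting bound expression can be evaluated before the kernel is entered.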
*/ ++ ++tree ++trace_and_create_dominate_expr (tree expr, class loop *outermost) ++{ ++ if (expr == NULL_TREE || is_gimple_constant (expr)) ++ return expr; ++ ++ if (TREE_CODE (expr) != SSA_NAME) ++ return NULL_TREE; ++ ++ if (SSA_NAME_IS_DEFAULT_DEF (expr)) ++ return expr; ++ ++ gimple *stmt = SSA_NAME_DEF_STMT (expr); ++ basic_block def_bb = gimple_bb (stmt); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return NULL_TREE; ++ ++ if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb)) ++ return expr; ++ ++ if (gimple_code (stmt) != GIMPLE_ASSIGN) ++ return NULL_TREE; ++ ++ enum tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ tree_code_class code_class = TREE_CODE_CLASS (rhs_code); ++ tree type = TREE_TYPE (gimple_assign_lhs (stmt)); ++ tree rhs1 = trace_and_create_dominate_expr ++ (gimple_assign_rhs1 (stmt), outermost); ++ if (rhs1 == NULL_TREE) ++ return NULL_TREE; ++ ++ if (code_class == tcc_unary) ++ { ++ tree expr_new = build1 (rhs_code, type, rhs1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ else if (code_class == tcc_binary) ++ { ++ tree rhs2 = trace_and_create_dominate_expr ++ (gimple_assign_rhs2 (stmt), outermost); ++ if (rhs2 == NULL_TREE) ++ return NULL_TREE; ++ ++ tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Recursive parsing and creating of nodes in expr expressions. */ ++ ++tree ++parse_and_create_expr (tree expr, class loop *outermost) ++{ ++ if (expr == NULL_TREE || expr == chrec_dont_know ++ || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR) ++ { ++ /* tcc_expression (e.g., &q) situation combined with tcc_unary. */ ++ if (expr != NULL_TREE && TREE_CODE (expr) == ADDR_EXPR && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "tcc_expression case in ADDR_EXPR: "); ++ print_generic_expr (dump_file, expr, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr; ++ } ++ ++ if (TREE_CODE (expr) == SSA_NAME) ++ return trace_and_create_dominate_expr (expr, outermost); ++ else if (EXPR_P (expr)) ++ { ++ enum tree_code tree_code = TREE_CODE (expr); ++ tree_code_class code_class = TREE_CODE_CLASS (tree_code); ++ tree type = TREE_TYPE (expr); ++ tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost); ++ if (op1 == NULL_TREE) ++ return NULL_TREE; ++ ++ if (code_class == tcc_unary) ++ { ++ tree expr_new = build1 (tree_code, type, op1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ else if (code_class == tcc_binary) ++ { ++ tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost); ++ if (op2 == NULL_TREE) ++ return NULL_TREE; ++ ++ tree expr_new = fold_build2 (tree_code, type, op1, op2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ } ++ return NULL_TREE; ++} ++ ++/* Trace and create dominate loop bounds. 
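++ That is, rewrite each dimension's niters via parse_and_create_expr relative to the outermost recorded loop (loop_bounds.back ()).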
*/ ++ ++void ++trace_and_create_dominate_loop_bounds (data_ref &mem_ref) ++{ ++ /* Check whether the niters expression dominates the loop. If not, trace it ++ and, if a dominating equivalent can be found, create the expr from ++ dominating nodes. ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n"); ++ ++ /* Relate the boundary of each inner dimension to an expression that ++ dominates the outermost loop, and process it accordingly. */ ++ loop_bound &outermost = mem_ref.loop_bounds.back (); ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ loop_bound &current = mem_ref.loop_bounds[i]; ++ tree &niters = current.niters; ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ ++ niters = parse_and_create_expr (niters, outermost.loop); ++ ++ if (niters == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "Tracing loop bound failed at dimension %d", ++ i); ++ } ++ mem_ref.calc_by = UNHANDLE_CALC; ++ break; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ } ++} ++ ++/* Trace the dimension and corresponding loop bounds of mem_ref. ++ This function is used to supplement the information of mem_ref.loop_bounds. ++*/ ++ ++void ++trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) ++{ ++ /* Within the same loop, some memory references have fewer access ++ dimensions than others and are removed. The loop filtering conditions ++ and the memory access nodes were recorded and traced earlier; a failed ++ trace result is also handled here. ++ */ ++ if (dump_file) ++ fprintf (dump_file, "\ncalculate_data_size\n"); ++ ++ /* Trace the loop bound iv of ref to determine the dimension. */ ++ /* Record data from the loop perspective to avoid repeated tracing. */ ++ if (!trace_loop_bound_iv (mem_ref)) ++ return; ++ ++ /* The traced mem_ref may have multiple dimensions, which correspond to ++ multiple loops. */ ++ /* In the dimension-by-dimension analysis, the set of computable cases is ++ continuously narrowed. */ ++ mem_ref.calc_by = STATIC_CALC; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ class loop *loop = mem_ref.loop_bounds[i].loop; ++ tree &niters = mem_ref.loop_bounds[i].niters; ++ ++ /* Set NULL_TREE to ensure that nb_iterations are retraced and ++ vec_nb_iterations are also extracted. 
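++ number_of_latch_executions is called with guarantee=false so that the unguaranteed result is not cached back into loop->nb_iterations.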
*/ ++ loop->nb_iterations = NULL_TREE; ++ niters = number_of_latch_executions (loop, false); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_dump (dump_file, loop); ++ ++ if (loop->unroll) ++ { ++ if (loop->unroll == USHRT_MAX && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX); ++ mem_ref.loop_bounds[i].unroll = loop->unroll; ++ } ++ ++ if ((niters == chrec_dont_know) && loop->vec_nb_iterations ++ && (loop->vec_nb_iterations != chrec_dont_know)) ++ niters = loop->vec_nb_iterations; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ ++ if (niters == NULL_TREE || niters == chrec_dont_know) ++ mem_ref.calc_by = min (mem_ref.calc_by, UNHANDLE_CALC); ++ else if (TREE_CODE (niters) != INTEGER_CST) ++ mem_ref.calc_by = min (mem_ref.calc_by, RUNTIME_CALC); ++ else ++ mem_ref.calc_by = min (mem_ref.calc_by, STATIC_CALC); ++ } ++ ++ if (mem_ref.calc_by == RUNTIME_CALC) ++ trace_and_create_dominate_loop_bounds (mem_ref); ++ else if (mem_ref.calc_by == STATIC_CALC) ++ static_calculate_data_size (mem_ref); ++} ++ ++/* analyze nested kernels. ++ 1. multidimension loop analyze. ++ 2. extended outer loop analyze. ++ Later we will extend outer loop analysis. ++*/ ++ ++bool ++analyze_nested_kernels (vector &kernels, ++ map > &loop_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop* loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n\nloop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ if (loop_refs[loop][j].trace_status_p == false) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\ntrace_reference_dimension at mem_ref " ++ "index %d in loop %d:\n", j, loop->num); ++ print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (loop_refs[loop][j]); ++ } ++ ++ } ++ return true; ++} ++ ++/* ================ phase 4 filter_and_sort_kernels ================ */ ++ ++/* Get the edge probability information of each basic block in the loop. */ ++ ++float ++get_edge_prob (edge e, float minimum) ++{ ++ float fvalue = 0; ++ ++ profile_probability probability = e->probability; ++ if (probability.initialized_p ()) ++ { ++ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); ++ if (fvalue < minimum && probability.to_reg_br_prob_base ()) ++ fvalue = minimum; ++ } ++ return fvalue; ++} ++ ++/* Get the next bb with a high branch probability. */ ++ ++basic_block ++next_high_probability_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ return NULL; ++ ++ /* Limit the minimum probability value. */ ++ const float MINNUM_PROB = 0.00001f; ++ float minimum = MINNUM_PROB; ++ ++ gimple *stmt = last_stmt (bb); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ float true_edge_prob = get_edge_prob (true_edge, minimum); ++ float false_edge_prob = get_edge_prob (false_edge, minimum); ++ /* If the content of the branch does not include the candidate ++ kernel, the branch probability may not be limited. 
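++ The limit itself comes from param_branch_prob_threshold, interpreted as a percentage in the comparison below.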
*/ ++ /* The edge_prob may have precision error during static prediction, ++ so we need to relax the limit before comparison. */ ++ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) ++ return true_edge->dest; ++ else if ((false_edge_prob >= (param_branch_prob_threshold / 100.0) ++ - minimum) && flow_bb_inside_loop_p (bb->loop_father, ++ false_edge->dest)) ++ return false_edge->dest; ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No high probability bb:"); ++ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", ++ bb->index, true_edge_prob, false_edge_prob); ++ } ++ return NULL; ++ } ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ return e->dest; ++ } ++ return NULL; ++} ++ ++ ++/* Dump loop header bb. */ ++ ++void ++dump_loop_headers (const char *name, vector &loops) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } ++} ++ ++/* Combine and sort candidate loops. */ ++ ++bool ++filter_and_sort_kernels (vector &sorted_kernels, ++ vector &kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ set end_bb; ++ list walked_header_bb; /* Used to record nested loops. */ ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ end_bb.insert (kernels[i]->header); ++ ++ dump_loop_headers ("kernels", kernels); ++ ++ if (!param_filter_kernels) ++ { ++ for (vector::iterator it = kernels.begin (); ++ it != kernels.end (); ++it) ++ sorted_kernels.push_back (*it); ++ } ++ else ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ while (bb) ++ { ++ if (bb == NULL) ++ return false; ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ /* bb is not the head of the loop, go to the next. */ ++ if (bb != bb->loop_father->header) ++ { ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ ++ /* bb is the head of the loop. */ ++ if (bb != walked_header_bb.back ()) ++ { ++ if (end_bb.count (bb)) ++ { ++ sorted_kernels.push_back (bb->loop_father); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ if (loop_outer (bb->loop_father) != NULL ++ && get_loop_exit_edges (bb->loop_father).length () != 1) ++ return false; ++ walked_header_bb.push_back (bb); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ else ++ { ++ walked_header_bb.pop_back (); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ } ++ } ++ ++ dump_loop_headers ("sorted_kernels", sorted_kernels); ++ return true; ++} ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; ++ ++ ++/* Memory reference group, different reference of the same variable. */ ++ ++struct ref_group ++{ ++ /* source variables. */ ++ tree var; ++ ++ /* variable size, Unit: MB. */ ++ double var_size; ++ ++ /* first ref for insert hint. 
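++ After sort_mem_ref_in_ref_group this is the reference with the lowest source line, unless param_mem_ref_index selects another one (see get_data_ref_at_idx).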
*/ ++ data_ref first_use; ++ ++ /* reuse scores of variables. */ ++ unsigned int reuse_level; ++ ++ /* method of calculating the var size. */ ++ calc_type calc_by; ++ ++ /* memory reference index for specific variable. */ ++ unsigned int mem_ref_index; ++ ++ /* Accessing Reference Records in Different Modes (key_index): ++ 000: write, random, non-parallel ++ 001: write, random, parallel ++ 010: write, regular, non-parallel ++ 011: write, regular, parallel ++ 100: read, random, non-parallel ++ 101: read, random, parallel ++ 110: read, regular, non-parallel ++ 111: read, regular, parallel ++ */ ++ map > ref_use; ++ ++ /* scores for different memory references. */ ++ vector ref_scores; ++ ++ ref_group () ++ { ++ var = NULL_TREE; ++ var_size = 0; ++ reuse_level = 0; ++ calc_by = UNHANDLE_CALC; ++ mem_ref_index = 0; ++ } ++}; ++ ++/* calculate reuse level. */ ++ ++unsigned int ++calculate_reuse_level (map > &var_use) ++{ ++ unsigned int level = 0; ++ for (map >::iterator it = var_use.begin (); ++ it != var_use.end (); ++it) ++ { ++ unsigned int parallel = 1; ++ unsigned int regular = 1; ++ unsigned int cost = 1; ++ ++ if ((*it).second[0].parallel_p) ++ parallel = PARALLEL_NUM; ++ if (!(*it).second[0].regular_p) ++ regular = INDIRECT_ACCESS_VALUE; ++ if (!(*it).second[0].read_p) ++ cost = WRITE_COST; ++ ++ /* In serial reuse, we will later check whether they are in the ++ same cacheline. If yes, delete the reuse. For details, see the ++ reuse analysis of prefetching and eliminate redundancy. */ ++ unsigned int add = parallel * ((*it).second.size () * (cost + regular)); ++ level += add; ++ if (add && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d : %d * (%ld * (%d + %d)) = %d\n", ++ (*it).first, parallel, (*it).second.size (), cost, regular, add); ++ } ++ return level; ++} ++ ++/* Comparison of reference reuse level. */ ++ ++bool ++ref_group_reuse_cmp (const ref_group &a, const ref_group &b) ++{ ++ return a.reuse_level > b.reuse_level; ++} ++ ++/* Sort reference groups. */ ++ ++void ++sort_ref_groups (vector &ref_groups, ++ map &ref_groups_map) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); ++ ++ for (map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use); ++ ref_groups.push_back ((*it).second); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); ++ fprintf (dump_file, " : %d\n", (*it).second.reuse_level); ++ } ++ } ++ ++ sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nsorted ref_groups:\n"); ++ fprintf (dump_file, "rank var (data_size, num_of_mem_ref, need_tmp_name):" ++ " reuse_level_score\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ int need_tmp_name = !get_name (ref_groups[i].var) ? 
1 : 0; ++ fprintf (dump_file, " (%lf, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].ref_scores.size (), need_tmp_name); ++ fprintf (dump_file, " : %d\n", ref_groups[i].reuse_level); ++ } ++ fprintf (dump_file, "\n"); ++ ++ fprintf (dump_file, "first_use:\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ fprintf (dump_file, " : "); ++ if (!ref_groups[i].first_use.vectorize_p) ++ print_generic_expr (dump_file, ref_groups[i].first_use.ref, ++ TDF_SLIM); ++ else ++ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Attributes of variable data. */ ++ ++enum data_attribute ++{ ++ DA_PARALLEL = 0, ++ DA_REGULAR, ++ DA_READ ++}; ++ ++/* Record memory reference by use mode. ++ If the reference group is not found, create a group. */ ++ ++void ++record_mem_ref (map &ref_groups, data_ref &mem_ref) ++{ ++ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) ++ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); ++ ++ if (!ref_groups.count (mem_ref.var)) ++ { ++ ref_group ref_group; ++ ref_group.var = mem_ref.var; ++ ref_group.first_use = mem_ref; ++ ref_groups[mem_ref.var] = ref_group; ++ } ++ ++ /* Ref_groups' calc_by depends on the inserted mem_ref's calc_by. ++ Runtime issue requires the specified mem_ref's calc_by to be >= 1. ++ Temporarily modified ref_group's first_use after sorting mem_refs. */ ++ ref_groups[mem_ref.var].calc_by = max (ref_groups[mem_ref.var].calc_by, ++ mem_ref.calc_by); ++ ref_groups[mem_ref.var].var_size = max (ref_groups[mem_ref.var].var_size, ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); ++ ++ ref_score ref_level{ mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ expand_location (mem_ref.stmt->location).line }; ++ ref_groups[mem_ref.var].ref_scores.push_back (ref_level); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "recorded in: "); ++ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); ++ fprintf (dump_file, ":%d:%ld\n", index, ++ ref_groups[mem_ref.var].ref_use[index].size () - 1); ++ ++ fprintf (dump_file, "base: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ ++ fprintf (dump_file, ", index: "); ++ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); ++ ++ fprintf (dump_file, ", step: "); ++ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.step)); ++ else ++ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); ++ ++ fprintf (dump_file, ", offset: "); ++ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.offset)); ++ else ++ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); ++ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); ++ ++ fprintf (dump_file, ", size: %lf", mem_ref.data_size); ++ fprintf (dump_file, "\n\n"); ++ } ++} ++ ++/* Rank data reference index level by the scheme of source code line number. */ ++ ++bool ++data_ref_reuse_cmp (const ref_score &a, const ref_score &b) ++{ ++ return a.line < b.line; ++} ++ ++/* Sort data reference index level within one reference group in non-decreasing ++ order of the customized sorting scheme. 
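++ The current scheme is source line number (data_ref_reuse_cmp); the sort is stable, so references on the same line keep their discovery order.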
*/ ++ ++void ++sort_mem_ref_in_ref_group (map &ref_groups_map) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nsorted data_references:\n"); ++ for (map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ vector &ref_scores = (*it).second.ref_scores; ++ stable_sort (ref_scores.begin (), ref_scores.end (), data_ref_reuse_cmp); ++ /* Update ref_group's first_use and calc_by with the first mem_ref after ++ sorting. */ ++ (*it).second.first_use = (*it).second.ref_scores[0].d_ref; ++ (*it).second.calc_by = (*it).second.first_use.calc_by; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, " : %lu\n", ref_scores.size ()); ++ for (unsigned int i = 0; i < ref_scores.size (); ++i) ++ { ++ fprintf (dump_file, "mem_ref_index %u: ", i); ++ print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, ++ TDF_LINENO); ++ } ++ fprintf (dump_file, "\n\n"); ++ } ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++bool ++record_and_sort_ref_groups (vector &ref_groups, ++ vector &kernels, ++ map > &loop_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); ++ ++ map ref_groups_map; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop* loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ if (loop_refs[loop][j].trace_status_p) ++ record_mem_ref (ref_groups_map, loop_refs[loop][j]); ++ } ++ } ++ ++ /* Sort mem_ref within ref_group by local count and update first_use's ++ data_ref, stable sort. */ ++ sort_mem_ref_in_ref_group (ref_groups_map); ++ sort_ref_groups (ref_groups, ref_groups_map); ++ ++ return ref_groups.size () > 0; ++} ++ ++/* ================ phase 6 issue_llc_hint ================ */ ++ ++/* Issue vectorized mask prefetch gimple. */ ++ ++void ++issue_mask_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd.\n"); ++ ++ /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); ++ .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); ++ */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree scale = gimple_call_arg (stmt, 1); ++ tree final_mask = gimple_call_arg (stmt, 2); ++ tree target = NULL_TREE; ++ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) ++ target = gimple_call_arg (stmt, 3); ++ else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) ++ target = gimple_call_lhs (stmt); ++ /* 4: PLDL3KEEP. */ ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ /* target: vector_type - XXX_type. 
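++ TYPE_SIZE_UNIT of the vector element type gives the element size in bytes, so the prefetch address lands param_prefetch_offset elements ahead of dataref_ptr.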
++
++/* Issue a vectorized mask gather prefetch gimple.  */
++
++void
++issue_mask_gather_prefetch (gimple *stmt)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "insert svprfd_gather_uxindex.\n");
++
++  /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... },
++				        loop_mask_4);  */
++  tree dataref_ptr = gimple_call_arg (stmt, 0);
++  tree vec_offset = gimple_call_arg (stmt, 1);
++  tree scale = gimple_call_arg (stmt, 2);
++  tree zero = gimple_call_arg (stmt, 3);
++  tree final_mask = gimple_call_arg (stmt, 4);
++  tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4);
++  tree target = gimple_call_lhs (stmt);
++
++  /* Add the prefetch distance to the address.  */
++  gimple_stmt_iterator si = gsi_for_stmt (stmt);
++  if (target == NULL_TREE)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "unhandled scenario: target vector is null\n");
++      return;
++    }
++  HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi
++    (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target))));
++  tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance);
++  addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true,
++				   NULL, true, GSI_SAME_STMT);
++
++  gcall *call = gimple_build_call_internal
++    (IFN_MASK_GATHER_PREFETCH, 7, addr,
++     vec_offset, scale, zero, final_mask, target, prfop);
++  gsi_insert_after (&si, call, GSI_SAME_STMT);
++  update_ssa (TODO_update_ssa_only_virtuals);
++}
++
++/* Issue a builtin prefetch gimple.  */
++
++void
++issue_builtin_prefetch (data_ref &mem_ref)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "insert prfm.\n");
++  /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B]  */
++  gimple *stmt = mem_ref.stmt;
++  tree dataref_ptr = mem_ref.base;
++  tree data_idx = mem_ref.index;
++  tree scale = mem_ref.step;
++  tree offset = mem_ref.offset;
++  /* Add the prefetch distance to the address.  */
++  gimple_stmt_iterator si = gsi_for_stmt (stmt);
++  if (scale == NULL_TREE)
++    {
++      /* _190 = (void *) ivtmp.444_221;
++	 Cannot detect the size unit at (void *).  */
++      scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
++      if (scale == NULL_TREE)
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "ERROR: Unknown size unit for the prefetching "
++		     "variable.  Stop builtin_prefetch.\n\n");
++	  return;
++	}
++    }
++
++  data_idx = data_idx ? data_idx : size_zero_node;
++  data_idx = build1 (NOP_EXPR, TREE_TYPE (scale), data_idx);
++  tree displacement = fold_build2 (MULT_EXPR, TREE_TYPE (scale), data_idx,
++				   scale);
++  if (offset != NULL_TREE && TREE_CODE (offset) != TREE_CODE (size_zero_node))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "WARNING: offset's TREE_CODE is not INTEGER_CST: "
++		 "%s\nStop builtin_prefetch.\n",
++		 get_tree_code_name (TREE_CODE (offset)));
++      return;
++    }
++  offset = offset ? offset : size_zero_node;
++  offset = build1 (NOP_EXPR, TREE_TYPE (scale), offset);
++  dataref_ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr),
++			     dataref_ptr, offset);
++  tree addr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr),
++			   dataref_ptr, displacement);
++  HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi (scale);
++
++  addr = fold_build_pointer_plus_hwi (addr, distance);
++  addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true,
++				   NULL, true, GSI_SAME_STMT);
++  /* __builtin_prefetch (_68, 0, 1);
++     1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal
++     locality (high means strong locality).  */
++  gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
++				   3, addr, integer_zero_node, integer_one_node);
++  gsi_insert_after (&si, call, GSI_SAME_STMT);
++  update_ssa (TODO_update_ssa_only_virtuals);
++}
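At source level (editorial example, not part of the patch), the gimple call built at the end of issue_builtin_prefetch corresponds to the form below: the second argument selects read (0) vs. write (1), and the third the temporal-locality level, where high means strong locality per the comment above. The array and the 1024-element distance are illustrative.

#include <stddef.h>

/* What the inserted hint looks like in C: prefetch a[] a fixed distance
   ahead of the current index while streaming through it.  */
double
sum (const double *a, size_t n)
{
  double s = 0.0;
  for (size_t i = 0; i < n; i++)
    {
      __builtin_prefetch (&a[i + 1024], 0, 1);  /* read, locality level 1 */
      s += a[i];
    }
  return s;
}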
++
++/* Retrieve the memory reference at the specified index.  */
++
++data_ref
++get_data_ref_at_idx (ref_group &var_ref_group)
++{
++  unsigned int mem_ref_size = static_cast<unsigned int> (
++    var_ref_group.ref_scores.size ());
++  if (strlen (param_mem_ref_index) == 0)
++    return var_ref_group.first_use;
++  else
++    {
++      /* Insert the prefetch hint at the highly-likely-used location with
++	 the given index.  */
++      if (var_ref_group.mem_ref_index >= mem_ref_size)
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "WARNING: The target data_ref index is out "
++		     "of range.  Use the top index instead!\n");
++	  return var_ref_group.ref_scores[0].d_ref;
++	}
++      return var_ref_group.ref_scores[var_ref_group.mem_ref_index].d_ref;
++    }
++}
++
++/* Static-form insertion and instruction issue.  We may need to check for
++   the ARM SVE architecture before SVE hint insertion.  */
++
++void
++static_issue (vector<ref_group> &ref_groups, int num_issue_var)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "static issue\n");
++
++  for (int i = 0; i < num_issue_var; ++i)
++    {
++      data_ref mem_ref = get_data_ref_at_idx (ref_groups[i]);
++      if (mem_ref.vectorize_p)
++	{
++	  enum internal_fn ifn_code = gimple_call_internal_fn
++	    (mem_ref.stmt);
++	  if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD)
++	    issue_mask_prefetch (mem_ref.stmt);
++	  else if (ifn_code == IFN_MASK_GATHER_LOAD)
++	    issue_mask_gather_prefetch (mem_ref.stmt);
++	  else
++	    if (dump_file && (dump_flags & TDF_DETAILS))
++	      fprintf (dump_file, "other vectorized internal function\n");
++	}
++      else
++	issue_builtin_prefetch (mem_ref);
++    }
++}
++
++/* Generate the stmts for calculating the size.  Later we will consider
++   nested multi-branch scenarios and check more information of niters
++   when it is a COND_EXPR.  */
++
++tree
++calc_stmts_gen (vector<ref_group> &ref_groups, gimple_seq &cond_expr_stmt_list,
++		int num_issue_var)
++{
++  /* Accumulated keep size.  */
++  tree total_size = build_real_from_int_cst
++    (double_type_node, integer_zero_node);
++  for (int i = 0; i < num_issue_var; ++i)
++    {
++      data_ref &mem_ref = ref_groups[i].first_use;
++      tree var = mem_ref.var;
++      for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j)
++	{
++	  tree niters = mem_ref.loop_bounds[j].niters;
++
++	  /* COND_EXPR.  */
++	  if (TREE_CODE (niters) == COND_EXPR)
++	    niters = TREE_OPERAND (niters, 1);
++	  tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var)));
++	  /* _190 = (void *) ivtmp.444_221;
++	     Cannot detect the size unit at (void *).  */
++	  if (unit == NULL_TREE)
++	    {
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "WARNING: Cannot detect size unit "
++			   "(use 1 byte) for variable %s: ", get_name (var));
++		  print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
++		  fprintf (dump_file, "\n");
++		}
++	      unit = size_one_node;
++	    }
++	  unit = build1 (NOP_EXPR, TREE_TYPE (niters), unit);
++	  tree size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, unit);
++	  size = build1 (FLOAT_EXPR, double_type_node, size);
++	  total_size = fold_build2
++	    (PLUS_EXPR, double_type_node, total_size, size);
++	}
++    }
++  /* Create a stmt list for the size calculation (in MiB).  */
++  tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024);
++  div = build1 (NOP_EXPR, double_type_node, div);
++  total_size = fold_build2 (RDIV_EXPR, double_type_node, total_size, div);
++
++  tree threshold = build_int_cst (TREE_TYPE (integer_zero_node),
++				  param_llc_capacity_per_core / 2);
++  threshold = build_real_from_int_cst (double_type_node, threshold);
++  tree cond_expr = fold_build2
++    (LE_EXPR, boolean_type_node, total_size, threshold);
++
++  /* Convert cond_expr to a stmt list.  */
++  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
++				      &cond_expr_stmt_list,
++				      is_gimple_condexpr, NULL_TREE);
++  return cond_expr;
++}
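A self-contained model of the test calc_stmts_gen emits (editorial sketch, not part of the patch): sum niters * unit over the traced loop bounds, convert to MiB via the 1024 * 1024 divisor, and compare against half of param_llc_capacity_per_core. The assumption that the parameter is expressed in MiB, and all the numbers, are mine.

#include <cstdio>

/* Model of the runtime condition: the top-N footprint, in MiB, must fit
   in half of the per-core LLC budget (assumed here to be given in MiB).  */
static bool
fits_llc (const unsigned long long niters[], const unsigned long long unit[],
	  int n, double llc_capacity_per_core_mib)
{
  double total_mib = 0.0;
  for (int i = 0; i < n; i++)
    total_mib += (double) (niters[i] * unit[i]) / (1024.0 * 1024.0);
  return total_mib <= llc_capacity_per_core_mib / 2.0;
}

int
main ()
{
  unsigned long long niters[] = { 1000000, 500000 };
  unsigned long long unit[] = { 8, 8 };	/* two arrays of doubles */
  /* ~11.4 MiB total against a 32 MiB budget -> the hint is issued.  */
  printf ("%d\n", fits_llc (niters, unit, 2, 32.0));
  return 0;
}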
++
++/* Runtime-form insertion and instruction issue.  */
++
++void
++runtime_issue (vector<ref_group> &ref_groups, int num_issue_var)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "runtime issue\n");
++
++  if (ref_groups.size () == 0)
++    return;
++  data_ref &mem_ref = ref_groups[0].first_use;
++  class loop *loop = mem_ref.loop_bounds.back ().loop;
++  /* Ensure that all variables are in the same loop.  */
++  for (int i = 1; i < num_issue_var; ++i)
++    {
++      data_ref &mem_ref = ref_groups[i].first_use;
++      if (loop != mem_ref.loop_bounds.back ().loop)
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "top N vars are not in the same loop\n");
++	  return;
++	}
++    }
++  if (loop == NULL)
++    return;
++
++  /* If the exit edge points to a bb with multiple predecessors, split the
++     exit edge and create a new bb, so that the exit edge points to a bb
++     with a single predecessor.  */
++  edge e = single_exit (loop);
++  if (e == NULL)
++    return;
++  if (!single_pred_p (e->dest))
++    {
++      split_loop_exit_edge (e, true);
++      if (dump_enabled_p ())
++	dump_printf (MSG_NOTE, "split exit edge\n");
++    }
++
++  gimple_seq cond_expr_stmt_list = NULL;
++  tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list,
++				   num_issue_var);
++
++  /* Use the previous cond to generate a new branch and copy the loop.  */
++  basic_block condition_bb = NULL;
++  profile_probability prob = profile_probability::likely ();
++  initialize_original_copy_tables ();
++  class loop *nloop = loop_version (loop, cond_expr, &condition_bb,
++				    prob, prob.invert (), prob,
++				    prob.invert (), true);
++  free_original_copy_tables ();
++
++  /* Insert the generated stmt list before cond_expr.  */
++  gimple_stmt_iterator cond_exp_gsi;
++  if (cond_expr_stmt_list)
++    {
++      cond_exp_gsi = gsi_last_bb (condition_bb);
++      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++			     GSI_SAME_STMT);
++    }
++  update_ssa (TODO_update_ssa);
++
++  /* Perform hint issue for the branch that meets the condition.  */
++  static_issue (ref_groups, num_issue_var);
++}
++
++/* Issue llc hints through prefetch instructions.  */
++
++void
++issue_llc_hint (vector<ref_group> &ref_groups)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "issue_llc_hint:\n");
++
++  /* 1. If the issue-topn and force-issue options are given, the top N vars
++	are forcibly allocated and no runtime branch is generated.
++     2. If the issue-topn option is given and the size of the top N vars is
++	statically known, the top N are statically allocated and no runtime
++	branch is generated.
++     3. If the issue-topn option is given and the size of the top N vars is
++	unknown at compile time but can be determined at run time, the top N
++	are dynamically allocated and runtime branches are generated (this
++	also depends on the screening of the innermost variable boundary
++	type).
++     4. If the size cannot be determined even at run time, e.g., for
++	indirect accesses, the optimization is skipped.
++  */
++  if (ref_groups.size () == 0)
++    return;
++
++  int num_issue_var = min (param_issue_topn,
++			   static_cast<int> (ref_groups.size ()));
++  if (num_issue_var < param_issue_topn
++      && dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "WARNING: Only %d (less than param_issue_topn = %d) "
++	       "ref_group(s) found for llc hint.\n",
++	       num_issue_var, param_issue_topn);
++    }
++  if (param_force_issue)
++    {
++      if (strlen (param_target_variables) > 0)
++	static_issue (ref_groups, static_cast<int> (ref_groups.size ()));
++      else
++	static_issue (ref_groups, num_issue_var);
++      return;
++    }
++  calc_type topn_calc_type = STATIC_CALC;
++  for (int i = 0; i < num_issue_var; ++i)
++    topn_calc_type = min (topn_calc_type, ref_groups[i].calc_by);
++
++  if (topn_calc_type == STATIC_CALC)
++    {
++      /* Before the static issue, we still need to collect the data size of
++	 all target variables and compare the sum with the LLC cache size.  */
++      double prefetch_data_size = 0.;
++      for (int i = 0; i < num_issue_var; ++i)
++	prefetch_data_size += ref_groups[i].var_size;
++      if (prefetch_data_size <= (double) param_llc_capacity_per_core * 0.8)
++	static_issue (ref_groups, num_issue_var);
++      else
++	if (dump_file && (dump_flags & TDF_DETAILS))
++	  fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache "
++		   "size: %lf > %lf.\n", prefetch_data_size,
++		   (double) param_llc_capacity_per_core * 0.8);
++    }
++  else if (topn_calc_type == RUNTIME_CALC)
++    runtime_issue (ref_groups, num_issue_var);
++  else
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "unhandled issue scenario\n");
++    }
++}
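Conceptually (editorial sketch, not part of the patch), after runtime_issue calls loop_version the control flow has the shape below; only the version guarded by the footprint test receives hints from static_issue. All names, the threshold, and the distance are illustrative.

/* Before: one loop.  After versioning: two copies guarded by the
   runtime footprint test; only the "fits" copy carries LLC hints.  */
void
kernel_versioned (double *a, long n, double footprint_mib, double budget_mib)
{
  if (footprint_mib <= budget_mib / 2.0)
    {
      /* Versioned copy: prefetch hints issued here.  */
      for (long i = 0; i < n; i++)
	{
	  __builtin_prefetch (&a[i + 1024], 0, 1);
	  a[i] *= 2.0;
	}
    }
  else
    {
      /* Original copy: left untouched.  */
      for (long i = 0; i < n; i++)
	a[i] *= 2.0;
    }
}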
++
++/* ==================== phase entry ==================== */
++/* Check whether a string can be converted to an unsigned integer.  */
++
++bool
++is_unsigned_int (const string &s)
++{
++  if (s.empty () || s.size () > PREFETCH_TOOL_NUM_MAX_LEN)
++    return false;
++
++  for (unsigned int i = 0; i < s.size (); ++i)
++    {
++      if (s[i] < '0' || s[i] > '9')
++	return false;
++    }
++  return true;
++}
++
++/* Parse a substring separated by comma.  If the substring is valid and
++   non-empty, store it as a parsed element.  */
++
++bool
++parse_string_helper (const string &substr, vector<string> &str_elts,
++		     bool check_unsigned, size_t start, size_t end)
++{
++  if (substr == "" && dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "WARNING: The input string from %lu to %lu is "
++	     "empty.\n", start, end);
++  else if (check_unsigned && !is_unsigned_int (substr))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "ERROR: not an unsigned integer: %s\n",
++		 substr.c_str ());
++      str_elts.clear ();
++      return false;
++    }
++  else
++    str_elts.push_back (substr);
++  return true;
++}
++
++/* Parse a comma-separated user input string.  */
++
++void
++parse_string (const string &s, vector<string> &str_elts,
++	      bool check_unsigned = false)
++{
++  string delim = ",";
++  size_t start = 0;
++  size_t end = s.find (delim);
++  string substr = s.substr (start, end - start);
++  while (end != string::npos)
++    {
++      if (!parse_string_helper (substr, str_elts, check_unsigned, start, end))
++	return;
++      start = end + delim.size ();
++      end = s.find (delim, start);
++      substr = s.substr (start, end - start);
++    }
++  parse_string_helper (substr, str_elts, check_unsigned, start, end);
++}
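A standalone equivalent for quick reference (editorial sketch, not part of the patch): parse_string splits on ',' and, when check_unsigned is set, a single bad element discards the whole list, since the helper clears str_elts.

#include <cstdio>
#include <string>
#include <vector>

using std::string;
using std::vector;

/* Same contract as parse_string: split S on commas; if CHECK_UNSIGNED,
   any non-digit element invalidates the entire result.  */
static vector<string>
split_csv (const string &s, bool check_unsigned)
{
  vector<string> out;
  size_t start = 0;
  while (true)
    {
      size_t end = s.find (',', start);
      string sub = s.substr (start, end - start);
      if (check_unsigned
	  && sub.find_first_not_of ("0123456789") != string::npos)
	return {};	/* Reject everything, like the pass does.  */
      if (!sub.empty ())
	out.push_back (sub);
      if (end == string::npos)
	break;
      start = end + 1;
    }
  return out;
}

int
main ()
{
  for (const string &e : split_csv ("12,7,3", true))
    printf ("%s\n", e.c_str ());
  return 0;
}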
++
++/* Parse the user inputs of target variables and memory indices and create
++   a map that assigns each target variable a memory index.  */
++
++void
++parse_param_inputs (map<string, unsigned int> &var2mem_idx)
++{
++  /* The user inputs are subject to a length limit.  */
++  if ((strlen (param_target_variables) >= PREFETCH_TOOL_INPUT_MAX_LEN
++       || strlen (param_mem_ref_index) >= PREFETCH_TOOL_INPUT_MAX_LEN)
++      && dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "INVALID INPUT: The user inputs for target variables "
++	     "and/or memory reference indices are too long for parsing.\n");
++
++  vector<string> var_names;
++  string target_variables = param_target_variables;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Start parsing target variables:\n");
++  if (param_use_ref_group_index)
++    parse_string (target_variables, var_names, true);
++  else
++    parse_string (target_variables, var_names, false);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Finish parsing target variables.\n\n");
++
++  vector<string> var_mem_indices;
++  string mem_indices = param_mem_ref_index;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Start parsing memory reference indices:\n");
++  parse_string (mem_indices, var_mem_indices, true);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Finish parsing memory reference indices.\n\n");
++
++  /* Construct a map of var_name: var_mem_index.  */
++  if (var_names.size () > 0)
++    {
++      if (var_mem_indices.size () < var_names.size ())
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "WARNING: The number of provided memory "
++		     "reference indices is less than that of target "
++		     "variables.\nUse the top index for all variables "
++		     "instead.\n");
++	  for (string &var_name : var_names)
++	    var2mem_idx[var_name] = 0;
++	}
++      else
++	{
++	  if (var_mem_indices.size () > var_names.size ()
++	      && dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "WARNING: The number of target variables is "
++		     "less than that of memory reference indices.\n");
++	  for (unsigned int i = 0; i < var_names.size (); ++i)
++	    {
++	      var2mem_idx[var_names[i]] = static_cast<unsigned int> (
++		atoi (var_mem_indices[i].c_str ()));
++	    }
++	}
++    }
++}
++
++/* Filter reference groups by only selecting target variables from the user
++   input.  There are two options for specifying prefetching target variables:
++   1. Specify the variable name parsed by the pass, which you can double-check
++      in the "sorted ref_groups" section of the dump file.
++   2. Specify the variable rank exhibited in the "sorted ref_groups" section
++      of the dump file.
++*/
++
++void
++prefetch_variables (const vector<ref_group> &ref_groups,
++		    vector<ref_group> &reduced_ref_groups)
++{
++  map<unsigned int, unsigned int> ref_group2mem_idx;
++
++  map<string, unsigned int> var2mem_idx;  /* Externally defined.  */
++  parse_param_inputs (var2mem_idx);
++
++  if (param_use_ref_group_index)
++    {
++      /* Use the ref_group index in the "sorted ref_groups" section to
++	 specify a variable.  */
++      /* Collect the variables in "reduced_ref_groups" only if their indices
++	 show up in the "sorted ref_groups" section.  */
++      for (const pair<string, unsigned int> &var_mem_idx : var2mem_idx)
++	{
++	  unsigned int var_idx = static_cast<unsigned int> (atoi (
++	    var_mem_idx.first.c_str ()));
++	  if (var_idx < ref_groups.size ())
++	    ref_group2mem_idx[var_idx] = var_mem_idx.second;
++	  else if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "WARNING: The index \"%u\" does not show "
++		     "up in the ref_groups.\n", var_idx);
++	}
++    }
++  else
++    {
++      /* Use the variable name shown in the "sorted ref_groups" section to
++	 specify a variable:
++	 var2ref_group_idx + var2mem_idx -> ref_group2mem_idx.  */
++      /* Create a map that assigns each variable name its corresponding
++	 ref_group index.  */
++      map<string, unsigned int> var2ref_group_idx;  /* Internally detected.  */
++      for (unsigned int i = 0; i < ref_groups.size (); ++i)
++	{
++	  const ref_group &curr_ref_group = ref_groups[i];
++	  const int UINT_MAX_DIGIT = 10;
++	  /* Unrecognizable variable name related to the ref_group.  */
++	  if (!get_name (curr_ref_group.var))
++	    {
++	      /* If the variable name does not have a string representation,
++		 we rename it as "tmp_var_" + <ref_group index>.  */
++	      char group_idx[UINT_MAX_DIGIT + 1];  /* +1 for the NUL byte.  */
++	      sprintf (group_idx, "%u", i);
++	      string tmp_var_name = "tmp_var_" + std::string (group_idx);
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "Unrecognizable variable name at "
++			   "ref_group index %u.\nThe tree expression for the "
++			   "variable is: ", i);
++		  print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
++		  fprintf (dump_file, "\n");
++		}
++	      var2ref_group_idx[tmp_var_name] = i;
++	    }
++	  else
++	    var2ref_group_idx[std::string (get_name (curr_ref_group.var))] = i;
++	}
++      /* Collect the variables in "reduced_ref_groups" only if they show up
++	 in the ref_groups.  */
++      for (const pair<string, unsigned int> &var_mem_idx : var2mem_idx)
++	{
++	  if (var2ref_group_idx.count (var_mem_idx.first))
++	    {
++	      unsigned int ref_group_idx = var2ref_group_idx[var_mem_idx.first];
++	      ref_group2mem_idx[ref_group_idx] = var_mem_idx.second;
++	    }
++	  else if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "WARNING: Target variable \" %s \" does "
++		     "not show up in the ref_groups.  Check whether it needs "
++		     "a temporary variable name.\n",
++		     var_mem_idx.first.c_str ());
++	}
++    }
++
++  for (const pair<unsigned int, unsigned int> &ref_group_mem_idx :
++       ref_group2mem_idx)
++    {
++      ref_group curr_ref_group = ref_groups[ref_group_mem_idx.first];
++      curr_ref_group.mem_ref_index = ref_group_mem_idx.second;
++      reduced_ref_groups.push_back (curr_ref_group);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "\nNOTICE: Prefetching target variable \" ");
++	  print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
++	  fprintf (dump_file, " \" at ref_group index %u and memory location "
++		   "index %u.\n", ref_group_mem_idx.first,
++		   ref_group_mem_idx.second);
++	}
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\n\n");
++}
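Worked example (editorial sketch, not part of the patch): parse_param_inputs pairs the i-th target variable with the i-th memory index, falling back to index 0 for every variable when too few indices are supplied. With the hypothetical inputs "foo,bar" and "2,0", "foo" maps to 2 and "bar" to 0.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

using namespace std;

int
main ()
{
  /* Hypothetical already-parsed user inputs.  */
  vector<string> var_names = { "foo", "bar" };
  vector<unsigned> mem_idx = { 2, 0 };

  map<string, unsigned> var2mem_idx;
  if (mem_idx.size () < var_names.size ())
    for (const string &v : var_names)
      var2mem_idx[v] = 0;		/* Fallback: top index for all.  */
  else
    for (size_t i = 0; i < var_names.size (); i++)
      var2mem_idx[var_names[i]] = mem_idx[i];

  for (const auto &p : var2mem_idx)
    printf ("%s -> %u\n", p.first.c_str (), p.second);
  return 0;
}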
++
++
++/* The LLC intelligent allocation consists of six phases.  */
++
++void
++llc_allocate (void)
++{
++  map<class loop *, vector<data_ref> > kernels_refs;
++  vector<class loop *> kernels;
++  if (!get_dense_memory_kernels (kernels, kernels_refs))
++    return;
++
++  trace_data_refs_info (kernels, kernels_refs);
++
++  if (!analyze_nested_kernels (kernels, kernels_refs))
++    return;
++
++  vector<class loop *> sorted_kernels;
++  if (!filter_and_sort_kernels (sorted_kernels, kernels))
++    return;
++
++  vector<ref_group> ref_groups;
++  if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs))
++    return;
++
++  if (strlen (param_target_variables) > 0)
++    {
++      /* If "param_target_variables" is not empty, we forcibly issue the
++	 parsed target variables.  */
++      param_force_issue = true;
++      vector<ref_group> reduced_ref_groups;
++      prefetch_variables (ref_groups, reduced_ref_groups);
++      issue_llc_hint (reduced_ref_groups);
++    }
++  else
++    issue_llc_hint (ref_groups);
++}
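To make the pipeline concrete (editorial example, not part of the patch): a dense, regular streaming kernel such as the sparse matrix-vector product below is the kind of code llc_allocate targets. Depending on whether the loop was masked-vectorized, phase 6 would insert an IFN_MASK_PREFETCH / IFN_MASK_GATHER_PREFETCH call or a __builtin_prefetch ahead of the hot access; whether this particular kernel qualifies depends on the phase-1 density screening.

/* Candidate kernel: regular streaming over val[] plus an indirect,
   gather-style access through col_idx[].  */
void
spmv (long nrows, const long *row_ptr, const long *col_idx,
      const double *val, const double *x, double *y)
{
  for (long i = 0; i < nrows; i++)
    {
      double s = 0.0;
      for (long j = row_ptr[i]; j < row_ptr[i + 1]; j++)
	s += val[j] * x[col_idx[j]];
      y[i] = s;
    }
}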
++
++/* Check whether the function is an operator overloading function.  */
++
++bool
++operator_func_p (function *fn)
++{
++  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
++
++  if (fn_name && strncmp (fn_name, "operator", 8) == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "operator_func: %s ", fn_name);
++
++      return true;
++    }
++  return false;
++}
++
++/* Check whether the function's file location is known.  */
++
++bool
++func_location_p (function *fn)
++{
++  expanded_location fn_decl_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++  expanded_location fn_xloc
++    = expand_location (fn->function_start_locus);
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "fn->function_start_locus = %d\n",
++	       fn->function_start_locus);
++      fprintf (dump_file, "fn_xloc.file = %s\n",
++	       fn_xloc.file ? fn_xloc.file : "NULL");
++      fprintf (dump_file, "fn_decl_xloc.file = %s\n",
++	       fn_decl_xloc.file ? fn_decl_xloc.file : "NULL");
++      fprintf (dump_file, "LOCATION_FILE (input_location) = %s\n",
++	       LOCATION_FILE (input_location) ? LOCATION_FILE (input_location)
++	       : "NULL");
++    }
++  if (fn_decl_xloc.file == NULL)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Function location unknown, skip analysis\n");
++      return false;
++    }
++  /* Newly generated functions, such as constant-propagation clones
++     (func.constprop ()), are filtered out.  */
++  if (LOCATION_FILE (input_location) != fn_decl_xloc.file)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Function location non-local, skip analysis\n");
++      return false;
++    }
++  return true;
++}
++
++/* Dump function information.  */
++
++void
++dump_function_info (function *fn)
++{
++  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "\nfn_name: %s\n", fn_name);
++      expanded_location cfun_xloc
++	= expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++      if (cfun_xloc.line)
++	{
++	  if (cfun_xloc.file)
++	    fprintf (dump_file, "[%s:%d:%d]\n",
++		     cfun_xloc.file, cfun_xloc.line, cfun_xloc.column);
++	}
++      fprintf (dump_file, "\n");
++      flow_loops_dump (dump_file, NULL, 1);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Dump parameters.  */
++
++void
++dump_param (void)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "LLC allocate parameters:\n");
++      fprintf (dump_file, "    block size: %d\n", param_l1_cache_line_size);
++      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
++	       param_l1_cache_size * 1024 / param_l1_cache_line_size,
++	       param_l1_cache_size);
++      fprintf (dump_file, "    L1 cache line size: %d\n",
++	       param_l1_cache_line_size);
++      fprintf (dump_file, "    L2 cache size: %d kB\n", param_l2_cache_size);
++      fprintf (dump_file, "    min mem_access_ratio: %d\n",
++	       param_mem_access_ratio);
++      fprintf (dump_file, "    min mem_access_num: %d\n",
++	       param_mem_access_num);
++      fprintf (dump_file, "\n");
++    }
++}
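A quick standalone check of the filter's behavior (editorial sketch, not part of the patch): operator_func_p matches by an 8-character name prefix, so any function whose name merely starts with "operator" is also skipped, not only genuine C++ operator overloads.

#include <cstdio>
#include <cstring>

static bool
operator_prefix_p (const char *fn_name)
{
  return fn_name && strncmp (fn_name, "operator", 8) == 0;
}

int
main ()
{
  printf ("%d\n", operator_prefix_p ("operator+"));	 /* 1: intended */
  printf ("%d\n", operator_prefix_p ("operator_table")); /* 1: also skipped */
  printf ("%d\n", operator_prefix_p ("main"));		 /* 0 */
  return 0;
}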
++
++const pass_data pass_data_llc_allocate =
++{
++  GIMPLE_PASS,		/* type.  */
++  "llc_allocate",	/* name.  */
++  OPTGROUP_LOOP,	/* optinfo_flags.  */
++  TV_TREE_PREFETCH,	/* tv_id.  */
++  (PROP_cfg | PROP_ssa),	/* properties_required.  */
++  0,			/* properties_provided.  */
++  0,			/* properties_destroyed.  */
++  0,			/* todo_flags_start.  */
++  0,			/* todo_flags_finish.  */
++};
++
++class pass_llc_allocate : public gimple_opt_pass
++{
++public:
++  pass_llc_allocate (gcc::context *ctxt)
++    : gimple_opt_pass (pass_data_llc_allocate, ctxt)
++  {}
++
++  /* opt_pass methods.  */
++  virtual bool gate (function *)
++  {
++    return (optimize >= 2 && flag_llc_allocate > 0);
++  }
++  virtual unsigned int execute (function *);
++
++}; // class pass_llc_allocate
++
++unsigned int
++pass_llc_allocate::execute (function *fn)
++{
++  unsigned int ret = 0;
++
++  if (!targetm.have_prefetch ()
++      || targetm.vectorize.code_for_prefetch == NULL
++      || targetm.vectorize.prefetch_handleable_mode_p == NULL
++      || targetm.vectorize.code_for_gather_prefetch == NULL)
++    return 0;
++
++  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
++    {
++      tree type = build_function_type_list (void_type_node,
++					    const_ptr_type_node, NULL_TREE);
++      tree decl = add_builtin_function ("__builtin_prefetch", type,
++					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
++					NULL, NULL_TREE);
++      DECL_IS_NOVOPS (decl) = true;
++      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
++    }
++
++  dump_param ();
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "llc_allocate: %s\n",
++	     IDENTIFIER_POINTER (DECL_NAME (fn->decl)));
++
++  if (number_of_loops (fn) <= 1 || !func_location_p (fn)
++      || operator_func_p (fn))
++    return ret;
++
++  dump_function_info (fn);
++
++  llc_allocate ();
++
++  return ret;
++}
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_llc_allocate (gcc::context *ctxt)
++{
++  return new pass_llc_allocate (ctxt);
++}
+diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c
+index 7775bc727..c500d5e20 100644
+--- a/gcc/tree-ssa-loop-niter.c
++++ b/gcc/tree-ssa-loop-niter.c
+@@ -2384,6 +2384,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit)
+   return true;
+ }
+ 
++/* Returns whether the number of vectorized iterations for the loop can be
++   estimated from the given IR, and updates the corresponding loop attribute,
++   e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... });  */
++
++bool
++number_of_iterations_vect (class loop *loop, tree lhs, tree rhs)
++{
++  loop->vec_nb_iterations = chrec_dont_know;
++
++  if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME)
++      || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME))
++    return false;
++
++  tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs;
++  gimple *def_stmt = SSA_NAME_DEF_STMT (ssa);
++
++  if (gimple_code (def_stmt) != GIMPLE_CALL
++      || !gimple_call_internal_p (def_stmt))
++    return false;
++
++  internal_fn ifn = gimple_call_internal_fn (def_stmt);
++  if (ifn != IFN_WHILE_ULT)
++    return false;
++
++  gcall *call = dyn_cast <gcall *> (def_stmt);
++  tree niters = gimple_call_arg (call, 1);
++  loop->vec_nb_iterations = niters;
++
++  return true;
++}
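For intuition (editorial sketch, not part of the patch): in a fully-masked vector loop the exit condition tests the mask produced by .WHILE_ULT (index, niters, ...), and number_of_iterations_vect records that niters operand in vec_nb_iterations. The scalar model below shows the implied latch trip count, ceil (niters / VF), for an assumed vectorization factor.

#include <cstdio>

/* Scalar model of .WHILE_ULT-controlled iteration: each vector iteration
   consumes up to VF lanes, so the latch runs ceil (niters / vf) times.  */
static unsigned long long
vect_latch_iterations (unsigned long long niters, unsigned long long vf)
{
  return (niters + vf - 1) / vf;
}

int
main ()
{
  /* 1000 scalar iterations at VF = 4 -> 250 masked vector iterations.  */
  printf ("%llu\n", vect_latch_iterations (1000, 4));
  return 0;
}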
++
+ 
+ /* Stores description of number of iterations of LOOP derived from
+    EXIT (an exit edge of the LOOP) in NITER.  Returns true if some useful
+    information could be derived (and fields of NITER have meaning described
+@@ -2454,6 +2485,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
+   op1 = gimple_cond_rhs (stmt);
+   type = TREE_TYPE (op0);
+ 
++  if (TREE_CODE (type) == VECTOR_TYPE)
++    number_of_iterations_vect (loop, op0, op1);
++
+   if (TREE_CODE (type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (type))
+     return false;
+@@ -2730,14 +2764,14 @@ bool
+ number_of_iterations_exit (class loop *loop, edge exit,
+ 			   class tree_niter_desc *niter,
+ 			   bool warn, bool every_iteration,
+-			   basic_block *body)
++			   basic_block *body, bool guarantee)
+ {
+   gcond *stmt;
+   if (!number_of_iterations_exit_assumptions (loop, exit, niter,
+ 					      &stmt, every_iteration, body))
+     return false;
+ 
+-  if (integer_nonzerop (niter->assumptions))
++  if (integer_nonzerop (niter->assumptions) || guarantee == false)
+     return true;
+ 
+   if (warn && dump_enabled_p ())
+diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
+index eb8d15794..d38472e52 100644
+--- a/gcc/tree-ssa-loop-niter.h
++++ b/gcc/tree-ssa-loop-niter.h
+@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body,
+ extern bool number_of_iterations_exit (class loop *, edge,
+ 				       class tree_niter_desc *niter, bool,
+ 				       bool every_iteration = true,
+-				       basic_block * = NULL);
++				       basic_block * = NULL,
++				       bool guarantee = true);
+ extern bool number_of_iterations_exit_assumptions (class loop *, edge,
+ 						   class tree_niter_desc *,
+ 						   gcond **, bool = true,
+-- 
+2.33.0
+
diff --git a/gcc.spec b/gcc.spec
index 0c11ad2..d80bb4c 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -61,7 +61,7 @@ Summary: Various compilers (C, C++, Objective-C, ...)
 Name: gcc
 Version: %{gcc_version}
-Release: 40
+Release: 41
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
 URL: https://gcc.gnu.org
 
@@ -259,6 +259,7 @@ Patch148: 0148-Introduce-RTL-ifcvt-enhancements.patch
 Patch149: 0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
 Patch150: 0150-Implement-propagation-of-permutations-in-fwprop.patch
 Patch151: 0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch
+Patch152: 0152-Add-LLC-Allocation-Pass.patch
 
 %global gcc_target_platform %{_arch}-linux-gnu
 
@@ -863,6 +864,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch149 -p1
 %patch150 -p1
 %patch151 -p1
+%patch152 -p1
 
 %build
 
@@ -2887,6 +2889,12 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Mon Dec 11 2023 Feiyang Liu - 10.3.1-41
+- Type:Spec
+- ID:NA
+- SUG:NA
+- DESC: Sync patch from openeuler/gcc
+
 * Wed Dec 6 2023 Wang Ding - 10.3.1-40
 - Type:Spec
 - ID:NA
-- 
Gitee