diff --git a/0356-add-llc-allocate-feature.patch b/0356-add-llc-allocate-feature.patch new file mode 100644 index 0000000000000000000000000000000000000000..e7c8e126ae3f9ac4549925bbfdd56e90571815c6 --- /dev/null +++ b/0356-add-llc-allocate-feature.patch @@ -0,0 +1,8452 @@ +From 43e93c6df874a0bf78675fb4d3586d9ad1cb7dac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Tue, 25 Feb 2025 16:27:36 +0800 +Subject: [PATCH 1/2] add llc allocate feature + +--- + gcc/Makefile.in | 1 + + gcc/auto-profile.cc | 491 +- + gcc/auto-profile.h | 30 + + gcc/builtins.cc | 82 + + gcc/builtins.def | 1 + + gcc/cfgloop.h | 3 + + gcc/common.opt | 28 + + gcc/config/aarch64/aarch64-protos.h | 6 +- + gcc/config/aarch64/aarch64-sve.md | 48 +- + gcc/config/aarch64/aarch64.cc | 18 + + gcc/config/aarch64/aarch64.md | 39 + + gcc/dce.cc | 1 + + gcc/doc/tm.texi | 21 + + gcc/doc/tm.texi.in | 6 + + gcc/internal-fn.cc | 115 + + gcc/internal-fn.def | 4 + + gcc/ipa-pure-const.cc | 1 + + gcc/optabs.def | 2 + + gcc/opts.cc | 52 +- + gcc/params.opt | 62 + + gcc/passes.def | 2 + + gcc/print-rtl.cc | 6 + + gcc/rtl.def | 9 + + gcc/rtl.h | 4 + + gcc/rtlanal.cc | 2 + + gcc/sched-deps.cc | 4 +- + gcc/target-insns.def | 1 + + gcc/target.def | 31 + + .../g++.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-relion-expand-kernels.C | 52 + + .../g++.dg/llc-allocate/multidim_array.h | 186 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-cross-bb-indir-mem-acc.c | 36 + + .../llc-allocate/llc-extend-outer-loop.c | 61 + + .../llc-feedback-branch-in-loop.c | 39 + + .../llc-allocate/llc-feedback-break-in-loop.c | 41 + + .../llc-allocate/llc-feedback-goto-in-loop.c | 50 + + .../llc-feedback-same-loop-cycle.c | 129 + + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + + .../llc-prefetch-full-pldl1keep.c | 14 + + .../llc-prefetch-full-pldl1strm.c | 14 + + .../llc-prefetch-full-pldl2keep.c | 14 + + .../llc-prefetch-full-pldl2strm.c | 16 + + .../llc-prefetch-full-pldl3keep.c | 14 + + .../llc-prefetch-full-pldl3strm.c | 14 + + .../llc-prefetch-full-pldl4keep.c | 14 + + .../llc-prefetch-full-pldl4strm.c | 14 + + .../llc-prefetch-full-pstl1keep.c | 14 + + .../llc-prefetch-full-pstl1strm.c | 14 + + .../llc-prefetch-full-pstl2keep.c | 14 + + .../llc-prefetch-full-pstl2strm.c | 14 + + .../llc-prefetch-full-pstl3keep.c | 14 + + .../llc-prefetch-full-pstl3strm.c | 14 + + .../llc-prefetch-full-pstl4keep.c | 14 + + .../llc-prefetch-full-pstl4strm.c | 14 + + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + + .../gfortran.dg/llc-allocate/llc-3.f90 | 211 + + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + + .../llc-trace-multiple-base-var.f90 | 62 + + .../llc-unknown-type-size-unit.f90 | 58 + + .../llc-allocate/llc-wrf-4-outer-loop-num.f90 | 320 ++ + gcc/timevar.def | 2 + + gcc/toplev.cc | 6 + + gcc/tree-cfg.cc | 11 + + gcc/tree-cfg.h | 1 + + gcc/tree-pass.h | 3 + + gcc/tree-scalar-evolution.cc | 8 +- + gcc/tree-scalar-evolution.h | 3 +- + gcc/tree-ssa-llc-allocate.cc | 4150 +++++++++++++++++ + gcc/tree-ssa-loop-niter.cc | 38 +- + gcc/tree-ssa-loop-niter.h | 3 +- + gcc/tree-vect-loop-manip.cc | 266 ++ + gcc/tree-vect-loop.cc | 10 +- + gcc/tree-vectorizer.h | 1 + + 76 files changed, 7308 insertions(+), 45 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C + create mode 100644 
gcc/testsuite/g++.dg/llc-allocate/multidim_array.h
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-break-in-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-goto-in-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-same-loop-cycle.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90
+ create mode 100644 gcc/tree-ssa-llc-allocate.cc
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 65f683bbd..ef7733580 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1659,6 +1659,7 @@ OBJS = \
+ 	tree-ssa-loop-niter.o \
+ 	tree-ssa-loop-array-widen-compare.o \
+ 	tree-ssa-loop-prefetch.o \
++	tree-ssa-llc-allocate.o \
+ 	tree-ssa-loop-split.o \
+ 	tree-ssa-loop-unswitch.o \
+ 	tree-ssa-loop.o \
+diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
+index 5e85381ce..97c3bafd5 100644
+--- a/gcc/auto-profile.cc
++++ b/gcc/auto-profile.cc
+@@ -49,6 +49,9 @@ along with GCC; see the file COPYING3.  If not see
+ #include "auto-profile.h"
+ #include "tree-pretty-print.h"
+ #include "gimple-pretty-print.h"
++#include <map>
++#include <set>
++#include <vector>
+ 
+ /* The following routines implements AutoFDO optimization.
+ 
+@@ -95,6 +98,8 @@ along with GCC; see the file COPYING3.  If not see
+  */
+ 
+ #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo"
++#define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov"
++#define DEFAULT_ADDITIONAL_PROFILE_FILE "addldata.gcov"
+ #define AUTO_PROFILE_VERSION 2
+ 
+ namespace autofdo
+ {
+@@ -117,6 +122,14 @@ private:
+   bool annotated_;
+ };
+ 
++/* Compare two <decl_uid, count> pairs by count, in descending order.  */
++static bool
++event_count_cmp (std::pair<unsigned, gcov_type> &a,
++		 std::pair<unsigned, gcov_type> &b)
++{
++  return a.second > b.second;
++}
++
+ /* Represent a source location: (function_decl, lineno).  */
+ typedef std::pair<tree, int> decl_lineno;
+ 
+@@ -311,6 +324,9 @@ public:
+   /* Mark LOC as annotated.  */
+   void mark_annotated (location_t loc);
+ 
++  /* Compute total count threshold of top functions in sampled data.  */
++  gcov_type calc_topn_function_total_count_thres (unsigned topn) const;
++
+ private:
+   /* Map from function_instance name index (in string_table) to
+      function_instance.  */
+@@ -338,6 +354,244 @@ static autofdo_source_profile *afdo_source_profile;
+ /* gcov_summary structure to store the profile_info.  */
+ static gcov_summary *afdo_profile_info;
+ 
++/* Check the profile flags and put the profile file names into EVENT_FILES.  */
++
++static bool
++get_all_profile_names (const char **event_files)
++{
++  if (!(flag_auto_profile
++	|| (flag_cache_misses_profile || flag_additional_profile)))
++    {
++      return false;
++    }
++
++  event_files[INST_EXEC] = auto_profile_file;
++
++  if (flag_cache_misses_profile)
++    {
++      if (cache_misses_profile_file == NULL)
++	{
++	  if (additional_profile_file == NULL)
++	    {
++	      additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE;
++	    }
++	  event_files[PMU_EVENT] = additional_profile_file;
++	}
++      event_files[CACHE_MISSES] = cache_misses_profile_file;
++    }
++  else if (flag_additional_profile)
++    {
++      if (additional_profile_file == NULL)
++	{
++	  additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE;
++	}
++      event_files[PMU_EVENT] = additional_profile_file;
++    }
++
++  return true;
++}
++
++static void read_profile (void);
++
++/* Maintain multiple profile data of different events with event_loc_count_map
++   and event_func_count_map.  */
++
++class extend_auto_profile
++{
++public:
++  bool auto_profile_exist (enum event_type type);
++  gcov_type get_loc_count (location_t, event_type);
++  gcov_type get_func_count (unsigned, event_type);
++  gcov_type get_topn_function_total_count_thres () const;
++  struct rank_info get_func_rank (unsigned, enum event_type);
++  /* There should be only one instance of class EXTEND_AUTO_PROFILE.  */
++  static extend_auto_profile *create ()
++  {
++    extend_auto_profile *map = new extend_auto_profile ();
++    if (map->read ())
++      {
++	return map;
++      }
++    delete map;
++    return NULL;
++  }
++private:
++  /* Basic maps of extend_auto_profile.  */
++  typedef std::map<location_t, gcov_type> loc_count_map;
++  typedef std::map<unsigned, gcov_type> func_count_map;
++
++  /* Map of function_uid to its descending order rank of counts.  */
++  typedef std::map<unsigned, unsigned> rank_map;
++
++  /* Mapping hardware events to corresponding basic maps.  */
++  typedef std::map<enum event_type, loc_count_map> event_loc_count_map;
++  typedef std::map<enum event_type, func_count_map> event_func_count_map;
++  typedef std::map<enum event_type, rank_map> event_rank_map;
++
++  extend_auto_profile () {}
++  bool read ();
++  void set_loc_count ();
++  void process_extend_source_profile ();
++  void read_extend_afdo_file (const char*, event_type);
++  void rank_all_func ();
++  void dump_event ();
++  event_loc_count_map event_loc_map;
++  event_func_count_map event_func_map;
++  event_rank_map func_rank;
++  event_type profile_type;
++  gcov_type topn_function_total_count_thres;
++};
++
++/* Member functions for extend_auto_profile. 
*/ ++ ++bool ++extend_auto_profile::auto_profile_exist (enum event_type type) ++{ ++ switch (type) ++ { ++ case INST_EXEC: ++ return event_func_map.count (INST_EXEC) != 0 ++ || event_loc_map.count (INST_EXEC) != 0; ++ case CACHE_MISSES: ++ return event_func_map.count (CACHE_MISSES) != 0 ++ || event_loc_map.count (CACHE_MISSES) != 0; ++ case PMU_EVENT: ++ return event_func_map.count (PMU_EVENT) != 0 ++ || event_loc_map.count (PMU_EVENT) != 0; ++ default: ++ return false; ++ } ++} ++ ++void ++extend_auto_profile::dump_event () ++{ ++ if (dump_file) ++ { ++ switch (profile_type) ++ { ++ case INST_EXEC: ++ fprintf (dump_file, "Processing event instruction execution.\n"); ++ break; ++ case CACHE_MISSES: ++ fprintf (dump_file, "Processing event cache misses.\n"); ++ break; ++ case PMU_EVENT: ++ fprintf (dump_file, "Processing other PMU events.\n"); ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++/* Return true if any profile data was read. */ ++ ++bool ++extend_auto_profile::read () ++{ ++ const char *event_files[EVENT_NUMBER] = {NULL}; ++ if (!get_all_profile_names (event_files)) ++ { ++ return false; ++ } ++ ++ /* Backup AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE since we will create ++ new ones for each event_type. */ ++ autofdo::string_table *string_table_afdo = afdo_string_table; ++ autofdo::autofdo_source_profile *source_profile_afdo = afdo_source_profile; ++ ++ for (unsigned i = 0; i < EVENT_NUMBER; i++) ++ { ++ if (event_files[i] == NULL) ++ { ++ continue; ++ } ++ profile_type = (enum event_type) i; ++ dump_event (); ++ gcov_close (); ++ auto_profile_file = event_files[i]; ++ read_profile (); ++ gcov_close (); ++ ++ topn_function_total_count_thres = param_llc_allocate_func_counts_threshold; ++ if (param_llc_allocate_func_topn > 0 && profile_type == PMU_EVENT) ++ { ++ topn_function_total_count_thres ++ = afdo_source_profile->calc_topn_function_total_count_thres ( ++ param_llc_allocate_func_topn); ++ } ++ ++ process_extend_source_profile (); ++ ++ delete afdo_source_profile; ++ delete afdo_string_table; ++ } ++ ++ /* Restore AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE. Function ++ END_AUTO_PROFILE will free them at the end of compilation. */ ++ afdo_string_table = string_table_afdo; ++ afdo_source_profile = source_profile_afdo; ++ return true; ++} ++ ++/* Helper functions. 
*/
++
++gcov_type
++extend_auto_profile::get_loc_count (location_t loc, event_type type)
++{
++  event_loc_count_map::iterator event_iter = event_loc_map.find (type);
++  if (event_iter != event_loc_map.end ())
++    {
++      loc_count_map::iterator loc_iter = event_iter->second.find (loc);
++      if (loc_iter != event_iter->second.end ())
++	{
++	  return loc_iter->second;
++	}
++    }
++  return 0;
++}
++
++struct rank_info
++extend_auto_profile::get_func_rank (unsigned decl_uid, enum event_type type)
++{
++  struct rank_info info = {0, 0};
++  event_rank_map::iterator event_iter = func_rank.find (type);
++  if (event_iter != func_rank.end ())
++    {
++      rank_map::iterator func_iter = event_iter->second.find (decl_uid);
++      if (func_iter != event_iter->second.end ())
++	{
++	  info.rank = func_iter->second;
++	  info.total = event_iter->second.size ();
++	}
++    }
++  return info;
++}
++
++gcov_type
++extend_auto_profile::get_func_count (unsigned decl_uid, event_type type)
++{
++  event_func_count_map::iterator event_iter = event_func_map.find (type);
++  if (event_iter != event_func_map.end ())
++    {
++      func_count_map::iterator func_iter = event_iter->second.find (decl_uid);
++      if (func_iter != event_iter->second.end ())
++	{
++	  return func_iter->second;
++	}
++    }
++  return 0;
++}
++
++gcov_type
++extend_auto_profile::get_topn_function_total_count_thres () const
++{
++  return topn_function_total_count_thres;
++}
++
++static extend_auto_profile *extend_profile;
++
+ /* Helper functions.  */
+ 
+ /* Return the original name of NAME: strip the suffix that starts
+@@ -483,7 +737,7 @@ string_table::get_index (const char *name) const
+   return iter->second;
+ }
+ 
+-/* Return the index of a given function DECL. Return -1 if DECL is not
++/* Return the index of a given function DECL.  Return -1 if DECL is not
+    found in string table.  */
+ 
+ int
+@@ -917,6 +1171,31 @@ autofdo_source_profile::get_function_instance_by_inline_stack (
+   return s;
+ }
+ 
++/* Compute total count threshold of top functions in sampled data.  */
++
++gcov_type
++autofdo_source_profile::calc_topn_function_total_count_thres (
++  unsigned topn) const
++{
++  std::set<gcov_type> func_counts;
++  for (name_function_instance_map::const_iterator iter = map_.begin ();
++       iter != map_.end (); ++iter)
++    {
++      if (func_counts.size () < topn)
++	func_counts.insert (iter->second->total_count ());
++      else if (*func_counts.begin () < iter->second->total_count ())
++	{
++	  func_counts.erase (func_counts.begin ());
++	  func_counts.insert (iter->second->total_count ());
++	}
++    }
++
++  if (topn > 0 && func_counts.size () == topn
++      && param_llc_allocate_func_counts_threshold < *func_counts.begin ())
++    return *func_counts.begin ();
++  return param_llc_allocate_func_counts_threshold;
++}
++
+ /* Module profile is only used by LIPO.  Here we simply ignore it.  */
+ 
+ static void
+@@ -1842,6 +2121,132 @@ auto_profile (void)
+ 
+   return TODO_rebuild_cgraph_edges;
+ }
++
++
++void
++extend_auto_profile::rank_all_func ()
++{
++  std::vector<std::pair<unsigned, gcov_type> > func_sorted;
++  event_func_count_map::iterator event_iter
++    = event_func_map.find (profile_type);
++  if (event_iter != event_func_map.end ())
++    {
++      func_count_map::iterator func_iter;
++      for (func_iter = event_iter->second.begin ();
++	   func_iter != event_iter->second.end (); func_iter++)
++	{
++	  func_sorted.push_back (std::make_pair (func_iter->first,
++						 func_iter->second));
++	}
++
++      std::sort (func_sorted.begin (), func_sorted.end (), event_count_cmp);
++
++      for (unsigned i = 0; i < func_sorted.size (); ++i)
++	{
++	  func_rank[profile_type][func_sorted[i].first] = i + 1;
++	}
++    }
++}
++
++/* Iterate over stmts in cfun and record their counts in EVENT_LOC_MAP.  */
++
++void
++extend_auto_profile::set_loc_count ()
++{
++  basic_block bb;
++  FOR_EACH_BB_FN (bb, cfun)
++    {
++      gimple_stmt_iterator gsi;
++      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
++	{
++	  count_info info;
++	  gimple *stmt = gsi_stmt (gsi);
++	  if (gimple_clobber_p (stmt) || is_gimple_debug (stmt))
++	    {
++	      continue;
++	    }
++	  if (afdo_source_profile->get_count_info (stmt, &info))
++	    {
++	      location_t loc = gimple_location (stmt);
++	      event_loc_map[profile_type][loc] += info.count;
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "stmt ");
++		  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
++		  fprintf (dump_file, "counts %ld\n",
++			   event_loc_map[profile_type][loc]);
++		}
++	    }
++	}
++    }
++}
++
++/* Process data in afdo_source_profile and save it into two maps:
++   1. gimple_location to count.
++   2. function_index to count.  */
++void
++extend_auto_profile::process_extend_source_profile ()
++{
++  struct cgraph_node *node;
++  if (symtab->state == FINISHED)
++    {
++      return;
++    }
++  FOR_EACH_FUNCTION (node)
++    {
++      if (!gimple_has_body_p (node->decl) || node->inlined_to)
++	{
++	  continue;
++	}
++
++      /* Don't profile functions produced for builtin stuff.  */
++      if (DECL_SOURCE_LOCATION (node->decl) == BUILTINS_LOCATION)
++	{
++	  continue;
++	}
++
++      function *fn = DECL_STRUCT_FUNCTION (node->decl);
++      push_cfun (fn);
++
++      const function_instance *s
++	= afdo_source_profile->get_function_instance_by_decl (
++	  current_function_decl);
++
++      if (s == NULL)
++	{
++	  pop_cfun ();
++	  continue;
++	}
++      unsigned int decl_uid = DECL_UID (current_function_decl);
++      gcov_type count = s->total_count ();
++      if (dump_file)
++	{
++	  fprintf (dump_file, "Extend auto-profile for function %s.\n",
++		   node->dump_name ());
++	}
++      event_func_map[profile_type][decl_uid] += count;
++      set_loc_count ();
++      pop_cfun ();
++    }
++  rank_all_func ();
++}
++
++/* Main entry of extend_auto_profile.  */
++
++static void
++extend_source_profile ()
++{
++  extend_profile = autofdo::extend_auto_profile::create ();
++  if (dump_file)
++    {
++      if (extend_profile == NULL)
++	{
++	  fprintf (dump_file, "No profile file is found.\n");
++	  return;
++	}
++      fprintf (dump_file, "Extend profile info generated.\n");
++    }
++}
+ } /* namespace autofdo.  */
+ 
+ /* Read the profile from the profile data file.  */
+@@ -1870,6 +2275,48 @@ end_auto_profile (void)
+   profile_info = NULL;
+ }
+ 
++/* Extern functions to get profile info in other passes.  */
++
++bool
++profile_exist (enum event_type type)
++{
++  return autofdo::extend_profile != NULL
++	 && autofdo::extend_profile->auto_profile_exist (type);
++}
++
++gcov_type
++event_get_loc_count (location_t loc, event_type type)
++{
++  return autofdo::extend_profile->get_loc_count (loc, type);
++}
++
++gcov_type
++event_get_func_count (unsigned decl_uid, event_type type)
++{
++  return autofdo::extend_profile->get_func_count (decl_uid, type);
++}
++
++struct rank_info
++event_get_func_rank (unsigned decl_uid, enum event_type type)
++{
++  return autofdo::extend_profile->get_func_rank (decl_uid, type);
++}
++
++gcov_type
++event_get_topn_function_total_count_thres ()
++{
++  return autofdo::extend_profile->get_topn_function_total_count_thres ();
++}
++
++void
++free_extend_profile_info ()
++{
++  if (autofdo::extend_profile != NULL)
++    {
++      delete autofdo::extend_profile;
++    }
++}
++
+ /* Returns TRUE if EDGE is hot enough to be inlined early.  */
+ 
+ bool
+@@ -1931,8 +2378,50 @@ public:
+ 
+ } // anon namespace
+ 
++namespace
++{
++const pass_data pass_data_ipa_extend_auto_profile =
++{
++  SIMPLE_IPA_PASS, /* type */
++  "ex-afdo", /* name */
++  OPTGROUP_NONE, /* optinfo_flags */
++  TV_IPA_EXTEND_AUTO_PROFILE, /* tv_id */
++  0, /* properties_required */
++  0, /* properties_provided */
++  0, /* properties_destroyed */
++  0, /* todo_flags_start */
++  0, /* todo_flags_finish */
++};
++
++class pass_ipa_extend_auto_profile : public simple_ipa_opt_pass
++{
++public:
++  pass_ipa_extend_auto_profile (gcc::context *ctxt)
++    : simple_ipa_opt_pass (pass_data_ipa_extend_auto_profile, ctxt)
++  {}
++
++  /* opt_pass methods: */
++  virtual bool gate (function *) { return (flag_ipa_extend_auto_profile > 0); }
++  virtual unsigned int execute (function *);
++
++};
++
++unsigned int
++pass_ipa_extend_auto_profile::execute (function *fun)
++{
++  autofdo::extend_source_profile ();
++  return 0;
++}
++} // anon namespace
++
+ simple_ipa_opt_pass *
+ make_pass_ipa_auto_profile (gcc::context *ctxt)
+ {
+   return new pass_ipa_auto_profile (ctxt);
+ }
++
++simple_ipa_opt_pass *
++make_pass_ipa_extend_auto_profile (gcc::context *ctxt)
++{
++  return new pass_ipa_extend_auto_profile (ctxt);
++}
+diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h
+index bf3f90f2f..dea0b18e6 100644
+--- a/gcc/auto-profile.h
++++ b/gcc/auto-profile.h
+@@ -21,6 +21,14 @@ along with GCC; see the file COPYING3.  If not see
+ #ifndef AUTO_PROFILE_H
+ #define AUTO_PROFILE_H
+ 
++enum event_type
++{
++  INST_EXEC = 0,
++  CACHE_MISSES,
++  PMU_EVENT,
++  EVENT_NUMBER
++};
++
+ /* Read, process, finalize AutoFDO data structures.  */
+ extern void read_autofdo_file (void);
+ extern void end_auto_profile (void);
+@@ -28,4 +36,26 @@ extern void end_auto_profile (void);
+ /* Returns TRUE if EDGE is hot enough to be inlined early.  */
+ extern bool afdo_callsite_hot_enough_for_early_inline (struct cgraph_edge *);
+ 
++/* Check if profile exists before using this profile.  */
++extern bool profile_exist (enum event_type);
++
++/* Given func decl_uid or gimple location and event_type, return count.
++   Count is 0 if function or gimple is not sampled.  */
++extern gcov_type event_get_func_count (unsigned, enum event_type);
++extern gcov_type event_get_loc_count (location_t, enum event_type);
++extern gcov_type event_get_topn_function_total_count_thres ();
++
++struct rank_info
++{
++  unsigned total;
++  unsigned rank;
++};
++
++/* Given function decl_uid and event type, return rank_info.  Rank_info
++   is {0, 0} if function was not sampled.  */
++extern struct rank_info event_get_func_rank (unsigned, enum event_type);
++
++/* Free memory allocated by autofdo::extend_profile.  */
++extern void free_extend_profile_info ();
++
+ #endif /* AUTO_PROFILE_H */
+diff --git a/gcc/builtins.cc b/gcc/builtins.cc
+index 57929a42b..dc2e9c3f3 100644
+--- a/gcc/builtins.cc
++++ b/gcc/builtins.cc
+@@ -1352,6 +1352,85 @@ expand_builtin_prefetch (tree exp)
+   emit_insn (op0);
+ }
+ 
++/* Expand a call to __builtin_prefetch_full.  */
++
++static void
++expand_builtin_prefetch_full (tree exp)
++{
++  tree arg0, arg1, arg2;
++  int nargs;
++  rtx op0, op1, op2;
++
++  if (!validate_arglist (exp, POINTER_TYPE, 0))
++    return;
++
++  arg0 = CALL_EXPR_ARG (exp, 0);
++
++  /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to
++     zero (read) and argument 2 (locality) defaults to 3 (high degree of
++     locality).  */
++  nargs = call_expr_nargs (exp);
++  if (nargs > 1)
++    arg1 = CALL_EXPR_ARG (exp, 1);
++  else
++    arg1 = integer_zero_node;
++  if (nargs > 2)
++    arg2 = CALL_EXPR_ARG (exp, 2);
++  else
++    arg2 = integer_three_node;
++
++  /* Argument 0 is an address.  */
++  op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
++
++  /* Argument 1 (read/write flag) must be a compile-time constant int.  */
++  if (TREE_CODE (arg1) != INTEGER_CST)
++    {
++      error ("second argument to %<__builtin_prefetch_full%> must be a "
++	     "constant");
++      arg1 = integer_zero_node;
++    }
++  op1 = expand_normal (arg1);
++  /* Argument 1 must be either zero or one.  */
++  if (INTVAL (op1) != 0 && INTVAL (op1) != 1)
++    {
++      warning (0, "invalid second argument to %<__builtin_prefetch_full%>;"
++	       " using zero");
++      op1 = const0_rtx;
++    }
++
++  /* Argument 2 (locality) must be a compile-time constant int.  */
++  if (TREE_CODE (arg2) != INTEGER_CST)
++    {
++      error ("third argument to %<__builtin_prefetch_full%> must be a "
++	     "constant");
++      arg2 = integer_zero_node;
++    }
++  op2 = expand_normal (arg2);
++  /* Argument 2 must be 0-7.  */
++  if (INTVAL (op2) < 0 || INTVAL (op2) > 7)
++    {
++      warning (0, "invalid third argument to %<__builtin_prefetch_full%>; "
++	       "using zero");
++      op2 = const0_rtx;
++    }
++
++  if (targetm.have_prefetch_full ())
++    {
++      class expand_operand ops[3];
++
++      create_address_operand (&ops[0], op0);
++      create_integer_operand (&ops[1], INTVAL (op1));
++      create_integer_operand (&ops[2], INTVAL (op2));
++      if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops))
++	return;
++    }
++
++  /* Don't do anything with direct references to volatile memory, but
++     generate code to handle other side effects.  */
++  if (!MEM_P (op0) && side_effects_p (op0))
++    emit_insn (op0);
++}
++
+ /* Get a MEM rtx for expression EXP which is the address of an operand
+    to be used in a string instruction (cmpstrsi, cpymemsi, ..). 
LEN is + the maximum length of the block of memory that might be accessed or +@@ -7598,6 +7677,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + case BUILT_IN_PREFETCH: + expand_builtin_prefetch (exp); + return const0_rtx; ++ case BUILT_IN_PREFETCH_FULL: ++ expand_builtin_prefetch_full (exp); ++ return const0_rtx; + + case BUILT_IN_INIT_TRAMPOLINE: + return expand_builtin_init_trampoline (exp, true); +diff --git a/gcc/builtins.def b/gcc/builtins.def +index 005976f34..f2e0c357d 100644 +--- a/gcc/builtins.def ++++ b/gcc/builtins.def +@@ -924,6 +924,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C + DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) + DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) + DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) ++DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) + DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index d2714e20c..794bc3ecc 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -272,6 +272,9 @@ public: + the basic-block from being collected but its index can still be + reused. */ + basic_block former_header; ++ ++ /* Number of latch executions from vectorization. */ ++ tree vec_nb_iterations; + }; + + /* Set if the loop is known to be infinite. */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 6ab7ba4cc..e6ffa1c58 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1148,6 +1148,26 @@ Common Joined RejectNegative Var(auto_profile_file) + Use sample profile information for call graph node weights. The profile + file is specified in the argument. + ++fcache-misses-profile ++Common Var(flag_cache_misses_profile) ++Use sample profile information for source code cache miss count. The default ++profile file is cmsdata.gcov in `pwd`. ++ ++fcache-misses-profile= ++Common Joined RejectNegative Var(cache_misses_profile_file) ++Use sample profile information for source code cache miss count. The profile ++file is specified in the argument. ++ ++fadditional-profile ++Common Var(flag_additional_profile) ++Use additional PMU-event sample profile information for source code bb count. ++The default profile file is addldata.gcov in `pwd`. ++ ++fadditional-profile= ++Common Joined RejectNegative Var(additional_profile_file) ++Use additional PMU-event sample profile information for source code bb count. ++The profile file is specified in the argument. ++ + ; -fcheck-bounds causes gcc to generate array bounds checks. + ; For C, C++ and ObjC: defaults off. + ; For Java: defaults to on. +@@ -2074,6 +2094,10 @@ fipa-struct-sfc-shadow + Common Var(flag_ipa_struct_sfc_shadow) Init(0) Optimization + Enable field shadowing optimization in static struct field compression. + ++fipa-extend-auto-profile ++Common Var(flag_ipa_extend_auto_profile) ++Use sample profile information for source code. ++ + fipa-vrp + Common Var(flag_ipa_vrp) Optimization + Perform IPA Value Range Propagation. 
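As a usage sketch for the __builtin_prefetch_full builtin registered in builtins.def above (illustrative only, not part of the patch; the names `buf' and `walk' and the prefetch distance 64 are invented here):

    /* Illustrative only.  Argument 1 is the read/write flag (0 or 1) and
       argument 2 is the prfop level 0-7; the aarch64.md table later in
       this patch maps levels 0-7 to PLDL1KEEP..PLDL4STRM for reads and
       PSTL1KEEP..PSTL4STRM for writes, so (0, 4) requests PLDL3KEEP.  */
    extern double buf[];

    void
    walk (long n)
    {
      for (long i = 0; i < n; i++)
        {
          /* Hint that buf[i + 64] should be kept in L3 (rw = 0, level = 4).  */
          __builtin_prefetch_full (&buf[i + 64], 0, 4);
          buf[i] *= 2.0;
        }
    }

Both the rw flag and the level must be compile-time constants, as enforced by expand_builtin_prefetch_full above.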
+@@ -2424,6 +2448,10 @@ fipa-prefetch
+ Common Var(flag_ipa_prefetch) Init(0) Optimization
+ Generate prefetch instructions, if available, using IPA info.
+ 
++fllc-allocate
++Common Var(flag_llc_allocate) Init(-1) Optimization
++Generate LLC hint instructions.
++
+ fprofile
+ Common Var(profile_flag)
+ Enable basic program profiling code.
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index cbb844fbc..af0881f7a 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -702,12 +702,16 @@ extern struct tune_params aarch64_tune_params;
+   T (PLDL2STRM, pldl2strm, 3) \
+   T (PLDL3KEEP, pldl3keep, 4) \
+   T (PLDL3STRM, pldl3strm, 5) \
++  T (PLDL4KEEP, pldl4keep, 6) \
++  T (PLDL4STRM, pldl4strm, 7) \
+   T (PSTL1KEEP, pstl1keep, 8) \
+   T (PSTL1STRM, pstl1strm, 9) \
+   T (PSTL2KEEP, pstl2keep, 10) \
+   T (PSTL2STRM, pstl2strm, 11) \
+   T (PSTL3KEEP, pstl3keep, 12) \
+-  T (PSTL3STRM, pstl3strm, 13)
++  T (PSTL3STRM, pstl3strm, 13) \
++  T (PSTL4KEEP, pstl4keep, 14) \
++  T (PSTL4STRM, pstl4strm, 15)
+ 
+ #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE,
+ enum aarch64_svpattern {
+diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
+index a8a5dc3a2..7808abf70 100644
+--- a/gcc/config/aarch64/aarch64-sve.md
++++ b/gcc/config/aarch64/aarch64-sve.md
+@@ -1952,7 +1952,7 @@
+ (define_insn "@aarch64_sve_prefetch<mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:<VPRED> 0 "register_operand" "Upl")
+-		(match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP")
++		(match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP")
+ 		(match_operand:DI 2 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH)
+ 	     (match_operand:DI 3 "const_int_operand")
+@@ -1985,14 +1985,14 @@
+ ;; 6: the prefetch operator (an svprfop)
+ ;; 7: the normal RTL prefetch rw flag
+ ;; 8: the normal RTL prefetch locality value
+-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx4SI_ONLY:mode>"
++(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx4SI_ONLY:mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+-		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg, rk, rk, rk, rk")
++		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg, rk, rk, rk, rk")
+ 		(match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w")
+ 		(match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2000,12 +2000,12 @@
+   "TARGET_SVE && TARGET_NON_STREAMING"
+ {
+     static const char *const insns[][2] = {
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.s]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.s, #%1]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.s]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.s, #%1]",
+       "prfb", "%0, [%1, %2.s, sxtw]",
+       "prfb", "%0, [%1, %2.s, uxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2014,14 +2014,14 @@
+ 
+ ;; Predicated gather prefetches for 64-bit elements.  The value of operand 3
+ ;; doesn't matter in this case.
+-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>"
++(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl")
+-		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg, rk, rk")
++		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg, rk, rk")
+ 		(match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w")
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2029,10 +2029,10 @@
+   "TARGET_SVE && TARGET_NON_STREAMING"
+ {
+     static const char *const insns[][2] = {
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.d]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.d, #%1]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.d]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.d, #%1]",
+       "prfb", "%0, [%1, %2.d]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, lsl %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, lsl %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2040,7 +2040,7 @@
+ )
+ 
+ ;; Likewise, but with the offset being sign-extended from 32 bits.
+-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_sxtw"
++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_sxtw"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
+ 		(match_operand:DI 1 "register_operand" "rk, rk")
+@@ -2051,8 +2051,8 @@
+ 		      (match_operand:VNx2DI 2 "register_operand" "w, w")))]
+ 		  UNSPEC_PRED_X)
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2061,7 +2061,7 @@
+ {
+     static const char *const insns[][2] = {
+       "prfb", "%0, [%1, %2.d, sxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2073,7 +2073,7 @@
+ )
+ 
+ ;; Likewise, but with the offset being zero-extended from 32 bits.
+-(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_uxtw"
++(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_uxtw"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
+ 		(match_operand:DI 1 "register_operand" "rk, rk")
+@@ -2081,8 +2081,8 @@
+ 		  (match_operand:VNx2DI 2 "register_operand" "w, w")
+ 		  (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate"))
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2091,7 +2091,7 @@
+ {
+     static const char *const insns[][2] = {
+       "prfb", "%0, [%1, %2.d, uxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index e9c387b24..a06c2c515 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -4408,6 +4408,13 @@ aarch64_sve_data_mode_p (machine_mode mode)
+   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
+ }
+ 
++/* Return true if MODE is a full SVE data vector mode.  */
++static bool
++aarch64_full_sve_data_mode_p (machine_mode mode)
++{
++  return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA;
++}
++
+ /* Return the number of defined bytes in one constituent vector of
+    SVE mode MODE, which has vector flags VEC_FLAGS.  */
+ static poly_int64
+@@ -31796,6 +31803,17 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_ASM_FUNCTION_EPILOGUE
+ #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
+ 
++#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH
++#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch
++
++#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
++#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \
++  code_for_aarch64_sve_gather_prefetch
++
++#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
++#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \
++  aarch64_full_sve_data_mode_p
++
+ #undef TARGET_HAVE_SHADOW_CALL_STACK
+ #define TARGET_HAVE_SHADOW_CALL_STACK true
+ 
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index 2f46bc793..69d296556 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -925,6 +925,45 @@
+   [(set_attr "type" "load_4")]
+ )
+ 
++(define_insn "prefetch_full"
++  [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp")
++		  (match_operand:QI 1 "const_int_operand" "")
++		  (match_operand:QI 2 "const_int_operand" ""))]
++  ""
++  {
++    const char * pftype[2][8] =
++    {
++      {"prfm\\tPLDL1KEEP, %0",
++       "prfm\\tPLDL1STRM, %0",
++       "prfm\\tPLDL2KEEP, %0",
++       "prfm\\tPLDL2STRM, %0",
++       "prfm\\tPLDL3KEEP, %0",
++       "prfm\\tPLDL3STRM, %0",
++       "prfm\\tPLDL4KEEP, %0",
++       "prfm\\tPLDL4STRM, %0"},
++      {"prfm\\tPSTL1KEEP, %0",
++       "prfm\\tPSTL1STRM, %0",
++       "prfm\\tPSTL2KEEP, %0",
++       "prfm\\tPSTL2STRM, %0",
++       "prfm\\tPSTL3KEEP, %0",
++       "prfm\\tPSTL3STRM, %0",
++       "prfm\\tPSTL4KEEP, %0",
++       "prfm\\tPSTL4STRM, %0"},
++    };
++
++    int prfop = INTVAL (operands[2]);
++
++    gcc_assert (IN_RANGE (prfop, 0, 7));
++
++    /* PRFM accepts the same addresses as a 64-bit LDR so wrap
++       the address into a DImode MEM so that aarch64_print_operand knows
++       how to print it.  */
++    operands[0] = gen_rtx_MEM (DImode, operands[0]);
++    return pftype[INTVAL (operands[1])][prfop];
++  }
++  [(set_attr "type" "load_4")]
++)
++
+ (define_insn "trap"
+   [(trap_if (const_int 1) (const_int 8))]
+   ""
+diff --git a/gcc/dce.cc b/gcc/dce.cc
+index 6676cbcd4..964a0a6d0 100644
+--- a/gcc/dce.cc
++++ b/gcc/dce.cc
+@@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body)
+   switch (GET_CODE (body))
+     {
+     case PREFETCH:
++    case PREFETCH_FULL:
+     case TRAP_IF:
+       /* The UNSPEC case was added here because the ia-64 claims that
+ 	 USEs do not work after reload and generates UNSPECS rather
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index 50bbbbc42..16ada7aae 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -6278,6 +6278,25 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
+ stores.
+ @end deftypefn
+ 
++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg})
++This hook should return the instruction code of the target pattern that
++implements a predicated (masked) vector prefetch for mode @var{arg}.
++The hook is @code{NULL} if no such pattern is provided.
++@end deftypefn
++
++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_from})
++This hook should return the instruction code of the target pattern that
++implements a predicated gather prefetch for data mode @var{mode_to} and
++offset vector mode @var{mode_from}.  The hook is @code{NULL} if no such
++pattern is provided.
++@end deftypefn
++
++@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg})
++This hook should return true if vector prefetches can be generated for
++mode @var{arg}; on AArch64 this means @var{arg} is a full SVE data
++vector mode.
++@end deftypefn
++
+ @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int})
+ This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
+ fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index cfda60304..88db8752e 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -4190,6 +4190,12 @@ address; but often a machine-dependent strategy can generate better code.
+ 
+ @hook TARGET_VECTORIZE_BUILTIN_SCATTER
+ 
++@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH
++
++@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
++
++@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
++
+ @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
+ 
+ @hook TARGET_SIMD_CLONE_ADJUST
+diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
+index 8b1733e20..19811106f 100644
+--- a/gcc/internal-fn.cc
++++ b/gcc/internal-fn.cc
+@@ -107,11 +107,13 @@ init_internal_fns ()
+    direct_internal_fn. 
*/ + #define not_direct { -2, -2, false } + #define mask_load_direct { -1, 2, false } ++#define mask_prefetch_direct { -1, 2, false } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { 3, 1, false } + #define len_load_direct { -1, -1, false } + #define mask_store_direct { 3, 2, false } ++#define gather_prefetch_direct { 3, 1, false } + #define store_lanes_direct { 0, 0, false } + #define mask_store_lanes_direct { 0, 0, false } + #define vec_cond_mask_direct { 1, 0, false } +@@ -2745,6 +2747,53 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + #define expand_len_load_optab_fn expand_partial_load_optab_fn + ++/* Expand MASK_PREFETCH call STMT using optab OPTAB. ++ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); ++ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); ++*/ ++ ++static void ++expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ tree base = gimple_call_arg (stmt, 0); ++ if (base == NULL_TREE) ++ return; ++ ++ tree maskt = gimple_call_arg (stmt, 2); ++ tree target = gimple_call_arg (stmt, 3); ++ tree prfop = gimple_call_arg (stmt, 4); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); ++ ++ rtx mask = expand_normal (maskt); ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ ++ unsigned i = 0; ++ class expand_operand ops[5]; ++ create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB. */ + + static void +@@ -3402,6 +3451,70 @@ contains_call_div_mod (rtx_insn *insn) + return false; + } + ++/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. ++ vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87); ++ .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4); ++*/ ++ ++static void ++expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_gather_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ /* Extracting tree nodes, only expand for scalar base and vector index. 
*/ ++ tree base = gimple_call_arg (stmt, 0); ++ if (VECTOR_TYPE_P (TREE_TYPE (base))) ++ return; ++ tree offset = gimple_call_arg (stmt, 1); ++ if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) ++ return; ++ ++ tree scale = gimple_call_arg (stmt, 2); ++ tree mask = gimple_call_arg (stmt, 4); ++ tree target = gimple_call_arg (stmt, 5); ++ tree prfop = gimple_call_arg (stmt, 6); ++ ++ /* Convert to the rtx node. */ ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ rtx offset_rtx = expand_normal (offset); ++ rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); ++ rtx mask_rtx = expand_normal (mask); ++ HOST_WIDE_INT scale_int = tree_to_shwi (scale); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ /* add operand. */ ++ unsigned int i = 0; ++ class expand_operand ops[9]; ++ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); ++ /* Check whether the index has unsigned. */ ++ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); ++ create_integer_operand (&ops[i++], scale_int); ++ create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ ++ machine_mode reg_mode = GET_MODE (offset_rtx); ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_gather_prefetch ++ (m_mode, reg_mode); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand DIVMOD() using: + a) optab handler for udivmod/sdivmod if it is available. + b) If optab_handler doesn't exist, generate call to +@@ -3767,10 +3880,12 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, + #define direct_cond_binary_optab_supported_p direct_optab_supported_p + #define direct_cond_ternary_optab_supported_p direct_optab_supported_p + #define direct_mask_load_optab_supported_p convert_optab_supported_p ++#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p + #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_gather_load_optab_supported_p convert_optab_supported_p + #define direct_len_load_optab_supported_p direct_optab_supported_p ++#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p + #define direct_mask_store_optab_supported_p convert_optab_supported_p + #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p +diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def +index d2d550d35..05fc50328 100644 +--- a/gcc/internal-fn.def ++++ b/gcc/internal-fn.def +@@ -121,6 +121,8 @@ along with GCC; see the file COPYING3. 
If not see + #endif + + DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) ++DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ maskprefetch, mask_prefetch) + DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) + DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + vec_mask_load_lanes, mask_load_lanes) +@@ -128,6 +130,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) + DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, + mask_gather_load, gather_load) ++DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ mask_gather_prefetch, gather_prefetch) + + DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load) + +diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc +index 2642df91e..222fe6465 100644 +--- a/gcc/ipa-pure-const.cc ++++ b/gcc/ipa-pure-const.cc +@@ -534,6 +534,7 @@ builtin_safe_for_const_function_p (bool *looping, tree callee) + *looping = false; + return true; + case BUILT_IN_PREFETCH: ++ case BUILT_IN_PREFETCH_FULL: + *looping = true; + return true; + default: +diff --git a/gcc/optabs.def b/gcc/optabs.def +index dbf529434..8ca25a5cc 100644 +--- a/gcc/optabs.def ++++ b/gcc/optabs.def +@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") + OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") + OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") + OPTAB_CD(maskload_optab, "maskload$a$b") ++OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") + OPTAB_CD(maskstore_optab, "maskstore$a$b") + OPTAB_CD(gather_load_optab, "gather_load$a$b") + OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") ++OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") + OPTAB_CD(scatter_store_optab, "scatter_store$a$b") + OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") + OPTAB_CD(vec_extract_optab, "vec_extract$a$b") +diff --git a/gcc/opts.cc b/gcc/opts.cc +index 2433ace06..432b822e8 100644 +--- a/gcc/opts.cc ++++ b/gcc/opts.cc +@@ -2108,6 +2108,13 @@ enable_fdo_optimizations (struct gcc_options *opts, + SET_OPTION_IF_UNSET (opts, opts_set, flag_tree_loop_distribution, value); + } + ++static void ++set_cache_misses_profile_params (struct gcc_options *opts, ++ struct gcc_options *opts_set) ++{ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); ++} ++ + /* Enable cfgo-related flags. */ + + static void +@@ -3143,10 +3150,20 @@ common_handle_option (struct gcc_options *opts, + /* FALLTHRU */ + case OPT_fauto_profile: + enable_fdo_optimizations (opts, opts_set, value); +- /* 2 is special and means flag_profile_correction trun on by +- -fauto-profile. */ ++ /* 2 is special and means flag_profile_correction trun on by ++ -fauto-profile. */ + SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, +- (value ? 2 : 0)); ++ (value ? 2 : 0)); ++ break; ++ ++ case OPT_fadditional_profile_: ++ opts->x_additional_profile_file = xstrdup (arg); ++ opts->x_flag_additional_profile = true; ++ value = true; ++ /* No break here - do -fadditional-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fadditional_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; + break; + + case OPT_fipa_struct_reorg_: +@@ -3155,17 +3172,36 @@ common_handle_option (struct gcc_options *opts, + case OPT_fipa_struct_reorg: + opts->x_flag_ipa_struct_reorg = value; + if (value && !opts->x_struct_layout_optimize_level) +- { +- /* Using the -fipa-struct-reorg option is equivalent to using +- -fipa-struct-reorg=1. 
*/ +- opts->x_struct_layout_optimize_level = 1; +- } ++ { ++ /* Using the -fipa-struct-reorg option is equivalent to using ++ -fipa-struct-reorg=1. */ ++ opts->x_struct_layout_optimize_level = 1; ++ } + break; + + case OPT_fipa_reorder_fields: + SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_reorg, value); + break; + ++ case OPT_fipa_extend_auto_profile: ++ opts->x_flag_ipa_extend_auto_profile = opts->x_flag_cache_misses_profile ++ ? true : value; ++ break; ++ ++ case OPT_fcache_misses_profile_: ++ opts->x_cache_misses_profile_file = xstrdup (arg); ++ opts->x_flag_cache_misses_profile = true; ++ value = true; ++ /* No break here - do -fcache-misses-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fcache_misses_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; ++ if (value) ++ { ++ set_cache_misses_profile_params (opts, opts_set); ++ } ++ break; ++ + case OPT_fcfgo_profile_generate_: + opts->x_profile_data_prefix = xstrdup (arg); + value = true; +diff --git a/gcc/params.opt b/gcc/params.opt +index e5472dfc8..e06e50611 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1262,4 +1262,66 @@ Range for depended ldp search in split-ldp-stp path. + Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization + Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . + ++-param=mem-access-ratio= ++Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization ++Memory access ratio (in percent). ++ ++-param=mem-access-num= ++Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization ++Memory access num. ++ ++-param=prefetch-offset= ++Common Joined UInteger Var(param_prefetch_offset) Init(1024) ++IntegerRange(1, 999999) Param Optimization ++Prefetch Offset, which is usually a power of two due to cache line size. ++ ++-param=branch-prob-threshold= ++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) ++Param Optimization ++High Execution Rate Branch Threshold. ++ ++-param=issue-topn= ++Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization ++Issue topn LLC mem_ref hint. ++ ++-param=force-issue= ++Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. ++ ++-param=llc-capacity-per-core= ++Common Joined UInteger Var(param_llc_capacity_per_core) Init(107) IntegerRange(0, 999999) Param ++LLC capacity per core. ++ ++-param=filter-kernels= ++Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param ++Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks ++through edges with branch probability no less than param_branch_prob_threshold. ++ ++-param=outer-loop-nums= ++Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param ++Maximum number of outer loops allowed to extend outer loops for loops that ++cannot recognize inner loop boundaries. ++ ++-param=llc-level= ++Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) ++Param Optimization ++Specifies the HBM cache level. ++ ++-param=filter-mode= ++Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param ++Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. 
++
++-param=transfer-footprint=
++Common Joined UInteger Var(param_transfer_footprint) Init(1) IntegerRange(0, 1) Param
++Allow transferring the first calculated footprint expression to a target memory
++reference from which the footprint cannot be retrieved.
++
++-param=llc-allocate-func-topn=
++Common Joined UInteger Var(param_llc_allocate_func_topn) Init(0) Param Optimization
++Number of top functions by PMU counts to be analyzed in LLC allocation.
++
++-param=llc-allocate-func-counts-threshold=
++Common Joined UInteger Var(param_llc_allocate_func_counts_threshold) Init(1) Param Optimization
++Threshold of per-function PMU counts to be analyzed in LLC allocation.
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/passes.def b/gcc/passes.def
+index 90643d533..49001adde 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -141,6 +141,7 @@ along with GCC; see the file COPYING3.  If not see
+ 
+   NEXT_PASS (pass_target_clone);
+   NEXT_PASS (pass_ipa_auto_profile);
++  NEXT_PASS (pass_ipa_extend_auto_profile);
+   NEXT_PASS (pass_ipa_tree_profile);
+   PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile)
+       NEXT_PASS (pass_feedback_split_functions);
+@@ -325,6 +326,7 @@ along with GCC; see the file COPYING3.  If not see
+       /* Run IVOPTs after the last pass that uses data-reference analysis
+ 	 as that doesn't handle TARGET_MEM_REFs.  */
+       NEXT_PASS (pass_iv_optimize);
++      NEXT_PASS (pass_llc_allocate);
+       NEXT_PASS (pass_lim);
+       NEXT_PASS (pass_tree_loop_done);
+   POP_INSERT_PASSES ()
+diff --git a/gcc/print-rtl.cc b/gcc/print-rtl.cc
+index 636113d5b..b7506514a 100644
+--- a/gcc/print-rtl.cc
++++ b/gcc/print-rtl.cc
+@@ -1579,6 +1579,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose)
+       op[1] = XEXP (x, 1);
+       op[2] = XEXP (x, 2);
+       break;
++    case PREFETCH_FULL:
++      fun = "prefetch_full";
++      op[0] = XEXP (x, 0);
++      op[1] = XEXP (x, 1);
++      op[2] = XEXP (x, 2);
++      break;
+     case UNSPEC:
+     case UNSPEC_VOLATILE:
+       {
+diff --git a/gcc/rtl.def b/gcc/rtl.def
+index 08e31fa35..78ec1a021 100644
+--- a/gcc/rtl.def
++++ b/gcc/rtl.def
+@@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA)
+    whose prefetch instructions do not support them.  */
+ DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA)
+ 
++/* Memory prefetch, with attributes supported on some targets.
++   Operand 1 is the address of the memory to fetch.
++   Operand 2 is 1 for a write access, 0 otherwise.
++   Operand 3 is the prfop level (0-7).
++
++   The attributes specified by operands 2 and 3 are ignored for targets
++   whose prefetch instructions do not support them.  */
++DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA)
++
+ /* ----------------------------------------------------------------------
+    At the top level of an instruction (perhaps under PARALLEL).
+    ---------------------------------------------------------------------- */
+diff --git a/gcc/rtl.h b/gcc/rtl.h
+index a0db225cb..844e1a7c3 100644
+--- a/gcc/rtl.h
++++ b/gcc/rtl.h
+@@ -2814,6 +2814,10 @@ do { \
+ #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \
+   (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil)
+ 
++/* True if RTX is flagged to be a scheduling barrier.  */
++#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \
++  (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL)->volatil)
++
+ /* Indicate whether the machine has any sort of auto increment addressing.
+    If not, we can avoid checking for REG_INC notes.  */
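To make the interaction of --param llc-allocate-func-topn and --param llc-allocate-func-counts-threshold (added to params.opt above) concrete, here is a standalone sketch of the cutoff computation performed by calc_topn_function_total_count_thres earlier in this patch. The sketch is illustrative only: the function name is invented, and it uses a std::multiset so duplicate totals are kept, whereas the patch itself uses std::set.

    #include <cstdint>
    #include <set>
    #include <vector>

    /* Keep the TOPN largest per-function sample totals; the smallest of
       them becomes the cutoff, unless fewer than TOPN functions were
       sampled or the cutoff does not exceed the fixed threshold.  */
    int64_t
    topn_cutoff (const std::vector<int64_t> &counts, unsigned topn,
                 int64_t fixed_threshold)
    {
      std::multiset<int64_t> best;
      for (int64_t c : counts)
        {
          if (best.size () < topn)
            best.insert (c);
          else if (!best.empty () && *best.begin () < c)
            {
              best.erase (best.begin ());  /* Drop the smallest of the N.  */
              best.insert (c);
            }
        }
      if (!best.empty () && best.size () == topn
          && fixed_threshold < *best.begin ())
        return *best.begin ();
      return fixed_threshold;
    }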
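Since the rtl.def entry above only names the prfop operand, here is a short restatement (matching the comments in expand_mask_prefetch_optab_fn and expand_gather_prefetch_optab_fn earlier in this patch, not new logic) of how an svprfop-style encoding splits into the RTL access and locality attributes:

    #include <stdio.h>

    int
    main (void)
    {
      /* Bit 3 of the prfop selects stores over loads; bits 1 and 2 hold
         the 0-based svprfop locality, which RTL PREFETCH expresses
         1-based.  E.g. PLDL1KEEP (0) -> load, locality 1; PSTL3STRM (13)
         -> store, locality 3.  */
      for (int prfop = 0; prfop <= 13; prfop++)
        {
          int is_store = (prfop & 8) != 0;
          int locality = ((prfop >> 1) & 3) + 1;
          printf ("prfop %2d -> %s, locality %d\n",
                  prfop, is_store ? "store" : "load", locality);
        }
      return 0;
    }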
*/
+
+diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
+index c436c640c..7f5646ce7 100644
+--- a/gcc/rtlanal.cc
++++ b/gcc/rtlanal.cc
+@@ -1198,6 +1198,7 @@ reg_referenced_p (const_rtx x, const_rtx body)
+      return reg_overlap_mentioned_p (x, TRAP_CONDITION (body));
+
+    case PREFETCH:
++    case PREFETCH_FULL:
+      return reg_overlap_mentioned_p (x, XEXP (body, 0));
+
+    case UNSPEC:
+@@ -2042,6 +2043,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data)
+      return;
+
+    case PREFETCH:
++    case PREFETCH_FULL:
+      (*fun) (&XEXP (body, 0), data);
+      return;
+
+diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
+index 948aa0c3b..db453fb9b 100644
+--- a/gcc/sched-deps.cc
++++ b/gcc/sched-deps.cc
+@@ -2705,7 +2705,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn)
+      break;
+
+    case PREFETCH:
+-      if (PREFETCH_SCHEDULE_BARRIER_P (x))
++    case PREFETCH_FULL:
++      if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x))
++	  || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x)))
+	reg_pending_barrier = TRUE_BARRIER;
+      /* Prefetch insn contains addresses only.  So if the prefetch
+	 address has no registers, there will be no dependencies on
+diff --git a/gcc/target-insns.def b/gcc/target-insns.def
+index de8c0092f..9cfa19475 100644
+--- a/gcc/target-insns.def
++++ b/gcc/target-insns.def
+@@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
+ DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
+ DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
+ DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
++DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2))
+ DEF_TARGET_INSN (probe_stack, (rtx x0))
+ DEF_TARGET_INSN (probe_stack_address, (rtx x0))
+ DEF_TARGET_INSN (prologue, (void))
+diff --git a/gcc/target.def b/gcc/target.def
+index 142858fa3..646489540 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2064,6 +2064,37 @@ it is for the vector version.",
+ (vec_info *vinfo, bool costing_for_scalar),
+ default_vectorize_create_costs)
+
++/* Function for vector prefetch operation.  */
++DEFHOOK
++(code_for_prefetch,
++ "This hook should return the @code{insn_code} of the target instruction\n\
++pattern that implements a contiguous prefetch for data of mode @var{arg}.\n\
++It is consulted when LLC prefetch hints are issued.  A target that\n\
++provides no such pattern should leave the hook undefined; its default\n\
++is @code{NULL}.",
++ insn_code, (machine_mode arg),
++ NULL)
++
++/* Function for vector gather prefetch operation.  */
++DEFHOOK
++(code_for_gather_prefetch,
++ "This hook should return the @code{insn_code} of the target instruction\n\
++pattern that implements a gather prefetch, prefetching elements of mode\n\
++@var{mode_to} addressed by offset vectors of mode @var{mode_form}.  A\n\
++target that provides no such pattern should leave the hook undefined;\n\
++its default is @code{NULL}.",
++ insn_code, (machine_mode mode_to, machine_mode mode_form),
++ NULL)
++
++/* Function to check whether the target hardware architecture supports
++   a full SVE data vector mode.
*/ ++DEFHOOK ++(prefetch_handleable_mode_p, ++ "This hook should return true if the target hardware architecture\n\ ++supports a full SVE data vector mode.", ++ bool, (machine_mode arg), ++ NULL) ++ + HOOK_VECTOR_END (vectorize) + + #undef HOOK_PREFIX +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..1793ba9d1 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 1997-2022 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib g++-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +new file mode 100644 +index 000000000..b5bf69510 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +@@ -0,0 +1,52 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */ ++#include "multidim_array.h" ++ ++class Input ++{ ++ public: ++ int metadata_offset = 13; ++ int exp_nr_images = 1; ++ MultidimArray exp_Mweight; ++ void convertAllSquaredDifferencesToWeights(); ++}; ++ ++int main() ++{ ++ clock_t start = clock(); ++ Input input; ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; ++i) ++ { ++ input.convertAllSquaredDifferencesToWeights(); ++ } ++ return 0; ++} ++ ++void Input::convertAllSquaredDifferencesToWeights() ++{ ++ for (int img_id = 0; img_id < exp_nr_images; img_id++) ++ { ++ int my_metadata_offset = metadata_offset + img_id; ++ MultidimArray sorted_weight; ++ ++ exp_Mweight.getRow(img_id, sorted_weight); ++ long int np = 0; ++ FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight) ++ { ++ if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.) 
++ { ++ DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \ ++ sorted_weight, n); ++ np++; ++ } ++ } ++ } ++} ++ ++ ++ ++/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +new file mode 100644 +index 000000000..682f24703 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +@@ -0,0 +1,186 @@ ++#ifndef MULTIDIM_ARRAY_H ++#define MULTIDIM_ARRAY_H ++ ++#include ++ ++#define RELION_ALIGNED_MALLOC malloc ++#define RELION_ALIGNED_FREE free ++ ++#define STARTINGX(v) ((v).xinit) ++#define STARTINGY(v) ((v).yinit) ++#define NZYXSIZE(v) ((v).nzyxdim) ++ ++#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)]) ++#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \ ++ for (long int n=0; n ++class MultidimArray ++{ ++public: ++ T* data; ++ bool destroyData; ++ long int ndim; ++ long int zdim; ++ long int ydim; ++ long int xdim; ++ long int yxdim; ++ long int zyxdim; ++ long int nzyxdim; ++ long int zinit; ++ long int yinit; ++ long int xinit; ++ long int nzyxdimAlloc; ++ ++public: ++ void clear() ++ { ++ coreDeallocate(); ++ coreInit(); ++ } ++ ++ void coreInit() ++ { ++ xdim=0; ++ yxdim=0; ++ zyxdim=0; ++ nzyxdim=0; ++ ydim=1; ++ zdim=1; ++ ndim=1; ++ zinit=0; ++ yinit=0; ++ xinit=0; ++ data=NULL; ++ nzyxdimAlloc = 0; ++ destroyData=true; ++ } ++ ++ void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim) ++ { ++ if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0) ++ { ++ clear(); ++ return; ++ } ++ ++ ndim=_ndim; ++ zdim=_zdim; ++ ydim=_ydim; ++ xdim=_xdim; ++ yxdim=ydim*xdim; ++ zyxdim=zdim*yxdim; ++ nzyxdim=ndim*zyxdim; ++ ++ coreAllocate(); ++ } ++ ++ void coreAllocate() ++ { ++ data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim); ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void coreDeallocate() ++ { ++ if (data != NULL && destroyData) ++ { ++ RELION_ALIGNED_FREE(data); ++ } ++ data=NULL; ++ nzyxdimAlloc = 0; ++ } ++ ++ void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim) ++ { ++ if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL) ++ { ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ return; ++ } ++ ++ if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0) ++ { ++ clear(); ++ return; ++ } ++ ++ if (NZYXSIZE(*this) > 0 && data == NULL) ++ { ++ coreAllocate(); ++ return; ++ } ++ ++ size_t YXdim=Ydim*Xdim; ++ size_t ZYXdim=Zdim*YXdim; ++ size_t NZYXdim=Ndim*ZYXdim; ++ ++ T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim); ++ for (long int l = 0; l < Ndim; l++) ++ for (long int k = 0; k < Zdim; k++) ++ for (long int i = 0; i < Ydim; i++) ++ for (long int j = 0; j < Xdim; j++) ++ { ++ T val; ++ new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val; ++ } ++ coreDeallocate(); ++ ++ data = new_data; ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void resize(long int Xdim) ++ { ++ resize(1, 1, 1, Xdim); ++ } ++ ++ inline T& operator()(long int i, long int j) const ++ { ++ return A2D_ELEM(*this, i, j); ++ } ++ ++ inline T& operator()(long int i) const ++ { ++ return 
A1D_ELEM(*this, i); ++ } ++ ++ void getRow(long int i, MultidimArray& v) const ++ { ++ if (xdim == 0 || ydim == 0) ++ { ++ v.clear(); ++ return; ++ } ++ ++ v.resize(xdim); ++ for (long int j = 0; j < xdim; j++) ++ v(j) = (*this)(i, j); ++ } ++}; ++ ++#endif /* MULTIDIM_ARRAY_H */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +new file mode 100644 +index 000000000..091e654f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2 --param branch-prob-threshold=50 --param filter-mode=0" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 100000 ++ ++int A_i[N]; ++int A_j[N]; ++double A_data[N]; ++double x_data[N]; ++double y_data[N]; ++int num_rows = N; ++ ++void ++MatMult (int *A_i, int *A_j, double *A_data, double *x_data, ++ int num_rows, double *y_data) ++{ ++ int i = 0; ++ int j = 0; ++ double temp = 0; ++ for (i = 0; i < num_rows; i++) ++ { ++ temp = y_data[i]; ++ for (j = A_i[i]; j < A_i[i+1]; j++) ++ temp += A_data[j] * x_data[A_j[j]]; ++ y_data[i] = temp; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; i++) ++ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tx_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_j\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..05a3bf842 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3.  If not see
++# <http://www.gnu.org/licenses/>.
++
++load_lib gcc-dg.exp
++load_lib target-supports.exp
++
++# Initialize `dg'.
++dg-init
++
++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
++	"" "-fllc-allocate"
++
++# All done.
++dg-finish
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+new file mode 100644
+index 000000000..113acbceb
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+@@ -0,0 +1,36 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c --param=mem-access-ratio=1 --param=mem-access-num=0" } */
++
++/* In this DejaGnu test case, we test how Phases 2 and 3 of the llc-allocate
++   pass deal with an indirect memory access in a nested loop where the
++   use-block of the induction variable of this memory access is a
++   child/descendant of its def-block (we arrange this by defining the
++   induction variable in the outer loop).  Therefore, the reference can be
++   successfully traced after outer-loop analysis.  */
++#include <stdlib.h>
++#include <time.h>
++
++void cross_bb_indir_mem_acc (int *arr1, int *arr2, int *arr3, int *arr4, int n) {
++  srand (time (NULL));
++
++  int j_s;
++  int j_e = arr1[0];
++  int k;
++
++  for (int i = 0; i < n; i++)
++    {
++      j_s = j_e;
++      j_e = arr1[i + 1];
++
++      k = arr3[i];
++
++      for (int j = j_s; j < j_e; j++)
++	{
++	  arr4[j] -= arr2[k];
++	}
++
++    }
++}
++
++/* { dg-final { scan-tree-dump "Unhandled indirect memory access tracing."
"llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "Retrace indirect memory access after outer loop analysis:" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +new file mode 100644 +index 000000000..a2e7f66a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++#include ++#define N 131590 ++#define F 384477 ++ ++int ownStartPtr[F]; ++double bPrimePtr[N]; ++double diagPtr[N]; ++double psiPtr[N]; ++double upperPtr[F]; ++double lowerPtr[F]; ++int uPtr[F]; ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); ++ ++int main(int argc, char *argv[]) ++{ ++ int nCells = N; ++ int nFaces = F; ++ int testIter = 2; ++ for (int i = 0; i < testIter; i++) ++ { ++ SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); ++ } ++ return 0; ++} ++ ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) ++{ ++ double psii; ++ int fStart; ++ int fEnd = ownStartPtr[0]; ++ ++ for (int celli = 0; celli < nCells; celli++) ++ { ++ fStart = fEnd; ++ fEnd = ownStartPtr[celli + 1]; ++ psii = bPrimePtr[celli]; ++ ++ for (int facei = fStart; facei ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++branch_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ ApsiPtr[cell] = 0; ++ else ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 100; ++ ++ for (int i=0; i ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++break_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ break; ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 2; ++ ++ for (int i=0; i ++ ++#define N 131 ++ ++double diagPtr[N]; ++int psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++goto_in_loop (double *diagPtr, int *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cellnodes; ++ while (v > 1) ++ { ++ basic_block bb = di->dfs_to_bb[v]; ++ edge e; ++ ++ par = di->dfs_parent[v]; ++ k = v; ++ ++ ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); ++ ++ if (reverse) ++ { ++ /* If this block has a fake edge to exit, process that first. */ ++ if (bitmap_bit_p (di->fake_exit_edge, bb->index)) ++ { ++ einext = ei; ++ einext.index = 0; ++ goto do_fake_exit_edge; ++ } ++ } ++ ++ /* Search all direct predecessors for the smallest node with a path ++ to them. That way we have the smallest node with also a path to ++ us only over nodes behind us. In effect we search for our ++ semidominator. */ ++ while (!ei_end_p (ei)) ++ { ++ basic_block b; ++ TBB k1; ++ ++ e = ei_edge (ei); ++ b = (reverse) ? 
e->dest : e->src; ++ einext = ei; ++ ei_next (&einext); ++ ++ if (b == en_block) ++ { ++ do_fake_exit_edge: ++ k1 = di->dfs_order[last_basic_block]; ++ } ++ else ++ k1 = di->dfs_order[b->index]; ++ ++ /* Call eval() only if really needed. If k1 is above V in DFS tree, ++ then we know, that eval(k1) == k1 and key[k1] == k1. */ ++ if (k1 > v) ++ k1 = di->key[eval (di, k1)]; ++ if (k1 < k) ++ k = k1; ++ ++ ei = einext; ++ } ++ ++ di->key[v] = k; ++ link_roots (di, par, v); ++ di->next_bucket[v] = di->bucket[k]; ++ di->bucket[k] = v; ++ ++ /* Transform semidominators into dominators. */ ++ for (w = di->bucket[par]; w; w = di->next_bucket[w]) ++ { ++ k = eval (di, w); ++ if (di->key[k] < di->key[w]) ++ di->dom[w] = k; ++ else ++ di->dom[w] = par; ++ } ++ /* We don't need to cleanup next_bucket[]. */ ++ di->bucket[par] = 0; ++ v--; ++ } ++ ++ /* Explicitly define the dominators. */ ++ di->dom[1] = 0; ++ for (v = 2; v <= di->nodes; v++) ++ if (di->dom[v] != di->key[v]) ++ di->dom[v] = di->dom[di->dom[v]]; ++} ++ ++/* { dg-final { scan-tree-dump-times "Warning: Find cycle at bb index" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "static issue" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +new file mode 100644 +index 000000000..e18725f60 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +new file mode 100644 +index 000000000..328dc57bc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +new file mode 100644 +index 000000000..d9c919869 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +new file mode 100644 +index 000000000..806366b5b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +new file mode 100644 +index 000000000..91567d1e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +@@ -0,0 +1,16 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate 
-fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main() ++{ ++ for(int i = 0; i < 100000; i++) ++ { ++ __builtin_prefetch_full(&val[i], 0, 3); ++ val[i] = i + 1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +new file mode 100644 +index 000000000..c28150654 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,4); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +new file mode 100644 +index 000000000..e8d9c8693 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +new file mode 100644 +index 000000000..b0281882f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +new file mode 100644 +index 000000000..26807556f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +new file mode 100644 +index 
000000000..4f2def13d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +new file mode 100644 +index 000000000..ecc501f1f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +new file mode 100644 +index 000000000..d140f1ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +new file mode 100644 +index 000000000..d6f170253 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,3); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +new file mode 100644 +index 000000000..8da092b36 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,4); ++ val[i]=i+1; ++ } 
++} ++ ++/* { dg-final { scan-assembler "PSTL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +new file mode 100644 +index 000000000..4cf65188a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +new file mode 100644 +index 000000000..36f4a3aa0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +new file mode 100644 +index 000000000..43d2d41d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +new file mode 100644 +index 000000000..ba90e7ea4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++#include ++ ++#define N 1000 ++ ++long a[N] = {0}; ++long b[N] = {0}; ++long c[N] = {0}; ++ ++double ++referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) ++{ ++ double sum; ++ for (int cell = 0; cell < nCells; cell++) ++ { ++ // Multi-layer pointer ++ sum += psiPtr[lPtr[cell]]; ++ psiPtr[uPtr[cell]] = sum; ++ ++ // Outer pointer, inner array ++ sum += psiPtr[b[cell]]; ++ psiPtr[a[cell]] = sum; ++ ++ // Multi-layer array ++ sum += a[b[cell]]; ++ c[a[cell]] = sum; ++ ++ // Outer array, inner pointer ++ sum += a[lPtr[cell]]; ++ c[lPtr[cell]] = sum; ++ } ++ return sum; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ double *psiPtr = NULL; ++ int *lPtr = NULL; ++ int *uPtr = NULL; ++ psiPtr = (double *) calloc (N, sizeof(double)); ++ 
lPtr = (int *) calloc (N, sizeof(int)); ++ uPtr = (int *) calloc (N, sizeof(int)); ++ ++ for (int i = 0; i < testIter; i++) ++ referenceTrace (psiPtr, lPtr, uPtr, N); ++ ++ free (psiPtr); ++ free (lPtr); ++ free (uPtr); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +new file mode 100644 +index 000000000..b0f68ebe3 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -0,0 +1,211 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-mode=0" } ++ ++program main ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt ++ ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts ++ ++ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, t0, smdiv ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch,iter ++ ++ LOGICAL :: non_hydrostatic ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*36/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 3 ++ rk_order = 1 ++ dts = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ step = 1 ++ non_hydrostatic = .true. ++ ++ call random_number(random1) ++ interval = random1*100 ++ interval=1 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(alt) ++ call random_number(c2a) ++ call random_number(ph) ++ call random_number(pm1) ++ call random_number(mu) ++ call random_number(muts) ++ call random_number(dnw) ++ call random_number(rdnw) ++ call random_number(znu) ++ ++ do iter=1,2 ++ call calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ enddo ++ ++end program ++ ++ ++SUBROUTINE calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ IMPLICIT NONE ! religion first ++ !asb ++! 
declarations for the stuff coming in ++ ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & ++ p ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & ++ t_2, & ++ t_1, & ++ c2a ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 ++ ++ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & ++ muts ++ ++ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & ++ rdnw, & ++ znu ++ ++ REAL, INTENT(IN ) :: t0, smdiv ++ ++ LOGICAL, INTENT(IN ) :: non_hydrostatic ++ ++! local variables ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ REAL :: ptmp ++ ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = min(kte,kde-1) ++ ++ IF (non_hydrostatic) THEN ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ++! al computation is all dry, so ok with moisture ++ ++ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & ++ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) ++ ++! this is temporally linearized p, no moisture correction needed ++ ++ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ ELSE ! hydrostatic calculation ++ ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ p(i,k,j)=mu(i,j)*znu(k) ++ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) ++ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & ++ +mu(i,j)*alt(i,k,j)) ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ END IF ++ ++! divergence damping setup ++ ++ IF (step == 0) then ! we're initializing small timesteps ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ pm1(i,k,j)=p(i,k,j) ++ ENDDO ++ ENDDO ++ ENDDO ++ ELSE ! we're in the small timesteps ++ DO j=j_start, j_end ! and adding div damping component ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ptmp = p(i,k,j) ++ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) ++ pm1(i,k,j) = ptmp ++ ENDDO ++ ENDDO ++ ENDDO ++ END IF ++ ++END SUBROUTINE calc_p_rho ++ ++! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing succeeded" 46 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 3, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tpm1\\t\\(0.000000, 3, 2, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tph\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tal\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\talt\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..13d225f35 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,29 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++load_lib gfortran-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Main loop. ++gfortran-dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +new file mode 100644 +index 000000000..501e6e74c +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +@@ -0,0 +1,62 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } ++ ++MODULE INPUT ++ IMPLICIT NONE ++ ++ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 ++ ++ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 ++ REAL(wp), DIMENSION(jpi, jpj) :: e12t ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n ++ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta ++ ++END MODULE INPUT ++ ++PROGRAM MAIN ++ USE INPUT ++ ++ IMPLICIT NONE ++ ++ INTEGER :: EPOCH ++ ++! Initialize arrays ++ ++ e12t = 1 ++ fse3t_n = 1 ++ pta = 1 ++! ++ ++ DO EPOCH=1,2 ++ CALL tra_ldf_iso ++ ENDDO ++ ++END PROGRAM MAIN ++ ++SUBROUTINE tra_ldf_iso ++ USE INPUT ++ ++ IMPLICIT NONE ++ ! ++ INTEGER :: ji, jj, jk, jn ! dummy loop indices ++ REAL(wp) :: zbtr, ztra ! - - ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw ++ ++ DO jn = 1, kjpt ++ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 ++ ++ DO jk = 1, jpkm1 ++ DO jj = 2, jpjm1 ++ DO ji = fs_2, fs_jpim1 ! vector opt. ++ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) ++ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr ++ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra ++ END DO ++ END DO ++ END DO ++ ! ++ END DO ++ ! ++END SUBROUTINE tra_ldf_iso ++ ++! 
{ dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +new file mode 100644 +index 000000000..7345759db +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -0,0 +1,58 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } ++ ++Module module_domain ++ IMPLICIT NONE ++ ++ REAL, PARAMETER :: g = 9.8 ++ TYPE :: grid_type ++ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) ++ REAL, POINTER :: fnm(:), fnp(:) ++ END TYPE ++END Module ++ ++SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) ++ ++ USE module_domain ++ !USE module_model_constants ++ ++ IMPLICIT NONE ++ ++ ++ !TYPE (domain), INTENT(IN) :: grid ++ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy ++ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w ++ ++ ++ INTEGER :: k ++ REAL :: z0, z1, z2, w1, w2 ++ REAL, DIMENSION(k_start:k_end) :: z_at_w ++ REAL, DIMENSION(k_start:k_end-1) :: z ++ TYPE (grid_type), POINTER :: grid ++ ++ ++ DO k = k_start, k_end ++ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g ++ END DO ++ ++ DO k = k_start, k_end-1 ++ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) ++ END DO ++ ++ DO k = k_start+1, k_end-1 ++ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & ++ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) ++ END DO ++ ++ z0 = z_at_w(k_start) ++ z1 = z(k_start) ++ z2 = z(k_start+1) ++ w1 = (z0 - z2)/(z1 - z2) ++ w2 = 1. - w1 ++ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & ++ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) ++ ++END SUBROUTINE calc_p8w ++ ++! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +new file mode 100644 +index 000000000..f79df5d26 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +@@ -0,0 +1,320 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=branch-prob-threshold=50 --param=filter-kernels=0 --param=mem-access-num=2 --param=issue-topn=2 --param=force-issue=1 --param=outer-loop-nums=3" } ++!include "module_small_step_em.F90" ++ ++Module add_type ++ IMPLICIT NONE ++ ++ TYPE :: grid_config_rec_type ++ LOGICAL :: open_xs ++ LOGICAL :: open_ys ++ LOGICAL :: open_xe ++ LOGICAL :: open_ye ++ LOGICAL :: symmetric_xs ++ LOGICAL :: symmetric_xe ++ LOGICAL :: symmetric_ys ++ LOGICAL :: symmetric_ye ++ LOGICAL :: polar ++ LOGICAL :: nested ++ LOGICAL :: periodic_x ++ LOGICAL :: specified ++ END TYPE ++END Module ++ ++program main ++ ++ ++! include "module_small_step_em_modify.F90" ++ ++! use module_small_step_em ++! 
use module_small_step_em_modify ++ ++ use add_type ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step, spec_zone ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme, 1:8) :: llcRefresh ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u, v, u_1, v_1, t_1, ww_1, ft!u, v, u_1, v_1, w_1, t_1, ww1, ww_1,ph_1, ft ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_save, v_save, w_save, t_save, ph_save,h_diabatic ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_2, v_2, w_2, t_2, ph_2 ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: c2a, ww_save, cqw, cqu, cqv, alpha, gamma, a ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ww!pb, p, ph, php, pm1, al, alt, ww, random_array ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ru_tend, rv_tend ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t, t_ave, uam, vam, wwam ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_1,mu_2, mu ++ REAL, DIMENSION(ims:ime, jms:jme) :: mub, muu, muv, mut, & ++ msfux, msfuy, & ++ msfvx, msfvx_inv, msfvy, & ++ msftx, msfty ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: muus, muvs, muts, mudf, muave ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_save, mu_tend ++ ++ REAL, DIMENSION(kms:kme) :: rdn, rdnw,dnw, fnm, fnp, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, cf1, cf2, cf3, t0, emdiv, smdiv, epssm, g ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch ++ ++ LOGICAL :: non_hydrostatic, top_lid ++ ++ ++ TYPE (grid_config_rec_type) :: config_flags ++ config_flags%open_xs = .true. ++ config_flags%open_ys = .true. ++ config_flags%open_xe = .true. ++ config_flags%open_ye = .true. ++ config_flags%symmetric_xs = .true. ++ config_flags%symmetric_xe = .true. ++ config_flags%symmetric_ys = .true. ++ config_flags%symmetric_ye = .true. ++ config_flags%polar = .true. ++ config_flags%nested = .true. ++ config_flags%periodic_x = .true. ++ config_flags%specified = .true. ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*98/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 1 ++ rk_order = 1 ++ dts = 1. ++ epssm = 1. ++ g = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ dts = 1. ++ cf1 = 1. ++ cf2 = 1. ++ cf3 = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ emdiv = 1. ++ step = 1 ++ spec_zone = 1 ++ ++ non_hydrostatic = .true. ++ top_lid = .true. 
++ ++ interval=1 ++ ++ ++ total_time=0 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(u) ++ call random_number(v) ++ call random_number(u_1) ++ call random_number(v_1) ++ call random_number(t_1) ++ call random_number(ft) ++ ++ call random_number(ww) ++ call random_number(ww_1) ++ call random_number(t) ++ call random_number(t_ave) ++ call random_number(uam) ++ call random_number(vam) ++ call random_number(wwam) ++ ++ call random_number(muu) ++ call random_number(muv) ++ call random_number(mut) ++ call random_number(msfux) ++ call random_number(msfuy) ++ call random_number(msfvx) ++ call random_number(msfvx_inv) ++ call random_number(msfvy) ++ call random_number(msftx) ++ call random_number(msfty) ++ call random_number(mu_tend) ++ ++ call random_number(muave) ++ call random_number(muts) ++ call random_number(mudf) ++ call random_number(mu) ++ ++ call random_number(fnm) ++ call random_number(fnp) ++ call random_number(dnw) ++ call random_number(rdnw) ++ ++ DO j=jms, jme ++ DO k=kms, kme ++ DO i=ims, ime ++ ++ llcRefresh(i,k,j,1)=i+k+j+7 ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ do epoch = 1,2 ++ call advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ enddo ++end program ++ ++ ++ ++SUBROUTINE advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ use add_type ++ ++ IMPLICIT NONE ! religion first ++ ++ ! 
stuff coming in ++ ++ TYPE(grid_config_rec_type), INTENT(IN ) :: config_flags ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(IN ) :: & ++ u, & ++ v, & ++ u_1, & ++ v_1, & ++ t_1, & ++ ft ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(INOUT) :: & ++ ww, & ++ ww_1, & ++ t, & ++ t_ave, & ++ uam, & ++ vam, & ++ wwam ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(IN ) :: muu, & ++ muv, & ++ mut, & ++ msfux,& ++ msfuy,& ++ msfvx,& ++ msfvx_inv,& ++ msfvy,& ++ msftx,& ++ msfty,& ++ mu_tend ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT( INOUT) :: muave, & ++ muts, & ++ mudf ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(INOUT) :: mu ++ ++ REAL, DIMENSION( kms:kme ), INTENT(IN ) :: fnm, & ++ fnp, & ++ dnw, & ++ rdnw ++ ++ ++ REAL, INTENT(IN ) :: rdx, & ++ rdy, & ++ dts, & ++ epssm ++ ++ REAL, DIMENSION (its:ite, kts:kte) :: wdtn, dvdxi ++ REAL, DIMENSION (its:ite) :: dmdt ++ ++ INTEGER :: i,j,k, i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ REAL :: acc ++ ++ INTEGER :: ubv, lbv, t1, t2, t3, t4, ceild, floord ++ ++ ceild(t1, t2) = ceiling(REAL(t1)/REAL(t2)) ++ floord(t1, t2) = floor(REAL(t1)/REAL(t2)) ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = kte-1 ++ IF ( .NOT. config_flags%periodic_x )THEN ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ i_start = max(its,ids+1) ++ i_end = min(ite,ide-2) ++ ENDIF ++ ENDIF ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ j_start = max(jts,jds+1) ++ j_end = min(jte,jde-2) ++ ENDIF ++ ++ i_endu = ite ++ j_endv = jte ++ ++ DO j = j_start, j_end ++ ++ DO i=i_start, i_end ++ dmdt(i) = 0. ++ ENDDO ++ ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ dvdxi(i,k) = msftx(i,j)*msfty(i,j)*( & ++ rdy*((v(i,k,j+1)+muv(i,j+1)*v_1(i,k,j+1)*msfvx_inv(i,j+1)) & ++ -(v(i,k,j )+muv(i,j )*v_1(i,k,j)*msfvx_inv(i,j ))) & ++ +rdx*((u(i+1,k,j)+muu(i+1,j)*u_1(i+1,k,j)/msfuy(i+1,j)) & ++ -(u(i,k,j )+muu(i ,j)*u_1(i,k,j )/msfuy(i,j)) )) ++ dmdt(i) = dmdt(i) + dnw(k)*dvdxi(i,k) ++ ENDDO ++ ENDDO ++ DO i=i_start, i_end ++ muave(i,j) = mu(i,j) ++ mu(i,j) = mu(i,j)+dts*(dmdt(i)+mu_tend(i,j)) ++ mudf(i,j) = (dmdt(i)+mu_tend(i,j)) ! save tendency for div dampfilter ++ muts(i,j) = mut(i,j)+mu(i,j) ++ muave(i,j) =.5*((1.+epssm)*mu(i,j)+(1.-epssm)*muave(i,j)) ++ ENDDO ++ ENDDO ++END SUBROUTINE advance_mu_t_fortran_plu ++ ++! { dg-final { scan-tree-dump "issue_llc_hint" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "analyze_nested_kernels" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump "Stop tracing the outer loop depth" "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 36c3e7d5a..14129a500 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -84,6 +84,7 @@ DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") + DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") + DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") ++DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") + DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream decompression") + DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream compression") +@@ -215,6 +216,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") + DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") + DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") + DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") ++DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") + DEFTIMEVAR (TV_PREDCOM , "predictive commoning") + DEFTIMEVAR (TV_TREE_CH , "tree copy headers") + DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") +diff --git a/gcc/toplev.cc b/gcc/toplev.cc +index f00a166df..bdbd4de63 100644 +--- a/gcc/toplev.cc ++++ b/gcc/toplev.cc +@@ -567,6 +567,12 @@ compile_file (void) + targetm.asm_out.output_ident (ident_str); + } + ++ /* Extend auto profile finalization. */ ++ if (flag_ipa_extend_auto_profile) ++ { ++ free_extend_profile_info (); ++ } ++ + /* Auto profile finalization. */ + if (flag_auto_profile) + end_auto_profile (); +diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc +index d33aaec8c..40f67a8ed 100644 +--- a/gcc/tree-cfg.cc ++++ b/gcc/tree-cfg.cc +@@ -8476,6 +8476,17 @@ print_loops (FILE *file, int verbosity) + print_loop_and_siblings (file, bb->loop_father, 0, verbosity); + } + ++/* Dump a loop to file. */ ++ ++void ++loop_dump (FILE *file, class loop *loop) ++{ ++ print_loop (file, loop, 0, 0); ++ fprintf (file, "vec_niter = "); ++ print_generic_expr (file, loop->vec_nb_iterations); ++ fprintf (file, "\n"); ++} ++ + /* Dump a loop. 
*/ + + DEBUG_FUNCTION void +diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h +index bfe44c073..0982fa7cf 100644 +--- a/gcc/tree-cfg.h ++++ b/gcc/tree-cfg.h +@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); + extern void debug_function (tree, dump_flags_t); + extern void print_loops_bb (FILE *, basic_block, int, int); + extern void print_loops (FILE *, int); ++extern void loop_dump (FILE *file, class loop *loop); + extern void debug (class loop &ref); + extern void debug (class loop *ptr); + extern void debug_verbose (class loop &ref); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index a98f84397..468353d13 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -395,6 +395,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); +@@ -536,6 +537,8 @@ extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * + ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context ++ *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); +diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc +index 44157265c..4c014fb23 100644 +--- a/gcc/tree-scalar-evolution.cc ++++ b/gcc/tree-scalar-evolution.cc +@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) + the loop body has been executed 6 times. */ + + tree +-number_of_latch_executions (class loop *loop) ++number_of_latch_executions (class loop *loop, bool guarantee) + { + edge exit; + class tree_niter_desc niter_desc; +@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) + res = chrec_dont_know; + exit = single_exit (loop); + +- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) ++ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, ++ true, NULL, guarantee)) + { + may_be_zero = niter_desc.may_be_zero; + res = niter_desc.niter; +@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) + fprintf (dump_file, "))\n"); + } + +- loop->nb_iterations = res; ++ if (guarantee) ++ loop->nb_iterations = res; + return res; + } + +diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h +index 0f90207bc..dc27d9545 100644 +--- a/gcc/tree-scalar-evolution.h ++++ b/gcc/tree-scalar-evolution.h +@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. 
If not see + #ifndef GCC_TREE_SCALAR_EVOLUTION_H + #define GCC_TREE_SCALAR_EVOLUTION_H + +-extern tree number_of_latch_executions (class loop *); ++extern tree number_of_latch_executions (class loop *, ++ bool guarantee = true); + extern gcond *get_loop_exit_condition (const class loop *); + + extern void scev_initialize (void); +diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc +new file mode 100644 +index 000000000..da6d72b94 +--- /dev/null ++++ b/gcc/tree-ssa-llc-allocate.cc +@@ -0,0 +1,4150 @@ ++/* LLC allocate. ++ Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_VECTOR ++#define INCLUDE_LIST ++#define INCLUDE_ALGORITHM ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "gimple.h" ++#include "predict.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "optabs-query.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "stor-layout.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "gimplify-me.h" ++#include "tree-ssa-loop-ivopts.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop-niter.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfgloop.h" ++#include "tree-scalar-evolution.h" ++#include "langhooks.h" ++#include "tree-inline.h" ++#include "tree-data-ref.h" ++#include "diagnostic-core.h" ++#include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "internal-fn.h" ++#include "tree-cfg.h" ++#include "profile-count.h" ++#include "auto-profile.h" ++ ++/* Number of parallel cores. */ ++const unsigned int PARALLEL_NUM = 304; ++ ++/* Indirect access weight. */ ++const unsigned int INDIRECT_ACCESS_VALUE = 3; ++ ++/* Write memory weight. */ ++const unsigned int WRITE_COST = 4; ++ ++/* Maximum ratio of total prefetch data size to cache size. */ ++const double PREFETCH_CACHE_SIZE_RATIO = 0.8; ++ ++/* Prefetch tool input max length. */ ++#ifndef PREFETCH_TOOL_INPUT_MAX_LEN ++#define PREFETCH_TOOL_INPUT_MAX_LEN 512 ++#endif ++ ++/* Prefetch tool number max length. */ ++#ifndef PREFETCH_TOOL_NUM_MAX_LEN ++#define PREFETCH_TOOL_NUM_MAX_LEN 9 ++#endif ++ ++#ifndef PREFETCH_FUNC_TOPN ++#define PREFETCH_FUNC_TOPN param_llc_allocate_func_topn ++#endif ++ ++namespace { ++ ++/* loop bound info of the memory reference located. */ ++struct loop_bound ++{ ++ /* iv tree_node. */ ++ tree iv; ++ ++ /* define stmt of iv. */ ++ gimple *def_stmt; ++ ++ /* loop where stmt is located. */ ++ class loop *loop; ++ ++ /* loop unroll factor. */ ++ unsigned int unroll; ++ ++ /* Number of iterations of loop. 
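++     Initialized to chrec_dont_know until the loop bound is traced
++     (see the constructor below).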
*/ ++ tree niters; ++ ++ loop_bound (tree t, gimple *stmt) ++ { ++ iv = t; ++ def_stmt = stmt; ++ loop = loop_containing_stmt (stmt); ++ unroll = 1; ++ niters = chrec_dont_know; ++ } ++}; ++ ++/* method of calculating the data size. */ ++ ++enum calc_type ++{ ++ UNHANDLE_CALC = 0, ++ RUNTIME_CALC, ++ STATIC_CALC ++}; ++ ++/* Describes a info of a memory reference. */ ++ ++struct data_ref ++{ ++ /* The memory reference. */ ++ tree ref; ++ ++ /* Statement where the ref is located. */ ++ gimple *stmt; ++ ++ /* var_decl or param_decl, used for the ref_group. */ ++ tree var; ++ ++ /* Base of the reference. */ ++ tree base; ++ ++ /* Constant offset of the reference. */ ++ tree offset; ++ ++ /* index of the reference. */ ++ tree index; ++ ++ /* Constant step of the reference. */ ++ tree step; ++ ++ /* loop boundary info of each dimension. */ ++ std::vector loop_bounds; ++ ++ /* memory data size, Unit: MB. */ ++ double data_size; ++ ++ /* method of calculating the data size. */ ++ calc_type calc_by; ++ ++ /* True if the info of ref is traced, and then record it. */ ++ unsigned int trace_status_p : 1; ++ ++ /* True if the loop is vectorized. */ ++ unsigned int vectorize_p : 1; ++ ++ /* True if the memory reference is shared. */ ++ unsigned int parallel_p : 1; ++ ++ /* True if the memory reference is regular. */ ++ unsigned int regular_p : 1; ++ ++ /* True if the memory reference is read. */ ++ unsigned int read_p : 1; ++ ++ /* loop father depth. */ ++ unsigned int loop_depth; ++ ++ /* bb index. */ ++ int bb_idx; ++ ++ /* loop index. */ ++ int loop_idx; ++ ++ data_ref () ++ { ++ ref = NULL_TREE; ++ stmt = NULL; ++ var = NULL_TREE; ++ base = NULL_TREE; ++ offset = NULL_TREE; ++ index = NULL_TREE; ++ step = NULL_TREE; ++ data_size = 0; ++ calc_by = UNHANDLE_CALC; ++ trace_status_p = false; ++ vectorize_p = false; ++ parallel_p = false; ++ regular_p = true; ++ read_p = true; ++ loop_depth = 0; ++ bb_idx = 0; ++ loop_idx = 0; ++ } ++}; ++ ++/* ================ phase 1 get_dense_memory_kernels ================ */ ++ ++/* Add ref node and print. */ ++ ++void ++add_ref (std::vector &references, tree op, gimple *stmt, ++ bool vectorize_p, bool read_p) ++{ ++ data_ref ref; ++ ref.ref = op; ++ ref.stmt = stmt; ++ ref.vectorize_p = vectorize_p; ++ ref.read_p = read_p; ++ ref.loop_depth = loop_depth (stmt->bb->loop_father); ++ ref.bb_idx = stmt->bb->index; ++ ref.loop_idx = stmt->bb->loop_father->num; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ references.push_back (ref); ++} ++ ++/* Get the references from the simple call (vectorization type). */ ++ ++void ++get_references_in_gimple_call (gimple *stmt, std::vector &references) ++{ ++ if (gimple_code (stmt) != GIMPLE_CALL) ++ return; ++ ++ if (gimple_call_internal_p (stmt)) ++ { ++ bool read_p = false; ++ switch (gimple_call_internal_fn (stmt)) ++ { ++ case IFN_MASK_GATHER_LOAD: ++ case IFN_MASK_LOAD: ++ { ++ if (gimple_call_lhs (stmt) == NULL_TREE) ++ return; ++ read_p = true; ++ // FALLTHRU ++ } ++ case IFN_MASK_STORE: ++ { ++ /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); ++ ++ _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); ++ ++ _1 = (sizetype) a_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, ++ { 0.0, ... 
}, loop_mask_5);
++         */
++          tree op1 = gimple_call_arg (stmt, 0);
++          if (TREE_CODE (op1) != SSA_NAME)
++            {
++              if (dump_file && (dump_flags & TDF_DETAILS))
++                {
++                  fprintf (dump_file, "get_references_in_gimple_call: ");
++                  fprintf (dump_file, "find base that not ssa_name: ");
++                  print_generic_expr (dump_file, op1, TDF_LINENO);
++                  fprintf (dump_file, "\n");
++                }
++              return;
++            }
++          gimple *op1_def = SSA_NAME_DEF_STMT (op1);
++          if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN)
++            {
++              /* &MEM[base: xx] */
++              tree rhs1 = gimple_assign_rhs1 (op1_def);
++              /* If the definition stmt of the operand is a memory
++                 reference type, read it directly.  */
++              if (TREE_CODE (rhs1) == ADDR_EXPR
++                  && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF)
++                op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */
++            }
++
++          add_ref (references, op1, stmt, true, read_p);
++          return;
++        }
++      default:
++        return;
++      }
++    }
++}
++
++/* Check whether the memory reference is located exactly in the main
++   function.  There are some other unexpected scenarios where tracing fails
++   because the mem ref or function has no loc info (newly generated
++   gimple/function).  */
++
++bool
++is_reference_in_main_p (gimple *stmt)
++{
++  expanded_location xloc = expand_location (stmt->location);
++  if (DECL_NAME (cfun->decl) && MAIN_NAME_P (DECL_NAME (cfun->decl)))
++    {
++      /* NEXT STEP: Check why some functions have no end_locus.  */
++      if (!(DECL_SOURCE_LOCATION (current_function_decl)
++            && cfun->function_end_locus))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "Cannot find function start-end location.\n");
++          return true;
++        }
++      else if (!(xloc.file && xloc.line))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "Cannot find gimple statement location.\n");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          return false;
++        }
++      int fn_start = expand_location (
++        DECL_SOURCE_LOCATION (current_function_decl)).line;
++      int fn_end = expand_location (cfun->function_end_locus).line;
++
++      if (xloc.line >= fn_start && xloc.line <= fn_end)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "Memory access in main function: ");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          return true;
++        }
++    }
++  return false;
++}
++
++/* Stores the locations of memory references in STMT to REFERENCES.  */
++
++void
++get_references_in_stmt (gimple *stmt, std::vector<data_ref> &references)
++{
++  if (!gimple_vuse (stmt))
++    return;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "gimple_vuse: ");
++      print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++    }
++
++  /* Filter out memory references located in the main function.  This is an
++     experimental filtering scheme ONLY for HPC case verification, as
++     some HPC cases assign values for variables (mem ref) in the main
++     function.
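++     e.g., an initialization loop in main such as
++       for (i = 0; i < n; i++) a[i] = 0.0;
++     would otherwise be reported as a dense kernel (illustrative example).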
++  */
++  if (is_reference_in_main_p (stmt))
++    return;
++
++  if (gimple_code (stmt) == GIMPLE_ASSIGN)
++    {
++      tree op0 = gimple_assign_lhs (stmt);
++      tree op1 = gimple_assign_rhs1 (stmt);
++      tree base = NULL_TREE;
++
++      /* _1 = MEM[base: a, index: i, step: 8, offset: 0B];  */
++      if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1))
++          && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base))
++        add_ref (references, op1, stmt, false, true);
++
++      if (REFERENCE_CLASS_P (op0) && get_base_address (op0))
++        add_ref (references, op0, stmt, false, false);
++    }
++  else if (gimple_code (stmt) == GIMPLE_CALL)
++    get_references_in_gimple_call (stmt, references);
++
++  return;
++}
++
++/* Flags describing why a loop is filtered out.  */
++
++struct loop_filter_out_flag
++{
++  /* Use external call.  */
++  bool use_ext_call;
++
++  /* Use external node.  */
++  bool use_ext_node;
++
++  /* Use loop defined in macros.  */
++  bool use_macro_loop;
++
++  /* Use conditional function.  */
++  bool use_cond_func;
++};
++
++/* Check whether an external node is used.  */
++
++bool use_ext_node_p (const std::vector<data_ref> &references,
++                     unsigned int &start)
++{
++  expanded_location cfun_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++
++  unsigned i = start;
++  start = references.size ();
++  for (; i < references.size (); i++)
++    {
++      data_ref ref = references[i];
++      expanded_location xloc = expand_location (ref.stmt->location);
++      if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "use_ext_node\n\n");
++          return true;
++        }
++    }
++  return false;
++}
++
++/* Determine whether to filter out loops by stmt.  */
++
++bool
++filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt,
++                           const std::vector<data_ref> &references,
++                           unsigned int &start)
++{
++  expanded_location xloc = expand_location (stmt->location);
++  /* Check use_ext_call.  */
++  if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "use_ext_call: ");
++          print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++        }
++      loop_filter.use_ext_call = true;
++      return true;
++    }
++
++  /* Check use_macro_loop.  */
++  if (xloc.file && xloc.column != 1)
++    loop_filter.use_macro_loop = false;
++
++  /* Check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR.  */
++  if (gimple_code (stmt) == GIMPLE_ASSIGN)
++    {
++      enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
++      if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR
++          || rhs_code == MAX_EXPR)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "use_cond_func: ");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          loop_filter.use_cond_func = true;
++          return true;
++        }
++    }
++
++  /* Check use_ext_node.  */
++  if (use_ext_node_p (references, start))
++    {
++      loop_filter.use_ext_node = true;
++      return true;
++    }
++
++  return false;
++}
++
++/* Dump which flag caused the loop to be filtered out.
*/ ++ ++void ++dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) ++{ ++ if (loop_filter.use_ext_call) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_call\n"); ++ } ++ ++ if (loop_filter.use_ext_node) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_node\n"); ++ } ++ ++ if (loop_filter.use_macro_loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); ++ } ++ ++ if (loop_filter.use_cond_func) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_cond_func\n"); ++ } ++} ++ ++/* Get references in loop. */ ++ ++bool ++get_references_in_loop (std::vector &references, ++ loop_filter_out_flag &loop_filter, ++ class loop *loop) ++{ ++ unsigned int start = 0; ++ bool filter_out_loop = true; ++ ++ /* Analyze each bb in the loop. */ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ gimple_stmt_iterator bsi; ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ gimple *stmt = gsi_stmt (bsi); ++ get_references_in_stmt (stmt, references); ++ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, ++ references, start); ++ if (filter_out_loop) ++ break; ++ } ++ if (filter_out_loop) ++ break; ++ } ++ free (body); ++ return !filter_out_loop; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. ++ Assume that the HPC data reading and calculation process does not involve ++ adding branches in loops. Therefore, all bbs of loops are directly used for ++ calculation (excluding embedded loops) without considering branch weighting. ++*/ ++ ++unsigned ++estimate_loop_insns (class loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body (loop); ++ gimple_stmt_iterator gsi; ++ unsigned size = 0, i; ++ ++ for (i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ size += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ free (body); ++ ++ return size; ++} ++ ++/* Check whether the memory access is dense. */ ++ ++bool ++dense_memory_p (const std::vector &references, class loop *loop) ++{ ++ int ref_count = references.size (); ++ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); ++ float mem_to_insn_ratio = (float)ref_count / (float)ninsns; ++ ++ /* The number of cores to be run and DDR bandwidth information can be ++ transferred to flexibly adjust the threshold. */ ++ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) ++ && ref_count >= param_mem_access_num); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ ++ /* Dump dense memory source code location. 
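++         The dump format is
++           [file:function(fn_start-fn_end):line:column]
++         e.g., [foo.c:kernel(10-42):18:7] (file and function names are
++         illustrative).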
++         */
++      if (ref_count && references[0].stmt->location)
++        {
++          expanded_location xloc = expand_location
++            (references[0].stmt->location);
++          int fn_start = 0;
++          if (DECL_SOURCE_LOCATION (current_function_decl))
++            fn_start = expand_location (
++              DECL_SOURCE_LOCATION (current_function_decl)).line;
++          int fn_end = fn_start;
++          if (cfun->function_end_locus)
++            fn_end = expand_location (cfun->function_end_locus).line;
++          if (xloc.file)
++            fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ",
++                     xloc.file, fn_name, fn_start, fn_end,
++                     xloc.line, xloc.column);
++        }
++
++      /* Dump memory dense information.  */
++      if (dense_mem)
++        fprintf (dump_file, "dense memory access: ");
++      else
++        fprintf (dump_file, "non-dense mem access: ");
++      fprintf (dump_file,
++               "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n",
++               ref_count, ninsns, mem_to_insn_ratio);
++    }
++
++  return dense_mem;
++}
++
++/* Analyze the inner loop and get the loop with dense memory access.  */
++
++void
++analyze_loop_dense_memory (std::vector<class loop *> &kernels,
++                           std::map<class loop *, std::vector<data_ref> >
++                             &kernels_refs,
++                           class loop *loop)
++{
++  std::vector<data_ref> references;
++  number_of_latch_executions (loop);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "\n========== Processing loop %d: ==========\n",
++               loop->num);
++      loop_dump (dump_file, loop);
++      flow_loop_dump (loop, dump_file, NULL, 1);
++      fprintf (dump_file, "loop unroll: %d\n", loop->unroll);
++    }
++
++  if (get_loop_exit_edges (loop).length () != 1)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "non-dense mem access: loop_multiple_exits\n");
++      return;
++    }
++
++  loop_filter_out_flag loop_filter = {false, false, true, false};
++
++  if (!get_references_in_loop (references, loop_filter, loop))
++    {
++      dump_loop_filter_out_flag (loop_filter);
++      return;
++    }
++
++  if (dense_memory_p (references, loop))
++    {
++      kernels_refs[loop] = references;
++      kernels.push_back (loop);
++    }
++}
++
++/* Walk all innermost loops and collect those with dense memory access.  */
++
++bool
++get_dense_memory_kernels (std::vector<class loop *> &kernels,
++                          std::map<class loop *, std::vector<data_ref> >
++                            &kernels_refs)
++{
++  if (dump_file)
++    fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n");
++  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
++    analyze_loop_dense_memory (kernels, kernels_refs, loop);
++  return kernels.size () > 0;
++}
++
++/* ================ phase 2 trace_data_refs_info ================ */
++
++/* Determine whether the declaration is a non-vectorized declaration.  */
++
++bool
++generic_decl_p (tree expr)
++{
++  if (expr == NULL_TREE)
++    return false;
++  enum tree_code expr_code = TREE_CODE (expr);
++  if (expr_code != VAR_DECL && expr_code != PARM_DECL
++      && expr_code != COMPONENT_REF)
++    return false;
++  return true;
++}
++
++/* Initial worklist preparation for source variable tracing.
++   Add different initial nodes based on the kind of gimple statement.
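++   e.g., for a PHI node `p_1 = PHI <p_2(3), p_3(5)>' every argument is
++   pushed; for `_1 = _2 + _3' both operands are pushed; for a
++   POINTER_PLUS_EXPR only the pointer operand (rhs1) is pushed.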
*/ ++ ++void ++add_worklist (std::vector &worklist, std::set &walked, ++ gimple *def_stmt) ++{ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) ++ { ++ tree node = gimple_phi_arg_def (def_stmt, i); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR ++ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME ++ || rhs_code == COMPONENT_REF) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ node = gimple_assign_rhs2 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == TARGET_MEM_REF || rhs_code == MEM_REF) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "possibly unnested indirect memory access: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ else ++ { ++ /* unhandled assign rhs_code: _219 = _17 * _70; ++ _17 = *grid_56(D).sst.span; ++ _70 = *grid_56(D).sst.dim[0].stride; ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled assign rhs_code: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unsupported tracing stmt: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++ ++/* Tracing source variables: ++ vectp.1 = a_2(D) + _3; ++ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; ++ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); ++ ++ _1 = (sizetype) b_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, ++ loop_mask_5); ++ ... ++ Due to previous pass optimizations, the current tracing method can find ++ several source variable candidates. We decide to record them in a map and ++ later filter out the true base variable by some criteria. ++*/ ++ ++void ++trace_base_var_helper (tree arg, std::set &walked, ++ std::map& base_var_candid, bool is_vect_type) ++{ ++ if (arg == NULL) ++ return; ++ ++ /* Var_decl type: base address extracted from ARRAY_REF. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == ARRAY_TYPE && TREE_CODE (arg) == VAR_DECL ++ && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "var_decl type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* Array type. */ ++ tree op0 = NULL; ++ if (TREE_CODE (arg) == ADDR_EXPR ++ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "array type\n"); ++ base_var_candid[op0] += 1; ++ return; ++ } ++ ++ /* Pointer type. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "pointer type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* SSA_NAME type. 
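++     e.g., a_2(D): look through SSA_NAME_VAR to reach the underlying
++     PARM_DECL or VAR_DECL.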
*/ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return; ++ ++ tree tmp_var = SSA_NAME_VAR (arg); ++ if (tmp_var && !is_vect_type && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ssa pointer type\n"); ++ base_var_candid[tmp_var] += 1; ++ return; ++ } ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ if (!walked.count (tmp_var)) ++ walked.insert (tmp_var); ++ trace_base_var_helper (tmp_var, walked, base_var_candid, is_vect_type); ++ } ++ else ++ { ++ std::vector worklist; ++ add_worklist (worklist, walked, def_stmt); ++ for (unsigned i = 0; i < worklist.size (); ++i) ++ trace_base_var_helper (worklist[i], walked, base_var_candid, is_vect_type); ++ } ++} ++ ++/* Identify the base variable traced from base address of memory reference. ++ We recognize that current method could detect several base variable ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1) The number of base variable candidates is 1; ++ 2) The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ ++ ++bool ++trace_base_var (data_ref &mem_ref, std::set &walked) ++{ ++ tree &var = mem_ref.var; ++ tree arg = mem_ref.base; ++ std::map base_var_candid; ++ bool is_vect_type = TREE_CODE (TREE_TYPE (mem_ref.ref)) == VECTOR_TYPE; ++ trace_base_var_helper (arg, walked, base_var_candid, is_vect_type); ++ bool is_tracing_unusual = false; ++ if (base_var_candid.size () == 1) ++ var = base_var_candid.begin ()->first; ++ else ++ { ++ is_tracing_unusual = true; ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ var = it->second == 1 ? it->first : var; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Traced variables at "); ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, ":\n"); ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ fprintf (dump_file, "%s:%d, ", get_name (it->first), it->second); ++ fprintf (dump_file, "\n"); ++ ++ if (var == NULL_TREE) ++ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); ++ else if (is_tracing_unusual && var != NULL_TREE) ++ fprintf (dump_file, "Tracing unusual number or occurrences of base " ++ "variables. Choose %s.\n", ++ get_name (var)); ++ } ++ return var != NULL_TREE; ++} ++ ++/* Recursively trace and check whether the definition stmt of the ++ index operand is a recorded stmt in direct access tracing. ++ Return 0 if ref is a direct access a[]. ++ Return 1 if ref is a non-nested indirect access a[b[]]. ++ Return 2 if ref is a complex indirect memory access, such as a[f(b[])]. */ ++ ++int ++trace_indirect_operand (tree arg, std::set &traced_ref_stmt) ++{ ++ /* Return 0 if tree `arg` is not an SSA for further tracing. */ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return 0; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ /* Return 1 if `index` has been detected as a traced direct memory access ++ before. 
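++     e.g., if `_1 = MEM[base: b_2(D), ...]' was recorded in
++     TRACED_REF_STMT, an index derived from _1 marks the enclosing access
++     as indirect (the a[b[i]] pattern).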
++     */
++  if (traced_ref_stmt.count (def_stmt))
++    return 1;
++
++  /* Return 0 if the def stmt of `arg` is not a gimple assign.  Stop tracing
++     the index operand; currently no memory access operand is detected.  */
++  if (!def_stmt || !is_gimple_assign (def_stmt))
++    return 0;
++
++  tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
++  /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array
++     type indirect memory access.  */
++  if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR
++      && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR)
++    {
++      /* Return 2 if the tree code has any type representing references to
++         storage, implying a complex indirect memory access scenario for
++         future analysis.  */
++      if (rhs_code == MEM_REF || rhs_code == TARGET_MEM_REF
++          || rhs_code == ARRAY_REF || rhs_code == ARRAY_RANGE_REF
++          || rhs_code == COMPONENT_REF || rhs_code == ADDR_EXPR
++          || rhs_code == INDIRECT_REF)
++        return 2;
++
++      /* Return 0 and stop tracing if the tree code is not a common tracing
++         operand but is still reflected as a non-reference type.
++         Caveat: if we have never dealt with this tree code before, it may
++         be safer to treat this scenario strictly.  */
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "unknown tracing tree code: %s\n",
++                   get_tree_code_name (rhs_code));
++          print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return 0;
++    }
++
++  tree op = NULL_TREE;
++  ssa_op_iter iter;
++  FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE)
++    {
++      int trace_indir_p = trace_indirect_operand (op, traced_ref_stmt);
++      if (trace_indir_p != 0)
++        return trace_indir_p;
++    }
++  return 0;
++}
++
++/* Trace the pointer of the direct/indirect memory access:
++   1) Obtain the base address of the memory access.
++   2) If the index variable is formed by another memory access operation
++      (i.e., an indication of indirect memory access), ensure that the index
++      has been traced in an already discovered direct memory access.
++   3) Otherwise, the memory access is in a more complex scenario and we need
++      to postpone the analysis.  For example, the indirect memory access is
++      nested, a[b[c[...]]], or the index variable (formed in another memory
++      access) has not been recorded/traced yet.
++   e.g.,
++     _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B];
++     _4 = (long unsigned int) _1;
++     _5 = _4 * 8;
++     _6 = p(D) + _5;  // get base
++     _7 = *_6;        // start tracing
++*/
++
++bool
++trace_ptr_mem_ref (data_ref &mem_ref, std::set<gimple *> &traced_ref_stmt,
++                   std::vector<data_ref> &unresolved_refs)
++{
++  /* Simple scenario:
++       _2208 = np.120_2207 * 8;
++       _1921 = sorted_weight$data_381 + _2208;
++       *_1921 = _2206;
++
++     Complex scenario:
++       MEM[base: _3235, index: ivtmp.2768_3189, step: 4, offset: 0B] = _105;
++       _3236 = (sizetype) _214;
++       _3237 = _3236 * 4;
++       _3238 = _857 + _3237;  // base + index * step
++       _3239 = _3238 + 4;     // offset
++       MEM[base: _3239, index: ivtmp.2768_3189, step: 4, offset: 0B] = 0.0;
++  */
++  tree pointer = TREE_OPERAND (mem_ref.ref, 0);
++  tree offset = TREE_OPERAND (mem_ref.ref, 1);
++  if (TREE_CODE (offset) != INTEGER_CST)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Unhandled scenario for non-constant offset.\n");
++
++      return false;
++    }
++  if (TREE_CODE (pointer) != SSA_NAME)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Unhandled scenario for non-ssa pointer.\n");
++
++      return false;
++    }
++
++  /* Tracing back the base address from SSA.
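++     In the simple scenario above, `_1921 = sorted_weight$data_381 + _2208'
++     yields base `sorted_weight$data_381' and index_offset `_2208'
++     (= np.120_2207 * 8).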
*/ ++ gimple *ptr_def_stmt = SSA_NAME_DEF_STMT (pointer); ++ if (ptr_def_stmt == NULL || gimple_code (ptr_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (ptr_def_stmt) != POINTER_PLUS_EXPR) ++ return false; ++ tree base = gimple_assign_rhs1 (ptr_def_stmt); ++ /* index_offset = index * step. */ ++ tree index_offset = gimple_assign_rhs2 (ptr_def_stmt); ++ ++ /* Tracing back index from SSA. */ ++ if (TREE_CODE (index_offset) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (TREE_CODE (index_offset) == INTEGER_CST) ++ fprintf (dump_file, "Constant index for memory access.\n"); ++ else ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ } ++ return false; ++ } ++ ++ gimple *idx_def_stmt = SSA_NAME_DEF_STMT (index_offset); ++ if (idx_def_stmt == NULL || gimple_code (idx_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (idx_def_stmt) != MULT_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ return false; ++ } ++ ++ /* Split array index from total offset of index, `index * step`. */ ++ mem_ref.base = base; ++ mem_ref.offset = offset; ++ mem_ref.index = gimple_assign_rhs1 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs2 (idx_def_stmt); ++ if (TREE_CODE (gimple_assign_rhs1 (idx_def_stmt)) == INTEGER_CST) ++ { ++ mem_ref.index = gimple_assign_rhs2 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs1 (idx_def_stmt); ++ } ++ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ } ++ else if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ else ++ { ++ /* Record indirect memory access with complex scenarios for future ++ analysis. */ ++ unresolved_refs.push_back (mem_ref); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled indirect memory access tracing.\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Tracing direct memory reference information. */ ++ ++bool ++trace_direct_mem_ref (data_ref &mem_ref) ++{ ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF: ++ address = base + index * step + offset. ++ MASK_LOAD example: ++ _43 = &MEM[base: _42, index: ivtmp_140, step: 8, offset: 0B]; ++ vect__42.11_160 = .MASK_LOAD (_43, 64B, loop_mask_163); ++ ++ In some cases (2D-array or complex-index 1D array), mem_ref's `base` ++ may actually represent `base + index * step` when `base` address updates ++ by a PHI operation, e.g., ++ MEM[base: _51, offset: 0B] ++ _51 = (void *) ivtmp.18_11; ++ ivtmp.18_11 = PHI ++ ivtmp.18_43 = ivtmp.18_11 + 16; ++ ivtmp.18_52 = (unsigned long) _10; ++ _10 = arr2D_29(D) + _9; ++ */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ ++ return true; ++} ++ ++/* Tracing vectorized indirect memory reference information. 
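++   The first three call arguments give the base address, the vector of
++   indices and the scale; they are recorded as mem_ref.base/index/step
++   below.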
++ MASK_GATHER_LOAD example: ++ vect__45.13_146 = .MASK_LOAD (_41, 32B, loop_mask_153); ++ vect__46.14_145 = (vector([2,2]) long unsigned int) vect__45.13_146; ++ vect_patt_163.15_143 = .MASK_GATHER_LOAD (_144, vect__46.14_145, 8, ++ { 0.0, ... }, loop_mask_153); */ ++ ++bool ++trace_indirect_mem_ref_vectorized (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* Processing of vectorization types. */ ++ if (mem_ref.vectorize_p) ++ { ++ tree op = gimple_call_arg (mem_ref.stmt, 1); ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ { ++ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.index = gimple_call_arg (mem_ref.stmt, 1); ++ mem_ref.step = gimple_call_arg (mem_ref.stmt, 2); ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Trace the array of the indirect memory access: ++ 1) Obtain the base address of the indirect memory access. ++ 2) Ensure that the index has been traced in the direct memory access. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (data_ref &mem_ref, std::set &traced_ref_stmt) ++{ ++ tree base = TREE_OPERAND (mem_ref.ref, 0); ++ tree index = TREE_OPERAND (mem_ref.ref, 1); ++ if (trace_indirect_operand (index, traced_ref_stmt)) ++ { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. */ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Trace memory references base info: ++ 1) Memory access rule analysis and reference info tracing ++ 2) Source variable tracing, along base address of memory reference ++ We will extend parallel analysis later. ++*/ ++ ++void ++trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ enum tree_code ref_code = TREE_CODE (mem_ref.ref); ++ /* 1) Direct and indirect access traces. */ ++ switch (ref_code) ++ { ++ case MEM_REF: ++ /* Non-vectorized direct/indirect access by pointer. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "MEM_REF\n"); ++ if (!trace_ptr_mem_ref (mem_ref, traced_ref_stmt, unresolved_refs)) ++ return; ++ break; ++ case TARGET_MEM_REF: ++ /* Vectorized and non-vectorized direct access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "TARGET_MEM_REF\n"); ++ if (!trace_direct_mem_ref (mem_ref)) ++ return; ++ break; ++ case SSA_NAME: ++ /* Vectorized indirect memory access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "SSA_NAME\n"); ++ if (!trace_indirect_mem_ref_vectorized (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ case ARRAY_REF: ++ /* Non-vectorized indirect memory access. 
*/ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ARRAY_REF\n"); ++ if (!trace_indirect_array (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ default: ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref is another tree-code: "); ++ fprintf (dump_file, "stmt: "); ++ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "ref: "); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (mem_ref.regular_p) ++ traced_ref_stmt.insert (mem_ref.stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Trace all references in the loop. */ ++ ++void ++trace_loop_refs_info (std::vector &refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (refs[i], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++void ++trace_data_refs_info (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ trace_loop_refs_info (loop_refs[loop], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Retrace references base info for complex scenarios in indirect memory access ++ after Phase 3. */ ++ ++void ++retrace_ref_info_unresolved (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* 1) Indirect access traces. */ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Retrace all unresolved references. 
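++   These are the complex indirect accesses (e.g., a[f(b[i])]) whose index
++   tracing was postponed until more direct accesses had been recorded.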
++ */
++
++void
++retrace_loop_refs_info_unresolved (std::vector<data_ref> &unresolved_refs,
++                                   std::set<gimple *> &traced_ref_stmt)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file,
++             "\nRetrace indirect memory access after outer loop analysis:\n");
++  for (unsigned i = 0; i < unresolved_refs.size (); ++i)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "trace_references_base_info %d:\n", i);
++          print_generic_expr (dump_file, unresolved_refs[i].ref, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      retrace_ref_info_unresolved (unresolved_refs[i], traced_ref_stmt);
++    }
++}
++
++/* ================ phase 3 analyze_nested_kernels ================ */
++
++/* Return the innermost type for arrays and pointers of TYPE.  */
++
++tree
++inner_type (tree type)
++{
++  while (POINTER_TYPE_P (type)
++         || TREE_CODE (type) == ARRAY_TYPE)
++    type = TREE_TYPE (type);
++  return type;
++}
++
++/* Check whether the input IV is the loop dimension boundary.  */
++
++bool
++loop_bound_iv_p (tree t, tree &outer_loop_t)
++{
++  if (t == NULL || TREE_CODE (t) != SSA_NAME
++      || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE)
++    return false;
++
++  gimple *def_stmt = SSA_NAME_DEF_STMT (t);
++
++  /* NOP_EXPR conversion between the PHI node and the memory reference due
++     to a MACRO:
++       n_898 = PHI <...>
++       _757 = (sizetype) n_898;
++       _900 = MEM[base: _726, index: _757, step: 8, offset: 0B];
++  */
++  while (gimple_code (def_stmt) == GIMPLE_ASSIGN
++         && gimple_assign_rhs_code (def_stmt) == NOP_EXPR)
++    def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt));
++
++  if (gimple_code (def_stmt) != GIMPLE_PHI)
++    return false;
++
++  /* Filter scenarios with only two phi inputs.  */
++  if (gimple_phi_num_args (def_stmt) != 2)
++    return false;
++
++  gphi *phi_stmt = as_a <gphi *> (def_stmt);
++  basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src;
++  basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src;
++
++  class loop *loop = loop_containing_stmt (def_stmt);
++  bool res = false;
++  /* Two phi inputs, one from the current loop and one from the outer
++     loop.  */
++  if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop)))
++    {
++      outer_loop_t = gimple_phi_arg_def (def_stmt, 1);
++      res = true;
++    }
++  else if ((src1->loop_father == loop)
++           && (src0->loop_father == loop_outer (loop)))
++    {
++      outer_loop_t = gimple_phi_arg_def (def_stmt, 0);
++      res = true;
++    }
++
++  if (res)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "===> ");
++          print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
++        }
++      return true;
++    }
++  return false;
++}
++
++/* Add a node to the worklist and the walked set.  */
++
++void
++add_worklist_walked (std::vector<tree> &worklist, std::set<tree> &walked,
++                     tree node)
++{
++  if (!walked.count (node))
++    {
++      worklist.push_back (node);
++      /* Avoid introducing phi node cycles, which would keep the worklist
++         from ever draining.  */
++      walked.insert (node);
++    }
++}
++
++/* Check the bound IV and add trace candidates to the worklist.
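++   If T is defined by a two-input PHI that qualifies as a loop-bound IV
++   (see loop_bound_iv_p), record it as a new dimension in
++   MEM_REF.loop_bounds and continue tracing from the PHI argument that
++   comes from the outer loop.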
*/ ++ ++void ++check_bound_iv_and_add_worklist (std::vector &worklist, ++ std::set &walked, ++ std::set &walked_loop, ++ tree t, data_ref &mem_ref) ++{ ++ if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) ++ return; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, t, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ tree out_loop_t = NULL_TREE; ++ if (loop_bound_iv_p (t, out_loop_t)) ++ { ++ basic_block bb = gimple_bb (def_stmt); ++ if (!walked_loop.count (bb)) ++ { ++ mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); ++ walked_loop.insert (bb); ++ } ++ add_worklist_walked (worklist, walked, out_loop_t); ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ ++ /* unary. */ ++ if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ else if (rhs_code == POINTER_PLUS_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ ++ /* binary. */ ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR ++ || rhs_code == MULT_EXPR) ++ { ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ } ++ } ++} ++ ++/* DFS trace the loop bound of iv. */ ++ ++bool ++trace_loop_bound_iv (data_ref &mem_ref) ++{ ++ /* In indirect memory access, the size cannot be determined based on the ++ loop boundary. However, we can take advantage of loop bound as an upper ++ bound (unrepeated memory access) to predict the variable footprint ++ involved in the specific loop dimension. */ ++ ++ /* Determine and record the boundary iv of the current index, ++ but do not trace it. */ ++ tree outer_loop_t = NULL_TREE; ++ /* indirect access example, mem_ref.index = _64 ++ _62 = MEM[symbol: uPtr, index: ivtmp.22_96, step: 4, offset: 0B]; ++ _63 = (long unsigned int) _62; ++ _64 = _63 * 8; ++ _65 = [openfoam_smooth.c:28:28] &bPrimePtr + _64; ++ _66 = *_65; */ ++ if (loop_bound_iv_p (mem_ref.index, outer_loop_t) || !mem_ref.regular_p) ++ { ++ mem_ref.loop_bounds.push_back ( ++ loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); ++ if (!mem_ref.regular_p) ++ return false; ++ } ++ ++ std::vector worklist; ++ worklist.push_back (mem_ref.base); ++ std::set walked; ++ std::set walked_loop; ++ ++ while (worklist.size ()) ++ { ++ tree t = worklist.back (); ++ worklist.pop_back (); ++ ++ /* add worklist. */ ++ check_bound_iv_and_add_worklist (worklist, walked, walked_loop, t, mem_ref); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nmem_ref access dimension: %ld\n", ++ mem_ref.loop_bounds.size ()); ++ fprintf (dump_file, "Traced variables: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ ++ return mem_ref.loop_bounds.size () > 0; ++} ++ ++/* dump loop bound. 
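++   Output has the form
++     loop_bound: loop_N (header = H, latch = L, lb_niters = <expr>).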
++ */
++
++void
++loop_bound_dump (FILE *file, loop_bound &lb)
++{
++  class loop *loop = lb.loop;
++  fprintf (file, "loop_bound: loop_%d (", loop->num);
++  if (loop->header)
++    fprintf (file, "header = %d", loop->header->index);
++  else
++    {
++      fprintf (file, "deleted)\n");
++      return;
++    }
++  if (loop->latch)
++    fprintf (file, ", latch = %d", loop->latch->index);
++  fprintf (file, ", lb_niters = ");
++  print_generic_expr (file, lb.niters);
++  fprintf (file, ")\n\n");
++}
++
++/* Statically calculate the data size.  */
++
++void
++static_calculate_data_size (data_ref &mem_ref)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nstatic_calculate_data_size\n");
++
++  tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
++  unsigned HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0;
++  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
++    {
++      unsigned HOST_WIDE_INT est_niter = tree_to_uhwi
++        (mem_ref.loop_bounds[i].niters);
++      unsigned int unroll = mem_ref.loop_bounds[i].unroll;
++      if (i == 0)
++        {
++          /* The unit conversion factor between bytes, kilobytes, and
++             megabytes is 1024.  */
++          mem_ref.data_size = double (type_size
++                                      * est_niter * unroll) / 1024 / 1024;
++        }
++      else
++        mem_ref.data_size *= est_niter * unroll;
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size);
++    }
++}
++
++/* Recursive tracing and creation of dominant nodes.  */
++
++tree
++trace_and_create_dominate_expr (tree expr, class loop *outermost)
++{
++  if (expr == NULL_TREE || is_gimple_constant (expr))
++    return expr;
++
++  if (TREE_CODE (expr) != SSA_NAME)
++    return NULL_TREE;
++
++  if (SSA_NAME_IS_DEFAULT_DEF (expr))
++    return expr;
++
++  gimple *stmt = SSA_NAME_DEF_STMT (expr);
++  basic_block def_bb = gimple_bb (stmt);
++  if (def_bb == NULL || def_bb->loop_father == NULL)
++    return NULL_TREE;
++
++  if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb))
++    return expr;
++
++  if (gimple_code (stmt) != GIMPLE_ASSIGN)
++    return NULL_TREE;
++
++  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
++  tree_code_class code_class = TREE_CODE_CLASS (rhs_code);
++  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
++  tree rhs1 = trace_and_create_dominate_expr (gimple_assign_rhs1 (stmt),
++                                              outermost);
++  if (rhs1 == NULL_TREE)
++    return NULL_TREE;
++
++  if (code_class == tcc_unary)
++    {
++      tree expr_new = build1 (rhs_code, type, rhs1);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "expr_new = ");
++          print_generic_expr (dump_file, expr_new, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr_new;
++    }
++  else if (code_class == tcc_binary)
++    {
++      tree rhs2 = trace_and_create_dominate_expr (gimple_assign_rhs2 (stmt),
++                                                  outermost);
++      if (rhs2 == NULL_TREE)
++        return NULL_TREE;
++
++      tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "expr_new = ");
++          print_generic_expr (dump_file, expr_new, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr_new;
++    }
++
++  return NULL_TREE;
++}
++
++/* Recursive parsing and creation of nodes in expr expressions.  */
++
++tree
++parse_and_create_expr (tree expr, class loop *outermost)
++{
++  if (expr == NULL_TREE || expr == chrec_dont_know
++      || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR)
++    {
++      /* tcc_expression (e.g., &q) situation combined with tcc_unary.
++       */
++      if (expr != NULL_TREE && TREE_CODE (expr) == ADDR_EXPR && dump_file
++          && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "tcc_expression case in ADDR_EXPR: ");
++          print_generic_expr (dump_file, expr, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr;
++    }
++
++  if (TREE_CODE (expr) == SSA_NAME)
++    return trace_and_create_dominate_expr (expr, outermost);
++  else if (EXPR_P (expr))
++    {
++      enum tree_code tree_code = TREE_CODE (expr);
++      tree_code_class code_class = TREE_CODE_CLASS (tree_code);
++      tree type = TREE_TYPE (expr);
++      tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost);
++      if (op1 == NULL_TREE)
++        return NULL_TREE;
++
++      if (code_class == tcc_unary)
++        {
++          tree expr_new = build1 (tree_code, type, op1);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "expr_new = ");
++              print_generic_expr (dump_file, expr_new, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          return expr_new;
++        }
++      else if (code_class == tcc_binary)
++        {
++          tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1),
++                                            outermost);
++          if (op2 == NULL_TREE)
++            return NULL_TREE;
++
++          tree expr_new = fold_build2 (tree_code, type, op1, op2);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "expr_new = ");
++              print_generic_expr (dump_file, expr_new, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          return expr_new;
++        }
++    }
++  return NULL_TREE;
++}
++
++/* Trace and create dominant loop bounds.  */
++
++void
++trace_and_create_dominate_loop_bounds (data_ref &mem_ref)
++{
++  /* Check whether the niters expression is dominated by the loop.  If not,
++     trace it and determine whether the result is dominant; if so, create
++     the expr of the dominant node.  */
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n");
++
++  /* Determine the relationship between the boundary of the innermost loop
++     and the dominant node of the outer loop, and process it.  */
++  loop_bound &outermost = mem_ref.loop_bounds.back ();
++  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
++    {
++      loop_bound &current = mem_ref.loop_bounds[i];
++      tree &niters = current.niters;
++      if (TREE_CODE (niters) == COND_EXPR)
++        niters = TREE_OPERAND (niters, 1);
++
++      niters = parse_and_create_expr (niters, outermost.loop);
++
++      if (niters == NULL_TREE)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
++              fprintf (dump_file,
++                       "Tracing loop bound failed at dimension %d\n", i);
++            }
++          mem_ref.calc_by = UNHANDLE_CALC;
++          break;
++        }
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        loop_bound_dump (dump_file, mem_ref.loop_bounds[i]);
++    }
++}
++
++/* Trace the dimension and the corresponding loop bounds of mem_ref.
++   This function supplements the information in mem_ref.loop_bounds.  */
++
++void
++trace_ref_dimension_and_loop_bounds (data_ref &mem_ref)
++{
++  /* In the same loop, some memory accesses have different dimensions;
++     variables with fewer dimensions are removed.  The loop filtering
++     conditions and the memory access nodes were recorded and traced
++     earlier; a failed tracing result is also handled here.  */
++  if (dump_file)
++    fprintf (dump_file, "\ncalculate_data_size\n");
++
++  /* Trace the loop bound IV of the ref to determine the dimension.  */
++  /* Record data from the loop perspective to avoid repeated tracing.  */
++  if (!trace_loop_bound_iv (mem_ref))
++    return;
++
++  /* The traced mem_ref may have multiple dimensions, which correspond to
++     multiple loops.
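++     e.g., an access a[j][i] inside a j/i loop nest contributes one
++     loop_bounds entry per dimension, the innermost dimension first.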
*/ ++ /* And in the dimension-by-dimensional analysis, the computable way is ++ continuously reduced. */ ++ mem_ref.calc_by = STATIC_CALC; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ class loop *loop = mem_ref.loop_bounds[i].loop; ++ tree &niters = mem_ref.loop_bounds[i].niters; ++ ++ /* Set NULL_TREE to ensure that nb_iterations are retraced and ++ vec_nb_iterations are also extracted. */ ++ loop->nb_iterations = NULL_TREE; ++ niters = number_of_latch_executions (loop, false); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_dump (dump_file, loop); ++ ++ if (loop->unroll) ++ { ++ if (loop->unroll == USHRT_MAX && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX); ++ mem_ref.loop_bounds[i].unroll = loop->unroll; ++ } ++ ++ if ((niters == chrec_dont_know) && loop->vec_nb_iterations ++ && (loop->vec_nb_iterations != chrec_dont_know)) ++ niters = loop->vec_nb_iterations; ++ ++ if (niters == chrec_dont_know) ++ { ++ /* We derive est_loop_niters from function ++ `estimated_loop_iterations_int`. Usually only the innermost loop is ++ vectorized, so vec_nb_iterations can be 4 or 8 times as large as ++ `est_loop_niters` due to vectorization. However, function ++ `estimated_loop_iterations_int` only returns an integer instead of ++ a tree node expression, so it cannot substitute ++ function `number_of_latch_executions` in runtime computation. */ ++ HOST_WIDE_INT est_loop_niters = estimated_loop_iterations_int (loop); ++ if (est_loop_niters >= 0 && est_loop_niters < INT_MAX) ++ /* e.g., loop iterations from `estimated_loop_iterations_int`: (-1) ++ loop_144 (header = 519, latch = 625, niter = scev_not_known, ++ upper_bound = 1073741823, likely_upper_bound = 1073741823, ++ unroll = 1) */ ++ /* variable `niters` from `loop->vec_nb_iterations` ++ constant 34> */ ++ niters = build_int_cst (integer_type_node, (int) est_loop_niters); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ ++ if (niters == NULL_TREE || niters == chrec_dont_know) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, UNHANDLE_CALC); ++ else if (TREE_CODE (niters) != INTEGER_CST) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); ++ else ++ mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (mem_ref.calc_by == 2) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nSTATIC_CALC.\n"); ++ } ++ else if (mem_ref.calc_by == 1) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nRUNTIME_CALC.\n"); ++ } ++ else ++ fprintf (dump_file, "\nUNHANDLE_CALC.\n"); ++ } ++ } ++ ++ if (mem_ref.calc_by == RUNTIME_CALC) ++ trace_and_create_dominate_loop_bounds (mem_ref); ++ else if (mem_ref.calc_by == STATIC_CALC) ++ static_calculate_data_size (mem_ref); ++} ++ ++/* Get the loop's niters tree. ++ Return NULL_TREE if not found. */ ++ ++tree ++get_cur_loop_niters (std::map > &loop_refs, ++ class loop *loop) ++{ ++ if (loop_refs.count (loop) == 0) ++ return NULL_TREE; ++ std::vector bounds = loop_refs[loop][0].loop_bounds; ++ return bounds.size () ? bounds[0].niters : NULL_TREE; ++} ++ ++/* Trace the sources of the niters tree and return the ++ outermost depth of the loops containing them. ++ Return start_depth if not found. 
++ ++ example: ++ niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 ++ operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 ++ operand_num: 1, subtree:(int) i_end_417 ++ SSA_NAME of niters: i_end_417 ++ gimple of SSA: i_end_417 = PHI <...> ++ return gimple depth; ++*/ ++ ++unsigned ++trace_outer_loop_depth (tree niters, unsigned start_depth) ++{ ++ /* If niters does not exist or it is an INTEGER_CST, the loop bound is ++ already determined, so return start_depth. */ ++ if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) ++ return start_depth; ++ ++ gimple *def_stmt = NULL; ++ /* niters examples: i_start_452, fEnd_35, fEnd_100. */ ++ enum tree_code niter_code = TREE_CODE (niters); ++ if (niter_code == SSA_NAME) ++ { ++ /* Trace the SSA name that defines this niter. */ ++ def_stmt = SSA_NAME_DEF_STMT (niters); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ssa_name of niters: "); ++ print_generic_expr (dump_file, niters); ++ fprintf (dump_file, "\ngimple of ssa: \n"); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ /* Termination condition of the DFS: return the loop depth of the ++ defining bb. */ ++ if (gimple_code (def_stmt) == GIMPLE_PHI ++ || gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return start_depth; ++ unsigned ret_depth = loop_depth (def_bb->loop_father); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Stop tracing the outer loop depth, "); ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", ++ ret_depth, def_bb->index); ++ } ++ return ret_depth; ++ } ++ /* GIMPLE_ASSIGN: use DFS to trace the rhs of the assignment ++ statement. */ ++ else if (gimple_code (def_stmt) == GIMPLE_ASSIGN) ++ { ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == TARGET_MEM_REF) ++ /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, ++ offset: 0B] */ ++ return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); ++ else ++ { ++ /* M.218_658 = MIN_EXPR <_631, _657> */ ++ unsigned min_depth = start_depth; ++ unsigned operand_num = gimple_num_ops (def_stmt); ++ /* GIMPLE_ASSIGN: start from 1 because op[0] is the lhs. */ ++ for (unsigned i = 1; i < operand_num; i++) ++ { ++ tree subtree = dyn_cast <gassign *> (def_stmt)->op[i]; ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, ++ start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ } ++ else ++ { ++ /* Additional termination conditions: ++ 1) Niters is a MEM variable; ++ 2) Niters is a runtime value (smooth_uPtr); consider finding ++ its footprint in other mem_refs; ++ 3) Niters is a loop variable (i_start/i_end) whose bound in ++ the outer loop depends on the variable j_start/j_end. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The loop termination condition is " ++ "extended.\n"); ++ } ++ return start_depth; ++ } ++ } ++ /* The operand count can be obtained directly when the tree code is one ++ of the following.
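++ Each operand subtree is then traced recursively, and the minimum depth ++ found among all operands is returned.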
*/ ++ else if (niter_code == NOP_EXPR || niter_code == MEM_REF ++ || niter_code == ARRAY_REF || niter_code == COND_EXPR ++ || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR ++ || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) ++ { ++ /* operand_num is the operand in the niters statement. ++ example: In the following niter statement, operand_num = 3. ++ (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ ++ unsigned operand_num = TREE_OPERAND_LENGTH (niters); ++ unsigned min_depth = start_depth; ++ for (unsigned i = 0; i < operand_num; i++) ++ { ++ tree subtree = TREE_OPERAND (niters, i); ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "niters is another tree code: %s\n", ++ get_tree_code_name (niter_code)); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return start_depth; ++ } ++} ++ ++/* Traces the ref dimension information in each loop. */ ++ ++void ++analyze_loop_refs_dimension (std::vector &refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (refs[i].trace_status_p == false) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_reference_dimension %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (refs[i]); ++ } ++} ++ ++/* analyze nested kernels ++ 1) multidimension loop analyze ++ 2) extended outer loop analyze ++*/ ++ ++bool ++analyze_nested_kernels (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); ++ ++ /* `kernels` may be added in during outer loop extension phase, ++ thus using initial size to avoid repeatedly analyzing. */ ++ unsigned init_kernels_size = kernels.size (); ++ for (unsigned i = 0; i < init_kernels_size; ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ analyze_loop_refs_dimension (loop_refs[loop]); ++ ++ unsigned depth = loop_depth (loop); ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, loop), depth); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", ++ depth, outer_depth); ++ /* param_outer_loop_num: number of loops of the extended outer loop. ++ Outermost loop should not be extended when outer_depth = 0. ++ `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == 0 || outer_depth == depth ++ || depth > outer_depth + param_outer_loop_num) ++ continue; ++ ++ /* Extend outer loop. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nStart extending outer loop\n"); ++ /* Superloops of the loop, start from the loop closest to the ++ current loop in the outermost loop. */ ++ for (int j = 0; j < param_outer_loop_num && --depth; ++j) ++ { ++ class loop *outer_loop = (*loop->superloops)[depth]; ++ /* The outer loop may be added when analyzing previous inner loops, ++ i.e. 
the outer loop contains two or more inner loops. */ ++ if (loop_refs.count (outer_loop)) ++ continue; ++ /* phase1 ~ phase3 analysis on the extended outer loop. */ ++ analyze_loop_dense_memory (kernels, loop_refs, outer_loop); ++ if (loop_refs.count (outer_loop) == 0) ++ continue; ++ for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt, ++ unresolved_refs); ++ analyze_loop_refs_dimension (loop_refs[outer_loop]); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, outer_loop), depth); ++ /* `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == depth) ++ break; ++ else ++ /* The outer loop cannot find the current loop boundary, ++ Remove the record of outer_loop from the loop_refs. */ ++ loop_refs.erase (outer_loop); ++ } ++ } ++ return true; ++} ++ ++/* ================ phase 4 filter_and_sort_kernels ================ */ ++ ++/* Get the edge probability information of each basic block in the loop. */ ++ ++float ++get_edge_prob (edge e, float minimum) ++{ ++ float fvalue = 0; ++ ++ profile_probability probability = e->probability; ++ if (probability.initialized_p ()) ++ { ++ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); ++ if (fvalue < minimum && probability.to_reg_br_prob_base ()) ++ fvalue = minimum; ++ } ++ return fvalue; ++} ++ ++/* Get the next bb with a high branch probability. */ ++ ++basic_block ++next_high_probability_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ return NULL; ++ ++ /* Limit the minimum probability value. */ ++ const float MINNUM_PROB = 0.00001f; ++ float minimum = MINNUM_PROB; ++ ++ gimple *stmt = last_stmt (bb); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ float true_edge_prob = get_edge_prob (true_edge, minimum); ++ float false_edge_prob = get_edge_prob (false_edge, minimum); ++ /* If the content of the branch does not include the candidate ++ kernel, the branch probability may not be limited. */ ++ /* The edge_prob may have precision error during static prediction, ++ so we need to relax the limit before comparison. */ ++ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) ++ return true_edge->dest; ++ else if ((false_edge_prob ++ >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, false_edge->dest)) ++ return false_edge->dest; ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No high probability bb:"); ++ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", ++ bb->index, true_edge_prob, false_edge_prob); ++ } ++ return NULL; ++ } ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ return e->dest; ++ } ++ return NULL; ++} ++ ++ ++/* Dump loop header bb. 
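++ Each kernel is printed as loop_num(header_bb_index).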
*/ ++ ++void ++dump_loop_headers (const char *name, std::vector<class loop *> &loops) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } ++} ++ ++/* Combine and sort candidate loops. */ ++ ++bool ++filter_and_sort_kernels (std::vector<class loop *> &sorted_kernels, ++ std::vector<class loop *> &kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set<basic_block> end_bb; ++ std::list<basic_block> walked_header_bb; /* Used to record nested loops. */ ++ std::set<int> walked_non_header_bb_idx; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ if (kernels[i]->inner == NULL) ++ end_bb.insert (kernels[i]->header); ++ } ++ ++ dump_loop_headers ("kernels", kernels); ++ ++ if (!param_filter_kernels) ++ { ++ for (std::vector<class loop *>::iterator it = kernels.begin (); ++ it != kernels.end (); ++it) ++ sorted_kernels.push_back (*it); ++ } ++ else ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ while (bb) ++ { ++ if (bb == NULL) ++ return false; ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ /* bb is not the head of the loop, go to the next. */ ++ if (bb != bb->loop_father->header) ++ { ++ if (walked_non_header_bb_idx.count (bb->index)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find same-loop cycle. " "Abort filtering process.\n"); ++ return false; ++ } ++ walked_non_header_bb_idx.insert (bb->index); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ ++ /* bb is the head of the loop. */ ++ if (walked_header_bb.empty () || bb != walked_header_bb.back ()) ++ { ++ if (end_bb.count (bb)) ++ { ++ sorted_kernels.push_back (bb->loop_father); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ if (loop_outer (bb->loop_father) != NULL ++ && get_loop_exit_edges (bb->loop_father).length () != 1) ++ return false; ++ walked_header_bb.push_back (bb); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ else ++ { ++ walked_header_bb.pop_back (); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ } ++ } ++ ++ dump_loop_headers ("sorted_kernels", sorted_kernels); ++ return true; ++} ++ ++/* Check whether the given bb is null. */ ++ ++bool ++check_null_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unexpected error at null bb.\n"); ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the loop father of the given bb is null. */ ++ ++bool ++check_null_loop_father (basic_block bb) ++{ ++ if (check_null_bb (bb)) ++ return true; ++ ++ if (bb->loop_father == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "bb %d's loop father is null.\n", bb->index); ++ return true; ++ } ++ return false; ++} ++ ++/* States for bb during path traversal. */ ++ ++enum bb_traversal_state ++{ ++ NOT_TRAVERSED = 0, ++ UNDER_TRAVERSAL, ++ FULLY_TRAVERSED ++}; ++ ++/* Detect abnormal revisit for bb during path traversal where bb is ++ 1) fully traversed, ++ 2) non-loop-header bb but currently under traversal.
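++ Case 2) indicates a cycle that does not pass through a loop header; the ++ offending edge is recorded in UNUSED_EDGES so that the later path-weight ++ update can skip it.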
*/ ++ ++bool ++revisit_bb_abnormal_p (basic_block bb, std::vector &bb_visited, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* If the header bb has been already fully traversed, early exit ++ the function. */ ++ if (bb_visited[bb->index] == FULLY_TRAVERSED) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Already visited bb index %d. Abort.\n", ++ bb->index); ++ return true; ++ } ++ ++ /* If we revisit a non-header bb during next-bb traversal, we detect ++ an inner-loop cycle and dump warning info. Record this abnormal edge ++ in `unused_edges` for special treatment in path weight update. */ ++ if (!header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Warning: Find cycle at bb index %d. Abort.\n", ++ bb->index); ++ unused_edges.insert (std::make_pair (src_bb_idx, bb->index)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Check successor bb through edge e. Return true if successor bb is NULL or ++ out of loop. */ ++ ++bool ++check_succ_bb_abnormal_p (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb connected to src bb %d.\n", bb->index); ++ ++ return true; ++ } ++ ++ /* If bb is within one loop and the edge is pointing to the ++ outer loop, skip edge processing until a backedge to header ++ bb. `loop->num = 0` represents function body. */ ++ if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find edges to the outer loop at bb " ++ "index %d to bb index %d. Abort.\n", ++ bb->index, e->dest->index); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Criteria for retrieving the next bb in modified control-flow graph, which ++ creates a topological order for the bb traversal. */ ++ ++void ++get_next_toposort_bb (basic_block bb, std::vector &bb_visited, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* 1) Before bb returns to the loop header, bb will not go to the outer loop. ++ 2) After returning to the loop header, traverse all exit_bbs. ++ NEXT STEP: ++ 1) If goto jumps out of 2 loops, goto has to traverse smaller jumps first. ++ 2) If path length is the same => choose higher depth traversal path. */ ++ if (check_null_bb (bb) || check_null_loop_father (bb)) ++ return; ++ ++ /* Find last bb of function. */ ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ return; ++ ++ if (revisit_bb_abnormal_p (bb, bb_visited, header_bb_idx_set, unused_edges, ++ src_bb_idx)) ++ return; ++ ++ /* If we revisit the header bb of a loop, traverse all exit bbs. */ ++ if (header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ unsigned i; ++ edge e; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ ++ if (exits.length () > 1 && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Detect multiple exits at loop %d.\n", ++ bb->loop_father->num); ++ ++ FOR_EACH_VEC_ELT (exits, i, e) ++ { ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, src_bb_idx); ++ } ++ return; ++ } ++ ++ /* Post-order traversal for normal bb. 
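++ Successors are visited first, and bb is appended to BB_TOPO_ORDER only ++ after all of them are finished, so reading the list from the back yields ++ a topological order of the pruned graph.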
*/ ++ bb_visited[bb->index] = UNDER_TRAVERSAL; ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_succ_bb_abnormal_p (bb, e)) ++ continue; ++ ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, bb->index); ++ } ++ ++ /* bb is marked as fully traversed and all its descendents have been ++ fully traversed due to post-order traversal. */ ++ bb_visited[bb->index] = FULLY_TRAVERSED; ++ bb_topo_order.push_back (bb); ++} ++ ++/* A struct that represents the longest path weight at each bb. */ ++ ++struct weight ++{ ++ /* Longest path weight at current bb. */ ++ gcov_type bb_count; ++ ++ /* Prev bb from the current longest path. */ ++ int prev_bb_idx; ++}; ++ ++/* A helper function for checking whether overflow will occur when adding two ++ gcov_type weights. */ ++ ++bool ++check_weight_overflow (gcov_type a, gcov_type b) ++{ ++ if ((a > 0 && b > INT64_MAX - a) || (a < 0 && b < INT64_MIN - a)) ++ return true; ++ ++ return false; ++} ++ ++/* A helper function that update the weight of the current longest path to ++ bb_idx_dst and a new path pointing from bb_idx_src to bb_idx_dst. */ ++ ++void ++update_path_weight (std::vector &bb_weights, int bb_idx_src, ++ int bb_idx_dst, gcov_type weight_dst) ++{ ++ if (check_weight_overflow (bb_weights[bb_idx_src].bb_count, weight_dst) ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Path weight overflow at src bb %d " ++ "and dest bb %d.\n", ++ bb_idx_src, bb_idx_dst); ++ } ++ if (bb_weights[bb_idx_dst].bb_count ++ < bb_weights[bb_idx_src].bb_count + weight_dst) ++ { ++ bb_weights[bb_idx_dst].bb_count ++ = bb_weights[bb_idx_src].bb_count + weight_dst; ++ bb_weights[bb_idx_dst].prev_bb_idx = bb_idx_src; ++ } ++} ++ ++/* Check whether the required bb/loop info for path update is null. */ ++ ++bool ++check_null_info_in_path_update (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for edge connected " ++ "to src bb %d.\n", ++ bb->index); ++ return true; ++ } ++ ++ if (check_null_loop_father (bb) || check_null_loop_father (e->dest)) ++ return true; ++ ++ return false; ++} ++ ++/* Update path weight to loop exit bbs where the current source bb is connected ++ to header bb using a backedge. */ ++ ++void ++update_backedge_path_weight (std::vector &bb_weights, basic_block bb, ++ const std::set > &unused_edges) ++{ ++ unsigned i; ++ edge e_exit; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ FOR_EACH_VEC_ELT (exits, i, e_exit) ++ { ++ if (check_null_bb (e_exit->dest)) ++ { ++ if (e_exit->src != NULL && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for exiting edge " ++ "connected to src bb %d.\n", ++ e_exit->src->index); ++ continue; ++ } ++ ++ if (unused_edges.count (std::make_pair (bb->index, e_exit->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ update_path_weight (bb_weights, bb->index, e_exit->dest->index, ++ e_exit->dest->count.to_gcov_type ()); ++ } ++} ++ ++/* Update the longest length of the path through control flow graph. 
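++ This is the standard longest-path dynamic programming on a DAG: bbs are ++ processed in topological order, so bb_weights[src] is final before any ++ successor dst is relaxed via bb_count[dst] = MAX (bb_count[dst], ++ bb_count[src] + count (dst)).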
*/ ++ ++void ++update_max_length_of_path (std::vector &bb_weights, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ const std::set > &unused_edges) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start update weight traversal:\n"); ++ ++ while (!bb_topo_order.empty ()) ++ { ++ basic_block bb = bb_topo_order.back (); ++ bb_topo_order.pop_back (); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_null_info_in_path_update (bb, e)) ++ continue; ++ ++ if (unused_edges.count (std::make_pair (bb->index, e->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ else if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ /* Outer-loop edge case. */ ++ continue; ++ } ++ else if (header_bb_idx_set.count (e->dest->index) ++ && bb->loop_father == e->dest->loop_father) ++ { ++ /* Backedge case. */ ++ update_backedge_path_weight (bb_weights, bb, unused_edges); ++ } ++ else ++ { ++ /* Normal edge case. */ ++ update_path_weight (bb_weights, bb->index, e->dest->index, ++ e->dest->count.to_gcov_type ()); ++ } ++ } ++ } ++} ++ ++/* Collect all header bb of loops in the function beforehand. */ ++ ++void ++collect_header_bb_for_fn (std::set &header_bb_idx_set) ++{ ++ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) ++ header_bb_idx_set.insert (loop->header->index); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck header bbs:\n"); ++ for (std::set::iterator it = header_bb_idx_set.begin (); ++ it != header_bb_idx_set.end (); ++it) ++ fprintf (dump_file, "%d ", *it); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Record loop executing order and bb high-executing path. */ ++ ++void ++record_high_execution_path (std::vector &sorted_kernel, ++ std::vector &bb_path, int bb_num_max) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPATH FOR %s: ", get_name (cfun->decl)); ++ ++ std::set loop_set; ++ for (int i = bb_path.size() - 1; i >= 0; --i) ++ { ++ int bb_idx = bb_path[i]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb_idx); ++ gcc_assert (bb_idx < bb_num_max); ++ ++ class loop *loop = BASIC_BLOCK_FOR_FN (cfun, bb_idx)->loop_father; ++ if (!loop_set.count (loop->num)) ++ { ++ loop_set.insert (loop->num); ++ sorted_kernel.push_back (loop); ++ } ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n"); ++} ++ ++/* Combine and sort candidate loops using feedback information. */ ++ ++bool ++filter_and_sort_kernels_feedback (std::vector &sorted_kernel, ++ std::set &bb_pathset) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set header_bb_idx_set; ++ std::list bb_topo_order; ++ ++ /* Quoted from GCC internal, Chapter 15.1, "the index for any block should ++ never be greater than `last_basic_block`." Therefore, we use this ++ variable for retrieving the max bb index of a function. */ ++ /* Since the pass does not add/remove/merge basic blocks until Phase 6 ++ and previous passes will update ssa accordingly, we do not need to ++ `compact_blocks` to update bb indices currently. 
*/ ++ int bb_num_max = last_basic_block_for_fn (cfun) + 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nMaximal number of possible bbs in the " ++ "function: %d\n", ++ bb_num_max); ++ std::vector bb_visited = std::vector(bb_num_max, 0); ++ ++ collect_header_bb_for_fn (header_bb_idx_set); ++ basic_block bb_start = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ /* Step 1: Get topological order of bb during traversal. */ ++ std::set > unused_edges; ++ get_next_toposort_bb (bb_start, bb_visited, bb_topo_order, header_bb_idx_set, ++ unused_edges, -1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck bbs in topological order:\n"); ++ for (std::list::iterator it = bb_topo_order.begin (); ++ it != bb_topo_order.end (); ++it) ++ fprintf (dump_file, "%d ", (*it)->index); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Step 2: Update weights of nodes and path. */ ++ weight weight_init = {-1, -1}; ++ std::vector bb_weights = std::vector(bb_num_max, weight_init); ++ bb_weights[0].bb_count = 0; /* ENTRY bb has count 0 and prev bb as -1. */ ++ update_max_length_of_path (bb_weights, bb_topo_order, header_bb_idx_set, ++ unused_edges); ++ ++ /* Step 3: Backtrack a path from EXIT bb to ENTRY bb. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nCheck counts for each bb:\n"); ++ ++ std::vector bb_path; ++ int tmp_bb_idx = 1; ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ while (tmp_bb_idx > 0 && tmp_bb_idx < bb_num_max) ++ { ++ if (bb_pathset.count (tmp_bb_idx)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf(dump_file, "ERROR: already seen bb index %d\n", ++ tmp_bb_idx); ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d: %ld, ", tmp_bb_idx, ++ bb_weights[tmp_bb_idx].bb_count); ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ } ++ /* It is possible that the function exit code is wrapped around as an ++ variable, and thus, EXIT_BB in cfg is not connected to any bb. */ ++ if (tmp_bb_idx < 0 || tmp_bb_idx >= bb_num_max) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled scenario at backtracking highly " ++ "executed path with tmp_bb_idx %d", ++ tmp_bb_idx); ++ } ++ return false; ++ } ++ ++ record_high_execution_path (sorted_kernel, bb_path, bb_num_max); ++ ++ return true; ++} ++ ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; ++ ++/* Memory reference group, different reference of the same variable. */ ++ ++struct ref_group ++{ ++ /* source variables. */ ++ tree var; ++ ++ /* variable size, Unit: MB. */ ++ double var_size; ++ ++ /* first ref for insert hint. */ ++ data_ref first_use; ++ ++ /* first ref with the highest-order CALC. */ ++ data_ref first_calc_use; ++ ++ /* reuse scores of variables. */ ++ float reuse_level; ++ ++ /* method of calculating the var size. */ ++ calc_type calc_by; ++ ++ /* memory reference index for specific variable. */ ++ unsigned int mem_ref_index; ++ ++ /* variable dimension. 
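++ (the maximum loop_bounds.size () observed among the group's mem_refs).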
*/ ++ unsigned int dim; ++ ++ /* True if first_calc_use's footprint replaces that of first_use. */ ++ unsigned int transfer_ft; ++ ++ /* Accessing Reference Records in Different Modes (key_index): ++ 000: write, random, non-parallel ++ 001: write, random, parallel ++ 010: write, regular, non-parallel ++ 011: write, regular, parallel ++ 100: read, random, non-parallel ++ 101: read, random, parallel ++ 110: read, regular, non-parallel ++ 111: read, regular, parallel ++ */ ++ std::map > ref_use; ++ ++ /* scores for different memory references. */ ++ std::vector ref_scores; ++ ++ ref_group () ++ { ++ var = NULL_TREE; ++ var_size = 0; ++ reuse_level = 0; ++ calc_by = UNHANDLE_CALC; ++ mem_ref_index = 0; ++ dim = 1; ++ transfer_ft = 0; ++ } ++}; ++ ++/* Get the integer part for log(x) with the given base. */ ++ ++static unsigned int ++flog (float x, float base) ++{ ++ unsigned int res = 0; ++ while (x >= base) ++ { ++ ++res; ++ x /= base; ++ } ++ return res; ++} ++ ++/* Calculate reuse time for a memory reference in ref_group. */ ++ ++float ++calculate_reuse_times (std::vector &mem_refs, std::set &loop_set, ++ std::set &bb_set, unsigned int var_dim) ++{ ++ const float SAME_BB_REUSE_WEIGHT = 0.1; ++ const float SAME_LOOP_REUSE_WEIGHT = 0.5; ++ const float NORMAL_REUSE_WEIGHT = 1.; ++ ++ float reuse_time_sum = 0.; ++ for (std::vector::iterator it = mem_refs.begin (); ++ it != mem_refs.end (); ++it) ++ { ++ const data_ref &mem_ref = *it; ++ float reuse_time = 0.; ++ if (bb_set.count (mem_ref.bb_idx)) ++ { ++ /* If the two mem_ref belong to the same bb, the new reuse ++ weight will not exceed 0.1 divided by the mem_ref mode group ++ size. ++ NEXT STEP: The following equation may hold and cause commutative ++ property of read and write op not holding: ++ write + (reused) read != read + (reused) write. ++ However, it seems that write mem_ref is always before read mem_ref, ++ so the above comparison does not show up in calculation due to ++ intrinsic in-order property of tree map, but this condition is ++ quite fragile anyway. */ ++ reuse_time = SAME_BB_REUSE_WEIGHT / mem_refs.size (); ++ } ++ else ++ { ++ bb_set.insert (mem_ref.bb_idx); ++ if (loop_set.count (mem_ref.loop_idx)) ++ { ++ /* If the mem_ref belongs to a loop where any other mem_ref is in, ++ the new reuse weight will be 0.5. */ ++ reuse_time = SAME_LOOP_REUSE_WEIGHT; ++ } ++ else ++ { ++ /* If the mem_ref is reused but not in the same group with any ++ other mem_ref, the new reuse weight will be 1. */ ++ loop_set.insert (mem_ref.loop_idx); ++ reuse_time = NORMAL_REUSE_WEIGHT; ++ } ++ } ++ unsigned int used_dim = std::min (mem_ref.loop_depth, var_dim); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "used_dim : %u, loop_depth : %u\n", used_dim, ++ mem_ref.loop_depth); ++ unsigned int power = flog (std::max (0u, mem_ref.loop_depth - used_dim) ++ + 2, 2.); ++ reuse_time_sum += reuse_time * (used_dim * used_dim / 2.) * (power); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "(%f * (%u * %u / 2) * (%u) = %f\n", ++ reuse_time, used_dim, used_dim, power, ++ reuse_time * (used_dim * used_dim / 2.) * (power)); ++ } ++ return reuse_time_sum; ++} ++ ++/* Calculate reuse level. 
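++ For each access mode the contribution is parallel * reuse_times * ++ regular; the sum is then multiplied by WRITE_COST if any write is ++ present, and by WITHIN_CACHE_SIZE_COST if the footprint lies between the ++ L2 size and a quarter of the per-core LLC capacity. For illustration, ++ an extra reference in an already-seen loop but a new bb, with ++ used_dim = 2 and loop_depth = 2, adds 0.5 * (2 * 2 / 2) * flog (0 + 2, 2) ++ = 1.0 to its reuse time.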
*/ ++ ++float ++calculate_reuse_level (std::map > &var_use, ++ unsigned int var_dim, double var_size) ++{ ++ const float VAR_SIZE_CACHE_CAPACITY = 1 / 4.; ++ const int WITHIN_CACHE_SIZE_COST = 4; ++ const float BYTE_CONVERT_RATIO = 1024.; ++ ++ float level = 0.; ++ std::set loop_set; ++ std::set bb_set; ++ bool has_write_op = false; ++ for (std::map >::iterator it = var_use.begin (); ++ it != var_use.end (); ++it) ++ { ++ unsigned int parallel = 1; ++ unsigned int regular = 1; ++ ++ if ((*it).second[0].parallel_p) ++ parallel = PARALLEL_NUM; ++ if (!(*it).second[0].regular_p) ++ regular = INDIRECT_ACCESS_VALUE; ++ if (!(*it).second[0].read_p) ++ has_write_op = true; ++ ++ /* In serial reuse, we will later check whether they are in the ++ same cacheline. If yes, delete the reuse. For details, see the ++ reuse analysis of prefetching and eliminate redundancy. */ ++ float reuse_times = calculate_reuse_times ((*it).second, loop_set, ++ bb_set, var_dim); ++ float add = parallel * reuse_times * regular; ++ level += add; ++ if (add && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d : %d * %f * %d = %f\n", ++ (*it).first, parallel, reuse_times, regular, add); ++ } ++ ++ bool within_llc_size = var_size > param_l2_cache_size / BYTE_CONVERT_RATIO ++ && var_size < VAR_SIZE_CACHE_CAPACITY ++ * param_llc_capacity_per_core; ++ ++ float final_level = has_write_op ? (level * WRITE_COST) : level; ++ final_level = within_llc_size ? (final_level * WITHIN_CACHE_SIZE_COST) ++ : final_level; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "final level : %d * %f * %d = %f\n", ++ has_write_op ? WRITE_COST : 1, level, ++ within_llc_size ? WITHIN_CACHE_SIZE_COST : 1, final_level); ++ return final_level; ++} ++ ++/* Comparison of reference reuse level. */ ++ ++bool ++ref_group_reuse_cmp (const ref_group &a, const ref_group &b) ++{ ++ if (a.reuse_level != b.reuse_level) ++ return a.reuse_level > b.reuse_level; ++ else ++ return get_name (a.var) < get_name (b.var); ++} ++ ++/* Dump key information of reference group and memory access for llc hint. */ ++ ++void ++dump_key_info_for_llc_hint (std::vector &ref_groups) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nLLC hint info:\n"); ++ fprintf (dump_file, "rank\tvar\t(lineno, direct, vectorized, write)\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ data_ref &mem_ref = ref_groups[i].first_use; ++ fprintf (dump_file, "\t(%d, %u, %u, %u)", ++ expand_location (mem_ref.stmt->location).line, ++ mem_ref.regular_p, mem_ref.vectorize_p, 1 - mem_ref.read_p); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Sort reference groups. 
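++ Groups are ranked by descending reuse_level (ties broken by variable ++ name) so that issue_llc_hint can simply take the top param_issue_topn ++ entries.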
*/ ++ ++void ++sort_ref_groups (std::vector &ref_groups, ++ std::map &ref_groups_map) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); ++ ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use, ++ (*it).second.dim, ++ (*it).second.var_size); ++ ref_groups.push_back ((*it).second); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); ++ fprintf (dump_file, " : %f\n\n", (*it).second.reuse_level); ++ } ++ } ++ ++ std::sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nsorted ref_groups:\n"); ++ fprintf (dump_file, "rank\tvar\t(data_size, dim, num_of_mem_ref, " ++ "need_tmp_name): reuse_level_score\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0; ++ fprintf (dump_file, "\t(%lf, %u, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].dim, ref_groups[i].ref_scores.size (), ++ need_tmp_name); ++ fprintf (dump_file, " : %f\n", ref_groups[i].reuse_level); ++ } ++ fprintf (dump_file, "\n"); ++ ++ fprintf (dump_file, "first_use:\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ fprintf (dump_file, " : "); ++ if (!ref_groups[i].first_use.vectorize_p) ++ print_generic_expr (dump_file, ref_groups[i].first_use.ref, ++ TDF_SLIM); ++ else ++ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ dump_key_info_for_llc_hint (ref_groups); ++} ++ ++/* Attributes of variable data. */ ++ ++enum data_attribute ++{ ++ DA_PARALLEL = 0, ++ DA_REGULAR, ++ DA_READ ++}; ++ ++/* Record memory reference by use mode. ++ If the reference group is not found, create a group. */ ++ ++void ++record_mem_ref (std::map &ref_groups, data_ref &mem_ref) ++{ ++ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) ++ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); ++ ++ if (!ref_groups.count (mem_ref.var)) ++ { ++ ref_group ref_group; ++ ref_group.var = mem_ref.var; ++ ref_group.first_use = mem_ref; ++ ref_group.first_calc_use = mem_ref; ++ ref_groups[mem_ref.var] = ref_group; ++ } ++ ++ /* Ref_groups' calc_by reflects the highest order of calc_by that can be ++ achieved by all mem_ref of ref_groups. The first mem_ref that achieves ++ this order is defined to be `first_calc_use`. Later after sorting ++ mem_refs, calc_by will be replaced by the calc_by of `first_use`, and ++ even by the calc_by of `first_calc_use`. 
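++ For example, when the sorted first_use mem_ref stays UNHANDLE_CALC but ++ another mem_ref of the same variable is STATIC_CALC, enabling ++ param_transfer_footprint lets the group borrow the latter's statically ++ computed footprint.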
*/ ++ if (mem_ref.calc_by > ref_groups[mem_ref.var].calc_by) ++ { ++ ref_groups[mem_ref.var].calc_by = mem_ref.calc_by; ++ ref_groups[mem_ref.var].first_calc_use = mem_ref; ++ } ++ ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].dim = std::max (ref_groups[mem_ref.var].dim, ++ (unsigned int) mem_ref.loop_bounds.size ()); ++ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); ++ ++ ref_score ref_level = { mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ expand_location (mem_ref.stmt->location).line }; ++ ref_groups[mem_ref.var].ref_scores.push_back (ref_level); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "recorded in: "); ++ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); ++ fprintf (dump_file, ":%d:%ld\n", index, ++ ref_groups[mem_ref.var].ref_use[index].size () - 1); ++ ++ fprintf (dump_file, "base: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ ++ fprintf (dump_file, ", index: "); ++ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); ++ ++ fprintf (dump_file, ", step: "); ++ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.step)); ++ else ++ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); ++ ++ fprintf (dump_file, ", offset: "); ++ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.offset)); ++ else ++ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); ++ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); ++ ++ fprintf (dump_file, ", size: %lf", mem_ref.data_size); ++ fprintf (dump_file, "\n\n"); ++ } ++} ++ ++/* Rank data reference index level. */ ++ ++bool ++best_insert_cmp (const ref_score &a, const ref_score &b) ++{ ++ /* NEXT STEP: We can also calculate gap using static/feedback info inferred ++ from historical maximum bb count: ++ gap = hist_max_bb_ct / (alpha * max (a.bb_ct, b.bb_ct)) + 1. ++ Also, bb count needs to be smoothed and scaled as divisor can be 0. ++ history maximum bb count can be obtained in Phase 4. */ ++ const float gap = 1; ++ if (a.d_ref.loop_depth != b.d_ref.loop_depth) ++ return a.d_ref.loop_depth > b.d_ref.loop_depth; ++ else if (a.d_ref.regular_p != b.d_ref.regular_p) ++ return a.d_ref.regular_p > b.d_ref.regular_p; ++ else if (abs (double (std::max (a.bb_count, b.bb_count) + 1) ++ / double (std::min (a.bb_count, b.bb_count) + 1) - 1) > gap) ++ return a.bb_count > b.bb_count; ++ else if (a.line != b.line) ++ return a.line < b.line; ++ else if (a.d_ref.read_p != b.d_ref.read_p) ++ return a.d_ref.read_p < b.d_ref.read_p; ++ else ++ return a.d_ref.vectorize_p > b.d_ref.vectorize_p; ++} ++ ++/* Sort data reference index level within one reference group in non-decreasing ++ order of the customized sorting scheme. */ ++ ++void ++sort_mem_ref_in_ref_group (std::map &ref_groups_map) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nsorted data_references:\n"); ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ ref_group &curr_ref_group = (*it).second; ++ std::vector &ref_scores = curr_ref_group.ref_scores; ++ std::stable_sort (ref_scores.begin (), ref_scores.end (), ++ best_insert_cmp); ++ /* Update ref_group's first_use and calc_by with the first mem_ref after ++ sorting. 
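++ first_use is thus the insertion point preferred by best_insert_cmp: ++ deeper loops first, then regular accesses, hotter bbs, earlier source ++ lines, writes before reads, and vectorized references as the final ++ tie-breaker.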
*/ ++ curr_ref_group.first_use = curr_ref_group.ref_scores[0].d_ref; ++ curr_ref_group.calc_by = curr_ref_group.first_use.calc_by; ++ ++ /* When transferring footprint is enabled, it is allowed to transfer ++ the statically-calculated footprint of a mem_ref from the same ++ ref_group to `first_use` mem_ref. */ ++ if (param_transfer_footprint ++ && curr_ref_group.first_use.calc_by == UNHANDLE_CALC) ++ { ++ if (curr_ref_group.first_calc_use.calc_by > RUNTIME_CALC) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, "\nfirst_use: "); ++ print_gimple_stmt (dump_file, curr_ref_group.first_use.stmt, ++ 0, TDF_LINENO); ++ fprintf (dump_file, "first_calc_use: "); ++ print_gimple_stmt (dump_file, ++ curr_ref_group.first_calc_use.stmt, ++ 0, TDF_LINENO); ++ } ++ ++ curr_ref_group.calc_by = curr_ref_group.first_calc_use.calc_by; ++ curr_ref_group.transfer_ft = 1; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, ": cannot transfer footprint to " ++ "first use mem_ref.\n"); ++ } ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, " : %lu\n", ref_scores.size ()); ++ for (unsigned int i = 0; i < ref_scores.size (); ++i) ++ { ++ fprintf (dump_file, "mem_ref_index %u: ", i); ++ print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, ++ TDF_LINENO); ++ fprintf (dump_file, "bb-%d ", ++ ref_scores[i].d_ref.stmt->bb->index); ++ fprintf (dump_file, "count %ld\n", ref_scores[i].bb_count); ++ } ++ fprintf (dump_file, "\n\n"); ++ } ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++bool ++record_and_sort_ref_groups (std::vector &ref_groups, ++ std::vector &kernels, ++ std::map > &loop_refs, ++ std::set bb_pathset) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); ++ ++ std::map ref_groups_map; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ data_ref &mem_ref = loop_refs[loop][j]; ++ if (mem_ref.trace_status_p) ++ { ++ if (!param_filter_mode || (param_filter_mode ++ && bb_pathset.count (mem_ref.stmt->bb->index))) ++ record_mem_ref (ref_groups_map, mem_ref); ++ } ++ } ++ } ++ ++ /* Sort mem_ref within ref_group by local count and update first_use's ++ data_ref, stable sort. */ ++ sort_mem_ref_in_ref_group (ref_groups_map); ++ sort_ref_groups (ref_groups, ref_groups_map); ++ ++ return ref_groups.size () > 0; ++} ++ ++/* ================ phase 6 issue_llc_hint ================ */ ++ ++/* Issue vectorized mask prefetch gimple. 
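++ E.g., for vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3) this emits ++ .MASK_PREFETCH (_2 + dist, 32B, loop_mask_3, vect__1.1, prfop) right ++ after the statement, where dist is param_prefetch_offset elements.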
*/ ++ ++void ++issue_mask_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd.\n"); ++ ++ /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); ++ .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); ++ */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree scale = gimple_call_arg (stmt, 1); ++ tree final_mask = gimple_call_arg (stmt, 2); ++ tree target = NULL_TREE; ++ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) ++ target = gimple_call_arg (stmt, 3); ++ else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) ++ target = gimple_call_lhs (stmt); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) ++ /* for simulation, 4: PLDL3KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ else if (param_llc_level == 4) ++ /* 6: PLDL4KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ /* target: vector_type - XXX_type. */ ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, 5, addr, scale, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue vectorized mask gather prefetch gimple. */ ++ ++void ++issue_mask_gather_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); ++ ++ /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... }, ++ loop_mask_4); */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree vec_offset = gimple_call_arg (stmt, 1); ++ tree scale = gimple_call_arg (stmt, 2); ++ tree zero = gimple_call_arg (stmt, 3); ++ tree final_mask = gimple_call_arg (stmt, 4); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) // for simulation ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP ++ else if (param_llc_level == 4) ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ tree target = gimple_call_lhs (stmt); ++ /* add offset. 
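++ The prefetch address is advanced param_prefetch_offset elements ahead of ++ the gather's base pointer.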
*/ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_GATHER_PREFETCH, 7, addr, ++ vec_offset, scale, zero, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue builtin prefetch gimple. */ ++ ++void ++issue_builtin_prefetch (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert prfm.\n"); ++ /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ ++ gimple *stmt = mem_ref.stmt; ++ tree ref = mem_ref.ref; ++ ++ tree scale = mem_ref.step; ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (scale == NULL_TREE) ++ { ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). */ ++ scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ if (scale == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " ++ "variable. Stop builtin_prefetch.\n\n"); ++ return; ++ } ++ } ++ ++ tree addr = build_fold_addr_expr_with_type (ref, ptr_type_node); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), ++ true, NULL, true, GSI_SAME_STMT); ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset ++ * tree_to_uhwi (scale); ++ ++ addr = fold_build_pointer_plus_hwi (addr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ /* __builtin_prefetch (_68, 0, 1); ++ 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality ++ (high means strong locality) */ ++ gcall *call = NULL; ++ if (param_llc_level == 3) ++ { ++ /* for simulation. ++ BUILT_IN_PREFETCH (addr, rw, locality). */ ++ call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, addr, integer_zero_node, integer_one_node); ++ } ++ else if (param_llc_level == 4) ++ { ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ call = gimple_build_call ( ++ builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), ++ 3, addr, integer_zero_node, prfop); ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Static form insertion and issue instruction. We may check the ++ determination of the ARM SVE architecture before SVE hint insertion. 
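++ Vectorized first_use references are issued through the mask (gather) ++ prefetch IFNs above, while scalar references fall back to ++ __builtin_prefetch (LLC level 3) or __builtin_prefetch_full (level 4).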
*/ ++ ++void ++static_issue (std::vector &ref_groups, int num_issue_var) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue\n"); ++ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref mem_ref = ref_groups[i].first_use; ++ if (mem_ref.vectorize_p) ++ { ++ enum internal_fn ifn_code = gimple_call_internal_fn (mem_ref.stmt); ++ if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) ++ issue_mask_prefetch (mem_ref.stmt); ++ else if (ifn_code == IFN_MASK_GATHER_LOAD) ++ issue_mask_gather_prefetch (mem_ref.stmt); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "other vectorized internal function\n"); ++ } ++ else ++ issue_builtin_prefetch (mem_ref); ++ } ++} ++ ++/* Check whether all loop bounds (niters) used for calculating the footprints ++ of previously-executed ref_groups are defined in a dominated bb to the ++ currentbranch bb, where the conditional expression requires the loop bound ++ info. */ ++ ++bool ++check_def_use_chain (std::vector &ref_groups, ++ basic_block &branch_header_bb, ++ std::vector &ref_group_idx) ++{ ++ for (std::vector::iterator it = ref_group_idx.begin (); ++ it != ref_group_idx.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (niters); ++ basic_block def_bb = gimple_bb (def_stmt); ++ /* Check dominator relationship of def bb and branch bb. */ ++ /* Case 1: Check whether the def bb is the single predecessor block ++ of header bb. */ ++ if (single_pred_p (branch_header_bb)) ++ { ++ basic_block branch_bb_prev = single_pred (branch_header_bb); ++ if (branch_bb_prev->index == def_bb->index) ++ continue; ++ } ++ /* Case 2: Check whether the branch bb is dominated by the def ++ bb. */ ++ if (!dominated_by_p (CDI_DOMINATORS, branch_header_bb, def_bb)) ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Generate the stmts for calculating the size. Later we will consider nested ++ multi-branches scenarios and check more information of niters when it is ++ a COND_EXPR. */ ++ ++tree ++calc_stmts_gen (std::vector &ref_groups, ++ gimple_seq &cond_expr_stmt_list, ++ basic_block branch_header_bb, ++ std::vector &ref_group_idx_curr, ++ std::vector &ref_group_idx_prev, tree &cumul_size) ++{ ++ /* Check whether the bbs of def stmt for footprint loop bounds dominates ++ the bb of new runtime branching conditional. */ ++ if (!check_def_use_chain (ref_groups, branch_header_bb, ref_group_idx_prev)) ++ return NULL_TREE; ++ ++ /* Accumulated allocation size. */ ++ for (std::vector::iterator it = ref_group_idx_curr.begin (); ++ it != ref_group_idx_curr.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ tree var = mem_ref.var; ++ tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). 
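++ Fall back to a conservative 1-byte unit so that the footprint remains ++ computable.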
*/ ++ if (unit == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Cannot detect size unit " ++ "(use 1 byte) for variable %s: ", ++ get_name (var)); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ unit = size_one_node; ++ } ++ tree size = NULL_TREE; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ ++ /* COND_EXPR. */ ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ if (size == NULL_TREE) ++ { ++ size = niters; ++ } else { ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, ++ size); ++ } ++ } ++ unit = build1 (NOP_EXPR, TREE_TYPE (size), unit); ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (size), size, unit); ++ size = build1 (FLOAT_EXPR, double_type_node, size); ++ cumul_size = fold_build2 (PLUS_EXPR, double_type_node, cumul_size, ++ size); ++ ref_group_idx_prev.push_back (*it); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "cumul_size = "); ++ print_generic_expr (dump_file, cumul_size, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ /* Create a stmt list for size calculation. */ ++ tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); ++ div = build1 (NOP_EXPR, double_type_node, div); ++ tree total_size = fold_build2 (RDIV_EXPR, double_type_node, cumul_size, div); ++ ++ tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), ++ param_llc_capacity_per_core / 2); ++ threshold = build_real_from_int_cst (double_type_node, threshold); ++ tree cond_expr = fold_build2 (LE_EXPR, boolean_type_node, total_size, ++ threshold); ++ ++ /* Convert cond_expr to stmt list. */ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, is_gimple_condexpr, ++ NULL_TREE); ++ return cond_expr; ++} ++ ++/* Retrieve the least number of loops that cover all target mem_refs. ++ Try to merge loops that the mem_refs reside to a common superloop and ++ maintain a worklist which relates NEED-TO-COPY loops with the target mem ++ refs inside using the following criteria: ++ 1) If loop A is a superloop of loop B in the worklist, replace loop B with ++ loop A in the worklist, and attach all target mem_refs of loop B, ++ together with loop A's, to loop A. ++ 2) If loop B in the worklist is a superloop of loop A, attach loop A's ++ target mem_ref to loop B. ++ 3) If loop A is not a superloop/subloop of loop B in the worklist, replace ++ loop B with their lowest common superloop C in the worklist, and attach ++ all target mem_refs of loop A and loop B to loop C. ++ 4) If loop A and loop B's lowest common superloop is function body ++ (loop 0), stop merging and maintain loop independence. */ ++ ++void ++get_loop_worklist (std::vector &ref_groups, int num_issue_var, ++ std::map > &loop_worklist) ++{ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref &mem_ref = ref_groups[i].first_use; ++ class loop *loop_new = mem_ref.loop_bounds.front ().loop; ++ class loop *common_superloop = loop_new; ++ bool add_loop_worklist = false; ++ ++ /* Use greedy algorithm to merge loops to a common superloop that can ++ contain the current mem_refs. 
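++ E.g., starting from worklist {A: [0]} and a new mem_ref in sibling loop ++ B, criterion 3) replaces A with their lowest common superloop C, giving ++ {C: [0, 1]}; if the only common ancestor is loop 0, the loops stay ++ independent.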
*/ ++ std::map >::iterator it_tmp; ++ std::vector ref_group_idx_tmp; ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end ();) ++ { ++ class loop *loop_old = it->first; ++ common_superloop = find_common_loop (loop_new, loop_old); ++ if (common_superloop == NULL || common_superloop->num == 0) ++ { ++ /* Stop merging two loops if there is no common superloop for ++ them except function body (loop 0). */ ++ if (common_superloop != NULL ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref_group %d's loop %d has no common " ++ "superloop with existing loop %d\n", ++ i, loop_new->num, loop_old->num); ++ } ++ ++it; ++ continue; ++ } ++ ++ if (common_superloop->num == loop_old->num) ++ { ++ /* If loop_old is the superloop of loop_new, add current ++ ref_group index to loop's worklist. */ ++ loop_worklist[common_superloop].push_back (i); ++ ++it; ++ } ++ else ++ { ++ /* If loop_old is not a superloop of loop_new, replace ++ loop_old with the common superloop. */ ++ it_tmp = it; ++ ++it_tmp; ++ ref_group_idx_tmp = it->second; ++ loop_worklist.erase (it); ++ it = it_tmp; ++ add_loop_worklist = true; ++ } ++ } ++ ++ if (loop_worklist.empty () || add_loop_worklist) ++ { ++ /* Update the new common superloop in loop_worklist. */ ++ std::vector &ref_groups_tmp = loop_worklist[common_superloop]; ++ ref_groups_tmp.push_back (i); ++ for (std::vector::iterator it = ref_group_idx_tmp.begin (); ++ it != ref_group_idx_tmp.end (); ++it) ++ ref_groups_tmp.push_back (*it); ++ std::sort (ref_groups_tmp.begin (), ref_groups_tmp.end ()); ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "runtime loop list:\n"); ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end (); ++it) ++ { ++ fprintf (dump_file, "loop %d:", it->first->num); ++ for (std::vector::iterator idx_it = it->second.begin (); ++ idx_it != it->second.end (); ++idx_it) ++ { ++ fprintf (dump_file, " %d", *idx_it); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++/* Runtime form insertion and issue instruction. */ ++ ++void ++runtime_issue (std::vector &ref_groups, int num_issue_var, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "runtime issue\n"); ++ ++ /* It is possible that the loop father of some mem_ref's bb may contain the ++ loop fathers of the others. Therefore, we intend to only copy loops ++ without inclusion relationship. */ ++ std::map > loop_worklist; ++ get_loop_worklist (ref_groups, num_issue_var, loop_worklist); ++ bool get_first_ref_group = false; ++ std::vector ref_group_idx_prev; ++ ++ /* NEXT STEP: Multiple loop copies (possibly nested within one loop can cost ++ front-end bound due to branching within loop), we need to set up a ++ threshold such that we may compensate this time cost by space cost ++ in binary (copying outer loop). */ ++ tree cumul_size = build_real_from_int_cst (double_type_node, ++ integer_zero_node); ++ for (std::vector::iterator it = sorted_kernels.begin (); ++ it != sorted_kernels.end (); ++it) ++ { ++ /* Start runtime branching until finding the first ref_group's loop. ++ Skip any ref_groups if their `first_use` mem_refs are executed ++ before the mem_ref of the first ref_group. 
++
++/* Runtime form insertion and issue instruction.  */
++
++void
++runtime_issue (std::vector<ref_group> &ref_groups, int num_issue_var,
++               std::vector<class loop *> &sorted_kernels)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "runtime issue\n");
++
++  /* It is possible that the loop father of one mem_ref's bb contains the
++     loop fathers of the others.  Therefore, we intend to copy only loops
++     without an inclusion relationship.  */
++  std::map<class loop *, std::vector<int> > loop_worklist;
++  get_loop_worklist (ref_groups, num_issue_var, loop_worklist);
++  bool get_first_ref_group = false;
++  std::vector<int> ref_group_idx_prev;
++
++  /* NEXT STEP: multiple loop copies (possibly nested within one loop) can
++     become front-end bound due to branching within the loop; we need to
++     set up a threshold such that this time cost may be compensated by the
++     space cost in the binary (copying the outer loop).  */
++  tree cumul_size = build_real_from_int_cst (double_type_node,
++                                             integer_zero_node);
++  for (std::vector<class loop *>::iterator it = sorted_kernels.begin ();
++       it != sorted_kernels.end (); ++it)
++    {
++      /* Do not start runtime branching until the first ref_group's loop is
++         found.  Skip any ref_groups whose `first_use` mem_refs are executed
++         before the mem_ref of the first ref_group.  */
++      class loop *loop = *it;
++      if (!loop_worklist.count (loop)
++          || (!get_first_ref_group && loop_worklist[loop][0] != 0))
++        continue;
++
++      std::vector<int> ref_group_idx_curr = loop_worklist[loop];
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "copy loop num: %d\n", loop->num);
++        }
++      /* If the exit edge points to a bb with multiple predecessors, split
++         the exit edge and create a new bb, so that the exit edge points to
++         a bb with a single predecessor.  */
++      edge e = single_exit (loop);
++      if (e == NULL)
++        return;
++      if (!single_pred_p (e->dest))
++        {
++          split_loop_exit_edge (e, true);
++          if (dump_enabled_p ())
++            dump_printf (MSG_NOTE, "split exit edge\n");
++        }
++
++      /* After updating SSA, we are not sure whether the gimple_seq stmt list
++         is initialized and unchanged during iterations.  Therefore, we need to
++         recreate this stmt list for every loop copy.  */
++      gimple_seq cond_expr_stmt_list = NULL;
++      tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list,
++                                       loop->header, ref_group_idx_curr,
++                                       ref_group_idx_prev, cumul_size);
++      if (cond_expr == NULL_TREE)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "incalculable variables for conditional\n");
++          return;
++        }
++
++      /* Use the previous cond and generate a new branch and loop copy.  */
++      basic_block condition_bb = NULL;
++      profile_probability prob = profile_probability::likely ();
++      initialize_original_copy_tables ();
++      class loop *nloop = loop_version (loop, cond_expr, &condition_bb,
++                                        prob, prob.invert (), prob,
++                                        prob.invert (), true);
++      free_original_copy_tables ();
++
++      /* Insert the generated stmt list before cond_expr.  */
++      gimple_stmt_iterator cond_exp_gsi;
++      if (cond_expr_stmt_list)
++        {
++          /* `gsi_insert_seq_before` inserts `cond_expr_stmt_list` ahead of
++             `cond_expr`, the GIMPLE_COND at the end of `condition_bb`.  */
++          cond_exp_gsi = gsi_last_bb (condition_bb);
++          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++                                 GSI_SAME_STMT);
++        }
++    }
++
++  update_ssa (TODO_update_ssa);
++
++  /* Perform hint issue for branches that meet the conditions.  */
++  static_issue (ref_groups, num_issue_var);
++}
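Schematically, the transformation above turns each selected kernel into a runtime-dispatched pair of loops. The pseudo-C below is illustrative only: the actual condition is the gimple sequence produced by calc_stmts_gen, and the hints are attached afterwards by static_issue.

    /* Before:                 After loop_version in runtime_issue:
     *
     *   kernel_loop;          if (total_size_mb <= llc_capacity / 2)
     *                           kernel_loop;   // llc-hinted version
     *                         else
     *                           kernel_loop;   // plain copy
     */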
++
++/* Issue llc hints through prefetch instructions.  */
++
++void
++issue_llc_hint (std::vector<ref_group> &ref_groups,
++                std::vector<class loop *> &sorted_kernels)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "issue_llc_hint:\n");
++
++  /* 1) If the issue-topn and force-issue options are given, the top N vars
++        are forcibly allocated and no runtime branch is generated.
++     2) If the issue-topn option is given and the size of the top N vars is
++        statically known, the top N are statically allocated and no runtime
++        branch is generated.
++     3) If the issue-topn option is given and the size of the top N vars is
++        statically unknown but dynamically known, the top N are dynamically
++        allocated and runtime branches are generated (this also depends on
++        the screening of the innermost variable boundary type).
++     4) If even the dynamic runtime cannot know the size, e.g., for an
++        indirect access, the optimization is skipped.
++   */
++  int num_issue_var = std::min (param_issue_topn, (int) ref_groups.size ());
++  if (num_issue_var == 0)
++    return;
++
++  if (num_issue_var < param_issue_topn
++      && dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "WARNING: Only %d (less than param_issue_topn = %d) "
++               "ref_group(s) are found for llc hint.\n",
++               num_issue_var, param_issue_topn);
++    }
++  if (param_force_issue)
++    {
++      static_issue (ref_groups, num_issue_var);
++      return;
++    }
++  calc_type topn_calc_type = STATIC_CALC;
++  for (int i = 0; i < num_issue_var; ++i)
++    topn_calc_type = std::min (topn_calc_type, ref_groups[i].calc_by);
++
++  if (topn_calc_type == STATIC_CALC)
++    {
++      /* Before static issue, we still need to collect the data size of all
++         target variables and compare the sum with the LLC cache size.  */
++      double prefetch_data_size = 0.;
++      for (int i = 0; i < num_issue_var; ++i)
++        prefetch_data_size += ref_groups[i].var_size;
++
++      if (prefetch_data_size <= (double) param_llc_capacity_per_core
++                                * PREFETCH_CACHE_SIZE_RATIO)
++        static_issue (ref_groups, num_issue_var);
++      else
++        if (dump_file && (dump_flags & TDF_DETAILS))
++          fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache "
++                   "size: %lf > %lf.\n",
++                   prefetch_data_size,
++                   (double) param_llc_capacity_per_core
++                   * PREFETCH_CACHE_SIZE_RATIO);
++    }
++  else if (topn_calc_type == RUNTIME_CALC)
++    runtime_issue (ref_groups, num_issue_var, sorted_kernels);
++  else
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "unhandled issue scene\n");
++    }
++}
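The std::min reduction above treats calc_by as a "weakest link": the strategy chosen for the top-N groups is the one every group can still support. A standalone model follows; the numeric ordering of the enum values is an assumption (only RUNTIME_CALC and STATIC_CALC appear in the patch, UNHANDLED_CALC is invented for the sketch).

    /* Smaller calc_type value = less statically known about the size.  */
    enum calc_type { UNHANDLED_CALC = 0, RUNTIME_CALC = 1, STATIC_CALC = 2 };

    static calc_type
    weakest_calc_type (const calc_type *calc_by, int n)
    {
      calc_type t = STATIC_CALC;
      for (int i = 0; i < n; ++i)
        if (calc_by[i] < t)
          t = calc_by[i];
      return t;
    }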
++
++/* ==================== phase entry ==================== */
++
++/* The LLC intelligent allocation consists of 6 steps:
++   1) detect memory-access-dense kernels;
++   2) trace the data references in those kernels;
++   3) analyze nested kernels;
++   4) retrace the data references left unresolved;
++   5) filter and sort the kernels (statically or by feedback);
++   6) record and sort the ref_groups, then issue the llc hints.  */
++
++void
++llc_allocate (void)
++{
++  std::map<class loop *, std::vector<data_ref> > kernels_refs;
++  std::vector<class loop *> kernels;
++  if (!get_dense_memory_kernels (kernels, kernels_refs))
++    return;
++
++  std::set<gimple *> traced_ref_stmt;
++  std::vector<data_ref> unresolved_refs;
++  trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt,
++                        unresolved_refs);
++
++  if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt,
++                               unresolved_refs))
++    return;
++
++  retrace_loop_refs_info_unresolved (unresolved_refs, traced_ref_stmt);
++
++  std::vector<class loop *> sorted_kernels;
++  std::vector<ref_group> ref_groups;
++  if (param_filter_mode)
++    {
++      /* AutoFDO mode: include the ENTRY bb and EXIT bb indices.  */
++      std::set<int> bb_pathset;
++      bb_pathset.insert (0);
++      bb_pathset.insert (1);
++      if (!filter_and_sort_kernels_feedback (sorted_kernels, bb_pathset))
++        return;
++
++      if (!record_and_sort_ref_groups (ref_groups, kernels, kernels_refs,
++                                       bb_pathset))
++        return;
++    }
++  else
++    {
++      /* Static mode.  */
++      std::set<int> bb_pathset;
++      if (!filter_and_sort_kernels (sorted_kernels, kernels))
++        return;
++
++      if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs,
++                                       bb_pathset))
++        return;
++    }
++
++  issue_llc_hint (ref_groups, sorted_kernels);
++}
++
++/* Check whether the function is an operator overloading function.  */
++
++bool
++operator_func_p (function *fn)
++{
++  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
++
++  if (fn_name && strncmp (fn_name, "operator", 8) == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "operator_func: %s ", fn_name);
++
++      return true;
++    }
++  return false;
++}
++
++/* Check whether the function file location is known.  */
++
++bool
++func_location_p (function *fn)
++{
++  expanded_location fn_decl_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++  expanded_location fn_xloc
++    = expand_location (fn->function_start_locus);
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "fn->function_start_locus = %d \n",
++               fn->function_start_locus);
++      fprintf (dump_file, "fn_xloc.file = %s \n",
++               fn_xloc.file ? fn_xloc.file : "NULL");
++      fprintf (dump_file, "fn_decl_xloc.file = %s \n",
++               fn_decl_xloc.file ? fn_decl_xloc.file : "NULL");
++      fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n",
++               LOCATION_FILE (input_location) ? LOCATION_FILE (input_location)
++                                              : "NULL");
++    }
++  if (fn_decl_xloc.file == NULL)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Function location unknown, skip analysis \n");
++      return false;
++    }
++  /* Newly generated functions are filtered out, such as the constant
++     propagation clone func.constprop ().  */
++  if (LOCATION_FILE (input_location) != fn_decl_xloc.file)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Function location non-local, skip analysis \n");
++      return false;
++    }
++  return true;
++}
++
++/* Dump function information.  */
++
++void
++dump_function_info (function *fn)
++{
++  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "\nfn_name: %s\n", fn_name);
++      expanded_location cfun_xloc
++        = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++      if (cfun_xloc.line)
++        {
++          if (cfun_xloc.file)
++            fprintf (dump_file, "[%s:%d:%d]\n",
++                     cfun_xloc.file, cfun_xloc.line, cfun_xloc.column);
++        }
++      fprintf (dump_file, "\n");
++      flow_loops_dump (dump_file, NULL, 1);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Dump the parameters of the pass.  */
++
++void
++dump_param (void)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "LLC allocate parameters:\n");
++      fprintf (dump_file, "    block size: %d\n", param_l1_cache_line_size);
++      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
++               param_l1_cache_size * 1024 / param_l1_cache_line_size,
++               param_l1_cache_size);
++      fprintf (dump_file, "    L1 cache line size: %d\n",
++               param_l1_cache_line_size);
++      fprintf (dump_file, "    L2 cache size: %d kB\n", param_l2_cache_size);
++      fprintf (dump_file, "    min mem_access_ratio: %d \n",
++               param_mem_access_ratio);
++      fprintf (dump_file, "    min mem_access_num: %d \n",
++               param_mem_access_num);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Determine whether to analyze the function according to
++   the ordering of functions containing cycle counts.  */
++
++static bool
++should_analyze_func_p (void)
++{
++  gcov_type decl_uid = DECL_UID (current_function_decl);
++  gcov_type func_count = event_get_func_count (decl_uid, PMU_EVENT);
++  if (func_count == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "function uid %ld cannot find profile data "
++                              "and skip prefetch analysis\n",
++                   decl_uid);
++        }
++      return false;
++    }
++  if (func_count < event_get_topn_function_total_count_thres ())
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "function uid %ld total counts is %lu: "
++                              "counts %lu < perf's top %d threshold %lu, "
++                              "skip prefetch analysis\n",
++                   decl_uid, func_count, func_count,
++                   PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ());
++        }
++      return false;
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "function uid %ld total counts is %lu: "
++                          "counts %lu >= perf's top %d threshold %lu, "
++                          "continue prefetch analysis\n",
++               decl_uid, func_count, func_count,
++               PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ());
++    }
++  return true;
++}
++
++const pass_data pass_data_llc_allocate =
++{
++  GIMPLE_PASS,            /* type.  */
++  "llc_allocate",         /* name.  */
++  OPTGROUP_LOOP,          /* optinfo_flags.  */
++  TV_TREE_PREFETCH,       /* tv_id.  */
++  (PROP_cfg | PROP_ssa),  /* properties_required.  */
++  0,                      /* properties_provided.  */
++  0,                      /* properties_destroyed.  */
++  0,                      /* todo_flags_start.  */
++  0,                      /* todo_flags_finish.  */
++};
++
++class pass_llc_allocate : public gimple_opt_pass
++{
++public:
++  pass_llc_allocate (gcc::context *ctxt)
++    : gimple_opt_pass (pass_data_llc_allocate, ctxt)
++  {}
++
++  /* opt_pass methods.  */
++  virtual bool gate (function *)
++  {
++    return (optimize >= 2 && flag_llc_allocate > 0);
++  }
++  virtual unsigned int execute (function *);
++
++}; // class pass_llc_allocate
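Given the gate just above, the pass is active only at -O2 or higher together with the new flag. An illustrative invocation follows; the option spelling -fllc-allocate is inferred from flag_llc_allocate and this patch's common.opt changes, and the dump switch follows the usual -fdump-tree-<passname>-details convention for the pass name "llc_allocate":

    gcc -O2 -fllc-allocate -fdump-tree-llc_allocate-details test.c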
++
++unsigned int
++pass_llc_allocate::execute (function *fn)
++{
++  unsigned int ret = 0;
++
++  if (!targetm.have_prefetch ()
++      || targetm.vectorize.code_for_prefetch == NULL
++      || targetm.vectorize.prefetch_handleable_mode_p == NULL
++      || targetm.vectorize.code_for_gather_prefetch == NULL)
++    return 0;
++
++  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
++    {
++      tree type = build_function_type_list (void_type_node,
++                                            const_ptr_type_node, NULL_TREE);
++      tree decl = add_builtin_function ("__builtin_prefetch", type,
++                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
++                                        NULL, NULL_TREE);
++      DECL_IS_NOVOPS (decl) = true;
++      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
++    }
++  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_FULL))
++    {
++      tree type = build_function_type_list (void_type_node,
++                                            const_ptr_type_node, NULL_TREE);
++      tree decl = add_builtin_function ("__builtin_prefetch_full", type,
++                                        BUILT_IN_PREFETCH_FULL, BUILT_IN_NORMAL,
++                                        NULL, NULL_TREE);
++      DECL_IS_NOVOPS (decl) = true;
++      set_builtin_decl (BUILT_IN_PREFETCH_FULL, decl, false);
++    }
++
++  dump_param ();
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "llc_allocate: %s\n",
++             IDENTIFIER_POINTER (DECL_NAME (fn->decl)));
++
++  if (number_of_loops (fn) <= 1 || !func_location_p (fn)
++      || operator_func_p (fn))
++    return ret;
++
++  /* Filter only when combined with a PMU event.  If the
++     should_analyze_func_p analysis fails (for example, the function has
++     no PMU-event count), skip the function rather than falling back to
++     native allocation processing, so as to keep the LLC allocation
++     analysis accurate.  */
++  if (flag_additional_profile
++      && (!profile_exist (PMU_EVENT) || !should_analyze_func_p ()))
++    {
++      return 0;
++    }
++
++  dump_function_info (fn);
++
++  llc_allocate ();
++
++  return ret;
++}
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_llc_allocate (gcc::context *ctxt)
++{
++  return new pass_llc_allocate (ctxt);
++}
+diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
+index 0353ffd30..0492dc6fd 100644
+--- a/gcc/tree-ssa-loop-niter.cc
++++ b/gcc/tree-ssa-loop-niter.cc
+@@ -2489,6 +2489,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit)
+   return true;
+ }
+ 
++/* Returns whether the number of vectorized iterations for the loop can be
++   estimated from the given IR, and updates the corresponding loop attribute,
++   e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... });  */
++
++bool
++number_of_iterations_vect (class loop *loop, tree lhs, tree rhs)
++{
++  loop->vec_nb_iterations = chrec_dont_know;
++
++  if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME)
++      || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME))
++    return false;
++
++  tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs;
++  gimple *def_stmt = SSA_NAME_DEF_STMT (ssa);
++
++  if (gimple_code (def_stmt) != GIMPLE_CALL
++      || !gimple_call_internal_p (def_stmt))
++    return false;
++
++  internal_fn ifn = gimple_call_internal_fn (def_stmt);
++  if (ifn != IFN_WHILE_ULT)
++    return false;
++
++  gcall *call = dyn_cast <gcall *> (def_stmt);
++  tree niters = gimple_call_arg (call, 1);
++  loop->vec_nb_iterations = niters;
++
++  return true;
++}
++
+ /* Stores description of number of iterations of LOOP derived from
+    EXIT (an exit edge of the LOOP) in NITER.  Returns true if some useful
+    information could be derived (and fields of NITER have meaning described
+@@ -2559,6 +2590,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
+   op1 = gimple_cond_rhs (stmt);
+   type = TREE_TYPE (op0);
+ 
++  if (TREE_CODE (type) == VECTOR_TYPE)
++    number_of_iterations_vect (loop, op0, op1);
++
+   if (TREE_CODE (type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (type))
+     return false;
+@@ -2852,14 +2886,14 @@ bool
+ number_of_iterations_exit (class loop *loop, edge exit,
+                            class tree_niter_desc *niter,
+                            bool warn, bool every_iteration,
+-                           basic_block *body)
++                           basic_block *body, bool guarantee)
+ {
+   gcond *stmt;
+   if (!number_of_iterations_exit_assumptions (loop, exit, niter,
+                                               &stmt, every_iteration, body))
+     return false;
+ 
+-  if (integer_nonzerop (niter->assumptions))
++  if (integer_nonzerop (niter->assumptions) || guarantee == false)
+     return true;
+ 
+   if (warn && dump_enabled_p ())
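For orientation, the IR shape that number_of_iterations_vect matches in a fully-masked vector loop looks roughly like this (illustrative GIMPLE; the .WHILE_ULT line is taken from the comment above, the exit test is schematic):

    next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... });
    ...
    if (next_mask_114 != { 0, ... })   /* vector-typed exit condition */

Starting from the vector-typed exit condition, the function requires exactly one of the two operands to be an SSA name, follows its definition to the IFN_WHILE_ULT call, and records the call's second argument (the scalar niters) in loop->vec_nb_iterations.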
+diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
+index ceaf65e07..8f03458f7 100644
+--- a/gcc/tree-ssa-loop-niter.h
++++ b/gcc/tree-ssa-loop-niter.h
+@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body,
+ extern bool number_of_iterations_exit (class loop *, edge,
+                                        class tree_niter_desc *niter, bool,
+                                        bool every_iteration = true,
+-                                       basic_block * = NULL);
++                                       basic_block * = NULL,
++                                       bool guarantee = true);
+ extern bool number_of_iterations_exit_assumptions (class loop *, edge,
+                                                    class tree_niter_desc *,
+                                                    gcond **, bool = true,
+diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
+index 9d21e6d03..6e61f7140 100644
+--- a/gcc/tree-vect-loop-manip.cc
++++ b/gcc/tree-vect-loop-manip.cc
+@@ -3738,3 +3738,269 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
+ 
+   return nloop;
+ }
++
++class loop *
++vect_loop_versioning_2 (loop_vec_info loop_vinfo,
++                        gimple *loop_vectorized_call)
++{
++  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
++  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
++  basic_block condition_bb;
++  gphi_iterator gsi;
++  gimple_stmt_iterator cond_exp_gsi;
++  basic_block merge_bb;
++  basic_block new_exit_bb;
++  edge new_exit_e, e;
++  gphi *orig_phi, *new_phi;
++  tree cond_expr = NULL_TREE;
++  gimple_seq cond_expr_stmt_list = NULL;
++  tree arg;
++  profile_probability prob = profile_probability::likely ();
++  gimple_seq gimplify_stmt_list = NULL;
++  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
++  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
++  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
++  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
++  poly_uint64 versioning_threshold
++    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
++  tree version_simd_if_cond
++    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
++  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
++
++  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
++      && !ordered_p (th, versioning_threshold))
++    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
++                             build_int_cst (TREE_TYPE (scalar_loop_iters),
++                                            th - 1));
++  if (maybe_ne (versioning_threshold, 0U))
++    {
++      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
++                               build_int_cst (TREE_TYPE (scalar_loop_iters),
++                                              versioning_threshold - 1));
++      if (cond_expr)
++        cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
++                                 expr, cond_expr);
++      else
++        cond_expr = expr;
++    }
++
++  if (version_niter)
++    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
++
++  if (cond_expr)
++    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
++                                        &cond_expr_stmt_list,
++                                        is_gimple_condexpr, NULL_TREE);
++
++  if (version_align)
++    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
++                                       &cond_expr_stmt_list);
++
++  if (version_alias)
++    {
++      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
++      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
++      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
++    }
++
++  if (version_simd_if_cond)
++    {
++      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
++      if (flag_checking)
++        if (basic_block bb
++            = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
++          gcc_assert (bb != loop->header
++                      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
++                      && (scalar_loop == NULL
++                          || (bb != scalar_loop->header
++                              && dominated_by_p (CDI_DOMINATORS,
++                                                 scalar_loop->header, bb))));
++      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
++      tree c = fold_build2 (NE_EXPR, boolean_type_node,
++                            version_simd_if_cond, zero);
++      if (cond_expr)
++        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
++                                 c, cond_expr);
++      else
++        cond_expr = c;
++      if (dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "created versioning for simd if condition check.\n");
++    }
++
++  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
++                                      &gimplify_stmt_list,
++                                      is_gimple_condexpr, NULL_TREE);
++  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
++
++  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
++     invariant in.  */
++  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
++  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
++       !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      gimple *stmt = gsi_stmt (gsi);
++      update_stmt (stmt);
++      ssa_op_iter iter;
++      use_operand_p use_p;
++      basic_block def_bb;
++      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
++        if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
++            && flow_bb_inside_loop_p (outermost, def_bb))
++          outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
++    }
++
++  /* Search for the outermost loop we can version.  Avoid versioning of
++     non-perfect nests but allow if-conversion versioned loops inside.  */
++  class loop *loop_to_version = loop;
++  if (flow_loop_nested_p (outermost, loop))
++    {
++      if (dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "trying to apply versioning to outer loop %d\n",
++                         outermost->num);
++      if (outermost->num == 0)
++        outermost = superloop_at_depth (loop, 1);
++      /* And avoid applying versioning on non-perfect nests.  */
++      while (loop_to_version != outermost
++             && single_exit (loop_outer (loop_to_version))
++             && (!loop_outer (loop_to_version)->inner->next
++                 || vect_loop_vectorized_call (loop_to_version))
++             && (!loop_outer (loop_to_version)->inner->next
++                 || !loop_outer (loop_to_version)->inner->next->next))
++        loop_to_version = loop_outer (loop_to_version);
++    }
++
++  /* Apply versioning.  If there is already a scalar version created by
++     if-conversion re-use that.  Note we cannot re-use the copy of
++     an if-converted outer-loop when vectorizing the inner loop only.  */
++  gcond *cond;
++  if ((!loop_to_version->inner || loop == loop_to_version)
++      && loop_vectorized_call)
++    {
++      gcc_assert (scalar_loop);
++      condition_bb = gimple_bb (loop_vectorized_call);
++      cond = as_a <gcond *> (last_stmt (condition_bb));
++      gimple_cond_set_condition_from_tree (cond, cond_expr);
++      update_stmt (cond);
++
++      if (cond_expr_stmt_list)
++        {
++          cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
++          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++                                 GSI_SAME_STMT);
++        }
++
++      /* if-conversion uses profile_probability::always () for both paths,
++         reset the paths probabilities appropriately.  */
++      edge te, fe;
++      extract_true_false_edges_from_block (condition_bb, &te, &fe);
++      te->probability = prob;
++      fe->probability = prob.invert ();
++      /* We can scale loops counts immediately but have to postpone
++         scaling the scalar loop because we re-use it during peeling.  */
++      scale_loop_frequencies (loop_to_version, te->probability);
++      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
++
++      nloop = scalar_loop;
++      if (dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "reusing %sloop version created by if conversion\n",
++                         loop_to_version != loop ? "outer " : "");
++    }
++  else
++    {
++      if (loop_to_version != loop
++          && dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "applying loop versioning to outer loop %d\n",
++                         loop_to_version->num);
++
++      initialize_original_copy_tables ();
++      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
++                            prob, prob.invert (), prob, prob.invert (), true);
++      gcc_assert (nloop);
++      nloop = get_loop_copy (loop);
++
++      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
++         reap those otherwise; they also refer to the original
++         loops.  */
++      class loop *l = loop;
++      while (gimple *call = vect_loop_vectorized_call (l))
++        {
++          call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
++          fold_loop_internal_call (call, boolean_false_node);
++          l = loop_outer (l);
++        }
++      free_original_copy_tables ();
++
++      if (cond_expr_stmt_list)
++        {
++          cond_exp_gsi = gsi_last_bb (condition_bb);
++          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++                                 GSI_SAME_STMT);
++        }
++
++      /* Loop versioning violates an assumption we try to maintain during
++         vectorization - that the loop exit block has a single predecessor.
++         After versioning, the exit block of both loop versions is the same
++         basic block (i.e. it has two predecessors).  Just in order to simplify
++         following transformations in the vectorizer, we fix this situation
++         here by adding a new (empty) block on the exit-edge of the loop,
++         with the proper loop-exit phis to maintain loop-closed-form.
++         If loop versioning wasn't done from loop, but scalar_loop instead,
++         merge_bb will have already just a single successor.  */
++
++      merge_bb = single_exit (loop_to_version)->dest;
++      if (EDGE_COUNT (merge_bb->preds) >= 2)
++        {
++          gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
++          new_exit_bb = split_edge (single_exit (loop_to_version));
++          new_exit_e = single_exit (loop_to_version);
++          e = EDGE_SUCC (new_exit_bb, 0);
++
++          for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
++               gsi_next (&gsi))
++            {
++              tree new_res;
++              orig_phi = gsi.phi ();
++              new_res = copy_ssa_name (PHI_RESULT (orig_phi));
++              new_phi = create_phi_node (new_res, new_exit_bb);
++              arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
++              add_phi_arg (new_phi, arg, new_exit_e,
++                           gimple_phi_arg_location_from_edge (orig_phi, e));
++              adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
++            }
++        }
++
++      update_ssa (TODO_update_ssa);
++    }
++
++  if (version_niter)
++    {
++      /* The versioned loop could be infinite, we need to clear existing
++         niter information which is copied from the original loop.  */
++      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
++      vect_free_loop_info_assumptions (nloop);
++      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
++      loop_constraint_set (loop, LOOP_C_INFINITE);
++    }
++
++  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
++      && dump_enabled_p ())
++    {
++      if (version_alias)
++        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
++                         vect_location,
++                         "loop versioned for vectorization because of "
++                         "possible aliasing\n");
++      if (version_align)
++        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
++                         vect_location,
++                         "loop versioned for vectorization to enhance "
++                         "alignment\n");
++
++    }
++
++  return nloop;
++}
+diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
+index 7f7577951..023a83c38 100644
+--- a/gcc/tree-vect-loop.cc
++++ b/gcc/tree-vect-loop.cc
+@@ -9735,8 +9735,11 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
+ 
+   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+     {
+-      class loop *sloop
+-        = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
++      class loop *sloop;
++      if (!(optimize >= 2 && flag_llc_allocate > 0))
++        sloop = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
++      else
++        sloop = vect_loop_versioning_2 (loop_vinfo, loop_vectorized_call);
+       sloop->force_vectorize = false;
+       check_profitability = false;
+     }
+@@ -9989,7 +9992,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
+                              niters_vector_mult_vf, !niters_no_overflow);
+ 
+   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
+-  scale_profile_for_vect_loop (loop, assumed_vf);
++  if (!(optimize >= 2 && flag_llc_allocate > 0))
++    scale_profile_for_vect_loop (loop, assumed_vf);
+ 
+   /* True if the final iteration might not handle a full vector's
+      worth of scalar iterations.  */
+diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+index e13bc6c99..85018f250 100644
+--- a/gcc/tree-vectorizer.h
++++ b/gcc/tree-vectorizer.h
+@@ -2177,6 +2177,7 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
+ class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
+                                                     class loop *, edge);
+ class loop *vect_loop_versioning (loop_vec_info, gimple *);
++class loop *vect_loop_versioning_2 (loop_vec_info, gimple *);
+ extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
+                                     tree *, tree *, tree *, int, bool, bool,
+                                     tree *);
+-- 
+2.44.0.windows.1
+
diff --git a/0357-Enhancing-BOLT-Optimization-with-AI.patch b/0357-Enhancing-BOLT-Optimization-with-AI.patch
new file mode 100644
index 0000000000000000000000000000000000000000..64f2239301274bb00c931403aa62cbd22717ec6e
--- /dev/null
+++ b/0357-Enhancing-BOLT-Optimization-with-AI.patch
@@ -0,0 +1,72 @@
+From 3dd233c1a7b20de2182ae4e98909ddace6612a0a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E9=99=88=E9=B8=BF?=
+Date: Tue, 25 Feb 2025 16:32:39 +0800
+Subject: [PATCH 2/2] Enhancing BOLT Optimization with AI.
+ +--- + gcc/ipa-hardware-detection.cc | 2 +- + gcc/onnx.fdata | 2 +- + gcc/opts.cc | 13 ++++++++++++- + 3 files changed, 14 insertions(+), 3 deletions(-) + +diff --git a/gcc/ipa-hardware-detection.cc b/gcc/ipa-hardware-detection.cc +index 75b74aa03..6b36d685c 100644 +--- a/gcc/ipa-hardware-detection.cc ++++ b/gcc/ipa-hardware-detection.cc +@@ -89,7 +89,7 @@ create_part_bb (basic_block last_bb, tree part_base) + &gsi, PLUS_EXPR, unsigned_type_node, part_base, + build_int_cst (unsigned_type_node, 4294963967)); + gcond *cond = gimple_build_cond (LE_EXPR, part_cond, +- build_int_cst (unsigned_type_node, 2), ++ build_int_cst (unsigned_type_node, 128), + NULL_TREE, NULL_TREE); + gimple_set_location (cond, input_location); + gsi_insert_before (&gsi, cond, GSI_SAME_STMT); +diff --git a/gcc/onnx.fdata b/gcc/onnx.fdata +index 234b1a045..77f4d9b1d 100644 +--- a/gcc/onnx.fdata ++++ b/gcc/onnx.fdata +@@ -1 +1 @@ +-316365613139376535626535626234666331363163303835336362393535613530636234643633626364386566396132333232373733633230393865663664633761393137633266616431663436343236613231663865636236346133616662623761373633663830623231393063616534633032316538626436633731643237666333386462313164333630303936336137323863313634613031393931613164363237643262353162376133643935373036306336346161376563383862613138666663393538363731333639396239666362393336373737643238636639643761343231346131333463353261623633343633343866663966663365346231356532663139306164303361383836396333393339616236383439363661313661303665643535633961666563613431303466333534346564633533373862323031396339626536613030383761623236663432633564653130353935353135313736656235373632373739343662663034343334633035626465356237633439313164313338373637383365326138366162363234323765393736616438656463343339613031316630643031613465386464326334383565343838366435313137313166383433396531626137353932616538333330653164326438656166343339363262366264326632376564396434396333356565343733383164363264633937356663663338666530336166316634623264393031393536333863383165616536656238346462656337333638323338646535303638363933646565616264363966356566323465346538613762623864303766646338666264643466666537303263623162326539653435643130313061386235623631306630636163303536343164663364383738353266386330376562343962393037306133383363326138393238376435613332353933663235313030326664366166373632343532613130323237303265373433623362623162633661633363303235613236383166313465396162353938363931613765316565313864313038cd68834331701041d1d21041f17c20432483a94386647e4157c8e33b5f3d5d3ec5275e3ea689863c435a0f3a76acd63d5d9b803b24467c3baf847c3b67b89e3b852a313b2127853900000000d58ac23b200ab53a000000807d3119bc22f7a63a81549f3b93b5013baee4a33b62c1153b9ae08b3a6929a33b20038f399475983b430ab53a73fc0b3a2daa0ebad595953bc2f1e0bb33e9ccbbb978d83a5e77a53b41e4c93adf10a73bdf36643ad7fd983a61e8d93bc04a283a30c072382f942c3b5b3cc73a4392e43a422b093c79bc61b9a5309e3b00000000757baa3a03d8a93c3c31e33af526ebbb000000006431d43a1d0ae73aa450783b8c57afb9b8eae939ec8fab3b9581d83920d7a1ba0fc1af38b6aece3ab50bafbbd50db63a26aba33bcdeda33b00d9493ac22dac3cf8c4233bc2966e3bdf1bca3a8fb4d13af9b0983b2cbda73bdae2aa3bc93bae3b39e1ba380857953be8e7a73b49e9df3b20b0233b9fe3d43a0dbcaa3bd10cf0b978eea53b761ebe3b0a50a23b70bd47b79a7720bc6cd4ee3ae0d0f93a9c333ebb5098dfbbbf8fa53b445efebac7b9993b6182b93aef267c3a4aa09e3b46d9a83b9f95983a379e913c6516123a1b2ebd3aaf943c3a0b90803becba92bce68f673be723253c5d7f813ad779613800000080af3c65ba6999743900000080957a003d82f2fe39baab4d3b7f348c39b8d3323b3c1e253ace952dbbc9d364bc3aafaf373d0a633be8fdee3968b0fa39eb70a83a7cba4e3bdf2407bc40f50f3
d94f4c3b9a828573b3f2bc3b99a5763bcccb838bb24f011bae3400dbdc3074fba30a829bb3dde6e3ad7c2caba2b2aa7b8d479a7bbebe2603a7025583b00000000017414ba680386bc9b365e3aaacb03bc000000006afd90b9a64e263980eb223c80a48ebcca9703392310573b1fd419bbf7368abc17a2083a3ceafab95eb11cbcf29995b9a64264bc8bae403bc1dc6139631c88bc12e3373c07cf0c3cdc93a6b97edbc0b917754d3b5cdc143c61ef393b40a809baf3861dbbafce623be550513b828382bc359d513afa4a25ba31394c3bb013da3a9835553bf3d9553bec2b65bcee09bab9f6343e3c03a59f39fb11053a078e7cbc5bd006bcfe23363b08d12cbb3cfb533bb98a8fbadcb99139cbd1573b24725e3b01014fb6dcbc45ba6ee024bb318db1baf39ce9b952d625bc41afddb91d7dffbbc0ba163b0387b93b2594623b00000000f60cf9ba483c983b0000008015e6c6bcbd45983b77d62ebcfbb69f3b7b5752bcc334ab3b4f9806bc9d89063cc0675a3b807426bca81a9f3b7ef56f3b6a96a13a045937bcd4a2f33cb92173bc40af783b26ac40bc5fef6b3beba6fe3b8c7207bc5e25443bfd99a33be7e7403b4c2508bc0c87bb3bb95dcd3abe228b3bac03deb91a2ab03add753bbc000000002e04703be98f1fbccef2af3b17ebe93c0000000020e37a3b46ba913b1fd7003b1f3f133df85d423bacc843bc5fada7bbc8680d3d8423503b2afc6c3b4e43033dcfcc7c3bcece053cdbb44ebc4151823ba14426bc6e942c3b3bdc4d3a34967f3b7687783bd0cd3ebcfc75053ade324ebcd10c32bc9ff9fbbb0b7430bcf60e4abcd6b6e03b295db43b25c75d3b88334fbc8d95883ac9c73ebcddf941bc2b18083c43044c3b405414bd7617963b9910a03bd5e70c3d9356f23c3a2750bc472107bce47d47bc0125243b3c41953b0f6134bc8c403bbc8fb3873ba5e218bcae5d06bc2dfe103b758a493b43cef63cd7438d3c2bf1eb3b2d4a833cf13a43bc5d14c4bd000000002932a7bc3191e4bb00000080224e753dea87dfbb41e28a3cbeb44b3d731d8f3c1312d2bb54e44dbc232b84bc74f9d9bd033bcb3cdda410bbeeeb47bdd7e44e3b3c3e21bb435712bdb3e6413c82e770393f20a53cc6642dbc325484bc410c4e3dcb49823dc262bd3c204a563d032393bb0887753c0cad943d3946abbbcb77b3bc9151c6ba860dc0bd00000080e5880d3a2f960ebd1bba99bcce3910bd0000008037acde3be98a983bd60b7c3c66ee27bd2431aab98b2b95bded06813bc17429bdf5a9e9bc4ff297bafad924bdc14d53bc901784bcad96073cd34989bc84580fbd1e276b3ca48e513c189796bbe15f8cbb39fa473cce9c693cbdd0843a4f07443dbf40c03c38a1893c3790ab3cb48c58bcc5e9863b684448bcb5c32abc0726a6ba1def9ebb57ce273d772b84bc1925c63d2e26d8bc24460cbcb0f807bc8dd0a5bc9ba312bd6ed5393c32e1f43cf3c58bbc8a5334bc8e0c53bbf78cb13c7805793c8d5800bbc4a5c2bcfc2c85bba79d3c3df00f493db55cb73cce71c43dc030f03cc953823c79c1f13d614db73d0000000074e98f3ca415183c000000801104803d9afea83ddf9ff93d835f9bbce8d8623cd67e093c453d143d7d8c90bc1434e23d24580b3e00711d3c729b903d81a0253c82e9b53cba65123ca564a23d7a53003c2c82ec3de139f93c58f58ebce101813ba5782d3d4e198e3dbaa40fbb58e2bc3bbf92943c98421e3df32c0c3cbc235ebcc2fe443c2789033e00000080b94ca73be81815bd1758e53c5df7053b00000080f9f63f3cc7a9893cb846823c65d2143c9bb50e3cced60c3e92fb983b583593bbbbfe263e390bdf3b696887bbd13e823c207890bc1cf0c23cd688163dd14e16bdd3cb813c95a6593c70d7083cd6c6e43b6d4d9b3c9455683c876e1f3e599ff83c4b377f3c2afd953cbeedd43ccbdb163d2d78fd3bcc84363c5c7fa63c22fedf3c3318e83d0ecdba3d0ea690bc462e9a3d0b11013cf19f503da4f8813db249c0bba7300f3c2d6c223dd1d7663b56b4c43d56e5f93c4799e43b0702a73d4e15ae3de8040e3cdfad72bc0ab6593d1fb7c9bb6f90b43dbfcab83b4cd802bbbd3c993be1a91c3c8f677cbaa83420bb0000008084bf263b6336adba00000000373bd13b521cffba733ac83bee8c9bba1306f73bbf5471ba8651773bc863ac3a6ed119bb926fc43b9368e5ba34f319bba9c8ebbaa74acb3b39169ebc812d573b4764beba5815ea3b5211caba956ec23a9e107d3b64dbc4ba674ac73be88107bb5354493b688c5cbaaf4571bab3d6b3bae566603b11b0b0ba6bd1d03b000000005b6cc1ba0720833c5210c7ba85cd97bc000000003c1fc4bab35dbbba30b6fe3b389ea2bc97eb8eba37bae43b697a293b87969abc5c9e04bb83acc2ba5f8fadbcf872c5bab03daf3ad509fe3b2f81f9ba4317863cd808bb3b0177f03b02dabdbab2efbdba3b03d83bc09f223c6030ec3b01
37c13b29f5663bf195ce3bc10eff3b18cda93b35486cba7dd7c8ba0d51003c34dc93ba891be33bb785ea3bbb75a73ae04f2abba21d8f3b9065c3ba8892bbbac37d96bc6a0c9dbc596cef3bce5a063bd64cec3be62fb0baaa5cbbba1acbd03b5cdfe13b0f37e9ba48bb653cc513733bc352a0bacba0ffba469ababdf17dae3c939271bd7af718be283c113ed15fbb3d00000080920e90badfa4d63d00000080df70d63c18f5c03dbbb677bde981e83d5a78d2bd0985093e663cffbd9a12803d3fc0b33d65b388bd50a0dd3d011acc3d2df0203ed04095bd5c9a8abd7294323dd404b33dabc5bdbd8042a33d93cb6f3d3b81f7bd4e2e823dfdba273d83ea863d7d0f3cbeaff2133e565a3d3d0d66ca3d035bea398af8073ec3b79abd00000000d078ad3dbe475c3c9267013e1db874bd000000809e1db03d23f9cb3d902b14be16ea52bdc41ac33db52abbbdf07981bed9442ebd4d94a83de0f7a93d745d41bd4cebb33d01f57c3da16adbbd8ae0b63d2d6c763cc1f40ebe098cbebddc59b83d256bb13d359fa9bdf886e5bd0dfcbbbdd68d0dbd726807be83579fbd05c9dcbdd8e8983ccd620d3edcf69f3dc980e8bdcb6c923e1cdcb3bdcc6ec1bde600823d030d993d9b3a1d3d420bcd3d754dde3df8132cbdfabb85bdce89c3bdefd054beafc6c8bd931e7c3d30fed83dbef795bd7fb1b3bd6d9eb73de344993cceb603bebd216b3de45c803d70a1953e1dfb62bea91d2a3ed284113f544008bfc50498be00000000d460e13b0d05b2be0000000066f28ebe9adb9ebe4ee6323efd01c7be63c7ac3e888dfabe44e5e23ec6f631bed07890beb0fe4a3e6217babee93daabecf501abfd1b9573e7bd53a3f805fe2be10a48cbecac09a3e52c67bbef48823be9a9dd83eb74340be75deeabd3fbf42be9d1c3c3f74e10abf923b05beaec4a4be66229fbbb35ff6be8a146c3e00000080638d84be04cb14be4dcfe7be53ec253f00000000208c89be7039a8bef1170c3f70ac0d3fdcd59bbe68f4973e6b1f903f16a3eb3e6d0a86be361a81be4cba023f82c88cbe01132ebea3feb73e79f391beeb6826be5130063fc4d9953e4a6690becd328abe13b7803efa6fc23e9823993efea5b83d60e0f43eff276c3ed453ba3e374b63bdd41502bf17e36dbe5034c53ea2ac9dbfff04903e64879e3e4c9d35bede8b6cbe435ed2be4e73abbe3020bfbef8c6e83e8630343ff9de9e3e78d65c3f3659a63eee6f35bed729b4be3954623ef8778f3ef4758fbe75fd4ebed9e3ed3e770a23be6b403ebed7e596bd656a093d6e463cbddfa103be2213f73d8a4c973d0000000050dc0bbb9d64b03d00000080d5e9283d3e429c3d1d0b3ebd35e5c03d101cacbd51b0e83dd890d5bd1378483dcc6a903d431d54bda0cab63d6376a63df1ab0a3e1a586bbd6bcfdbbd5cfb8d3db01a903dced598bdc300823d43fc3a3db75ccebdfea14a3d8f27fe3cf98c533de7d924bef762fc3df248103dde32a53d8d067e3a17bee53d5b6573bd00000080285b8b3d5cd0283d3447d93dd249c3bd00000000317f8d3db47da63df06ffebd933ca7bd3da29e3d0f9496bd7f1c6dbe2feb8abdd761863d8d3f883dc33d9abdecee903dd3ed453dc93cb4bd785f933d08c8393d5eb3f4bdef249bbd4ffb943d98be8e3d7b5487bd365bbebd103397bd19bdd2bcbc53e4bd9eb77cbd7d6fb5bdcc15713c4f00f03d54fc7e3d1c98c0bd05dd853e0dea8fbd5a2d9cbd35aa4b3d879a713df19b783d214da73d821bb73dc52889bdb707d4bdfd3d9ebdb6e53cbe1ee9a2bdc740443d3594b23de3046bbd4acb8fbdbf54943d23de5e3dbd36ddbddae5363d5155473de961aa3ed3a432be5d8e583e1d9a173f318110bfbc47aabe00000000a572363c9755c8be00000000c5235abe14fbb1be3f59573e57f8dbbe0adcc13e40d705bf1517f43eb1d467be07b3a2be40216f3eb6d9cfbe84b1bbbeba571cbf5c30843e96700c3f700eb6beee52a3bedcd6ab3eb39b93be9fd159be7524eb3efdd668beee2a1abe9dd171be0acf3d3f227e12bfbb312dbe3966bbbee3f99bbcec2f01bf2790883e0000008047ae9dbe59a438be26a4f9bedaf7f93e000000004b62a0bea849bdbe00ac123fa905d63e3e96b2bed43da93eef558d3f0c73b23e65bf96be67359abe8cc9c53e372ba4be184265becf66cb3edd11a7bed6d249be506f0d3f7429af3e0ba1a8bed2aca1becacc973e154ad73e9df5a93e53dd003e4cbd023fedc88d3e01e5cc3eff3eb3bd194d0abfa27090beb4a9d93ed0bf9dbf7c95a13e7bb5af3eb12a6bbeace488beb5419fbe86c8bebedea8d1beb735b03e6889073fb9f1b13e34835e3f7d7bb73ef8e961be08c5cabe6ff4833e956fa13e33e0a7befd926ebe6009fe3e8bc653be843b65be6a18a0bdd534f93c01fc52bd6b6f09be77e3fd3dc1c59f3d00000000e97d92bbf664b83d00000000972f193da896a43d65e754bd5e81c83d269cb
4bd1d84ef3d1f11ddbd55385e3d020b993d4fdf69bd12a5be3ddd95ae3d62cb0d3e7ff87fbd7418c7bdf8ca803d1ed5983d13dea1bd480c8b3df3be503d2202d6bd540d5e3dc93e133d530d673d1b012bbe8f7c013eb79d243d9072ad3d772a273acfa6ec3d0ce383bd00000000fd56943de9119e3c0f64e03dd0cdb0bd00000080274f963deba6ae3d1c0a05be683297bd1f18a73d49b19fbdc64270be58657bbd263a8f3d374f913d63948bbda1b0993ddab55b3d7485bcbd00fc9b3df3edb03c892c00be9350a6bd25af9d3d968b973d79fe90bdeca4c9bdba46a0bddb1004bdbc84ebbde25788bd3ab0bdbda690963ccab6f63d60cc883d918ec8bd51ba873e9a3e99bdd01da5bde567613d50fd813d6981613d7163af3ddbdcbe3d473578bd8c07c0bd6924a7bd21c53fbe8ca6abbdaaf9573d158dba3d86c37fbd572399bd2a109d3d6994dc3c1e95e4bd4e1d4b3d7eca5a3d032771bdc8f5403d4b1a09bde53ceabdbfd4db3de573753d000000004909b9ba32b88f3d00000000c1c16c3d8b42803d994910bd8ba8a03dac6f8bbd482cca3db803b7bd3e900f3dbd44693d09b223bd2e39963db96c893d58fbf83dd2092ebd01b411be3149c43d891f633dd5b879bd93444b3d60e4033dfcc3aebd6c231b3de078bd3c57241d3dc16d17bed815e03d92dad63cc208853d82c1823a3acdc63d6d833ebd00000000ad06563de91bf63ca912bb3d854001be00000000941b5e3d5cd1873d744ce1bd91fce3bd57a17b3d804275bd64d465be11f8bdbdae6a583d0b73503dfc7bd1bd7155633d3e6a0c3db57194bd8eae6b3d6798093d58a9d7bdc3db71bdbd2b693d602c5f3d20c14fbd73e59cbdd53e77bdf28694bc1c71c5bd7f773ebd704a96bdd115383cf8ebd13df406403d27019fbde07f823eef7668bd4fcf7fbd6184123d9a113f3ddd37ad3dfe6a8a3dda459a3d3a30bbbdafb00cbe663c80bd0e3832be203886bdf262123d1972913df4a036bd978267bd99a5673d02f7283d6ae8bfbdd981033d3e82193df70172bdf1b33b3d2c8309bdf837ebbd4944dc3d875a763d000000001630b3baef2e903d000000008d6c663da8b4803d58ca10bd9221a13d74e08bbd7fa6ca3d338bb7bd700e103d71236a3dc13924bd00b2963d3de8893d2c6cf93d4ab12ebd894d11be9e58c13d81f0633dae8e7abd69054c3d5b51043d5d42afbdc1b21b3db199bd3cdab61d3d61aa17be7e84e03d2c57d73c907b853d18e27a3ac747c73d073e3fbd0000000013d0563d4ba4ec3cf98ebb3db3d400be0000000068ea5e3df445883d2b22e2bded78e2bd97817c3d232376bdb02f66becefebbbd053d593d6b38513d5d50cfbdab27643d1ce40c3d1ce994bdc0866c3dc167043d7161d8bd9abb72bd8c016a3ddcfa5f3d6e8650bdf66e9dbda51c78bd8b6494bc14dcc5bd07203fbd87b396bd8478363cab61d23dd8be403d137e9fbda19d833edb4969bdf35780bd9506133d4ac83f3d4157a93d2be18a3d44c19a3db553b9bd422c0cbe29ae80bd8b1f32be2ab086bd7fe6123d5ce9913d1b4c37bd253c68bdf37a683dda1d253d1866c0bd1fef033d630f1a3d55a78ebd9363e43c877813bdf06300be01e5f03d82788b3d00000000c3734fba035aa53d00000000e511873d7b68913d4e6e1cbd7718b73d813fa9bdcc68e13d5460cdbdbca5213daa5f843dade935bd383bac3d31919b3d2bf8073e69c848bd95ef2cbe5d65d83dba17833d49ee93bddd1f693de61c123dab7ac5bdd1dd2d3d5cf9643c6e50333d290923bebe38f63d2111e33cba73993d0a4b1c3aff26de3de00c5abd0000008092d2793d9b92fc3c1457d13de3451bbe00000080993e803dcd659b3d9082f7bd691808bef0f6913d51b390bdbab66cbe5acce9bda06e743dd62e733d250efbbd92ab833d46cc1d3d0c53b4bdea20873d8341113dbc72edbda26d8cbd369a873d164c813d395d75bdbe4ab3bd341392bd2d3f81bc05cadcbd939c5ebdc34eb6bd0175de3b5116e93d1b04603d00c2c2bda59a863e93b488bd09f997bda483253d75f4573dfd1ad83d23ed9c3d4bbfad3dc5f9e5bdf05c27be036399bd72e93bbe8204a0bd71d8243d7988a73d352350bdb95188bdcbd0863df73d3e3d0cbed5bd59aa133d40402b3d52f370bd85b63b3d0fd708bd723aeabdcdb1db3d87df743d000000806cedb3bad56b8f3d00000000d46d663d35e87f3d67e50fbd8859a03da22e8bbd43f6c93d37deb6bd811c0f3daeb1683d765223bdb9ef953d1c24893da7e7f83d15a22dbdd3ac11bee609c13d9f83623d703979bd21b74a3d7b7b033def92aebd70b81a3d730bbd3c5cbd1c3d0ecd17bef7f0df3d8140d63cd8b9843d1d757e3ac997c63d11023ebd00000000937d553d0e57ef3c0edaba3dc82f01be000000807c885d3ded82873d247ae1bd3436e3bd170e7b3d5ab774bd32eb68bef817bdbd6edc573ddbec4f3d4721d1bde3c1623d
5ffa0b3df23994bd8a176b3d63e0053deec1d7bd7a5971bd5899683d57945e3d72434fbd92a59cbd4c9e76bde50b94bcc364c5bdab1a3ebd841a96bd88b5363cb8bbd13d548a3f3dd1e69ebd72ce833edbe867bd6f517fbda50d123dc88a3e3d8f2ba93dd51b8a3d83fc993d10a3babddc870cbe05e47fbdc04732be17f885bdfdfe113d7b27913dd82436bd100667bddb16673d808c263dbfbcbfbdfa2c033d6118193dc7b077bde7d4433ddcae0dbdc3a7efbd20dfe03dbb5f7c3d00000000603dbdbab68b933d00000080e852703d98cf833d651615bd4fbfa43dce3a8fbdd0edce3d2297bbbdfe4c143dbdf46f3de7d728bd892b9a3d092a8d3d635efe3dce9033bdcff113be186ac83d49a3693d124780bd8e56513d255a083d402eb3bde733203df262c43c6343223d04aa1abe2d29e53d0392de3cb1a9883d8ea7833ab384cb3d0e5744bd00000000d2555c3d8ed7f63caaa1bf3d462903be000000801f8e643d88808b3d81a9e6bd24a3e8bd4854813dee347cbdeafb6bbe1474c1bd62c75e3d92a6563d5559d5bda6e2693dbb14113dd06d98bd8065723d3e040a3d071cddbdc1d478bd83d76f3d8d9e653d47f855bde307a1bdaf257ebdce119abce91fcabd9f3a44bdee319abd37ff3e3ccecad63d6be6453d101fa3bd177f853e88256fbdab7483bd6e52173d12e6443da4ecaf3d32268e3d324b9e3d0603bfbd44ea0ebe44cb83bdad6235be57ee89bd933e173da14e953dd84a3cbd7c026ebd384d6e3d9d5f2b3de293c4bd3104083dbe891e3d97b16fbd8d903a3dc1e207bd7219eabdc854db3df7f4733d00000000e850b1ba61f98e3d000000005023653d9b007f3dc20d0fbd7beb9f3dfab88abdd293c93d1e74b6bdb4490e3d3bc7673d755122bd3e7e953d89af883dc493f83d8dbc2cbdcd0711be5457c03d389b613d165078bd9bcf493d41ae023d0226aebd2ddc193d9366bb3c94df1b3d008d17be2995df3d3dbcd43c3446843d9367763ad233c63da91b3dbd000000805b93543d7659ea3c8b72ba3d3a4100be00000080009f5c3dc40f873d922ce1bd32cce0bdcc237a3df4cb73bd988d68bef1ddbabdc7f2563d12034f3d497ccebd01d8613dde280b3d17c693bdf72d6a3dd434033d60b6d7bdb06b70bddbae673d96ab5d3d24584ebdc2339cbdb0b675bd189c92bc81e0c4bd3e2b3dbdbd8e95bd1ac1343c405bd13dbca23e3d396f9ebda2b7833e68ff66bd3d687ebd9a39113d03a53d3df37fa83d74a9893df58c993d4e6db8bdf6030cbedcfa7ebd142532be818285bd7426113d02b5903d4d4035bd28f265bdd02b663da385233d2056bfbd115b023db03c183db67f6bbd2238373dd9f904bd22b7e6bd6a08d83dee556f3d00000000ade0acba75698c3d00000000f81e613d4b3c7a3de7e50bbd952b9d3da23788bdd76dc63d2d7cb3bd59220b3d7e4f633d7cf91ebde1db923d2333863dd90ef53dfb1b29bd309d0fbe4f9dbd3d9c365d3d57a273bd67be453d2576ff3c0b43abbde685163d9d25b73c9e81183dc2bb15be5940dc3dd0c4cf3cd1d5813da770743ad315c33df03939bd00000080845d503d6a53e83ccd6fb73da62cfebd00000080394c583db596843d6201debd1f8fdfbd2971753d652e6fbd538966be80adb9bdc0b3523dd9e04a3ddf6dcdbdbf735d3d780f083d5d2a91bd6ead653d6e14023d8e3fd4bd8fda6bbd9f37633dc054593d64394abd3a8099bd5d0f71bd21308fbccde4c1bdbb5139bd100693bdb242303c8a24ce3d22bc3a3dc7bb9bbdb39e823e728962bd89a779bd5f050e3d61c0393db4f7a53d2828873d21de963dd6f4b6bd38820abe1b387abd071b30be521083bd6bf50d3d5f208e3d217c31bd92a961bd07ba613dbd25223d3f48bcbd38d9fe3c41ed143dd523d63e2f30a6be5ce7723ed5b3503f886143bf5a97d9be00000000c1361f3cccf6febe00000000c6a7ccbe1e6ae3be53717f3e548d0ebf056cf73e199933bf669a223feb077ebe93bfcebea700913e864805bf36c6f3be43625dbfe4309a3e1cd6843f05862bbfa03fc9be5676dd3e6810b4be955869be71381b3f575a89be84a727be4f278bbe140b873fb12947bf300e3ebed7eaebbe6f18e1bb9c9930bf87c0a83e00000080a1a9bdbe755354be942526bf6d1c6c3f0000008047d1c4be18e0f0be20e6483f35e1493fbe1adfbe1a73d93e563bcf3fc2fb273f6dc3bfbe63b5b8bec4753a3f2078c9be5a7578bef4c2033f06e3d0beea966dbe5368403f7778d63e0baccebe57bfc5be121fb83eee420b3f4424db3e763c033e51892f3ffbd8a83e686e053fdedaa1bd5d843abfda1faabe45450d3f5488eabf600cce3e4ce3e23ed6a181be523aa9be643b16bf1a7ff5be76e208bf7cdd253f4b25803f6a67e33e789a9e3fe61fee3e8b9781bea90601bfbac0a13ead42cd3e5d54cdbe8fe093be127f2a3fccd368be1ee887be106a0c3eeee1893e82029f3d572
5303e5642663f3d48ee3e04d172bdd485bc3d3547ff3ebf8421bebc6bdd3e023dcb3e23feb53e08b1363fd7ad833e92133b3f3091903e0fafc13f51adf03e4b2e0c3e6612e43e92e1dd3e0d091e3fb063833e89ea713e2698003ff850313fb951543ede510b3feed7bd3ff97d843ee29fd73e0161653eeff1ad3e7755773e09023e3ffd34bd3ebdd90e3f0fa2503eee46033feb2b833eef03febd8727d83e8ae5a83f665d0a3f17b35d3ed825d5bba70bf43e5126033f1900253e8a569a3e88b7ba3e7c70703eb6557e3ec9ba7d3e7c13ae3e5830d03ecc7b683e5adeee3ec9c7b83f2190693e9937bc3e9a92b03f24c51b3e101df53d7e77e33eff50233fd3666b3edf57163eee32983e5416253eee1e513e11fe1f3ecdef4e3ef340053e45e4273f99bccd3eb76e623e886d9e3f8fbd5a3eb44f543e27dbc33fd1e4d63eea1a123f46521a3f75a2083fdbf8533e77cf6c3e0bdb8a3eec4fc83e2d98653ec5310c3f5ec9ea3e3bf2513e3fb13f3e7277c13edbe3bf3ea6c69e3ea869d03eee4ba73eb83d76be0000803f +\ No newline at end of file ++6561373564626664633463643333616230353961393233666432623837643637633235306136316538616536346666303330303165623236623461336335663132333264323661393834653463383762663931323634383337656564633662356134613134343461393333343962653061633337316465373764306438343236646238633661633433636431303134356365656238343633613161333932303639376538356537623535346264396631336335386230623534396462376166303332376661366634333863346266396434653031636538323330626438633336643366303762316439643162316639336563333863386562623037343761373135656439633965356531313037636465333932343337356133336331323530613935313339356235396438343732663038616337393738623666633763653832333261393839393635663730613733616139393365373666313533343465633130613733656636356333326634376531363832353436356232343664303733303663363632376239626564653732333461313432643131376538386433346164303463393637326132376238666363643132326139343432616432313865313564303439653033303063323262663366346261633334616431396539623265663030643433336230373137623039346263363635376163393436373066336330663332316130636233393335393764613166326532343539386161336164636235343562323461356662616163396466383730313566333061616264656436656330666162646239656561386132333533313032363635656161333231316539356433633538323666636334346262363766633863356663643335303364333532343830316362643537616166386630316132633537383264386562656232366533383232323865656264643930346339623738353632646566646164393533366234623761393331346566623730336261353361383331366430326364306533343038616164393337363063633838623064396239626464353839316131616533346236663132363663663739623565366564343962313033386462653236663864613664303962623535363134336361326534666230613461643036356361623365363830623066373064386262326466363934623535633664373865313164633832393733616432306435663337366633306565386434613131613061636663616630643839623737366361623235653838653366303334613439393362313664373862653234396263323361663762626238373531633765333530666363393132333237616137663336333434326233313735303764323765346238313333393839646334396539663036633034396237393461336263306463333738313939303164393262336465656237616466643562303362333632303838336162666434636265363361333466646564333736303437383432623738636461313736336539396364306164393937313064353136643231663135306364643936646137386434613564373335643430363162623534623133366633353137633236613635643433306239653638666165343231636163376139643564653333336637393138356233633164623465396435396364353031663066643661353130633336303365313365323461343062346266323135653337393561396134306436306138343536666237623633633264313165383137303161323331653835616664346235646238316162633164643536643237313733326564303234356638363931626463356461643334373664366636333536303737623161613164363334613937633431303962353962373335343766393430
63316463386432316330663138373338386632643361386565636665653766643836373930326664346163333162613163396664663531626161326134333762666330393261316637333265326138663036363736333734326230373730613665323665333266326265356534376133373330393466336639623431383863383433643563346265613561326134616334626262646637666163376365333962323036323537333737313534313066323364623937303534623665623237306566636366623763623431373666383236383134326365613136653238623231646339353938373030656631646238653961643434653765313834316231346231323563636633356131376538636666613866303638323362643436636638353665346166323633623861616639363862613835306234643961666662656334393366613066653061383333653965343633653837613263636530633863303861313363636361346132613431363962336539613463313366633934633761363761623039396365393263633638656134376162616631363838616238613137323032663864613035313363353335396432373530363233663234343136346339663435333834656630366537353336306137366434646264303466303633663630386363636337383066316631323836353135393134646139373035323637346164303965323536666335393864636364346433383564616435383862366464326130336536313934636139386438373462333162366230623165323533336263313430386430643661386261633061313631613639313232313734383136336464636231323130336231613131373336636238656635353635626666303535663331363332353338313363396334313631333034656133666365353561643830356565633137346638643739636136376432303761633436666465336232356236653164353163346461656165333038653266336161383966633961323462376262386430363536323932373263343731313562356139336265383765336139643837333966623265386131666561366161333138353261663139303338393733346335313934363761393137633266616431663436343236613231663865636236346133616662623761373633663830623231393063616534633032316538626436633731643261313866666339353836373133363939623966636239333637373764323863663964376134323134613133346335326162363334363334386666396666336534646565616264363966356566323465346538613762623864303766646338666264643466666537303263623162326539653435643130313061386235623631306232303139633962653661303038376162323666343263356465313035393535313531373665623537363237373934366266303434333463303562646535623762313565326631393061643033613838363963333933396162363834393636613136613036656435356339616665636134313034663335343465646335333738663063616330353634316466336438373835326638633037656234396239303730613338336332613839323837643561333235393366323531303032666436616435313137313166383433396531626137353932616538333330653164326438656166343339363262366264326632376564396434396333356565343733383137666333386462313164333630303936336137323863313634613031393931613164363237643262353162376133643935373036306336346161376563383862316365613139376535626535626234666331363163303835336362393535613530636234643633626364386566396132333232373733633230393865663664636334393131643133383736373833653261383661623632343237653937366164386564633433396130313166306430316134653864643263343835653438383687144d4395a52c41e3842b41b03f3743f207264565084e41e0fbdf3bd429b13da15eb33d7e47b93b309daa3890d2d93d0000803f2bd5bbb95024b1399db4d33b2956c2392e99b93969b868ba90d4eb3bd611a9b92d66c739a2115b3b1257b6b9ed46b2b917deac3b5b0d2a3c14887e3ce74b3eba00000000386ea33945cf5abb9cbce9ba3c4ff0ba5cc05b3bb4bc9eb91e09b03a6113e9ba3e17b539267dab397fdea9b906eea5397713973bfc7a2c3b51ff733b50c313bb000000003ea5d239842c8c3b6d275e3af5c58b394139b2391f56403bf47e56bb357a81bbc956923b57d2cc39909aaf3944807e3c4a78cbba9938a9b9d84ba7b99796bcb9d7bea5b925e37a39ff19a73ba25dc1397357b1b9c13cd6bb2d36423b6da35a3b90e500bbd402a0b95f930bbbbddfaeb9071fa9395281a6b934e3a23b0f238a3b93e1973964d9a9b98b2f273ce
b4a9aba3ccab6b9e3ba033b48220ebb19fcaf3b6290a93b034407bb782fa1b89335943bb388a7b9655cc0b96a361d3c67b9b0b99e28cf39076f2dbaf05afdbafa6cb43962bc2e3bd590bebc30339f3b4476a23963e6663bd7ad183ca5c00cbb8a69c13956f574ba97dcf93b97098d3bd471ae3bcbb06dbbfddca9b9d7c803bc3eb703bc9bdaa8bb1c6718bc8e5bcebb224c013bcb7684bb4d85943b2ef403bcc090893bf736aa3beb62a13b3c1cb6bbb821a7bbaee808bbfe3dd33c0000008027b0f5bbd6e09d3ce2db183d576fb7bb3b9761bc4111813be54cbcbbab65043c93ce1cbce0b3c6bbca4e953bb347fabbc466ff3a88f532bc11bf4c3b72636dbc00000080850afebbaaba5e3b476f973c7ae755bbde44dbbbfac1633bab7793bbf3ae90bb010a84bb3a1c2cbc6597d6badcd277bbddbbfe3b5898933b6063903b0d8fb33b028d8e3b2adcdcba9f1585bb2e91b9bb582aa03b6334f8bb6e31333b5569253b1e71533b8e977e3bcaf945bba6b59c3b603ab2bb02c88e3b0ae4493bfc457ebb2ae3ddbbe8f9943b0c38a4bb160ed73b73adab3bb6100e3bf28fd83baa70603bad2683bb9d90033c52703a3c9546693b7d34903b63a6b83bf1af9cbb3b19a23bedd0f8bb58efb13b105b72bb17300abc025488bb0a5b543e33573a3b24eff2bb286f75bb5d55a9bbd8e4803beea116bca9cc2d3b1e1205bc1adf75bbfcaa413bf8b491bb3050903b6537a83c1736213cc6271abaf7b6293c98bae33be7b710bb07e333ba55389dbb41441f3c8bf89abbb9acb6bbdeb9acbb416e053be7a63fba124e36bc396d97bb000000809a5a083c15d242bca07d0abca36bba3c09ac8e3cf4ae85bb51f5cd3b8abb18bccc0f383c4a58dc3b335a9ebba65c0b3c2324d9bb35bbb83c748069bba62961bc000000003e13193c863f71bb752b1cbc578e6b3b5807f33b633e9abb9e8dac3c78dcb03c7b35193ac5df483cca37e83ad74a50bcbfd529bc33469cbb786498bbafc6c1bbe0f895bba7a7ed3a8ef7a339cb5ccd3be140abbbc2041a3ce9e950bb828f5ebbd63a6fbbdd1d85bb9d07a93cdc24a7bb765ac53b467b96bbe863b4bb083ea73a7869f83b4cf49dbb874003ba2440dfbb356db8bbf63d2abb79b4d5bb3b1bafbb27487139a45115bc45e7a8bcf34775bbfb9e98bb68e4c7bb1e930fb736e8acbb6c4b753cce27b5bb830ba83c91ce1b3cdc1def3ae198133ef5e688bb6b67073cdd1ac53a27b589ba81469fbbe150293c3c2884bba35bdd3b1583ba3ac3a194bb3debb13c495799bb078c2ebc384b7d3c97a8493cebb6853c73bb4d3c024dd83c696ca13b1cc702bc6847243ce3ac5dbcfa771abc96221fbc6147fabb3641023cb496023d28f266bb00000000bf65693ccfff0d3cbf68a8bb3b268bbc15de51bca098e1bb9c5980bdec61ee3c66cf983c1024423c51e0febb12336c3c25bd1c3dcebf50bc88eb4cbc0eaea73c00000000c56c163c597c5fbcaa7b593cb7d2e33b7b235a3c0c2447bc95c9edbba8320abc75b98b3cb7d19a3cf51d623bcd0a023d7e91ef3c725604bc91fafdbbdf1f21bc2f3bfcbb274a493b7c78393cdb0e213c01a719bc0a0368bd6ad23cbc210741bc0288df3c6303d1bb8c93bbbb97420fbcbe16353cc8bc18bcf9a61c3d62882a3c8a25683cff2e01bcb0580e3c5b11f33c47dd10bc5b522abc4230df3c6ca21c3d13c0103c8129e33c199e5c3c05752dbc9515f0bba0b724bcac0b383c1f2b12bcbe768d3c80baf23cd6fbc9bb1b04843c84da173cae2dbabc64e63dbc08d8613cbd872b3c029c2c3ce63be03c5e41853c608888bd90f692bd68bf2c3cd0f843bc4497bcbb3161e4bb4ede983c42e72bbc0283dd3b176925bcd6d71bbc7ba42cbb911a6a3dadca0b3c68582fbcc664f23c629c1b3c4f33133cd046413c7eb83c3ddbe0d1baf307ad3b00000080101215bcc0708e3ca925dd3b7899f33cfdb1493dc7bbf43bfb8b5dbb6ee9d2bb584924bcb3cf0abc0b6c0c3ce93e14bccd51943a3302dd3acfabb03ca9c22cbc000000005cb22abcaef7f33cdf6aae3ba949d6bb1ef214bc878db23c3863783b62c5363b67d0eb3c5e192dbc75b57cbb3d43ecbae55cd1bb0cd10a3c00d4083c8efd223cccb5063c1b2d70bb3abcbd3c883c14bc0f01133cee04963c6d7bad3c7480353cba707fbbc16cf63bcdd4a939ad1f123cde2f0cbc2c76053c59ff09bb009bba3cb24a0cbca7750b3c64f9493d982287bbbe111e3c7f48483cd59b5bbbea43e1baf906243d99f3e1bbbd01863c4f95d73c25a8073cfa81273c42548f3cef4b153c800841bc318aa9bb7c98873a55c218bc1338373dfa26bb3c22f2c53c6bbc0dbcc7900e3d0819da3ccec359bb9a671fbcfee714bcbb3f74bb8851ce3c1c76ac3cd7fef039437e0a3ccb42bcba11ea103a388af63b94700f3a4ef6293ad86816b900078b3bd1c62cba4ee1213a7a4b2a3bb98b30ba291c30ba4d97813b71c3
033c9815a33b627549ba000000804646103a39a742bb747c0dbbe3ff41bb1186043b9aad27ba947610bad46e52bac8a9053acb9e223a86122dbafce80f3ae1144cba610bc73a34b4543be4f2bbbb00000000dbcd253ac4727d3bcf8cf33a44811d3ac114213ab973fd3a78684ebbfd3384bb29a1ed3a4a70063a11fb103a3f3aa83b629d2aba74f52cba6d1d2cba7aad31bab36e2bba566e013a79481d3b5c663c3a09cf2fba731f29bcc28e1c3b7067313b8bbaa6b9454d28ba15bab6bad4e82eba107a263a51992bbabaa220babbdc023bfd62103a28172dba1db3003cd7ef11b9c15330bae79bbc3a25ae20ba21b1feb95d37183be0206fba67882bbb784a883bba292cba034132ba4df5f43b60222fbaf5b10a3a75e20a3996f8f1bad29f0e3adb20b8397d2257bc2faa893ba3d90f3ab059bf3abba4db3b7757cdb9cde80d3a934d85bb4b8be639fadd0e3b2bb99b3b6dc792bb95cf2dbae68660be8b1d7f3e2632363e6a588c3e0dc7453e8a4979bd1d81593e87b001beae177e3e4f7449be4d9119be437e10bed7162dbe5e43333e3d94acbe80726ebd000000008bb1683edfc6573ebafeb0bdda2e62be36b673bed771d6bd2c5f453e93cd52be9a55963e74b73f3e90e102be14af6c3e2db5afbe2920f4bec3a4fabdc533b53e00000080be5a753e34650dbe4d70473f2edcce3d45ca513ebb4615be2b554bbf9d8900bf1c142f3e7350a23efc363f3dce2ea5bed70f5dbe30da00be4a45fabd5c2c24bebf70f5bdec31433dac40363ef82f333e4d140fbe16dca83e1aabccbd39e8c2bd7f85c2bd7d4bd7bd62b629bf3e2d0bbe0a0a2d3edcbcf6bd11c8c3be4ca7123e92f7543ea87e02be0c1d2d3e5f362abebc491bbe7f4c95bd614426bee96ed8bed1ba3c3e666850bee6ddd53eda2819be1c5efbbd970e2abed2e41c3e043410be6a7d983ecb6910be90d301bf8248823e5c5f1f3e50043c3ed31cfbbdb4c6663e1b1d083e27e14b3eee2aecbd72008c3eb518e03e444e783e3fe10a3e59940dbe1c03e6be9f11fdbd845eda3dbbd702beb486b7bdce1210be4662cabd997aff3ca8acd8bd5d06843d914d02beb963c83d9ca89c3d954a933d7154ab3da863b4bd1ba5283ef949f03c000000004c7ceebddd4ad1bdb34f333d5226db3d8e4ee83d5ec3593d8c34cabd488ed83de8691abe0e1ec4bd1041853dd79cf2bd503b2f3e7b316f3e4cdf7b3d0f623abe000000807d95fbbd06008d3ddcd1c6be1f1152bd17cdd6bd0a1d983d8d68d03ea043833eba4ab0bdf7d026be8359c0bc3967213e041ce33d3e29833d14a97e3d139ca73da1ae793d8060c4bc8356b7bde92cb7bd2ed5913d1c2d2abe50a24f3ddcf4463d94d5473d83a45a3dfce4b13ed5ce8d3dd2d1b0bdde047b3d6462433e65aa94bdaa15dabdfbda843ddf6caebd139fae3db36f9e3d67e8183d6bf4aa3d48e8573e985cbdbdc40fd63dbacb50be5309983d43cb7f3ddaafad3d4ad59ebd6bfe923d7fa51cbe58b4933de72d873e13a905bede95a0bd7d6cbabdcc197e3d5e80ecbd63418abd023cccbd87c7723da8b70fbe2a2867be97f7febde8028dbd3f9c8e3d8e4a6b3e19c6803d64c364be492c7e3ea28b373e97c28b3e8750453edc6c7ebdc7ad5a3ebebd01bead267d3ef16349beeb7919beaf7510be7b8e2dbe0fc8333e5bb5a0bea8de6ebd00000080e2f0673e49ed5b3e0ff5b0bdadf362be411c79be6dc5d6bd38f5473e5cfe52be16a8953ea54c3f3efbec02bec1e56b3ea8e6a6beceeefcbe360af8bde438b13e00000000fe7f743e219a0bbe578b503f0c26cf3d7a3a513e365516be9dbc4bbfd3f500bf66902f3e5686a13efa6a403dd3e599be7b1e5dbe9ce800be3b6cfabddc0224be459ef5bd6a68443da0f7363ee9db323ee40d0fbe9c41a63e8c16cbbd8073c2bdc758c5bd759dd7bd9bd829bf9d2c0bbe6ac32c3e52e9f6bd1dcdb9be9ba9123eaa61543eaa8a02beaf7d2d3ebbb72abe432f1bbe509694bd083127be2a22cdbec25e3d3e8c9950be2984e23e170217be3d83fbbd95da29be504f1d3ebd2b10bec5ca973e92d110be045402bf55c9813e1ceb1f3ed27c403ef27ffabd3b0a663e5ed4073e34104d3ecfe2eebde06a8b3e4098ea3e7f7e7c3ea2ba0a3ead3e0dbe56e6e7be4d34fdbda585d93d565202be9adfb6bdd9830fbe879ac9bde754003dfad4d7bd69c8833d9cc801bea9a2c73def389c3ddaed923dddc2aa3d3cc1b3bdc90c283ebfadf13c00000000f887edbd1c7dd0bd6f8b333d324cda3da364e73d3ead593d736fc9bd35b7d73d87d519bedf5fc3bd5c00853d67a4f1bd58a12e3e929a6e3ecf6f7b3da3c539be000000006f93fabd0aa88c3d88b0c6bebaf651bd98f4d5bd99ae973df84dd03e3efc823e46afafbd813726bec6c4c1bc63d1203e0f38e23d56ed823d85417e3dfb17a73ddb51793d9acec5bc5aafb6bd5682b6bd407b913d899329be5d954f3dedfb463
d3dd9473dd18b5a3d75b6b13ebc7c8d3d7832b0bdc9a57a3d02c6423e454294bde339d9bd419b843d8dd4adbdce06ae3d31fc9d3d5952193d8662aa3d264d573eafabbcbdf03bd53d532f50be229b973dad607f3dfe20ad3d86599ebd9fa1923d940f1cbe814e933d2fe8863eda2105be8016a0bd11c0b9bd30a57d3d8f8eebbd6eee89bd1e75cbbdf16a723d0f290fbe186166be26f5fdbd51aa8cbd14418e3da3b26a3ef78e803db37c55be8869803e551f343ee1688d3ef091463e67047abd32cf543e448a01beb9c97f3e5edf43be38bb19bef68410be0e5427be2b1c313eda8aa5be8b506bbd000000001f106a3ea4754d3e08b6afbd824756be411864be609dd5bd781b453ed7a654beea8f973e7373403ecebe02be091e6e3e4d0aacbe8d43eabed2b3f5bd1d09b73e00000080f3ed763e3d9d09be7ae8433fd0eccd3d6ccb523eb88314bec41f4cbfad7a00bfb3112d3eafc0a33e9b7c3a3d956b9ebe160a5fbeb9af00beb8d9f9bd597c24bebbf7f4bd1724403d29fd333e8996323ec9160fbee170a63ea862cabde5e5c1bd27dfc3bdf772d6bd58af2cbf06240bbe0b7d2d3eae46f6bdbadcbfbe0ae7113eb604563e525a02be213f2b3e9a632bbe177c1bbef8df94bd6ac827be0811d4be4cee393eff3152bee817cd3eac6e14be13f6fabd0d742abe89e31b3e3a3e10be16c5993ea6e910be3fe703bfac2f833ef7a21d3e7909373e8ee3f7bdd21e683eddac073e9689483e3d17eebdeb118d3edf18e23eadd2783e23610a3e06310bbef644e6be7ea1fcbdd5fc54be10f67e3ee44a323e40788c3e81c4443ef9a975bd46e1523e93c9ffbdb4e77d3ea3f442be80fe17be65d30ebeed6f26be7f4a2f3efb84a4be77da67bd00000000242e683e481d4c3e10a2adbdd0c855be48bc62bef58fd2bdbc95443e4eb652be269f963eb89e3e3ea51801bec03a6c3ea7feaabef522eabedd0ff4bdd458b63e000000001c0c753eadc408beb11d433f38f3ca3d58f1503e7ba413beaa0b4cbff38300bfe6452b3e46cfa23e204c383d95699dbe50115dbe5e19febd0d9cf6bd6fb722be14bef1bdc62d3c3d292d323ee5ef313e1d660dbe7c24a63e8ad5c8bdf659c0bd7e0dc1bd856cd3bd5f882cbf457609be5eb42b3e130df3bd07cabebe3a36103e2128543ed9b400be3f72293eb79729be3bbc19bea27493bd7b0026be49f4d2be4216383e044450be0965cc3e0f9213bee4b7f7bd54ab28be262a1a3ef8880ebeb615993e2f3b0fbe2ec103bf733d823e33e41b3e43a3353ea33af6bd793b663e5506063e46aa463e6cffeabd171e8c3e1e40e23e3653783e79b8083e09560abe8942e6be3d6ff9bdbd38d93d863e02bebf73b6bd4a890fbed80bc9bdf4adfb3c5db3d7bd6f6a823d34b501be4a36c73d35f69a3d6f9f913d6f17aa3dd251b3bd7c66283ede43ec3c00000080903fedbd9345d0bdc3a9303d1506da3d815ee73d29ce563d1dc8c7bdcc47d73ddbee19beeecac2bdf8a3833d3564f1bd010b2f3e814a6f3e1579793d78ea39be00000080c45dfabda5c78b3d4464c7be72ac4fbd4780d5bde0c0963d090bd13e3866833e4436afbd9f6326be3986bdbc511e213eb3dbe13d858e813d097d7b3dc9e3a53d9088763de970c1bcde44b6bd13e0b5bd472b903d739129bee15d4d3dd9b3443d1167453db9af573d3f56b23eca288c3d9c79afbdb1db773d2c51433e6f8f93bd58cfd8bd553e833d5e58adbdba47ad3d43bc9c3d72ed163d379ea93dedf6573e1a4fbcbd4fc8d43dae9a50be7ecf963d489e7c3d47f5ab3d02bc9dbd1253913d5c271cbef35d923de557873e491805be1b819fbdf021b9bd76a77b3d5f44ebbdc82689bd8737cbbd642b703d59300fbe636d65beae3efcbd3fe88bbd665d8d3d2b616b3e6f5e7e3d33c4ef3d5a4ff3bd7221abbd56eb05be0ea3bcbdde3df03cb7dfc9bdeb15773d1eb0f2bd5dc1ba3dd366923d99b6893d81cd9f3d1c33a8bd0aed1c3e4f05e13c00000080dacfddbd5efae8bdeae7273d86fcec3d8ad5023e45194c3db68abbbdffc7c93d53820fbe25d1b6bddc5d793dd2a2e1bdba0d233ec440853ea3646b3d857249be000000007043eabd30ab833d07e5cfbe76a644bdd424c8bdbff98d3d7c0be53e2564933ec465a4bd3e031bbe2e56b5bcca32163e3192d33daa7b753da05e6e3def8e9c3d62bf693d4910b9bc23efaabdd2cfaabd035c883d22501ebef256423d6b493a3d002e3b3dc8e94c3de1f9c33ea59f843d07eda4bdcdfd6a3daec8353e0fc58abd45e9cabd99a0783d9faaa2bd18e4a23d8b0c943d8d7d0f3db27d9f3d48dd483e4686b0bdc977c73d3ebb61beade78d3d7a6b6f3da931a23d523394bd726f893d4b9311be6fee893da5ab933ee588f8bdb6d395bdc374c6bda6756d3d41f9dbbdec1c81bde952bebd800b633d339705bec81c70be3e29ecbd59ab83bdd329853df8b67e3eb00a713ddfe7da3d302f03bede
f5b7bd617710be3adecabd9925003dac34d9bdff59843d94a402be1adfc83dc9099d3d93a6933da9bbab3dbad0b4bd7523293e061df13c000000803316efbdebccd1bd67cf333d53b0db3d8ee3e83d1c555a3dfab0cabd6816d93d61d91abe7895c4bd4295853d833af3bdefc02f3e6800703e197b7c3d20f43abe00000000ea3afcbd76558d3dc78ec7becc9652bde652d7bd9478983d8d2ed13eacb8833e0bb5b0bda74d27be3905c1bcc5dd213ed4ace33d817c833dbf4b7f3deb03a83de74e7a3d9a0fc5bc92c5b7bdcf9ab7bd5030923de4ac2abe712a503d8179473d7a5a483d66365b3d318db23ea6278e3d893bb1bd31a67b3db0fe433ee10395bd989edabd042f853df9d5aebd7b08af3dc5d19e3dd85a193d715bab3d3f9c583ee0cfbdbde795d63d457751bed464983d2137803d951bae3da5349fbd095a933dee161dbe810d943d77a7873e8b0306be42f6a0bd9eddbabd9fb67e3db718edbd44958abd3cbaccbd105f733df81b10bef0ee67be7fa0ffbd3c588dbd8ff28e3def146c3e0718813de067dc3d06c0edbd469cb0bdeadf02be9afab7bdf09be93cdb30d3bd137f703d05c5ecbd0b1cc03da3928e3dee15863dd94aa73dd501adbd8925193ee9f7db3c0000008069b6d8bd8bd8d8bd54ad233dc8f8de3d90f3ef3d7c8c463dcc09b7bd9ce5c43de6400cbe894cb2bd5ab9723da474dcbd0a1c1f3e7f24853e7081683d7f3629be000000806996e4bd6029833d2114cebebb803fbda83cc3bd89748e3d60d6fe3ef831963e57f5a8bdeb7f17be8437b0bc0497123e6177ce3db5ee6e3d23fd673d627f983d127a633d12e6b3bc091ab0bd9d90a6bd6dc3843dcf6f1abe97e73d3d42b3353db840363d7958473d2e08d53e711d813d78cea0bde6b0643d7357313eab9c8cbda337c6bd9000723d57d4a6bd08de9e3db62e903d2adc0a3dc98a9b3da3e4433e9761b6bda3a2c23d88e95cbe44258e3dc703693d85019e3d260797bd4ed0853dad460ebec774863d7b2d9a3e4bddf2bdaf889cbd62bdbebdd3b36a3d1eead6bd600882bd10dac5bd982e5d3d5b8d02be3cc450be27a8e6bdc1da84bd8da8843d7dbe813ec89a6a3d5d4fd93d504a02bef881b6bdda840fbe6651c9bd08e5fc3c039fd7bde720833dfabf01be1a5ac73de7b49b3d255c923dbf54aa3d0961b3bd2b1b283e27d3ed3c00000000bf63edbd2f3ed0bd8cc5313d5e17da3d593de73d7e1b583daf29c9bd3481d73d4fdc19bef20ec3bdf15a843dc583f1bd42b22e3e17b56e3ef0137a3d86d939be00000080327bfabd7f108c3d1eafc6be445850bdf2b8d5bd3027973d7c49d03e1809833e8d49afbd494426beb813bebcb2dc203eb50ce23d8f44823d19e57c3d21a3a63dc8ed773d3b12c2bc7b52b6bd2322b6bd84e7903d63a229be96fa4d3d0855453df235463de0fc583de3bab13e86e38c3d66c9afbd4844793d62dc423e51b693bdd100d9bd36f5833da96cadbd4d9fad3d067b9d3de17a173d2ef6a93d9f66573ea456bcbd4703d53dde4750be7513973da7067e3d68b4ac3dbbdb9dbdc20f923d03181cbe9ac1923db0f4863e731b05bec99b9fbd8f67b9bdef4c7c3d4e68ebbd955389bd3531cbbdf603713db5290fbeafa966be4ce2fdbd4e138cbdb0ab8d3d20cd6a3ebec67f3d9ef957bec152813e51cb353e30558e3ebd52483e7b4a7fbd8d69563e912703beafcb803e766246beae671bbe1e3012bed9ca29be58b7323e5333a6be425070bd0000000002d56b3e9e104f3eb773b2bd96c058be00b365bea4b5d8bd05fb473e264856be637f983e1c29423e675d04be1de46f3e63e3acbeebabebbed539fabdc7e5b73e0000008056b7783e41f20bbe89cb433f31f4d03db489543e23e316be25e44cbf124b01bfc8ae2e3e2cb1a43e2ba9403df0169fbee4a860be1f4e02be7910fdbd392e26befc28f8bd68ad443d439d353ed46c353e7ac010be6a06a83ec8a1cebdf813c6bd4ed7c6bd5a93d9bd51652dbfe8c90cbe592c2f3e997af9bd79a1c0beb47e133eabc4573e0df903be26d92c3ea30c2dbe23271dbe1b9e98bde47029bea0c8d4bef08a3b3e45d453be30e9cd3e5fcf16beb92dfebdf1282cbe84801d3e71e411bedab19a3e8e8e12be2aad04bfee17843efd391f3e6195383e086cfcbde1e1693e263f093eca2b4a3edc29f1bd7efb8d3e0590e33ef7c97b3ebbf50b3e55880dbe9ddbe7be46e8ffbd1e25dc3d43edebbdb496b2bde2cb01be45cbb6bd770cdf3c01c6d8bdc1256f3d43f5eabd8362c53d3cc28d3d4c54853dd29ba53d9770afbd10b4173ecd44da3c000000007929d7bda647d6bdc5a7223d5f05df3dcf97f03d8766453d51a8aebd044bb93dab090bbe092cb1bd0b5d713d14dcdabd85401f3ef69e863e5642673d834627be0000000027e2e2bd9f81833d66c6cfbe2b5e3ebd45efc1bd2f718f3db1c6033fe13b983e9b6eabbd711b16bea4adaebcd0d0103e190bc
23db6976d3d9daf663d159c973d4532623dd556b2bc09a2b2bde48ba5bdf103843d55930ebecdbc3a3dfc6c323d04ba2c3d4a32463d0351da3ec763803dded69fbdad67633de850323e85e68cbd5ce2c4bd54a5703dbedaa8bd0bf1953d535b8f3da31c073d67eb923d5e27463e09a1b9bd6537b73d18d55dbe873e8f3d0bb5673df5129d3d348197bdfb0e853dd2060dbeb05e7d3daf629c3e4dfaf0bd170c9abdae6abcbde6c6693d0463d5bdf8c081bd8700cabdae54513df47a01be929b4abeac77dabd8ba984bd5f2e853dc5af833e124a693da068db3d435203beb583b8bd3e9310be5542cbbd5318013dc8b9d9bd6eef843d55bf02beff62c93d6e9d9d3d903b943d7c4bac3d1b60b5bdac48293e54fff23c000000800d6aefbdb152d2bd28e5343d122edc3df060e93d657b5b3d5716cbbda297d93decde1abedbfcc4bd882a863d5e8cf3bd52e42f3e28f76f3e889f7d3df5ae3abe000000000d91fcbd8ce78d3dd882c7be65b053bd17b2d7bd7b08993d5422d13e67b2833e2a45b1bdc15b27beeca6c2bc9806223ef729e43df711843d263b803d8595a83d58797b3d59b0c6bcfd53b8bd1a0fb8bd67c5923d5abe2abe1d48513d1a93483da975493de55b5c3de780b23ee0bc8e3dbaa9b1bda4d07c3d8618443e2c9895bd10fddabd50c4853da666afbd4696af3def649f3d875f1a3d24eaab3de3ac583eff5cbebd1d18d73de03451bef2f5983d2acc803dc3abae3dd1c79fbde8ee933da02a1dbe939f943df69f873ee62506be5d89a1bdf268bbbdc9d87f3de56dedbd112a8bbdd742cdbde882743d663810be419467be8de4ffbdfcec8dbd77838f3dfa106c3effac813dbb9bd83df8e801be3be5b5bd5c1f0fbe06acc8bdc9bcfc3cc8ecd6bd0ab4823d0c5f01be68b1c63dba2d9b3d33de913d7bc1a93d9ec6b2bd76ae273e0ec5ed3c00000000c2aaecbd1590cfbd9163313d7863d93de182e63df77c573d8680c8bddaced63d3f7419bedf6dc2bd69ec833df5c8f0bdb5442e3e68416e3e4a4b793dc16a39be00000000a3bcf9bd9f9a8b3dbc70c6befed24fbd1c0cd5bdaea5963d030ad03ee3ce823e66b2aebdb3d925be5c59bebc4671203edb54e13d8bd8813d0d167c3dea11a63d5025773d6255c2bcd7b5b5bd8f8ab5bd1b6b903d653529bedd6c4d3df0d3443de5b2453dab5b583d587eb13e516b8c3dc436afbdba79783d666c423e6c3893bd4e52d8bd1787833dfcd6acbd3f09ad3d1ff29c3d1640173d2f63a93dfdf4563e91b5bbbd4c52d43d74d64fbe0592963dcf357d3de31dac3dd9539dbd6592913d44af1bbe5244923d61ba863e42b904be32129fbd88c8b8bd8a817b3d3fb0eabdcee088bd3986cabd3046703d5bc40ebe001b66be521ffdbd859d8bbd10348d3dc4596a3e52f37e3dd45312bfaa602f3fe11df63ebb01413fd9d0073f7a322cbe4829113ff6b1b0be71a42e3fb04c06bf5589d1be8e02c5be4ccee5be0ce2f13e93a161bf7fa221be0000008066e21f3fe5430c3f2b1371be2dd712bf8b9d1bbfebcd91be8376073f3e0611bfeac64e3f7c9e033f4456b2be03a1223f78686abf9a0ca0bf4028a9bed568793f0000000054a3283f4649bdbe97d3044098a78d3eb917103f8020ccbea7380bc00f93afbf6e6cec3e44555f3fdc9c023e1ef757bfe81018bf1187afbe5e6baabecc24e0bec01aa7bed44b053ef6d5f53e55fcf53e6312c3bed4af633f98978bbe09c985be245586bef96492be41baedbf6cb8bdbe2880ed3e13f7a7be76a582bfc688c73edc3f123fd4cbb1be33ebe93eab18eabe37eed3be14f64dbe0e34e5be785690bfe6ddfd3ea95b0fbf14af8b3f3207ccbef42fabbe9c3ee8be041ed53e8aa7c4bea8cd513faa28c6beffc3b4bf8917333f1f72d73eb7f2f93ebba5aabe9d8a1e3fcd9fb93ef1dd083f7006a3bef684403fd48d9a3fe9ab2a3f1f4fbd3e516fbfbe72649dbf5459acbe3ce2d63ea5fae63e06cfa93e0415b53e83fcd73e8a1c973ee8499d3e09299f3e10e6c63e0965983e4656b23e3343b03e9e43b23e57bd953ee7f3f33e658b873e941a7bbe04d2bb3efa54463f9b0d723ede9fb43e4aa5163f555c8c3e7521803ed201e23e9267873eb546b83e07bcb03e8701af3ef7e2da3e2b1a333fcc08b23e71089b3e6536c2bdf12ce43ec3c4a43ec9deee3e6c04af3e90f0c53e19aa783ed7025a3f4a48103f9794d23ef147973ed7634a3e9084ee3ef8e9e73e1837a83e1fe2ac3e7ea6bb3e24ec9b3e3dc32f3e27f7bd3e2315cd3e3a67af3e5dde993e6209bd3e991da23e0112953eeb4cc73e4ea7373f4389b63e53e5bd3e38aba33ea7beed3e2dded33eadb68e3ecaeeb23ec79a963ef6c5d53ef7deb93e4ef4a53e4b2ec33e7825013f4d50bf3efc34e03e747f973e4718a23eeadfc03ea089cb3eab6da43e8e25a33ebef7e43ea9b3ea3eebbb0f3fcbaa8f3ef829043fcc263e3f917b973e8b8aa43ea938eb3e
f7ec9a3e9acda03e25379f3ef927b13e73b5b23eedf0d13e6f15993ef928033fdae3d63e81e4963d
+\ No newline at end of file
+diff --git a/gcc/opts.cc b/gcc/opts.cc
+index 432b822e8..7508fc817 100644
+--- a/gcc/opts.cc
++++ b/gcc/opts.cc
+@@ -3486,12 +3486,19 @@ common_handle_option (struct gcc_options *opts,
+ break;
+ 
+ case OPT_fauto_bolt_:
+- opts->x_flag_auto_bolt = true;
++ if (get_optimize_decision_from_ai4c ())
++ {
++ opts->x_flag_auto_bolt = true;
++ }
+ /* FALLTHRU */
+ case OPT_fauto_bolt:
+ if (opts->x_flag_bolt_use)
+ error_at (loc,
+ "-fauto-bolt conflicts with -fbolt-use.");
++ if (get_optimize_decision_from_ai4c ())
++ {
++ opts->x_flag_auto_bolt = true;
++ }
+ break;
+ 
+ case OPT_fbolt_use_:
+@@ -3499,6 +3506,10 @@ common_handle_option (struct gcc_options *opts,
+ if (opts->x_flag_auto_bolt)
+ error_at (loc,
+ "-fauto-bolt conflicts with -fbolt-use.");
++ if (get_optimize_decision_from_ai4c ())
++ {
++ opts->x_flag_bolt_use = true;
++ }
+ break;
+ 
+ case OPT_fbolt_target_:
+-- 
+2.44.0.windows.1
+
diff --git a/gcc.spec b/gcc.spec
index 5712046ad5db3f3a033db29a3fe0c6df5abf2a4f..940b51688c13604f01b4aa55541ac4dc90c6d795 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 73
+%global gcc_release 74
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -461,6 +461,8 @@ Patch352: 0352-Add-hip10c-machine-discription.patch
 Patch353: 0353-Add-hip10a-machine-discription.patch
 Patch354: 0354-Fix-for-hip11-and-hip10c-addrcost_table.patch
 Patch355: 0355-Fix-errors-in-ipa-struct-sfc-IBMY84-IBN2JO-IBN42Q.patch
+Patch356: 0356-add-llc-allocate-feature.patch
+Patch357: 0357-Enhancing-BOLT-Optimization-with-AI.patch
 # Part 1001-1999
 %ifarch sw_64
@@ -1602,6 +1604,8 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch -P353 -p1
 %patch -P354 -p1
 %patch -P355 -p1
+%patch -P356 -p1
+%patch -P357 -p1
 %ifarch sw_64
 %patch -P1001 -p1
@@ -4239,6 +4243,12 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 %changelog
+* Mon Feb 24 2025 chenhong - 12.3.1-74
+- Type:Sync
+- ID:NA
+- SUG:NA
+- DESC:Add feedback-based llc allocate feature and support llc prefetch instructions
+
* Mon Feb 24 2025 huzife <634763349@qq.com> - 12.3.1-73
- Type:Bugfix
- ID:NA