diff --git a/0365-add-llc-allocate-feature.patch b/0365-add-llc-allocate-feature.patch new file mode 100644 index 0000000000000000000000000000000000000000..e7c8e126ae3f9ac4549925bbfdd56e90571815c6 --- /dev/null +++ b/0365-add-llc-allocate-feature.patch @@ -0,0 +1,8452 @@ +From 43e93c6df874a0bf78675fb4d3586d9ad1cb7dac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Tue, 25 Feb 2025 16:27:36 +0800 +Subject: [PATCH 1/2] add llc allocate feature + +--- + gcc/Makefile.in | 1 + + gcc/auto-profile.cc | 491 +- + gcc/auto-profile.h | 30 + + gcc/builtins.cc | 82 + + gcc/builtins.def | 1 + + gcc/cfgloop.h | 3 + + gcc/common.opt | 28 + + gcc/config/aarch64/aarch64-protos.h | 6 +- + gcc/config/aarch64/aarch64-sve.md | 48 +- + gcc/config/aarch64/aarch64.cc | 18 + + gcc/config/aarch64/aarch64.md | 39 + + gcc/dce.cc | 1 + + gcc/doc/tm.texi | 21 + + gcc/doc/tm.texi.in | 6 + + gcc/internal-fn.cc | 115 + + gcc/internal-fn.def | 4 + + gcc/ipa-pure-const.cc | 1 + + gcc/optabs.def | 2 + + gcc/opts.cc | 52 +- + gcc/params.opt | 62 + + gcc/passes.def | 2 + + gcc/print-rtl.cc | 6 + + gcc/rtl.def | 9 + + gcc/rtl.h | 4 + + gcc/rtlanal.cc | 2 + + gcc/sched-deps.cc | 4 +- + gcc/target-insns.def | 1 + + gcc/target.def | 31 + + .../g++.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-relion-expand-kernels.C | 52 + + .../g++.dg/llc-allocate/multidim_array.h | 186 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-cross-bb-indir-mem-acc.c | 36 + + .../llc-allocate/llc-extend-outer-loop.c | 61 + + .../llc-feedback-branch-in-loop.c | 39 + + .../llc-allocate/llc-feedback-break-in-loop.c | 41 + + .../llc-allocate/llc-feedback-goto-in-loop.c | 50 + + .../llc-feedback-same-loop-cycle.c | 129 + + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + + .../llc-prefetch-full-pldl1keep.c | 14 + + .../llc-prefetch-full-pldl1strm.c | 14 + + .../llc-prefetch-full-pldl2keep.c | 14 + + .../llc-prefetch-full-pldl2strm.c | 16 + + .../llc-prefetch-full-pldl3keep.c | 14 + + .../llc-prefetch-full-pldl3strm.c | 14 + + .../llc-prefetch-full-pldl4keep.c | 14 + + .../llc-prefetch-full-pldl4strm.c | 14 + + .../llc-prefetch-full-pstl1keep.c | 14 + + .../llc-prefetch-full-pstl1strm.c | 14 + + .../llc-prefetch-full-pstl2keep.c | 14 + + .../llc-prefetch-full-pstl2strm.c | 14 + + .../llc-prefetch-full-pstl3keep.c | 14 + + .../llc-prefetch-full-pstl3strm.c | 14 + + .../llc-prefetch-full-pstl4keep.c | 14 + + .../llc-prefetch-full-pstl4strm.c | 14 + + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + + .../gfortran.dg/llc-allocate/llc-3.f90 | 211 + + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + + .../llc-trace-multiple-base-var.f90 | 62 + + .../llc-unknown-type-size-unit.f90 | 58 + + .../llc-allocate/llc-wrf-4-outer-loop-num.f90 | 320 ++ + gcc/timevar.def | 2 + + gcc/toplev.cc | 6 + + gcc/tree-cfg.cc | 11 + + gcc/tree-cfg.h | 1 + + gcc/tree-pass.h | 3 + + gcc/tree-scalar-evolution.cc | 8 +- + gcc/tree-scalar-evolution.h | 3 +- + gcc/tree-ssa-llc-allocate.cc | 4150 +++++++++++++++++ + gcc/tree-ssa-loop-niter.cc | 38 +- + gcc/tree-ssa-loop-niter.h | 3 +- + gcc/tree-vect-loop-manip.cc | 266 ++ + gcc/tree-vect-loop.cc | 10 +- + gcc/tree-vectorizer.h | 1 + + 76 files changed, 7308 insertions(+), 45 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C + create mode 100644 
gcc/testsuite/g++.dg/llc-allocate/multidim_array.h + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-break-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-goto-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-same-loop-cycle.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 + create mode 100644 gcc/tree-ssa-llc-allocate.cc + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 65f683bbd..ef7733580 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1659,6 +1659,7 @@ OBJS = \ + tree-ssa-loop-niter.o \ + tree-ssa-loop-array-widen-compare.o \ + tree-ssa-loop-prefetch.o \ ++ tree-ssa-llc-allocate.o \ + tree-ssa-loop-split.o \ + tree-ssa-loop-unswitch.o \ + tree-ssa-loop.o \ +diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc +index 5e85381ce..97c3bafd5 100644 +--- a/gcc/auto-profile.cc ++++ b/gcc/auto-profile.cc +@@ -49,6 +49,9 @@ along with GCC; see the file COPYING3. If not see + #include "auto-profile.h" + #include "tree-pretty-print.h" + #include "gimple-pretty-print.h" ++#include ++#include ++#include + + /* The following routines implements AutoFDO optimization. + +@@ -95,6 +98,8 @@ along with GCC; see the file COPYING3. 
If not see + */ + + #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo" ++#define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov" ++#define DEFAULT_ADDITIONAL_PROFILE_FILE "addldata.gcov" + #define AUTO_PROFILE_VERSION 2 + + namespace autofdo +@@ -117,6 +122,14 @@ private: + bool annotated_; + }; + ++/* pair */ ++static bool ++event_count_cmp (std::pair &a, ++ std::pair &b) ++{ ++ return a.second > b.second; ++} ++ + /* Represent a source location: (function_decl, lineno). */ + typedef std::pair decl_lineno; + +@@ -311,6 +324,9 @@ public: + /* Mark LOC as annotated. */ + void mark_annotated (location_t loc); + ++ /* Compute total count threshold of top functions in sampled data. */ ++ gcov_type calc_topn_function_total_count_thres (unsigned topn) const; ++ + private: + /* Map from function_instance name index (in string_table) to + function_instance. */ +@@ -338,6 +354,244 @@ static autofdo_source_profile *afdo_source_profile; + /* gcov_summary structure to store the profile_info. */ + static gcov_summary *afdo_profile_info; + ++/* Check opts->x_flags and put file name into EVENT_FILES. */ ++ ++static bool ++get_all_profile_names (const char **event_files) ++{ ++ if (!(flag_auto_profile ++ || (flag_cache_misses_profile || flag_additional_profile))) ++ { ++ return false; ++ } ++ ++ event_files[INST_EXEC] = auto_profile_file; ++ ++ if (flag_cache_misses_profile) ++ { ++ if (cache_misses_profile_file == NULL) ++ { ++ if (additional_profile_file == NULL) ++ { ++ additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; ++ } ++ event_files[PMU_EVENT] = additional_profile_file; ++ } ++ event_files[CACHE_MISSES] = cache_misses_profile_file; ++ } ++ else if (flag_additional_profile) ++ { ++ if (additional_profile_file == NULL) ++ { ++ additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; ++ } ++ event_files[PMU_EVENT] = additional_profile_file; ++ } ++ ++ return true; ++} ++ ++static void read_profile (void); ++ ++/* Maintain multiple profile data of different events with event_loc_count_map ++ and event_func_count_map. */ ++ ++class extend_auto_profile ++{ ++public: ++ bool auto_profile_exist (enum event_type type); ++ gcov_type get_loc_count (location_t, event_type); ++ gcov_type get_func_count (unsigned, event_type); ++ gcov_type get_topn_function_total_count_thres () const; ++ struct rank_info get_func_rank (unsigned, enum event_type); ++ /* There should be only one instance of class EXTEND_AUTO_PROFILE. */ ++ static extend_auto_profile *create () ++ { ++ extend_auto_profile *map = new extend_auto_profile (); ++ if (map->read ()) ++ { ++ return map; ++ } ++ delete map; ++ return NULL; ++ } ++private: ++ /* Basic maps of extend_auto_profile. */ ++ typedef std::map loc_count_map; ++ typedef std::map func_count_map; ++ ++ /* Map of function_uid to its descending order rank of counts. */ ++ typedef std::map rank_map; ++ ++ /* Mapping hardware events to corresponding basic maps. */ ++ typedef std::map event_loc_count_map; ++ typedef std::map event_func_count_map; ++ typedef std::map event_rank_map; ++ ++ extend_auto_profile () {} ++ bool read (); ++ void set_loc_count (); ++ void process_extend_source_profile (); ++ void read_extend_afdo_file (const char*, event_type); ++ void rank_all_func (); ++ void dump_event (); ++ event_loc_count_map event_loc_map; ++ event_func_count_map event_func_map; ++ event_rank_map func_rank; ++ event_type profile_type; ++ gcov_type topn_function_total_count_thres; ++}; ++ ++/* Member functions for extend_auto_profile. 
*/ ++ ++bool ++extend_auto_profile::auto_profile_exist (enum event_type type) ++{ ++ switch (type) ++ { ++ case INST_EXEC: ++ return event_func_map.count (INST_EXEC) != 0 ++ || event_loc_map.count (INST_EXEC) != 0; ++ case CACHE_MISSES: ++ return event_func_map.count (CACHE_MISSES) != 0 ++ || event_loc_map.count (CACHE_MISSES) != 0; ++ case PMU_EVENT: ++ return event_func_map.count (PMU_EVENT) != 0 ++ || event_loc_map.count (PMU_EVENT) != 0; ++ default: ++ return false; ++ } ++} ++ ++void ++extend_auto_profile::dump_event () ++{ ++ if (dump_file) ++ { ++ switch (profile_type) ++ { ++ case INST_EXEC: ++ fprintf (dump_file, "Processing event instruction execution.\n"); ++ break; ++ case CACHE_MISSES: ++ fprintf (dump_file, "Processing event cache misses.\n"); ++ break; ++ case PMU_EVENT: ++ fprintf (dump_file, "Processing other PMU events.\n"); ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++/* Return true if any profile data was read. */ ++ ++bool ++extend_auto_profile::read () ++{ ++ const char *event_files[EVENT_NUMBER] = {NULL}; ++ if (!get_all_profile_names (event_files)) ++ { ++ return false; ++ } ++ ++ /* Backup AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE since we will create ++ new ones for each event_type. */ ++ autofdo::string_table *string_table_afdo = afdo_string_table; ++ autofdo::autofdo_source_profile *source_profile_afdo = afdo_source_profile; ++ ++ for (unsigned i = 0; i < EVENT_NUMBER; i++) ++ { ++ if (event_files[i] == NULL) ++ { ++ continue; ++ } ++ profile_type = (enum event_type) i; ++ dump_event (); ++ gcov_close (); ++ auto_profile_file = event_files[i]; ++ read_profile (); ++ gcov_close (); ++ ++ topn_function_total_count_thres = param_llc_allocate_func_counts_threshold; ++ if (param_llc_allocate_func_topn > 0 && profile_type == PMU_EVENT) ++ { ++ topn_function_total_count_thres ++ = afdo_source_profile->calc_topn_function_total_count_thres ( ++ param_llc_allocate_func_topn); ++ } ++ ++ process_extend_source_profile (); ++ ++ delete afdo_source_profile; ++ delete afdo_string_table; ++ } ++ ++ /* Restore AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE. Function ++ END_AUTO_PROFILE will free them at the end of compilation. */ ++ afdo_string_table = string_table_afdo; ++ afdo_source_profile = source_profile_afdo; ++ return true; ++} ++ ++/* Helper functions. 
*/ ++ ++gcov_type ++extend_auto_profile::get_loc_count (location_t loc, event_type type) ++{ ++ event_loc_count_map::iterator event_iter = event_loc_map.find (type); ++ if (event_iter != event_loc_map.end ()) ++ { ++ loc_count_map::iterator loc_iter = event_iter->second.find (loc); ++ if (loc_iter != event_iter->second.end ()) ++ { ++ return loc_iter->second; ++ } ++ } ++ return 0; ++} ++ ++struct rank_info ++extend_auto_profile::get_func_rank (unsigned decl_uid, enum event_type type) ++{ ++ struct rank_info info = {0, 0}; ++ event_rank_map::iterator event_iter = func_rank.find (type); ++ if (event_iter != func_rank.end ()) ++ { ++ rank_map::iterator func_iter = event_iter->second.find (decl_uid); ++ if (func_iter != event_iter->second.end ()) ++ { ++ info.rank = func_iter->second; ++ info.total = event_iter->second.size (); ++ } ++ } ++ return info; ++} ++ ++gcov_type ++extend_auto_profile::get_func_count (unsigned decl_uid, event_type type) ++{ ++ event_func_count_map::iterator event_iter = event_func_map.find (type); ++ if (event_iter != event_func_map.end ()) ++ { ++ func_count_map::iterator func_iter = event_iter->second.find (decl_uid); ++ if (func_iter != event_iter->second.end ()) ++ { ++ return func_iter->second; ++ } ++ } ++ return 0; ++} ++ ++gcov_type ++extend_auto_profile::get_topn_function_total_count_thres () const ++{ ++ return topn_function_total_count_thres; ++} ++ ++static extend_auto_profile *extend_profile; ++ + /* Helper functions. */ + + /* Return the original name of NAME: strip the suffix that starts +@@ -483,7 +737,7 @@ string_table::get_index (const char *name) const + return iter->second; + } + +-/* Return the index of a given function DECL. Return -1 if DECL is not ++/* Return the index of a given function DECL. Return -1 if DECL is not + found in string table. */ + + int +@@ -917,6 +1171,31 @@ autofdo_source_profile::get_function_instance_by_inline_stack ( + return s; + } + ++/* Compute total count threshold of top functions in sampled data. */ ++ ++gcov_type ++autofdo_source_profile::calc_topn_function_total_count_thres ( ++ unsigned topn) const ++{ ++ std::set func_counts; ++ for (name_function_instance_map::const_iterator iter = map_.begin (); ++ iter != map_.end (); ++iter) ++ { ++ if (func_counts.size () < topn) ++ func_counts.insert (iter->second->total_count ()); ++ else if (*func_counts.begin () < iter->second->total_count ()) ++ { ++ func_counts.erase (func_counts.begin ()); ++ func_counts.insert (iter->second->total_count ()); ++ } ++ } ++ ++ gcov_type func_counts_topn = *func_counts.begin (); ++ if (func_counts.size () == topn ++ && param_llc_allocate_func_counts_threshold < func_counts_topn) ++ return func_counts_topn; ++} ++ + /* Module profile is only used by LIPO. Here we simply ignore it. 
*/ + + static void +@@ -1842,6 +2121,132 @@ auto_profile (void) + + return TODO_rebuild_cgraph_edges; + } ++ ++ ++void ++extend_auto_profile::rank_all_func () ++{ ++ std::vector > func_sorted; ++ event_func_count_map::iterator event_iter ++ = event_func_map.find (profile_type); ++ if (event_iter != event_func_map.end ()) ++ { ++ func_count_map::iterator func_iter; ++ for (func_iter = event_iter->second.begin (); ++ func_iter != event_iter->second.end (); func_iter++) ++ { ++ func_sorted.push_back (std::make_pair (func_iter->first, ++ func_iter->second)); ++ } ++ ++ std::sort (func_sorted.begin (), func_sorted.end (), event_count_cmp); ++ ++ for (unsigned i = 0; i < func_sorted.size (); ++i) ++ { ++ func_rank[profile_type][func_sorted[i].first] = i + 1; ++ } ++ } ++} ++ ++/* Iterate stmts in cfun and maintain its count to EVENT_LOC_MAP. */ ++ ++void ++extend_auto_profile::set_loc_count () ++{ ++ basic_block bb; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ gimple_stmt_iterator gsi; ++ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ count_info info; ++ gimple *stmt = gsi_stmt (gsi); ++ if (gimple_clobber_p (stmt) || is_gimple_debug (stmt)) ++ { ++ continue; ++ } ++ if (afdo_source_profile->get_count_info (stmt, &info)) ++ { ++ location_t loc = gimple_location (stmt); ++ event_loc_map[profile_type][loc] += info.count; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM); ++ fprintf (dump_file, "counts %ld\n", ++ event_loc_map[profile_type][loc]); ++ } ++ } ++ } ++ } ++} ++ ++/* Process data in extend_auto_source_profile, save them into two maps. ++ 1. gimple_location to count. ++ 2. function_index to count. */ ++void ++extend_auto_profile::process_extend_source_profile () ++{ ++ struct cgraph_node *node; ++ if (symtab->state == FINISHED) ++ { ++ return; ++ } ++ FOR_EACH_FUNCTION (node) ++ { ++ if (!gimple_has_body_p (node->decl) || node->inlined_to) ++ { ++ continue; ++ } ++ ++ /* Don't profile functions produced for builtin stuff. */ ++ if (DECL_SOURCE_LOCATION (node->decl) == BUILTINS_LOCATION) ++ { ++ continue; ++ } ++ ++ function *fn = DECL_STRUCT_FUNCTION (node->decl); ++ push_cfun (fn); ++ ++ const function_instance *s ++ = afdo_source_profile->get_function_instance_by_decl ( ++ current_function_decl); ++ ++ if (s == NULL) ++ { ++ pop_cfun (); ++ continue; ++ } ++ unsigned int decl_uid = DECL_UID (current_function_decl); ++ gcov_type count = s->total_count (); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Extend auto-profile for function %s.\n", ++ node->dump_name ()); ++ } ++ event_func_map[profile_type][decl_uid] += count; ++ set_loc_count (); ++ pop_cfun (); ++ } ++ rank_all_func (); ++} ++ ++/* Main entry of extend_auto_profile. */ ++ ++static void ++extend_source_profile () ++{ ++ extend_profile = autofdo::extend_auto_profile::create (); ++ if (dump_file) ++ { ++ if (extend_profile == NULL) ++ { ++ fprintf (dump_file, "No profile file is found.\n"); ++ return; ++ } ++ fprintf (dump_file, "Extend profile info generated.\n"); ++ } ++} + } /* namespace autofdo. */ + + /* Read the profile from the profile data file. */ +@@ -1870,6 +2275,48 @@ end_auto_profile (void) + profile_info = NULL; + } + ++/* Extern function to get profile info in other passes. 
*/ ++ ++bool ++profile_exist (enum event_type type) ++{ ++ return autofdo::extend_profile != NULL ++ && autofdo::extend_profile->auto_profile_exist (type); ++} ++ ++gcov_type ++event_get_loc_count (location_t loc, event_type type) ++{ ++ return autofdo::extend_profile->get_loc_count (loc, type); ++} ++ ++gcov_type ++event_get_func_count (unsigned decl_uid, event_type type) ++{ ++ return autofdo::extend_profile->get_func_count (decl_uid, type); ++} ++ ++struct rank_info ++event_get_func_rank (unsigned decl_uid, enum event_type type) ++{ ++ return autofdo::extend_profile->get_func_rank (decl_uid, type); ++} ++ ++gcov_type ++event_get_topn_function_total_count_thres () ++{ ++ return autofdo::extend_profile->get_topn_function_total_count_thres (); ++} ++ ++void ++free_extend_profile_info () ++{ ++ if (autofdo::extend_profile != NULL) ++ { ++ delete autofdo::extend_profile; ++ } ++} ++ + /* Returns TRUE if EDGE is hot enough to be inlined early. */ + + bool +@@ -1931,8 +2378,50 @@ public: + + } // anon namespace + ++namespace ++{ ++const pass_data pass_data_ipa_extend_auto_profile = ++{ ++ SIMPLE_IPA_PASS, /* type */ ++ "ex-afdo", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_IPA_EXTEND_AUTO_PROFILE, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ 0, /* todo_flags_finish */ ++}; ++ ++class pass_ipa_extend_auto_profile : public simple_ipa_opt_pass ++{ ++public: ++ pass_ipa_extend_auto_profile (gcc::context *ctxt) ++ : simple_ipa_opt_pass (pass_data_ipa_extend_auto_profile, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) {return (flag_ipa_extend_auto_profile > 0);} ++ virtual unsigned int execute (function *); ++ ++}; ++ ++unsigned int ++pass_ipa_extend_auto_profile::execute (function *fun) ++{ ++ autofdo::extend_source_profile (); ++ return 0; ++} ++} // anon namespace ++ + simple_ipa_opt_pass * + make_pass_ipa_auto_profile (gcc::context *ctxt) + { + return new pass_ipa_auto_profile (ctxt); + } ++ ++simple_ipa_opt_pass * ++make_pass_ipa_extend_auto_profile (gcc::context *ctxt) ++{ ++ return new pass_ipa_extend_auto_profile (ctxt); ++} +diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h +index bf3f90f2f..dea0b18e6 100644 +--- a/gcc/auto-profile.h ++++ b/gcc/auto-profile.h +@@ -21,6 +21,14 @@ along with GCC; see the file COPYING3. If not see + #ifndef AUTO_PROFILE_H + #define AUTO_PROFILE_H + ++enum event_type ++{ ++ INST_EXEC = 0, ++ CACHE_MISSES, ++ PMU_EVENT, ++ EVENT_NUMBER ++}; ++ + /* Read, process, finalize AutoFDO data structures. */ + extern void read_autofdo_file (void); + extern void end_auto_profile (void); +@@ -28,4 +36,26 @@ extern void end_auto_profile (void); + /* Returns TRUE if EDGE is hot enough to be inlined early. */ + extern bool afdo_callsite_hot_enough_for_early_inline (struct cgraph_edge *); + ++/* Chcek if profile exists before using this profile. */ ++extern bool profile_exist (enum event_type); ++ ++/* Given func decl_uid or gimple location and event_type, return count. ++ Count is 0 if function or gimple is not sampled. */ ++extern gcov_type event_get_func_count (unsigned, enum event_type); ++extern gcov_type event_get_loc_count (location_t, enum event_type); ++extern gcov_type event_get_topn_function_total_count_thres (); ++ ++struct rank_info ++{ ++ unsigned total; ++ unsigned rank; ++}; ++ ++/* Given function decl_uid and event type, return rank_info. Rank_info ++ is {0, 0} if function was not sampled. 
*/ ++extern struct rank_info event_get_func_rank (unsigned, enum event_type); ++ ++/* Free memory allocated by autofdo::extern_profile. */ ++extern void free_extend_profile_info (); ++ + #endif /* AUTO_PROFILE_H */ +diff --git a/gcc/builtins.cc b/gcc/builtins.cc +index 57929a42b..dc2e9c3f3 100644 +--- a/gcc/builtins.cc ++++ b/gcc/builtins.cc +@@ -1352,6 +1352,85 @@ expand_builtin_prefetch (tree exp) + emit_insn (op0); + } + ++/* Expand a call to __builtin_prefetch_full. */ ++ ++static void ++expand_builtin_prefetch_full (tree exp) ++{ ++ tree arg0, arg1, arg2; ++ int nargs; ++ rtx op0, op1, op2; ++ ++ if (!validate_arglist (exp, POINTER_TYPE, 0)) ++ return; ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ ++ /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to ++ zero (read) and argument 2 (locality) defaults to 3 (high degree of ++ locality). */ ++ nargs = call_expr_nargs (exp); ++ if (nargs > 1) ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ else ++ arg1 = integer_zero_node; ++ if (nargs > 2) ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ else ++ arg2 = integer_three_node; ++ ++ /* Argument 0 is an address. */ ++ op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); ++ ++ /* Argument 1 (read/write flag) must be a compile-time constant int. */ ++ if (TREE_CODE (arg1) != INTEGER_CST) ++ { ++ error ("second argument to %<__builtin_prefetch_full%> must be a " ++ "constant"); ++ arg1 = integer_zero_node; ++ } ++ op1 = expand_normal (arg1); ++ /* Argument 1 must be either zero or one. */ ++ if (INTVAL (op1) != 0 && INTVAL (op1) != 1) ++ { ++ warning (0, "invalid second argument to %<__builtin_prefetch_full%>;" ++ " using zero"); ++ op1 = const0_rtx; ++ } ++ ++ /* Argument 2 (locality) must be a compile-time constant int. */ ++ if (TREE_CODE (arg2) != INTEGER_CST) ++ { ++ error ("third argument to %<__builtin_prefetch_full%> must be a " ++ "constant"); ++ arg2 = integer_zero_node; ++ } ++ op2 = expand_normal (arg2); ++ /* Argument 2 must be 0-7. */ ++ if (INTVAL (op2) < 0 || INTVAL (op2) > 7) ++ { ++ warning (0, "invalid third argument to %<__builtin_prefetch_full%>; " ++ "using zero"); ++ op2 = const0_rtx; ++ } ++ ++ if (targetm.have_prefetch_full ()) ++ { ++ class expand_operand ops[3]; ++ ++ create_address_operand (&ops[0], op0); ++ create_integer_operand (&ops[1], INTVAL (op1)); ++ create_integer_operand (&ops[2], INTVAL (op2)); ++ if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops)) ++ return; ++ } ++ ++ /* Don't do anything with direct references to volatile memory, but ++ generate code to handle other side effects. */ ++ if (!MEM_P (op0) && side_effects_p (op0)) ++ emit_insn (op0); ++} ++ + /* Get a MEM rtx for expression EXP which is the address of an operand + to be used in a string instruction (cmpstrsi, cpymemsi, ..). 
LEN is + the maximum length of the block of memory that might be accessed or +@@ -7598,6 +7677,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + case BUILT_IN_PREFETCH: + expand_builtin_prefetch (exp); + return const0_rtx; ++ case BUILT_IN_PREFETCH_FULL: ++ expand_builtin_prefetch_full (exp); ++ return const0_rtx; + + case BUILT_IN_INIT_TRAMPOLINE: + return expand_builtin_init_trampoline (exp, true); +diff --git a/gcc/builtins.def b/gcc/builtins.def +index 005976f34..f2e0c357d 100644 +--- a/gcc/builtins.def ++++ b/gcc/builtins.def +@@ -924,6 +924,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C + DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) + DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) + DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) ++DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) + DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index d2714e20c..794bc3ecc 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -272,6 +272,9 @@ public: + the basic-block from being collected but its index can still be + reused. */ + basic_block former_header; ++ ++ /* Number of latch executions from vectorization. */ ++ tree vec_nb_iterations; + }; + + /* Set if the loop is known to be infinite. */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 6ab7ba4cc..e6ffa1c58 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1148,6 +1148,26 @@ Common Joined RejectNegative Var(auto_profile_file) + Use sample profile information for call graph node weights. The profile + file is specified in the argument. + ++fcache-misses-profile ++Common Var(flag_cache_misses_profile) ++Use sample profile information for source code cache miss count. The default ++profile file is cmsdata.gcov in `pwd`. ++ ++fcache-misses-profile= ++Common Joined RejectNegative Var(cache_misses_profile_file) ++Use sample profile information for source code cache miss count. The profile ++file is specified in the argument. ++ ++fadditional-profile ++Common Var(flag_additional_profile) ++Use additional PMU-event sample profile information for source code bb count. ++The default profile file is addldata.gcov in `pwd`. ++ ++fadditional-profile= ++Common Joined RejectNegative Var(additional_profile_file) ++Use additional PMU-event sample profile information for source code bb count. ++The profile file is specified in the argument. ++ + ; -fcheck-bounds causes gcc to generate array bounds checks. + ; For C, C++ and ObjC: defaults off. + ; For Java: defaults to on. +@@ -2074,6 +2094,10 @@ fipa-struct-sfc-shadow + Common Var(flag_ipa_struct_sfc_shadow) Init(0) Optimization + Enable field shadowing optimization in static struct field compression. + ++fipa-extend-auto-profile ++Common Var(flag_ipa_extend_auto_profile) ++Use sample profile information for source code. ++ + fipa-vrp + Common Var(flag_ipa_vrp) Optimization + Perform IPA Value Range Propagation. 
+@@ -2424,6 +2448,10 @@ fipa-prefetch + Common Var(flag_ipa_prefetch) Init(0) Optimization + Generate prefetch instructions, if available, using IPA info. + ++fllc-allocate ++Common Var(flag_llc_allocate) Init(-1) Optimization ++Generate LLC hint instructions. ++ + fprofile + Common Var(profile_flag) + Enable basic program profiling code. +diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +index cbb844fbc..af0881f7a 100644 +--- a/gcc/config/aarch64/aarch64-protos.h ++++ b/gcc/config/aarch64/aarch64-protos.h +@@ -702,12 +702,16 @@ extern struct tune_params aarch64_tune_params; + T (PLDL2STRM, pldl2strm, 3) \ + T (PLDL3KEEP, pldl3keep, 4) \ + T (PLDL3STRM, pldl3strm, 5) \ ++ T (PLDL4KEEP, pldl4keep, 6) \ ++ T (PLDL4STRM, pldl4strm, 7) \ + T (PSTL1KEEP, pstl1keep, 8) \ + T (PSTL1STRM, pstl1strm, 9) \ + T (PSTL2KEEP, pstl2keep, 10) \ + T (PSTL2STRM, pstl2strm, 11) \ + T (PSTL3KEEP, pstl3keep, 12) \ +- T (PSTL3STRM, pstl3strm, 13) ++ T (PSTL3STRM, pstl3strm, 13) \ ++ T (PSTL4KEEP, pstl4keep, 14) \ ++ T (PSTL4STRM, pstl4strm, 15) + + #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, + enum aarch64_svpattern { +diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md +index a8a5dc3a2..7808abf70 100644 +--- a/gcc/config/aarch64/aarch64-sve.md ++++ b/gcc/config/aarch64/aarch64-sve.md +@@ -1952,7 +1952,7 @@ + (define_insn "@aarch64_sve_prefetch" + [(prefetch (unspec:DI + [(match_operand: 0 "register_operand" "Upl") +- (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") ++ (match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP") + (match_operand:DI 2 "const_int_operand")] + UNSPEC_SVE_PREFETCH) + (match_operand:DI 3 "const_int_operand") +@@ -1985,14 +1985,14 @@ + ;; 6: the prefetch operator (an svprfop) + ;; 7: the normal RTL prefetch rw flag + ;; 8: the normal RTL prefetch locality value +-(define_insn "@aarch64_sve_gather_prefetch" ++(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2000,12 +2000,12 @@ + "TARGET_SVE && TARGET_NON_STREAMING" + { + static const char *const insns[][2] = { +- "prf", "%0, [%2.s]", +- "prf", "%0, [%2.s, #%1]", ++ "prf", "%0, [%2.s]", ++ "prf", "%0, [%2.s, #%1]", + "prfb", "%0, [%1, %2.s, sxtw]", + "prfb", "%0, [%1, %2.s, uxtw]", +- "prf", "%0, [%1, %2.s, sxtw %p4]", +- "prf", "%0, [%1, %2.s, uxtw %p4]" ++ "prf", "%0, [%1, %2.s, sxtw %p4]", ++ "prf", "%0, [%1, %2.s, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2014,14 +2014,14 @@ + + ;; Predicated gather prefetches for 64-bit elements. The value of operand 3 + ;; doesn't matter in this case. 
+-(define_insn "@aarch64_sve_gather_prefetch" ++(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2029,10 +2029,10 @@ + "TARGET_SVE && TARGET_NON_STREAMING" + { + static const char *const insns[][2] = { +- "prf", "%0, [%2.d]", +- "prf", "%0, [%2.d, #%1]", ++ "prf", "%0, [%2.d]", ++ "prf", "%0, [%2.d, #%1]", + "prfb", "%0, [%1, %2.d]", +- "prf", "%0, [%1, %2.d, lsl %p4]" ++ "prf", "%0, [%1, %2.d, lsl %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2040,7 +2040,7 @@ + ) + + ;; Likewise, but with the offset being sign-extended from 32 bits. +-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" ++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") +@@ -2051,8 +2051,8 @@ + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2061,7 +2061,7 @@ + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, sxtw]", +- "prf", "%0, [%1, %2.d, sxtw %p4]" ++ "prf", "%0, [%1, %2.d, sxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2073,7 +2073,7 @@ + ) + + ;; Likewise, but with the offset being zero-extended from 32 bits. 
+-(define_insn "*aarch64_sve_gather_prefetch_uxtw" ++(define_insn "*aarch64_sve_gather_prefetch_uxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") +@@ -2081,8 +2081,8 @@ + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2091,7 +2091,7 @@ + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, uxtw]", +- "prf", "%0, [%1, %2.d, uxtw %p4]" ++ "prf", "%0, [%1, %2.d, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e9c387b24..a06c2c515 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -4408,6 +4408,13 @@ aarch64_sve_data_mode_p (machine_mode mode) + return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; + } + ++/* Return true if MODE is an full SVE data vector mode. */ ++static bool ++aarch64_full_sve_data_mode_p (machine_mode mode) ++{ ++ return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA; ++} ++ + /* Return the number of defined bytes in one constituent vector of + SVE mode MODE, which has vector flags VEC_FLAGS. */ + static poly_int64 +@@ -31796,6 +31803,17 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_ASM_FUNCTION_EPILOGUE + #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks + ++#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH ++#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch ++ ++#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \ ++ code_for_aarch64_sve_gather_prefetch ++ ++#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \ ++ aarch64_full_sve_data_mode_p ++ + #undef TARGET_HAVE_SHADOW_CALL_STACK + #define TARGET_HAVE_SHADOW_CALL_STACK true + +diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +index 2f46bc793..69d296556 100644 +--- a/gcc/config/aarch64/aarch64.md ++++ b/gcc/config/aarch64/aarch64.md +@@ -925,6 +925,45 @@ + [(set_attr "type" "load_4")] + ) + ++(define_insn "prefetch_full" ++ [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp") ++ (match_operand:QI 1 "const_int_operand" "") ++ (match_operand:QI 2 "const_int_operand" ""))] ++ "" ++ { ++ const char * pftype[2][8] = ++ { ++ {"prfm\\tPLDL1KEEP, %0", ++ "prfm\\tPLDL1STRM, %0", ++ "prfm\\tPLDL2KEEP, %0", ++ "prfm\\tPLDL2STRM, %0", ++ "prfm\\tPLDL3KEEP, %0", ++ "prfm\\tPLDL3STRM, %0", ++ "prfm\\tPLDL4KEEP, %0", ++ "prfm\\tPLDL4STRM, %0"}, ++ {"prfm\\tPSTL1KEEP, %0", ++ "prfm\\tPSTL1STRM, %0", ++ "prfm\\tPSTL2KEEP, %0", ++ "prfm\\tPSTL2STRM, %0", ++ "prfm\\tPSTL3KEEP, %0", ++ "prfm\\tPSTL3STRM, %0", ++ "prfm\\tPSTL4KEEP, %0", ++ "prfm\\tPSTL4STRM, %0"}, ++ }; ++ ++ int prfop = INTVAL (operands[2]); ++ ++ gcc_assert (IN_RANGE (prfop, 0, 7)); ++ ++ /* PRFM accepts the same addresses as a 64-bit LDR so wrap ++ the address into a DImode MEM so that 
aarch64_print_operand knows ++ how to print it. */ ++ operands[0] = gen_rtx_MEM (DImode, operands[0]); ++ return pftype[INTVAL (operands[1])][prfop]; ++ } ++ [(set_attr "type" "load_4")] ++) ++ + (define_insn "trap" + [(trap_if (const_int 1) (const_int 8))] + "" +diff --git a/gcc/dce.cc b/gcc/dce.cc +index 6676cbcd4..964a0a6d0 100644 +--- a/gcc/dce.cc ++++ b/gcc/dce.cc +@@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body) + switch (GET_CODE (body)) + { + case PREFETCH: ++ case PREFETCH_FULL: + case TRAP_IF: + /* The UNSPEC case was added here because the ia-64 claims that + USEs do not work after reload and generates UNSPECS rather +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 50bbbbc42..16ada7aae 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -6278,6 +6278,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter + stores. + @end deftypefn + ++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg}) ++This hook should return the decl of a function that implements the ++vectorized variant of the function with the @code{combined_fn} code ++@var{code} or @code{NULL_TREE} if such a function is not available. ++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_form}) ++This hook should return the decl of a function that implements the ++vectorized variant of the function with the @code{combined_fn} code ++@var{code} or @code{NULL_TREE} if such a function is not available. ++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg}) ++This hook should return true if the target hardware architecture ++supports a full SVE data vector mode. ++@end deftypefn ++ + @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}) + This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float} + fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index cfda60304..88db8752e 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -4190,6 +4190,12 @@ address; but often a machine-dependent strategy can generate better code. + + @hook TARGET_VECTORIZE_BUILTIN_SCATTER + ++@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH ++ ++@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++ ++@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++ + @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN + + @hook TARGET_SIMD_CLONE_ADJUST +diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc +index 8b1733e20..19811106f 100644 +--- a/gcc/internal-fn.cc ++++ b/gcc/internal-fn.cc +@@ -107,11 +107,13 @@ init_internal_fns () + direct_internal_fn. 
*/ + #define not_direct { -2, -2, false } + #define mask_load_direct { -1, 2, false } ++#define mask_prefetch_direct { -1, 2, false } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { 3, 1, false } + #define len_load_direct { -1, -1, false } + #define mask_store_direct { 3, 2, false } ++#define gather_prefetch_direct { 3, 1, false } + #define store_lanes_direct { 0, 0, false } + #define mask_store_lanes_direct { 0, 0, false } + #define vec_cond_mask_direct { 1, 0, false } +@@ -2745,6 +2747,53 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + #define expand_len_load_optab_fn expand_partial_load_optab_fn + ++/* Expand MASK_PREFETCH call STMT using optab OPTAB. ++ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); ++ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); ++*/ ++ ++static void ++expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ tree base = gimple_call_arg (stmt, 0); ++ if (base == NULL_TREE) ++ return; ++ ++ tree maskt = gimple_call_arg (stmt, 2); ++ tree target = gimple_call_arg (stmt, 3); ++ tree prfop = gimple_call_arg (stmt, 4); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); ++ ++ rtx mask = expand_normal (maskt); ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ ++ unsigned i = 0; ++ class expand_operand ops[5]; ++ create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB. */ + + static void +@@ -3402,6 +3451,70 @@ contains_call_div_mod (rtx_insn *insn) + return false; + } + ++/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. ++ vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87); ++ .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4); ++*/ ++ ++static void ++expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_gather_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ /* Extracting tree nodes, only expand for scalar base and vector index. 
*/ ++ tree base = gimple_call_arg (stmt, 0); ++ if (VECTOR_TYPE_P (TREE_TYPE (base))) ++ return; ++ tree offset = gimple_call_arg (stmt, 1); ++ if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) ++ return; ++ ++ tree scale = gimple_call_arg (stmt, 2); ++ tree mask = gimple_call_arg (stmt, 4); ++ tree target = gimple_call_arg (stmt, 5); ++ tree prfop = gimple_call_arg (stmt, 6); ++ ++ /* Convert to the rtx node. */ ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ rtx offset_rtx = expand_normal (offset); ++ rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); ++ rtx mask_rtx = expand_normal (mask); ++ HOST_WIDE_INT scale_int = tree_to_shwi (scale); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ /* add operand. */ ++ unsigned int i = 0; ++ class expand_operand ops[9]; ++ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); ++ /* Check whether the index has unsigned. */ ++ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); ++ create_integer_operand (&ops[i++], scale_int); ++ create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ ++ machine_mode reg_mode = GET_MODE (offset_rtx); ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_gather_prefetch ++ (m_mode, reg_mode); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand DIVMOD() using: + a) optab handler for udivmod/sdivmod if it is available. + b) If optab_handler doesn't exist, generate call to +@@ -3767,10 +3880,12 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, + #define direct_cond_binary_optab_supported_p direct_optab_supported_p + #define direct_cond_ternary_optab_supported_p direct_optab_supported_p + #define direct_mask_load_optab_supported_p convert_optab_supported_p ++#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p + #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_gather_load_optab_supported_p convert_optab_supported_p + #define direct_len_load_optab_supported_p direct_optab_supported_p ++#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p + #define direct_mask_store_optab_supported_p convert_optab_supported_p + #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p +diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def +index d2d550d35..05fc50328 100644 +--- a/gcc/internal-fn.def ++++ b/gcc/internal-fn.def +@@ -121,6 +121,8 @@ along with GCC; see the file COPYING3. 
If not see + #endif + + DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) ++DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ maskprefetch, mask_prefetch) + DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) + DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + vec_mask_load_lanes, mask_load_lanes) +@@ -128,6 +130,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) + DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, + mask_gather_load, gather_load) ++DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ mask_gather_prefetch, gather_prefetch) + + DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load) + +diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc +index 2642df91e..222fe6465 100644 +--- a/gcc/ipa-pure-const.cc ++++ b/gcc/ipa-pure-const.cc +@@ -534,6 +534,7 @@ builtin_safe_for_const_function_p (bool *looping, tree callee) + *looping = false; + return true; + case BUILT_IN_PREFETCH: ++ case BUILT_IN_PREFETCH_FULL: + *looping = true; + return true; + default: +diff --git a/gcc/optabs.def b/gcc/optabs.def +index dbf529434..8ca25a5cc 100644 +--- a/gcc/optabs.def ++++ b/gcc/optabs.def +@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") + OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") + OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") + OPTAB_CD(maskload_optab, "maskload$a$b") ++OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") + OPTAB_CD(maskstore_optab, "maskstore$a$b") + OPTAB_CD(gather_load_optab, "gather_load$a$b") + OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") ++OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") + OPTAB_CD(scatter_store_optab, "scatter_store$a$b") + OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") + OPTAB_CD(vec_extract_optab, "vec_extract$a$b") +diff --git a/gcc/opts.cc b/gcc/opts.cc +index 2433ace06..432b822e8 100644 +--- a/gcc/opts.cc ++++ b/gcc/opts.cc +@@ -2108,6 +2108,13 @@ enable_fdo_optimizations (struct gcc_options *opts, + SET_OPTION_IF_UNSET (opts, opts_set, flag_tree_loop_distribution, value); + } + ++static void ++set_cache_misses_profile_params (struct gcc_options *opts, ++ struct gcc_options *opts_set) ++{ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); ++} ++ + /* Enable cfgo-related flags. */ + + static void +@@ -3143,10 +3150,20 @@ common_handle_option (struct gcc_options *opts, + /* FALLTHRU */ + case OPT_fauto_profile: + enable_fdo_optimizations (opts, opts_set, value); +- /* 2 is special and means flag_profile_correction trun on by +- -fauto-profile. */ ++ /* 2 is special and means flag_profile_correction trun on by ++ -fauto-profile. */ + SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, +- (value ? 2 : 0)); ++ (value ? 2 : 0)); ++ break; ++ ++ case OPT_fadditional_profile_: ++ opts->x_additional_profile_file = xstrdup (arg); ++ opts->x_flag_additional_profile = true; ++ value = true; ++ /* No break here - do -fadditional-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fadditional_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; + break; + + case OPT_fipa_struct_reorg_: +@@ -3155,17 +3172,36 @@ common_handle_option (struct gcc_options *opts, + case OPT_fipa_struct_reorg: + opts->x_flag_ipa_struct_reorg = value; + if (value && !opts->x_struct_layout_optimize_level) +- { +- /* Using the -fipa-struct-reorg option is equivalent to using +- -fipa-struct-reorg=1. 
*/ +- opts->x_struct_layout_optimize_level = 1; +- } ++ { ++ /* Using the -fipa-struct-reorg option is equivalent to using ++ -fipa-struct-reorg=1. */ ++ opts->x_struct_layout_optimize_level = 1; ++ } + break; + + case OPT_fipa_reorder_fields: + SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_reorg, value); + break; + ++ case OPT_fipa_extend_auto_profile: ++ opts->x_flag_ipa_extend_auto_profile = opts->x_flag_cache_misses_profile ++ ? true : value; ++ break; ++ ++ case OPT_fcache_misses_profile_: ++ opts->x_cache_misses_profile_file = xstrdup (arg); ++ opts->x_flag_cache_misses_profile = true; ++ value = true; ++ /* No break here - do -fcache-misses-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fcache_misses_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; ++ if (value) ++ { ++ set_cache_misses_profile_params (opts, opts_set); ++ } ++ break; ++ + case OPT_fcfgo_profile_generate_: + opts->x_profile_data_prefix = xstrdup (arg); + value = true; +diff --git a/gcc/params.opt b/gcc/params.opt +index e5472dfc8..e06e50611 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1262,4 +1262,66 @@ Range for depended ldp search in split-ldp-stp path. + Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization + Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . + ++-param=mem-access-ratio= ++Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization ++Memory access ratio (in percent). ++ ++-param=mem-access-num= ++Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization ++Memory access num. ++ ++-param=prefetch-offset= ++Common Joined UInteger Var(param_prefetch_offset) Init(1024) ++IntegerRange(1, 999999) Param Optimization ++Prefetch Offset, which is usually a power of two due to cache line size. ++ ++-param=branch-prob-threshold= ++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) ++Param Optimization ++High Execution Rate Branch Threshold. ++ ++-param=issue-topn= ++Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization ++Issue topn LLC mem_ref hint. ++ ++-param=force-issue= ++Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. ++ ++-param=llc-capacity-per-core= ++Common Joined UInteger Var(param_llc_capacity_per_core) Init(107) IntegerRange(0, 999999) Param ++LLC capacity per core. ++ ++-param=filter-kernels= ++Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param ++Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks ++through edges with branch probability no less than param_branch_prob_threshold. ++ ++-param=outer-loop-nums= ++Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param ++Maximum number of outer loops allowed to extend outer loops for loops that ++cannot recognize inner loop boundaries. ++ ++-param=llc-level= ++Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) ++Param Optimization ++Specifies the HBM cache level. ++ ++-param=filter-mode= ++Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param ++Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. 
++ ++-param=transfer-footprint= ++Common Joined UInteger Var(param_transfer_footprint) Init(1) IntegerRange(0, 1) Param ++Allow transferring the firstly calculated footprint expression to the target memory reference ++from which it is impossible to retrieve the foortprint. ++ ++-param=llc-allocate-func-topn= ++Common Joined UInteger Var(param_llc_allocate_func_topn) Init(0) Param Optimization ++TopN functions of pmu counts to be analyzed in LLC allocation. ++ ++-param=llc-allocate-func-counts-threshold= ++Common Joined UInteger Var(param_llc_allocate_func_counts_threshold) Init(1) Param Optimization ++Threshold functions of pmu counts to be analyzed in LLC allocation. ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/passes.def b/gcc/passes.def +index 90643d533..49001adde 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -141,6 +141,7 @@ along with GCC; see the file COPYING3. If not see + + NEXT_PASS (pass_target_clone); + NEXT_PASS (pass_ipa_auto_profile); ++ NEXT_PASS (pass_ipa_extend_auto_profile); + NEXT_PASS (pass_ipa_tree_profile); + PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile) + NEXT_PASS (pass_feedback_split_functions); +@@ -325,6 +326,7 @@ along with GCC; see the file COPYING3. If not see + /* Run IVOPTs after the last pass that uses data-reference analysis + as that doesn't handle TARGET_MEM_REFs. */ + NEXT_PASS (pass_iv_optimize); ++ NEXT_PASS (pass_llc_allocate); + NEXT_PASS (pass_lim); + NEXT_PASS (pass_tree_loop_done); + POP_INSERT_PASSES () +diff --git a/gcc/print-rtl.cc b/gcc/print-rtl.cc +index 636113d5b..b7506514a 100644 +--- a/gcc/print-rtl.cc ++++ b/gcc/print-rtl.cc +@@ -1579,6 +1579,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose) + op[1] = XEXP (x, 1); + op[2] = XEXP (x, 2); + break; ++ case PREFETCH_FULL: ++ fun = "prefetch_full"; ++ op[0] = XEXP (x, 0); ++ op[1] = XEXP (x, 1); ++ op[2] = XEXP (x, 2); ++ break; + case UNSPEC: + case UNSPEC_VOLATILE: + { +diff --git a/gcc/rtl.def b/gcc/rtl.def +index 08e31fa35..78ec1a021 100644 +--- a/gcc/rtl.def ++++ b/gcc/rtl.def +@@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA) + whose prefetch instructions do not support them. */ + DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA) + ++/* Memory prefetch, with attributes supported on some targets. ++ Operand 1 is the address of the memory to fetch. ++ Operand 2 is 1 for a write access, 0 otherwise. ++ Operand 3 is the level of prfop. ++ ++ The attributes specified by operands 2 and 3 are ignored for targets ++ whose prefetch instructions do not support them. */ ++DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA) ++ + /* ---------------------------------------------------------------------- + At the top level of an instruction (perhaps under PARALLEL). + ---------------------------------------------------------------------- */ +diff --git a/gcc/rtl.h b/gcc/rtl.h +index a0db225cb..844e1a7c3 100644 +--- a/gcc/rtl.h ++++ b/gcc/rtl.h +@@ -2814,6 +2814,10 @@ do { \ + #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \ + (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil) + ++/* True if RTX is flagged to be a scheduling barrier. */ ++#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \ ++ (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL)->volatil) ++ + /* Indicate whether the machine has any sort of auto increment addressing. + If not, we can avoid checking for REG_INC notes. 
*/ + +diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc +index c436c640c..7f5646ce7 100644 +--- a/gcc/rtlanal.cc ++++ b/gcc/rtlanal.cc +@@ -1198,6 +1198,7 @@ reg_referenced_p (const_rtx x, const_rtx body) + return reg_overlap_mentioned_p (x, TRAP_CONDITION (body)); + + case PREFETCH: ++ case PREFETCH_FULL: + return reg_overlap_mentioned_p (x, XEXP (body, 0)); + + case UNSPEC: +@@ -2042,6 +2043,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data) + return; + + case PREFETCH: ++ case PREFETCH_FULL: + (*fun) (&XEXP (body, 0), data); + return; + +diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc +index 948aa0c3b..db453fb9b 100644 +--- a/gcc/sched-deps.cc ++++ b/gcc/sched-deps.cc +@@ -2705,7 +2705,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn) + break; + + case PREFETCH: +- if (PREFETCH_SCHEDULE_BARRIER_P (x)) ++ case PREFETCH_FULL: ++ if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x)) ++ || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x))) + reg_pending_barrier = TRUE_BARRIER; + /* Prefetch insn contains addresses only. So if the prefetch + address has no registers, there will be no dependencies on +diff --git a/gcc/target-insns.def b/gcc/target-insns.def +index de8c0092f..9cfa19475 100644 +--- a/gcc/target-insns.def ++++ b/gcc/target-insns.def +@@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) + DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) ++DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (probe_stack, (rtx x0)) + DEF_TARGET_INSN (probe_stack_address, (rtx x0)) + DEF_TARGET_INSN (prologue, (void)) +diff --git a/gcc/target.def b/gcc/target.def +index 142858fa3..646489540 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2064,6 +2064,37 @@ it is for the vector version.", + (vec_info *vinfo, bool costing_for_scalar), + default_vectorize_create_costs) + ++/* Function for vector prefetch operation. */ ++DEFHOOK ++(code_for_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode arg), ++ NULL) ++ ++/* Function for vector gather prefetch operation. */ ++DEFHOOK ++(code_for_gather_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode mode_to, machine_mode mode_form), ++ NULL) ++ ++/* Function to check whether the target hardware architecture supports ++ a full SVE data vector mode. 
*/ ++DEFHOOK ++(prefetch_handleable_mode_p, ++ "This hook should return true if the target hardware architecture\n\ ++supports a full SVE data vector mode.", ++ bool, (machine_mode arg), ++ NULL) ++ + HOOK_VECTOR_END (vectorize) + + #undef HOOK_PREFIX +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..1793ba9d1 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 1997-2022 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib g++-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +new file mode 100644 +index 000000000..b5bf69510 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +@@ -0,0 +1,52 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */ ++#include "multidim_array.h" ++ ++class Input ++{ ++ public: ++ int metadata_offset = 13; ++ int exp_nr_images = 1; ++ MultidimArray exp_Mweight; ++ void convertAllSquaredDifferencesToWeights(); ++}; ++ ++int main() ++{ ++ clock_t start = clock(); ++ Input input; ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; ++i) ++ { ++ input.convertAllSquaredDifferencesToWeights(); ++ } ++ return 0; ++} ++ ++void Input::convertAllSquaredDifferencesToWeights() ++{ ++ for (int img_id = 0; img_id < exp_nr_images; img_id++) ++ { ++ int my_metadata_offset = metadata_offset + img_id; ++ MultidimArray sorted_weight; ++ ++ exp_Mweight.getRow(img_id, sorted_weight); ++ long int np = 0; ++ FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight) ++ { ++ if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.) 
++ { ++ DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \ ++ sorted_weight, n); ++ np++; ++ } ++ } ++ } ++} ++ ++ ++ ++/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +new file mode 100644 +index 000000000..682f24703 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +@@ -0,0 +1,186 @@ ++#ifndef MULTIDIM_ARRAY_H ++#define MULTIDIM_ARRAY_H ++ ++#include ++ ++#define RELION_ALIGNED_MALLOC malloc ++#define RELION_ALIGNED_FREE free ++ ++#define STARTINGX(v) ((v).xinit) ++#define STARTINGY(v) ((v).yinit) ++#define NZYXSIZE(v) ((v).nzyxdim) ++ ++#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)]) ++#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \ ++ for (long int n=0; n ++class MultidimArray ++{ ++public: ++ T* data; ++ bool destroyData; ++ long int ndim; ++ long int zdim; ++ long int ydim; ++ long int xdim; ++ long int yxdim; ++ long int zyxdim; ++ long int nzyxdim; ++ long int zinit; ++ long int yinit; ++ long int xinit; ++ long int nzyxdimAlloc; ++ ++public: ++ void clear() ++ { ++ coreDeallocate(); ++ coreInit(); ++ } ++ ++ void coreInit() ++ { ++ xdim=0; ++ yxdim=0; ++ zyxdim=0; ++ nzyxdim=0; ++ ydim=1; ++ zdim=1; ++ ndim=1; ++ zinit=0; ++ yinit=0; ++ xinit=0; ++ data=NULL; ++ nzyxdimAlloc = 0; ++ destroyData=true; ++ } ++ ++ void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim) ++ { ++ if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0) ++ { ++ clear(); ++ return; ++ } ++ ++ ndim=_ndim; ++ zdim=_zdim; ++ ydim=_ydim; ++ xdim=_xdim; ++ yxdim=ydim*xdim; ++ zyxdim=zdim*yxdim; ++ nzyxdim=ndim*zyxdim; ++ ++ coreAllocate(); ++ } ++ ++ void coreAllocate() ++ { ++ data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim); ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void coreDeallocate() ++ { ++ if (data != NULL && destroyData) ++ { ++ RELION_ALIGNED_FREE(data); ++ } ++ data=NULL; ++ nzyxdimAlloc = 0; ++ } ++ ++ void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim) ++ { ++ if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL) ++ { ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ return; ++ } ++ ++ if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0) ++ { ++ clear(); ++ return; ++ } ++ ++ if (NZYXSIZE(*this) > 0 && data == NULL) ++ { ++ coreAllocate(); ++ return; ++ } ++ ++ size_t YXdim=Ydim*Xdim; ++ size_t ZYXdim=Zdim*YXdim; ++ size_t NZYXdim=Ndim*ZYXdim; ++ ++ T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim); ++ for (long int l = 0; l < Ndim; l++) ++ for (long int k = 0; k < Zdim; k++) ++ for (long int i = 0; i < Ydim; i++) ++ for (long int j = 0; j < Xdim; j++) ++ { ++ T val; ++ new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val; ++ } ++ coreDeallocate(); ++ ++ data = new_data; ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void resize(long int Xdim) ++ { ++ resize(1, 1, 1, Xdim); ++ } ++ ++ inline T& operator()(long int i, long int j) const ++ { ++ return A2D_ELEM(*this, i, j); ++ } ++ ++ inline T& operator()(long int i) const ++ { ++ return 
A1D_ELEM(*this, i); ++ } ++ ++ void getRow(long int i, MultidimArray& v) const ++ { ++ if (xdim == 0 || ydim == 0) ++ { ++ v.clear(); ++ return; ++ } ++ ++ v.resize(xdim); ++ for (long int j = 0; j < xdim; j++) ++ v(j) = (*this)(i, j); ++ } ++}; ++ ++#endif /* MULTIDIM_ARRAY_H */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +new file mode 100644 +index 000000000..091e654f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2 --param branch-prob-threshold=50 --param filter-mode=0" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 100000 ++ ++int A_i[N]; ++int A_j[N]; ++double A_data[N]; ++double x_data[N]; ++double y_data[N]; ++int num_rows = N; ++ ++void ++MatMult (int *A_i, int *A_j, double *A_data, double *x_data, ++ int num_rows, double *y_data) ++{ ++ int i = 0; ++ int j = 0; ++ double temp = 0; ++ for (i = 0; i < num_rows; i++) ++ { ++ temp = y_data[i]; ++ for (j = A_i[i]; j < A_i[i+1]; j++) ++ temp += A_data[j] * x_data[A_j[j]]; ++ y_data[i] = temp; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; i++) ++ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tx_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_j\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..05a3bf842 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3. If not see
++# <http://www.gnu.org/licenses/>.
++
++load_lib gcc-dg.exp
++load_lib target-supports.exp
++
++# Initialize `dg'.
++dg-init
++
++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
++ "" "-fllc-allocate"
++
++# All done.
++dg-finish
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+new file mode 100644
+index 000000000..113acbceb
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+@@ -0,0 +1,36 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c --param=mem-access-ratio=1 --param=mem-access-num=0" } */
++
++/* In this DejaGnu test case, we test how Phases 2 and 3 of the llc-allocate
++   pass deal with an indirect memory access in a nested loop where the use-block
++   for the induction variable of this memory access is a child/descendant of its
++   def-block (we arrange this by defining the induction variable in the outer loop).
++   Therefore, the reference can be successfully traced after outer-loop
++   analysis. */
++#include <stdlib.h>
++#include <time.h>
++
++void cross_bb_indir_mem_acc (int *arr1, int *arr2, int *arr3, int *arr4, int n) {
++ srand (time (NULL));
++
++ int j_s;
++ int j_e = arr1[0];
++ int k;
++
++ for (int i = 0; i < n; i++)
++ {
++ j_s = j_e;
++ j_e = arr1[i + 1];
++
++ k = arr3[i];
++
++ for (int j = j_s; j < j_e; j++)
++ {
++ arr4[j] -= arr2[k];
++ }
++
++ }
++}
++
++/* { dg-final { scan-tree-dump "Unhandled indirect memory access tracing."
"llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "Retrace indirect memory access after outer loop analysis:" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +new file mode 100644 +index 000000000..a2e7f66a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++#include ++#define N 131590 ++#define F 384477 ++ ++int ownStartPtr[F]; ++double bPrimePtr[N]; ++double diagPtr[N]; ++double psiPtr[N]; ++double upperPtr[F]; ++double lowerPtr[F]; ++int uPtr[F]; ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); ++ ++int main(int argc, char *argv[]) ++{ ++ int nCells = N; ++ int nFaces = F; ++ int testIter = 2; ++ for (int i = 0; i < testIter; i++) ++ { ++ SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); ++ } ++ return 0; ++} ++ ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) ++{ ++ double psii; ++ int fStart; ++ int fEnd = ownStartPtr[0]; ++ ++ for (int celli = 0; celli < nCells; celli++) ++ { ++ fStart = fEnd; ++ fEnd = ownStartPtr[celli + 1]; ++ psii = bPrimePtr[celli]; ++ ++ for (int facei = fStart; facei ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++branch_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ ApsiPtr[cell] = 0; ++ else ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 100; ++ ++ for (int i=0; i ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++break_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ break; ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 2; ++ ++ for (int i=0; i ++ ++#define N 131 ++ ++double diagPtr[N]; ++int psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++goto_in_loop (double *diagPtr, int *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cellnodes; ++ while (v > 1) ++ { ++ basic_block bb = di->dfs_to_bb[v]; ++ edge e; ++ ++ par = di->dfs_parent[v]; ++ k = v; ++ ++ ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); ++ ++ if (reverse) ++ { ++ /* If this block has a fake edge to exit, process that first. */ ++ if (bitmap_bit_p (di->fake_exit_edge, bb->index)) ++ { ++ einext = ei; ++ einext.index = 0; ++ goto do_fake_exit_edge; ++ } ++ } ++ ++ /* Search all direct predecessors for the smallest node with a path ++ to them. That way we have the smallest node with also a path to ++ us only over nodes behind us. In effect we search for our ++ semidominator. */ ++ while (!ei_end_p (ei)) ++ { ++ basic_block b; ++ TBB k1; ++ ++ e = ei_edge (ei); ++ b = (reverse) ? 
e->dest : e->src; ++ einext = ei; ++ ei_next (&einext); ++ ++ if (b == en_block) ++ { ++ do_fake_exit_edge: ++ k1 = di->dfs_order[last_basic_block]; ++ } ++ else ++ k1 = di->dfs_order[b->index]; ++ ++ /* Call eval() only if really needed. If k1 is above V in DFS tree, ++ then we know, that eval(k1) == k1 and key[k1] == k1. */ ++ if (k1 > v) ++ k1 = di->key[eval (di, k1)]; ++ if (k1 < k) ++ k = k1; ++ ++ ei = einext; ++ } ++ ++ di->key[v] = k; ++ link_roots (di, par, v); ++ di->next_bucket[v] = di->bucket[k]; ++ di->bucket[k] = v; ++ ++ /* Transform semidominators into dominators. */ ++ for (w = di->bucket[par]; w; w = di->next_bucket[w]) ++ { ++ k = eval (di, w); ++ if (di->key[k] < di->key[w]) ++ di->dom[w] = k; ++ else ++ di->dom[w] = par; ++ } ++ /* We don't need to cleanup next_bucket[]. */ ++ di->bucket[par] = 0; ++ v--; ++ } ++ ++ /* Explicitly define the dominators. */ ++ di->dom[1] = 0; ++ for (v = 2; v <= di->nodes; v++) ++ if (di->dom[v] != di->key[v]) ++ di->dom[v] = di->dom[di->dom[v]]; ++} ++ ++/* { dg-final { scan-tree-dump-times "Warning: Find cycle at bb index" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "static issue" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +new file mode 100644 +index 000000000..e18725f60 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +new file mode 100644 +index 000000000..328dc57bc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +new file mode 100644 +index 000000000..d9c919869 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +new file mode 100644 +index 000000000..806366b5b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +new file mode 100644 +index 000000000..91567d1e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +@@ -0,0 +1,16 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate 
-fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main() ++{ ++ for(int i = 0; i < 100000; i++) ++ { ++ __builtin_prefetch_full(&val[i], 0, 3); ++ val[i] = i + 1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +new file mode 100644 +index 000000000..c28150654 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,4); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +new file mode 100644 +index 000000000..e8d9c8693 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +new file mode 100644 +index 000000000..b0281882f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +new file mode 100644 +index 000000000..26807556f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +new file mode 100644 +index 
000000000..4f2def13d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +new file mode 100644 +index 000000000..ecc501f1f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +new file mode 100644 +index 000000000..d140f1ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +new file mode 100644 +index 000000000..d6f170253 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,3); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +new file mode 100644 +index 000000000..8da092b36 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,4); ++ val[i]=i+1; ++ } 
++} ++ ++/* { dg-final { scan-assembler "PSTL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +new file mode 100644 +index 000000000..4cf65188a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +new file mode 100644 +index 000000000..36f4a3aa0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +new file mode 100644 +index 000000000..43d2d41d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +new file mode 100644 +index 000000000..ba90e7ea4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++#include ++ ++#define N 1000 ++ ++long a[N] = {0}; ++long b[N] = {0}; ++long c[N] = {0}; ++ ++double ++referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) ++{ ++ double sum; ++ for (int cell = 0; cell < nCells; cell++) ++ { ++ // Multi-layer pointer ++ sum += psiPtr[lPtr[cell]]; ++ psiPtr[uPtr[cell]] = sum; ++ ++ // Outer pointer, inner array ++ sum += psiPtr[b[cell]]; ++ psiPtr[a[cell]] = sum; ++ ++ // Multi-layer array ++ sum += a[b[cell]]; ++ c[a[cell]] = sum; ++ ++ // Outer array, inner pointer ++ sum += a[lPtr[cell]]; ++ c[lPtr[cell]] = sum; ++ } ++ return sum; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ double *psiPtr = NULL; ++ int *lPtr = NULL; ++ int *uPtr = NULL; ++ psiPtr = (double *) calloc (N, sizeof(double)); ++ 
lPtr = (int *) calloc (N, sizeof(int)); ++ uPtr = (int *) calloc (N, sizeof(int)); ++ ++ for (int i = 0; i < testIter; i++) ++ referenceTrace (psiPtr, lPtr, uPtr, N); ++ ++ free (psiPtr); ++ free (lPtr); ++ free (uPtr); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +new file mode 100644 +index 000000000..b0f68ebe3 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -0,0 +1,211 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-mode=0" } ++ ++program main ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt ++ ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts ++ ++ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, t0, smdiv ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch,iter ++ ++ LOGICAL :: non_hydrostatic ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*36/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 3 ++ rk_order = 1 ++ dts = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ step = 1 ++ non_hydrostatic = .true. ++ ++ call random_number(random1) ++ interval = random1*100 ++ interval=1 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(alt) ++ call random_number(c2a) ++ call random_number(ph) ++ call random_number(pm1) ++ call random_number(mu) ++ call random_number(muts) ++ call random_number(dnw) ++ call random_number(rdnw) ++ call random_number(znu) ++ ++ do iter=1,2 ++ call calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ enddo ++ ++end program ++ ++ ++SUBROUTINE calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ IMPLICIT NONE ! religion first ++ !asb ++! 
declarations for the stuff coming in ++ ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & ++ p ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & ++ t_2, & ++ t_1, & ++ c2a ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 ++ ++ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & ++ muts ++ ++ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & ++ rdnw, & ++ znu ++ ++ REAL, INTENT(IN ) :: t0, smdiv ++ ++ LOGICAL, INTENT(IN ) :: non_hydrostatic ++ ++! local variables ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ REAL :: ptmp ++ ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = min(kte,kde-1) ++ ++ IF (non_hydrostatic) THEN ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ++! al computation is all dry, so ok with moisture ++ ++ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & ++ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) ++ ++! this is temporally linearized p, no moisture correction needed ++ ++ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ ELSE ! hydrostatic calculation ++ ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ p(i,k,j)=mu(i,j)*znu(k) ++ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) ++ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & ++ +mu(i,j)*alt(i,k,j)) ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ END IF ++ ++! divergence damping setup ++ ++ IF (step == 0) then ! we're initializing small timesteps ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ pm1(i,k,j)=p(i,k,j) ++ ENDDO ++ ENDDO ++ ENDDO ++ ELSE ! we're in the small timesteps ++ DO j=j_start, j_end ! and adding div damping component ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ptmp = p(i,k,j) ++ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) ++ pm1(i,k,j) = ptmp ++ ENDDO ++ ENDDO ++ ENDDO ++ END IF ++ ++END SUBROUTINE calc_p_rho ++ ++! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing succeeded" 46 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 3, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tpm1\\t\\(0.000000, 3, 2, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tph\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tal\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\talt\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..13d225f35 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,29 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++load_lib gfortran-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Main loop. ++gfortran-dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +new file mode 100644 +index 000000000..501e6e74c +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +@@ -0,0 +1,62 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } ++ ++MODULE INPUT ++ IMPLICIT NONE ++ ++ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 ++ ++ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 ++ REAL(wp), DIMENSION(jpi, jpj) :: e12t ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n ++ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta ++ ++END MODULE INPUT ++ ++PROGRAM MAIN ++ USE INPUT ++ ++ IMPLICIT NONE ++ ++ INTEGER :: EPOCH ++ ++! Initialize arrays ++ ++ e12t = 1 ++ fse3t_n = 1 ++ pta = 1 ++! ++ ++ DO EPOCH=1,2 ++ CALL tra_ldf_iso ++ ENDDO ++ ++END PROGRAM MAIN ++ ++SUBROUTINE tra_ldf_iso ++ USE INPUT ++ ++ IMPLICIT NONE ++ ! ++ INTEGER :: ji, jj, jk, jn ! dummy loop indices ++ REAL(wp) :: zbtr, ztra ! - - ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw ++ ++ DO jn = 1, kjpt ++ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 ++ ++ DO jk = 1, jpkm1 ++ DO jj = 2, jpjm1 ++ DO ji = fs_2, fs_jpim1 ! vector opt. ++ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) ++ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr ++ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra ++ END DO ++ END DO ++ END DO ++ ! ++ END DO ++ ! ++END SUBROUTINE tra_ldf_iso ++ ++! 
{ dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +new file mode 100644 +index 000000000..7345759db +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -0,0 +1,58 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } ++ ++Module module_domain ++ IMPLICIT NONE ++ ++ REAL, PARAMETER :: g = 9.8 ++ TYPE :: grid_type ++ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) ++ REAL, POINTER :: fnm(:), fnp(:) ++ END TYPE ++END Module ++ ++SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) ++ ++ USE module_domain ++ !USE module_model_constants ++ ++ IMPLICIT NONE ++ ++ ++ !TYPE (domain), INTENT(IN) :: grid ++ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy ++ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w ++ ++ ++ INTEGER :: k ++ REAL :: z0, z1, z2, w1, w2 ++ REAL, DIMENSION(k_start:k_end) :: z_at_w ++ REAL, DIMENSION(k_start:k_end-1) :: z ++ TYPE (grid_type), POINTER :: grid ++ ++ ++ DO k = k_start, k_end ++ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g ++ END DO ++ ++ DO k = k_start, k_end-1 ++ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) ++ END DO ++ ++ DO k = k_start+1, k_end-1 ++ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & ++ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) ++ END DO ++ ++ z0 = z_at_w(k_start) ++ z1 = z(k_start) ++ z2 = z(k_start+1) ++ w1 = (z0 - z2)/(z1 - z2) ++ w2 = 1. - w1 ++ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & ++ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) ++ ++END SUBROUTINE calc_p8w ++ ++! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +new file mode 100644 +index 000000000..f79df5d26 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +@@ -0,0 +1,320 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=branch-prob-threshold=50 --param=filter-kernels=0 --param=mem-access-num=2 --param=issue-topn=2 --param=force-issue=1 --param=outer-loop-nums=3" } ++!include "module_small_step_em.F90" ++ ++Module add_type ++ IMPLICIT NONE ++ ++ TYPE :: grid_config_rec_type ++ LOGICAL :: open_xs ++ LOGICAL :: open_ys ++ LOGICAL :: open_xe ++ LOGICAL :: open_ye ++ LOGICAL :: symmetric_xs ++ LOGICAL :: symmetric_xe ++ LOGICAL :: symmetric_ys ++ LOGICAL :: symmetric_ye ++ LOGICAL :: polar ++ LOGICAL :: nested ++ LOGICAL :: periodic_x ++ LOGICAL :: specified ++ END TYPE ++END Module ++ ++program main ++ ++ ++! include "module_small_step_em_modify.F90" ++ ++! use module_small_step_em ++! 
use module_small_step_em_modify ++ ++ use add_type ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step, spec_zone ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme, 1:8) :: llcRefresh ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u, v, u_1, v_1, t_1, ww_1, ft!u, v, u_1, v_1, w_1, t_1, ww1, ww_1,ph_1, ft ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_save, v_save, w_save, t_save, ph_save,h_diabatic ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_2, v_2, w_2, t_2, ph_2 ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: c2a, ww_save, cqw, cqu, cqv, alpha, gamma, a ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ww!pb, p, ph, php, pm1, al, alt, ww, random_array ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ru_tend, rv_tend ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t, t_ave, uam, vam, wwam ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_1,mu_2, mu ++ REAL, DIMENSION(ims:ime, jms:jme) :: mub, muu, muv, mut, & ++ msfux, msfuy, & ++ msfvx, msfvx_inv, msfvy, & ++ msftx, msfty ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: muus, muvs, muts, mudf, muave ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_save, mu_tend ++ ++ REAL, DIMENSION(kms:kme) :: rdn, rdnw,dnw, fnm, fnp, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, cf1, cf2, cf3, t0, emdiv, smdiv, epssm, g ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch ++ ++ LOGICAL :: non_hydrostatic, top_lid ++ ++ ++ TYPE (grid_config_rec_type) :: config_flags ++ config_flags%open_xs = .true. ++ config_flags%open_ys = .true. ++ config_flags%open_xe = .true. ++ config_flags%open_ye = .true. ++ config_flags%symmetric_xs = .true. ++ config_flags%symmetric_xe = .true. ++ config_flags%symmetric_ys = .true. ++ config_flags%symmetric_ye = .true. ++ config_flags%polar = .true. ++ config_flags%nested = .true. ++ config_flags%periodic_x = .true. ++ config_flags%specified = .true. ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*98/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 1 ++ rk_order = 1 ++ dts = 1. ++ epssm = 1. ++ g = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ dts = 1. ++ cf1 = 1. ++ cf2 = 1. ++ cf3 = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ emdiv = 1. ++ step = 1 ++ spec_zone = 1 ++ ++ non_hydrostatic = .true. ++ top_lid = .true. 
++ ++ interval=1 ++ ++ ++ total_time=0 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(u) ++ call random_number(v) ++ call random_number(u_1) ++ call random_number(v_1) ++ call random_number(t_1) ++ call random_number(ft) ++ ++ call random_number(ww) ++ call random_number(ww_1) ++ call random_number(t) ++ call random_number(t_ave) ++ call random_number(uam) ++ call random_number(vam) ++ call random_number(wwam) ++ ++ call random_number(muu) ++ call random_number(muv) ++ call random_number(mut) ++ call random_number(msfux) ++ call random_number(msfuy) ++ call random_number(msfvx) ++ call random_number(msfvx_inv) ++ call random_number(msfvy) ++ call random_number(msftx) ++ call random_number(msfty) ++ call random_number(mu_tend) ++ ++ call random_number(muave) ++ call random_number(muts) ++ call random_number(mudf) ++ call random_number(mu) ++ ++ call random_number(fnm) ++ call random_number(fnp) ++ call random_number(dnw) ++ call random_number(rdnw) ++ ++ DO j=jms, jme ++ DO k=kms, kme ++ DO i=ims, ime ++ ++ llcRefresh(i,k,j,1)=i+k+j+7 ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ do epoch = 1,2 ++ call advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ enddo ++end program ++ ++ ++ ++SUBROUTINE advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ use add_type ++ ++ IMPLICIT NONE ! religion first ++ ++ ! 
stuff coming in ++ ++ TYPE(grid_config_rec_type), INTENT(IN ) :: config_flags ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(IN ) :: & ++ u, & ++ v, & ++ u_1, & ++ v_1, & ++ t_1, & ++ ft ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(INOUT) :: & ++ ww, & ++ ww_1, & ++ t, & ++ t_ave, & ++ uam, & ++ vam, & ++ wwam ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(IN ) :: muu, & ++ muv, & ++ mut, & ++ msfux,& ++ msfuy,& ++ msfvx,& ++ msfvx_inv,& ++ msfvy,& ++ msftx,& ++ msfty,& ++ mu_tend ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT( INOUT) :: muave, & ++ muts, & ++ mudf ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(INOUT) :: mu ++ ++ REAL, DIMENSION( kms:kme ), INTENT(IN ) :: fnm, & ++ fnp, & ++ dnw, & ++ rdnw ++ ++ ++ REAL, INTENT(IN ) :: rdx, & ++ rdy, & ++ dts, & ++ epssm ++ ++ REAL, DIMENSION (its:ite, kts:kte) :: wdtn, dvdxi ++ REAL, DIMENSION (its:ite) :: dmdt ++ ++ INTEGER :: i,j,k, i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ REAL :: acc ++ ++ INTEGER :: ubv, lbv, t1, t2, t3, t4, ceild, floord ++ ++ ceild(t1, t2) = ceiling(REAL(t1)/REAL(t2)) ++ floord(t1, t2) = floor(REAL(t1)/REAL(t2)) ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = kte-1 ++ IF ( .NOT. config_flags%periodic_x )THEN ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ i_start = max(its,ids+1) ++ i_end = min(ite,ide-2) ++ ENDIF ++ ENDIF ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ j_start = max(jts,jds+1) ++ j_end = min(jte,jde-2) ++ ENDIF ++ ++ i_endu = ite ++ j_endv = jte ++ ++ DO j = j_start, j_end ++ ++ DO i=i_start, i_end ++ dmdt(i) = 0. ++ ENDDO ++ ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ dvdxi(i,k) = msftx(i,j)*msfty(i,j)*( & ++ rdy*((v(i,k,j+1)+muv(i,j+1)*v_1(i,k,j+1)*msfvx_inv(i,j+1)) & ++ -(v(i,k,j )+muv(i,j )*v_1(i,k,j)*msfvx_inv(i,j ))) & ++ +rdx*((u(i+1,k,j)+muu(i+1,j)*u_1(i+1,k,j)/msfuy(i+1,j)) & ++ -(u(i,k,j )+muu(i ,j)*u_1(i,k,j )/msfuy(i,j)) )) ++ dmdt(i) = dmdt(i) + dnw(k)*dvdxi(i,k) ++ ENDDO ++ ENDDO ++ DO i=i_start, i_end ++ muave(i,j) = mu(i,j) ++ mu(i,j) = mu(i,j)+dts*(dmdt(i)+mu_tend(i,j)) ++ mudf(i,j) = (dmdt(i)+mu_tend(i,j)) ! save tendency for div dampfilter ++ muts(i,j) = mut(i,j)+mu(i,j) ++ muave(i,j) =.5*((1.+epssm)*mu(i,j)+(1.-epssm)*muave(i,j)) ++ ENDDO ++ ENDDO ++END SUBROUTINE advance_mu_t_fortran_plu ++ ++! { dg-final { scan-tree-dump "issue_llc_hint" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "analyze_nested_kernels" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump "Stop tracing the outer loop depth" "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 36c3e7d5a..14129a500 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -84,6 +84,7 @@ DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") + DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") + DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") ++DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") + DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream decompression") + DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream compression") +@@ -215,6 +216,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") + DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") + DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") + DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") ++DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") + DEFTIMEVAR (TV_PREDCOM , "predictive commoning") + DEFTIMEVAR (TV_TREE_CH , "tree copy headers") + DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") +diff --git a/gcc/toplev.cc b/gcc/toplev.cc +index f00a166df..bdbd4de63 100644 +--- a/gcc/toplev.cc ++++ b/gcc/toplev.cc +@@ -567,6 +567,12 @@ compile_file (void) + targetm.asm_out.output_ident (ident_str); + } + ++ /* Extend auto profile finalization. */ ++ if (flag_ipa_extend_auto_profile) ++ { ++ free_extend_profile_info (); ++ } ++ + /* Auto profile finalization. */ + if (flag_auto_profile) + end_auto_profile (); +diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc +index d33aaec8c..40f67a8ed 100644 +--- a/gcc/tree-cfg.cc ++++ b/gcc/tree-cfg.cc +@@ -8476,6 +8476,17 @@ print_loops (FILE *file, int verbosity) + print_loop_and_siblings (file, bb->loop_father, 0, verbosity); + } + ++/* Dump a loop to file. */ ++ ++void ++loop_dump (FILE *file, class loop *loop) ++{ ++ print_loop (file, loop, 0, 0); ++ fprintf (file, "vec_niter = "); ++ print_generic_expr (file, loop->vec_nb_iterations); ++ fprintf (file, "\n"); ++} ++ + /* Dump a loop. 
*/ + + DEBUG_FUNCTION void +diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h +index bfe44c073..0982fa7cf 100644 +--- a/gcc/tree-cfg.h ++++ b/gcc/tree-cfg.h +@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); + extern void debug_function (tree, dump_flags_t); + extern void print_loops_bb (FILE *, basic_block, int, int); + extern void print_loops (FILE *, int); ++extern void loop_dump (FILE *file, class loop *loop); + extern void debug (class loop &ref); + extern void debug (class loop *ptr); + extern void debug_verbose (class loop &ref); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index a98f84397..468353d13 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -395,6 +395,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); +@@ -536,6 +537,8 @@ extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * + ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context ++ *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); +diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc +index 44157265c..4c014fb23 100644 +--- a/gcc/tree-scalar-evolution.cc ++++ b/gcc/tree-scalar-evolution.cc +@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) + the loop body has been executed 6 times. */ + + tree +-number_of_latch_executions (class loop *loop) ++number_of_latch_executions (class loop *loop, bool guarantee) + { + edge exit; + class tree_niter_desc niter_desc; +@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) + res = chrec_dont_know; + exit = single_exit (loop); + +- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) ++ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, ++ true, NULL, guarantee)) + { + may_be_zero = niter_desc.may_be_zero; + res = niter_desc.niter; +@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) + fprintf (dump_file, "))\n"); + } + +- loop->nb_iterations = res; ++ if (guarantee) ++ loop->nb_iterations = res; + return res; + } + +diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h +index 0f90207bc..dc27d9545 100644 +--- a/gcc/tree-scalar-evolution.h ++++ b/gcc/tree-scalar-evolution.h +@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. 
If not see + #ifndef GCC_TREE_SCALAR_EVOLUTION_H + #define GCC_TREE_SCALAR_EVOLUTION_H + +-extern tree number_of_latch_executions (class loop *); ++extern tree number_of_latch_executions (class loop *, ++ bool guarantee = true); + extern gcond *get_loop_exit_condition (const class loop *); + + extern void scev_initialize (void); +diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc +new file mode 100644 +index 000000000..da6d72b94 +--- /dev/null ++++ b/gcc/tree-ssa-llc-allocate.cc +@@ -0,0 +1,4150 @@ ++/* LLC allocate. ++ Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_VECTOR ++#define INCLUDE_LIST ++#define INCLUDE_ALGORITHM ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "gimple.h" ++#include "predict.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "optabs-query.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "stor-layout.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "gimplify-me.h" ++#include "tree-ssa-loop-ivopts.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop-niter.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfgloop.h" ++#include "tree-scalar-evolution.h" ++#include "langhooks.h" ++#include "tree-inline.h" ++#include "tree-data-ref.h" ++#include "diagnostic-core.h" ++#include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "internal-fn.h" ++#include "tree-cfg.h" ++#include "profile-count.h" ++#include "auto-profile.h" ++ ++/* Number of parallel cores. */ ++const unsigned int PARALLEL_NUM = 304; ++ ++/* Indirect access weight. */ ++const unsigned int INDIRECT_ACCESS_VALUE = 3; ++ ++/* Write memory weight. */ ++const unsigned int WRITE_COST = 4; ++ ++/* Maximum ratio of total prefetch data size to cache size. */ ++const double PREFETCH_CACHE_SIZE_RATIO = 0.8; ++ ++/* Prefetch tool input max length. */ ++#ifndef PREFETCH_TOOL_INPUT_MAX_LEN ++#define PREFETCH_TOOL_INPUT_MAX_LEN 512 ++#endif ++ ++/* Prefetch tool number max length. */ ++#ifndef PREFETCH_TOOL_NUM_MAX_LEN ++#define PREFETCH_TOOL_NUM_MAX_LEN 9 ++#endif ++ ++#ifndef PREFETCH_FUNC_TOPN ++#define PREFETCH_FUNC_TOPN param_llc_allocate_func_topn ++#endif ++ ++namespace { ++ ++/* loop bound info of the memory reference located. */ ++struct loop_bound ++{ ++ /* iv tree_node. */ ++ tree iv; ++ ++ /* define stmt of iv. */ ++ gimple *def_stmt; ++ ++ /* loop where stmt is located. */ ++ class loop *loop; ++ ++ /* loop unroll factor. */ ++ unsigned int unroll; ++ ++ /* Number of iterations of loop. 
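++    Initialized to chrec_dont_know by the constructor and filled in later
++    during dimension tracing.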
*/ ++ tree niters; ++ ++ loop_bound (tree t, gimple *stmt) ++ { ++ iv = t; ++ def_stmt = stmt; ++ loop = loop_containing_stmt (stmt); ++ unroll = 1; ++ niters = chrec_dont_know; ++ } ++}; ++ ++/* method of calculating the data size. */ ++ ++enum calc_type ++{ ++ UNHANDLE_CALC = 0, ++ RUNTIME_CALC, ++ STATIC_CALC ++}; ++ ++/* Describes a info of a memory reference. */ ++ ++struct data_ref ++{ ++ /* The memory reference. */ ++ tree ref; ++ ++ /* Statement where the ref is located. */ ++ gimple *stmt; ++ ++ /* var_decl or param_decl, used for the ref_group. */ ++ tree var; ++ ++ /* Base of the reference. */ ++ tree base; ++ ++ /* Constant offset of the reference. */ ++ tree offset; ++ ++ /* index of the reference. */ ++ tree index; ++ ++ /* Constant step of the reference. */ ++ tree step; ++ ++ /* loop boundary info of each dimension. */ ++ std::vector loop_bounds; ++ ++ /* memory data size, Unit: MB. */ ++ double data_size; ++ ++ /* method of calculating the data size. */ ++ calc_type calc_by; ++ ++ /* True if the info of ref is traced, and then record it. */ ++ unsigned int trace_status_p : 1; ++ ++ /* True if the loop is vectorized. */ ++ unsigned int vectorize_p : 1; ++ ++ /* True if the memory reference is shared. */ ++ unsigned int parallel_p : 1; ++ ++ /* True if the memory reference is regular. */ ++ unsigned int regular_p : 1; ++ ++ /* True if the memory reference is read. */ ++ unsigned int read_p : 1; ++ ++ /* loop father depth. */ ++ unsigned int loop_depth; ++ ++ /* bb index. */ ++ int bb_idx; ++ ++ /* loop index. */ ++ int loop_idx; ++ ++ data_ref () ++ { ++ ref = NULL_TREE; ++ stmt = NULL; ++ var = NULL_TREE; ++ base = NULL_TREE; ++ offset = NULL_TREE; ++ index = NULL_TREE; ++ step = NULL_TREE; ++ data_size = 0; ++ calc_by = UNHANDLE_CALC; ++ trace_status_p = false; ++ vectorize_p = false; ++ parallel_p = false; ++ regular_p = true; ++ read_p = true; ++ loop_depth = 0; ++ bb_idx = 0; ++ loop_idx = 0; ++ } ++}; ++ ++/* ================ phase 1 get_dense_memory_kernels ================ */ ++ ++/* Add ref node and print. */ ++ ++void ++add_ref (std::vector &references, tree op, gimple *stmt, ++ bool vectorize_p, bool read_p) ++{ ++ data_ref ref; ++ ref.ref = op; ++ ref.stmt = stmt; ++ ref.vectorize_p = vectorize_p; ++ ref.read_p = read_p; ++ ref.loop_depth = loop_depth (stmt->bb->loop_father); ++ ref.bb_idx = stmt->bb->index; ++ ref.loop_idx = stmt->bb->loop_father->num; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ references.push_back (ref); ++} ++ ++/* Get the references from the simple call (vectorization type). */ ++ ++void ++get_references_in_gimple_call (gimple *stmt, std::vector &references) ++{ ++ if (gimple_code (stmt) != GIMPLE_CALL) ++ return; ++ ++ if (gimple_call_internal_p (stmt)) ++ { ++ bool read_p = false; ++ switch (gimple_call_internal_fn (stmt)) ++ { ++ case IFN_MASK_GATHER_LOAD: ++ case IFN_MASK_LOAD: ++ { ++ if (gimple_call_lhs (stmt) == NULL_TREE) ++ return; ++ read_p = true; ++ // FALLTHRU ++ } ++ case IFN_MASK_STORE: ++ { ++ /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); ++ ++ _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); ++ ++ _1 = (sizetype) a_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, ++ { 0.0, ... 
}, loop_mask_5); ++ */ ++ tree op1 = gimple_call_arg (stmt, 0); ++ if (TREE_CODE (op1) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "get_references_in_gimple_call: "); ++ fprintf (dump_file, "find base that not ssa_name: "); ++ print_generic_expr (dump_file, op1, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ gimple *op1_def = SSA_NAME_DEF_STMT (op1); ++ if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN) ++ { ++ /* &MEM[base: xx] */ ++ tree rhs1 = gimple_assign_rhs1 (op1_def); ++ /* If the definition stmt of the operation is memory ++ reference type, read it directly. */ ++ if (TREE_CODE (rhs1) == ADDR_EXPR ++ && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF) ++ op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */ ++ } ++ ++ add_ref (references, op1, stmt, true, read_p); ++ return; ++ } ++ default: ++ return; ++ } ++ } ++} ++ ++/* Check whether memory reference is located exactly in main function. ++ There are some other unexpected scenarios where mem ref or function is ++ tracing failed without loc info (newly generated gimple/function). */ ++ ++bool ++is_reference_in_main_p (gimple *stmt) ++{ ++ expanded_location xloc = expand_location (stmt->location); ++ if (DECL_NAME (cfun->decl) && MAIN_NAME_P (DECL_NAME (cfun->decl))) ++ { ++ /* NEXT STEP: Check why some functions have no end_locus. */ ++ if (!(DECL_SOURCE_LOCATION (current_function_decl) ++ && cfun->function_end_locus)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Cannot find function start-end location.\n"); ++ return true; ++ } ++ else if (!(xloc.file && xloc.line)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Cannot find gimple statement location.\n"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ return false; ++ } ++ int fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = expand_location (cfun->function_end_locus).line; ++ ++ if (xloc.line >= fn_start && xloc.line <= fn_end) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Memory access in main function: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Stores the locations of memory references in STMT to REFERENCES. */ ++ ++void ++get_references_in_stmt (gimple *stmt, std::vector &references) ++{ ++ if (!gimple_vuse (stmt)) ++ return; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "gimple_vuse: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ ++ /* Filter out memory references located in main function. This is a ++ experimental filtering scheme ONLY for HPC case verification as ++ some HPC cases assign values for variables (mem ref) in main function. 
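++     Such statements are skipped below, so no data_ref is recorded for them.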
*/ ++ if (is_reference_in_main_p (stmt)) ++ return; ++ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ tree op0 = gimple_assign_lhs (stmt); ++ tree op1 = gimple_assign_rhs1 (stmt); ++ tree base = NULL_TREE; ++ ++ /* _1 = MEM[base: a, index: i, step: 8, offset: 0B]; */ ++ if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1)) ++ && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base)) ++ add_ref (references, op1, stmt, false, true); ++ ++ if (REFERENCE_CLASS_P (op0) && get_base_address (op0)) ++ add_ref (references, op0, stmt, false, false); ++ } ++ else if (gimple_code (stmt) == GIMPLE_CALL) ++ get_references_in_gimple_call (stmt, references); ++ ++ return; ++} ++ ++/* flag of loop filter out. */ ++ ++struct loop_filter_out_flag ++{ ++ /* Use external call. */ ++ bool use_ext_call; ++ ++ /* Use external node. */ ++ bool use_ext_node; ++ ++ /* Use loop defined in macros. */ ++ bool use_macro_loop; ++ ++ /* Use external node. */ ++ bool use_cond_func; ++}; ++ ++/* Check whether an external node is used. */ ++ ++bool use_ext_node_p (const std::vector &references, ++ unsigned int &start) ++{ ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ ++ unsigned i = start; ++ start = references.size (); ++ for (; i < references.size (); i++) ++ { ++ data_ref ref = references[i]; ++ expanded_location xloc = expand_location (ref.stmt->location); ++ if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "use_ext_node\n\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Determine whether to filter out loops by stmt. */ ++ ++bool ++filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, ++ const std::vector &references, ++ unsigned int &start) ++{ ++ expanded_location xloc = expand_location (stmt->location); ++ /* check use_ext_call. */ ++ if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_ext_call: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_ext_call = true; ++ return true; ++ } ++ ++ /* check use_macro_loop. */ ++ if (xloc.file && xloc.column != 1) ++ loop_filter.use_macro_loop = false; ++ ++ /* check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ enum tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR ++ || rhs_code == MAX_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_cond_func: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_cond_func = true; ++ return true; ++ } ++ } ++ ++ /* check use_ext_node. */ ++ if (use_ext_node_p (references, start)) ++ { ++ loop_filter.use_ext_node = true; ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Dump the flag type of the loop is filtered out. 
*/ ++ ++void ++dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) ++{ ++ if (loop_filter.use_ext_call) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_call\n"); ++ } ++ ++ if (loop_filter.use_ext_node) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_node\n"); ++ } ++ ++ if (loop_filter.use_macro_loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); ++ } ++ ++ if (loop_filter.use_cond_func) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_cond_func\n"); ++ } ++} ++ ++/* Get references in loop. */ ++ ++bool ++get_references_in_loop (std::vector &references, ++ loop_filter_out_flag &loop_filter, ++ class loop *loop) ++{ ++ unsigned int start = 0; ++ bool filter_out_loop = true; ++ ++ /* Analyze each bb in the loop. */ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ gimple_stmt_iterator bsi; ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ gimple *stmt = gsi_stmt (bsi); ++ get_references_in_stmt (stmt, references); ++ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, ++ references, start); ++ if (filter_out_loop) ++ break; ++ } ++ if (filter_out_loop) ++ break; ++ } ++ free (body); ++ return !filter_out_loop; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. ++ Assume that the HPC data reading and calculation process does not involve ++ adding branches in loops. Therefore, all bbs of loops are directly used for ++ calculation (excluding embedded loops) without considering branch weighting. ++*/ ++ ++unsigned ++estimate_loop_insns (class loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body (loop); ++ gimple_stmt_iterator gsi; ++ unsigned size = 0, i; ++ ++ for (i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ size += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ free (body); ++ ++ return size; ++} ++ ++/* Check whether the memory access is dense. */ ++ ++bool ++dense_memory_p (const std::vector &references, class loop *loop) ++{ ++ int ref_count = references.size (); ++ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); ++ float mem_to_insn_ratio = (float)ref_count / (float)ninsns; ++ ++ /* The number of cores to be run and DDR bandwidth information can be ++ transferred to flexibly adjust the threshold. */ ++ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) ++ && ref_count >= param_mem_access_num); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ ++ /* Dump dense memory source code location. 
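++	 The location is printed as [file:function(start-end):line:column]
++	 ahead of the density statistics.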
*/ ++ if (ref_count && references[0].stmt->location) ++ { ++ expanded_location xloc = expand_location ++ (references[0].stmt->location); ++ int fn_start = 0; ++ if (DECL_SOURCE_LOCATION (current_function_decl)) ++ fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = fn_start; ++ if (cfun->function_end_locus) ++ fn_end = expand_location (cfun->function_end_locus).line; ++ if (xloc.file) ++ fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ", ++ xloc.file, fn_name, fn_start, fn_end, ++ xloc.line, xloc.column); ++ } ++ ++ /* Dump memory dense information. */ ++ if (dense_mem) ++ fprintf (dump_file, "dense memory access: "); ++ else ++ fprintf (dump_file, "non-dense mem access: "); ++ fprintf (dump_file, ++ "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n", ++ ref_count, ninsns, mem_to_insn_ratio); ++ } ++ ++ return dense_mem; ++} ++ ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ ++void ++analyze_loop_dense_memory (std::vector &kernels, ++ std::map > &kernels_refs, ++ class loop *loop) ++{ ++ std::vector references; ++ number_of_latch_executions (loop); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n========== Processing loop %d: ==========\n", ++ loop->num); ++ loop_dump (dump_file, loop); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "loop unroll: %d\n", loop->unroll); ++ } ++ ++ if (get_loop_exit_edges (loop).length () != 1) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: loop_multiple_exits\n"); ++ return; ++ } ++ ++ loop_filter_out_flag loop_filter = {false, false, true, false}; ++ ++ if (!get_references_in_loop (references, loop_filter, loop)) ++ { ++ dump_loop_filter_out_flag (loop_filter); ++ return; ++ } ++ ++ if (dense_memory_p (references, loop)) ++ { ++ kernels_refs[loop] = references; ++ kernels.push_back (loop); ++ } ++} ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ ++bool ++get_dense_memory_kernels (std::vector &kernels, ++ std::map > &kernels_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); ++ for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) ++ analyze_loop_dense_memory (kernels, kernels_refs, loop); ++ return kernels.size () > 0; ++} ++ ++/* ================ phase 2 trace_data_refs_info ================ */ ++ ++/* Determine whether the declaration is a non-vectorized. */ ++ ++bool ++generic_decl_p (tree expr) ++{ ++ if (expr == NULL_TREE) ++ return false; ++ enum tree_code expr_code = TREE_CODE (expr); ++ if (expr_code != VAR_DECL && expr_code != PARM_DECL ++ && expr_code != COMPONENT_REF) ++ return false; ++ return true; ++} ++ ++/* Initial worklist preparation for source variable tracing. ++ Add different initial node based on different gimple statements. 
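++   For a PHI every argument is queued; for PLUS_EXPR/MINUS_EXPR both operands
++   are queued; for single-operand codes (SSA_NAME, NOP_EXPR, NEGATE_EXPR,
++   POINTER_PLUS_EXPR, COMPONENT_REF) only the first operand is queued.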
*/ ++ ++void ++add_worklist (std::vector &worklist, std::set &walked, ++ gimple *def_stmt) ++{ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) ++ { ++ tree node = gimple_phi_arg_def (def_stmt, i); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR ++ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME ++ || rhs_code == COMPONENT_REF) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ node = gimple_assign_rhs2 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == TARGET_MEM_REF || rhs_code == MEM_REF) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "possibly unnested indirect memory access: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ else ++ { ++ /* unhandled assign rhs_code: _219 = _17 * _70; ++ _17 = *grid_56(D).sst.span; ++ _70 = *grid_56(D).sst.dim[0].stride; ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled assign rhs_code: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unsupported tracing stmt: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++ ++/* Tracing source variables: ++ vectp.1 = a_2(D) + _3; ++ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; ++ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); ++ ++ _1 = (sizetype) b_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, ++ loop_mask_5); ++ ... ++ Due to previous pass optimizations, the current tracing method can find ++ several source variable candidates. We decide to record them in a map and ++ later filter out the true base variable by some criteria. ++*/ ++ ++void ++trace_base_var_helper (tree arg, std::set &walked, ++ std::map& base_var_candid, bool is_vect_type) ++{ ++ if (arg == NULL) ++ return; ++ ++ /* Var_decl type: base address extracted from ARRAY_REF. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == ARRAY_TYPE && TREE_CODE (arg) == VAR_DECL ++ && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "var_decl type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* Array type. */ ++ tree op0 = NULL; ++ if (TREE_CODE (arg) == ADDR_EXPR ++ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "array type\n"); ++ base_var_candid[op0] += 1; ++ return; ++ } ++ ++ /* Pointer type. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "pointer type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* SSA_NAME type. 
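++     Non-vectorized pointer-typed SSA names that still carry their variable
++     are recorded directly; otherwise the defining statement is traced
++     recursively through the worklist.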
*/ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return; ++ ++ tree tmp_var = SSA_NAME_VAR (arg); ++ if (tmp_var && !is_vect_type && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ssa pointer type\n"); ++ base_var_candid[tmp_var] += 1; ++ return; ++ } ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ if (!walked.count (tmp_var)) ++ walked.insert (tmp_var); ++ trace_base_var_helper (tmp_var, walked, base_var_candid, is_vect_type); ++ } ++ else ++ { ++ std::vector worklist; ++ add_worklist (worklist, walked, def_stmt); ++ for (unsigned i = 0; i < worklist.size (); ++i) ++ trace_base_var_helper (worklist[i], walked, base_var_candid, is_vect_type); ++ } ++} ++ ++/* Identify the base variable traced from base address of memory reference. ++ We recognize that current method could detect several base variable ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1) The number of base variable candidates is 1; ++ 2) The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ ++ ++bool ++trace_base_var (data_ref &mem_ref, std::set &walked) ++{ ++ tree &var = mem_ref.var; ++ tree arg = mem_ref.base; ++ std::map base_var_candid; ++ bool is_vect_type = TREE_CODE (TREE_TYPE (mem_ref.ref)) == VECTOR_TYPE; ++ trace_base_var_helper (arg, walked, base_var_candid, is_vect_type); ++ bool is_tracing_unusual = false; ++ if (base_var_candid.size () == 1) ++ var = base_var_candid.begin ()->first; ++ else ++ { ++ is_tracing_unusual = true; ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ var = it->second == 1 ? it->first : var; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Traced variables at "); ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, ":\n"); ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ fprintf (dump_file, "%s:%d, ", get_name (it->first), it->second); ++ fprintf (dump_file, "\n"); ++ ++ if (var == NULL_TREE) ++ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); ++ else if (is_tracing_unusual && var != NULL_TREE) ++ fprintf (dump_file, "Tracing unusual number or occurrences of base " ++ "variables. Choose %s.\n", ++ get_name (var)); ++ } ++ return var != NULL_TREE; ++} ++ ++/* Recursively trace and check whether the definition stmt of the ++ index operand is a recorded stmt in direct access tracing. ++ Return 0 if ref is a direct access a[]. ++ Return 1 if ref is a non-nested indirect access a[b[]]. ++ Return 2 if ref is a complex indirect memory access, such as a[f(b[])]. */ ++ ++int ++trace_indirect_operand (tree arg, std::set &traced_ref_stmt) ++{ ++ /* Return 0 if tree `arg` is not an SSA for further tracing. */ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return 0; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ /* Return 1 if `index` has been detected as a traced direct memory access ++ before. 
*/ ++ if (traced_ref_stmt.count (def_stmt)) ++ return 1; ++ ++ /* Return 0 if def stmt of `arg` is not in gimple assign type. Stop tracing ++ index operand and currently no memory access operand is detected. */ ++ if (!def_stmt || !is_gimple_assign (def_stmt)) ++ return 0; ++ ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array ++ type indirect memory access. */ ++ if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR ++ && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR) ++ { ++ /* Return 2 if tree code has any type representing references to storge, ++ implying a complex indirect memory access scenario for future ++ analysis. */ ++ if (rhs_code == MEM_REF || rhs_code == TARGET_MEM_REF ++ || rhs_code == ARRAY_REF || rhs_code == ARRAY_RANGE_REF ++ || rhs_code == COMPONENT_REF || rhs_code == ADDR_EXPR ++ || rhs_code == INDIRECT_REF) ++ return 2; ++ ++ /* Return 0 and stop tracing if tree code is not a common tracing ++ operand, but still reflected as a non-reference type. ++ Caveats: if we never deal with this tree code before, maybe it is ++ more suitable to treat this scenario strictly. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unknown tracing tree code: %s\n", ++ get_tree_code_name (rhs_code)); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return 0; ++ } ++ ++ tree op = NULL_TREE; ++ ssa_op_iter iter; ++ FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) ++ { ++ int trace_indir_p = trace_indirect_operand (op, traced_ref_stmt); ++ if (trace_indir_p != 0) ++ return trace_indir_p; ++ } ++ return 0; ++} ++ ++/* Trace the pointer of the direct/indirect memory access: ++ 1) Obtain the base address of the memory access. ++ 2) If index variable is formed by another memory access operation (i.e., an ++ indication of indirect memory access), ensure that the index has been ++ traced in an already discovered direct memory access. ++ 3) Otherwise, the memory access is in a more complex scenario and we need to ++ postpone the analysis later. For example, the indirect memory access is ++ nested, a[b[c[...]]], or the index variable (formed in another memory ++ access) has not been recorded/traced yet. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (long unsigned int) _1; ++ _5 = _4 * 8; ++ _6 = p(D) + _5; // get base ++ _7 = *_6; // start tracing ++*/ ++ ++bool ++trace_ptr_mem_ref (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ /* Simple scenario: ++ _2208 = np.120_2207 * 8; ++ _1921 = sorted_weight$data_381 + _2208; ++ *_1921 = _2206; ++ ++ Complex scenario: ++ MEM[base: _3235, index: ivtmp.2768_3189, step: 4, offset: 0B] = _105; ++ _3236 = (sizetype) _214; ++ _3237 = _3236 * 4; ++ _3238 = _857 + _3237; // base + index * step ++ _3239 = _3238 + 4; // offset ++ MEM[base: _3239, index: ivtmp.2768_3189, step: 4, offset: 0B] = 0.0; ++ */ ++ tree pointer = TREE_OPERAND (mem_ref.ref, 0); ++ tree offset = TREE_OPERAND (mem_ref.ref, 1); ++ if (TREE_CODE (offset) != INTEGER_CST) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for non-constant offset.\n"); ++ ++ return false; ++ } ++ if (TREE_CODE (pointer) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for non-ssa pointer.\n"); ++ ++ return false; ++ } ++ ++ /* Tracing back base address from SSA. 
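++     The pointer must be defined by a POINTER_PLUS_EXPR of the form
++     base + index * step; any other defining statement stops the trace.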
*/ ++ gimple *ptr_def_stmt = SSA_NAME_DEF_STMT (pointer); ++ if (ptr_def_stmt == NULL || gimple_code (ptr_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (ptr_def_stmt) != POINTER_PLUS_EXPR) ++ return false; ++ tree base = gimple_assign_rhs1 (ptr_def_stmt); ++ /* index_offset = index * step. */ ++ tree index_offset = gimple_assign_rhs2 (ptr_def_stmt); ++ ++ /* Tracing back index from SSA. */ ++ if (TREE_CODE (index_offset) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (TREE_CODE (index_offset) == INTEGER_CST) ++ fprintf (dump_file, "Constant index for memory access.\n"); ++ else ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ } ++ return false; ++ } ++ ++ gimple *idx_def_stmt = SSA_NAME_DEF_STMT (index_offset); ++ if (idx_def_stmt == NULL || gimple_code (idx_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (idx_def_stmt) != MULT_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ return false; ++ } ++ ++ /* Split array index from total offset of index, `index * step`. */ ++ mem_ref.base = base; ++ mem_ref.offset = offset; ++ mem_ref.index = gimple_assign_rhs1 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs2 (idx_def_stmt); ++ if (TREE_CODE (gimple_assign_rhs1 (idx_def_stmt)) == INTEGER_CST) ++ { ++ mem_ref.index = gimple_assign_rhs2 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs1 (idx_def_stmt); ++ } ++ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ } ++ else if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ else ++ { ++ /* Record indirect memory access with complex scenarios for future ++ analysis. */ ++ unresolved_refs.push_back (mem_ref); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled indirect memory access tracing.\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Tracing direct memory reference information. */ ++ ++bool ++trace_direct_mem_ref (data_ref &mem_ref) ++{ ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF: ++ address = base + index * step + offset. ++ MASK_LOAD example: ++ _43 = &MEM[base: _42, index: ivtmp_140, step: 8, offset: 0B]; ++ vect__42.11_160 = .MASK_LOAD (_43, 64B, loop_mask_163); ++ ++ In some cases (2D-array or complex-index 1D array), mem_ref's `base` ++ may actually represent `base + index * step` when `base` address updates ++ by a PHI operation, e.g., ++ MEM[base: _51, offset: 0B] ++ _51 = (void *) ivtmp.18_11; ++ ivtmp.18_11 = PHI ++ ivtmp.18_43 = ivtmp.18_11 + 16; ++ ivtmp.18_52 = (unsigned long) _10; ++ _10 = arr2D_29(D) + _9; ++ */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ ++ return true; ++} ++ ++/* Tracing vectorized indirect memory reference information. 
++ MASK_GATHER_LOAD example: ++ vect__45.13_146 = .MASK_LOAD (_41, 32B, loop_mask_153); ++ vect__46.14_145 = (vector([2,2]) long unsigned int) vect__45.13_146; ++ vect_patt_163.15_143 = .MASK_GATHER_LOAD (_144, vect__46.14_145, 8, ++ { 0.0, ... }, loop_mask_153); */ ++ ++bool ++trace_indirect_mem_ref_vectorized (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* Processing of vectorization types. */ ++ if (mem_ref.vectorize_p) ++ { ++ tree op = gimple_call_arg (mem_ref.stmt, 1); ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ { ++ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.index = gimple_call_arg (mem_ref.stmt, 1); ++ mem_ref.step = gimple_call_arg (mem_ref.stmt, 2); ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Trace the array of the indirect memory access: ++ 1) Obtain the base address of the indirect memory access. ++ 2) Ensure that the index has been traced in the direct memory access. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (data_ref &mem_ref, std::set &traced_ref_stmt) ++{ ++ tree base = TREE_OPERAND (mem_ref.ref, 0); ++ tree index = TREE_OPERAND (mem_ref.ref, 1); ++ if (trace_indirect_operand (index, traced_ref_stmt)) ++ { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. */ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Trace memory references base info: ++ 1) Memory access rule analysis and reference info tracing ++ 2) Source variable tracing, along base address of memory reference ++ We will extend parallel analysis later. ++*/ ++ ++void ++trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ enum tree_code ref_code = TREE_CODE (mem_ref.ref); ++ /* 1) Direct and indirect access traces. */ ++ switch (ref_code) ++ { ++ case MEM_REF: ++ /* Non-vectorized direct/indirect access by pointer. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "MEM_REF\n"); ++ if (!trace_ptr_mem_ref (mem_ref, traced_ref_stmt, unresolved_refs)) ++ return; ++ break; ++ case TARGET_MEM_REF: ++ /* Vectorized and non-vectorized direct access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "TARGET_MEM_REF\n"); ++ if (!trace_direct_mem_ref (mem_ref)) ++ return; ++ break; ++ case SSA_NAME: ++ /* Vectorized indirect memory access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "SSA_NAME\n"); ++ if (!trace_indirect_mem_ref_vectorized (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ case ARRAY_REF: ++ /* Non-vectorized indirect memory access. 
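++	 e.g. _6 = p[_5], where the index _5 is derived from an already
++	 traced load.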
*/ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ARRAY_REF\n"); ++ if (!trace_indirect_array (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ default: ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref is another tree-code: "); ++ fprintf (dump_file, "stmt: "); ++ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "ref: "); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (mem_ref.regular_p) ++ traced_ref_stmt.insert (mem_ref.stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Trace all references in the loop. */ ++ ++void ++trace_loop_refs_info (std::vector &refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (refs[i], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++void ++trace_data_refs_info (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ trace_loop_refs_info (loop_refs[loop], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Retrace references base info for complex scenarios in indirect memory access ++ after Phase 3. */ ++ ++void ++retrace_ref_info_unresolved (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* 1) Indirect access traces. */ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Retrace all unresolved references. 
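++   Called after the outer-loop analysis, giving complex indirect accesses
++   that were postponed earlier another chance to resolve against the
++   statements recorded in traced_ref_stmt.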
*/ ++ ++void ++retrace_loop_refs_info_unresolved (std::vector &unresolved_refs, ++ std::set &traced_ref_stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "\nRetrace indirect memory access after outer loop analysis:\n"); ++ for (unsigned i = 0; i < unresolved_refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, unresolved_refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ retrace_ref_info_unresolved (unresolved_refs[i], traced_ref_stmt); ++ } ++} ++ ++/* ================ phase 3 analyze_nested_kernels ================ */ ++ ++/* Return the inner most type for arrays and pointers of TYPE. */ ++ ++tree ++inner_type (tree type) ++{ ++ while (POINTER_TYPE_P (type) ++ || TREE_CODE (type) == ARRAY_TYPE) ++ type = TREE_TYPE (type); ++ return type; ++} ++ ++/* Check whether the input iv is the loop dimension boundary. */ ++ ++bool ++loop_bound_iv_p (tree t, tree &outer_loop_t) ++{ ++ if (t == NULL || TREE_CODE (t) != SSA_NAME ++ || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE) ++ return false; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ ++ /* NOP_EXPR convertion between PHI node and memory reference due to MACRO. ++ n_898 = PHI ++ _757 = (sizetype) n_898; ++ _900 = MEM[base: _726, index: _757, step: 8, offset: 0B]; ++ */ ++ while (gimple_code (def_stmt) == GIMPLE_ASSIGN ++ && gimple_assign_rhs_code (def_stmt) == NOP_EXPR) ++ def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt)); ++ ++ if (gimple_code (def_stmt) != GIMPLE_PHI) ++ return false; ++ ++ /* Filter scenarios with only two phi inputs. */ ++ if (gimple_phi_num_args (def_stmt) != 2) ++ return false; ++ ++ gphi *phi_stmt = as_a (def_stmt); ++ basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src; ++ basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src; ++ ++ class loop *loop = loop_containing_stmt (def_stmt); ++ bool res = false; ++ /* Two phi inputs, one from the current loop and one from the outer loop. */ ++ if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop))) ++ { ++ outer_loop_t = gimple_phi_arg_def (def_stmt, 1); ++ res = true; ++ } ++ else if ((src1->loop_father == loop) ++ && (src0->loop_father == loop_outer (loop))) ++ { ++ outer_loop_t = gimple_phi_arg_def (def_stmt, 0); ++ res = true; ++ } ++ ++ if (res) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "===> "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ return true; ++ } ++ return false; ++} ++ ++/* add worklist and walked list. */ ++ ++void ++add_worklist_walked (std::vector &worklist, std::set &walked, ++ tree node) ++{ ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ /* Avoid phi node cycle introduction, which makes the worklist unable ++ to end. */ ++ walked.insert (node); ++ } ++} ++ ++/* check bound iv and add worklist. 
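++   If T is defined by a two-input PHI that qualifies as a loop bound IV,
++   record it in MEM_REF's loop_bounds and continue from the outer-loop
++   argument; for assignments, queue the operands that may lead to further
++   bound IVs.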
*/ ++ ++void ++check_bound_iv_and_add_worklist (std::vector &worklist, ++ std::set &walked, ++ std::set &walked_loop, ++ tree t, data_ref &mem_ref) ++{ ++ if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) ++ return; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, t, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ tree out_loop_t = NULL_TREE; ++ if (loop_bound_iv_p (t, out_loop_t)) ++ { ++ basic_block bb = gimple_bb (def_stmt); ++ if (!walked_loop.count (bb)) ++ { ++ mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); ++ walked_loop.insert (bb); ++ } ++ add_worklist_walked (worklist, walked, out_loop_t); ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ ++ /* unary. */ ++ if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ else if (rhs_code == POINTER_PLUS_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ ++ /* binary. */ ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR ++ || rhs_code == MULT_EXPR) ++ { ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ } ++ } ++} ++ ++/* DFS trace the loop bound of iv. */ ++ ++bool ++trace_loop_bound_iv (data_ref &mem_ref) ++{ ++ /* In indirect memory access, the size cannot be determined based on the ++ loop boundary. However, we can take advantage of loop bound as an upper ++ bound (unrepeated memory access) to predict the variable footprint ++ involved in the specific loop dimension. */ ++ ++ /* Determine and record the boundary iv of the current index, ++ but do not trace it. */ ++ tree outer_loop_t = NULL_TREE; ++ /* indirect access example, mem_ref.index = _64 ++ _62 = MEM[symbol: uPtr, index: ivtmp.22_96, step: 4, offset: 0B]; ++ _63 = (long unsigned int) _62; ++ _64 = _63 * 8; ++ _65 = [openfoam_smooth.c:28:28] &bPrimePtr + _64; ++ _66 = *_65; */ ++ if (loop_bound_iv_p (mem_ref.index, outer_loop_t) || !mem_ref.regular_p) ++ { ++ mem_ref.loop_bounds.push_back ( ++ loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); ++ if (!mem_ref.regular_p) ++ return false; ++ } ++ ++ std::vector worklist; ++ worklist.push_back (mem_ref.base); ++ std::set walked; ++ std::set walked_loop; ++ ++ while (worklist.size ()) ++ { ++ tree t = worklist.back (); ++ worklist.pop_back (); ++ ++ /* add worklist. */ ++ check_bound_iv_and_add_worklist (worklist, walked, walked_loop, t, mem_ref); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nmem_ref access dimension: %ld\n", ++ mem_ref.loop_bounds.size ()); ++ fprintf (dump_file, "Traced variables: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ ++ return mem_ref.loop_bounds.size () > 0; ++} ++ ++/* dump loop bound. 
*/ ++ ++void ++loop_bound_dump (FILE *file, loop_bound &lb) ++{ ++ class loop *loop = lb.loop; ++ fprintf (file, "loop_bound: loop_%d (", loop->num); ++ if (loop->header) ++ fprintf (file, "header = %d", loop->header->index); ++ else ++ { ++ fprintf (file, "deleted)\n"); ++ return; ++ } ++ if (loop->latch) ++ fprintf (file, ", latch = %d", loop->latch->index); ++ fprintf (file, ", lb_niters = "); ++ print_generic_expr (file, lb.niters); ++ fprintf (file, ")\n\n"); ++} ++ ++/* static calculate data size. */ ++ ++void ++static_calculate_data_size (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nstatic_calculate_data_size\n"); ++ ++ tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ unsigned HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ unsigned HOST_WIDE_INT est_niter = tree_to_uhwi ++ (mem_ref.loop_bounds[i].niters); ++ unsigned int unroll = mem_ref.loop_bounds[i].unroll; ++ if (i == 0) ++ { ++ /* The unit conversion between byte, kilobytes, and megabytes is ++ 1024. */ ++ mem_ref.data_size = double (type_size ++ * est_niter * unroll) / 1024 / 1024; ++ } ++ else ++ mem_ref.data_size *= est_niter * unroll; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size); ++ } ++} ++ ++/* Recursive tracing and creating of dominant nodes. */ ++ ++tree ++trace_and_create_dominate_expr (tree expr, class loop *outermost) ++{ ++ if (expr == NULL_TREE || is_gimple_constant (expr)) ++ return expr; ++ ++ if (TREE_CODE (expr) != SSA_NAME) ++ return NULL_TREE; ++ ++ if (SSA_NAME_IS_DEFAULT_DEF (expr)) ++ return expr; ++ ++ gimple *stmt = SSA_NAME_DEF_STMT (expr); ++ basic_block def_bb = gimple_bb (stmt); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return NULL_TREE; ++ ++ if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb)) ++ return expr; ++ ++ if (gimple_code (stmt) != GIMPLE_ASSIGN) ++ return NULL_TREE; ++ ++ enum tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ tree_code_class code_class = TREE_CODE_CLASS (rhs_code); ++ tree type = TREE_TYPE (gimple_assign_lhs (stmt)); ++ tree rhs1 = trace_and_create_dominate_expr (gimple_assign_rhs1 (stmt), ++ outermost); ++ if (rhs1 == NULL_TREE) ++ return NULL_TREE; ++ ++ if (code_class == tcc_unary) ++ { ++ tree expr_new = build1 (rhs_code, type, rhs1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ else if (code_class == tcc_binary) ++ { ++ tree rhs2 = trace_and_create_dominate_expr (gimple_assign_rhs2 (stmt), ++ outermost); ++ if (rhs2 == NULL_TREE) ++ return NULL_TREE; ++ ++ tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Recursive parsing and craating of nodes in expr expressions. */ ++ ++tree ++parse_and_create_expr (tree expr, class loop *outermost) ++{ ++ if (expr == NULL_TREE || expr == chrec_dont_know ++ || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR) ++ { ++ /* tcc_expression (e.g., &q) situation combined with tcc_unary. 
*/ ++ if (TREE_CODE (expr) == ADDR_EXPR && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "tcc_expression case in ADDR_EXPR: "); ++ print_generic_expr (dump_file, expr, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr; ++ } ++ ++ if (TREE_CODE (expr) == SSA_NAME) ++ return trace_and_create_dominate_expr (expr, outermost); ++ else if (EXPR_P (expr)) ++ { ++ enum tree_code tree_code = TREE_CODE (expr); ++ tree_code_class code_class = TREE_CODE_CLASS (tree_code); ++ tree type = TREE_TYPE (expr); ++ tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost); ++ if (op1 == NULL_TREE) ++ return NULL_TREE; ++ ++ if (code_class == tcc_unary) ++ { ++ tree expr_new = build1 (tree_code, type, op1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ else if (code_class == tcc_binary) ++ { ++ tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost); ++ if (op2 == NULL_TREE) ++ return NULL_TREE; ++ ++ tree expr_new = fold_build2 (tree_code, type, op1, op2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ } ++ return NULL_TREE; ++} ++ ++/* Trace and creat dominate loop bounds. */ ++ ++void ++trace_and_create_dominate_loop_bounds (data_ref &mem_ref) ++{ ++ /* Check whether the niters is a loop dominant. ++ If not, trace and determine whether the result is dominant. If yes, ++ create the expr of the dominant node. ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n"); ++ ++ /* Determine the relationship between the boundary of the innermost loop and ++ the dominant of the outer loop and the processing. */ ++ loop_bound &outermost = mem_ref.loop_bounds.back (); ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ loop_bound ¤t = mem_ref.loop_bounds[i]; ++ tree &niters = current.niters; ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ ++ niters = parse_and_create_expr (niters, outermost.loop); ++ ++ if (niters == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "Tracing loop bound failed at dimension %d\n", ++ i); ++ } ++ mem_ref.calc_by = UNHANDLE_CALC; ++ break; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ } ++} ++ ++/* trace the dimension and corresponding loop bounds of mem_ref. ++ This function is used to supplement the information of mem_ref.loop_bounds. ++*/ ++ ++void ++trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) ++{ ++ /* In the same loop, some memory access dimensions are different. Remove ++ variables with fewer dimensions. ++ Previous cyclic filtering conditions and memory access node records and ++ tracing. ++ The false result is also processed. ++ */ ++ if (dump_file) ++ fprintf (dump_file, "\ncalculate_data_size\n"); ++ ++ /* Trace the loop bound iv of ref to determine the dimension. */ ++ /* Record data from the loop perspective to avoid repeated tracing. */ ++ if (!trace_loop_bound_iv (mem_ref)) ++ return; ++ ++ /* The traced mem_ref may have multiple dimensions, which corresponds to ++ multiple loops. 
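++   Each dimension contributes one loop_bound entry whose niters is resolved
++   below.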
*/ ++ /* And in the dimension-by-dimensional analysis, the computable way is ++ continuously reduced. */ ++ mem_ref.calc_by = STATIC_CALC; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ class loop *loop = mem_ref.loop_bounds[i].loop; ++ tree &niters = mem_ref.loop_bounds[i].niters; ++ ++ /* Set NULL_TREE to ensure that nb_iterations are retraced and ++ vec_nb_iterations are also extracted. */ ++ loop->nb_iterations = NULL_TREE; ++ niters = number_of_latch_executions (loop, false); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_dump (dump_file, loop); ++ ++ if (loop->unroll) ++ { ++ if (loop->unroll == USHRT_MAX && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX); ++ mem_ref.loop_bounds[i].unroll = loop->unroll; ++ } ++ ++ if ((niters == chrec_dont_know) && loop->vec_nb_iterations ++ && (loop->vec_nb_iterations != chrec_dont_know)) ++ niters = loop->vec_nb_iterations; ++ ++ if (niters == chrec_dont_know) ++ { ++ /* We derive est_loop_niters from function ++ `estimated_loop_iterations_int`. Usually only the innermost loop is ++ vectorized, so vec_nb_iterations can be 4 or 8 times as large as ++ `est_loop_niters` due to vectorization. However, function ++ `estimated_loop_iterations_int` only returns an integer instead of ++ a tree node expression, so it cannot substitute ++ function `number_of_latch_executions` in runtime computation. */ ++ HOST_WIDE_INT est_loop_niters = estimated_loop_iterations_int (loop); ++ if (est_loop_niters >= 0 && est_loop_niters < INT_MAX) ++ /* e.g., loop iterations from `estimated_loop_iterations_int`: (-1) ++ loop_144 (header = 519, latch = 625, niter = scev_not_known, ++ upper_bound = 1073741823, likely_upper_bound = 1073741823, ++ unroll = 1) */ ++ /* variable `niters` from `loop->vec_nb_iterations` ++ constant 34> */ ++ niters = build_int_cst (integer_type_node, (int) est_loop_niters); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ ++ if (niters == NULL_TREE || niters == chrec_dont_know) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, UNHANDLE_CALC); ++ else if (TREE_CODE (niters) != INTEGER_CST) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); ++ else ++ mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (mem_ref.calc_by == 2) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nSTATIC_CALC.\n"); ++ } ++ else if (mem_ref.calc_by == 1) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nRUNTIME_CALC.\n"); ++ } ++ else ++ fprintf (dump_file, "\nUNHANDLE_CALC.\n"); ++ } ++ } ++ ++ if (mem_ref.calc_by == RUNTIME_CALC) ++ trace_and_create_dominate_loop_bounds (mem_ref); ++ else if (mem_ref.calc_by == STATIC_CALC) ++ static_calculate_data_size (mem_ref); ++} ++ ++/* Get the loop's niters tree. ++ Return NULL_TREE if not found. */ ++ ++tree ++get_cur_loop_niters (std::map > &loop_refs, ++ class loop *loop) ++{ ++ if (loop_refs.count (loop) == 0) ++ return NULL_TREE; ++ std::vector bounds = loop_refs[loop][0].loop_bounds; ++ return bounds.size () ? bounds[0].niters : NULL_TREE; ++} ++ ++/* Trace the sources of the niters tree and return the ++ outermost depth of the loops containing them. ++ Return start_depth if not found. 
++ ++ example: ++ niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 ++ operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 ++ operand_num: 1, subtree:(int) i_end_417 ++ SSA_NAME of niters: i_end_417 ++ gimple of SSA: i_end_417 = PHI ++ return gimple depth; ++*/ ++ ++unsigned ++trace_outer_loop_depth (tree niters, unsigned start_depth) ++{ ++ /* If niter does not exist or the type is INTEGER_CST, ++ the loop bound is determined and return start_depth. */ ++ if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) ++ return start_depth; ++ ++ gimple *def_stmt = NULL; ++ /* niters examples: i_start_452, fEnd_35, fEnd_100. */ ++ enum tree_code niter_code = TREE_CODE (niters); ++ if (niter_code == SSA_NAME) ++ { ++ /* Trace the SSA that define this niter. */ ++ def_stmt = SSA_NAME_DEF_STMT (niters); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ssa_name of niters: "); ++ print_generic_expr (dump_file, niters); ++ fprintf (dump_file, "\ngimple of ssa: \n"); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ /* Termination condition of dfs. Return the depth of the bb block. */ ++ if (gimple_code (def_stmt) == GIMPLE_PHI ++ || gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return start_depth; ++ unsigned ret_depth = loop_depth (def_bb->loop_father); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Stop tracing the outer loop depth, "); ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", ++ ret_depth, def_bb->index); ++ } ++ return ret_depth; ++ } ++ /* 'ASSIGN': Use dfs to trace the rhs of the assignment statement. */ ++ else if (gimple_code (def_stmt) == GIMPLE_ASSIGN) ++ { ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == TARGET_MEM_REF) ++ /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, ++ offset: 0B] */ ++ return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); ++ else ++ { ++ /* M.218_658 = MIN_EXPR <_631, _657> */ ++ unsigned min_depth = start_depth; ++ unsigned operand_num = gimple_num_ops (def_stmt); ++ /* 'ASSIGN': start from 1 because op[0] is the lhs. */ ++ for (unsigned i = 1; i < operand_num; i++) ++ { ++ tree subtree = dyn_cast(def_stmt)->op[i]; ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, \ ++ start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ } ++ else ++ { ++ /* Adding termination conditions: ++ 1) Niters is MEM variable; ++ 2) Niters is a runtime value (smooth_uPtr), and consider ++ finding footprint in other mem_ref; ++ 3) Niters is loop variable (i_start/i_end), and the boundary in ++ the outer loop depends on the variable j_start/j_end. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The loop termination condition is " ++ "extended.\n"); ++ } ++ return start_depth; ++ } ++ } ++ /* The operand nums can be obtained when the tree code is as follows. 
*/ ++ else if (niter_code == NOP_EXPR || niter_code == MEM_REF ++ || niter_code == ARRAY_REF || niter_code == COND_EXPR ++ || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR ++ || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) ++ { ++ /* operand_num is the operand in the niters statement. ++ example: In the following niter statement, operand_num = 3. ++ (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ ++ unsigned operand_num = TREE_OPERAND_LENGTH (niters); ++ unsigned min_depth = start_depth; ++ for (unsigned i = 0; i < operand_num; i++) ++ { ++ tree subtree = TREE_OPERAND (niters, i); ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "niters is another tree code: %s\n", ++ get_tree_code_name (niter_code)); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return start_depth; ++ } ++} ++ ++/* Traces the ref dimension information in each loop. */ ++ ++void ++analyze_loop_refs_dimension (std::vector &refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (refs[i].trace_status_p == false) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_reference_dimension %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (refs[i]); ++ } ++} ++ ++/* analyze nested kernels ++ 1) multidimension loop analyze ++ 2) extended outer loop analyze ++*/ ++ ++bool ++analyze_nested_kernels (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); ++ ++ /* `kernels` may be added in during outer loop extension phase, ++ thus using initial size to avoid repeatedly analyzing. */ ++ unsigned init_kernels_size = kernels.size (); ++ for (unsigned i = 0; i < init_kernels_size; ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ analyze_loop_refs_dimension (loop_refs[loop]); ++ ++ unsigned depth = loop_depth (loop); ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, loop), depth); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", ++ depth, outer_depth); ++ /* param_outer_loop_num: number of loops of the extended outer loop. ++ Outermost loop should not be extended when outer_depth = 0. ++ `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == 0 || outer_depth == depth ++ || depth > outer_depth + param_outer_loop_num) ++ continue; ++ ++ /* Extend outer loop. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nStart extending outer loop\n"); ++ /* Superloops of the loop, start from the loop closest to the ++ current loop in the outermost loop. */ ++ for (int j = 0; j < param_outer_loop_num && --depth; ++j) ++ { ++ class loop *outer_loop = (*loop->superloops)[depth]; ++ /* The outer loop may be added when analyzing previous inner loops, ++ i.e. 
the outer loop contains two or more inner loops. */ ++ if (loop_refs.count (outer_loop)) ++ continue; ++ /* phase1 ~ phase3 analysis on the extended outer loop. */ ++ analyze_loop_dense_memory (kernels, loop_refs, outer_loop); ++ if (loop_refs.count (outer_loop) == 0) ++ continue; ++ for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt, ++ unresolved_refs); ++ analyze_loop_refs_dimension (loop_refs[outer_loop]); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, outer_loop), depth); ++ /* `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == depth) ++ break; ++ else ++ /* The outer loop cannot find the current loop boundary, ++ Remove the record of outer_loop from the loop_refs. */ ++ loop_refs.erase (outer_loop); ++ } ++ } ++ return true; ++} ++ ++/* ================ phase 4 filter_and_sort_kernels ================ */ ++ ++/* Get the edge probability information of each basic block in the loop. */ ++ ++float ++get_edge_prob (edge e, float minimum) ++{ ++ float fvalue = 0; ++ ++ profile_probability probability = e->probability; ++ if (probability.initialized_p ()) ++ { ++ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); ++ if (fvalue < minimum && probability.to_reg_br_prob_base ()) ++ fvalue = minimum; ++ } ++ return fvalue; ++} ++ ++/* Get the next bb with a high branch probability. */ ++ ++basic_block ++next_high_probability_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ return NULL; ++ ++ /* Limit the minimum probability value. */ ++ const float MINNUM_PROB = 0.00001f; ++ float minimum = MINNUM_PROB; ++ ++ gimple *stmt = last_stmt (bb); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ float true_edge_prob = get_edge_prob (true_edge, minimum); ++ float false_edge_prob = get_edge_prob (false_edge, minimum); ++ /* If the content of the branch does not include the candidate ++ kernel, the branch probability may not be limited. */ ++ /* The edge_prob may have precision error during static prediction, ++ so we need to relax the limit before comparison. */ ++ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) ++ return true_edge->dest; ++ else if ((false_edge_prob ++ >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, false_edge->dest)) ++ return false_edge->dest; ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No high probability bb:"); ++ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", ++ bb->index, true_edge_prob, false_edge_prob); ++ } ++ return NULL; ++ } ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ return e->dest; ++ } ++ return NULL; ++} ++ ++ ++/* Dump loop header bb. 
*/ ++ ++void ++dump_loop_headers (const char *name, std::vector &loops) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } ++} ++ ++/* Combine and sort candidate loops. */ ++ ++bool ++filter_and_sort_kernels (std::vector &sorted_kernels, ++ std::vector &kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set end_bb; ++ std::list walked_header_bb; /* Used to record nested loops. */ ++ std::set walked_non_header_bb_idx; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ if (kernels[i]->inner == NULL) ++ end_bb.insert (kernels[i]->header); ++ } ++ ++ dump_loop_headers ("kernels", kernels); ++ ++ if (!param_filter_kernels) ++ { ++ for (std::vector::iterator it = kernels.begin (); ++ it != kernels.end (); ++it) ++ sorted_kernels.push_back (*it); ++ } ++ else ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ while (bb) ++ { ++ if (bb == NULL) ++ return false; ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ /* bb is not the head of the loop, go to the next. */ ++ if (bb != bb->loop_father->header) ++ { ++ if (walked_non_header_bb_idx.count (bb->index)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find same-loop cycle. " ++ "Abort filtering process.\n"); ++ return false; ++ } ++ walked_non_header_bb_idx.insert (bb->index); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ ++ /* bb is the head of the loop. */ ++ if (bb != walked_header_bb.back ()) ++ { ++ if (end_bb.count (bb)) ++ { ++ sorted_kernels.push_back (bb->loop_father); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ if (loop_outer (bb->loop_father) != NULL ++ && get_loop_exit_edges (bb->loop_father).length () != 1) ++ return false; ++ walked_header_bb.push_back (bb); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ else ++ { ++ walked_header_bb.pop_back (); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ } ++ } ++ ++ dump_loop_headers ("sorted_kernels", sorted_kernels); ++ return true; ++} ++ ++/* Check whether the given bb is null. */ ++ ++bool ++check_null_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unexpected error at null bb.\n"); ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the loop father of the given bb is null. */ ++ ++bool ++check_null_loop_father (basic_block bb) ++{ ++ if (check_null_bb (bb)) ++ return true; ++ ++ if (bb->loop_father == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "bb %d's loop father is null.\n", bb->index); ++ return true; ++ } ++ return false; ++} ++ ++/* States for bb during path traversal. */ ++ ++enum bb_traversal_state ++{ ++ NOT_TRAVERSED = 0, ++ UNDER_TRAVERSAL, ++ FULLY_TRAVERSED ++}; ++ ++/* Detect abnormal revisit for bb during path traversal where bb is ++ 1) fully traversed, ++ 2) non-loop-header bb but currently under traversal. 
*/ ++ ++bool ++revisit_bb_abnormal_p (basic_block bb, std::vector &bb_visited, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* If the header bb has been already fully traversed, early exit ++ the function. */ ++ if (bb_visited[bb->index] == FULLY_TRAVERSED) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Already visited bb index %d. Abort.\n", ++ bb->index); ++ return true; ++ } ++ ++ /* If we revisit a non-header bb during next-bb traversal, we detect ++ an inner-loop cycle and dump warning info. Record this abnormal edge ++ in `unused_edges` for special treatment in path weight update. */ ++ if (!header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Warning: Find cycle at bb index %d. Abort.\n", ++ bb->index); ++ unused_edges.insert (std::make_pair (src_bb_idx, bb->index)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Check successor bb through edge e. Return true if successor bb is NULL or ++ out of loop. */ ++ ++bool ++check_succ_bb_abnormal_p (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb connected to src bb %d.\n", bb->index); ++ ++ return true; ++ } ++ ++ /* If bb is within one loop and the edge is pointing to the ++ outer loop, skip edge processing until a backedge to header ++ bb. `loop->num = 0` represents function body. */ ++ if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find edges to the outer loop at bb " ++ "index %d to bb index %d. Abort.\n", ++ bb->index, e->dest->index); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Criteria for retrieving the next bb in modified control-flow graph, which ++ creates a topological order for the bb traversal. */ ++ ++void ++get_next_toposort_bb (basic_block bb, std::vector &bb_visited, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* 1) Before bb returns to the loop header, bb will not go to the outer loop. ++ 2) After returning to the loop header, traverse all exit_bbs. ++ NEXT STEP: ++ 1) If goto jumps out of 2 loops, goto has to traverse smaller jumps first. ++ 2) If path length is the same => choose higher depth traversal path. */ ++ if (check_null_bb (bb) || check_null_loop_father (bb)) ++ return; ++ ++ /* Find last bb of function. */ ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ return; ++ ++ if (revisit_bb_abnormal_p (bb, bb_visited, header_bb_idx_set, unused_edges, ++ src_bb_idx)) ++ return; ++ ++ /* If we revisit the header bb of a loop, traverse all exit bbs. */ ++ if (header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ unsigned i; ++ edge e; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ ++ if (exits.length () > 1 && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Detect multiple exits at loop %d.\n", ++ bb->loop_father->num); ++ ++ FOR_EACH_VEC_ELT (exits, i, e) ++ { ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, src_bb_idx); ++ } ++ return; ++ } ++ ++ /* Post-order traversal for normal bb. 
*/ ++ bb_visited[bb->index] = UNDER_TRAVERSAL; ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_succ_bb_abnormal_p (bb, e)) ++ continue; ++ ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, bb->index); ++ } ++ ++ /* bb is marked as fully traversed and all its descendents have been ++ fully traversed due to post-order traversal. */ ++ bb_visited[bb->index] = FULLY_TRAVERSED; ++ bb_topo_order.push_back (bb); ++} ++ ++/* A struct that represents the longest path weight at each bb. */ ++ ++struct weight ++{ ++ /* Longest path weight at current bb. */ ++ gcov_type bb_count; ++ ++ /* Prev bb from the current longest path. */ ++ int prev_bb_idx; ++}; ++ ++/* A helper function for checking whether overflow will occur when adding two ++ gcov_type weights. */ ++ ++bool ++check_weight_overflow (gcov_type a, gcov_type b) ++{ ++ if ((a > 0 && b > INT64_MAX - a) || (a < 0 && b < INT64_MIN - a)) ++ return true; ++ ++ return false; ++} ++ ++/* A helper function that update the weight of the current longest path to ++ bb_idx_dst and a new path pointing from bb_idx_src to bb_idx_dst. */ ++ ++void ++update_path_weight (std::vector &bb_weights, int bb_idx_src, ++ int bb_idx_dst, gcov_type weight_dst) ++{ ++ if (check_weight_overflow (bb_weights[bb_idx_src].bb_count, weight_dst) ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Path weight overflow at src bb %d " ++ "and dest bb %d.\n", ++ bb_idx_src, bb_idx_dst); ++ } ++ if (bb_weights[bb_idx_dst].bb_count ++ < bb_weights[bb_idx_src].bb_count + weight_dst) ++ { ++ bb_weights[bb_idx_dst].bb_count ++ = bb_weights[bb_idx_src].bb_count + weight_dst; ++ bb_weights[bb_idx_dst].prev_bb_idx = bb_idx_src; ++ } ++} ++ ++/* Check whether the required bb/loop info for path update is null. */ ++ ++bool ++check_null_info_in_path_update (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for edge connected " ++ "to src bb %d.\n", ++ bb->index); ++ return true; ++ } ++ ++ if (check_null_loop_father (bb) || check_null_loop_father (e->dest)) ++ return true; ++ ++ return false; ++} ++ ++/* Update path weight to loop exit bbs where the current source bb is connected ++ to header bb using a backedge. */ ++ ++void ++update_backedge_path_weight (std::vector &bb_weights, basic_block bb, ++ const std::set > &unused_edges) ++{ ++ unsigned i; ++ edge e_exit; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ FOR_EACH_VEC_ELT (exits, i, e_exit) ++ { ++ if (check_null_bb (e_exit->dest)) ++ { ++ if (e_exit->src != NULL && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for exiting edge " ++ "connected to src bb %d.\n", ++ e_exit->src->index); ++ continue; ++ } ++ ++ if (unused_edges.count (std::make_pair (bb->index, e_exit->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ update_path_weight (bb_weights, bb->index, e_exit->dest->index, ++ e_exit->dest->count.to_gcov_type ()); ++ } ++} ++ ++/* Update the longest length of the path through control flow graph. 
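As a minimal standalone model of the relaxation performed here (the graph is assumed acyclic and visited in the topological order produced earlier; weights stand in for basic-block counts and every name is illustrative):

    #include <cstdint>
    #include <vector>

    struct toy_weight
    {
      int64_t count;   /* Best (heaviest) path weight ending at this node.  */
      int prev;        /* Predecessor on that path, -1 if none yet.  */
    };

    /* Relax one edge src -> dst whose destination contributes dst_count,
       keeping whichever path to dst is heavier.  */
    static void
    toy_relax (std::vector<toy_weight> &w, int src, int dst, int64_t dst_count)
    {
      if (w[dst].count < w[src].count + dst_count)
        {
          w[dst].count = w[src].count + dst_count;
          w[dst].prev = src;
        }
    }

Because nodes are taken in topological order, each source weight is final before its outgoing edges are relaxed, and the heaviest path is recovered afterwards by walking the prev indices backwards, which is what the backtracking step further down relies on.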
*/ ++ ++void ++update_max_length_of_path (std::vector &bb_weights, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ const std::set > &unused_edges) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start update weight traversal:\n"); ++ ++ while (!bb_topo_order.empty ()) ++ { ++ basic_block bb = bb_topo_order.back (); ++ bb_topo_order.pop_back (); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_null_info_in_path_update (bb, e)) ++ continue; ++ ++ if (unused_edges.count (std::make_pair (bb->index, e->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ else if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ /* Outer-loop edge case. */ ++ continue; ++ } ++ else if (header_bb_idx_set.count (e->dest->index) ++ && bb->loop_father == e->dest->loop_father) ++ { ++ /* Backedge case. */ ++ update_backedge_path_weight (bb_weights, bb, unused_edges); ++ } ++ else ++ { ++ /* Normal edge case. */ ++ update_path_weight (bb_weights, bb->index, e->dest->index, ++ e->dest->count.to_gcov_type ()); ++ } ++ } ++ } ++} ++ ++/* Collect all header bb of loops in the function beforehand. */ ++ ++void ++collect_header_bb_for_fn (std::set &header_bb_idx_set) ++{ ++ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) ++ header_bb_idx_set.insert (loop->header->index); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck header bbs:\n"); ++ for (std::set::iterator it = header_bb_idx_set.begin (); ++ it != header_bb_idx_set.end (); ++it) ++ fprintf (dump_file, "%d ", *it); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Record loop executing order and bb high-executing path. */ ++ ++void ++record_high_execution_path (std::vector &sorted_kernel, ++ std::vector &bb_path, int bb_num_max) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPATH FOR %s: ", get_name (cfun->decl)); ++ ++ std::set loop_set; ++ for (int i = bb_path.size() - 1; i >= 0; --i) ++ { ++ int bb_idx = bb_path[i]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb_idx); ++ gcc_assert (bb_idx < bb_num_max); ++ ++ class loop *loop = BASIC_BLOCK_FOR_FN (cfun, bb_idx)->loop_father; ++ if (!loop_set.count (loop->num)) ++ { ++ loop_set.insert (loop->num); ++ sorted_kernel.push_back (loop); ++ } ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n"); ++} ++ ++/* Combine and sort candidate loops using feedback information. */ ++ ++bool ++filter_and_sort_kernels_feedback (std::vector &sorted_kernel, ++ std::set &bb_pathset) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set header_bb_idx_set; ++ std::list bb_topo_order; ++ ++ /* Quoted from GCC internal, Chapter 15.1, "the index for any block should ++ never be greater than `last_basic_block`." Therefore, we use this ++ variable for retrieving the max bb index of a function. */ ++ /* Since the pass does not add/remove/merge basic blocks until Phase 6 ++ and previous passes will update ssa accordingly, we do not need to ++ `compact_blocks` to update bb indices currently. 
*/ ++ int bb_num_max = last_basic_block_for_fn (cfun) + 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nMaximal number of possible bbs in the " ++ "function: %d\n", ++ bb_num_max); ++ std::vector bb_visited = std::vector(bb_num_max, 0); ++ ++ collect_header_bb_for_fn (header_bb_idx_set); ++ basic_block bb_start = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ /* Step 1: Get topological order of bb during traversal. */ ++ std::set > unused_edges; ++ get_next_toposort_bb (bb_start, bb_visited, bb_topo_order, header_bb_idx_set, ++ unused_edges, -1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck bbs in topological order:\n"); ++ for (std::list::iterator it = bb_topo_order.begin (); ++ it != bb_topo_order.end (); ++it) ++ fprintf (dump_file, "%d ", (*it)->index); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Step 2: Update weights of nodes and path. */ ++ weight weight_init = {-1, -1}; ++ std::vector bb_weights = std::vector(bb_num_max, weight_init); ++ bb_weights[0].bb_count = 0; /* ENTRY bb has count 0 and prev bb as -1. */ ++ update_max_length_of_path (bb_weights, bb_topo_order, header_bb_idx_set, ++ unused_edges); ++ ++ /* Step 3: Backtrack a path from EXIT bb to ENTRY bb. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nCheck counts for each bb:\n"); ++ ++ std::vector bb_path; ++ int tmp_bb_idx = 1; ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ while (tmp_bb_idx > 0 && tmp_bb_idx < bb_num_max) ++ { ++ if (bb_pathset.count (tmp_bb_idx)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf(dump_file, "ERROR: already seen bb index %d\n", ++ tmp_bb_idx); ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d: %ld, ", tmp_bb_idx, ++ bb_weights[tmp_bb_idx].bb_count); ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ } ++ /* It is possible that the function exit code is wrapped around as an ++ variable, and thus, EXIT_BB in cfg is not connected to any bb. */ ++ if (tmp_bb_idx < 0 || tmp_bb_idx >= bb_num_max) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled scenario at backtracking highly " ++ "executed path with tmp_bb_idx %d", ++ tmp_bb_idx); ++ } ++ return false; ++ } ++ ++ record_high_execution_path (sorted_kernel, bb_path, bb_num_max); ++ ++ return true; ++} ++ ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; ++ ++/* Memory reference group, different reference of the same variable. */ ++ ++struct ref_group ++{ ++ /* source variables. */ ++ tree var; ++ ++ /* variable size, Unit: MB. */ ++ double var_size; ++ ++ /* first ref for insert hint. */ ++ data_ref first_use; ++ ++ /* first ref with the highest-order CALC. */ ++ data_ref first_calc_use; ++ ++ /* reuse scores of variables. */ ++ float reuse_level; ++ ++ /* method of calculating the var size. */ ++ calc_type calc_by; ++ ++ /* memory reference index for specific variable. */ ++ unsigned int mem_ref_index; ++ ++ /* variable dimension. 
*/ ++ unsigned int dim; ++ ++ /* True if first_calc_use's footprint replaces that of first_use. */ ++ unsigned int transfer_ft; ++ ++ /* Accessing Reference Records in Different Modes (key_index): ++ 000: write, random, non-parallel ++ 001: write, random, parallel ++ 010: write, regular, non-parallel ++ 011: write, regular, parallel ++ 100: read, random, non-parallel ++ 101: read, random, parallel ++ 110: read, regular, non-parallel ++ 111: read, regular, parallel ++ */ ++ std::map > ref_use; ++ ++ /* scores for different memory references. */ ++ std::vector ref_scores; ++ ++ ref_group () ++ { ++ var = NULL_TREE; ++ var_size = 0; ++ reuse_level = 0; ++ calc_by = UNHANDLE_CALC; ++ mem_ref_index = 0; ++ dim = 1; ++ transfer_ft = 0; ++ } ++}; ++ ++/* Get the integer part for log(x) with the given base. */ ++ ++static unsigned int ++flog (float x, float base) ++{ ++ unsigned int res = 0; ++ while (x >= base) ++ { ++ ++res; ++ x /= base; ++ } ++ return res; ++} ++ ++/* Calculate reuse time for a memory reference in ref_group. */ ++ ++float ++calculate_reuse_times (std::vector &mem_refs, std::set &loop_set, ++ std::set &bb_set, unsigned int var_dim) ++{ ++ const float SAME_BB_REUSE_WEIGHT = 0.1; ++ const float SAME_LOOP_REUSE_WEIGHT = 0.5; ++ const float NORMAL_REUSE_WEIGHT = 1.; ++ ++ float reuse_time_sum = 0.; ++ for (std::vector::iterator it = mem_refs.begin (); ++ it != mem_refs.end (); ++it) ++ { ++ const data_ref &mem_ref = *it; ++ float reuse_time = 0.; ++ if (bb_set.count (mem_ref.bb_idx)) ++ { ++ /* If the two mem_ref belong to the same bb, the new reuse ++ weight will not exceed 0.1 divided by the mem_ref mode group ++ size. ++ NEXT STEP: The following equation may hold and cause commutative ++ property of read and write op not holding: ++ write + (reused) read != read + (reused) write. ++ However, it seems that write mem_ref is always before read mem_ref, ++ so the above comparison does not show up in calculation due to ++ intrinsic in-order property of tree map, but this condition is ++ quite fragile anyway. */ ++ reuse_time = SAME_BB_REUSE_WEIGHT / mem_refs.size (); ++ } ++ else ++ { ++ bb_set.insert (mem_ref.bb_idx); ++ if (loop_set.count (mem_ref.loop_idx)) ++ { ++ /* If the mem_ref belongs to a loop where any other mem_ref is in, ++ the new reuse weight will be 0.5. */ ++ reuse_time = SAME_LOOP_REUSE_WEIGHT; ++ } ++ else ++ { ++ /* If the mem_ref is reused but not in the same group with any ++ other mem_ref, the new reuse weight will be 1. */ ++ loop_set.insert (mem_ref.loop_idx); ++ reuse_time = NORMAL_REUSE_WEIGHT; ++ } ++ } ++ unsigned int used_dim = std::min (mem_ref.loop_depth, var_dim); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "used_dim : %u, loop_depth : %u\n", used_dim, ++ mem_ref.loop_depth); ++ unsigned int power = flog (std::max (0u, mem_ref.loop_depth - used_dim) ++ + 2, 2.); ++ reuse_time_sum += reuse_time * (used_dim * used_dim / 2.) * (power); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "(%f * (%u * %u / 2) * (%u) = %f\n", ++ reuse_time, used_dim, used_dim, power, ++ reuse_time * (used_dim * used_dim / 2.) * (power)); ++ } ++ return reuse_time_sum; ++} ++ ++/* Calculate reuse level. 
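To make the weighting above concrete before it is combined per group below, a small worked example under assumed inputs (every value here is illustrative, not a default):

    #include <algorithm>
    #include <cstdio>

    /* Same integer log as flog above, repeated so the example is
       self-contained.  */
    static unsigned
    toy_flog (float x, float base)
    {
      unsigned res = 0;
      while (x >= base)
        {
          ++res;
          x /= base;
        }
      return res;
    }

    int main ()
    {
      /* Hypothetical reference: depth-3 loop nest, 2-dimensional variable,
         first use in a loop not seen before (reuse weight 1.0).  */
      unsigned loop_depth = 3, var_dim = 2;
      unsigned used_dim = std::min (loop_depth, var_dim);
      unsigned power = toy_flog (std::max (0u, loop_depth - used_dim) + 2, 2.);
      float term = 1.0f * (used_dim * used_dim / 2.f) * power;
      printf ("reuse term = %f\n", term);   /* 1.0 * 2 * 1 = 2.0 */
      return 0;
    }

A second reference from the same bb in a two-element group would add only 0.1 / 2 * 2 * 1 = 0.1, reflecting how strongly same-bb reuse is discounted.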
*/ ++ ++float ++calculate_reuse_level (std::map > &var_use, ++ unsigned int var_dim, double var_size) ++{ ++ const float VAR_SIZE_CACHE_CAPACITY = 1 / 4.; ++ const int WITHIN_CACHE_SIZE_COST = 4; ++ const float BYTE_CONVERT_RATIO = 1024.; ++ ++ float level = 0.; ++ std::set loop_set; ++ std::set bb_set; ++ bool has_write_op = false; ++ for (std::map >::iterator it = var_use.begin (); ++ it != var_use.end (); ++it) ++ { ++ unsigned int parallel = 1; ++ unsigned int regular = 1; ++ ++ if ((*it).second[0].parallel_p) ++ parallel = PARALLEL_NUM; ++ if (!(*it).second[0].regular_p) ++ regular = INDIRECT_ACCESS_VALUE; ++ if (!(*it).second[0].read_p) ++ has_write_op = true; ++ ++ /* In serial reuse, we will later check whether they are in the ++ same cacheline. If yes, delete the reuse. For details, see the ++ reuse analysis of prefetching and eliminate redundancy. */ ++ float reuse_times = calculate_reuse_times ((*it).second, loop_set, ++ bb_set, var_dim); ++ float add = parallel * reuse_times * regular; ++ level += add; ++ if (add && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d : %d * %f * %d = %f\n", ++ (*it).first, parallel, reuse_times, regular, add); ++ } ++ ++ bool within_llc_size = var_size > param_l2_cache_size / BYTE_CONVERT_RATIO ++ && var_size < VAR_SIZE_CACHE_CAPACITY ++ * param_llc_capacity_per_core; ++ ++ float final_level = has_write_op ? (level * WRITE_COST) : level; ++ final_level = within_llc_size ? (final_level * WITHIN_CACHE_SIZE_COST) ++ : final_level; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "final level : %d * %f * %d = %f\n", ++ has_write_op ? WRITE_COST : 1, level, ++ within_llc_size ? WITHIN_CACHE_SIZE_COST : 1, final_level); ++ return final_level; ++} ++ ++/* Comparison of reference reuse level. */ ++ ++bool ++ref_group_reuse_cmp (const ref_group &a, const ref_group &b) ++{ ++ if (a.reuse_level != b.reuse_level) ++ return a.reuse_level > b.reuse_level; ++ else ++ return get_name (a.var) < get_name (b.var); ++} ++ ++/* Dump key information of reference group and memory access for llc hint. */ ++ ++void ++dump_key_info_for_llc_hint (std::vector &ref_groups) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nLLC hint info:\n"); ++ fprintf (dump_file, "rank\tvar\t(lineno, direct, vectorized, write)\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ data_ref &mem_ref = ref_groups[i].first_use; ++ fprintf (dump_file, "\t(%d, %u, %u, %u)", ++ expand_location (mem_ref.stmt->location).line, ++ mem_ref.regular_p, mem_ref.vectorize_p, 1 - mem_ref.read_p); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Sort reference groups. 
*/ ++ ++void ++sort_ref_groups (std::vector &ref_groups, ++ std::map &ref_groups_map) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); ++ ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use, ++ (*it).second.dim, ++ (*it).second.var_size); ++ ref_groups.push_back ((*it).second); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); ++ fprintf (dump_file, " : %f\n\n", (*it).second.reuse_level); ++ } ++ } ++ ++ std::sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nsorted ref_groups:\n"); ++ fprintf (dump_file, "rank\tvar\t(data_size, dim, num_of_mem_ref, " ++ "need_tmp_name): reuse_level_score\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0; ++ fprintf (dump_file, "\t(%lf, %u, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].dim, ref_groups[i].ref_scores.size (), ++ need_tmp_name); ++ fprintf (dump_file, " : %f\n", ref_groups[i].reuse_level); ++ } ++ fprintf (dump_file, "\n"); ++ ++ fprintf (dump_file, "first_use:\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ fprintf (dump_file, " : "); ++ if (!ref_groups[i].first_use.vectorize_p) ++ print_generic_expr (dump_file, ref_groups[i].first_use.ref, ++ TDF_SLIM); ++ else ++ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ dump_key_info_for_llc_hint (ref_groups); ++} ++ ++/* Attributes of variable data. */ ++ ++enum data_attribute ++{ ++ DA_PARALLEL = 0, ++ DA_REGULAR, ++ DA_READ ++}; ++ ++/* Record memory reference by use mode. ++ If the reference group is not found, create a group. */ ++ ++void ++record_mem_ref (std::map &ref_groups, data_ref &mem_ref) ++{ ++ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) ++ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); ++ ++ if (!ref_groups.count (mem_ref.var)) ++ { ++ ref_group ref_group; ++ ref_group.var = mem_ref.var; ++ ref_group.first_use = mem_ref; ++ ref_group.first_calc_use = mem_ref; ++ ref_groups[mem_ref.var] = ref_group; ++ } ++ ++ /* Ref_groups' calc_by reflects the highest order of calc_by that can be ++ achieved by all mem_ref of ref_groups. The first mem_ref that achieves ++ this order is defined to be `first_calc_use`. Later after sorting ++ mem_refs, calc_by will be replaced by the calc_by of `first_use`, and ++ even by the calc_by of `first_calc_use`. 
*/ ++ if (mem_ref.calc_by > ref_groups[mem_ref.var].calc_by) ++ { ++ ref_groups[mem_ref.var].calc_by = mem_ref.calc_by; ++ ref_groups[mem_ref.var].first_calc_use = mem_ref; ++ } ++ ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].dim = std::max (ref_groups[mem_ref.var].dim, ++ (unsigned int) mem_ref.loop_bounds.size ()); ++ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); ++ ++ ref_score ref_level = { mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ expand_location (mem_ref.stmt->location).line }; ++ ref_groups[mem_ref.var].ref_scores.push_back (ref_level); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "recorded in: "); ++ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); ++ fprintf (dump_file, ":%d:%ld\n", index, ++ ref_groups[mem_ref.var].ref_use[index].size () - 1); ++ ++ fprintf (dump_file, "base: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ ++ fprintf (dump_file, ", index: "); ++ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); ++ ++ fprintf (dump_file, ", step: "); ++ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.step)); ++ else ++ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); ++ ++ fprintf (dump_file, ", offset: "); ++ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.offset)); ++ else ++ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); ++ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); ++ ++ fprintf (dump_file, ", size: %lf", mem_ref.data_size); ++ fprintf (dump_file, "\n\n"); ++ } ++} ++ ++/* Rank data reference index level. */ ++ ++bool ++best_insert_cmp (const ref_score &a, const ref_score &b) ++{ ++ /* NEXT STEP: We can also calculate gap using static/feedback info inferred ++ from historical maximum bb count: ++ gap = hist_max_bb_ct / (alpha * max (a.bb_ct, b.bb_ct)) + 1. ++ Also, bb count needs to be smoothed and scaled as divisor can be 0. ++ history maximum bb count can be obtained in Phase 4. */ ++ const float gap = 1; ++ if (a.d_ref.loop_depth != b.d_ref.loop_depth) ++ return a.d_ref.loop_depth > b.d_ref.loop_depth; ++ else if (a.d_ref.regular_p != b.d_ref.regular_p) ++ return a.d_ref.regular_p > b.d_ref.regular_p; ++ else if (abs (double (std::max (a.bb_count, b.bb_count) + 1) ++ / double (std::min (a.bb_count, b.bb_count) + 1) - 1) > gap) ++ return a.bb_count > b.bb_count; ++ else if (a.line != b.line) ++ return a.line < b.line; ++ else if (a.d_ref.read_p != b.d_ref.read_p) ++ return a.d_ref.read_p < b.d_ref.read_p; ++ else ++ return a.d_ref.vectorize_p > b.d_ref.vectorize_p; ++} ++ ++/* Sort data reference index level within one reference group in non-decreasing ++ order of the customized sorting scheme. */ ++ ++void ++sort_mem_ref_in_ref_group (std::map &ref_groups_map) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nsorted data_references:\n"); ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ ref_group &curr_ref_group = (*it).second; ++ std::vector &ref_scores = curr_ref_group.ref_scores; ++ std::stable_sort (ref_scores.begin (), ref_scores.end (), ++ best_insert_cmp); ++ /* Update ref_group's first_use and calc_by with the first mem_ref after ++ sorting. 
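The ranking that chooses this first mem_ref treats two bb counts as equal when they are close; a standalone sketch of that tolerance test (the plus-one smoothing follows the comparison above, the names are illustrative):

    #include <cmath>
    #include <cstdint>

    /* Counts only decide the order when the larger exceeds the smaller by
       more than `gap` in relative terms; +1 keeps the divisor non-zero.  */
    static bool
    toy_counts_differ (int64_t a, int64_t b, double gap)
    {
      int64_t hi = a > b ? a : b;
      int64_t lo = a > b ? b : a;
      return std::fabs (double (hi + 1) / double (lo + 1) - 1) > gap;
    }

With the gap of 1 used above, counts are only ranked by magnitude when one is more than roughly twice the other; otherwise the tie falls through to the earlier source line.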
*/ ++ curr_ref_group.first_use = curr_ref_group.ref_scores[0].d_ref; ++ curr_ref_group.calc_by = curr_ref_group.first_use.calc_by; ++ ++ /* When transferring footprint is enabled, it is allowed to transfer ++ the statically-calculated footprint of a mem_ref from the same ++ ref_group to `first_use` mem_ref. */ ++ if (param_transfer_footprint ++ && curr_ref_group.first_use.calc_by == UNHANDLE_CALC) ++ { ++ if (curr_ref_group.first_calc_use.calc_by > RUNTIME_CALC) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, "\nfirst_use: "); ++ print_gimple_stmt (dump_file, curr_ref_group.first_use.stmt, ++ 0, TDF_LINENO); ++ fprintf (dump_file, "first_calc_use: "); ++ print_gimple_stmt (dump_file, ++ curr_ref_group.first_calc_use.stmt, ++ 0, TDF_LINENO); ++ } ++ ++ curr_ref_group.calc_by = curr_ref_group.first_calc_use.calc_by; ++ curr_ref_group.transfer_ft = 1; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, ": cannot transfer footprint to " ++ "first use mem_ref.\n"); ++ } ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, " : %lu\n", ref_scores.size ()); ++ for (unsigned int i = 0; i < ref_scores.size (); ++i) ++ { ++ fprintf (dump_file, "mem_ref_index %u: ", i); ++ print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, ++ TDF_LINENO); ++ fprintf (dump_file, "bb-%d ", ++ ref_scores[i].d_ref.stmt->bb->index); ++ fprintf (dump_file, "count %ld\n", ref_scores[i].bb_count); ++ } ++ fprintf (dump_file, "\n\n"); ++ } ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++bool ++record_and_sort_ref_groups (std::vector &ref_groups, ++ std::vector &kernels, ++ std::map > &loop_refs, ++ std::set bb_pathset) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); ++ ++ std::map ref_groups_map; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ data_ref &mem_ref = loop_refs[loop][j]; ++ if (mem_ref.trace_status_p) ++ { ++ if (!param_filter_mode || (param_filter_mode ++ && bb_pathset.count (mem_ref.stmt->bb->index))) ++ record_mem_ref (ref_groups_map, mem_ref); ++ } ++ } ++ } ++ ++ /* Sort mem_ref within ref_group by local count and update first_use's ++ data_ref, stable sort. */ ++ sort_mem_ref_in_ref_group (ref_groups_map); ++ sort_ref_groups (ref_groups, ref_groups_map); ++ ++ return ref_groups.size () > 0; ++} ++ ++/* ================ phase 6 issue_llc_hint ================ */ ++ ++/* Issue vectorized mask prefetch gimple. 
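Two details of the insertion below are easy to state in isolation: the prefetch operation encoding picked from the configured LLC level, and the byte distance the hint runs ahead of the access. A standalone sketch (the element size and the element offset are hypothetical, not the parameter defaults):

    #include <cstdint>
    #include <cstdio>

    /* Map the configured LLC level to the prefetch operation encoding used
       here: 4 is PLDL3KEEP, 6 is PLDL4KEEP, anything else is rejected.  */
    static int
    toy_prfop_for_level (int llc_level)
    {
      if (llc_level == 3)
        return 4;
      if (llc_level == 4)
        return 6;
      return -1;
    }

    int main ()
    {
      /* e.g. hint 1024 elements ahead on an 8-byte element type.  */
      uint64_t distance = 1024 * 8;
      printf ("prfop %d, %llu bytes ahead\n", toy_prfop_for_level (4),
              (unsigned long long) distance);
      return 0;
    }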
*/ ++ ++void ++issue_mask_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd.\n"); ++ ++ /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); ++ .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); ++ */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree scale = gimple_call_arg (stmt, 1); ++ tree final_mask = gimple_call_arg (stmt, 2); ++ tree target = NULL_TREE; ++ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) ++ target = gimple_call_arg (stmt, 3); ++ else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) ++ target = gimple_call_lhs (stmt); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) ++ /* for simulation, 4: PLDL3KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ else if (param_llc_level == 4) ++ /* 6: PLDL4KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ /* target: vector_type - XXX_type. */ ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, 5, addr, scale, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue vectorized mask gather prefetch gimple. */ ++ ++void ++issue_mask_gather_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); ++ ++ /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... }, ++ loop_mask_4); */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree vec_offset = gimple_call_arg (stmt, 1); ++ tree scale = gimple_call_arg (stmt, 2); ++ tree zero = gimple_call_arg (stmt, 3); ++ tree final_mask = gimple_call_arg (stmt, 4); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) // for simulation ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP ++ else if (param_llc_level == 4) ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ tree target = gimple_call_lhs (stmt); ++ /* add offset. 
*/ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_GATHER_PREFETCH, 7, addr, ++ vec_offset, scale, zero, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue builtin prefetch gimple. */ ++ ++void ++issue_builtin_prefetch (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert prfm.\n"); ++ /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ ++ gimple *stmt = mem_ref.stmt; ++ tree ref = mem_ref.ref; ++ ++ tree scale = mem_ref.step; ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (scale == NULL_TREE) ++ { ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). */ ++ scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ if (scale == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " ++ "variable. Stop builtin_prefetch.\n\n"); ++ return; ++ } ++ } ++ ++ tree addr = build_fold_addr_expr_with_type (ref, ptr_type_node); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), ++ true, NULL, true, GSI_SAME_STMT); ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset ++ * tree_to_uhwi (scale); ++ ++ addr = fold_build_pointer_plus_hwi (addr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ /* __builtin_prefetch (_68, 0, 1); ++ 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality ++ (high means strong locality) */ ++ gcall *call = NULL; ++ if (param_llc_level == 3) ++ { ++ /* for simulation. ++ BUILT_IN_PREFETCH (addr, rw, locality). */ ++ call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, addr, integer_zero_node, integer_one_node); ++ } ++ else if (param_llc_level == 4) ++ { ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ call = gimple_build_call ( ++ builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), ++ 3, addr, integer_zero_node, prfop); ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Static form insertion and issue instruction. We may check the ++ determination of the ARM SVE architecture before SVE hint insertion. 
*/ ++ ++void ++static_issue (std::vector &ref_groups, int num_issue_var) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue\n"); ++ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref mem_ref = ref_groups[i].first_use; ++ if (mem_ref.vectorize_p) ++ { ++ enum internal_fn ifn_code = gimple_call_internal_fn (mem_ref.stmt); ++ if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) ++ issue_mask_prefetch (mem_ref.stmt); ++ else if (ifn_code == IFN_MASK_GATHER_LOAD) ++ issue_mask_gather_prefetch (mem_ref.stmt); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "other vectorized internal function\n"); ++ } ++ else ++ issue_builtin_prefetch (mem_ref); ++ } ++} ++ ++/* Check whether all loop bounds (niters) used for calculating the footprints ++ of previously-executed ref_groups are defined in a dominated bb to the ++ currentbranch bb, where the conditional expression requires the loop bound ++ info. */ ++ ++bool ++check_def_use_chain (std::vector &ref_groups, ++ basic_block &branch_header_bb, ++ std::vector &ref_group_idx) ++{ ++ for (std::vector::iterator it = ref_group_idx.begin (); ++ it != ref_group_idx.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (niters); ++ basic_block def_bb = gimple_bb (def_stmt); ++ /* Check dominator relationship of def bb and branch bb. */ ++ /* Case 1: Check whether the def bb is the single predecessor block ++ of header bb. */ ++ if (single_pred_p (branch_header_bb)) ++ { ++ basic_block branch_bb_prev = single_pred (branch_header_bb); ++ if (branch_bb_prev->index == def_bb->index) ++ continue; ++ } ++ /* Case 2: Check whether the branch bb is dominated by the def ++ bb. */ ++ if (!dominated_by_p (CDI_DOMINATORS, branch_header_bb, def_bb)) ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Generate the stmts for calculating the size. Later we will consider nested ++ multi-branches scenarios and check more information of niters when it is ++ a COND_EXPR. */ ++ ++tree ++calc_stmts_gen (std::vector &ref_groups, ++ gimple_seq &cond_expr_stmt_list, ++ basic_block branch_header_bb, ++ std::vector &ref_group_idx_curr, ++ std::vector &ref_group_idx_prev, tree &cumul_size) ++{ ++ /* Check whether the bbs of def stmt for footprint loop bounds dominates ++ the bb of new runtime branching conditional. */ ++ if (!check_def_use_chain (ref_groups, branch_header_bb, ref_group_idx_prev)) ++ return NULL_TREE; ++ ++ /* Accumulated allocation size. */ ++ for (std::vector::iterator it = ref_group_idx_curr.begin (); ++ it != ref_group_idx_curr.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ tree var = mem_ref.var; ++ tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). 
*/ ++ if (unit == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Cannot detect size unit " ++ "(use 1 byte) for variable %s: ", ++ get_name (var)); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ unit = size_one_node; ++ } ++ tree size = NULL_TREE; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ ++ /* COND_EXPR. */ ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ if (size == NULL_TREE) ++ { ++ size = niters; ++ } else { ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, ++ size); ++ } ++ } ++ unit = build1 (NOP_EXPR, TREE_TYPE (size), unit); ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (size), size, unit); ++ size = build1 (FLOAT_EXPR, double_type_node, size); ++ cumul_size = fold_build2 (PLUS_EXPR, double_type_node, cumul_size, ++ size); ++ ref_group_idx_prev.push_back (*it); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "cumul_size = "); ++ print_generic_expr (dump_file, cumul_size, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ /* Create a stmt list for size calculation. */ ++ tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); ++ div = build1 (NOP_EXPR, double_type_node, div); ++ tree total_size = fold_build2 (RDIV_EXPR, double_type_node, cumul_size, div); ++ ++ tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), ++ param_llc_capacity_per_core / 2); ++ threshold = build_real_from_int_cst (double_type_node, threshold); ++ tree cond_expr = fold_build2 (LE_EXPR, boolean_type_node, total_size, ++ threshold); ++ ++ /* Convert cond_expr to stmt list. */ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, is_gimple_condexpr, ++ NULL_TREE); ++ return cond_expr; ++} ++ ++/* Retrieve the least number of loops that cover all target mem_refs. ++ Try to merge loops that the mem_refs reside to a common superloop and ++ maintain a worklist which relates NEED-TO-COPY loops with the target mem ++ refs inside using the following criteria: ++ 1) If loop A is a superloop of loop B in the worklist, replace loop B with ++ loop A in the worklist, and attach all target mem_refs of loop B, ++ together with loop A's, to loop A. ++ 2) If loop B in the worklist is a superloop of loop A, attach loop A's ++ target mem_ref to loop B. ++ 3) If loop A is not a superloop/subloop of loop B in the worklist, replace ++ loop B with their lowest common superloop C in the worklist, and attach ++ all target mem_refs of loop A and loop B to loop C. ++ 4) If loop A and loop B's lowest common superloop is function body ++ (loop 0), stop merging and maintain loop independence. */ ++ ++void ++get_loop_worklist (std::vector &ref_groups, int num_issue_var, ++ std::map > &loop_worklist) ++{ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref &mem_ref = ref_groups[i].first_use; ++ class loop *loop_new = mem_ref.loop_bounds.front ().loop; ++ class loop *common_superloop = loop_new; ++ bool add_loop_worklist = false; ++ ++ /* Use greedy algorithm to merge loops to a common superloop that can ++ contain the current mem_refs. 
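The merge itself only needs the loop tree; a standalone model of finding the common superloop (parent pointers plus depths stand in for the real loop structures, and all names are illustrative):

    /* Toy loop-tree node; the function body acts as the depth-0 root.  */
    struct toy_loop
    {
      toy_loop *parent;
      unsigned depth;
    };

    /* Lowest common superloop of a and b, the analogue of find_common_loop;
       landing on the depth-0 root corresponds to criterion 4 above, where
       the merge is abandoned.  */
    static toy_loop *
    toy_common_superloop (toy_loop *a, toy_loop *b)
    {
      while (a->depth > b->depth)
        a = a->parent;
      while (b->depth > a->depth)
        b = b->parent;
      while (a != b)
        {
          a = a->parent;
          b = b->parent;
        }
      return a;
    }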
*/ ++ std::map >::iterator it_tmp; ++ std::vector ref_group_idx_tmp; ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end ();) ++ { ++ class loop *loop_old = it->first; ++ common_superloop = find_common_loop (loop_new, loop_old); ++ if (common_superloop == NULL || common_superloop->num == 0) ++ { ++ /* Stop merging two loops if there is no common superloop for ++ them except function body (loop 0). */ ++ if (common_superloop != NULL ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref_group %d's loop %d has no common " ++ "superloop with existing loop %d\n", ++ i, loop_new->num, loop_old->num); ++ } ++ ++it; ++ continue; ++ } ++ ++ if (common_superloop->num == loop_old->num) ++ { ++ /* If loop_old is the superloop of loop_new, add current ++ ref_group index to loop's worklist. */ ++ loop_worklist[common_superloop].push_back (i); ++ ++it; ++ } ++ else ++ { ++ /* If loop_old is not a superloop of loop_new, replace ++ loop_old with the common superloop. */ ++ it_tmp = it; ++ ++it_tmp; ++ ref_group_idx_tmp = it->second; ++ loop_worklist.erase (it); ++ it = it_tmp; ++ add_loop_worklist = true; ++ } ++ } ++ ++ if (loop_worklist.empty () || add_loop_worklist) ++ { ++ /* Update the new common superloop in loop_worklist. */ ++ std::vector &ref_groups_tmp = loop_worklist[common_superloop]; ++ ref_groups_tmp.push_back (i); ++ for (std::vector::iterator it = ref_group_idx_tmp.begin (); ++ it != ref_group_idx_tmp.end (); ++it) ++ ref_groups_tmp.push_back (*it); ++ std::sort (ref_groups_tmp.begin (), ref_groups_tmp.end ()); ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "runtime loop list:\n"); ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end (); ++it) ++ { ++ fprintf (dump_file, "loop %d:", it->first->num); ++ for (std::vector::iterator idx_it = it->second.begin (); ++ idx_it != it->second.end (); ++idx_it) ++ { ++ fprintf (dump_file, " %d", *idx_it); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++/* Runtime form insertion and issue instruction. */ ++ ++void ++runtime_issue (std::vector &ref_groups, int num_issue_var, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "runtime issue\n"); ++ ++ /* It is possible that the loop father of some mem_ref's bb may contain the ++ loop fathers of the others. Therefore, we intend to only copy loops ++ without inclusion relationship. */ ++ std::map > loop_worklist; ++ get_loop_worklist (ref_groups, num_issue_var, loop_worklist); ++ bool get_first_ref_group = false; ++ std::vector ref_group_idx_prev; ++ ++ /* NEXT STEP: Multiple loop copies (possibly nested within one loop can cost ++ front-end bound due to branching within loop), we need to set up a ++ threshold such that we may compensate this time cost by space cost ++ in binary (copying outer loop). */ ++ tree cumul_size = build_real_from_int_cst (double_type_node, ++ integer_zero_node); ++ for (std::vector::iterator it = sorted_kernels.begin (); ++ it != sorted_kernels.end (); ++it) ++ { ++ /* Start runtime branching until finding the first ref_group's loop. ++ Skip any ref_groups if their `first_use` mem_refs are executed ++ before the mem_ref of the first ref_group. 
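Each loop that survives this ordering is then versioned under a size check; the condition the generated branch evaluates at run time can be sketched on its own (plain C++, hypothetical values, with footprints already reduced to bytes):

    #include <cstdint>
    #include <vector>

    /* True when the summed footprints of the groups handled so far still fit
       in half of the per-core LLC budget, mirroring the generated condition
       total_size / (1024 * 1024) <= capacity / 2.  */
    static bool
    toy_runtime_fits (const std::vector<uint64_t> &footprints_bytes,
                      unsigned llc_capacity_mb)
    {
      double total_mb = 0.;
      for (uint64_t f : footprints_bytes)
        total_mb += double (f) / (1024. * 1024.);
      return total_mb <= llc_capacity_mb / 2.;
    }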
*/ ++ class loop *loop = *it; ++ if (!loop_worklist.count (loop) ++ || (!get_first_ref_group && loop_worklist[loop][0] != 0)) ++ continue; ++ ++ std::vector ref_group_idx_curr = loop_worklist[loop]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "copy loop num: %d\n", loop->num); ++ } ++ /* If the exit edge points to bb with multiple inputs, split the exit ++ edge and create a new bb, make the exit edge point to bb with only ++ single input. */ ++ edge e = single_exit (loop); ++ if (e == NULL) ++ return; ++ if (!single_pred_p (e->dest)) ++ { ++ split_loop_exit_edge (e, true); ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "split exit edge\n"); ++ } ++ ++ /* After updating SSA, we are not sure whether the gimple_seq stmt list ++ is initialized and unchanged during iterations. Therefore, we need to ++ recreate this stmt list for every loop copy. */ ++ gimple_seq cond_expr_stmt_list = NULL; ++ tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list, ++ loop->header, ref_group_idx_curr, ++ ref_group_idx_prev, cumul_size); ++ if (cond_expr == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "incalculable variables for conditional\n"); ++ return; ++ } ++ ++ /* Use the previous cond and generate a new branch and copy loop. */ ++ basic_block condition_bb = NULL; ++ profile_probability prob = profile_probability::likely (); ++ initialize_original_copy_tables (); ++ class loop *nloop = loop_version (loop, cond_expr, &condition_bb, ++ prob, prob.invert (), prob, ++ prob.invert (), true); ++ free_original_copy_tables (); ++ ++ /* Insert the generated stmt list before cond_expr. */ ++ gimple_stmt_iterator cond_exp_gsi; ++ if (cond_expr_stmt_list) ++ { ++ /* Function `gsi_insert_seq_before` will insert `cond_expr` (1st ++ stmt) of `condition_bb` to the end of `cond_expr_stmt_list`. */ ++ cond_exp_gsi = gsi_last_bb (condition_bb); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ } ++ ++ update_ssa (TODO_update_ssa); ++ ++ /* Perform hint issue for branches that meet conditions. */ ++ static_issue (ref_groups, num_issue_var); ++} ++ ++/* Issue llc hints through prefetch instructions. */ ++ ++void ++issue_llc_hint (std::vector &ref_groups, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "issue_llc_hint:\n"); ++ ++ /* 1) If the issue-topn and force-issue options are available, top N var is ++ forcibly allocated then no runtime branch is generated. ++ 2) If the issue-topn option is available and the size of top N var is ++ statically known, top N is statically allocated and no runtime branch ++ is generated. ++ 3) If the issue-topn option is available and the size of the top N var is ++ unknown, but them is dynamically known, the top N is dynamically ++ allocated and generate runtime branches. (also depends on the screening ++ of the innermost variable boundary type) ++ 4) If the dynamic runtime cannot know the size, such as indirect access, ++ optimization is skipped. 
++ */ ++ int num_issue_var = std::min (param_issue_topn, (int) ref_groups.size ()); ++ if (num_issue_var == 0) ++ return; ++ ++ if (num_issue_var < param_issue_topn ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Only %u (less than param_issue_topn = %d) " ++ "ref_group(s) is found for llc hint.\n", ++ num_issue_var, param_issue_topn); ++ } ++ if (param_force_issue) ++ { ++ static_issue (ref_groups, num_issue_var); ++ return; ++ } ++ calc_type topn_calc_type = STATIC_CALC; ++ for (int i = 0; i < num_issue_var; ++i) ++ topn_calc_type = std::min (topn_calc_type, ref_groups[i].calc_by); ++ ++ if (topn_calc_type == STATIC_CALC) ++ { ++ /* Before static issue, we still need to collect data size of all target ++ variables and compare the summation with LLC cache size. */ ++ double prefetch_data_size = 0.; ++ for (int i = 0; i < num_issue_var; ++i) ++ prefetch_data_size += ref_groups[i].var_size; ++ ++ if (prefetch_data_size <= (double) param_llc_capacity_per_core ++ * PREFETCH_CACHE_SIZE_RATIO) ++ static_issue (ref_groups, num_issue_var); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache " ++ "size: %lf > %lf.\n", ++ prefetch_data_size, ++ (double) param_llc_capacity_per_core ++ * PREFETCH_CACHE_SIZE_RATIO); ++ } ++ else if (topn_calc_type == RUNTIME_CALC) ++ runtime_issue (ref_groups, num_issue_var, sorted_kernels); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled issue scene\n"); ++ } ++} ++ ++/* ==================== phase entry ==================== */ ++ ++/* The LLC intelligent allocation consists of 6 steps. */ ++ ++void ++llc_allocate (void) ++{ ++ std::map > kernels_refs; ++ std::vector kernels; ++ if (!get_dense_memory_kernels (kernels, kernels_refs)) ++ return; ++ ++ std::set traced_ref_stmt; ++ std::vector unresolved_refs; ++ trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt, ++ unresolved_refs); ++ ++ if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt, ++ unresolved_refs)) ++ return; ++ ++ retrace_loop_refs_info_unresolved (unresolved_refs, traced_ref_stmt); ++ ++ std::vector sorted_kernels; ++ std::vector ref_groups; ++ if (param_filter_mode) ++ { ++ /* AutoFDO mode: include ENTRY bb and EXIT bb indices. */ ++ std::set bb_pathset; ++ bb_pathset.insert (0); ++ bb_pathset.insert (1); ++ if (!filter_and_sort_kernels_feedback (sorted_kernels, bb_pathset)) ++ return; ++ ++ if (!record_and_sort_ref_groups (ref_groups, kernels, kernels_refs, ++ bb_pathset)) ++ return; ++ } ++ else ++ { ++ /* static mode. */ ++ std::set bb_pathset; ++ if (!filter_and_sort_kernels (sorted_kernels, kernels)) ++ return; ++ ++ if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs, ++ bb_pathset)) ++ return; ++ } ++ ++ issue_llc_hint (ref_groups, sorted_kernels); ++} ++ ++/* Check whether the function is an operator reloading function. */ ++ ++bool ++operator_func_p (function *fn) ++{ ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); ++ ++ if (fn_name && strncmp (fn_name, "operator", 8) == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "operator_func: %s ", fn_name); ++ ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the function file location is known. 
*/ ++ ++bool ++func_location_p (function *fn) ++{ ++ expanded_location fn_decl_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ expanded_location fn_xloc ++ = expand_location (fn->function_start_locus); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "fn->function_start_locus = %d \n", ++ fn->function_start_locus); ++ fprintf (dump_file, "fn_xloc.file = %s \n", ++ fn_xloc.file ? fn_xloc.file : "NULL"); ++ fprintf (dump_file, "fn_decl_xloc.file = %s \n", ++ fn_decl_xloc.file ? fn_decl_xloc.file : "NULL"); ++ fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n", ++ LOCATION_FILE (input_location) ? LOCATION_FILE (input_location) ++ : "NULL"); ++ } ++ if (fn_decl_xloc.file == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function location unknown, skip analysis \n"); ++ return false; ++ } ++ /* Newly generated functions are filtered out, such as function constant ++ propagation func.constprop (). */ ++ if (LOCATION_FILE (input_location) != fn_decl_xloc.file) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function location non-local, skip analysis \n"); ++ return false; ++ } ++ return true; ++} ++ ++/* Dump function information. */ ++ ++void ++dump_function_info (function *fn) ++{ ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nfn_name: %s\n", fn_name); ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ if (cfun_xloc.line) ++ { ++ if (cfun_xloc.file) ++ fprintf (dump_file, "[%s:%d:%d]\n", ++ cfun_xloc.file, cfun_xloc.line, cfun_xloc.column); ++ } ++ fprintf (dump_file, "\n"); ++ flow_loops_dump (dump_file, NULL, 1); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* dump param. */ ++ ++void ++dump_param (void) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "LLC allocate parameters:\n"); ++ fprintf (dump_file, " block size: %d\n", param_l1_cache_line_size); ++ fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", ++ param_l1_cache_size * 1024 / param_l1_cache_line_size, ++ param_l1_cache_size); ++ fprintf (dump_file, " L1 cache line size: %d\n", ++ param_l1_cache_line_size); ++ fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size); ++ fprintf (dump_file, " min mem_access_ratio: %d \n", ++ param_mem_access_ratio); ++ fprintf (dump_file, " min mem_access_num: %d \n", ++ param_mem_access_num); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Determine whether to analyze the function according to ++ the ordering of functions containing cycle counts. 
*/ ++ ++static bool ++should_analyze_func_p (void) ++{ ++ gcov_type decl_uid = DECL_UID (current_function_decl); ++ gcov_type func_count = event_get_func_count (decl_uid, PMU_EVENT); ++ if (func_count == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld cannot find profile data " ++ "and skip prefetch analysis\n", ++ decl_uid); ++ } ++ return false; ++ } ++ if (func_count < event_get_topn_function_total_count_thres ()) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld total counts is %lu: " ++ "counts %lu < perf's top %d threshold %lu, " ++ "skip prefetch analysis\n", ++ decl_uid, func_count, func_count, ++ PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ()); ++ } ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld total counts is %lu: " ++ "counts %lu >= perf's top %d threshold %lu, " ++ "continue prefetch analysis\n", ++ decl_uid, func_count, func_count, ++ PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ()); ++ } ++ return true; ++} ++ ++const pass_data pass_data_llc_allocate = ++{ ++ GIMPLE_PASS, /* type. */ ++ "llc_allocate", /* name. */ ++ OPTGROUP_LOOP, /* optinfo_flags. */ ++ TV_TREE_PREFETCH, /* tv_id. */ ++ (PROP_cfg | PROP_ssa), /* properties_required. */ ++ 0, /* properties_provided. */ ++ 0, /* properties_destroyed. */ ++ 0, /* todo_flags_start. */ ++ 0, /* todo_flags_finish. */ ++}; ++ ++class pass_llc_allocate : public gimple_opt_pass ++{ ++public: ++ pass_llc_allocate (gcc::context *ctxt) ++ : gimple_opt_pass (pass_data_llc_allocate, ctxt) ++ {} ++ ++ /* opt_pass methods. */ ++ virtual bool gate (function *) ++ { ++ return (optimize >= 2 && flag_llc_allocate > 0); ++ } ++ virtual unsigned int execute (function *); ++ ++}; // class pass_llc_allocate ++ ++unsigned int ++pass_llc_allocate::execute (function *fn) ++{ ++ unsigned int ret = 0; ++ ++ if (!targetm.have_prefetch () ++ || targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL ++ || targetm.vectorize.code_for_gather_prefetch == NULL) ++ return 0; ++ ++ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) ++ { ++ tree type = build_function_type_list (void_type_node, ++ const_ptr_type_node, NULL_TREE); ++ tree decl = add_builtin_function ("__builtin_prefetch", type, ++ BUILT_IN_PREFETCH, BUILT_IN_NORMAL, ++ NULL, NULL_TREE); ++ DECL_IS_NOVOPS (decl) = true; ++ set_builtin_decl (BUILT_IN_PREFETCH, decl, false); ++ } ++ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_FULL)) ++ { ++ tree type = build_function_type_list (void_type_node, ++ const_ptr_type_node, NULL_TREE); ++ tree decl = add_builtin_function ("__builtin_prefetch_full", type, ++ BUILT_IN_PREFETCH_FULL, BUILT_IN_NORMAL, ++ NULL, NULL_TREE); ++ DECL_IS_NOVOPS (decl) = true; ++ set_builtin_decl (BUILT_IN_PREFETCH_FULL, decl, false); ++ } ++ ++ dump_param (); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "llc_allocate: %s\n", ++ IDENTIFIER_POINTER (DECL_NAME (fn->decl))); ++ ++ if (number_of_loops (fn) <= 1 || !func_location_p (fn) ++ || operator_func_p (fn)) ++ return ret; ++ ++ /* Filter only when combined with PMU event. When the should_analyze_func_p ++ analysis fails (for example, the function without PMU-event count), ++ in order to ensure the accuracy of the LLC allocation analysis, the ++ function does not perform native allocation processing. 
*/ ++ if (flag_additional_profile && (!profile_exist (PMU_EVENT) || !should_analyze_func_p ())) ++ { ++ return 0; ++ } ++ ++ dump_function_info (fn); ++ ++ llc_allocate (); ++ ++ return ret; ++} ++ ++} // anon namespace ++ ++gimple_opt_pass * ++make_pass_llc_allocate (gcc::context *ctxt) ++{ ++ return new pass_llc_allocate (ctxt); ++} +diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc +index 0353ffd30..0492dc6fd 100644 +--- a/gcc/tree-ssa-loop-niter.cc ++++ b/gcc/tree-ssa-loop-niter.cc +@@ -2489,6 +2489,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit) + return true; + } + ++/* Returns whether the number of vectorized iterations for the loop can be ++ estimated from the given IR and update the corresponding loop attribute, ++ e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... }); */ ++ ++bool ++number_of_iterations_vect (class loop *loop, tree lhs, tree rhs) ++{ ++ loop->vec_nb_iterations = chrec_dont_know; ++ ++ if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME) ++ || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME)) ++ return false; ++ ++ tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (ssa); ++ ++ if (gimple_code (def_stmt) != GIMPLE_CALL ++ || !gimple_call_internal_p (def_stmt)) ++ return false; ++ ++ internal_fn ifn = gimple_call_internal_fn (def_stmt); ++ if (ifn != IFN_WHILE_ULT) ++ return false; ++ ++ gcall *call = dyn_cast (def_stmt); ++ tree niters = gimple_call_arg (call, 1); ++ loop->vec_nb_iterations = niters; ++ ++ return true; ++} ++ + /* Stores description of number of iterations of LOOP derived from + EXIT (an exit edge of the LOOP) in NITER. Returns true if some useful + information could be derived (and fields of NITER have meaning described +@@ -2559,6 +2590,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit, + op1 = gimple_cond_rhs (stmt); + type = TREE_TYPE (op0); + ++ if (TREE_CODE (type) == VECTOR_TYPE) ++ number_of_iterations_vect (loop, op0, op1); ++ + if (TREE_CODE (type) != INTEGER_TYPE + && !POINTER_TYPE_P (type)) + return false; +@@ -2852,14 +2886,14 @@ bool + number_of_iterations_exit (class loop *loop, edge exit, + class tree_niter_desc *niter, + bool warn, bool every_iteration, +- basic_block *body) ++ basic_block *body, bool guarantee) + { + gcond *stmt; + if (!number_of_iterations_exit_assumptions (loop, exit, niter, + &stmt, every_iteration, body)) + return false; + +- if (integer_nonzerop (niter->assumptions)) ++ if (integer_nonzerop (niter->assumptions) || guarantee == false) + return true; + + if (warn && dump_enabled_p ()) +diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h +index ceaf65e07..8f03458f7 100644 +--- a/gcc/tree-ssa-loop-niter.h ++++ b/gcc/tree-ssa-loop-niter.h +@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body, + extern bool number_of_iterations_exit (class loop *, edge, + class tree_niter_desc *niter, bool, + bool every_iteration = true, +- basic_block * = NULL); ++ basic_block * = NULL, ++ bool guarantee = true); + extern bool number_of_iterations_exit_assumptions (class loop *, edge, + class tree_niter_desc *, + gcond **, bool = true, +diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc +index 9d21e6d03..6e61f7140 100644 +--- a/gcc/tree-vect-loop-manip.cc ++++ b/gcc/tree-vect-loop-manip.cc +@@ -3738,3 +3738,269 @@ vect_loop_versioning (loop_vec_info loop_vinfo, + + return nloop; + } ++ ++class loop * 
++vect_loop_versioning_2 (loop_vec_info loop_vinfo, ++ gimple *loop_vectorized_call) ++{ ++ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; ++ class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); ++ basic_block condition_bb; ++ gphi_iterator gsi; ++ gimple_stmt_iterator cond_exp_gsi; ++ basic_block merge_bb; ++ basic_block new_exit_bb; ++ edge new_exit_e, e; ++ gphi *orig_phi, *new_phi; ++ tree cond_expr = NULL_TREE; ++ gimple_seq cond_expr_stmt_list = NULL; ++ tree arg; ++ profile_probability prob = profile_probability::likely (); ++ gimple_seq gimplify_stmt_list = NULL; ++ tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo); ++ bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); ++ bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); ++ bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); ++ poly_uint64 versioning_threshold ++ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); ++ tree version_simd_if_cond ++ = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); ++ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); ++ ++ if (vect_apply_runtime_profitability_check_p (loop_vinfo) ++ && !ordered_p (th, versioning_threshold)) ++ cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, ++ build_int_cst (TREE_TYPE (scalar_loop_iters), ++ th - 1)); ++ if (maybe_ne (versioning_threshold, 0U)) ++ { ++ tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, ++ build_int_cst (TREE_TYPE (scalar_loop_iters), ++ versioning_threshold - 1)); ++ if (cond_expr) ++ cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node, ++ expr, cond_expr); ++ else ++ cond_expr = expr; ++ } ++ ++ if (version_niter) ++ vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); ++ ++ if (cond_expr) ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, ++ is_gimple_condexpr, NULL_TREE); ++ ++ if (version_align) ++ vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, ++ &cond_expr_stmt_list); ++ ++ if (version_alias) ++ { ++ vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); ++ vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr); ++ vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); ++ } ++ ++ if (version_simd_if_cond) ++ { ++ gcc_assert (dom_info_available_p (CDI_DOMINATORS)); ++ if (flag_checking) ++ if (basic_block bb ++ = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond))) ++ gcc_assert (bb != loop->header ++ && dominated_by_p (CDI_DOMINATORS, loop->header, bb) ++ && (scalar_loop == NULL ++ || (bb != scalar_loop->header ++ && dominated_by_p (CDI_DOMINATORS, ++ scalar_loop->header, bb)))); ++ tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond)); ++ tree c = fold_build2 (NE_EXPR, boolean_type_node, ++ version_simd_if_cond, zero); ++ if (cond_expr) ++ cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, ++ c, cond_expr); ++ else ++ cond_expr = c; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "created versioning for simd if condition check.\n"); ++ } ++ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &gimplify_stmt_list, ++ is_gimple_condexpr, NULL_TREE); ++ gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); ++ ++ /* Compute the outermost loop cond_expr and cond_expr_stmt_list are ++ invariant in. 
*/ ++ class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr); ++ for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ update_stmt (stmt); ++ ssa_op_iter iter; ++ use_operand_p use_p; ++ basic_block def_bb; ++ FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE) ++ if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p)))) ++ && flow_bb_inside_loop_p (outermost, def_bb)) ++ outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1); ++ } ++ ++ /* Search for the outermost loop we can version. Avoid versioning of ++ non-perfect nests but allow if-conversion versioned loops inside. */ ++ class loop *loop_to_version = loop; ++ if (flow_loop_nested_p (outermost, loop)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "trying to apply versioning to outer loop %d\n", ++ outermost->num); ++ if (outermost->num == 0) ++ outermost = superloop_at_depth (loop, 1); ++ /* And avoid applying versioning on non-perfect nests. */ ++ while (loop_to_version != outermost ++ && single_exit (loop_outer (loop_to_version)) ++ && (!loop_outer (loop_to_version)->inner->next ++ || vect_loop_vectorized_call (loop_to_version)) ++ && (!loop_outer (loop_to_version)->inner->next ++ || !loop_outer (loop_to_version)->inner->next->next)) ++ loop_to_version = loop_outer (loop_to_version); ++ } ++ ++ /* Apply versioning. If there is already a scalar version created by ++ if-conversion re-use that. Note we cannot re-use the copy of ++ an if-converted outer-loop when vectorizing the inner loop only. */ ++ gcond *cond; ++ if ((!loop_to_version->inner || loop == loop_to_version) ++ && loop_vectorized_call) ++ { ++ gcc_assert (scalar_loop); ++ condition_bb = gimple_bb (loop_vectorized_call); ++ cond = as_a (last_stmt (condition_bb)); ++ gimple_cond_set_condition_from_tree (cond, cond_expr); ++ update_stmt (cond); ++ ++ if (cond_expr_stmt_list) ++ { ++ cond_exp_gsi = gsi_for_stmt (loop_vectorized_call); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ ++ /* if-conversion uses profile_probability::always () for both paths, ++ reset the paths probabilities appropriately. */ ++ edge te, fe; ++ extract_true_false_edges_from_block (condition_bb, &te, &fe); ++ te->probability = prob; ++ fe->probability = prob.invert (); ++ /* We can scale loops counts immediately but have to postpone ++ scaling the scalar loop because we re-use it during peeling. */ ++ scale_loop_frequencies (loop_to_version, te->probability); ++ LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability; ++ ++ nloop = scalar_loop; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reusing %sloop version created by if conversion\n", ++ loop_to_version != loop ? "outer " : ""); ++ } ++ else ++ { ++ if (loop_to_version != loop ++ && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "applying loop versioning to outer loop %d\n", ++ loop_to_version->num); ++ ++ initialize_original_copy_tables (); ++ nloop = loop_version (loop_to_version, cond_expr, &condition_bb, ++ prob, prob.invert (), prob, prob.invert (), true); ++ gcc_assert (nloop); ++ nloop = get_loop_copy (loop); ++ ++ /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will ++ reap those otherwise; they also refer to the original ++ loops. 
*/ ++ class loop *l = loop; ++ while (gimple *call = vect_loop_vectorized_call (l)) ++ { ++ call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call))); ++ fold_loop_internal_call (call, boolean_false_node); ++ l = loop_outer (l); ++ } ++ free_original_copy_tables (); ++ ++ if (cond_expr_stmt_list) ++ { ++ cond_exp_gsi = gsi_last_bb (condition_bb); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ ++ /* Loop versioning violates an assumption we try to maintain during ++ vectorization - that the loop exit block has a single predecessor. ++ After versioning, the exit block of both loop versions is the same ++ basic block (i.e. it has two predecessors). Just in order to simplify ++ following transformations in the vectorizer, we fix this situation ++ here by adding a new (empty) block on the exit-edge of the loop, ++ with the proper loop-exit phis to maintain loop-closed-form. ++ If loop versioning wasn't done from loop, but scalar_loop instead, ++ merge_bb will have already just a single successor. */ ++ ++ merge_bb = single_exit (loop_to_version)->dest; ++ if (EDGE_COUNT (merge_bb->preds) >= 2) ++ { ++ gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); ++ new_exit_bb = split_edge (single_exit (loop_to_version)); ++ new_exit_e = single_exit (loop_to_version); ++ e = EDGE_SUCC (new_exit_bb, 0); ++ ++ for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ tree new_res; ++ orig_phi = gsi.phi (); ++ new_res = copy_ssa_name (PHI_RESULT (orig_phi)); ++ new_phi = create_phi_node (new_res, new_exit_bb); ++ arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); ++ add_phi_arg (new_phi, arg, new_exit_e, ++ gimple_phi_arg_location_from_edge (orig_phi, e)); ++ adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); ++ } ++ } ++ ++ update_ssa (TODO_update_ssa); ++ } ++ ++ if (version_niter) ++ { ++ /* The versioned loop could be infinite, we need to clear existing ++ niter information which is copied from the original loop. */ ++ gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE)); ++ vect_free_loop_info_assumptions (nloop); ++ /* And set constraint LOOP_C_INFINITE for niter analyzer. 
*/ ++ loop_constraint_set (loop, LOOP_C_INFINITE); ++ } ++ ++ if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION ++ && dump_enabled_p ()) ++ { ++ if (version_alias) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, ++ vect_location, ++ "loop versioned for vectorization because of " ++ "possible aliasing\n"); ++ if (version_align) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, ++ vect_location, ++ "loop versioned for vectorization to enhance " ++ "alignment\n"); ++ ++ } ++ ++ return nloop; ++} +diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc +index 7f7577951..023a83c38 100644 +--- a/gcc/tree-vect-loop.cc ++++ b/gcc/tree-vect-loop.cc +@@ -9735,8 +9735,11 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + + if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + { +- class loop *sloop +- = vect_loop_versioning (loop_vinfo, loop_vectorized_call); ++ class loop *sloop; ++ if (!(optimize >= 2 && flag_llc_allocate > 0)) ++ sloop = vect_loop_versioning (loop_vinfo, loop_vectorized_call); ++ else ++ sloop = vect_loop_versioning_2 (loop_vinfo, loop_vectorized_call); + sloop->force_vectorize = false; + check_profitability = false; + } +@@ -9989,7 +9992,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + niters_vector_mult_vf, !niters_no_overflow); + + unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); +- scale_profile_for_vect_loop (loop, assumed_vf); ++ if (!(optimize >= 2 && flag_llc_allocate > 0)) ++ scale_profile_for_vect_loop (loop, assumed_vf); + + /* True if the final iteration might not handle a full vector's + worth of scalar iterations. */ +diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +index e13bc6c99..85018f250 100644 +--- a/gcc/tree-vectorizer.h ++++ b/gcc/tree-vectorizer.h +@@ -2177,6 +2177,7 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge); + class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, + class loop *, edge); + class loop *vect_loop_versioning (loop_vec_info, gimple *); ++class loop *vect_loop_versioning_2 (loop_vec_info, gimple *); + extern class loop *vect_do_peeling (loop_vec_info, tree, tree, + tree *, tree *, tree *, int, bool, bool, + tree *); +-- +2.44.0.windows.1 + diff --git a/0366-fix-prefetch-case-failed.patch b/0366-fix-prefetch-case-failed.patch new file mode 100644 index 0000000000000000000000000000000000000000..9c21445f26bf3767034be574c6891d5546cb2bd2 --- /dev/null +++ b/0366-fix-prefetch-case-failed.patch @@ -0,0 +1,144 @@ +From c7bdc03e48a0b6e213c5a4b8c821665d7ca897bb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Thu, 6 Mar 2025 14:58:57 +0800 +Subject: [PATCH] fix prefetch case failed + +--- + gcc/params.opt | 2 +- + .../gcc.target/aarch64/sve/acle/general-c/prefetch_1.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_index_1.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_index_2.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_1.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_2.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_3.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_4.c | 6 +++--- + 8 files changed, 22 insertions(+), 22 deletions(-) + +diff --git a/gcc/params.opt b/gcc/params.opt +index e06e50611..a716f2cc4 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1305,7 +1305,7 @@ cannot recognize inner loop boundaries. 
+ -param=llc-level= + Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) + Param Optimization +-Specifies the HBM cache level. ++Specifies the LLC cache level. + + -param=filter-mode= + Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c +index 316f77fc7..c8094ba2b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, enum svprfop op) + svprfb (pg, s32_ptr, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ + svprfb (pg, s32_ptr, (enum svprfop) 0); + svprfb (pg, s32_ptr, (enum svprfop) 5); +- svprfb (pg, s32_ptr, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ +- svprfb (pg, s32_ptr, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 6); ++ svprfb (pg, s32_ptr, (enum svprfop) 7); + svprfb (pg, s32_ptr, (enum svprfop) 8); +- svprfb (pg, s32_ptr, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c +index c33c95440..862ec082b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c +@@ -46,8 +46,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 0); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ +- svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 7); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c +index 3d7797305..f4873c631 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op) + 
svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 0); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ +- svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 7); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c +index cc61901cb..3b82b4777 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c +@@ -46,8 +46,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 0); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 7); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c +index 88e0c35e7..2be620de5 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c +@@ -30,8 +30,8 @@ f1 (svbool_t pg, svint8_t s8, svuint8_t u8, + svprfb_gather (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ + svprfb_gather (pg, u32, (enum svprfop) 0); + svprfb_gather (pg, u32, (enum svprfop) 5); +- svprfb_gather (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather (pg, 
u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 6); ++ svprfb_gather (pg, u32, (enum svprfop) 7); + svprfb_gather (pg, u32, (enum svprfop) 8); +- svprfb_gather (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c +index 24b4aa190..9a1d931e9 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op) + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 0); + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 7); + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c +index 63ccdc5a4..f7ca09507 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, svuint32_t u32, enum svprfop op) + svprfb_gather_u32base (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_u32base (pg, u32, (enum svprfop) 0); + svprfb_gather_u32base (pg, u32, (enum svprfop) 5); +- svprfb_gather_u32base (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather_u32base (pg, u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_u32base (pg, u32, (enum svprfop) 6); ++ svprfb_gather_u32base (pg, u32, (enum svprfop) 7); + svprfb_gather_u32base (pg, u32, (enum svprfop) 8); +- svprfb_gather_u32base (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_u32base (pg, u32, (enum svprfop) 14); + } +-- 
+2.44.0.windows.1 + diff --git a/0367-llc-feature-bugfix.patch b/0367-llc-feature-bugfix.patch new file mode 100644 index 0000000000000000000000000000000000000000..57abf68d46b774159d0bfb3b3c8c4a699f8649f0 --- /dev/null +++ b/0367-llc-feature-bugfix.patch @@ -0,0 +1,79 @@ +From 9bb4c61897abb16d77a0614d4465bf2b0d67b265 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Mon, 10 Mar 2025 17:00:37 +0800 +Subject: [PATCH] llc feature bugfix + +--- + gcc/params.opt | 2 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 2 +- + gcc/tree-ssa-llc-allocate.cc | 2 +- + gcc/tree-vect-loop.cc | 10 +++------- + 4 files changed, 6 insertions(+), 10 deletions(-) + +diff --git a/gcc/params.opt b/gcc/params.opt +index a716f2cc4..ed7559783 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1285,7 +1285,7 @@ Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization + Issue topn LLC mem_ref hint. + + -param=force-issue= +-Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Common Joined UInteger Var(param_force_issue) Init(1) IntegerRange(0, 1) Param + Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. + + -param=llc-capacity-per-core= +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +index 091e654f9..0b81394ad 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -58,4 +58,4 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump "\\d\\tuPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "insert svprfd" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } */ +diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc +index da6d72b94..d10d60459 100644 +--- a/gcc/tree-ssa-llc-allocate.cc ++++ b/gcc/tree-ssa-llc-allocate.cc +@@ -3822,7 +3822,7 @@ issue_llc_hint (std::vector &ref_groups, + "ref_group(s) is found for llc hint.\n", + num_issue_var, param_issue_topn); + } +- if (param_force_issue) ++ if (param_force_issue == 1 || param_force_issue == 0) + { + static_issue (ref_groups, num_issue_var); + return; +diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc +index 023a83c38..7f7577951 100644 +--- a/gcc/tree-vect-loop.cc ++++ b/gcc/tree-vect-loop.cc +@@ -9735,11 +9735,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + + if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + { +- class loop *sloop; +- if (!(optimize >= 2 && flag_llc_allocate > 0)) +- sloop = vect_loop_versioning (loop_vinfo, loop_vectorized_call); +- else +- sloop = vect_loop_versioning_2 (loop_vinfo, loop_vectorized_call); ++ class loop *sloop ++ = vect_loop_versioning (loop_vinfo, loop_vectorized_call); + sloop->force_vectorize = false; + check_profitability = false; + } +@@ -9992,8 +9989,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + niters_vector_mult_vf, !niters_no_overflow); + + unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); +- if (!(optimize >= 2 && flag_llc_allocate > 0)) +- scale_profile_for_vect_loop (loop, assumed_vf); ++ scale_profile_for_vect_loop (loop, assumed_vf); + + /* True if the final iteration might not handle a full vector's + worth of scalar iterations. 
*/ +-- +2.44.0.windows.1 + diff --git a/0368-fix-llc-feature-case-failed.patch b/0368-fix-llc-feature-case-failed.patch new file mode 100644 index 0000000000000000000000000000000000000000..3bfcf72aa224f09c77652150932bc019c87d2078 --- /dev/null +++ b/0368-fix-llc-feature-case-failed.patch @@ -0,0 +1,78 @@ +From 889fed32e6e86a64974ec9edc69cd2c88c14e6f0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Sat, 15 Mar 2025 14:58:11 +0800 +Subject: [PATCH] fix llc feature case failed + +--- + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 2 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c | 2 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c | 2 +- + gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 | 4 ++-- + .../gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 | 2 +- + 5 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +index 0b81394ad..55d1396d4 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -56,6 +56,6 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump "\\d\\tupperPtr\\t\\(2.933319, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump "\\d\\tlPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump "\\d\\tuPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +index e18725f60..5e908b380 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +@@ -46,5 +46,5 @@ convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) + return output_stack->reg[0]; + } + +-/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +index ba90e7ea4..9196d1d95 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -59,4 +59,4 @@ main (int argc, char *argv[]) + + /* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 0 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +index b0f68ebe3..da9669639 100644 +--- a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -205,7 +205,7 @@ END SUBROUTINE calc_p_rho + ! { dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } + ! 
{ dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 0 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +index 7345759db..eb2cc8690 100644 +--- a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -54,5 +54,5 @@ SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) + + END SUBROUTINE calc_p8w + +-! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +\ No newline at end of file +-- +2.44.0.windows.1 + diff --git a/gcc.spec b/gcc.spec index 21df78d5169c173669b7342e88d07133e060155e..fe1b8f161985fb5004d06c05ae2f77f3046cd990 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 78 +%global gcc_release 79 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -474,6 +474,10 @@ Patch361: 0361-Enhancing-BOLT-Optimization-with-AI.patch Patch362: 0362-Modify-cache-size-for-hip10a-and-hip10c.patch Patch363: 0363-SVE-Add-std-find-with-sve.patch Patch364: 0364-CFGO-Enable-flag_profile_partial_training-for-CFGO-b.patch +Patch365: 0365-add-llc-allocate-feature.patch +Patch366: 0366-fix-prefetch-case-failed.patch +Patch367: 0367-llc-feature-bugfix.patch +Patch368: 0368-fix-llc-feature-case-failed.patch # Part 1001-1999 %ifarch sw_64 @@ -1624,6 +1628,10 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch -P362 -p1 %patch -P363 -p1 %patch -P364 -p1 +%patch -P365 -p1 +%patch -P366 -p1 +%patch -P367 -p1 +%patch -P368 -p1 %ifarch sw_64 %patch -P1001 -p1 @@ -4251,6 +4259,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Sat May 3 2025 huang-xiaoquan - 12.3.1-79 +- Type: Sync +- DESC: Sync patches from openeuler/gcc. + * Mon Apr 28 2025 liyancheng <412998149@qq.com> - 12.3.1-78 - Type: Sync - DESC: Sync patches from openeuler/gcc.