diff --git a/0025-AutoPrefetch-Support-cache-misses-profile.patch b/0025-AutoPrefetch-Support-cache-misses-profile.patch new file mode 100644 index 0000000000000000000000000000000000000000..1daa2db3f58396daa6690625f1f256fa4144be1f --- /dev/null +++ b/0025-AutoPrefetch-Support-cache-misses-profile.patch @@ -0,0 +1,669 @@ +From 26e4ba63112f55c27b7dd3d5f8c4497ef9a2f459 Mon Sep 17 00:00:00 2001 +From: benniaobufeijiushiji +Date: Thu, 6 Jan 2022 15:33:29 +0800 +Subject: [PATCH 25/28] [AutoPrefetch] Support cache misses profile + +Add pass ex-afdo after pass afdo in auto-profile.c. +Add flag -fcache-misses-profile. +Read profile of different types of perf events and build maps for +function and gimple location to its count of each perf event. +Currently, instruction execution and cahce misses are supported. +--- + gcc/auto-profile.c | 415 +++++++++++++++++++++++++++++++++++++++++++++ + gcc/auto-profile.h | 28 +++ + gcc/common.opt | 14 ++ + gcc/opts.c | 26 +++ + gcc/passes.def | 1 + + gcc/timevar.def | 1 + + gcc/toplev.c | 6 + + gcc/tree-pass.h | 2 + + 8 files changed, 493 insertions(+) + +diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c +index 7d09887c9..aced8fca5 100644 +--- a/gcc/auto-profile.c ++++ b/gcc/auto-profile.c +@@ -49,6 +49,9 @@ along with GCC; see the file COPYING3. If not see + #include "auto-profile.h" + #include "tree-pretty-print.h" + #include "gimple-pretty-print.h" ++#include ++#include ++#include + + /* The following routines implements AutoFDO optimization. + +@@ -95,6 +98,7 @@ along with GCC; see the file COPYING3. If not see + */ + + #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo" ++#define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov" + #define AUTO_PROFILE_VERSION 1 + + namespace autofdo +@@ -117,6 +121,14 @@ private: + bool annotated_; + }; + ++/* pair */ ++static bool ++event_count_cmp (std::pair &a, ++ std::pair &b) ++{ ++ return a.second > b.second; ++} ++ + /* Represent a source location: (function_decl, lineno). 
*/ + typedef std::pair decl_lineno; + +@@ -338,6 +350,206 @@ static autofdo_source_profile *afdo_source_profile; + /* gcov_summary structure to store the profile_info. */ + static gcov_summary *afdo_profile_info; + ++/* Check opts->x_flags and put file name into EVENT_FILES. */ ++ ++static bool ++get_all_profile_names (const char **event_files) ++{ ++ if (!(flag_auto_profile || flag_cache_misses_profile)) ++ { ++ return false; ++ } ++ ++ event_files[INST_EXEC] = auto_profile_file; ++ ++ if (cache_misses_profile_file == NULL) ++ { ++ cache_misses_profile_file = DEFAULT_CACHE_MISSES_PROFILE_FILE; ++ } ++ event_files[CACHE_MISSES] = cache_misses_profile_file; ++ ++ return true; ++} ++ ++static void read_profile (void); ++ ++/* Maintain multiple profile data of different events with event_loc_count_map ++ and event_func_count_map. */ ++ ++class extend_auto_profile ++{ ++public: ++ bool auto_profile_exist (enum event_type type); ++ gcov_type get_loc_count (location_t, event_type); ++ gcov_type get_func_count (unsigned, event_type); ++ struct rank_info get_func_rank (unsigned, enum event_type); ++ /* There should be only one instance of class EXTEND_AUTO_PROFILE. */ ++ static extend_auto_profile *create () ++ { ++ extend_auto_profile *map = new extend_auto_profile (); ++ if (map->read ()) ++ { ++ return map; ++ } ++ delete map; ++ return NULL; ++ } ++private: ++ /* Basic maps of extend_auto_profile. */ ++ typedef std::map loc_count_map; ++ typedef std::map func_count_map; ++ ++ /* Map of function_uid to its descending order rank of counts. */ ++ typedef std::map rank_map; ++ ++ /* Mapping hardware events to corresponding basic maps. 
*/ ++ typedef std::map event_loc_count_map; ++ typedef std::map event_func_count_map; ++ typedef std::map event_rank_map; ++ ++ extend_auto_profile () {} ++ bool read (); ++ void set_loc_count (); ++ void process_extend_source_profile (); ++ void read_extend_afdo_file (const char*, event_type); ++ void rank_all_func (); ++ void dump_event (); ++ event_loc_count_map event_loc_map; ++ event_func_count_map event_func_map; ++ event_rank_map func_rank; ++ event_type profile_type; ++}; ++ ++/* Member functions for extend_auto_profile. */ ++ ++bool ++extend_auto_profile::auto_profile_exist (enum event_type type) ++{ ++ switch (type) ++ { ++ case INST_EXEC: ++ return event_func_map.count (INST_EXEC) != 0 ++ || event_loc_map.count (INST_EXEC) != 0; ++ case CACHE_MISSES: ++ return event_func_map.count (CACHE_MISSES) != 0 ++ || event_loc_map.count (CACHE_MISSES) != 0; ++ default: ++ return false; ++ } ++} ++ ++void ++extend_auto_profile::dump_event () ++{ ++ if (dump_file) ++ { ++ switch (profile_type) ++ { ++ case INST_EXEC: ++ fprintf (dump_file, "Processing event instruction execution.\n"); ++ break; ++ case CACHE_MISSES: ++ fprintf (dump_file, "Processing event cache misses.\n"); ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++/* Return true if any profile data was read. */ ++ ++bool ++extend_auto_profile::read () ++{ ++ const char *event_files[EVENT_NUMBER] = {NULL}; ++ if (!get_all_profile_names (event_files)) ++ { ++ return false; ++ } ++ ++ /* Backup AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE since we will create ++ new ones for each event_type. 
*/ ++ autofdo::string_table *string_table_afdo = afdo_string_table; ++ autofdo::autofdo_source_profile *source_profile_afdo = afdo_source_profile; ++ ++ for (unsigned i = 0; i < EVENT_NUMBER; i++) ++ { ++ if (event_files[i] == NULL) ++ { ++ continue; ++ } ++ profile_type = (enum event_type) i; ++ dump_event (); ++ gcov_close (); ++ auto_profile_file = event_files[i]; ++ read_profile (); ++ gcov_close (); ++ ++ process_extend_source_profile (); ++ ++ delete afdo_source_profile; ++ delete afdo_string_table; ++ } ++ ++ /* Restore AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE. Function ++ END_AUTO_PROFILE will free them at the end of compilation. */ ++ afdo_string_table = string_table_afdo; ++ afdo_source_profile = source_profile_afdo; ++ return true; ++} ++ ++/* Helper functions. */ ++ ++gcov_type ++extend_auto_profile::get_loc_count (location_t loc, event_type type) ++{ ++ event_loc_count_map::iterator event_iter = event_loc_map.find (type); ++ if (event_iter != event_loc_map.end ()) ++ { ++ loc_count_map::iterator loc_iter = event_iter->second.find (loc); ++ if (loc_iter != event_iter->second.end ()) ++ { ++ return loc_iter->second; ++ } ++ } ++ return 0; ++} ++ ++struct rank_info ++extend_auto_profile::get_func_rank (unsigned decl_uid, enum event_type type) ++{ ++ struct rank_info info = {0, 0}; ++ event_rank_map::iterator event_iter = func_rank.find (type); ++ if (event_iter != func_rank.end ()) ++ { ++ rank_map::iterator func_iter = event_iter->second.find (decl_uid); ++ if (func_iter != event_iter->second.end ()) ++ { ++ info.rank = func_iter->second; ++ info.total = event_iter->second.size (); ++ } ++ } ++ return info; ++} ++ ++gcov_type ++extend_auto_profile::get_func_count (unsigned decl_uid, event_type type) ++{ ++ event_func_count_map::iterator event_iter = event_func_map.find (type); ++ if (event_iter != event_func_map.end ()) ++ { ++ func_count_map::iterator func_iter = event_iter->second.find (decl_uid); ++ if (func_iter != event_iter->second.end ()) ++ { ++ 
return func_iter->second; ++ } ++ } ++ return 0; ++} ++ ++static extend_auto_profile *extend_profile; ++ + /* Helper functions. */ + + /* Return the original name of NAME: strip the suffix that starts +@@ -1654,6 +1866,131 @@ auto_profile (void) + + return TODO_rebuild_cgraph_edges; + } ++ ++void ++extend_auto_profile::rank_all_func () ++{ ++ std::vector > func_sorted; ++ event_func_count_map::iterator event_iter ++ = event_func_map.find (profile_type); ++ if (event_iter != event_func_map.end ()) ++ { ++ func_count_map::iterator func_iter; ++ for (func_iter = event_iter->second.begin (); ++ func_iter != event_iter->second.end (); func_iter++) ++ { ++ func_sorted.push_back (std::make_pair (func_iter->first, ++ func_iter->second)); ++ } ++ ++ std::sort (func_sorted.begin (), func_sorted.end (), event_count_cmp); ++ ++ for (unsigned i = 0; i < func_sorted.size (); ++i) ++ { ++ func_rank[profile_type][func_sorted[i].first] = i + 1; ++ } ++ } ++} ++ ++/* Iterate stmts in cfun and maintain its count to EVENT_LOC_MAP. */ ++ ++void ++extend_auto_profile::set_loc_count () ++{ ++ basic_block bb; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ gimple_stmt_iterator gsi; ++ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ count_info info; ++ gimple *stmt = gsi_stmt (gsi); ++ if (gimple_clobber_p (stmt) || is_gimple_debug (stmt)) ++ { ++ continue; ++ } ++ if (afdo_source_profile->get_count_info (stmt, &info)) ++ { ++ location_t loc = gimple_location (stmt); ++ event_loc_map[profile_type][loc] += info.count; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM); ++ fprintf (dump_file, "counts %ld\n", ++ event_loc_map[profile_type][loc]); ++ } ++ } ++ } ++ } ++} ++ ++/* Process data in extend_auto_source_profile, save them into two maps. ++ 1. gimple_location to count. ++ 2. function_index to count. 
*/ ++void ++extend_auto_profile::process_extend_source_profile () ++{ ++ struct cgraph_node *node; ++ if (symtab->state == FINISHED) ++ { ++ return; ++ } ++ FOR_EACH_FUNCTION (node) ++ { ++ if (!gimple_has_body_p (node->decl) || node->inlined_to) ++ { ++ continue; ++ } ++ ++ /* Don't profile functions produced for builtin stuff. */ ++ if (DECL_SOURCE_LOCATION (node->decl) == BUILTINS_LOCATION) ++ { ++ continue; ++ } ++ ++ function *fn = DECL_STRUCT_FUNCTION (node->decl); ++ push_cfun (fn); ++ ++ const function_instance *s ++ = afdo_source_profile->get_function_instance_by_decl ( ++ current_function_decl); ++ ++ if (s == NULL) ++ { ++ pop_cfun (); ++ continue; ++ } ++ unsigned int decl_uid = DECL_UID (current_function_decl); ++ gcov_type count = s->total_count (); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Extend auto-profile for function %s.\n", ++ node->dump_name ()); ++ } ++ event_func_map[profile_type][decl_uid] += count; ++ set_loc_count (); ++ pop_cfun (); ++ } ++ rank_all_func (); ++} ++ ++/* Main entry of extend_auto_profile. */ ++ ++static void ++extend_source_profile () ++{ ++ extend_profile = autofdo::extend_auto_profile::create (); ++ if (dump_file) ++ { ++ if (extend_profile == NULL) ++ { ++ fprintf (dump_file, "No profile file is found.\n"); ++ return; ++ } ++ fprintf (dump_file, "Extend profile info generated.\n"); ++ } ++} + } /* namespace autofdo. */ + + /* Read the profile from the profile data file. */ +@@ -1682,6 +2019,42 @@ end_auto_profile (void) + profile_info = NULL; + } + ++/* Extern function to get profile info in other passes. 
*/ ++ ++bool ++profile_exist (enum event_type type) ++{ ++ return autofdo::extend_profile != NULL ++ && autofdo::extend_profile->auto_profile_exist (type); ++} ++ ++gcov_type ++event_get_loc_count (location_t loc, event_type type) ++{ ++ return autofdo::extend_profile->get_loc_count (loc, type); ++} ++ ++gcov_type ++event_get_func_count (unsigned decl_uid, event_type type) ++{ ++ return autofdo::extend_profile->get_func_count (decl_uid, type); ++} ++ ++struct rank_info ++event_get_func_rank (unsigned decl_uid, enum event_type type) ++{ ++ return autofdo::extend_profile->get_func_rank (decl_uid, type); ++} ++ ++void ++free_extend_profile_info () ++{ ++ if (autofdo::extend_profile != NULL) ++ { ++ delete autofdo::extend_profile; ++ } ++} ++ + /* Returns TRUE if EDGE is hot enough to be inlined early. */ + + bool +@@ -1743,8 +2116,50 @@ public: + + } // anon namespace + ++namespace ++{ ++const pass_data pass_data_ipa_extend_auto_profile = ++{ ++ SIMPLE_IPA_PASS, /* type */ ++ "ex-afdo", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_IPA_EXTEND_AUTO_PROFILE, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ 0, /* todo_flags_finish */ ++}; ++ ++class pass_ipa_extend_auto_profile : public simple_ipa_opt_pass ++{ ++public: ++ pass_ipa_extend_auto_profile (gcc::context *ctxt) ++ : simple_ipa_opt_pass (pass_data_ipa_extend_auto_profile, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) {return (flag_ipa_extend_auto_profile > 0);} ++ virtual unsigned int execute (function *); ++ ++}; ++ ++unsigned int ++pass_ipa_extend_auto_profile::execute (function *fun) ++{ ++ autofdo::extend_source_profile (); ++ return 0; ++} ++} // anon namespace ++ + simple_ipa_opt_pass * + make_pass_ipa_auto_profile (gcc::context *ctxt) + { + return new pass_ipa_auto_profile (ctxt); + } ++ ++simple_ipa_opt_pass * ++make_pass_ipa_extend_auto_profile (gcc::context *ctxt) ++{ ++ 
return new pass_ipa_extend_auto_profile (ctxt); ++} +\ No newline at end of file +diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h +index f5cff091d..230d7e68a 100644 +--- a/gcc/auto-profile.h ++++ b/gcc/auto-profile.h +@@ -21,6 +21,13 @@ along with GCC; see the file COPYING3. If not see + #ifndef AUTO_PROFILE_H + #define AUTO_PROFILE_H + ++enum event_type ++{ ++ INST_EXEC = 0, ++ CACHE_MISSES, ++ EVENT_NUMBER ++}; ++ + /* Read, process, finalize AutoFDO data structures. */ + extern void read_autofdo_file (void); + extern void end_auto_profile (void); +@@ -28,4 +35,25 @@ extern void end_auto_profile (void); + /* Returns TRUE if EDGE is hot enough to be inlined early. */ + extern bool afdo_callsite_hot_enough_for_early_inline (struct cgraph_edge *); + ++/* Chcek if profile exists before using this profile. */ ++extern bool profile_exist (enum event_type); ++ ++/* Given func decl_uid or gimple location and event_type, return count. ++ Count is 0 if function or gimple is not sampled. */ ++extern gcov_type event_get_func_count (unsigned, enum event_type); ++extern gcov_type event_get_loc_count (location_t, enum event_type); ++ ++struct rank_info ++{ ++ unsigned total; ++ unsigned rank; ++}; ++ ++/* Given function decl_uid and event type, return rank_info. Rank_info ++ is {0, 0} if function was not sampled. */ ++extern struct rank_info event_get_func_rank (unsigned, enum event_type); ++ ++/* Free memory allocated by autofdo::extern_profile. */ ++extern void free_extend_profile_info (); ++ + #endif /* AUTO_PROFILE_H */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 73c24f28d..37cbbd8c0 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1074,6 +1074,16 @@ Common Joined RejectNegative Var(auto_profile_file) + Use sample profile information for call graph node weights. The profile + file is specified in the argument. + ++fcache-misses-profile ++Common Report Var(flag_cache_misses_profile) ++Use sample profile information for source code cache miss count. 
The default ++profile file is cmsdata.gcov in `pwd`. ++ ++fcache-misses-profile= ++Common Joined RejectNegative Var(cache_misses_profile_file) ++Use sample profile information for source code cache miss count. The profile ++file is specified in the argument. ++ + ; -fcheck-bounds causes gcc to generate array bounds checks. + ; For C, C++ and ObjC: defaults off. + ; For Java: defaults to on. +@@ -1873,6 +1883,10 @@ fipa-struct-reorg + Common Report Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + ++fipa-extend-auto-profile ++Common Report Var(flag_ipa_extend_auto_profile) ++Use sample profile information for source code. ++ + fipa-vrp + Common Report Var(flag_ipa_vrp) Optimization + Perform IPA Value Range Propagation. +diff --git a/gcc/opts.c b/gcc/opts.c +index 6924a973a..642327296 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -1742,6 +1742,13 @@ enable_fdo_optimizations (struct gcc_options *opts, + SET_OPTION_IF_UNSET (opts, opts_set, flag_tree_loop_distribution, value); + } + ++static void ++set_cache_misses_profile_params (struct gcc_options *opts, ++ struct gcc_options *opts_set) ++{ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); ++} ++ + /* -f{,no-}sanitize{,-recover}= suboptions. */ + const struct sanitizer_opts_s sanitizer_opts[] = + { +@@ -2604,6 +2611,25 @@ common_handle_option (struct gcc_options *opts, + param_early_inliner_max_iterations, 10); + break; + ++ case OPT_fipa_extend_auto_profile: ++ opts->x_flag_ipa_extend_auto_profile = opts->x_flag_cache_misses_profile ++ ? true : value; ++ break; ++ ++ case OPT_fcache_misses_profile_: ++ opts->x_cache_misses_profile_file = xstrdup (arg); ++ opts->x_flag_cache_misses_profile = true; ++ value = true; ++ /* No break here - do -fcache-misses-profile processing. 
*/ ++ /* FALLTHRU */ ++ case OPT_fcache_misses_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; ++ if (value) ++ { ++ set_cache_misses_profile_params (opts, opts_set); ++ } ++ break; ++ + case OPT_fprofile_generate_: + opts->x_profile_data_prefix = xstrdup (arg); + value = true; +diff --git a/gcc/passes.def b/gcc/passes.def +index 63303ab65..e9c91d26e 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -133,6 +133,7 @@ along with GCC; see the file COPYING3. If not see + + NEXT_PASS (pass_target_clone); + NEXT_PASS (pass_ipa_auto_profile); ++ NEXT_PASS (pass_ipa_extend_auto_profile); + NEXT_PASS (pass_ipa_tree_profile); + PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile) + NEXT_PASS (pass_feedback_split_functions); +diff --git a/gcc/timevar.def b/gcc/timevar.def +index ee25eccbb..e873747a8 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -82,6 +82,7 @@ DEFTIMEVAR (TV_IPA_FNSPLIT , "ipa function splitting") + DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") + DEFTIMEVAR (TV_IPA_REORDER_FIELDS , "ipa struct reorder fields optimization") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") ++DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") + DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream decompression") + DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream compression") +diff --git a/gcc/toplev.c b/gcc/toplev.c +index eaed6f6c7..51e6bd400 100644 +--- a/gcc/toplev.c ++++ b/gcc/toplev.c +@@ -577,6 +577,12 @@ compile_file (void) + targetm.asm_out.output_ident (ident_str); + } + ++ /* Extend auto profile finalization. */ ++ if (flag_ipa_extend_auto_profile) ++ { ++ free_extend_profile_info (); ++ } ++ + /* Auto profile finalization. 
*/ + if (flag_auto_profile) + end_auto_profile (); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index eb32c5d44..be6387768 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -511,6 +511,8 @@ extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_reorder_fields (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context ++ *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); +-- +2.27.0.windows.1 + diff --git a/0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch b/0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch new file mode 100644 index 0000000000000000000000000000000000000000..bbc98c64e86f435a359b14847d13e3395fa914dd --- /dev/null +++ b/0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch @@ -0,0 +1,353 @@ +From eb58d920a95696d8d5a7db9a6d640d4494fb023f Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 25 Jan 2022 16:57:28 +0800 +Subject: [PATCH 26/28] [AutoFDO] Enable discriminator and MCF algorithm on + AutoFDO + +1. Support discriminator for distinguishes among several + basic blocks that share a common locus, allowing for + more accurate autofdo. + +2. Using option -fprofile-correction for calling MCF algorithm + to smooth non conservative BB counts. 
+--- + gcc/auto-profile.c | 172 ++++++++++++++++++++++++++++++++++++++++++++- + gcc/cfghooks.c | 7 ++ + gcc/ipa-cp.c | 21 ++++++ + gcc/opts.c | 5 +- + gcc/tree-inline.c | 14 ++++ + 5 files changed, 215 insertions(+), 4 deletions(-) + +diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c +index aced8fca5..e6164b91b 100644 +--- a/gcc/auto-profile.c ++++ b/gcc/auto-profile.c +@@ -678,6 +678,17 @@ string_table::get_index (const char *name) const + if (name == NULL) + return -1; + string_index_map::const_iterator iter = map_.find (name); ++ /* Function name may be duplicate. Try to distinguish by the ++ #file_name#function_name defined by the autofdo tool chain. */ ++ if (iter == map_.end ()) ++ { ++ char* file_name = get_original_name (lbasename (dump_base_name)); ++ char* file_func_name ++ = concat ("#", file_name, "#", name, NULL); ++ iter = map_.find (file_func_name); ++ free (file_name); ++ free (file_func_name); ++ } + if (iter == map_.end ()) + return -1; + +@@ -866,7 +877,7 @@ function_instance::read_function_instance (function_instance_stack *stack, + + for (unsigned i = 0; i < num_pos_counts; i++) + { +- unsigned offset = gcov_read_unsigned () & 0xffff0000; ++ unsigned offset = gcov_read_unsigned (); + unsigned num_targets = gcov_read_unsigned (); + gcov_type count = gcov_read_counter (); + s->pos_counts[offset].count = count; +@@ -945,6 +956,10 @@ autofdo_source_profile::get_count_info (gimple *stmt, count_info *info) const + function_instance *s = get_function_instance_by_inline_stack (stack); + if (s == NULL) + return false; ++ if (s->get_count_info (stack[0].second + stmt->bb->discriminator, info)) ++ { ++ return true; ++ } + return s->get_count_info (stack[0].second, info); + } + +@@ -1583,6 +1598,68 @@ afdo_propagate (bb_set *annotated_bb) + } + } + ++/* Process the following scene when the branch probability ++ inversion when do function afdo_propagate (). E.g. 
++ BB_NUM (sample count) ++ BB1 (1000) ++ / \ ++ BB2 (10) BB3 (0) ++ \ / ++ BB4 ++ In afdo_propagate(), count of BB3 is calculated by ++ COUNT (BB3) = 990 (990 = COUNT (BB1) - COUNT (BB2) = 1000 - 10) ++ ++ In fact, BB3 may be colder than BB2 by sample count. ++ ++ This function allocate source BB count to each succ BB by sample ++ rate, E.g. ++ BB2_COUNT = BB1_COUNT * (BB2_COUNT / (BB2_COUNT + BB3_COUNT)) */ ++ ++static void ++afdo_preprocess_bb_count () ++{ ++ basic_block bb; ++ FOR_ALL_BB_FN (bb, cfun) ++ { ++ if (bb->count.ipa_p () && EDGE_COUNT (bb->succs) > 1 ++ && bb->count > profile_count::zero ().afdo ()) ++ { ++ basic_block bb1 = EDGE_SUCC (bb, 0)->dest; ++ basic_block bb2 = EDGE_SUCC (bb, 1)->dest; ++ if (single_succ_p (bb1) && single_succ_p (bb2) ++ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest) ++ { ++ gcov_type max_count = 0; ++ gcov_type total_count = 0; ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (!e->dest->count.ipa_p ()) ++ { ++ continue; ++ } ++ max_count = MAX(max_count, e->dest->count.to_gcov_type ()); ++ total_count += e->dest->count.to_gcov_type (); ++ } ++ /* Only bb_count > max_count * 2, branch probability will ++ inversion. */ ++ if (max_count > 0 ++ && bb->count.to_gcov_type () > max_count * 2) ++ { ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ gcov_type target_count = bb->count.to_gcov_type () ++ * e->dest->count.to_gcov_type () / total_count; ++ e->dest->count ++ = profile_count::from_gcov_type (target_count).afdo (); ++ } ++ } ++ } ++ } ++ } ++} ++ + /* Propagate counts on control flow graph and calculate branch + probabilities. */ + +@@ -1608,6 +1685,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb) + } + + afdo_find_equiv_class (annotated_bb); ++ afdo_preprocess_bb_count (); + afdo_propagate (annotated_bb); + + FOR_EACH_BB_FN (bb, cfun) +@@ -1711,6 +1789,82 @@ afdo_vpt_for_early_inline (stmt_set *promoted_stmts) + return false; + } + ++/* Preparation before executing MCF algorithm. 
*/ ++ ++static void ++afdo_init_mcf () ++{ ++ basic_block bb; ++ edge e; ++ edge_iterator ei; ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n init calling mcf_smooth_cfg (). \n"); ++ } ++ ++ /* Step1: when use mcf, BB id must be continous, ++ so we need compact_blocks (). */ ++ compact_blocks (); ++ ++ /* Step2: allocate memory for MCF input data. */ ++ bb_gcov_counts.safe_grow_cleared (cfun->cfg->x_last_basic_block); ++ edge_gcov_counts = new hash_map; ++ ++ /* Step3: init MCF input data from cfg. */ ++ FOR_ALL_BB_FN (bb, cfun) ++ { ++ /* Init BB count for MCF. */ ++ bb_gcov_count (bb) = bb->count.to_gcov_type (); ++ ++ gcov_type total_count = 0; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ total_count += e->dest->count.to_gcov_type (); ++ } ++ ++ /* If there is no sample in each successor blocks, source ++ BB samples are allocated to each edge by branch static prob. */ ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (total_count == 0) ++ { ++ edge_gcov_count (e) = e->src->count.to_gcov_type () ++ * e->probability.to_reg_br_prob_base () / REG_BR_PROB_BASE; ++ } ++ else ++ { ++ edge_gcov_count (e) = e->src->count.to_gcov_type () ++ * e->dest->count.to_gcov_type () / total_count; ++ } ++ } ++ } ++} ++ ++/* Free the resources used by MCF and reset BB count from MCF result, ++ branch probability has been updated in mcf_smooth_cfg (). */ ++ ++static void ++afdo_process_after_mcf () ++{ ++ basic_block bb; ++ /* Reset BB count from MCF result. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ if (bb_gcov_count (bb)) ++ { ++ bb->count ++ = profile_count::from_gcov_type (bb_gcov_count (bb)).afdo (); ++ } ++ } ++ ++ /* Clean up MCF resource. */ ++ bb_gcov_counts.release (); ++ delete edge_gcov_counts; ++ edge_gcov_counts = NULL; ++} ++ + /* Annotate auto profile to the control flow graph. Do not annotate value + profile for stmts in PROMOTED_STMTS. 
*/ + +@@ -1762,8 +1916,20 @@ afdo_annotate_cfg (const stmt_set &promoted_stmts) + afdo_source_profile->mark_annotated (cfun->function_end_locus); + if (max_count > profile_count::zero ()) + { +- /* Calculate, propagate count and probability information on CFG. */ +- afdo_calculate_branch_prob (&annotated_bb); ++ /* 1 means -fprofile-correction is enabled manually, and MCF ++ algorithm will be used to calculate count and probability. ++ Otherwise, use the default calculate algorithm. */ ++ if (flag_profile_correction == 1) ++ { ++ afdo_init_mcf (); ++ mcf_smooth_cfg (); ++ afdo_process_after_mcf (); ++ } ++ else ++ { ++ /* Calculate, propagate count and probability information on CFG. */ ++ afdo_calculate_branch_prob (&annotated_bb); ++ } + } + update_max_bb_count (); + profile_status_for_fn (cfun) = PROFILE_READ; +diff --git a/gcc/cfghooks.c b/gcc/cfghooks.c +index ea558b469..4ea490a8a 100644 +--- a/gcc/cfghooks.c ++++ b/gcc/cfghooks.c +@@ -526,6 +526,9 @@ split_block_1 (basic_block bb, void *i) + return NULL; + + new_bb->count = bb->count; ++ /* Copy discriminator from original bb for distinguishes among ++ several basic blocks that share a common locus, allowing for ++ more accurate autofdo. */ + new_bb->discriminator = bb->discriminator; + + if (dom_info_available_p (CDI_DOMINATORS)) +@@ -1091,6 +1094,10 @@ duplicate_block (basic_block bb, edge e, basic_block after, copy_bb_data *id) + move_block_after (new_bb, after); + + new_bb->flags = (bb->flags & ~BB_DUPLICATED); ++ /* Copy discriminator from original bb for distinguishes among ++ several basic blocks that share a common locus, allowing for ++ more accurate autofdo. 
*/ ++ new_bb->discriminator = bb->discriminator; + FOR_EACH_EDGE (s, ei, bb->succs) + { + /* Since we are creating edges from a new block to successors +diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c +index b1f0881bd..c208070c9 100644 +--- a/gcc/ipa-cp.c ++++ b/gcc/ipa-cp.c +@@ -4365,6 +4365,27 @@ update_profiling_info (struct cgraph_node *orig_node, + orig_node_count.dump (dump_file); + fprintf (dump_file, "\n"); + } ++ ++ /* When autofdo uses PMU as the sampling unit, the count of ++ cgraph_node->count cannot be obtained directly and will ++ be zero. It using for apply_scale will cause the node ++ count incorrectly overestimated. So set orig_new_node_count ++ equal to orig_node_count, which is same as known error ++ handling. */ ++ if (orig_node->count == profile_count::zero ().afdo () ++ && new_node->count == profile_count::zero ().global0adjusted ()) ++ { ++ orig_new_node_count = (orig_sum + new_sum).apply_scale (12, 10); ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, " node %s with zero count from afdo ", ++ new_node->dump_name ()); ++ fprintf (dump_file, " proceeding by pretending it was "); ++ orig_new_node_count.dump (dump_file); ++ fprintf (dump_file, "\n"); ++ } ++ } + } + + remainder = orig_node_count.combine_with_ipa_count (orig_node_count.ipa () +diff --git a/gcc/opts.c b/gcc/opts.c +index 642327296..7a39f618b 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -2606,7 +2606,10 @@ common_handle_option (struct gcc_options *opts, + /* FALLTHRU */ + case OPT_fauto_profile: + enable_fdo_optimizations (opts, opts_set, value); +- SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, value); ++ /* 2 is special and means flag_profile_correction trun on by ++ -fauto-profile. */ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, ++ (value ? 
2 : 0)); + SET_OPTION_IF_UNSET (opts, opts_set, + param_early_inliner_max_iterations, 10); + break; +diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c +index efde5d158..8405a959c 100644 +--- a/gcc/tree-inline.c ++++ b/gcc/tree-inline.c +@@ -2015,6 +2015,10 @@ copy_bb (copy_body_data *id, basic_block bb, + basic_block_info automatically. */ + copy_basic_block = create_basic_block (NULL, (basic_block) prev->aux); + copy_basic_block->count = bb->count.apply_scale (num, den); ++ /* Copy discriminator from original bb for distinguishes among ++ several basic blocks that share a common locus, allowing for ++ more accurate autofdo. */ ++ copy_basic_block->discriminator = bb->discriminator; + + copy_gsi = gsi_start_bb (copy_basic_block); + +@@ -3028,6 +3032,16 @@ copy_cfg_body (copy_body_data * id, + den += e->count (); + ENTRY_BLOCK_PTR_FOR_FN (cfun)->count = den; + } ++ /* When autofdo uses PMU as the sampling unit, the number of ++ ENTRY_BLOCK_PTR_FOR_FN cannot be obtained directly and will ++ be zero. It using for adjust_for_ipa_scaling will cause the ++ inlined BB count incorrectly overestimated. So set den equal ++ to num, which is the source inline BB count to avoid ++ overestimated. */ ++ if (den == profile_count::zero ().afdo ()) ++ { ++ den = num; ++ } + + profile_count::adjust_for_ipa_scaling (&num, &den); + +-- +2.27.0.windows.1 + diff --git a/0027-Autoprefetch-Support-auto-feedback-prefetch.patch b/0027-Autoprefetch-Support-auto-feedback-prefetch.patch new file mode 100644 index 0000000000000000000000000000000000000000..c3dcf506a7705ef1f4bdde00f25dd62c287aa0c6 --- /dev/null +++ b/0027-Autoprefetch-Support-auto-feedback-prefetch.patch @@ -0,0 +1,1000 @@ +From 6b944bed1158d3454b1db27aeab4ec1f2b8e5866 Mon Sep 17 00:00:00 2001 +From: huangxiaoquan +Date: Thu, 27 Jan 2022 18:24:53 +0800 +Subject: [PATCH 27/28] [Autoprefetch] Support auto feedback prefetch + +1.Add option -fprefetch-loop-arrays=[value]. 
+ +2.A prefetch distance analysis algorithm based on branch weight + is proposed to improve the accuracy of prefetch distance. + +3.Propose automatic feedback prefetching: + use the cache-miss profile information to guide the insertion of + prefetching instructions. +--- + gcc/auto-profile.c | 5 +- + gcc/common.opt | 5 + + gcc/opts.c | 7 + + gcc/params.opt | 16 + + gcc/tree-ssa-loop-prefetch.c | 735 ++++++++++++++++++++++++++++++++++- + 5 files changed, 748 insertions(+), 20 deletions(-) + +diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c +index e6164b91b..f221978fc 100644 +--- a/gcc/auto-profile.c ++++ b/gcc/auto-profile.c +@@ -21,6 +21,8 @@ along with GCC; see the file COPYING3. If not see + #include "config.h" + #define INCLUDE_MAP + #define INCLUDE_SET ++#define INCLUDE_ALGORITHM ++#define INCLUDE_VECTOR + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -49,9 +51,6 @@ along with GCC; see the file COPYING3. If not see + #include "auto-profile.h" + #include "tree-pretty-print.h" + #include "gimple-pretty-print.h" +-#include +-#include +-#include + + /* The following routines implements AutoFDO optimization. + +diff --git a/gcc/common.opt b/gcc/common.opt +index 37cbbd8c0..9488bd90f 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2201,6 +2201,11 @@ fprefetch-loop-arrays + Common Report Var(flag_prefetch_loop_arrays) Init(-1) Optimization + Generate prefetch instructions, if available, for arrays in loops. + ++fprefetch-loop-arrays= ++Common Joined RejectNegative UInteger Var(prefetch_level) Init(0) IntegerRange(0, 3) ++Generate prefetch instructions, if available, for arrays in loops. The prefetch ++level can control the optimize level to array prefetch. ++ + fprofile + Common Report Var(profile_flag) + Enable basic program profiling code. 
+diff --git a/gcc/opts.c b/gcc/opts.c +index 7a39f618b..f49f5ee58 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -1747,6 +1747,8 @@ set_cache_misses_profile_params (struct gcc_options *opts, + struct gcc_options *opts_set) + { + SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); ++ SET_OPTION_IF_UNSET (opts, opts_set, prefetch_level, 2); ++ SET_OPTION_IF_UNSET (opts, opts_set, param_simultaneous_prefetches, 100); + } + + /* -f{,no-}sanitize{,-recover}= suboptions. */ +@@ -2645,6 +2647,11 @@ common_handle_option (struct gcc_options *opts, + SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_bit_cp, value); + break; + ++ case OPT_fprefetch_loop_arrays_: ++ opts->x_prefetch_level = value; ++ opts->x_flag_prefetch_loop_arrays = true; ++ break; ++ + case OPT_fpatchable_function_entry_: + { + char *patch_area_arg = xstrdup (arg); +diff --git a/gcc/params.opt b/gcc/params.opt +index 2db69cc87..9d1faa7ab 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -968,4 +968,20 @@ Bound on number of runtime checks inserted by the vectorizer's loop versioning f + Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization + Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check. + ++-param=param-prefetch-func-topn= ++Common Joined UInteger Var(param_prefetch_func_topn) Init(3) Param Optimization ++TopN functions of cache miss counts to be analyzed in prefetching. ++ ++-param=param-prefetch-ref-topn= ++Common Joined UInteger Var(param_prefetch_ref_topn) Init(5) Param Optimization ++TopN ref of cache miss counts to be analyzed in prefetching. ++ ++-param=param-high-loop-execution-rate= ++Common Joined UInteger Var(param_high_loop_execution_rate) Init(95) IntegerRange(0, 100) Param Optimization ++High execution rate loops to be analyzed in prefetch (in%). 
+ ++-param=param-prefetch-func-counts-threshold= ++Common Joined UInteger Var(param_prefetch_func_counts_threshold) Init(100) Param Optimization ++Threshold functions of cache miss counts to be analyzed in prefetching. ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c +index d19ece641..3a5aef0fc 100644 +--- a/gcc/tree-ssa-loop-prefetch.c ++++ b/gcc/tree-ssa-loop-prefetch.c +@@ -18,6 +18,9 @@ along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + + #include "config.h" ++#define INCLUDE_ALGORITHM ++#define INCLUDE_MAP ++#define INCLUDE_VECTOR + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -48,6 +51,11 @@ along with GCC; see the file COPYING3. If not see + #include "tree-data-ref.h" + #include "diagnostic-core.h" + #include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "tree-cfg.h" ++#include "auto-profile.h" ++#include "cgraph.h" ++#include "print-tree.h" + + /* This pass inserts prefetch instructions to optimize cache usage during + accesses to arrays in loops. It processes loops sequentially and: +@@ -253,6 +261,22 @@ struct mem_ref_group + #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200 + #endif + ++#ifndef PREFETCH_FUNC_TOPN ++#define PREFETCH_FUNC_TOPN param_prefetch_func_topn ++#endif ++ ++#ifndef PREFETCH_FUNC_COUNTS_THRESHOLD ++#define PREFETCH_FUNC_COUNTS_THRESHOLD param_prefetch_func_counts_threshold ++#endif ++ ++#ifndef PREFETCH_REF_TOPN ++#define PREFETCH_REF_TOPN param_prefetch_ref_topn ++#endif ++ ++#ifndef LOOP_EXECUTION_RATE ++#define LOOP_EXECUTION_RATE param_high_loop_execution_rate ++#endif ++ + /* The memory reference. */ + + struct mem_ref +@@ -279,6 +303,131 @@ struct mem_ref + nontemporal one. */ + }; + ++/* Probability information of basic blocks and branches. 
*/ ++struct bb_bp ++{ ++ basic_block bb; ++ basic_block true_edge_bb; ++ basic_block false_edge_bb; ++ float true_edge_prob; ++ float false_edge_prob; ++ float bb_prob; ++}; ++ ++typedef struct bb_bp bb_bp; ++ ++enum PREFETCH_MODE ++{ ++ ORIGINAL_MODE=0, /* Original prefetch method. */ ++ REFINE_BB_AHEAD, ++ /* Prefetch distance algorithm for removing ++ irrelevant bb. */ ++ BRANCH_WEIGHTED_AHEAD, ++ /* Branch weighted prefetch ++ distance algorithm. */ ++ INDIRECT_MODE /* Indirect array prefetch mode. */ ++}; ++ ++typedef std::map uid_rank_map; ++typedef std::map loc_rank_map; ++typedef std::vector > loc_gcov_type_vec; ++typedef std::map > loc_gimple_vec_map; ++ ++static loc_rank_map ref_rank; ++ ++/* Callback function for event_count comparison. */ ++ ++static bool ++event_count_cmp (std::pair &a, ++ std::pair &b) ++{ ++ return a.second > b.second; ++} ++ ++/* Prepared mappings from location to counts and from location ++ to stmt list. */ ++ ++static void ++prepare_loc_count_info (function *fun, loc_gcov_type_vec &ref_sorted, ++ loc_gimple_vec_map &loc_stmt, event_type event) ++{ ++ basic_block bb = NULL; ++ gimple_stmt_iterator bsi; ++ gimple *stmt; ++ tree lhs = NULL_TREE; ++ tree rhs = NULL_TREE; ++ ++ FOR_EACH_BB_FN (bb, fun) ++ { ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ stmt = gsi_stmt (bsi); ++ if (gimple_code (stmt) != GIMPLE_ASSIGN) ++ { ++ continue; ++ } ++ if (!gimple_vuse (stmt)) ++ { ++ continue; ++ } ++ lhs = gimple_assign_lhs (stmt); ++ rhs = gimple_assign_rhs1 (stmt); ++ if (REFERENCE_CLASS_P (rhs) || REFERENCE_CLASS_P (lhs)) ++ { ++ gcov_type loc_count = ++ event_get_loc_count (gimple_location (stmt), event); ++ if (loc_count > 0) ++ { ++ /* There may be multiple gimple correspond to the same ++ location. 
*/ ++ if (loc_stmt.count (gimple_location (stmt)) == 0) ++ { ++ ref_sorted.push_back (std::make_pair (gimple_location (stmt), ++ loc_count)); ++ } ++ loc_stmt[gimple_location (stmt)].push_back (stmt); ++ } ++ } ++ } ++ } ++} ++ ++/* Sort references by event_count and dump loc count information after ++ sorting. */ ++ ++static void ++sort_ref_by_event_count (function *fun, event_type event) ++{ ++ loc_gcov_type_vec ref_sorted; ++ loc_gimple_vec_map loc_stmt; ++ ++ prepare_loc_count_info (fun, ref_sorted, loc_stmt, event); ++ sort (ref_sorted.begin (), ref_sorted.end (), event_count_cmp); ++ ++ for (unsigned i = 0; i < ref_sorted.size (); ++i) ++ { ++ ref_rank[ref_sorted[i].first] = i + 1; ++ /* Print the stmt and count of the topn ref. */ ++ if (i < PREFETCH_REF_TOPN && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt: \n"); ++ for (unsigned j = 0; j < loc_stmt[ref_sorted[i].first].size (); ++ ++j) ++ { ++ print_gimple_stmt (dump_file, ++ loc_stmt[ref_sorted[i].first][j], 0); ++ } ++ gcov_type loc_count = ++ event_get_loc_count (ref_sorted[i].first, event); ++ fprintf (dump_file, "stmt loc %u counts is %lu: " ++ "rank %d in top %d, (candidate analysis)\n\n", ++ ref_sorted[i].first, loc_count, ++ ref_rank[ref_sorted[i].first], PREFETCH_REF_TOPN); ++ } ++ } ++ return; ++} ++ + /* Dumps information about memory reference */ + static void + dump_mem_details (FILE *file, tree base, tree step, +@@ -479,6 +628,30 @@ idx_analyze_ref (tree base, tree *index, void *data) + return true; + } + ++/* Dumps information about ar_data structure. 
*/ ++ ++static void ++dump_ar_data_details (FILE *file, tree ref, struct ar_data &ar_data) ++{ ++ print_generic_expr (file, ref, TDF_SLIM); ++ fprintf (file, "\n"); ++ if (*(ar_data.step)) ++ { ++ fprintf (file, " step "); ++ if (cst_and_fits_in_hwi (*(ar_data.step))) ++ fprintf (file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (*(ar_data.step))); ++ else ++ print_generic_expr (file, *(ar_data.step), TDF_SLIM); ++ } ++ fprintf (file, "\n"); ++ if (*(ar_data.delta)) ++ { ++ fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", ++ *(ar_data.delta)); ++ } ++} ++ + /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and + STEP are integer constants and iter is number of iterations of LOOP. The + reference occurs in statement STMT. Strips nonaddressable component +@@ -526,7 +699,17 @@ analyze_ref (class loop *loop, tree *ref_p, tree *base, + ar_data.stmt = stmt; + ar_data.step = step; + ar_data.delta = delta; +- return for_each_index (base, idx_analyze_ref, &ar_data); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ dump_ar_data_details (dump_file, ref, ar_data); ++ } ++ bool idx_flag = for_each_index (base, idx_analyze_ref, &ar_data); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "idx_flag = %d \n\n", idx_flag); ++ } ++ return idx_flag; + } + + /* Record a memory reference REF to the list REFS. The reference occurs in +@@ -601,6 +784,55 @@ gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs, + return true; + } + ++/* Determine whether to collect the memory references based on the ++ ranking of ref cache miss counts. 
*/ ++ ++static bool ++should_gather_memory_references (gimple *stmt) ++{ ++ if (!(profile_exist (CACHE_MISSES))) ++ { ++ return true; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt:"); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ if (ref_rank.count (gimple_location (stmt)) == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt location no found, skip prefetch " ++ "analysis\n"); ++ } ++ return false; ++ } ++ gcov_type loc_count = event_get_loc_count (gimple_location (stmt), CACHE_MISSES); ++ if (ref_rank[gimple_location (stmt)] > PREFETCH_REF_TOPN) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt loc %u counts is %lu:" ++ "rank %d exceed topn %d, skip prefetch " ++ "analysis\n", ++ gimple_location (stmt), loc_count, ++ ref_rank[gimple_location (stmt)], PREFETCH_REF_TOPN); ++ } ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt loc %u counts is %lu: rank %d in top %d," ++ "continue prefetch analysis\n", ++ gimple_location (stmt), loc_count, ++ ref_rank[gimple_location (stmt)], PREFETCH_REF_TOPN); ++ } ++ return true; ++} ++ + /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to + true if there are no other memory references inside the loop. */ + +@@ -626,6 +858,13 @@ gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_c + if (bb->loop_father != loop) + continue; + ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ + for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) + { + stmt = gsi_stmt (bsi); +@@ -642,20 +881,31 @@ gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_c + if (! 
gimple_vuse (stmt)) + continue; + ++ if (!should_gather_memory_references (stmt)) ++ continue; ++ + lhs = gimple_assign_lhs (stmt); + rhs = gimple_assign_rhs1 (stmt); + + if (REFERENCE_CLASS_P (rhs)) + { +- *no_other_refs &= gather_memory_references_ref (loop, &refs, +- rhs, false, stmt); +- *ref_count += 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "====> the %dth ref \n", *ref_count); ++ } ++ *no_other_refs &= gather_memory_references_ref (loop, &refs, rhs, ++ false, stmt); ++ *ref_count += 1; + } + if (REFERENCE_CLASS_P (lhs)) + { +- *no_other_refs &= gather_memory_references_ref (loop, &refs, +- lhs, true, stmt); +- *ref_count += 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "====> the %dth ref \n", *ref_count); ++ } ++ *no_other_refs &= gather_memory_references_ref (loop, &refs, lhs, ++ true, stmt); ++ *ref_count += 1; + } + } + } +@@ -1168,9 +1418,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) + bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES; + + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n", +- nontemporal ? " nontemporal" : "", +- ref->group->uid, ref->uid); ++ fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n", ++ nontemporal ? " nontemporal" : "", ++ ref->group->uid, ref->uid); + + bsi = gsi_for_stmt (ref->stmt); + +@@ -1875,6 +2125,306 @@ insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count, + return false; + } + ++/* Obtain the edge probability information of each basic block in the loop. */ ++ ++static float ++get_edge_prob (edge e) ++{ ++ /* Limit the minimum probability value. 
*/ ++ const float MINNUM_PROB = 0.00001f; ++ float fvalue = 1; ++ ++ profile_probability probability = e->probability; ++ if (probability.initialized_p ()) ++ { ++ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); ++ if (fvalue < MINNUM_PROB && probability.to_reg_br_prob_base ()) ++ { ++ fvalue = MINNUM_PROB; ++ } ++ } ++ return fvalue; ++} ++ ++ ++/* Dump the bb information in a loop. */ ++ ++static void ++dump_loop_bb (struct loop *loop) ++{ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ basic_block bb = NULL; ++ ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "===== the %dth loop bb body ======= \n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ free (body); ++} ++ ++ ++/* Obtain the branch probability information of each basic block ++ in the loop. 
*/ ++ ++static void ++get_bb_branch_prob (hash_map &bb_branch_prob, ++ struct loop *loop) ++{ ++ basic_block *body = get_loop_body (loop); ++ basic_block bb = NULL; ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ bb_bp &branch_prob = bb_branch_prob.get_or_insert (bb); ++ branch_prob.bb = bb; ++ branch_prob.true_edge_bb = NULL; ++ branch_prob.false_edge_bb = NULL; ++ branch_prob.true_edge_prob = 0; ++ branch_prob.false_edge_prob = 0; ++ branch_prob.bb_prob = 0; ++ ++ gimple *stmt = last_stmt (bb); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ if (EDGE_COUNT (bb->succs) != 2) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "The number of successful edges of bb" ++ "is abnormal\n"); ++ continue; ++ } ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ /* If it is exiting bb, and the destination bb of the edge does not ++ belong to the current loop, the information of the edge is not ++ recorded. */ ++ if (true_edge->dest->loop_father == loop) ++ { ++ branch_prob.true_edge_bb = true_edge->dest; ++ branch_prob.true_edge_prob = get_edge_prob (true_edge); ++ } ++ if (false_edge->dest->loop_father == loop) ++ { ++ branch_prob.false_edge_bb = false_edge->dest; ++ branch_prob.false_edge_prob = get_edge_prob (false_edge); ++ } ++ } ++ ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ { ++ branch_prob.true_edge_bb = e->dest; ++ branch_prob.true_edge_prob = get_edge_prob (e); ++ } ++ } ++} ++ ++/* Traverse each bb in the loop and prune fake loops. */ ++ ++static bool ++traverse_prune_bb_branch (hash_map &bb_branch_prob, ++ int& max_path, hash_set &path_node, ++ basic_block current_bb, basic_block latch_bb) ++{ ++ /* Limit the maximum number of analysis paths. 
*/ ++ if (max_path <= 0 || current_bb == NULL) ++ return false; ++ ++ /* Do not join edges that do not form a complete loop. */ ++ bb_bp *bb_bp_node = bb_branch_prob.get (current_bb); ++ if (bb_bp_node == NULL || (bb_bp_node->true_edge_bb == NULL ++ && bb_bp_node->false_edge_bb == NULL)) ++ return false; ++ ++ if (current_bb == latch_bb) ++ { ++ max_path--; ++ return true; ++ } ++ ++ /* Do not join edges that return to non-dominate nodes. */ ++ if (path_node.contains (bb_bp_node->true_edge_bb) ++ || path_node.contains (bb_bp_node->false_edge_bb)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "fake loop: in bb%d\n", current_bb->index); ++ return false; ++ } ++ ++ path_node.add (current_bb); ++ if (bb_bp_node->true_edge_bb) ++ { ++ if (traverse_prune_bb_branch (bb_branch_prob, max_path, ++ path_node, bb_bp_node->true_edge_bb, latch_bb) == false) ++ return false; ++ } ++ if (bb_bp_node->false_edge_bb) ++ { ++ if (traverse_prune_bb_branch (bb_branch_prob, max_path, ++ path_node, bb_bp_node->false_edge_bb, latch_bb) == false) ++ return false; ++ } ++ path_node.remove (current_bb); ++ ++ max_path--; ++ return true; ++} ++ ++/* Traverse and calculate the probability of basic block. */ ++ ++static void ++traverse_calculate_bb_prob (hash_map &bb_branch_prob, ++ basic_block current_bb, basic_block latch_bb, ++ float prob) ++{ ++ /* Limit bb block access probability, the probability is ++ less than 100% and include delta. 
*/ ++ const float MAX_BB_PROBABILITY = 1.001f; ++ ++ if (current_bb == NULL) ++ { ++ return; ++ } ++ bb_bp *bb_bp_node = bb_branch_prob.get (current_bb); ++ bb_bp_node->bb_prob += prob; ++ ++ gcc_assert (bb_bp_node->bb_prob <= MAX_BB_PROBABILITY); ++ ++ if (bb_bp_node == NULL || (bb_bp_node->true_edge_bb == NULL ++ && bb_bp_node->false_edge_bb == NULL)) ++ { ++ return; ++ } ++ if (current_bb == latch_bb) ++ { ++ return; ++ } ++ ++ bool assign = (bb_bp_node->true_edge_bb && bb_bp_node->false_edge_bb); ++ if (bb_bp_node->true_edge_bb) ++ { ++ float assign_prob = assign ? bb_bp_node->true_edge_prob * prob : prob; ++ traverse_calculate_bb_prob (bb_branch_prob, ++ bb_bp_node->true_edge_bb, latch_bb, assign_prob); ++ } ++ if (bb_bp_node->false_edge_bb) ++ { ++ float assign_prob = assign ? bb_bp_node->false_edge_prob * prob : prob; ++ traverse_calculate_bb_prob (bb_branch_prob, ++ bb_bp_node->false_edge_bb, latch_bb, assign_prob); ++ } ++ return; ++} ++ ++/* Obtain the probability of basic block. */ ++ ++static bool ++get_bb_prob (hash_map &bb_branch_prob, struct loop *loop) ++{ ++ /* The upper limit of the branch path in the loop is 10000. 
*/ ++ const int MAX_BB_BRANCH_PATH = 10000; ++ ++ if (loop->header == NULL || loop->latch == NULL ++ || loop->header == loop->latch) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "get_bb_prob failed: without the header bb or " ++ "latch bb\n"); ++ return false; ++ } ++ ++ bb_bp *latch_branch_prob = bb_branch_prob.get (loop->latch); ++ bb_bp *header_branch_prob = bb_branch_prob.get (loop->header); ++ if (header_branch_prob == NULL || latch_branch_prob == NULL ++ || (latch_branch_prob->true_edge_bb != header_branch_prob->bb ++ && latch_branch_prob->false_edge_bb != header_branch_prob->bb)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "get_bb_prob failed: loop data exception\n"); ++ return false; ++ } ++ ++ hash_set path_node; ++ int max_path = MAX_BB_BRANCH_PATH; ++ if (traverse_prune_bb_branch (bb_branch_prob, max_path, path_node, ++ header_branch_prob->bb, loop->latch) == false) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "traverse_prune_bb_branch false.\n"); ++ return false; ++ } ++ traverse_calculate_bb_prob (bb_branch_prob, ++ header_branch_prob->bb, loop->latch, 1); ++ ++ return true; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. */ ++ ++static unsigned ++estimate_num_loop_insns (struct loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ gimple_stmt_iterator gsi; ++ float size = 0; ++ basic_block bb = NULL; ++ hash_map bb_branch_prob; ++ ++ if (prefetch_level >= BRANCH_WEIGHTED_AHEAD) ++ { ++ get_bb_branch_prob (bb_branch_prob, loop); ++ if (get_bb_prob (bb_branch_prob, loop) == false) ++ { ++ dump_loop_bb (loop); ++ return 0; ++ } ++ } ++ ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ bb = body[i]; ++ /* For nested loops, the bb of the inner loop is not calculated. 
*/ ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ ++ float size_tmp = 0; ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ size_tmp += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ ++ if (prefetch_level >= BRANCH_WEIGHTED_AHEAD) ++ { ++ float bb_prob = bb_branch_prob.get (bb)->bb_prob; ++ size += size_tmp * bb_prob; ++ } ++ else ++ { ++ size += size_tmp; ++ } ++ } ++ free (body); ++ ++ return unsigned (size); ++} + + /* Issue prefetch instructions for array references in LOOP. Returns + true if the LOOP was unrolled. */ +@@ -1899,7 +2449,15 @@ loop_prefetch_arrays (class loop *loop) + + /* FIXME: the time should be weighted by the probabilities of the blocks in + the loop body. */ +- time = tree_num_loop_insns (loop, &eni_time_weights); ++ ++ if (prefetch_level >= REFINE_BB_AHEAD) ++ { ++ time = estimate_num_loop_insns (loop, &eni_time_weights); ++ } ++ else ++ { ++ time = tree_num_loop_insns (loop, &eni_time_weights); ++ } + if (time == 0) + return false; + +@@ -1913,7 +2471,14 @@ loop_prefetch_arrays (class loop *loop) + if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter)) + return false; + +- ninsns = tree_num_loop_insns (loop, &eni_size_weights); ++ if (prefetch_level >= REFINE_BB_AHEAD) ++ { ++ ninsns = estimate_num_loop_insns (loop, &eni_size_weights); ++ } ++ else ++ { ++ ninsns = tree_num_loop_insns (loop, &eni_size_weights); ++ } + + /* Step 1: gather the memory references. */ + refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count); +@@ -1978,10 +2543,49 @@ fail: + return unrolled; + } + ++/* Determine if it is a high execution rate loop. 
*/ ++ ++static bool ++is_high_exec_rate_loop (struct loop *loop) ++{ ++ vec exit_edges = get_loop_exit_edges (loop); ++ if (exit_edges == vNULL) ++ { ++ return false; ++ } ++ ++ unsigned i = 0; ++ gcov_type exit_count = 0; ++ edge e = NULL; ++ float loop_exec_rate = 0; ++ gcov_type header_bb_count = loop->header->count.to_gcov_type (); ++ FOR_EACH_VEC_ELT (exit_edges, i, e) ++ { ++ gcov_type exiting_bb_count = e->src->count.to_gcov_type (); ++ float exit_edge_prob = get_edge_prob (e); ++ exit_count += exit_edge_prob * exiting_bb_count; ++ ++ loop_exec_rate = 1.0 - ((double) exit_count / header_bb_count); ++ ++ if (loop_exec_rate < (float) LOOP_EXECUTION_RATE / 100.0) ++ { ++ return false; ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "loop with high execution rate: %f >= %f\n\n", ++ loop_exec_rate, (float) LOOP_EXECUTION_RATE / 100.0); ++ dump_loop_bb (loop); ++ } ++ return true; ++} ++ + /* Issue prefetch instructions for array references in loops. 
*/ + + unsigned int +-tree_ssa_prefetch_arrays (void) ++tree_ssa_prefetch_arrays (function *fun) + { + class loop *loop; + bool unrolled = false; +@@ -2012,6 +2616,12 @@ tree_ssa_prefetch_arrays (void) + param_min_insn_to_prefetch_ratio); + fprintf (dump_file, " min insn-to-mem ratio: %d \n", + param_prefetch_min_insn_to_mem_ratio); ++ fprintf (dump_file, " prefetch_func_topn: %d \n", ++ param_prefetch_func_topn); ++ fprintf (dump_file, " prefetch_ref_topn: %d \n", ++ param_prefetch_ref_topn); ++ fprintf (dump_file, " high_loop_execution_rate: %d \n", ++ LOOP_EXECUTION_RATE); + fprintf (dump_file, "\n"); + } + +@@ -2028,13 +2638,42 @@ tree_ssa_prefetch_arrays (void) + set_builtin_decl (BUILT_IN_PREFETCH, decl, false); + } + +- FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) ++ enum li_flags LI = LI_FROM_INNERMOST; ++ ++ if (profile_exist (CACHE_MISSES)) ++ { ++ LI = LI_ONLY_INNERMOST; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Processing model %d:\n", LI); ++ } ++ ++ if (profile_exist (CACHE_MISSES)) ++ { ++ sort_ref_by_event_count (fun, CACHE_MISSES); ++ } ++ ++ FOR_EACH_LOOP (loop, LI) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Processing loop %d:\n", loop->num); ++ { ++ fprintf (dump_file, "======================================\n"); ++ fprintf (dump_file, "Processing loop %d:\n", loop->num); ++ fprintf (dump_file, "======================================\n"); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } + +- unrolled |= loop_prefetch_arrays (loop); ++ if (profile_exist (CACHE_MISSES)) ++ { ++ if (!is_high_exec_rate_loop (loop)) ++ { ++ continue; ++ } ++ } + ++ unrolled |= loop_prefetch_arrays (loop); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n\n"); + } +@@ -2049,6 +2688,56 @@ tree_ssa_prefetch_arrays (void) + return todo_flags; + } + ++/* Determine whether to analyze the function according to ++ the sorting of the function containing 
cache-miss counts. */ ++ ++static bool ++should_analyze_func_p (void) ++{ ++ gcov_type decl_uid = DECL_UID (current_function_decl); ++ struct rank_info func_rank_info = ++ event_get_func_rank (decl_uid, CACHE_MISSES); ++ if (func_rank_info.total == 0) ++ { ++ return false; ++ } ++ gcov_type func_count = event_get_func_count (decl_uid, CACHE_MISSES); ++ if (func_count == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %d cannot find profile data " ++ "and skip prefetch analysis\n", ++ decl_uid); ++ } ++ return false; ++ } ++ if (func_rank_info.rank > PREFETCH_FUNC_TOPN ++ || func_count < PREFETCH_FUNC_COUNTS_THRESHOLD) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %d total counts is %lu: " ++ "rank %d > topn %d, counts %lu < threshold %lu " ++ "skip prefetch analysis\n", ++ decl_uid, func_count, ++ func_rank_info.rank, PREFETCH_FUNC_TOPN, ++ func_count, PREFETCH_FUNC_COUNTS_THRESHOLD); ++ } ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %d total counts is %lu: " ++ "rank %d in topn %d, counts %lu > threshold %lu " ++ "continue prefetch analysis\n", ++ decl_uid, func_count, ++ func_rank_info.rank, PREFETCH_FUNC_TOPN, ++ func_count, PREFETCH_FUNC_COUNTS_THRESHOLD); ++ } ++ return true; ++} ++ + /* Prefetching. */ + + namespace { +@@ -2085,6 +2774,18 @@ pass_loop_prefetch::execute (function *fun) + if (number_of_loops (fun) <= 1) + return 0; + ++ /* Filter only when combined with cache-miss. When the should_analyze_func_p ++ analysis fails (for example, the function without cache-miss count), ++ in order to ensure the accuracy of the prefetch analysis, the function ++ does not perform native prefetch processing. 
*/ ++ if (profile_exist (CACHE_MISSES)) ++ { ++ if (!should_analyze_func_p ()) ++ { ++ return 0; ++ } ++ } ++ + if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0) + { + static bool warned = false; +@@ -2099,7 +2800,7 @@ pass_loop_prefetch::execute (function *fun) + return 0; + } + +- return tree_ssa_prefetch_arrays (); ++ return tree_ssa_prefetch_arrays (fun); + } + + } // anon namespace +-- +2.27.0.windows.1 + diff --git a/0028-AutoPrefetch-Handle-the-case-that-the-basic-block-br.patch b/0028-AutoPrefetch-Handle-the-case-that-the-basic-block-br.patch new file mode 100644 index 0000000000000000000000000000000000000000..48c2e52833aaa6ebdb5fdcc31666600ad58ef6cf --- /dev/null +++ b/0028-AutoPrefetch-Handle-the-case-that-the-basic-block-br.patch @@ -0,0 +1,151 @@ +From 3d20b13bc2e5af8d52e221a33881423e38c3dfdd Mon Sep 17 00:00:00 2001 +From: dingguangya +Date: Thu, 17 Feb 2022 21:53:31 +0800 +Subject: [PATCH 28/28] [AutoPrefetch] Handle the case that the basic block + branch probability is invalid + + When the node branch probability value is not initialized, + the branch probability must be set to 0 to ensure that + the calculation of the basic block execution probability + must be less than or equal to 100%. 
+--- + .../gcc.dg/autoprefetch/autoprefetch.exp | 27 +++++++++++++++++++ + .../autoprefetch/branch-weighted-prefetch.c | 22 +++++++++++++++ + .../autoprefetch/get-edge-prob-non-init.c | 24 +++++++++++++++++ + gcc/tree-ssa-loop-prefetch.c | 17 +++++++++++- + 4 files changed, 89 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.dg/autoprefetch/autoprefetch.exp + create mode 100644 gcc/testsuite/gcc.dg/autoprefetch/branch-weighted-prefetch.c + create mode 100644 gcc/testsuite/gcc.dg/autoprefetch/get-edge-prob-non-init.c + +diff --git a/gcc/testsuite/gcc.dg/autoprefetch/autoprefetch.exp b/gcc/testsuite/gcc.dg/autoprefetch/autoprefetch.exp +new file mode 100644 +index 000000000..a7408e338 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/autoprefetch/autoprefetch.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 1997-2022 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# <http://www.gnu.org/licenses/>. ++ ++load_lib gcc-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++ "" "-fprefetch-loop-arrays" ++ ++# All done. 
++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/autoprefetch/branch-weighted-prefetch.c b/gcc/testsuite/gcc.dg/autoprefetch/branch-weighted-prefetch.c +new file mode 100644 +index 000000000..c63c5e5cb +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/autoprefetch/branch-weighted-prefetch.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fprefetch-loop-arrays=2 --param min-insn-to-prefetch-ratio=5 --param simultaneous-prefetches=100 -fdump-tree-aprefetch-details -fdump-tree-optimized" } */ ++#define N 10000000 ++ ++long long a[N]; ++ ++long long func () ++{ ++ long long i; ++ long long sum = 0; ++ ++ for (i = 0; i < N; i+=1) { ++ if (i < 100000) ++ sum += a[i]; ++ else ++ continue; ++ } ++ ++ return sum; ++} ++/* { dg-final { scan-tree-dump-times "Ahead 40" 1 "aprefetch" } } */ ++/* { dg-final { scan-tree-dump-times "builtin_prefetch" 1 "optimized" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/autoprefetch/get-edge-prob-non-init.c b/gcc/testsuite/gcc.dg/autoprefetch/get-edge-prob-non-init.c +new file mode 100644 +index 000000000..f55481008 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/autoprefetch/get-edge-prob-non-init.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile } */ ++/* { dg-options "-Ofast -fprefetch-loop-arrays=2 -fdump-tree-aprefetch-details" } */ ++ ++int a, c, f; ++static int *b = &a; ++int *d; ++int e[0]; ++void g() { ++ int h; ++ for (;;) { ++ h = 1; ++ for (; h >= 0; h--) { ++ c = 2; ++ for (; c; c--) ++ if (e[0]) ++ if (e[c]) ++ *b = 0; ++ f || (*d = 0); ++ } ++ } ++} ++int main() {} ++ ++/* { dg-final } */ +diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c +index 3a5aef0fc..673f453a4 100644 +--- a/gcc/tree-ssa-loop-prefetch.c ++++ b/gcc/tree-ssa-loop-prefetch.c +@@ -2132,7 +2132,7 @@ get_edge_prob (edge e) + { + /* Limit the minimum probability value. 
*/ + const float MINNUM_PROB = 0.00001f; +- float fvalue = 1; ++ float fvalue = 0; + + profile_probability probability = e->probability; + if (probability.initialized_p ()) +@@ -2143,6 +2143,21 @@ get_edge_prob (edge e) + fvalue = MINNUM_PROB; + } + } ++ else ++ { ++ /* When the node branch probability value is not initialized, the branch ++ probability must be set to 0 to ensure that the calculation of the ++ basic block execution probability must be less than or equal to 100%. ++ i.e, ++ ... ++ [local count: 20000] ++ if (f_2 != 0) ++ goto ; [INV] ++ else ++ goto ; [100.00%] ++ ... */ ++ fvalue = 0; ++ } + return fvalue; + } + +-- +2.27.0.windows.1 + diff --git a/gcc.spec b/gcc.spec index f6e8676e1ba3b9bd79937e38c835c8c7781c083a..3e0b2617cde8a1cea90f09dfe594c006c8978afe 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,4 +1,4 @@ -%global DATE 20220105 +%global DATE 20220223 %global gcc_version 10.3.1 %global gcc_major 10.3.1 @@ -63,7 +63,7 @@ Summary: Various compilers (C, C++, Objective-C, ...) Name: gcc Version: %{gcc_version} -Release: %{DATE}.6 +Release: %{DATE}.7 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD URL: https://gcc.gnu.org @@ -142,6 +142,10 @@ Patch21: 0021-mcmodel-Bugfix-for-mcmodel-medium-on-x86.patch Patch22: 0022-StructReorderFields-Fix-pointer-layer-check-bug.patch Patch23: 0023-StructReorderFields-Add-pointer-offset-check.patch Patch24: 0024-StructReorderFields-Add-lto-and-whole-program-gate.patch +Patch25: 0025-AutoPrefetch-Support-cache-misses-profile.patch +Patch26: 0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch +Patch27: 0027-Autoprefetch-Support-auto-feedback-prefetch.patch +Patch28: 0028-AutoPrefetch-Handle-the-case-that-the-basic-block-br.patch %global gcc_target_platform %{_arch}-linux-gnu @@ -607,6 +611,10 @@ not stable, so plugins must be rebuilt any time GCC is updated. 
%patch22 -p1 %patch23 -p1 %patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 %build @@ -2569,6 +2577,12 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Feb 23 2022 benniaobufeijiushiji - 10.3.1-20220223.7 +- Type:Sync +- ID:NA +- SUG:NA +- DESC:Sync patch from openeuler/gcc + * Wed Jan 05 2022 eastb233 - 10.3.1-20220105.6 - Type:SPEC - ID:NA