From 6c64bc9c3b8c1e28218709f79195115b0e9fee38 Mon Sep 17 00:00:00 2001 From: chenyuanfeng Date: Wed, 29 May 2024 16:59:49 +0800 Subject: [PATCH 1/2] set default configuration for the ppc64le (cherry picked from commit 22381869b45430a230637cd255d913a02e2432f4) --- gcc.spec | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/gcc.spec b/gcc.spec index 4355aa3..80a045f 100644 --- a/gcc.spec +++ b/gcc.spec @@ -61,7 +61,7 @@ Summary: Various compilers (C, C++, Objective-C, ...) Name: gcc Version: %{gcc_version} -Release: 53 +Release: 54 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD URL: https://gcc.gnu.org @@ -1026,6 +1026,11 @@ CC="$CC" CFLAGS="$OPT_FLAGS" \ --with-arch=rv64g --with-abi=lp64d \ --disable-libquadmath --disable-multilib %endif +%ifarch ppc64le + --disable-multilib \ + --enable-targets=powerpcle-linux \ + --withc-cpu-32=power8 --with-tune-32=power8 --with-cpu-64=power8 --with-tune-64=power8 \ +%endif %ifarch sparc sparcv9 sparc64 make %{?_smp_mflags} BOOT_CFLAGS="$OPT_FLAGS" bootstrap @@ -2968,6 +2973,12 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Thu Mar 14 2024 chenyuanfeng - 10.3.1-54 +- Type: Spec +- ID:NA +- SUG:NA +- DESC: Set default configuration for the ppc64le + * Mon Apr 15 2024 huyubiao - 10.3.1-53 - Type:SPEC - ID:NA -- Gitee From 570e7eec588ca77f6bb92fbd8ab1d71e67a25ba9 Mon Sep 17 00:00:00 2001 From: yzyssdd Date: Wed, 29 May 2024 17:27:16 +0800 Subject: [PATCH 2/2] Add feedback llc allocate and support llc prefetch instruction (cherry picked from commit 80f60182971f6136efd0c2783ab2c39ed0c1239f) --- ...erence-between-source-and-patch-code.patch | 69 + ...ugfix-Fix-ambiguous-reference-due-to.patch | 640 +++ ...ected-filter_and_sort_kernels-in-Pha.patch | 3503 +++++++++++++++++ ...el-parameter-to-specify-the-last-lev.patch | 827 ++++ gcc.spec | 16 +- 5 files changed, 5054 insertions(+), 1 deletion(-) create mode 100644 0190-sync-LLC-difference-between-source-and-patch-code.patch create mode 100644 0191-LLC-Allocation-Bugfix-Fix-ambiguous-reference-due-to.patch create mode 100644 0192-Add-feedback-directed-filter_and_sort_kernels-in-Pha.patch create mode 100644 0193-Add-prefetch-level-parameter-to-specify-the-last-lev.patch diff --git a/0190-sync-LLC-difference-between-source-and-patch-code.patch b/0190-sync-LLC-difference-between-source-and-patch-code.patch new file mode 100644 index 0000000..e635931 --- /dev/null +++ b/0190-sync-LLC-difference-between-source-and-patch-code.patch @@ -0,0 +1,69 @@ +From 0f667a2f934023d2dd1636572f2dc8391334d7f8 Mon Sep 17 00:00:00 2001 +From: liuf9 +Date: Wed, 29 May 2024 20:14:02 +0800 +Subject: [PATCH] b + +--- + gcc/tree-ssa-llc-allocate.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +index 297790a..62b5f18 100644 +--- a/gcc/tree-ssa-llc-allocate.c ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -1527,7 +1527,7 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + + tree + get_cur_loop_niters (map > &loop_refs, +- class loop* loop) ++ class loop* loop) + { + if (loop_refs.count (loop) == 0) + return NULL_TREE; +@@ -1565,7 +1565,6 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + { + /* Trace the SSA that define this niter. 
*/ + def_stmt = SSA_NAME_DEF_STMT (niters); +- enum gimple_code stmt_code = gimple_code (def_stmt); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ssa_name of niters: "); +@@ -1575,7 +1574,8 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + fprintf (dump_file, "\n"); + } + /* Termination condition of dfs. Return the depth of the bb block. */ +- if (stmt_code == GIMPLE_PHI || stmt_code == GIMPLE_NOP) ++ if (gimple_code (def_stmt) == GIMPLE_PHI ++ || gimple_code (def_stmt) == GIMPLE_NOP) + { + basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); + if (def_bb == NULL || def_bb->loop_father == NULL) +@@ -1584,13 +1584,13 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Stop tracing the outer loop depth, "); +- fprintf (dump_file, "current depth: %d, current bb: %d\n", \ ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", + ret_depth, def_bb->index); + } + return ret_depth; + } + /* 'ASSIGN': Use dfs to trace the rhs of the assignment statement. */ +- else if (stmt_code == GIMPLE_ASSIGN) ++ else if (gimple_code (def_stmt) == GIMPLE_ASSIGN) + { + tree rhs = gimple_assign_rhs1 (def_stmt); + if (TREE_CODE (rhs) == TARGET_MEM_REF) +@@ -1605,8 +1605,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + /* 'ASSIGN': start from 1 because op[0] is the lhs. */ + for (unsigned i = 1; i < operand_num; i++) + { +- tree subtree = GIMPLE_CHECK2 +- (def_stmt)->op[i]; ++ tree subtree = dyn_cast(def_stmt)->op[i]; + if (subtree == NULL) + continue; + unsigned depth = trace_outer_loop_depth (subtree, \ +-- +2.33.0 + diff --git a/0191-LLC-Allocation-Bugfix-Fix-ambiguous-reference-due-to.patch b/0191-LLC-Allocation-Bugfix-Fix-ambiguous-reference-due-to.patch new file mode 100644 index 0000000..7d96389 --- /dev/null +++ b/0191-LLC-Allocation-Bugfix-Fix-ambiguous-reference-due-to.patch @@ -0,0 +1,640 @@ +From 5acce23a2d8412df874f78d0b703c9643d15ecc2 Mon Sep 17 00:00:00 2001 +From: liuf9 +Date: Tue, 27 Feb 2024 15:40:06 +0800 +Subject: [PATCH 1/4] [LLC Allocation][Bugfix] Fix ambiguous reference due to + namespace. + +--- + gcc/tree-ssa-llc-allocate.c | 189 +++++++++++++++++++----------------- + 1 file changed, 98 insertions(+), 91 deletions(-) + +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +index 62b5f18ad..0b4ad637d 100644 +--- a/gcc/tree-ssa-llc-allocate.c ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -79,8 +79,6 @@ const unsigned int WRITE_COST = 2; + + namespace { + +-using namespace std; +- + /* loop bound info of the memory reference located. */ + struct loop_bound + { +@@ -144,7 +142,7 @@ struct data_ref + tree step; + + /* loop boundary info of each dimension. */ +- vector loop_bounds; ++ std::vector loop_bounds; + + /* memory data size, Unit: MB. */ + double data_size; +@@ -191,7 +189,7 @@ struct data_ref + /* Add ref node and print. */ + + void +-add_ref (vector &references, tree op, gimple *stmt, ++add_ref (std::vector &references, tree op, gimple *stmt, + bool vectorize_p, bool read_p) + { + data_ref ref; +@@ -210,7 +208,7 @@ add_ref (vector &references, tree op, gimple *stmt, + /* Get the references from the simple call (vectorization type). 
*/ + + void +-get_references_in_gimple_call (gimple *stmt, vector &references) ++get_references_in_gimple_call (gimple *stmt, std::vector &references) + { + if (gimple_code (stmt) != GIMPLE_CALL) + return; +@@ -276,7 +274,7 @@ get_references_in_gimple_call (gimple *stmt, vector &references) + /* Stores the locations of memory references in STMT to REFERENCES. */ + + void +-get_references_in_stmt (gimple *stmt, vector &references) ++get_references_in_stmt (gimple *stmt, std::vector &references) + { + if (!gimple_vuse (stmt)) + return; +@@ -326,7 +324,7 @@ struct loop_filter_out_flag + + /* Check whether an external node is used. */ + +-bool use_ext_node_p (const vector &references, ++bool use_ext_node_p (const std::vector &references, + unsigned int &start) + { + expanded_location cfun_xloc +@@ -352,7 +350,7 @@ bool use_ext_node_p (const vector &references, + + bool + filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, +- const vector &references, unsigned int &start) ++ const std::vector &references, unsigned int &start) + { + expanded_location xloc = expand_location (stmt->location); + /* check use_ext_call. */ +@@ -431,7 +429,7 @@ dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) + /* Get references in loop. */ + + bool +-get_references_in_loop (vector &references, ++get_references_in_loop (std::vector &references, + loop_filter_out_flag &loop_filter, + class loop *loop) + { +@@ -501,7 +499,7 @@ estimate_loop_insns (class loop *loop, eni_weights *weights) + /* Check whether the memory access is dense. */ + + bool +-dense_memory_p (const vector &references, class loop *loop) ++dense_memory_p (const std::vector &references, class loop *loop) + { + int ref_count = references.size (); + unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); +@@ -550,11 +548,12 @@ dense_memory_p (const vector &references, class loop *loop) + /* Analyze the inner loop and get the loop with dense memory access. */ + + void +-analyze_loop_dense_memory (vector &kernels, +- map > &kernels_refs, +- class loop *loop) ++analyze_loop_dense_memory (std::vector &kernels, ++ std::map > &kernels_refs, ++ class loop *loop) + { +- vector references; ++ std::vector references; + number_of_latch_executions (loop); + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -589,8 +588,9 @@ analyze_loop_dense_memory (vector &kernels, + /* Analyze the inner loop and get the loop with dense memory access. */ + + bool +-get_dense_memory_kernels (vector &kernels, +- map > &kernels_refs) ++get_dense_memory_kernels (std::vector &kernels, ++ std::map > &kernels_refs) + { + if (dump_file) + fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); +@@ -631,7 +631,8 @@ generic_decl_p (tree expr) + Add different initial node based on different gimple statements. 
*/ + + void +-add_worklist (vector &worklist, set &walked, gimple *def_stmt) ++add_worklist (std::vector &worklist, std::set &walked, ++ gimple *def_stmt) + { + if (gimple_code (def_stmt) == GIMPLE_PHI) + { +@@ -715,8 +716,8 @@ add_worklist (vector &worklist, set &walked, gimple *def_stmt) + */ + + void +-trace_base_var_helper (tree arg, set &walked, +- map& base_var_candid) ++trace_base_var_helper (tree arg, std::set &walked, ++ std::map& base_var_candid) + { + if (arg == NULL) + return; +@@ -765,7 +766,7 @@ trace_base_var_helper (tree arg, set &walked, + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + +- vector worklist; ++ std::vector worklist; + add_worklist (worklist, walked, def_stmt); + for (unsigned i = 0; i < worklist.size (); ++i) + trace_base_var_helper (worklist[i], walked, base_var_candid); +@@ -781,9 +782,9 @@ trace_base_var_helper (tree arg, set &walked, + (e.g., criterion 2: 1 -> any odd number). */ + + bool +-trace_base_var (tree &var, tree arg, set &walked) ++trace_base_var (tree &var, tree arg, std::set &walked) + { +- map base_var_candid; ++ std::map base_var_candid; + trace_base_var_helper (arg, walked, base_var_candid); + bool is_tracing_unusual = false; + if (base_var_candid.size () == 1) +@@ -791,7 +792,7 @@ trace_base_var (tree &var, tree arg, set &walked) + else + { + is_tracing_unusual = true; +- for (const pair& base_var_count : base_var_candid) ++ for (const std::pair& base_var_count : base_var_candid) + if (base_var_count.second == 1) + var = base_var_count.first; + } +@@ -800,7 +801,7 @@ trace_base_var (tree &var, tree arg, set &walked) + fprintf (dump_file, "Traced variables at "); + print_generic_expr (dump_file, arg, TDF_SLIM); + fprintf (dump_file, ":\n"); +- for (const pair& base_var_count : base_var_candid) ++ for (const std::pair& base_var_count : base_var_candid) + fprintf (dump_file, "%s:%d, ", get_name (base_var_count.first), + base_var_count.second); + fprintf (dump_file, "\n"); +@@ -817,7 +818,7 @@ trace_base_var (tree &var, tree arg, set &walked) + /* Tracing direct memory reference information. */ + + bool +-trace_direct_mem_ref (data_ref &mem_ref, set &traced_ref_stmt) ++trace_direct_mem_ref (data_ref &mem_ref, std::set &traced_ref_stmt) + { + if (TREE_CODE (mem_ref.ref) != TARGET_MEM_REF) + return false; +@@ -829,7 +830,7 @@ trace_direct_mem_ref (data_ref &mem_ref, set &traced_ref_stmt) + mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); + mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); + +- set walked; ++ std::set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + return false; +@@ -843,7 +844,7 @@ trace_direct_mem_ref (data_ref &mem_ref, set &traced_ref_stmt) + If true, it is an indirect access. 
*/ + + bool +-trace_indirect_operand (tree arg, set &traced_ref_stmt) ++trace_indirect_operand (tree arg, std::set &traced_ref_stmt) + { + if (TREE_CODE (arg) != SSA_NAME) + return false; +@@ -889,7 +890,7 @@ trace_indirect_operand (tree arg, set &traced_ref_stmt) + + bool + trace_indirect_ptr (tree &base, tree &index, tree arg, +- set traced_ref_stmt) ++ std::set traced_ref_stmt) + { + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + +@@ -922,7 +923,7 @@ trace_indirect_ptr (tree &base, tree &index, tree arg, + + bool + trace_indirect_array (tree &base, tree &index, +- set traced_ref_stmt, tree ref) ++ std::set traced_ref_stmt, tree ref) + { + if (TREE_CODE (ref) != ARRAY_REF) + return false; +@@ -937,7 +938,7 @@ trace_indirect_array (tree &base, tree &index, + + bool + trace_indirect_mem_ref (data_ref &mem_ref, +- set &traced_ref_stmt) ++ std::set &traced_ref_stmt) + { + /* Processing of vectorization types. */ + if (mem_ref.vectorize_p) +@@ -947,7 +948,7 @@ trace_indirect_mem_ref (data_ref &mem_ref, + { + mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); + mem_ref.regular_p = false; +- set walked; ++ std::set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + return false; +@@ -983,7 +984,7 @@ trace_indirect_mem_ref (data_ref &mem_ref, + mem_ref.base = base; + mem_ref.index = index; + mem_ref.regular_p = false; +- set walked; ++ std::set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + return false; +@@ -1002,7 +1003,7 @@ trace_indirect_mem_ref (data_ref &mem_ref, + */ + + void +-trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) ++trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt) + { + enum tree_code ref_code = TREE_CODE (mem_ref.ref); + if (/* Vectorized and non-vectorized direct access. */ +@@ -1041,7 +1042,8 @@ trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) + /* Trace all references in the loop. */ + + void +-trace_loop_refs_info (vector &refs, set &traced_ref_stmt) ++trace_loop_refs_info (std::vector &refs, ++ std::set &traced_ref_stmt) + { + for (unsigned i = 0; i < refs.size (); ++i) + { +@@ -1058,9 +1060,9 @@ trace_loop_refs_info (vector &refs, set &traced_ref_stmt) + /* Tracing and sorting reference groups. */ + + void +-trace_data_refs_info (vector &kernels, +- map > &loop_refs, +- set &traced_ref_stmt) ++trace_data_refs_info (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt) + { + if (dump_file) + fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); +@@ -1140,7 +1142,8 @@ loop_bound_iv_p (tree t, tree &outer_loop_t) + /* add worklist and walked list. */ + + void +-add_worklist_walked (vector &worklist, set &walked, tree node) ++add_worklist_walked (std::vector &worklist, std::set &walked, ++ tree node) + { + if (!walked.count (node)) + { +@@ -1154,7 +1157,8 @@ add_worklist_walked (vector &worklist, set &walked, tree node) + /* check bound iv and add worklist. 
*/ + + void +-check_bound_iv_and_add_worklist (vector &worklist, set &walked, ++check_bound_iv_and_add_worklist (std::vector &worklist, ++ std::set &walked, + tree t, data_ref &mem_ref) + { + if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) +@@ -1216,9 +1220,9 @@ trace_loop_bound_iv (data_ref &mem_ref) + mem_ref.loop_bounds.push_back ( + loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); + +- vector worklist; ++ std::vector worklist; + worklist.push_back (mem_ref.base); +- set walked; ++ std::set walked; + + while (worklist.size ()) + { +@@ -1509,11 +1513,11 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); + + if (niters == NULL_TREE || niters == chrec_dont_know) +- mem_ref.calc_by = min (mem_ref.calc_by, UNHANDLE_CALC); ++ mem_ref.calc_by = std::min (mem_ref.calc_by, UNHANDLE_CALC); + else if (TREE_CODE (niters) != INTEGER_CST) +- mem_ref.calc_by = min (mem_ref.calc_by, RUNTIME_CALC); ++ mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); + else +- mem_ref.calc_by = min (mem_ref.calc_by, STATIC_CALC); ++ mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); + } + + if (mem_ref.calc_by == RUNTIME_CALC) +@@ -1526,12 +1530,12 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + Return NULL_TREE if not found. */ + + tree +-get_cur_loop_niters (map > &loop_refs, ++get_cur_loop_niters (std::map > &loop_refs, + class loop* loop) + { + if (loop_refs.count (loop) == 0) + return NULL_TREE; +- vector bounds = loop_refs[loop][0].loop_bounds; ++ std::vector bounds = loop_refs[loop][0].loop_bounds; + return bounds.size () ? bounds[0].niters : NULL_TREE; + } + +@@ -1575,7 +1579,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + } + /* Termination condition of dfs. Return the depth of the bb block. */ + if (gimple_code (def_stmt) == GIMPLE_PHI +- || gimple_code (def_stmt) == GIMPLE_NOP) ++ || gimple_code (def_stmt) == GIMPLE_NOP) + { + basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); + if (def_bb == NULL || def_bb->loop_father == NULL) +@@ -1610,7 +1614,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + continue; + unsigned depth = trace_outer_loop_depth (subtree, \ + start_depth); +- min_depth = MIN (min_depth, depth); ++ min_depth = std::min (min_depth, depth); + } + return min_depth; + } +@@ -1648,7 +1652,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + if (subtree == NULL) + continue; + unsigned depth = trace_outer_loop_depth (subtree, start_depth); +- min_depth = MIN (min_depth, depth); ++ min_depth = std::min (min_depth, depth); + } + return min_depth; + } +@@ -1668,7 +1672,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + /* Traces the ref dimension information in each loop. */ + + void +-analyze_loop_refs_dimension (vector &refs) ++analyze_loop_refs_dimension (std::vector &refs) + { + for (unsigned i = 0; i < refs.size (); ++i) + { +@@ -1689,9 +1693,10 @@ analyze_loop_refs_dimension (vector &refs) + */ + + bool +-analyze_nested_kernels (vector &kernels, +- map > &loop_refs, +- set &traced_ref_stmt) ++analyze_nested_kernels (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt) + { + if (dump_file) + fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); +@@ -1840,7 +1845,7 @@ next_high_probability_bb (basic_block bb) + /* Dump loop header bb. 
*/ + + void +-dump_loop_headers (const char *name, vector &loops) ++dump_loop_headers (const char *name, std::vector &loops) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -1855,15 +1860,15 @@ dump_loop_headers (const char *name, vector &loops) + /* Combine and sort candidate loops. */ + + bool +-filter_and_sort_kernels (vector &sorted_kernels, +- vector &kernels) ++filter_and_sort_kernels (std::vector &sorted_kernels, ++ std::vector &kernels) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); + +- set end_bb; +- list walked_header_bb; /* Used to record nested loops. */ +- set walked_non_header_bb_idx; ++ std::set end_bb; ++ std::list walked_header_bb; /* Used to record nested loops. */ ++ std::set walked_non_header_bb_idx; + + for (unsigned i = 0; i < kernels.size (); ++i) + { +@@ -1875,7 +1880,7 @@ filter_and_sort_kernels (vector &sorted_kernels, + + if (!param_filter_kernels) + { +- for (vector::iterator it = kernels.begin (); ++ for (std::vector::iterator it = kernels.begin (); + it != kernels.end (); ++it) + sorted_kernels.push_back (*it); + } +@@ -1985,10 +1990,10 @@ struct ref_group + 110: read, regular, non-parallel + 111: read, regular, parallel + */ +- map > ref_use; ++ std::map > ref_use; + + /* scores for different memory references. */ +- vector ref_scores; ++ std::vector ref_scores; + + ref_group () + { +@@ -2003,10 +2008,10 @@ struct ref_group + /* calculate reuse level. */ + + unsigned int +-calculate_reuse_level (map > &var_use) ++calculate_reuse_level (std::map > &var_use) + { + unsigned int level = 0; +- for (map >::iterator it = var_use.begin (); ++ for (std::map >::iterator it = var_use.begin (); + it != var_use.end (); ++it) + { + unsigned int parallel = 1; +@@ -2043,13 +2048,13 @@ ref_group_reuse_cmp (const ref_group &a, const ref_group &b) + /* Sort reference groups. */ + + void +-sort_ref_groups (vector &ref_groups, +- map &ref_groups_map) ++sort_ref_groups (std::vector &ref_groups, ++ std::map &ref_groups_map) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); + +- for (map::iterator it = ref_groups_map.begin (); ++ for (std::map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { + (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use); +@@ -2062,7 +2067,7 @@ sort_ref_groups (vector &ref_groups, + } + } + +- sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ std::sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -2111,7 +2116,7 @@ enum data_attribute + If the reference group is not found, create a group. */ + + void +-record_mem_ref (map &ref_groups, data_ref &mem_ref) ++record_mem_ref (std::map &ref_groups, data_ref &mem_ref) + { + unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) + + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); +@@ -2127,9 +2132,9 @@ record_mem_ref (map &ref_groups, data_ref &mem_ref) + /* Ref_groups' calc_by depends on the inserted mem_ref's calc_by. + Runtime issue requires the specified mem_ref's calc_by to be >= 1. + Temporarily modified ref_group's first_use after sorting mem_refs. 
*/ +- ref_groups[mem_ref.var].calc_by = max (ref_groups[mem_ref.var].calc_by, ++ ref_groups[mem_ref.var].calc_by = std::max (ref_groups[mem_ref.var].calc_by, + mem_ref.calc_by); +- ref_groups[mem_ref.var].var_size = max (ref_groups[mem_ref.var].var_size, ++ ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, + mem_ref.data_size); + ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); + +@@ -2182,15 +2187,16 @@ data_ref_reuse_cmp (const ref_score &a, const ref_score &b) + order of the customized sorting scheme. */ + + void +-sort_mem_ref_in_ref_group (map &ref_groups_map) ++sort_mem_ref_in_ref_group (std::map &ref_groups_map) + { + if (dump_file) + fprintf (dump_file, "\nsorted data_references:\n"); +- for (map::iterator it = ref_groups_map.begin (); ++ for (std::map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { +- vector &ref_scores = (*it).second.ref_scores; +- stable_sort (ref_scores.begin (), ref_scores.end (), data_ref_reuse_cmp); ++ std::vector &ref_scores = (*it).second.ref_scores; ++ std::stable_sort (ref_scores.begin (), ref_scores.end (), ++ data_ref_reuse_cmp); + /* Update ref_group's first_use and calc_by with the first mem_ref after + sorting. */ + (*it).second.first_use = (*it).second.ref_scores[0].d_ref; +@@ -2214,14 +2220,15 @@ sort_mem_ref_in_ref_group (map &ref_groups_map) + /* Tracing and sorting reference groups. */ + + bool +-record_and_sort_ref_groups (vector &ref_groups, +- vector &kernels, +- map > &loop_refs) ++record_and_sort_ref_groups (std::vector &ref_groups, ++ std::vector &kernels, ++ std::map > &loop_refs) + { + if (dump_file) + fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); + +- map ref_groups_map; ++ std::map ref_groups_map; + + for (unsigned i = 0; i < kernels.size (); ++i) + { +@@ -2395,7 +2402,7 @@ issue_builtin_prefetch (data_ref &mem_ref) + determination of the ARM SVE architecture before SVE hint insertion. */ + + void +-static_issue (vector &ref_groups, int num_issue_var) ++static_issue (std::vector &ref_groups, int num_issue_var) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "static issue\n"); +@@ -2425,8 +2432,8 @@ static_issue (vector &ref_groups, int num_issue_var) + a COND_EXPR. */ + + tree +-calc_stmts_gen (vector &ref_groups, gimple_seq &cond_expr_stmt_list, +- int num_issue_var) ++calc_stmts_gen (std::vector &ref_groups, ++ gimple_seq &cond_expr_stmt_list, int num_issue_var) + { + /* Accumulated keep size. */ + tree total_size = build_real_from_int_cst +@@ -2483,7 +2490,7 @@ calc_stmts_gen (vector &ref_groups, gimple_seq &cond_expr_stmt_list, + /* Runtime form insertion and issue instruction. */ + + void +-runtime_issue (vector &ref_groups, int num_issue_var) ++runtime_issue (std::vector &ref_groups, int num_issue_var) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "runtime issue\n"); +@@ -2547,7 +2554,7 @@ runtime_issue (vector &ref_groups, int num_issue_var) + /* Issue llc hints through prefetch instructions. 
*/ + + void +-issue_llc_hint (vector &ref_groups) ++issue_llc_hint (std::vector &ref_groups) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "issue_llc_hint:\n"); +@@ -2567,7 +2574,7 @@ issue_llc_hint (vector &ref_groups) + if (ref_groups.size () == 0) + return; + +- int num_issue_var = min (param_issue_topn, ++ int num_issue_var = std::min (param_issue_topn, + static_cast(ref_groups.size ())); + if (num_issue_var < param_issue_topn + && dump_file && (dump_flags & TDF_DETAILS)) +@@ -2583,7 +2590,7 @@ issue_llc_hint (vector &ref_groups) + } + calc_type topn_calc_type = STATIC_CALC; + for (int i = 0; i < num_issue_var; ++i) +- topn_calc_type = min (topn_calc_type, ref_groups[i].calc_by); ++ topn_calc_type = std::min (topn_calc_type, ref_groups[i].calc_by); + + if (topn_calc_type == STATIC_CALC) + { +@@ -2616,22 +2623,22 @@ issue_llc_hint (vector &ref_groups) + void + llc_allocate (void) + { +- map > kernels_refs; +- vector kernels; ++ std::map > kernels_refs; ++ std::vector kernels; + if (!get_dense_memory_kernels (kernels, kernels_refs)) + return; + +- set traced_ref_stmt; ++ std::set traced_ref_stmt; + trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt); + + if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt)) + return; + +- vector sorted_kernels; ++ std::vector sorted_kernels; + if (!filter_and_sort_kernels (sorted_kernels, kernels)) + return; + +- vector ref_groups; ++ std::vector ref_groups; + if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs)) + return; + +-- +2.33.0 + diff --git a/0192-Add-feedback-directed-filter_and_sort_kernels-in-Pha.patch b/0192-Add-feedback-directed-filter_and_sort_kernels-in-Pha.patch new file mode 100644 index 0000000..93a67d0 --- /dev/null +++ b/0192-Add-feedback-directed-filter_and_sort_kernels-in-Pha.patch @@ -0,0 +1,3503 @@ +From 39324addcef7e87170b9377da55d9cc03dc8d314 Mon Sep 17 00:00:00 2001 +From: liuf9 +Date: Tue, 28 May 2024 22:55:55 +0800 +Subject: [PATCH 1/2] Add feedback-directed filter_and_sort_kernels in Phase 4. + Update ref_group and mem_ref index ranking criteria. Add calculation and + transference schemes for variable footprint. Extend direct and indirect + memory access tracing. Refactor reference base info tracing process in Phase + 2. Add function filtering for LLC pass. Add function filtering for LLC pass. 
+ +--- + gcc/auto-profile.c | 65 +- + gcc/auto-profile.h | 1 + + gcc/common.opt | 10 + + gcc/opts.c | 10 + + gcc/params.opt | 16 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 16 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 8 +- + .../llc-allocate/llc-cross-bb-indir-mem-acc.c | 36 + + .../llc-feedback-branch-in-loop.c | 39 + + .../llc-allocate/llc-feedback-break-in-loop.c | 41 + + .../llc-allocate/llc-feedback-goto-in-loop.c | 50 + + .../llc-feedback-same-loop-cycle.c | 129 ++ + .../gcc.dg/llc-allocate/llc-ref-trace.c | 8 +- + .../gcc.dg/llc-allocate/llc-same-loop-cycle.c | 2 +- + .../gfortran.dg/llc-allocate/llc-3.f90 | 22 +- + gcc/tree-ssa-llc-allocate.c | 2000 +++++++++++++---- + 16 files changed, 2041 insertions(+), 412 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-break-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-goto-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-same-loop-cycle.c + +diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c +index f221978fc..8170702b3 100644 +--- a/gcc/auto-profile.c ++++ b/gcc/auto-profile.c +@@ -98,6 +98,7 @@ along with GCC; see the file COPYING3. If not see + + #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo" + #define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov" ++#define DEFAULT_ADDITIONAL_PROFILE_FILE "addldata.gcov" + #define AUTO_PROFILE_VERSION 1 + + namespace autofdo +@@ -322,6 +323,9 @@ public: + /* Mark LOC as annotated. */ + void mark_annotated (location_t loc); + ++ /* Compute total count threshold of top functions in sampled data. */ ++ void get_topn_function_total_count_thres (unsigned topn) const; ++ + private: + /* Map from function_instance name index (in string_table) to + function_instance. 
*/ +@@ -354,18 +358,30 @@ static gcov_summary *afdo_profile_info; + static bool + get_all_profile_names (const char **event_files) + { +- if (!(flag_auto_profile || flag_cache_misses_profile)) ++ if (!(flag_auto_profile ++ || (flag_cache_misses_profile || flag_additional_profile))) + { + return false; + } + + event_files[INST_EXEC] = auto_profile_file; + +- if (cache_misses_profile_file == NULL) ++ if (flag_cache_misses_profile) ++ { ++ if (cache_misses_profile_file == NULL) ++ { ++ cache_misses_profile_file = DEFAULT_CACHE_MISSES_PROFILE_FILE; ++ } ++ event_files[CACHE_MISSES] = cache_misses_profile_file; ++ } ++ else if (flag_additional_profile) + { +- cache_misses_profile_file = DEFAULT_CACHE_MISSES_PROFILE_FILE; ++ if (additional_profile_file == NULL) ++ { ++ additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; ++ } ++ event_files[PMU_EVENT] = additional_profile_file; + } +- event_files[CACHE_MISSES] = cache_misses_profile_file; + + return true; + } +@@ -432,6 +448,9 @@ extend_auto_profile::auto_profile_exist (enum event_type type) + case CACHE_MISSES: + return event_func_map.count (CACHE_MISSES) != 0 + || event_loc_map.count (CACHE_MISSES) != 0; ++ case PMU_EVENT: ++ return event_func_map.count (PMU_EVENT) != 0 ++ || event_loc_map.count (PMU_EVENT) != 0; + default: + return false; + } +@@ -450,6 +469,9 @@ extend_auto_profile::dump_event () + case CACHE_MISSES: + fprintf (dump_file, "Processing event cache misses.\n"); + break; ++ case PMU_EVENT: ++ fprintf (dump_file, "Processing other PMU events.\n"); ++ break; + default: + break; + } +@@ -694,7 +716,7 @@ string_table::get_index (const char *name) const + return iter->second; + } + +-/* Return the index of a given function DECL. Return -1 if DECL is not ++/* Return the index of a given function DECL. Return -1 if DECL is not + found in string table. */ + + int +@@ -1128,6 +1150,31 @@ autofdo_source_profile::get_function_instance_by_inline_stack ( + return s; + } + ++/* Compute total count threshold of top functions in sampled data. */ ++ ++void ++autofdo_source_profile::get_topn_function_total_count_thres ( ++ unsigned topn) const ++{ ++ std::set func_counts; ++ for (name_function_instance_map::const_iterator iter = map_.begin (); ++ iter != map_.end (); ++iter) ++ { ++ if (func_counts.size () < topn) ++ func_counts.insert (iter->second->total_count ()); ++ else if (*func_counts.begin () < iter->second->total_count ()) ++ { ++ func_counts.erase (func_counts.begin ()); ++ func_counts.insert (iter->second->total_count ()); ++ } ++ } ++ ++ gcov_type func_counts_topn = *func_counts.begin (); ++ if (func_counts.size () == topn ++ && param_llc_allocate_func_counts_threshold < func_counts_topn) ++ param_llc_allocate_func_counts_threshold = func_counts_topn; ++} ++ + /* Module profile is only used by LIPO. Here we simply ignore it. */ + + static void +@@ -1189,6 +1236,12 @@ read_profile (void) + return; + } + ++ if (param_llc_allocate_func_topn > 0) ++ { ++ afdo_source_profile->get_topn_function_total_count_thres ( ++ param_llc_allocate_func_topn); ++ } ++ + /* autofdo_module_profile. 
*/ + fake_read_autofdo_module_profile (); + } +@@ -2327,4 +2380,4 @@ simple_ipa_opt_pass * + make_pass_ipa_extend_auto_profile (gcc::context *ctxt) + { + return new pass_ipa_extend_auto_profile (ctxt); +-} +\ No newline at end of file ++} +diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h +index 230d7e68a..254eb19ad 100644 +--- a/gcc/auto-profile.h ++++ b/gcc/auto-profile.h +@@ -25,6 +25,7 @@ enum event_type + { + INST_EXEC = 0, + CACHE_MISSES, ++ PMU_EVENT, + EVENT_NUMBER + }; + +diff --git a/gcc/common.opt b/gcc/common.opt +index 73234dcc3..ea55355be 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1097,6 +1097,16 @@ Common Joined RejectNegative Var(cache_misses_profile_file) + Use sample profile information for source code cache miss count. The profile + file is specified in the argument. + ++fadditional-profile ++Common Report Var(flag_additional_profile) ++Use additional PMU-event sample profile information for source code bb count. ++The default profile file is addldata.gcov in `pwd`. ++ ++fadditional-profile= ++Common Joined RejectNegative Var(additional_profile_file) ++Use additional PMU-event sample profile information for source code bb count. ++The profile file is specified in the argument. ++ + ; -fcheck-bounds causes gcc to generate array bounds checks. + ; For C, C++ and ObjC: defaults off. + ; For Java: defaults to on. +diff --git a/gcc/opts.c b/gcc/opts.c +index f12b13599..30ac57eec 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -2695,6 +2695,16 @@ common_handle_option (struct gcc_options *opts, + } + break; + ++ case OPT_fadditional_profile_: ++ opts->x_additional_profile_file = xstrdup (arg); ++ opts->x_flag_additional_profile = true; ++ value = true; ++ /* No break here - do -fadditional-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fadditional_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; ++ break; ++ + case OPT_fipa_struct_reorg_: + /* No break here - do -fipa-struct-reorg processing. */ + /* FALLTHRU. */ +diff --git a/gcc/params.opt b/gcc/params.opt +index 76ae925fd..0c9a270b4 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1059,5 +1059,21 @@ Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Par + Maximum number of outer loops allowed to extend outer loops for loops that + cannot recognize inner loop boundaries. + ++-param=filter-mode= ++Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param ++Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. ++ ++-param=transfer-footprint= ++Common Joined UInteger Var(param_transfer_footprint) Init(1) IntegerRange(0, 1) Param ++Allow transferring the firstly calculated footprint expression to the target memory reference ++from which it is impossible to retrieve the foortprint. ++ ++-param=llc-allocate-func-topn= ++Common Joined UInteger Var(param_llc_allocate_func_topn) Init(0) Param Optimization ++TopN functions of pmu counts to be analyzed in LLC allocation. ++ ++-param=llc-allocate-func-counts-threshold= ++Common Joined UInteger Var(param_llc_allocate_func_counts_threshold) Init(1) Param Optimization ++Threshold functions of pmu counts to be analyzed in LLC allocation. + + ; This comment is to ensure we retain the blank line above. 
+diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +index a4828eaab..8990f0a1c 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { aarch64*-*-linux* } } } */ +-/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2" } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2 --param filter-mode=0" } */ + + #include + +@@ -49,13 +49,13 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){3}\}" 1 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times ", size: (?!(0\.000000))" 7 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times ", size: 0\.000000" 19 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d ApsiPtr \\(1.003952, 5, 0\\) : 17" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d psiPtr \\(1.003952, 3, 0\\) : 8" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d diagPtr \\(1.003952, 1, 0\\) : 2" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d lowerPtr \\(2.933319, 1, 0\\) : 2" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d upperPtr \\(2.933319, 1, 0\\) : 2" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d lPtr \\(1.466660, 1, 0\\) : 2" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump "\\d uPtr \\(1.466660, 1, 0\\) : 2" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tApsiPtr\\t\\(1.003952, 1, 5, 0\\)" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tpsiPtr\\t\\(1.003952, 1, 3, 0\\)" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tdiagPtr\\t\\(1.003952, 1, 1, 0\\)" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tlowerPtr\\t\\(2.933319, 1, 1, 0\\)" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tupperPtr\\t\\(2.933319, 1, 1, 0\\)" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tlPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "\\d\\tuPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "insert svprfd" 4 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +index f8b1cc5c1..16a56ae03 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { aarch64*-*-linux* } } } */ +-/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param force-issue=1" } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param force-issue=1 --param filter-mode=0" } */ + + #include + +@@ -46,9 +46,9 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ +-/* { dg-final { 
scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tx_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_j\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c +new file mode 100644 +index 000000000..18122c291 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c --param=mem-access-ratio=1 --param=mem-access-num=0" } */ ++ ++/* In this deja test case, we test how Phase 2 & 3 of llc-allocate pass deals ++ with an indirect memory access in a nested loop where the use-block for the ++ induction variable of this memory access is a child/descendent of its ++ def-block (we make it by defining the induction variable in the outer loop). ++ Therefore, the reference can be successfully traced after outer-loop ++ analysis. */ ++#include ++#include ++ ++void cross_bb_indir_mem_acc (int *arr1, int *arr2, int *arr3, int *arr4, int n) { ++ srand (time (NULL)); ++ ++ int j_s; ++ int j_e = arr1[0]; ++ int k; ++ ++ for (int i = 0; i < n; i++) ++ { ++ j_s = j_e; ++ j_e = arr1[i + 1]; ++ ++ k = arr3[i]; ++ ++ for (int j = j_s; j < j_e; j++) ++ { ++ arr4[j] -= arr2[k]; ++ } ++ ++ } ++} ++ ++/* { dg-final { scan-tree-dump "Unhandled indirect memory access tracing." "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "Retrace indirect memory access after outer loop analysis:" "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c +new file mode 100644 +index 000000000..2d541d887 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c +@@ -0,0 +1,39 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c" } */ ++ ++/* In this deja test case, we test how Phase 4 of llc-allocate pass deals with ++ loop that contains a branching. 
*/ ++#include ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++branch_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ ApsiPtr[cell] = 0; ++ else ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 100; ++ ++ for (int i=0; i ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++break_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ break; ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 2; ++ ++ for (int i=0; i ++ ++#define N 131 ++ ++double diagPtr[N]; ++int psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++goto_in_loop (double *diagPtr, int *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cellnodes; ++ while (v > 1) ++ { ++ basic_block bb = di->dfs_to_bb[v]; ++ edge e; ++ ++ par = di->dfs_parent[v]; ++ k = v; ++ ++ ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); ++ ++ if (reverse) ++ { ++ /* If this block has a fake edge to exit, process that first. */ ++ if (bitmap_bit_p (di->fake_exit_edge, bb->index)) ++ { ++ einext = ei; ++ einext.index = 0; ++ goto do_fake_exit_edge; ++ } ++ } ++ ++ /* Search all direct predecessors for the smallest node with a path ++ to them. That way we have the smallest node with also a path to ++ us only over nodes behind us. In effect we search for our ++ semidominator. */ ++ while (!ei_end_p (ei)) ++ { ++ basic_block b; ++ TBB k1; ++ ++ e = ei_edge (ei); ++ b = (reverse) ? e->dest : e->src; ++ einext = ei; ++ ei_next (&einext); ++ ++ if (b == en_block) ++ { ++ do_fake_exit_edge: ++ k1 = di->dfs_order[last_basic_block]; ++ } ++ else ++ k1 = di->dfs_order[b->index]; ++ ++ /* Call eval() only if really needed. If k1 is above V in DFS tree, ++ then we know, that eval(k1) == k1 and key[k1] == k1. */ ++ if (k1 > v) ++ k1 = di->key[eval (di, k1)]; ++ if (k1 < k) ++ k = k1; ++ ++ ei = einext; ++ } ++ ++ di->key[v] = k; ++ link_roots (di, par, v); ++ di->next_bucket[v] = di->bucket[k]; ++ di->bucket[k] = v; ++ ++ /* Transform semidominators into dominators. */ ++ for (w = di->bucket[par]; w; w = di->next_bucket[w]) ++ { ++ k = eval (di, w); ++ if (di->key[k] < di->key[w]) ++ di->dom[w] = k; ++ else ++ di->dom[w] = par; ++ } ++ /* We don't need to cleanup next_bucket[]. */ ++ di->bucket[par] = 0; ++ v--; ++ } ++ ++ /* Explicitly define the dominators. 
*/ ++ di->dom[1] = 0; ++ for (v = 2; v <= di->nodes; v++) ++ if (di->dom[v] != di->key[v]) ++ di->dom[v] = di->dom[di->dom[v]]; ++} ++ ++/* { dg-final { scan-tree-dump-times "Warning: Find cycle at bb index" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "static issue" "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +index 27cd574cf..be5bac228 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -24,11 +24,11 @@ referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) + sum += psiPtr[b[cell]]; + psiPtr[a[cell]] = sum; + +- // Multi-layer array, currently failed tracing at b[cell] and a[cell] ++ // Multi-layer array + sum += a[b[cell]]; + c[a[cell]] = sum; + +- // Outer array, inner pointer, currently failed tracing at lPtr[cell] ++ // Outer array, inner pointer + sum += a[lPtr[cell]]; + c[lPtr[cell]] = sum; + } +@@ -57,6 +57,6 @@ main (int argc, char *argv[]) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "Tracing succeeded" 16 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "Tracing failed" 8 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c +index ba5b5b0c8..551bc9897 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { aarch64*-*-linux* } } } */ +-/* { dg-options "-O3 -fwhole-program -flto-partition=one -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=1 --param=branch-prob-threshold=50 -c -w" } */ ++/* { dg-options "-O3 -fwhole-program -flto-partition=one -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=1 --param=branch-prob-threshold=50 -c -w --param=filter-mode=0" } */ + + typedef unsigned long size_t; + typedef long scalar_t__; +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +index ec918e144..e1df4ef7c 100644 +--- a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -1,5 +1,5 @@ + ! { dg-do compile { target { aarch64*-*-linux* } } } +-! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50" } ++! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-mode=0" } + + program main + +@@ -198,16 +198,16 @@ END SUBROUTINE calc_p_rho + ! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } + ! { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } + ! { dg-final { scan-tree-dump-times ", size: 0\.000000" 28 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d p \\(0.000000, 3, 0\\) : 8" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d pm1 \\(0.000000, 2, 0\\) : 5" 2 "llc_allocate" } } +-! 
{ dg-final { scan-tree-dump-times "\\d ph \\(0.000000, 2, 0\\) : 4" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d al \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d alt \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d t_1 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d t_2 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d c2a \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d mu \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "\\d muts \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 3, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tpm1\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tph\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tal\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\talt\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tmu\\t\\(0.000000, 2, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tmuts\\t\\(0.000000, 2, 1, 0\\)" 2 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +index 0b4ad637d..107d5da26 100644 +--- a/gcc/tree-ssa-llc-allocate.c ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -57,15 +57,19 @@ along with GCC; see the file COPYING3. If not see + #include "internal-fn.h" + #include "tree-cfg.h" + #include "profile-count.h" ++#include "auto-profile.h" + + /* Number of parallel cores. */ + const unsigned int PARALLEL_NUM = 288; + + /* Indirect access weight. */ +-const unsigned int INDIRECT_ACCESS_VALUE = 2; ++const unsigned int INDIRECT_ACCESS_VALUE = 3; + + /* Write memory weight. */ +-const unsigned int WRITE_COST = 2; ++const unsigned int WRITE_COST = 4; ++ ++/* Maximum ratio of total prefetch data size to cache size. */ ++const double PREFETCH_CACHE_SIZE_RATIO = 0.8; + + /* Prefetch tool input max length. */ + #ifndef PREFETCH_TOOL_INPUT_MAX_LEN +@@ -77,6 +81,14 @@ const unsigned int WRITE_COST = 2; + #define PREFETCH_TOOL_NUM_MAX_LEN 9 + #endif + ++#ifndef PREFETCH_FUNC_TOPN ++#define PREFETCH_FUNC_TOPN param_llc_allocate_func_topn ++#endif ++ ++#ifndef PREFETCH_FUNC_COUNTS_THRESHOLD ++#define PREFETCH_FUNC_COUNTS_THRESHOLD param_llc_allocate_func_counts_threshold ++#endif ++ + namespace { + + /* loop bound info of the memory reference located. */ +@@ -165,6 +177,15 @@ struct data_ref + /* True if the memory reference is read. */ + unsigned int read_p : 1; + ++ /* loop father depth. */ ++ unsigned int loop_depth; ++ ++ /* bb index. */ ++ int bb_idx; ++ ++ /* loop index. 
*/ ++ int loop_idx; ++ + data_ref () + { + ref = NULL_TREE; +@@ -181,6 +202,9 @@ struct data_ref + parallel_p = false; + regular_p = true; + read_p = true; ++ loop_depth = 0; ++ bb_idx = 0; ++ loop_idx = 0; + } + }; + +@@ -197,6 +221,9 @@ add_ref (std::vector &references, tree op, gimple *stmt, + ref.stmt = stmt; + ref.vectorize_p = vectorize_p; + ref.read_p = read_p; ++ ref.loop_depth = loop_depth (stmt->bb->loop_father); ++ ref.bb_idx = stmt->bb->index; ++ ref.loop_idx = stmt->bb->loop_father->num; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, ref.ref, TDF_LINENO); +@@ -271,6 +298,51 @@ get_references_in_gimple_call (gimple *stmt, std::vector &references) + } + } + ++/* Check whether memory reference is located exactly in main function. ++ There are some other unexpected scenarios where mem ref or function is ++ tracing failed without loc info (newly generated gimple/function). */ ++ ++bool ++is_reference_in_main_p (gimple *stmt) ++{ ++ expanded_location xloc = expand_location (stmt->location); ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ if (strstr (fn_name, "main") != NULL || strstr (fn_name, "MAIN") != NULL) ++ { ++ /* NEXT STEP: Check why some functions have no end_locus. */ ++ if (!(DECL_SOURCE_LOCATION (current_function_decl) ++ && cfun->function_end_locus)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Cannot find function start-end location.\n"); ++ return true; ++ } ++ else if (!(xloc.file && xloc.line)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Cannot find gimple statement location.\n"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ return false; ++ } ++ int fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = expand_location (cfun->function_end_locus).line; ++ ++ if (xloc.line >= fn_start && xloc.line <= fn_end) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Memory access in main function: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ return true; ++ } ++ } ++ return false; ++} ++ + /* Stores the locations of memory references in STMT to REFERENCES. */ + + void +@@ -285,6 +357,12 @@ get_references_in_stmt (gimple *stmt, std::vector &references) + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + ++ /* Filter out memory references located in main function. This is a ++ experimental filtering scheme ONLY for HPC case verification as ++ some HPC cases assign values for variables (mem ref) in main function. */ ++ if (is_reference_in_main_p (stmt)) ++ return; ++ + if (gimple_code (stmt) == GIMPLE_ASSIGN) + { + tree op0 = gimple_assign_lhs (stmt); +@@ -350,7 +428,8 @@ bool use_ext_node_p (const std::vector &references, + + bool + filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, +- const std::vector &references, unsigned int &start) ++ const std::vector &references, ++ unsigned int &start) + { + expanded_location xloc = expand_location (stmt->location); + /* check use_ext_call. */ +@@ -369,7 +448,7 @@ filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + if (xloc.file && xloc.column != 1) + loop_filter.use_macro_loop = false; + +- /* checke use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */ ++ /* check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. 
*/ + if (gimple_code (stmt) == GIMPLE_ASSIGN) + { + enum tree_code rhs_code = gimple_assign_rhs_code (stmt); +@@ -549,16 +628,16 @@ dense_memory_p (const std::vector &references, class loop *loop) + + void + analyze_loop_dense_memory (std::vector &kernels, +- std::map > &kernels_refs, +- class loop *loop) ++ std::map > &kernels_refs, ++ class loop *loop) + { + std::vector references; + number_of_latch_executions (loop); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\n========== Processing loop %d: ==========\n", +- loop->num); ++ loop->num); + loop_dump (dump_file, loop); + flow_loop_dump (loop, dump_file, NULL, 1); + fprintf (dump_file, "loop unroll: %d\n", loop->unroll); +@@ -567,7 +646,7 @@ analyze_loop_dense_memory (std::vector &kernels, + if (get_loop_exit_edges (loop).length () != 1) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "non-dense mem access: loop_branching\n"); ++ fprintf (dump_file, "non-dense mem access: loop_multiple_exits\n"); + return; + } + +@@ -675,6 +754,15 @@ add_worklist (std::vector &worklist, std::set &walked, + walked.insert (node); + } + } ++ else if (rhs_code == TARGET_MEM_REF || rhs_code == MEM_REF) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "possibly unnested indirect memory access: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } + else + { + /* unhandled assign rhs_code: _219 = _17 * _70; +@@ -722,6 +810,16 @@ trace_base_var_helper (tree arg, std::set &walked, + if (arg == NULL) + return; + ++ /* Var_decl type: base address extracted from ARRAY_REF */ ++ if (TREE_CODE (TREE_TYPE (arg)) == ARRAY_TYPE && TREE_CODE (arg) == VAR_DECL ++ && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "var_decl type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ + /* Array type. */ + tree op0 = NULL; + if (TREE_CODE (arg) == ADDR_EXPR +@@ -774,12 +872,12 @@ trace_base_var_helper (tree arg, std::set &walked, + + /* Identify the base variable traced from base address of memory reference. + We recognize that current method could detect several base variable +- candidates and the temporary criteria for base variable determination +- is that either one of the following statement is true: +- 1. The number of base variable candidates is 1; +- 2. The number of detected gimple statements for some variable is 1. +- We may use other criteria or relax the current criteria +- (e.g., criterion 2: 1 -> any odd number). */ ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1) The number of base variable candidates is 1; ++ 2) The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ + + bool + trace_base_var (tree &var, tree arg, std::set &walked) +@@ -792,96 +890,107 @@ trace_base_var (tree &var, tree arg, std::set &walked) + else + { + is_tracing_unusual = true; +- for (const std::pair& base_var_count : base_var_candid) +- if (base_var_count.second == 1) +- var = base_var_count.first; ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ var = it->second == 1 ? 
it->first : var; + } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Traced variables at "); + print_generic_expr (dump_file, arg, TDF_SLIM); + fprintf (dump_file, ":\n"); +- for (const std::pair& base_var_count : base_var_candid) +- fprintf (dump_file, "%s:%d, ", get_name (base_var_count.first), +- base_var_count.second); ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ fprintf (dump_file, "%s:%d, ", get_name (it->first), it->second); + fprintf (dump_file, "\n"); + + if (var == NULL_TREE) + fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); + else if (is_tracing_unusual && var != NULL_TREE) + fprintf (dump_file, "Tracing unusual number or occurrences of base " +- "variables. Choose %s.\n", get_name (var)); ++ "variables. Choose %s.\n", ++ get_name (var)); + } + return var != NULL_TREE; + } + +-/* Tracing direct memory reference information. */ +- +-bool +-trace_direct_mem_ref (data_ref &mem_ref, std::set &traced_ref_stmt) +-{ +- if (TREE_CODE (mem_ref.ref) != TARGET_MEM_REF) +- return false; +- +- /* Direct memory access, regardless of whether it is in vectorized form, +- can be determined through TARGET_MEM_REF. */ +- mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); +- mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); +- mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); +- mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); +- +- std::set walked; +- if (mem_ref.var == NULL_TREE +- && !trace_base_var (mem_ref.var, mem_ref.base, walked)) +- return false; +- +- traced_ref_stmt.insert (mem_ref.stmt); +- return true; +-} +- + /* Recursively trace and check whether the definition stmt of the + index operand is a recorded stmt in direct access tracing. +- If true, it is an indirect access. */ ++ Return 0 if ref is a direct access a[]. ++ Return 1 if ref is a non-nested indirect access a[b[]]. ++ Return 2 if ref is a complex indirect memory access, such as a[f(b[])]. */ + +-bool ++int + trace_indirect_operand (tree arg, std::set &traced_ref_stmt) + { ++ /* Return 0 if tree `arg` is not an SSA for further tracing. */ + if (TREE_CODE (arg) != SSA_NAME) +- return false; ++ return 0; + + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + ++ /* Return 1 if `index` has been detected as a traced direct memory access ++ before. */ + if (traced_ref_stmt.count (def_stmt)) +- return true; ++ return 1; + ++ /* Return 0 if def stmt of `arg` is not in gimple assign type. Stop tracing ++ index operand and currently no memory access operand is detected. */ + if (!def_stmt || !is_gimple_assign (def_stmt)) +- return false; ++ return 0; + + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array +- type indirect memory access. Please check examples before function +- trace_indirect_ptr and trace_indirect_array. */ ++ type indirect memory access. */ + if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR +- && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR +- && rhs_code != ARRAY_REF) +- return false; ++ && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR) ++ { ++ /* Return 2 if tree code has any type representing references to storge, ++ implying a complex indirect memory access scenario for future ++ analysis. 
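++         As an illustration (hand-written gimple, not taken from a real
++         dump), a nested chain a[b[c[i]]] reaches this branch: the index
++         of the outer load is defined by `_2 = b[_1]`, whose rhs code is
++         ARRAY_REF but whose stmt is not yet in `traced_ref_stmt`, so 2 is
++         returned and the ref is postponed for later analysis.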
*/ ++ if (rhs_code == MEM_REF || rhs_code == TARGET_MEM_REF ++ || rhs_code == ARRAY_REF || rhs_code == ARRAY_RANGE_REF ++ || rhs_code == COMPONENT_REF || rhs_code == ADDR_EXPR ++ || rhs_code == INDIRECT_REF) ++ return 2; ++ ++ /* Return 0 and stop tracing if tree code is not a common tracing ++ operand, but still reflected as a non-reference type. ++ Caveats: if we never deal with this tree code before, maybe it is ++ more suitable to treat this scenario strictly. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unknown tracing tree code: %s\n", ++ get_tree_code_name (rhs_code)); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return 0; ++ } + + tree op = NULL_TREE; + ssa_op_iter iter; + FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) + { +- if (trace_indirect_operand (op, traced_ref_stmt)) +- return true; ++ int trace_indir_p = trace_indirect_operand (op, traced_ref_stmt); ++ if (trace_indir_p != 0) ++ return trace_indir_p; + } +- return false; ++ return 0; + } + +-/* Trace the pointer of the indirect memory access: +- 1) obtain the base address of the indirect memory access. +- 2) ensure that the index has been traced in the direct memory access. +- +- _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in +- direct access ++/* Trace the pointer of the direct/indirect memory access: ++ 1) Obtain the base address of the memory access. ++ 2) If index variable is formed by another memory access operation (i.e., an ++ indication of indirect memory access), ensure that the index has been ++ traced in an already discovered direct memory access. ++ 3) Otherwise, the memory access is in a more complex scenario and we need to ++ postpone the analysis later. For example, the indirect memory access is ++ nested, a[b[c[...]]], or the index variable (formed in another memory ++ access) has not been recorded/traced yet. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; + _4 = (long unsigned int) _1; + _5 = _4 * 8; + _6 = p(D) + _5; // get base +@@ -889,56 +998,142 @@ trace_indirect_operand (tree arg, std::set &traced_ref_stmt) + */ + + bool +-trace_indirect_ptr (tree &base, tree &index, tree arg, +- std::set traced_ref_stmt) +-{ +- gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++trace_ptr_mem_ref (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ /* Simple scenario: ++ _2208 = np.120_2207 * 8; ++ _1921 = sorted_weight$data_381 + _2208; ++ *_1921 = _2206; ++ ++ Complex scenario: ++ MEM[base: _3235, index: ivtmp.2768_3189, step: 4, offset: 0B] = _105; ++ _3236 = (sizetype) _214; ++ _3237 = _3236 * 4; ++ _3238 = _857 + _3237; // base + index * step ++ _3239 = _3238 + 4; // offset ++ MEM[base: _3239, index: ivtmp.2768_3189, step: 4, offset: 0B] = 0.0; ++ */ ++ tree pointer = TREE_OPERAND (mem_ref.ref, 0); ++ tree offset = TREE_OPERAND (mem_ref.ref, 1); ++ if (TREE_CODE (offset) != INTEGER_CST) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for non-constant offset.\n"); ++ return false; ++ } + +- if (!def_stmt || !is_gimple_assign (def_stmt)) ++ /* Tracing back base address from SSA. */ ++ gimple *ptr_def_stmt = SSA_NAME_DEF_STMT (pointer); ++ if (ptr_def_stmt == NULL || gimple_code (ptr_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (ptr_def_stmt) != POINTER_PLUS_EXPR) + return false; ++ tree base = gimple_assign_rhs1 (ptr_def_stmt); ++ /* index_offset = index * step. 
*/ ++ tree index_offset = gimple_assign_rhs2 (ptr_def_stmt); + +- tree_code rhs_code = gimple_assign_rhs_code (def_stmt); +- if (rhs_code != POINTER_PLUS_EXPR) +- return false; ++ /* Tracing back index from SSA. */ ++ if (TREE_CODE (index_offset) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (TREE_CODE (index_offset) == INTEGER_CST) ++ fprintf (dump_file, "Constant index for memory access.\n"); ++ else ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ } ++ return false; ++ } + +- /* POINTER_PLUS_EXPR, The first operand is always a pointer/reference type. +- The second operand is always an unsigned integer type compatible with +- sizetype. */ +- base = gimple_assign_rhs1 (def_stmt); +- index = gimple_assign_rhs2 (def_stmt); ++ gimple *idx_def_stmt = SSA_NAME_DEF_STMT (index_offset); ++ if (idx_def_stmt == NULL || gimple_code (idx_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (idx_def_stmt) != MULT_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ return false; ++ } + +- return trace_indirect_operand (index, traced_ref_stmt); +-} ++ /* Split array index from total offset of index, `index * step`. */ ++ mem_ref.base = base; ++ mem_ref.offset = offset; ++ mem_ref.index = gimple_assign_rhs1 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs2 (idx_def_stmt); ++ if (TREE_CODE (gimple_assign_rhs1 (idx_def_stmt)) == INTEGER_CST) ++ { ++ mem_ref.index = gimple_assign_rhs2 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs1 (idx_def_stmt); ++ } + +-/* Trace the array of the indirect memory access: +- 1) obtain the base address of the indirect memory access. +- 2) ensure that the index has been traced in the direct memory access. ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ } ++ else if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ else ++ { ++ /* Record indirect memory access with complex scenarios for future ++ analysis. */ ++ unresolved_refs.push_back (mem_ref); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled indirect memory access tracing.\n"); ++ return false; ++ } + +- _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in +- direct access +- _4 = (integer(kind=8)) _1; +- _5 = _4 + 135; +- _6 = p[_5]; // start tracing +-*/ ++ return true; ++} ++ ++/* Tracing direct memory reference information. */ + + bool +-trace_indirect_array (tree &base, tree &index, +- std::set traced_ref_stmt, tree ref) ++trace_direct_mem_ref (data_ref &mem_ref) + { +- if (TREE_CODE (ref) != ARRAY_REF) +- return false; +- base = TREE_OPERAND (ref, 0); +- index = TREE_OPERAND (ref, 1); +- return trace_indirect_operand (index, traced_ref_stmt); ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF: ++ address = base + index * step + offset. 
++ MASK_LOAD example: ++ _43 = &MEM[base: _42, index: ivtmp_140, step: 8, offset: 0B]; ++ vect__42.11_160 = .MASK_LOAD (_43, 64B, loop_mask_163); ++ ++ In some cases (2D-array or complex-index 1D array), mem_ref's `base` ++ may actually represent `base + index * step` when `base` address updates ++ by a PHI operation, e.g., ++ MEM[base: _51, offset: 0B] ++ _51 = (void *) ivtmp.18_11; ++ ivtmp.18_11 = PHI ++ ivtmp.18_43 = ivtmp.18_11 + 16; ++ ivtmp.18_52 = (unsigned long) _10; ++ _10 = arr2D_29(D) + _9; ++ */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ ++ return true; + } + +-/* Tracing indirect memory reference information. +- Include tracing of base addresses and source variable. +- _x(ssa name) -> a_2(base addr) -> a(src var) */ ++/* Tracing vectorized indirect memory reference information. ++ MASK_GATHER_LOAD example: ++ vect__45.13_146 = .MASK_LOAD (_41, 32B, loop_mask_153); ++ vect__46.14_145 = (vector([2,2]) long unsigned int) vect__45.13_146; ++ vect_patt_163.15_143 = .MASK_GATHER_LOAD (_144, vect__46.14_145, 8, ++ { 0.0, ... }, loop_mask_153); */ + + bool +-trace_indirect_mem_ref (data_ref &mem_ref, +- std::set &traced_ref_stmt) ++trace_indirect_mem_ref_vectorized (data_ref &mem_ref, ++ std::set &traced_ref_stmt) + { + /* Processing of vectorization types. */ + if (mem_ref.vectorize_p) +@@ -947,72 +1142,93 @@ trace_indirect_mem_ref (data_ref &mem_ref, + if (trace_indirect_operand (op, traced_ref_stmt)) + { + mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.index = gimple_call_arg (mem_ref.stmt, 1); ++ mem_ref.step = gimple_call_arg (mem_ref.stmt, 2); + mem_ref.regular_p = false; +- std::set walked; +- if (mem_ref.var == NULL_TREE +- && !trace_base_var (mem_ref.var, mem_ref.base, walked)) +- return false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); + return true; + } +- return false; + } ++ return false; ++} + +- /* Processing of non-vectorized types. */ +- tree op = NULL_TREE; +- ssa_op_iter iter; +- FOR_EACH_SSA_TREE_OPERAND (op, mem_ref.stmt, iter, SSA_OP_USE) ++/* Trace the array of the indirect memory access: ++ 1) Obtain the base address of the indirect memory access. ++ 2) Ensure that the index has been traced in the direct memory access. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (data_ref &mem_ref, std::set &traced_ref_stmt) ++{ ++ tree base = TREE_OPERAND (mem_ref.ref, 0); ++ tree index = TREE_OPERAND (mem_ref.ref, 1); ++ if (trace_indirect_operand (index, traced_ref_stmt)) + { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. 
*/ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; + +- /* Array type: +- _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; +- _4 = c[_1]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); + +- Pointer type: +- _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; +- _4 = (long unsigned int) _1; +- _5 = _4 * 8; +- _6 = p(D) + _5; +- _7 = *_6; +- */ +- tree base = NULL_TREE; +- tree index = NULL_TREE; +- if (trace_indirect_array (base, index, traced_ref_stmt, mem_ref.ref) +- || trace_indirect_ptr (base, index, op, traced_ref_stmt)) +- { +- /* ARRAY_REF, The first operand is the array; +- the second is the index. */ +- mem_ref.base = base; +- mem_ref.index = index; +- mem_ref.regular_p = false; +- std::set walked; +- if (mem_ref.var == NULL_TREE +- && !trace_base_var (mem_ref.var, mem_ref.base, walked)) +- return false; +- return true; +- } ++ return true; + } + + return false; + } + +-/* Trace references base info: +- 1) Parallel analysis +- 2) Memory access rule analysis +- 3) Tracing base address and source variable of memory references ++/* Trace memory references base info: ++ 1) Memory access rule analysis and reference info tracing ++ 2) Source variable tracing, along base address of memory reference + We will extend parallel analysis later. + */ + + void +-trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt) ++trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) + { + enum tree_code ref_code = TREE_CODE (mem_ref.ref); +- if (/* Vectorized and non-vectorized direct access. */ +- ref_code != TARGET_MEM_REF +- /* non-vectorized indirect memory access. */ +- && ref_code != MEM_REF && ref_code != ARRAY_REF +- /* vectorized indirect memory access. */ +- && ref_code != SSA_NAME) ++ /* 1) Direct and indirect access traces. */ ++ switch (ref_code) + { ++ case MEM_REF: ++ /* Non-vectorized direct/indirect access by pointer. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "MEM_REF\n"); ++ if (!trace_ptr_mem_ref (mem_ref, traced_ref_stmt, unresolved_refs)) ++ return; ++ break; ++ case TARGET_MEM_REF: ++ /* Vectorized and non-vectorized direct access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "TARGET_MEM_REF\n"); ++ if (!trace_direct_mem_ref (mem_ref)) ++ return; ++ break; ++ case SSA_NAME: ++ /* Vectorized indirect memory access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "SSA_NAME\n"); ++ if (!trace_indirect_mem_ref_vectorized (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ case ARRAY_REF: ++ /* Non-vectorized indirect memory access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ARRAY_REF\n"); ++ if (!trace_indirect_array (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ default: + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ref is another tree-code: "); +@@ -1025,15 +1241,19 @@ trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt) + return; + } + +- /* 1) Direct and indirect access traces and traces source variables. */ +- if (!trace_direct_mem_ref (mem_ref, traced_ref_stmt) +- && !trace_indirect_mem_ref (mem_ref, traced_ref_stmt)) ++ /* 2) Source variable tracing. 
*/ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Tracing failed.\n\n"); ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); + return; + } + ++ if (mem_ref.regular_p) ++ traced_ref_stmt.insert (mem_ref.stmt); ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Tracing succeeded.\n\n"); + mem_ref.trace_status_p = true; +@@ -1043,7 +1263,8 @@ trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt) + + void + trace_loop_refs_info (std::vector &refs, +- std::set &traced_ref_stmt) ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) + { + for (unsigned i = 0; i < refs.size (); ++i) + { +@@ -1053,7 +1274,7 @@ trace_loop_refs_info (std::vector &refs, + print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } +- trace_ref_info (refs[i], traced_ref_stmt); ++ trace_ref_info (refs[i], traced_ref_stmt, unresolved_refs); + } + } + +@@ -1061,8 +1282,10 @@ trace_loop_refs_info (std::vector &refs, + + void + trace_data_refs_info (std::vector &kernels, +- std::map > &loop_refs, +- std::set &traced_ref_stmt) ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) + { + if (dump_file) + fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); +@@ -1074,7 +1297,60 @@ trace_data_refs_info (std::vector &kernels, + continue; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); +- trace_loop_refs_info (loop_refs[loop], traced_ref_stmt); ++ trace_loop_refs_info (loop_refs[loop], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Retrace references base info for complex scenarios in indirect memory access ++ after Phase 3. */ ++ ++void ++retrace_ref_info_unresolved (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* 1) Indirect access traces. */ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Retrace all unresolved references. 
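++   These are the refs that trace_ptr_mem_ref pushed to `unresolved_refs`,
++   e.g. (illustrative) the outer access of a nested chain a[b[c[...]]]
++   whose index-defining load had not been recorded at that point.  After
++   the outer-loop analysis of Phase 3, `traced_ref_stmt` may contain the
++   missing direct access, so the indirect classification can now succeed.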
*/ ++ ++void ++retrace_loop_refs_info_unresolved (std::vector &unresolved_refs, ++ std::set &traced_ref_stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "\nRetrace indirect memory access after outer loop analysis:\n"); ++ for (unsigned i = 0; i < unresolved_refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, unresolved_refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ retrace_ref_info_unresolved (unresolved_refs[i], traced_ref_stmt); + } + } + +@@ -1098,9 +1374,19 @@ loop_bound_iv_p (tree t, tree &outer_loop_t) + { + if (t == NULL || TREE_CODE (t) != SSA_NAME + || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE) +- return false; ++ return false; + + gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ ++ /* NOP_EXPR convertion between PHI node and memory reference due to MACRO. ++ n_898 = PHI ++ _757 = (sizetype) n_898; ++ _900 = MEM[base: _726, index: _757, step: 8, offset: 0B]; ++ */ ++ while (gimple_code (def_stmt) == GIMPLE_ASSIGN ++ && gimple_assign_rhs_code (def_stmt) == NOP_EXPR) ++ def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt)); ++ + if (gimple_code (def_stmt) != GIMPLE_PHI) + return false; + +@@ -1208,17 +1494,27 @@ check_bound_iv_and_add_worklist (std::vector &worklist, + bool + trace_loop_bound_iv (data_ref &mem_ref) + { +- /* Indirect memory access, the size cannot be determined based on the loop +- boundary. */ +- if (!mem_ref.regular_p) +- return false; ++ /* In indirect memory access, the size cannot be determined based on the ++ loop boundary. However, we can take advantage of loop bound as an upper ++ bound (unrepeated memory access) to predict the variable footprint ++ involved in the specific loop dimension. */ + + /* Determine and record the boundary iv of the current index, + but do not trace it. */ + tree outer_loop_t = NULL_TREE; +- if (loop_bound_iv_p (mem_ref.index, outer_loop_t)) +- mem_ref.loop_bounds.push_back ( ++ /* indirect access example, mem_ref.index = _64 ++ _62 = MEM[symbol: uPtr, index: ivtmp.22_96, step: 4, offset: 0B]; ++ _63 = (long unsigned int) _62; ++ _64 = _63 * 8; ++ _65 = [openfoam_smooth.c:28:28] &bPrimePtr + _64; ++ _66 = *_65; */ ++ if (loop_bound_iv_p (mem_ref.index, outer_loop_t) || !mem_ref.regular_p) ++ { ++ mem_ref.loop_bounds.push_back ( + loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); ++ if (!mem_ref.regular_p) ++ return false; ++ } + + std::vector worklist; + worklist.push_back (mem_ref.base); +@@ -1236,7 +1532,7 @@ trace_loop_bound_iv (data_ref &mem_ref) + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nmem_ref access dimension: %ld\n", +- mem_ref.loop_bounds.size ()); ++ mem_ref.loop_bounds.size ()); + fprintf (dump_file, "Traced variables: "); + print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); + fprintf (dump_file, "\n"); +@@ -1263,7 +1559,7 @@ loop_bound_dump (FILE *file, loop_bound &lb) + fprintf (file, ", latch = %d", loop->latch->index); + fprintf (file, ", lb_niters = "); + print_generic_expr (file, lb.niters); +- fprintf (file, ")\n"); ++ fprintf (file, ")\n\n"); + } + + /* static calculate data size. */ +@@ -1275,10 +1571,11 @@ static_calculate_data_size (data_ref &mem_ref) + fprintf (dump_file, "\nstatic_calculate_data_size\n"); + + tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); +- HOST_WIDE_INT type_size = size_unit ? 
tree_to_uhwi (size_unit) : 0; ++ unsigned HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0; + for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) + { +- HOST_WIDE_INT est_niter = tree_to_uhwi (mem_ref.loop_bounds[i].niters); ++ unsigned HOST_WIDE_INT est_niter = tree_to_uhwi ++ (mem_ref.loop_bounds[i].niters); + unsigned int unroll = mem_ref.loop_bounds[i].unroll; + if (i == 0) + { +@@ -1323,8 +1620,8 @@ trace_and_create_dominate_expr (tree expr, class loop *outermost) + enum tree_code rhs_code = gimple_assign_rhs_code (stmt); + tree_code_class code_class = TREE_CODE_CLASS (rhs_code); + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); +- tree rhs1 = trace_and_create_dominate_expr +- (gimple_assign_rhs1 (stmt), outermost); ++ tree rhs1 = trace_and_create_dominate_expr (gimple_assign_rhs1 (stmt), ++ outermost); + if (rhs1 == NULL_TREE) + return NULL_TREE; + +@@ -1341,8 +1638,8 @@ trace_and_create_dominate_expr (tree expr, class loop *outermost) + } + else if (code_class == tcc_binary) + { +- tree rhs2 = trace_and_create_dominate_expr +- (gimple_assign_rhs2 (stmt), outermost); ++ tree rhs2 = trace_and_create_dominate_expr (gimple_assign_rhs2 (stmt), ++ outermost); + if (rhs2 == NULL_TREE) + return NULL_TREE; + +@@ -1425,8 +1722,8 @@ void + trace_and_create_dominate_loop_bounds (data_ref &mem_ref) + { + /* Check whether the niters is a loop dominant. +- If not, trace and determine whether the result is dominant. If yes, create +- the expr of the dominant node. ++ If not, trace and determine whether the result is dominant. If yes, ++ create the expr of the dominant node. + */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n"); +@@ -1509,6 +1806,28 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + if ((niters == chrec_dont_know) && loop->vec_nb_iterations + && (loop->vec_nb_iterations != chrec_dont_know)) + niters = loop->vec_nb_iterations; ++ ++ if (niters == chrec_dont_know) ++ { ++ /* We derive est_loop_niters from function ++ `estimated_loop_iterations_int`. Usually only the innermost loop is ++ vectorized, so vec_nb_iterations can be 4 or 8 times as large as ++ `est_loop_niters` due to vectorization. However, function ++ `estimated_loop_iterations_int` only returns an integer instead of ++ a tree node expression, so it cannot substitute ++ function `number_of_latch_executions` in runtime computation. 
*/ ++ HOST_WIDE_INT est_loop_niters = estimated_loop_iterations_int (loop); ++ if (est_loop_niters >= 0 && est_loop_niters < INT_MAX) ++ /* e.g., loop iterations from `estimated_loop_iterations_int`: (-1) ++ loop_144 (header = 519, latch = 625, niter = scev_not_known, ++ upper_bound = 1073741823, likely_upper_bound = 1073741823, ++ unroll = 1) */ ++ /* variable `niters` from `loop->vec_nb_iterations` ++ constant 34> */ ++ niters = build_int_cst (integer_type_node, (int) est_loop_niters); ++ } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); + +@@ -1518,6 +1837,24 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); + else + mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (mem_ref.calc_by == 2) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nSTATIC_CALC.\n"); ++ } ++ else if (mem_ref.calc_by == 1) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nRUNTIME_CALC.\n"); ++ } ++ else ++ fprintf (dump_file, "\nUNHANDLE_CALC.\n"); ++ } + } + + if (mem_ref.calc_by == RUNTIME_CALC) +@@ -1530,8 +1867,8 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + Return NULL_TREE if not found. */ + + tree +-get_cur_loop_niters (std::map > &loop_refs, +- class loop* loop) ++get_cur_loop_niters (std::map > &loop_refs, ++ class loop *loop) + { + if (loop_refs.count (loop) == 0) + return NULL_TREE; +@@ -1589,7 +1926,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + { + fprintf (dump_file, "Stop tracing the outer loop depth, "); + fprintf (dump_file, "current depth: %d, current bb: %d\n", +- ret_depth, def_bb->index); ++ ret_depth, def_bb->index); + } + return ret_depth; + } +@@ -1614,7 +1951,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + continue; + unsigned depth = trace_outer_loop_depth (subtree, \ + start_depth); +- min_depth = std::min (min_depth, depth); ++ min_depth = MIN (min_depth, depth); + } + return min_depth; + } +@@ -1622,15 +1959,15 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + else + { + /* Adding termination conditions: +- 1. Niters is MEM variable; +- 2. Niters is a runtime value (smooth_uPtr), and consider \ ++ 1) Niters is MEM variable; ++ 2) Niters is a runtime value (smooth_uPtr), and consider + finding footprint in other mem_ref; +- 3. Niters is loop variable (i_start/i_end), and the boundary in \ ++ 3) Niters is loop variable (i_start/i_end), and the boundary in + the outer loop depends on the variable j_start/j_end. 
*/ + if (dump_file && (dump_flags & TDF_DETAILS)) + { +- fprintf (dump_file, "The loop termination condition"); +- fprintf (dump_file, "is to be extended.\n"); ++ fprintf (dump_file, "The loop termination condition is " ++ "extended.\n"); + } + return start_depth; + } +@@ -1652,7 +1989,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + if (subtree == NULL) + continue; + unsigned depth = trace_outer_loop_depth (subtree, start_depth); +- min_depth = std::min (min_depth, depth); ++ min_depth = MIN (min_depth, depth); + } + return min_depth; + } +@@ -1660,7 +1997,7 @@ trace_outer_loop_depth (tree niters, unsigned start_depth) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { +- fprintf (dump_file, "niters is another tree code: %s\n", \ ++ fprintf (dump_file, "niters is another tree code: %s\n", + get_tree_code_name (niter_code)); + print_generic_expr (dump_file, niters, TDF_SLIM); + fprintf (dump_file, "\n"); +@@ -1687,16 +2024,18 @@ analyze_loop_refs_dimension (std::vector &refs) + trace_ref_dimension_and_loop_bounds (refs[i]); + } + } ++ + /* analyze nested kernels +- 1. multidimension loop analyze +- 2. extended outer loop analyze ++ 1) multidimension loop analyze ++ 2) extended outer loop analyze + */ + + bool + analyze_nested_kernels (std::vector &kernels, +- std::map > &loop_refs, +- std::set &traced_ref_stmt) ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) + { + if (dump_file) + fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); +@@ -1706,7 +2045,7 @@ analyze_nested_kernels (std::vector &kernels, + unsigned init_kernels_size = kernels.size (); + for (unsigned i = 0; i < init_kernels_size; ++i) + { +- class loop* loop = kernels[i]; ++ class loop *loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + +@@ -1715,11 +2054,11 @@ analyze_nested_kernels (std::vector &kernels, + analyze_loop_refs_dimension (loop_refs[loop]); + + unsigned depth = loop_depth (loop); +- unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters \ ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters + (loop_refs, loop), depth); + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", \ +- depth, outer_depth); ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", ++ depth, outer_depth); + /* param_outer_loop_num: number of loops of the extended outer loop. + Outermost loop should not be extended when outer_depth = 0. + `outer_depth == depth` means the current loop is the loop which +@@ -1727,19 +2066,20 @@ analyze_nested_kernels (std::vector &kernels, + if (outer_depth == 0 || outer_depth == depth + || depth > outer_depth + param_outer_loop_num) + continue; ++ + /* Extend outer loop. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nStart extending outer loop\n"); +- /* Superloops of the loop, start from the loop closest to the \ ++ /* Superloops of the loop, start from the loop closest to the + current loop in the outermost loop. */ +- for (unsigned j = 0; j < param_outer_loop_num && --depth; ++j) ++ for (int j = 0; j < param_outer_loop_num && --depth; ++j) + { +- class loop* outer_loop = (*loop->superloops)[depth]; ++ class loop *outer_loop = (*loop->superloops)[depth]; + /* The outer loop may be added when analyzing previous inner loops, + i.e. the outer loop contains two or more inner loops. */ + if (loop_refs.count (outer_loop)) + continue; +- /* phase1~phase3 analysis on the extended outer loop. 
*/ ++ /* phase1 ~ phase3 analysis on the extended outer loop. */ + analyze_loop_dense_memory (kernels, loop_refs, outer_loop); + if (loop_refs.count (outer_loop) == 0) + continue; +@@ -1748,15 +2088,16 @@ analyze_nested_kernels (std::vector &kernels, + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); +- print_generic_expr (dump_file, loop_refs[outer_loop][k].ref,\ +- TDF_SLIM); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref, ++ TDF_SLIM); + fprintf (dump_file, "\n"); + } + } +- trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt); ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt, ++ unresolved_refs); + analyze_loop_refs_dimension (loop_refs[outer_loop]); +- outer_depth = trace_outer_loop_depth (get_cur_loop_niters \ +- (loop_refs, outer_loop), depth); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, outer_loop), depth); + /* `outer_depth == depth` means the current loop is the loop which + boundary is known, so there is no need to extend the outer loop. */ + if (outer_depth == depth) +@@ -1817,9 +2158,9 @@ next_high_probability_bb (basic_block bb) + if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) + && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) + return true_edge->dest; +- else if ((false_edge_prob >= (param_branch_prob_threshold / 100.0) +- - minimum) && flow_bb_inside_loop_p (bb->loop_father, +- false_edge->dest)) ++ else if ((false_edge_prob ++ >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, false_edge->dest)) + return false_edge->dest; + else + { +@@ -1848,13 +2189,13 @@ void + dump_loop_headers (const char *name, std::vector &loops) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "\n\n%s:\n", name); +- fprintf (dump_file, "{ "); +- for (unsigned int i = 0; i < loops.size (); i++) +- fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); +- fprintf (dump_file, "}\n\n"); +- } ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } + } + + /* Combine and sort candidate loops. */ +@@ -1905,7 +2246,7 @@ filter_and_sort_kernels (std::vector &sorted_kernels, + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Find same-loop cycle. " +- "Abort filtering process.\n"); ++ "Abort filtering process.\n"); + return false; + } + walked_non_header_bb_idx.insert (bb->index); +@@ -1942,21 +2283,469 @@ filter_and_sort_kernels (std::vector &sorted_kernels, + return true; + } + +-/* ================ phase 5 record_and_sort_ref_groups ================ */ +-/* Memory reference score, different aspects of one memory reference. */ ++/* Check whether the given bb is null. */ + +-struct ref_score ++bool ++check_null_bb (basic_block bb) + { +- /* certain memory reference. */ +- data_ref d_ref; ++ if (bb == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unexpected error at null bb.\n"); ++ return true; ++ } ++ return false; ++} + +- /* local count for bb where memory reference is located. */ +- gcov_type bb_count; ++/* Check whether the loop father of the given bb is null. */ + +- /* line-location of memory reference. 
*/ +- int line; +-}; ++bool ++check_null_loop_father (basic_block bb) ++{ ++ if (check_null_bb (bb)) ++ return true; + ++ if (bb->loop_father == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "bb %d's loop father is null.\n", bb->index); ++ return true; ++ } ++ return false; ++} ++ ++/* States for bb during path traversal. */ ++ ++enum bb_traversal_state ++{ ++ NOT_TRAVERSED = 0, ++ UNDER_TRAVERSAL, ++ FULLY_TRAVERSED ++}; ++ ++/* Detect abnormal revisit for bb during path traversal where bb is ++ 1) fully traversed, ++ 2) non-loop-header bb but currently under traversal. */ ++ ++bool ++revisit_bb_abnormal_p (basic_block bb, std::vector &bb_visited, ++ const std::set &header_bb_idx_set, ++ std::set > &backedges, ++ int src_bb_idx) ++{ ++ /* If the header bb has been already fully traversed, early exit ++ the function. */ ++ if (bb_visited[bb->index] == FULLY_TRAVERSED) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Already visited bb index %d. Abort.\n", ++ bb->index); ++ return true; ++ } ++ ++ /* If we revisit a non-header bb during next-bb traversal, we detect ++ an inner-loop cycle and dump warning info. Record this abnormal edge ++ in `backedges` for special treatment in path weight update. */ ++ if (!header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Warning: Find cycle at bb index %d. Abort.\n", ++ bb->index); ++ backedges.insert (std::make_pair (src_bb_idx, bb->index)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Check successor bb through edge e. Return true if successor bb is NULL or ++ out of loop. */ ++ ++bool ++check_succ_bb_abnormal_p (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb connected to src bb %d.\n", bb->index); ++ ++ return true; ++ } ++ ++ /* If bb is within one loop and the edge is pointing to the ++ outer loop, skip edge processing until a backedge to header ++ bb. `loop->num = 0` represents function body. */ ++ if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find edges to the outer loop at bb " ++ "index %d to bb index %d. Abort.\n", ++ bb->index, e->dest->index); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Criteria for retrieving the next bb in modified control-flow graph, which ++ creates a topological order for the bb traversal. */ ++ ++void ++get_next_toposort_bb (basic_block bb, std::vector &bb_visited, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ std::set > &backedges, ++ int src_bb_idx) ++{ ++ /* 1) Before bb returns to the loop header, bb will not go to the outer loop. ++ 2) After returning to the loop header, traverse all exit_bbs. ++ NEXT STEP: ++ 1) If goto jumps out of 2 loops, goto has to traverse smaller jumps first. ++ 2) If path length is the same => choose higher depth traversal path. */ ++ if (check_null_bb (bb) || check_null_loop_father (bb)) ++ return; ++ ++ /* Find last bb of function. */ ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ return; ++ ++ if (revisit_bb_abnormal_p (bb, bb_visited, header_bb_idx_set, backedges, ++ src_bb_idx)) ++ return; ++ ++ /* If we revisit the header bb of a loop, traverse all exit bbs. 
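++     Revisiting the header means the loop body has already been walked
++     once; the loop is then treated as a single collapsed node and the
++     traversal resumes from its exit bbs, which keeps the visited graph
++     acyclic.  Rough sketch with made-up bb indices:
++     2 (header) -> 3 -> 4 -> 2 (revisit) => continue from the exits of
++     loop (2), e.g. bb 5.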
*/ ++ if (header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ unsigned i; ++ edge e; ++ vec exits = get_loop_exit_edges (bb->loop_father); ++ ++ if (exits.length () > 1 && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Detect multiple exits at loop %d.\n", ++ bb->loop_father->num); ++ ++ FOR_EACH_VEC_ELT (exits, i, e) ++ { ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, backedges, bb->index); ++ } ++ return; ++ } ++ ++ /* Post-order traversal for normal bb. */ ++ bb_visited[bb->index] = UNDER_TRAVERSAL; ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_succ_bb_abnormal_p (bb, e)) ++ continue; ++ ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, backedges, bb->index); ++ } ++ ++ /* bb is marked as fully traversed and all its descendents have been ++ fully traversed due to post-order traversal. */ ++ bb_visited[bb->index] = FULLY_TRAVERSED; ++ bb_topo_order.push_back (bb); ++} ++ ++/* A struct that represents the longest path weight at each bb. */ ++ ++struct weight ++{ ++ /* Longest path weight at current bb. */ ++ gcov_type bb_count; ++ ++ /* Prev bb from the current longest path. */ ++ int prev_bb_idx; ++}; ++ ++/* A helper function for checking whether overflow will occur when adding two ++ gcov_type weights. */ ++ ++bool ++check_weight_overflow (gcov_type a, gcov_type b) ++{ ++ if ((a > 0 && b > INT64_MAX - a) || (a < 0 && b < INT64_MIN - a)) ++ return true; ++ ++ return false; ++} ++ ++/* A helper function that update the weight of the current longest path to ++ bb_idx_dst and a new path pointing from bb_idx_src to bb_idx_dst. */ ++ ++void ++update_path_weight (std::vector &bb_weights, int bb_idx_src, ++ int bb_idx_dst, gcov_type weight_dst) ++{ ++ if (check_weight_overflow (bb_weights[bb_idx_src].bb_count, weight_dst) ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Path weight overflow at src bb %d " ++ "and dest bb %d.\n", ++ bb_idx_src, bb_idx_dst); ++ } ++ if (bb_weights[bb_idx_dst].bb_count ++ < bb_weights[bb_idx_src].bb_count + weight_dst) ++ { ++ bb_weights[bb_idx_dst].bb_count ++ = bb_weights[bb_idx_src].bb_count + weight_dst; ++ bb_weights[bb_idx_dst].prev_bb_idx = bb_idx_src; ++ } ++} ++ ++/* Check whether the required bb/loop info for path update is null. */ ++ ++bool ++check_null_info_in_path_update (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for edge connected " ++ "to src bb %d.\n", ++ bb->index); ++ return true; ++ } ++ ++ if (check_null_loop_father (bb) || check_null_loop_father (e->dest)) ++ return true; ++ ++ return false; ++} ++ ++/* Update path weight to loop exit bbs where the current source bb is connected ++ to header bb using a backedge. 
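++   Because the backedge itself is never followed (the loop was collapsed
++   during the topological-order step), the weight accumulated at the
++   latch-side bb is propagated straight to every exit bb of the loop.
++   Illustrative numbers: if the source bb carries path weight W and one
++   loop exit leads to bb 6 with count c6, bb 6's best weight becomes
++   max (old, W + c6), exactly as in update_path_weight.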
*/ ++ ++void ++update_backedge_path_weight (std::vector &bb_weights, basic_block bb) ++{ ++ unsigned i; ++ edge e_exit; ++ vec exits = get_loop_exit_edges (bb->loop_father); ++ FOR_EACH_VEC_ELT (exits, i, e_exit) ++ { ++ if (check_null_bb (e_exit->dest)) ++ { ++ if (e_exit->src != NULL && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for exiting edge " ++ "connected to src bb %d.\n", ++ e_exit->src->index); ++ continue; ++ } ++ ++ update_path_weight (bb_weights, bb->index, e_exit->dest->index, ++ e_exit->dest->count.to_gcov_type ()); ++ } ++} ++ ++/* Update the longest length of the path through control flow graph. */ ++ ++void ++update_max_length_of_path (std::vector &bb_weights, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ const std::set > &backedges) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start update weight traversal:\n"); ++ ++ while (!bb_topo_order.empty ()) ++ { ++ basic_block bb = bb_topo_order.back (); ++ bb_topo_order.pop_back (); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_null_info_in_path_update (bb, e)) ++ continue; ++ ++ if (header_bb_idx_set.count (e->dest->index) ++ && bb->loop_father == e->dest->loop_father) ++ { ++ /* Backedge case. */ ++ update_backedge_path_weight (bb_weights, bb); ++ } ++ else if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ /* Outer-loop edge case. */ ++ continue; ++ } ++ else if (backedges.count (std::make_pair (bb->index, e->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ else ++ { ++ /* Normal edge case. */ ++ update_path_weight (bb_weights, bb->index, e->dest->index, ++ e->dest->count.to_gcov_type ()); ++ } ++ } ++ } ++} ++ ++/* Collect all header bb of loops in the function beforehand. */ ++ ++void ++collect_header_bb_for_fn (std::set &header_bb_idx_set) ++{ ++ class loop *loop = NULL; ++ FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) ++ header_bb_idx_set.insert (loop->header->index); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck header bbs:\n"); ++ for (std::set::iterator it = header_bb_idx_set.begin (); ++ it != header_bb_idx_set.end (); ++it) ++ fprintf (dump_file, "%d ", *it); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Record loop executing order and bb high-executing path. */ ++ ++void ++record_high_execution_path (std::vector &sorted_kernel, ++ std::vector &bb_path, int bb_num_max) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPATH FOR %s: ", get_name (cfun->decl)); ++ ++ std::set loop_set; ++ for (int i = bb_path.size() - 1; i >= 0; --i) ++ { ++ int bb_idx = bb_path[i]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb_idx); ++ gcc_assert (bb_idx < bb_num_max); ++ ++ class loop *loop = BASIC_BLOCK_FOR_FN (cfun, bb_idx)->loop_father; ++ if (!loop_set.count (loop->num)) ++ { ++ loop_set.insert (loop->num); ++ sorted_kernel.push_back (loop); ++ } ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n"); ++} ++ ++/* Combine and sort candidate loops using feedback information. 
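++   A rough outline of the three steps implemented below: the bbs are
++   ordered topologically with every loop collapsed at its header, the
++   most frequently executed path is computed from the profile counts of
++   the bbs, and that path is backtracked from EXIT towards ENTRY.
++   Illustrative pick with made-up counts: if bb 7 (count 90) and bb 8
++   (count 10) both reach bb 9, the path through bb 7 wins, and only loops
++   whose bbs lie on the winning path are kept for the later phases.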
*/ ++ ++bool ++filter_and_sort_kernels_feedback (std::vector &sorted_kernel, ++ std::set &bb_pathset) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set header_bb_idx_set; ++ std::list bb_topo_order; ++ ++ /* Quoted from GCC internal, Chapter 15.1, "the index for any block should ++ never be greater than `last_basic_block`." Therefore, we use this ++ variable for retrieving the max bb index of a function. */ ++ /* Since the pass does not add/remove/merge basic blocks until Phase 6 ++ and previous passes will update ssa accordingly, we do not need to ++ `compact_blocks` to update bb indices currently. */ ++ int bb_num_max = last_basic_block_for_fn (cfun) + 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nMaximal number of possible bbs in the " ++ "function: %d\n", ++ bb_num_max); ++ std::vector bb_visited = std::vector(bb_num_max, 0); ++ ++ collect_header_bb_for_fn (header_bb_idx_set); ++ basic_block bb_start = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ /* Step 1: Get topological order of bb during traversal. */ ++ std::set > backedges; ++ get_next_toposort_bb (bb_start, bb_visited, bb_topo_order, header_bb_idx_set, ++ backedges, -1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck bbs in topological order:\n"); ++ for (std::list::iterator it = bb_topo_order.begin (); ++ it != bb_topo_order.end (); ++it) ++ fprintf (dump_file, "%d ", (*it)->index); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Step 2: Update weights of nodes and path. */ ++ weight weight_init = {-1, -1}; ++ std::vector bb_weights = std::vector(bb_num_max, weight_init); ++ bb_weights[0].bb_count = 0; /* ENTRY bb has count 0 and prev bb as -1. */ ++ update_max_length_of_path (bb_weights, bb_topo_order, header_bb_idx_set, ++ backedges); ++ ++ /* Step 3: Backtrack a path from EXIT bb to ENTRY bb. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nCheck counts for each bb:\n"); ++ ++ std::vector bb_path; ++ int tmp_bb_idx = 1; ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ while (tmp_bb_idx > 0 && tmp_bb_idx < bb_num_max) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d: %ld, ", tmp_bb_idx, ++ bb_weights[tmp_bb_idx].bb_count); ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ } ++ /* It is possible that the function exit code is wrapped around as an ++ variable, and thus, EXIT_BB in cfg is not connected to any bb. */ ++ if (tmp_bb_idx < 0 || tmp_bb_idx >= bb_num_max) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled scenario at backtracking highly " ++ "executed path with tmp_bb_idx %d", ++ tmp_bb_idx); ++ } ++ return false; ++ } ++ ++ record_high_execution_path (sorted_kernel, bb_path, bb_num_max); ++ ++ return true; ++} ++ ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; + + /* Memory reference group, different reference of the same variable. 
*/ + +@@ -1971,8 +2760,11 @@ struct ref_group + /* first ref for insert hint. */ + data_ref first_use; + ++ /* first ref with the highest-order CALC. */ ++ data_ref first_calc_use; ++ + /* reuse scores of variables. */ +- unsigned int reuse_level; ++ float reuse_level; + + /* method of calculating the var size. */ + calc_type calc_by; +@@ -1980,6 +2772,12 @@ struct ref_group + /* memory reference index for specific variable. */ + unsigned int mem_ref_index; + ++ /* variable dimension. */ ++ unsigned int dim; ++ ++ /* True if first_calc_use's footprint replaces that of first_use. */ ++ unsigned int transfer_ft; ++ + /* Accessing Reference Records in Different Modes (key_index): + 000: write, random, non-parallel + 001: write, random, parallel +@@ -2002,39 +2800,138 @@ struct ref_group + reuse_level = 0; + calc_by = UNHANDLE_CALC; + mem_ref_index = 0; ++ dim = 1; ++ transfer_ft = 0; + } + }; + +-/* calculate reuse level. */ ++/* Get the integer part for log(x) with the given base. */ + +-unsigned int +-calculate_reuse_level (std::map > &var_use) ++static unsigned int ++flog (float x, float base) + { +- unsigned int level = 0; ++ unsigned int res = 0; ++ while (x >= base) ++ { ++ ++res; ++ x /= base; ++ } ++ return res; ++} ++ ++/* Calculate reuse time for a memory reference in ref_group. */ ++ ++float ++calculate_reuse_times (std::vector &mem_refs, std::set &loop_set, ++ std::set &bb_set, unsigned int var_dim) ++{ ++ const float SAME_BB_REUSE_WEIGHT = 0.1; ++ const float SAME_LOOP_REUSE_WEIGHT = 0.5; ++ const float NORMAL_REUSE_WEIGHT = 1.; ++ ++ float reuse_time_sum = 0.; ++ for (std::vector::iterator it = mem_refs.begin (); ++ it != mem_refs.end (); ++it) ++ { ++ const data_ref &mem_ref = *it; ++ float reuse_time = 0.; ++ if (bb_set.count (mem_ref.bb_idx)) ++ { ++ /* If the two mem_ref belong to the same bb, the new reuse ++ weight will not exceed 0.1 divided by the mem_ref mode group ++ size. ++ NEXT STEP: The following equation may hold and cause commutative ++ property of read and write op not holding: ++ write + (reused) read != read + (reused) write. ++ However, it seems that write mem_ref is always before read mem_ref, ++ so the above comparison does not show up in calculation due to ++ intrinsic in-order property of tree map, but this condition is ++ quite fragile anyway. */ ++ reuse_time = SAME_BB_REUSE_WEIGHT / mem_refs.size (); ++ } ++ else ++ { ++ bb_set.insert (mem_ref.bb_idx); ++ if (loop_set.count (mem_ref.loop_idx)) ++ { ++ /* If the mem_ref belongs to a loop where any other mem_ref is in, ++ the new reuse weight will be 0.5. */ ++ reuse_time = SAME_LOOP_REUSE_WEIGHT; ++ } ++ else ++ { ++ /* If the mem_ref is reused but not in the same group with any ++ other mem_ref, the new reuse weight will be 1. */ ++ loop_set.insert (mem_ref.loop_idx); ++ reuse_time = NORMAL_REUSE_WEIGHT; ++ } ++ } ++ unsigned int used_dim = std::min (mem_ref.loop_depth, var_dim); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "used_dim : %u, loop_depth : %u\n", used_dim, ++ mem_ref.loop_depth); ++ unsigned int power = flog (std::max (0u, mem_ref.loop_depth - used_dim) ++ + 2, 2.); ++ reuse_time_sum += reuse_time * (used_dim * used_dim / 2.) * (power); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "(%f * (%u * %u / 2) * (%u) = %f\n", ++ reuse_time, used_dim, used_dim, power, ++ reuse_time * (used_dim * used_dim / 2.) * (power)); ++ } ++ return reuse_time_sum; ++} ++ ++/* Calculate reuse level. 
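++   Illustrative calculation with made-up inputs: a regular, non-parallel,
++   read-only 2-D variable referenced twice inside the same depth-2 loop
++   (in different bbs), whose var_size lies between the L2 size and a
++   quarter of param_llc_capacity_per_core.  For both refs
++   used_dim = min (2, 2) = 2 and power = flog (2, 2) = 1, so the first
++   (new-loop) ref adds 1.0 * (2 * 2 / 2) * 1 = 2.0 and the second
++   (same-loop) ref adds 0.5 * 2 * 1 = 1.0; level = 3.0, and the
++   within-LLC-size factor of 4 yields a final reuse level of 12.0.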
*/ ++ ++float ++calculate_reuse_level (std::map > &var_use, ++ unsigned int var_dim, double var_size) ++{ ++ const float VAR_SIZE_CACHE_CAPACITY = 1 / 4.; ++ const int WITHIN_CACHE_SIZE_COST = 4; ++ const float BYTE_CONVERT_RATIO = 1024.; ++ ++ float level = 0.; ++ std::set loop_set; ++ std::set bb_set; ++ bool has_write_op = false; + for (std::map >::iterator it = var_use.begin (); + it != var_use.end (); ++it) + { + unsigned int parallel = 1; + unsigned int regular = 1; +- unsigned int cost = 1; + + if ((*it).second[0].parallel_p) + parallel = PARALLEL_NUM; + if (!(*it).second[0].regular_p) + regular = INDIRECT_ACCESS_VALUE; + if (!(*it).second[0].read_p) +- cost = WRITE_COST; ++ has_write_op = true; + + /* In serial reuse, we will later check whether they are in the + same cacheline. If yes, delete the reuse. For details, see the + reuse analysis of prefetching and eliminate redundancy. */ +- unsigned int add = parallel * ((*it).second.size () * (cost + regular)); ++ float reuse_times = calculate_reuse_times ((*it).second, loop_set, ++ bb_set, var_dim); ++ float add = parallel * reuse_times * regular; + level += add; + if (add && dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "%d : %d * (%ld * (%d + %d)) = %d\n", +- (*it).first, parallel, (*it).second.size (), cost, regular, add); ++ fprintf (dump_file, "%d : %d * %f * %d = %f\n", ++ (*it).first, parallel, reuse_times, regular, add); + } +- return level; ++ ++ bool within_llc_size = var_size > param_l2_cache_size / BYTE_CONVERT_RATIO ++ && var_size < VAR_SIZE_CACHE_CAPACITY ++ * param_llc_capacity_per_core; ++ ++ float final_level = has_write_op ? (level * WRITE_COST) : level; ++ final_level = within_llc_size ? (final_level * WITHIN_CACHE_SIZE_COST) ++ : final_level; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "final level : %d * %f * %d = %f\n", ++ has_write_op ? WRITE_COST : 1, level, ++ within_llc_size ? WITHIN_CACHE_SIZE_COST : 1, final_level); ++ return final_level; + } + + /* Comparison of reference reuse level. */ +@@ -2042,7 +2939,33 @@ calculate_reuse_level (std::map > &var_use) + bool + ref_group_reuse_cmp (const ref_group &a, const ref_group &b) + { +- return a.reuse_level > b.reuse_level; ++ if (a.reuse_level != b.reuse_level) ++ return a.reuse_level > b.reuse_level; ++ else ++ return get_name (a.var) < get_name (b.var); ++} ++ ++/* Dump key information of reference group and memory access for llc hint. */ ++ ++void ++dump_key_info_for_llc_hint (std::vector &ref_groups) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nLLC hint info:\n"); ++ fprintf (dump_file, "rank\tvar\t(lineno, direct, vectorized, write)\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ data_ref &mem_ref = ref_groups[i].first_use; ++ fprintf (dump_file, "\t(%d, %u, %u, %u)", ++ expand_location (mem_ref.stmt->location).line, ++ mem_ref.regular_p, mem_ref.vectorize_p, 1 - mem_ref.read_p); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } + } + + /* Sort reference groups. 
*/ +@@ -2057,13 +2980,15 @@ sort_ref_groups (std::vector &ref_groups, + for (std::map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { +- (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use); ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use, ++ (*it).second.dim, ++ (*it).second.var_size); + ref_groups.push_back ((*it).second); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); +- fprintf (dump_file, " : %d\n", (*it).second.reuse_level); ++ fprintf (dump_file, " : %f\n\n", (*it).second.reuse_level); + } + } + +@@ -2072,16 +2997,17 @@ sort_ref_groups (std::vector &ref_groups, + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nsorted ref_groups:\n"); +- fprintf (dump_file, "rank var (data_size, num_of_mem_ref, need_tmp_name):" +- " reuse_level_score\n"); ++ fprintf (dump_file, "rank\tvar\t(data_size, dim, num_of_mem_ref, " ++ "need_tmp_name): reuse_level_score\n"); + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { +- fprintf (dump_file, "%d ", i); ++ fprintf (dump_file, "%d\t", i); + print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); + int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0; +- fprintf (dump_file, " (%lf, %lu, %d)", ref_groups[i].var_size, +- ref_groups[i].ref_scores.size (), need_tmp_name); +- fprintf (dump_file, " : %d\n", ref_groups[i].reuse_level); ++ fprintf (dump_file, "\t(%lf, %u, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].dim, ref_groups[i].ref_scores.size (), ++ need_tmp_name); ++ fprintf (dump_file, " : %f\n", ref_groups[i].reuse_level); + } + fprintf (dump_file, "\n"); + +@@ -2101,6 +3027,7 @@ sort_ref_groups (std::vector &ref_groups, + } + fprintf (dump_file, "\n"); + } ++ dump_key_info_for_llc_hint (ref_groups); + } + + /* Attributes of variable data. */ +@@ -2126,19 +3053,27 @@ record_mem_ref (std::map &ref_groups, data_ref &mem_ref) + ref_group ref_group; + ref_group.var = mem_ref.var; + ref_group.first_use = mem_ref; ++ ref_group.first_calc_use = mem_ref; + ref_groups[mem_ref.var] = ref_group; + } + +- /* Ref_groups' calc_by depends on the inserted mem_ref's calc_by. +- Runtime issue requires the specified mem_ref's calc_by to be >= 1. +- Temporarily modified ref_group's first_use after sorting mem_refs. */ +- ref_groups[mem_ref.var].calc_by = std::max (ref_groups[mem_ref.var].calc_by, +- mem_ref.calc_by); ++ /* Ref_groups' calc_by reflects the highest order of calc_by that can be ++ achieved by all mem_ref of ref_groups. The first mem_ref that achieves ++ this order is defined to be `first_calc_use`. Later after sorting ++ mem_refs, calc_by will be replaced by the calc_by of `first_use`, and ++ even by the calc_by of `first_calc_use`. 
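++     The order follows the enum values printed in the dumps above
++     (UNHANDLE_CALC < RUNTIME_CALC < STATIC_CALC).  For example, a group
++     holding one RUNTIME_CALC and one STATIC_CALC ref keeps the
++     STATIC_CALC ref as `first_calc_use`; if the sorted `first_use` turns
++     out to be UNHANDLE_CALC and param_transfer_footprint is set, the
++     group falls back to `first_calc_use` (see `transfer_ft` below).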
*/ ++ if (mem_ref.calc_by > ref_groups[mem_ref.var].calc_by) ++ { ++ ref_groups[mem_ref.var].calc_by = mem_ref.calc_by; ++ ref_groups[mem_ref.var].first_calc_use = mem_ref; ++ } + ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, +- mem_ref.data_size); ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].dim = std::max (ref_groups[mem_ref.var].dim, ++ (unsigned int) mem_ref.loop_bounds.size ()); + ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); + +- ref_score ref_level{ mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ ref_score ref_level = { mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), + expand_location (mem_ref.stmt->location).line }; + ref_groups[mem_ref.var].ref_scores.push_back (ref_level); + +@@ -2165,7 +3100,7 @@ record_mem_ref (std::map &ref_groups, data_ref &mem_ref) + fprintf (dump_file, ", offset: "); + if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) + fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, +- int_cst_value (mem_ref.offset)); ++ int_cst_value (mem_ref.offset)); + else + print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); + fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); +@@ -2175,12 +3110,30 @@ record_mem_ref (std::map &ref_groups, data_ref &mem_ref) + } + } + +-/* Rank data reference index level by the scheme of source code line number. */ ++/* Rank data reference index level. */ + + bool +-data_ref_reuse_cmp (const ref_score &a, const ref_score &b) +-{ +- return a.line < b.line; ++best_insert_cmp (const ref_score &a, const ref_score &b) ++{ ++ /* NEXT STEP: We can also calculate gap using static/feedback info inferred ++ from historical maximum bb count: ++ gap = hist_max_bb_ct / (alpha * max (a.bb_ct, b.bb_ct)) + 1. ++ Also, bb count needs to be smoothed and scaled as divisor can be 0. ++ history maximum bb count can be obtained in Phase 4. */ ++ const float gap = 1; ++ if (a.d_ref.loop_depth != b.d_ref.loop_depth) ++ return a.d_ref.loop_depth > b.d_ref.loop_depth; ++ else if (a.d_ref.regular_p != b.d_ref.regular_p) ++ return a.d_ref.regular_p > b.d_ref.regular_p; ++ else if (abs (double (std::max (a.bb_count, b.bb_count) + 1) / ++ double (std::min (a.bb_count, b.bb_count) + 1) - 1) > gap) ++ return a.bb_count > b.bb_count; ++ else if (a.line != b.line) ++ return a.line < b.line; ++ else if (a.d_ref.read_p != b.d_ref.read_p) ++ return a.d_ref.read_p < b.d_ref.read_p; ++ else ++ return a.d_ref.vectorize_p > b.d_ref.vectorize_p; + } + + /* Sort data reference index level within one reference group in non-decreasing +@@ -2194,13 +3147,48 @@ sort_mem_ref_in_ref_group (std::map &ref_groups_map) + for (std::map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { +- std::vector &ref_scores = (*it).second.ref_scores; ++ ref_group &curr_ref_group = (*it).second; ++ std::vector &ref_scores = curr_ref_group.ref_scores; + std::stable_sort (ref_scores.begin (), ref_scores.end (), +- data_ref_reuse_cmp); ++ best_insert_cmp); + /* Update ref_group's first_use and calc_by with the first mem_ref after + sorting. */ +- (*it).second.first_use = (*it).second.ref_scores[0].d_ref; +- (*it).second.calc_by = (*it).second.first_use.calc_by; ++ curr_ref_group.first_use = curr_ref_group.ref_scores[0].d_ref; ++ curr_ref_group.calc_by = curr_ref_group.first_use.calc_by; ++ ++ /* When transferring footprint is enabled, it is allowed to transfer ++ the statically-calculated footprint of a mem_ref from the same ++ ref_group to `first_use` mem_ref. 
*/ ++ if (param_transfer_footprint ++ && curr_ref_group.first_use.calc_by == UNHANDLE_CALC) ++ { ++ if (curr_ref_group.first_calc_use.calc_by > RUNTIME_CALC) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, "\nfirst_use: "); ++ print_gimple_stmt (dump_file, curr_ref_group.first_use.stmt, ++ 0, TDF_LINENO); ++ fprintf (dump_file, "first_calc_use: "); ++ print_gimple_stmt (dump_file, ++ curr_ref_group.first_calc_use.stmt, ++ 0, TDF_LINENO); ++ } ++ ++ curr_ref_group.calc_by = curr_ref_group.first_calc_use.calc_by; ++ curr_ref_group.transfer_ft = 1; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, ": cannot transfer footprint to " ++ "first use mem_ref.\n"); ++ } ++ } ++ } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { +@@ -2211,6 +3199,9 @@ sort_mem_ref_in_ref_group (std::map &ref_groups_map) + fprintf (dump_file, "mem_ref_index %u: ", i); + print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, + TDF_LINENO); ++ fprintf (dump_file, "bb-%d ", ++ ref_scores[i].d_ref.stmt->bb->index); ++ fprintf (dump_file, "count %ld\n", ref_scores[i].bb_count); + } + fprintf (dump_file, "\n\n"); + } +@@ -2222,8 +3213,9 @@ sort_mem_ref_in_ref_group (std::map &ref_groups_map) + bool + record_and_sort_ref_groups (std::vector &ref_groups, + std::vector &kernels, +- std::map > &loop_refs) ++ std::map > &loop_refs, ++ std::set bb_pathset) + { + if (dump_file) + fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); +@@ -2232,7 +3224,7 @@ record_and_sort_ref_groups (std::vector &ref_groups, + + for (unsigned i = 0; i < kernels.size (); ++i) + { +- class loop* loop = kernels[i]; ++ class loop *loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + +@@ -2240,8 +3232,13 @@ record_and_sort_ref_groups (std::vector &ref_groups, + fprintf (dump_file, "loop header %d:\n", loop->header->index); + for (unsigned j = 0; j < loop_refs[loop].size (); ++j) + { +- if (loop_refs[loop][j].trace_status_p) +- record_mem_ref (ref_groups_map, loop_refs[loop][j]); ++ data_ref &mem_ref = loop_refs[loop][j]; ++ if (mem_ref.trace_status_p) ++ { ++ if (!param_filter_mode || (param_filter_mode ++ && bb_pathset.count (mem_ref.stmt->bb->index))) ++ record_mem_ref (ref_groups_map, mem_ref); ++ } + } + } + +@@ -2286,14 +3283,14 @@ issue_mask_prefetch (gimple *stmt) + fprintf (dump_file, "unhandled scene: target vect is null"); + return; + } +- HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi + (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); + tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, +- NULL, true, GSI_SAME_STMT); ++ NULL, true, GSI_SAME_STMT); + +- gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, +- 5, addr, scale, final_mask, target, prfop); ++ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, 5, addr, scale, ++ final_mask, target, prfop); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); + } +@@ -2324,15 +3321,15 @@ issue_mask_gather_prefetch (gimple *stmt) + fprintf (dump_file, "unhandled scene: target vect is null"); + return; + } +- HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi + 
(TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); + tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + +- gcall *call = gimple_build_call_internal +- (IFN_MASK_GATHER_PREFETCH, 7, addr, +- vec_offset, scale, zero, final_mask, target, prfop); ++ gcall *call = gimple_build_call_internal (IFN_MASK_GATHER_PREFETCH, 7, addr, ++ vec_offset, scale, zero, ++ final_mask, target, prfop); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); + } +@@ -2345,12 +3342,10 @@ issue_builtin_prefetch (data_ref &mem_ref) + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert prfm.\n"); + /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ +- gimple* stmt = mem_ref.stmt; +- tree dataref_ptr = mem_ref.base; +- tree data_idx = mem_ref.index; ++ gimple *stmt = mem_ref.stmt; ++ tree ref = mem_ref.ref; ++ + tree scale = mem_ref.step; +- tree offset = mem_ref.offset; +- /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + if (scale == NULL_TREE) + { +@@ -2361,30 +3356,16 @@ issue_builtin_prefetch (data_ref &mem_ref) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " +- "variable. Stop builtin_prefetch.\n\n"); ++ "variable. Stop builtin_prefetch.\n\n"); + return; + } + } + +- data_idx = data_idx ? data_idx : size_zero_node; +- data_idx = build1 (NOP_EXPR, TREE_TYPE (scale), data_idx); +- tree displacement = fold_build2 (MULT_EXPR, TREE_TYPE (scale), data_idx, +- scale); +- if (offset != NULL_TREE && TREE_CODE (offset) != TREE_CODE (size_zero_node)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: offset's TREE_TYPE is not integer_cst: " +- "%s\nStop builtin_prefetch.\n", +- get_tree_code_name (TREE_CODE (offset))); +- return; +- } +- offset = offset ? 
offset : size_zero_node; +- offset = build1 (NOP_EXPR, TREE_TYPE (scale), offset); +- dataref_ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr), +- dataref_ptr, offset); +- tree addr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr), +- dataref_ptr, displacement); +- HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi (scale); ++ tree addr = build_fold_addr_expr_with_type (ref, ptr_type_node); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), ++ true, NULL, true, GSI_SAME_STMT); ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset ++ * tree_to_uhwi (scale); + + addr = fold_build_pointer_plus_hwi (addr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, +@@ -2392,8 +3373,8 @@ issue_builtin_prefetch (data_ref &mem_ref) + /* __builtin_prefetch (_68, 0, 1); + 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality + (high means strong locality) */ +- gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), +- 3, addr, integer_zero_node, integer_one_node); ++ gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), 3, ++ addr, integer_zero_node, integer_one_node); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); + } +@@ -2412,8 +3393,7 @@ static_issue (std::vector &ref_groups, int num_issue_var) + data_ref mem_ref = ref_groups[i].first_use; + if (mem_ref.vectorize_p) + { +- enum internal_fn ifn_code = gimple_call_internal_fn +- (mem_ref.stmt); ++ enum internal_fn ifn_code = gimple_call_internal_fn (mem_ref.stmt); + if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) + issue_mask_prefetch (mem_ref.stmt); + else if (ifn_code == IFN_MASK_GATHER_LOAD) +@@ -2427,20 +3407,72 @@ static_issue (std::vector &ref_groups, int num_issue_var) + } + } + ++/* Check whether all loop bounds (niters) used for calculating the footprints ++ of previously-executed ref_groups are defined in a dominated bb to the ++ currentbranch bb, where the conditional expression requires the loop bound ++ info. */ ++ ++bool ++check_def_use_chain (std::vector &ref_groups, ++ basic_block &branch_header_bb, ++ std::vector &ref_group_idx) ++{ ++ for (std::vector::iterator it = ref_group_idx.begin (); ++ it != ref_group_idx.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (niters); ++ basic_block def_bb = gimple_bb (def_stmt); ++ /* Check dominator relationship of def bb and branch bb. */ ++ /* Case 1: Check whether the def bb is the single predecessor block ++ of header bb. */ ++ if (single_pred_p (branch_header_bb)) ++ { ++ basic_block branch_bb_prev = single_pred (branch_header_bb); ++ if (branch_bb_prev->index == def_bb->index) ++ continue; ++ } ++ /* Case 2: Check whether the branch bb is dominated by the def ++ bb. */ ++ if (!dominated_by_p (CDI_DOMINATORS, branch_header_bb, def_bb)) ++ return false; ++ } ++ } ++ return true; ++} ++ + /* Generate the stmts for calculating the size. Later we will consider nested + multi-branches scenarios and check more information of niters when it is + a COND_EXPR. 
*/ + + tree + calc_stmts_gen (std::vector &ref_groups, +- gimple_seq &cond_expr_stmt_list, int num_issue_var) +-{ +- /* Accumulated keep size. */ +- tree total_size = build_real_from_int_cst +- (double_type_node, integer_zero_node); +- for (int i = 0; i < num_issue_var; ++i) ++ gimple_seq &cond_expr_stmt_list, ++ basic_block branch_header_bb, ++ std::vector &ref_group_idx_curr, ++ std::vector &ref_group_idx_prev, tree &cumul_size) ++{ ++ /* Check whether the bbs of def stmt for footprint loop bounds dominates ++ the bb of new runtime branching conditional. */ ++ if (!check_def_use_chain (ref_groups, branch_header_bb, ref_group_idx_prev)) ++ return NULL_TREE; ++ ++ /* Accumulated allocation size. */ ++ for (std::vector::iterator it = ref_group_idx_curr.begin (); ++ it != ref_group_idx_curr.end (); ++it) + { +- data_ref &mem_ref = ref_groups[i].first_use; ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; + tree var = mem_ref.var; + for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) + { +@@ -2457,7 +3489,8 @@ calc_stmts_gen (std::vector &ref_groups, + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "WARNING: Cannot detect size unit " +- "(use 1 byte) for variable %s: ", get_name (var)); ++ "(use 1 byte) for variable %s: ", ++ get_name (var)); + print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } +@@ -2466,85 +3499,215 @@ calc_stmts_gen (std::vector &ref_groups, + unit = build1 (NOP_EXPR, TREE_TYPE (niters), unit); + tree size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, unit); + size = build1 (FLOAT_EXPR, double_type_node, size); +- total_size = fold_build2 +- (PLUS_EXPR, double_type_node, total_size, size); ++ cumul_size = fold_build2 (PLUS_EXPR, double_type_node, cumul_size, ++ size); + } ++ ref_group_idx_prev.push_back (*it); + } + /* Create a stmt list for size calculation. */ + tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); + div = build1 (NOP_EXPR, double_type_node, div); +- total_size = fold_build2 (RDIV_EXPR, double_type_node, total_size, div); ++ tree total_size = fold_build2 (RDIV_EXPR, double_type_node, cumul_size, div); + + tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), + param_llc_capacity_per_core / 2); + threshold = build_real_from_int_cst (double_type_node, threshold); +- tree cond_expr = fold_build2 +- (LE_EXPR, boolean_type_node, total_size, threshold); ++ tree cond_expr = fold_build2 (LE_EXPR, boolean_type_node, total_size, ++ threshold); + + /* Convert cond_expr to stmt list. */ + cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), +- &cond_expr_stmt_list, is_gimple_condexpr, NULL_TREE); ++ &cond_expr_stmt_list, is_gimple_condexpr, ++ NULL_TREE); + return cond_expr; + } + +-/* Runtime form insertion and issue instruction. */ ++/* Retrieve the least number of loops that cover all target mem_refs. ++ Try to merge loops that the mem_refs reside to a common superloop and ++ maintain a worklist which relates NEED-TO-COPY loops with the target mem ++ refs inside using the following criteria: ++ 1) If loop A is a superloop of loop B in the worklist, replace loop B with ++ loop A in the worklist, and attach all target mem_refs of loop B, ++ together with loop A's, to loop A. 
++ 2) If loop B in the worklist is a superloop of loop A, attach loop A's ++ target mem_ref to loop B. ++ 3) If loop A is not a superloop/subloop of loop B in the worklist, replace ++ loop B with their lowest common superloop C in the worklist, and attach ++ all target mem_refs of loop A and loop B to loop C. ++ 4) If loop A and loop B's lowest common superloop is function body ++ (loop 0), stop merging and maintain loop independence. */ + + void +-runtime_issue (std::vector &ref_groups, int num_issue_var) ++get_loop_worklist (std::vector &ref_groups, int num_issue_var, ++ std::map > &loop_worklist) + { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "runtime issue\n"); +- +- if (ref_groups.size () == 0) +- return; +- data_ref &mem_ref = ref_groups[0].first_use; +- class loop *loop = mem_ref.loop_bounds.back ().loop; +- /* Ensure that variables are in the same loop. */ +- for (int i = 1; i < num_issue_var; ++i) ++ for (int i = 0; i < num_issue_var; ++i) + { + data_ref &mem_ref = ref_groups[i].first_use; +- if (loop != mem_ref.loop_bounds.back ().loop) ++ class loop *loop_new = mem_ref.loop_bounds.front ().loop; ++ class loop *common_superloop = loop_new; ++ bool add_loop_worklist = false; ++ ++ /* Use greedy algorithm to merge loops to a common superloop that can ++ contain the current mem_refs. */ ++ std::map >::iterator it_tmp; ++ std::vector ref_group_idx_tmp; ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end (); ) + { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "topn var are not in the same loop\n"); +- return; ++ class loop *loop_old = it->first; ++ common_superloop = find_common_loop (loop_new, loop_old); ++ if (common_superloop == NULL || common_superloop->num == 0) ++ { ++ /* Stop merging two loops if there is no common superloop for ++ them except function body (loop 0). */ ++ if (common_superloop != NULL ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref_group %d's loop %d has no common " ++ "superloop with existing loop %d\n", ++ i, loop_new->num, loop_old->num); ++ } ++ ++it; ++ continue; ++ } ++ ++ if (common_superloop->num == loop_old->num) ++ { ++ /* If loop_old is the superloop of loop_new, add current ++ ref_group index to loop's worklist. */ ++ loop_worklist[common_superloop].push_back (i); ++ ++it; ++ } ++ else ++ { ++ /* If loop_old is not a superloop of loop_new, replace ++ loop_old with the common superloop. */ ++ it_tmp = it; ++ ++it_tmp; ++ ref_group_idx_tmp = it->second; ++ loop_worklist.erase (it); ++ it = it_tmp; ++ add_loop_worklist = true; ++ } ++ } ++ ++ if (loop_worklist.empty () || add_loop_worklist) ++ { ++ /* Update the new common superloop in loop_worklist. */ ++ std::vector &ref_groups_tmp = loop_worklist[common_superloop]; ++ ref_groups_tmp.push_back (i); ++ for (std::vector::iterator it = ref_group_idx_tmp.begin (); ++ it != ref_group_idx_tmp.end (); ++it) ++ ref_groups_tmp.push_back (*it); ++ std::sort (ref_groups_tmp.begin (), ref_groups_tmp.end ()); + } + } +- if (loop == NULL) +- return; + +- /* If the exit edge points to bb with multiple inputs, split the exit edge +- and create a new bb, make the exit edge point to bb only single input. 
*/ +- edge e = single_exit (loop); +- if (e == NULL) +- return; +- if (!single_pred_p (e->dest)) ++ if (dump_file && (dump_flags & TDF_DETAILS)) + { +- split_loop_exit_edge (e, true); +- if (dump_enabled_p ()) +- dump_printf (MSG_NOTE, "split exit edge\n"); ++ fprintf (dump_file, "runtime loop list:\n"); ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end (); ++it) ++ { ++ fprintf (dump_file, "loop %d:", it->first->num); ++ for (std::vector::iterator idx_it = it->second.begin (); ++ idx_it != it->second.end (); ++idx_it) ++ { ++ fprintf (dump_file, " %d", *idx_it); ++ } ++ fprintf (dump_file, "\n"); ++ } + } ++} + +- gimple_seq cond_expr_stmt_list = NULL; +- tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list, +- num_issue_var); ++/* Runtime form insertion and issue instruction. */ ++ ++void ++runtime_issue (std::vector &ref_groups, int num_issue_var, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "runtime issue\n"); + +- /* Use the previous cond and generate a new branch and copy loop. */ +- basic_block condition_bb = NULL; +- profile_probability prob = profile_probability::likely (); +- initialize_original_copy_tables (); +- class loop *nloop = loop_version (loop, cond_expr, &condition_bb, +- prob, prob.invert (), prob, prob.invert (), true); +- free_original_copy_tables (); ++ /* It is possible that the loop father of some mem_ref's bb may contain the ++ loop fathers of the others. Therefore, we intend to only copy loops ++ without inclusion relationship. */ ++ std::map > loop_worklist; ++ get_loop_worklist (ref_groups, num_issue_var, loop_worklist); ++ bool get_first_ref_group = false; ++ std::vector ref_group_idx_prev; ++ ++ /* NEXT STEP: Multiple loop copies (possibly nested within one loop can cost ++ front-end bound due to branching within loop), we need to set up a ++ threshold such that we may compensate this time cost by space cost ++ in binary (copying outer loop). */ ++ tree cumul_size = build_real_from_int_cst (double_type_node, ++ integer_zero_node); ++ for (std::vector::iterator it = sorted_kernels.begin (); ++ it != sorted_kernels.end (); ++it) ++ { ++ /* Start runtime branching until finding the first ref_group's loop. ++ Skip any ref_groups if their `first_use` mem_refs are executed ++ before the mem_ref of the first ref_group. */ ++ class loop *loop = *it; ++ if (!loop_worklist.count (loop) ++ || (!get_first_ref_group && loop_worklist[loop][0] != 0)) ++ continue; + +- /* Insert the generated stmt list before cond_expr. */ +- gimple_stmt_iterator cond_exp_gsi; +- if (cond_expr_stmt_list) +- { +- cond_exp_gsi = gsi_last_bb (condition_bb); +- gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, +- GSI_SAME_STMT); ++ std::vector ref_group_idx_curr = loop_worklist[loop]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "copy loop num: %d\n", loop->num); ++ } ++ /* If the exit edge points to bb with multiple inputs, split the exit ++ edge and create a new bb, make the exit edge point to bb with only ++ single input. */ ++ edge e = single_exit (loop); ++ if (e == NULL) ++ return; ++ if (!single_pred_p (e->dest)) ++ { ++ split_loop_exit_edge (e, true); ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "split exit edge\n"); ++ } ++ ++ /* After updating SSA, we are not sure whether the gimple_seq stmt list ++ is initialized and unchanged during iterations. Therefore, we need to ++ recreate this stmt list for every loop copy. 
*/ ++ gimple_seq cond_expr_stmt_list = NULL; ++ tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list, ++ loop->header, ref_group_idx_curr, ++ ref_group_idx_prev, cumul_size); ++ if (cond_expr == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "incalculable variables for conditional\n"); ++ return; ++ } ++ ++ /* Use the previous cond and generate a new branch and copy loop. */ ++ basic_block condition_bb = NULL; ++ profile_probability prob = profile_probability::likely (); ++ initialize_original_copy_tables (); ++ class loop *nloop = loop_version (loop, cond_expr, &condition_bb, ++ prob, prob.invert (), prob, ++ prob.invert (), true); ++ free_original_copy_tables (); ++ ++ /* Insert the generated stmt list before cond_expr. */ ++ gimple_stmt_iterator cond_exp_gsi; ++ if (cond_expr_stmt_list) ++ { ++ /* Function `gsi_insert_seq_before` will insert `cond_expr` (1st ++ stmt) of `condition_bb` to the end of `cond_expr_stmt_list`. */ ++ cond_exp_gsi = gsi_last_bb (condition_bb); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } + } ++ + update_ssa (TODO_update_ssa); + + /* Perform hint issue for branches that meet conditions. */ +@@ -2554,33 +3717,33 @@ runtime_issue (std::vector &ref_groups, int num_issue_var) + /* Issue llc hints through prefetch instructions. */ + + void +-issue_llc_hint (std::vector &ref_groups) ++issue_llc_hint (std::vector &ref_groups, ++ std::vector &sorted_kernels) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "issue_llc_hint:\n"); + +- /* 1. If the issue-topn and force-issue options are available, top N var is ++ /* 1) If the issue-topn and force-issue options are available, top N var is + forcibly allocated and no runtime branch is generated. +- 2. If the issue-topn option is available and the size of top N var is ++ 2) If the issue-topn option is available and the size of top N var is + statically known, top N is statically allocated and no runtime branch + is generated. +- 3. If the issue-topn option is available and the size of the top N var is ++ 3) If the issue-topn option is available and the size of the top N var is + unknown, but them is dynamically known, the top N is dynamically + allocated and generate runtime branches. (also depends on the screening + of the innermost variable boundary type) +- 4. If the dynamic runtime cannot know the size, such as indirect access, ++ 4) If the dynamic runtime cannot know the size, such as indirect access, + optimization is skipped. 
+ */ +- if (ref_groups.size () == 0) ++ int num_issue_var = std::min (param_issue_topn, (int) ref_groups.size ()); ++ if (num_issue_var == 0) + return; + +- int num_issue_var = std::min (param_issue_topn, +- static_cast(ref_groups.size ())); + if (num_issue_var < param_issue_topn + && dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "WARNING: Only %u (less than param_issue_topn = %d) " +- "ref_group(s) is found for llc hint.\n", ++ "ref_group(s) is found for llc hint.\n", + num_issue_var, param_issue_topn); + } + if (param_force_issue) +@@ -2599,16 +3762,20 @@ issue_llc_hint (std::vector &ref_groups) + double prefetch_data_size = 0.; + for (int i = 0; i < num_issue_var; ++i) + prefetch_data_size += ref_groups[i].var_size; +- if (prefetch_data_size <= (double) param_llc_capacity_per_core * 0.8) ++ ++ if (prefetch_data_size <= (double) param_llc_capacity_per_core ++ * PREFETCH_CACHE_SIZE_RATIO) + static_issue (ref_groups, num_issue_var); + else + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache " +- "size: %lf > %lf.\n", prefetch_data_size, +- (double) param_llc_capacity_per_core * 0.8); ++ "size: %lf > %lf.\n", ++ prefetch_data_size, ++ (double) param_llc_capacity_per_core ++ * PREFETCH_CACHE_SIZE_RATIO); + } + else if (topn_calc_type == RUNTIME_CALC) +- runtime_issue (ref_groups, num_issue_var); ++ runtime_issue (ref_groups, num_issue_var, sorted_kernels); + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -2629,20 +3796,44 @@ llc_allocate (void) + return; + + std::set traced_ref_stmt; +- trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt); ++ std::vector unresolved_refs; ++ trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt, ++ unresolved_refs); + +- if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt)) ++ if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt, ++ unresolved_refs)) + return; + +- std::vector sorted_kernels; +- if (!filter_and_sort_kernels (sorted_kernels, kernels)) +- return; ++ retrace_loop_refs_info_unresolved (unresolved_refs, traced_ref_stmt); + ++ std::vector sorted_kernels; + std::vector ref_groups; +- if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs)) +- return; ++ if (param_filter_mode) ++ { ++ /* AutoFDO mode: include ENTRY bb and EXIT bb indices. */ ++ std::set bb_pathset; ++ bb_pathset.insert (0); ++ bb_pathset.insert (1); ++ if (!filter_and_sort_kernels_feedback (sorted_kernels, bb_pathset)) ++ return; ++ ++ if (!record_and_sort_ref_groups (ref_groups, kernels, kernels_refs, ++ bb_pathset)) ++ return; ++ } ++ else ++ { ++ /* static mode */ ++ std::set bb_pathset; ++ if (!filter_and_sort_kernels (sorted_kernels, kernels)) ++ return; ++ ++ if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs, ++ bb_pathset)) ++ return; ++ } + +- issue_llc_hint (ref_groups); ++ issue_llc_hint (ref_groups, sorted_kernels); + } + + /* Check whether the function is an operator reloading function. */ +@@ -2747,6 +3938,47 @@ dump_param (void) + } + } + ++/* Determine whether to analyze the function according to ++ the ordering of functions containing cycle counts. 
*/ ++ ++static bool ++should_analyze_func_p (void) ++{ ++ gcov_type decl_uid = DECL_UID (current_function_decl); ++ gcov_type func_count = event_get_func_count (decl_uid, PMU_EVENT); ++ if (func_count == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld cannot find profile data " ++ "and skip prefetch analysis\n", ++ decl_uid); ++ } ++ return false; ++ } ++ if (func_count < PREFETCH_FUNC_COUNTS_THRESHOLD) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld total counts is %lu: " ++ "counts %lu < perf's top %d threshold %u, " ++ "skip prefetch analysis\n", ++ decl_uid, func_count, func_count, ++ PREFETCH_FUNC_TOPN, PREFETCH_FUNC_COUNTS_THRESHOLD); ++ } ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld total counts is %lu: " ++ "counts %lu >= perf's top %d threshold %u, " ++ "continue prefetch analysis\n", ++ decl_uid, func_count, func_count, ++ PREFETCH_FUNC_TOPN, PREFETCH_FUNC_COUNTS_THRESHOLD); ++ } ++ return true; ++} ++ + const pass_data pass_data_llc_allocate = + { + GIMPLE_PASS, /* type. */ +@@ -2807,6 +4039,18 @@ pass_llc_allocate::execute (function *fn) + || operator_func_p (fn)) + return ret; + ++ /* Filter only when combined with PMU event. When the should_analyze_func_p ++ analysis fails (for example, the function without PMU-event count), ++ in order to ensure the accuracy of the LLC allocation analysis, the ++ function does not perform native allocation processing. */ ++ if (profile_exist (PMU_EVENT)) ++ { ++ if (!should_analyze_func_p ()) ++ { ++ return 0; ++ } ++ } ++ + dump_function_info (fn); + + llc_allocate (); +-- +2.33.0 + diff --git a/0193-Add-prefetch-level-parameter-to-specify-the-last-lev.patch b/0193-Add-prefetch-level-parameter-to-specify-the-last-lev.patch new file mode 100644 index 0000000..109ae19 --- /dev/null +++ b/0193-Add-prefetch-level-parameter-to-specify-the-last-lev.patch @@ -0,0 +1,827 @@ +From 599d6f94c11fd906cfbabbd7ba4e5e2e5642cac9 Mon Sep 17 00:00:00 2001 +From: yzyssdd +Date: Tue, 28 May 2024 10:43:20 +0800 +Subject: [PATCH 2/2] Add prefetch level parameter to specify the last level + cache. 
Add l4 inst and deja case + +--- + gcc/builtins.c | 82 +++++++++++++++++++ + gcc/builtins.def | 1 + + gcc/config/aarch64/aarch64-protos.h | 6 +- + gcc/config/aarch64/aarch64.md | 39 +++++++++ + gcc/dce.c | 1 + + gcc/hsa-gen.c | 4 +- + gcc/ipa-pure-const.c | 1 + + gcc/params.opt | 5 ++ + gcc/print-rtl.c | 6 ++ + gcc/rtl.def | 9 ++ + gcc/rtl.h | 4 + + gcc/rtlanal.c | 2 + + gcc/sched-deps.c | 4 +- + gcc/target-insns.def | 1 + + .../llc-prefetch-full-pldl1keep.c | 15 ++++ + .../llc-prefetch-full-pldl1strm.c | 15 ++++ + .../llc-prefetch-full-pldl2keep.c | 15 ++++ + .../llc-prefetch-full-pldl2strm.c | 15 ++++ + .../llc-prefetch-full-pldl3keep.c | 15 ++++ + .../llc-prefetch-full-pldl3strm.c | 15 ++++ + .../llc-prefetch-full-pldl4keep.c | 15 ++++ + .../llc-prefetch-full-pldl4strm.c | 15 ++++ + .../llc-prefetch-full-pstl1keep.c | 15 ++++ + .../llc-prefetch-full-pstl1strm.c | 15 ++++ + .../llc-prefetch-full-pstl2keep.c | 15 ++++ + .../llc-prefetch-full-pstl2strm.c | 15 ++++ + .../llc-prefetch-full-pstl3keep.c | 15 ++++ + .../llc-prefetch-full-pstl3strm.c | 15 ++++ + .../llc-prefetch-full-pstl4keep.c | 15 ++++ + .../llc-prefetch-full-pstl4strm.c | 15 ++++ + gcc/tree-ssa-llc-allocate.c | 54 ++++++++++-- + 31 files changed, 449 insertions(+), 10 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c + +diff --git a/gcc/builtins.c b/gcc/builtins.c +index 1b1c75cc1..ffbb2cae9 100644 +--- a/gcc/builtins.c ++++ b/gcc/builtins.c +@@ -1463,6 +1463,85 @@ expand_builtin_prefetch (tree exp) + emit_insn (op0); + } + ++/* Expand a call to __builtin_prefetch_full. */ ++ ++static void ++expand_builtin_prefetch_full (tree exp) ++{ ++ tree arg0, arg1, arg2; ++ int nargs; ++ rtx op0, op1, op2; ++ ++ if (!validate_arglist (exp, POINTER_TYPE, 0)) ++ return; ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ ++ /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to ++ zero (read) and argument 2 (locality) defaults to 3 (high degree of ++ locality). */ ++ nargs = call_expr_nargs (exp); ++ if (nargs > 1) ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ else ++ arg1 = integer_zero_node; ++ if (nargs > 2) ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ else ++ arg2 = integer_three_node; ++ ++ /* Argument 0 is an address. 
*/ ++ op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); ++ ++ /* Argument 1 (read/write flag) must be a compile-time constant int. */ ++ if (TREE_CODE (arg1) != INTEGER_CST) ++ { ++ error ("second argument to %<__builtin_prefetch_full%> must be a " ++ "constant"); ++ arg1 = integer_zero_node; ++ } ++ op1 = expand_normal (arg1); ++ /* Argument 1 must be either zero or one. */ ++ if (INTVAL (op1) != 0 && INTVAL (op1) != 1) ++ { ++ warning (0, "invalid second argument to %<__builtin_prefetch_full%>;" ++ " using zero"); ++ op1 = const0_rtx; ++ } ++ ++ /* Argument 2 (locality) must be a compile-time constant int. */ ++ if (TREE_CODE (arg2) != INTEGER_CST) ++ { ++ error ("third argument to %<__builtin_prefetch_full%> must be a " ++ "constant"); ++ arg2 = integer_zero_node; ++ } ++ op2 = expand_normal (arg2); ++ /* Argument 2 must be 0-7. */ ++ if (INTVAL (op2) < 0 || INTVAL (op2) > 7) ++ { ++ warning (0, "invalid third argument to %<__builtin_prefetch_full%>; " ++ "using zero"); ++ op2 = const0_rtx; ++ } ++ ++ if (targetm.have_prefetch_full ()) ++ { ++ class expand_operand ops[3]; ++ ++ create_address_operand (&ops[0], op0); ++ create_integer_operand (&ops[1], INTVAL (op1)); ++ create_integer_operand (&ops[2], INTVAL (op2)); ++ if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops)) ++ return; ++ } ++ ++ /* Don't do anything with direct references to volatile memory, but ++ generate code to handle other side effects. */ ++ if (!MEM_P (op0) && side_effects_p (op0)) ++ emit_insn (op0); ++} ++ + /* Get a MEM rtx for expression EXP which is the address of an operand + to be used in a string instruction (cmpstrsi, cpymemsi, ..). LEN is + the maximum length of the block of memory that might be accessed or +@@ -8386,6 +8465,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + case BUILT_IN_PREFETCH: + expand_builtin_prefetch (exp); + return const0_rtx; ++ case BUILT_IN_PREFETCH_FULL: ++ expand_builtin_prefetch_full (exp); ++ return const0_rtx; + + case BUILT_IN_INIT_TRAMPOLINE: + return expand_builtin_init_trampoline (exp, true); +diff --git a/gcc/builtins.def b/gcc/builtins.def +index ee67ac15d..b89cec11f 100644 +--- a/gcc/builtins.def ++++ b/gcc/builtins.def +@@ -927,6 +927,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C + DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) + DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) + DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) ++DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) + DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) +diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +index 1a4fc2028..c8388f902 100644 +--- a/gcc/config/aarch64/aarch64-protos.h ++++ b/gcc/config/aarch64/aarch64-protos.h +@@ -455,12 +455,16 @@ extern struct tune_params aarch64_tune_params; + T (PLDL2STRM, pldl2strm, 3) \ + T (PLDL3KEEP, pldl3keep, 4) \ + T (PLDL3STRM, pldl3strm, 5) \ ++ T (PLDL4KEEP, pldl4keep, 6) \ ++ T (PLDL4STRM, pldl4strm, 7) \ + T (PSTL1KEEP, pstl1keep, 8) \ + T 
(PSTL1STRM, pstl1strm, 9) \ + T (PSTL2KEEP, pstl2keep, 10) \ + T (PSTL2STRM, pstl2strm, 11) \ + T (PSTL3KEEP, pstl3keep, 12) \ +- T (PSTL3STRM, pstl3strm, 13) ++ T (PSTL3STRM, pstl3strm, 13) \ ++ T (PSTL4KEEP, pstl4keep, 14) \ ++ T (PSTL4STRM, pstl4strm, 15) + + #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, + enum aarch64_svpattern { +diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +index 38af8d000..2ec1c5d19 100644 +--- a/gcc/config/aarch64/aarch64.md ++++ b/gcc/config/aarch64/aarch64.md +@@ -831,6 +831,45 @@ + [(set_attr "type" "load_4")] + ) + ++(define_insn "prefetch_full" ++ [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp") ++ (match_operand:QI 1 "const_int_operand" "") ++ (match_operand:QI 2 "const_int_operand" ""))] ++ "" ++ { ++ const char * pftype[2][8] = ++ { ++ {"prfm\\tPLDL1KEEP, %0", ++ "prfm\\tPLDL1STRM, %0", ++ "prfm\\tPLDL2KEEP, %0", ++ "prfm\\tPLDL2STRM, %0", ++ "prfm\\tPLDL3KEEP, %0", ++ "prfm\\tPLDL3STRM, %0", ++ "prfm\\tPLDL4KEEP, %0", ++ "prfm\\tPLDL4STRM, %0"}, ++ {"prfm\\tPSTL1KEEP, %0", ++ "prfm\\tPSTL1STRM, %0", ++ "prfm\\tPSTL2KEEP, %0", ++ "prfm\\tPSTL2STRM, %0", ++ "prfm\\tPSTL3KEEP, %0", ++ "prfm\\tPSTL3STRM, %0", ++ "prfm\\tPSTL4KEEP, %0", ++ "prfm\\tPSTL4STRM, %0"}, ++ }; ++ ++ int prfop = INTVAL (operands[2]); ++ ++ gcc_assert (IN_RANGE (prfop, 0, 7)); ++ ++ /* PRFM accepts the same addresses as a 64-bit LDR so wrap ++ the address into a DImode MEM so that aarch64_print_operand knows ++ how to print it. */ ++ operands[0] = gen_rtx_MEM (DImode, operands[0]); ++ return pftype[INTVAL(operands[1])][prfop]; ++ } ++ [(set_attr "type" "load_4")] ++) ++ + (define_insn "trap" + [(trap_if (const_int 1) (const_int 8))] + "" +diff --git a/gcc/dce.c b/gcc/dce.c +index a6a1599b5..aaa63b63a 100644 +--- a/gcc/dce.c ++++ b/gcc/dce.c +@@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body) + switch (GET_CODE (body)) + { + case PREFETCH: ++ case PREFETCH_FULL: + case TRAP_IF: + /* The UNSPEC case was added here because the ia-64 claims that + USEs do not work after reload and generates UNSPECS rather +diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c +index 767badab6..c121aee8d 100644 +--- a/gcc/hsa-gen.c ++++ b/gcc/hsa-gen.c +@@ -5309,7 +5309,8 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) + + /* Prefetch pass can create type-mismatching prefetch builtin calls which + fail the gimple_call_builtin_p test above. Handle them here. 
*/ +- if (fndecl_built_in_p (function_decl, BUILT_IN_PREFETCH)) ++ if (fndecl_built_in_p (function_decl, BUILT_IN_PREFETCH) ++ || fndecl_built_in_p (function_decl, BUILT_IN_PREFETCH_FULL)) + return; + + if (hsa_callable_function_p (function_decl)) +@@ -5723,6 +5724,7 @@ gen_hsa_insns_for_call (gimple *stmt, hsa_bb *hbb) + break; + } + case BUILT_IN_PREFETCH: ++ case BUILT_IN_PREFETCH_FULL: + break; + default: + { +diff --git a/gcc/ipa-pure-const.c b/gcc/ipa-pure-const.c +index 564c6629c..0dc8e60a8 100644 +--- a/gcc/ipa-pure-const.c ++++ b/gcc/ipa-pure-const.c +@@ -534,6 +534,7 @@ special_builtin_state (enum pure_const_state_e *state, bool *looping, + *state = IPA_CONST; + return true; + case BUILT_IN_PREFETCH: ++ case BUILT_IN_PREFETCH_FULL: + *looping = true; + *state = IPA_CONST; + return true; +diff --git a/gcc/params.opt b/gcc/params.opt +index 0c9a270b4..f128ae6a4 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1059,6 +1059,11 @@ Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Par + Maximum number of outer loops allowed to extend outer loops for loops that + cannot recognize inner loop boundaries. + ++-param=llc-level= ++Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) ++Param Optimization ++Specifies the HBM cache level. ++ + -param=filter-mode= + Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param + Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. +diff --git a/gcc/print-rtl.c b/gcc/print-rtl.c +index 611ea079c..4443caf4a 100644 +--- a/gcc/print-rtl.c ++++ b/gcc/print-rtl.c +@@ -1549,6 +1549,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose) + op[1] = XEXP (x, 1); + op[2] = XEXP (x, 2); + break; ++ case PREFETCH_FULL: ++ fun = "prefetch_full"; ++ op[0] = XEXP (x, 0); ++ op[1] = XEXP (x, 1); ++ op[2] = XEXP (x, 2); ++ break; + case UNSPEC: + case UNSPEC_VOLATILE: + { +diff --git a/gcc/rtl.def b/gcc/rtl.def +index 9754333ea..30fd1cf81 100644 +--- a/gcc/rtl.def ++++ b/gcc/rtl.def +@@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA) + whose prefetch instructions do not support them. */ + DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA) + ++/* Memory prefetch, with attributes supported on some targets. ++ Operand 1 is the address of the memory to fetch. ++ Operand 2 is 1 for a write access, 0 otherwise. ++ Operand 3 is the level of prfop. ++ ++ The attributes specified by operands 2 and 3 are ignored for targets ++ whose prefetch instructions do not support them. */ ++DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA) ++ + /* ---------------------------------------------------------------------- + At the top level of an instruction (perhaps under PARALLEL). + ---------------------------------------------------------------------- */ +diff --git a/gcc/rtl.h b/gcc/rtl.h +index b29afca8d..fbcd05562 100644 +--- a/gcc/rtl.h ++++ b/gcc/rtl.h +@@ -2804,6 +2804,10 @@ do { \ + #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \ + (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil) + ++/* True if RTX is flagged to be a scheduling barrier. */ ++#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \ ++ (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL)->volatil) ++ + /* Indicate whether the machine has any sort of auto increment addressing. + If not, we can avoid checking for REG_INC notes. 
*/ + +diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c +index 0ebde7622..63bf1bf58 100644 +--- a/gcc/rtlanal.c ++++ b/gcc/rtlanal.c +@@ -1195,6 +1195,7 @@ reg_referenced_p (const_rtx x, const_rtx body) + return reg_overlap_mentioned_p (x, TRAP_CONDITION (body)); + + case PREFETCH: ++ case PREFETCH_FULL: + return reg_overlap_mentioned_p (x, XEXP (body, 0)); + + case UNSPEC: +@@ -2007,6 +2008,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data) + return; + + case PREFETCH: ++ case PREFETCH_FULL: + (*fun) (&XEXP (body, 0), data); + return; + +diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c +index 331af5ffd..cb5a64ed9 100644 +--- a/gcc/sched-deps.c ++++ b/gcc/sched-deps.c +@@ -2720,7 +2720,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn) + break; + + case PREFETCH: +- if (PREFETCH_SCHEDULE_BARRIER_P (x)) ++ case PREFETCH_FULL: ++ if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x)) ++ || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x))) + reg_pending_barrier = TRUE_BARRIER; + /* Prefetch insn contains addresses only. So if the prefetch + address has no registers, there will be no dependencies on +diff --git a/gcc/target-insns.def b/gcc/target-insns.def +index 4d7eb92cf..e80361f0a 100644 +--- a/gcc/target-insns.def ++++ b/gcc/target-insns.def +@@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) + DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) ++DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (probe_stack, (rtx x0)) + DEF_TARGET_INSN (probe_stack_address, (rtx x0)) + DEF_TARGET_INSN (prologue, (void)) +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +new file mode 100644 +index 000000000..c0fa2db2f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +new file mode 100644 +index 000000000..bcd1113d1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +new file mode 100644 +index 000000000..46702bfbc +--- /dev/null ++++ 
b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +new file mode 100644 +index 000000000..e359ad178 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,3); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +new file mode 100644 +index 000000000..0a9dae090 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,4); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +new file mode 100644 +index 000000000..58db40ba1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +new file mode 100644 +index 000000000..6f6b7bbd4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { 
scan-assembler "PLDL4KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +new file mode 100644 +index 000000000..b69b4a5e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +new file mode 100644 +index 000000000..f5a474eb5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +new file mode 100644 +index 000000000..6798824a9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +new file mode 100644 +index 000000000..c19fcc830 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +new file mode 100644 +index 000000000..dde160a28 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno 
--param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,3); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +new file mode 100644 +index 000000000..fa698243d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,4); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL3KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +new file mode 100644 +index 000000000..653f7786e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL3STRM" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +new file mode 100644 +index 000000000..16a3b6552 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4KEEP" } } */ ++ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +new file mode 100644 +index 000000000..60d671bf5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +@@ -0,0 +1,15 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4STRM" } } */ ++ +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +index 107d5da26..75501f41c 100644 +--- a/gcc/tree-ssa-llc-allocate.c ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -3271,8 +3271,19 @@ issue_mask_prefetch 
(gimple *stmt) + target = gimple_call_arg (stmt, 3); + else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) + target = gimple_call_lhs (stmt); +- /* 4: PLDL3KEEP. */ +- tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) ++ /* for simulation, 4: PLDL3KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ else if (param_llc_level == 4) ++ /* 6: PLDL4KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } + + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); +@@ -3310,9 +3321,19 @@ issue_mask_gather_prefetch (gimple *stmt) + tree scale = gimple_call_arg (stmt, 2); + tree zero = gimple_call_arg (stmt, 3); + tree final_mask = gimple_call_arg (stmt, 4); +- tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); +- tree target = gimple_call_lhs (stmt); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) // for simulation ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP ++ else if (param_llc_level == 4) ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } + ++ tree target = gimple_call_lhs (stmt); + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + if (target == NULL_TREE) +@@ -3373,8 +3394,27 @@ issue_builtin_prefetch (data_ref &mem_ref) + /* __builtin_prefetch (_68, 0, 1); + 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality + (high means strong locality) */ +- gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), 3, +- addr, integer_zero_node, integer_one_node); ++ gcall *call = NULL; ++ if (param_llc_level == 3) ++ { ++ /* for simulation. ++ BUILT_IN_PREFETCH (addr, rw, locality). */ ++ call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, addr, integer_zero_node, integer_one_node); ++ } ++ else if (param_llc_level == 4) ++ { ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), ++ 3, addr, integer_zero_node, prfop); ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); + } +@@ -3724,7 +3764,7 @@ issue_llc_hint (std::vector &ref_groups, + fprintf (dump_file, "issue_llc_hint:\n"); + + /* 1) If the issue-topn and force-issue options are available, top N var is +- forcibly allocated and no runtime branch is generated. ++ forcibly allocated then no runtime branch is generated. + 2) If the issue-topn option is available and the size of top N var is + statically known, top N is statically allocated and no runtime branch + is generated. +-- +2.33.0 + diff --git a/gcc.spec b/gcc.spec index 80a045f..60a92c9 100644 --- a/gcc.spec +++ b/gcc.spec @@ -61,7 +61,7 @@ Summary: Various compilers (C, C++, Objective-C, ...) 
 Name: gcc
 Version: %{gcc_version}
-Release: 54
+Release: 55
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
 URL: https://gcc.gnu.org
 
@@ -297,6 +297,10 @@ Patch186: 0186-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch
 Patch187: 0187-Add-IPA-prefetch-test.patch
 Patch188: 0188-Fix-fails-in-ICP-for-src-openEuler-gcc-I90P7M-I91CZ8.patch
 Patch189: 0189-Add-hip11-CPU-pipeline-scheduling.patch
+Patch190: 0190-sync-LLC-difference-between-source-and-patch-code.patch
+Patch191: 0191-LLC-Allocation-Bugfix-Fix-ambiguous-reference-due-to.patch
+Patch192: 0192-Add-feedback-directed-filter_and_sort_kernels-in-Pha.patch
+Patch193: 0193-Add-prefetch-level-parameter-to-specify-the-last-lev.patch
 
 %global gcc_target_platform %{_arch}-linux-gnu
 
@@ -939,6 +943,10 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch187 -p1
 %patch188 -p1
 %patch189 -p1
+%patch190 -p1
+%patch191 -p1
+%patch192 -p1
+%patch193 -p1
 
 %build
 
@@ -2973,6 +2981,12 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed May 29 2024 yzyssdd - 10.3.1-55
+- Type:SPEC
+- ID:NA
+- SUG:NA
+- DESC: Add feedback-directed LLC allocation and support for LLC prefetch instructions
+
 * Thu Mar 14 2024 chenyuanfeng - 10.3.1-54
 - Type: Spec
 - ID:NA
-- 
Gitee
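
Reviewer note (not part of the patch series): the new llc-prefetch-full-*.c tests
above show the intended use of the __builtin_prefetch_full builtin provided by
patches 0190-0193. The stand-alone sketch below simply mirrors those tests; it
assumes a GCC built with these patches and the options the tests pass
(-O3 -march=armv8.2-a+sve -fllc-allocate on aarch64-linux), and the mapping of
the third argument to prefetch operations (3 -> PSTL2STRM, 4 -> PSTL3KEEP,
5 -> PSTL3STRM, 6 -> PSTL4KEEP, 7 -> PSTL4STRM) is taken from the dg-final
scan-assembler patterns in those tests, not from upstream GCC.

  /* Sketch only: compiles solely with the patched compiler described above.  */
  int val[100000];

  int
  main (void)
  {
    for (int i = 0; i < 100000; i++)
      {
        /* First argument: address to prefetch; second: 1, which the tests
           pair with store (PSTL*) prefetches; third: prefetch operation,
           6 is expected to select PSTL4KEEP on the patched compiler.  */
        __builtin_prefetch_full (&val[i], 1, 6);
        val[i] = i + 1;
      }
    return 0;
  }

Compiling this with the patched gcc and the options above and inspecting the
generated assembly (-S) should show the corresponding PSTL* prefetch
instruction, which is exactly what the scan-assembler directives in the new
tests check for.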