From ad6e562db9402d171babf94d32c4f24d33ddf661 Mon Sep 17 00:00:00 2001 From: chenhong Date: Tue, 18 Feb 2025 19:11:58 +0800 Subject: [PATCH] add llc feature --- gcc/Makefile.in | 1 + gcc/auto-profile.cc | 490 +- gcc/auto-profile.h | 34 + gcc/builtins.cc | 82 + gcc/builtins.def | 1 + gcc/cfgloop.h | 3 + gcc/common.opt | 28 + gcc/config/aarch64/aarch64-protos.h | 30 +- gcc/config/aarch64/aarch64-sve.md | 48 +- gcc/config/aarch64/aarch64.cc | 17 + gcc/config/aarch64/aarch64.md | 39 + gcc/dce.cc | 1 + gcc/doc/tm.texi | 21 + gcc/doc/tm.texi.in | 6 + gcc/internal-fn.cc | 122 + gcc/internal-fn.def | 4 + gcc/ipa-pure-const.cc | 1 + gcc/optabs.def | 2 + gcc/opts.cc | 40 +- gcc/params.opt | 80 + gcc/passes.def | 2 + gcc/print-rtl.cc | 6 + gcc/rtl.def | 9 + gcc/rtl.h | 5 + gcc/rtlanal.cc | 2 + gcc/sched-deps.cc | 4 +- gcc/target-insns.def | 1 + gcc/target.def | 31 + .../g++.dg/llc-allocate/llc-allocate.exp | 27 + .../llc-allocate/llc-relion-expand-kernels.C | 52 + .../g++.dg/llc-allocate/multidim_array.h | 186 + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + .../llc-allocate/llc-cross-bb-indir-mem-acc.c | 36 + .../llc-allocate/llc-extend-outer-loop.c | 61 + .../llc-feedback-branch-in-loop.c | 39 + .../llc-allocate/llc-feedback-break-in-loop.c | 41 + .../llc-allocate/llc-feedback-goto-in-loop.c | 50 + .../llc-feedback-same-loop-cycle.c | 129 + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + .../llc-prefetch-full-pldl1keep.c | 14 + .../llc-prefetch-full-pldl1strm.c | 14 + .../llc-prefetch-full-pldl2keep.c | 14 + .../llc-prefetch-full-pldl2strm.c | 14 + .../llc-prefetch-full-pldl3keep.c | 14 + .../llc-prefetch-full-pldl3strm.c | 14 + .../llc-prefetch-full-pldl4keep.c | 14 + .../llc-prefetch-full-pldl4strm.c | 14 + .../llc-prefetch-full-pstl1keep.c | 14 + .../llc-prefetch-full-pstl1strm.c | 14 + .../llc-prefetch-full-pstl2keep.c | 14 + .../llc-prefetch-full-pstl2strm.c | 14 + .../llc-prefetch-full-pstl3keep.c | 14 + .../llc-prefetch-full-pstl3strm.c | 14 + .../llc-prefetch-full-pstl4keep.c | 14 + .../llc-prefetch-full-pstl4strm.c | 14 + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + .../llc-allocate/llc-tool-insertion-1.c | 48 + .../llc-allocate/llc-tool-insertion-2.c | 48 + .../llc-allocate/llc-tool-insertion-3.c | 48 + .../llc-allocate/llc-tool-insertion-4.c | 47 + .../llc-allocate/llc-tool-insertion-5.c | 48 + .../llc-allocate/llc-tool-insertion-6.c | 47 + .../llc-tool-insertion-7-null-var-name.c | 52 + .../llc-tool-insertion-8-tmp-var-name.c | 54 + .../gfortran.dg/llc-allocate/llc-3.f90 | 216 + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + .../llc-trace-multiple-base-var.f90 | 62 + .../llc-unknown-type-size-unit.f90 | 58 + .../llc-allocate/llc-wrf-4-outer-loop-num.f90 | 320 ++ gcc/timevar.def | 2 + gcc/toplev.cc | 6 + gcc/tree-cfg.cc | 11 + gcc/tree-cfg.h | 2 + gcc/tree-pass.h | 4 + gcc/tree-scalar-evolution.cc | 9 +- gcc/tree-scalar-evolution.h | 3 +- gcc/tree-ssa-llc-allocate.cc | 4408 +++++++++++++++++ gcc/tree-ssa-loop-niter.cc | 42 +- gcc/tree-ssa-loop-niter.h | 8 +- gcc/tree-vect-loop-manip.cc | 102 +- gcc/tree-vect-loop.cc | 1 - 83 files changed, 7743 insertions(+), 141 deletions(-) create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C create mode 100644 gcc/testsuite/g++.dg/llc-allocate/multidim_array.h create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c create mode 100644 
gcc/testsuite/gcc.dg/llc-allocate/llc-2.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-break-in-loop.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-goto-in-loop.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-same-loop-cycle.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 create mode 100644 gcc/tree-ssa-llc-allocate.cc diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 65f683bbd7f..ef773358021 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1659,6 +1659,7 @@ OBJS = \ tree-ssa-loop-niter.o \ tree-ssa-loop-array-widen-compare.o \ tree-ssa-loop-prefetch.o \ + tree-ssa-llc-allocate.o \ tree-ssa-loop-split.o \ tree-ssa-loop-unswitch.o \ tree-ssa-loop.o \ diff --git a/gcc/auto-profile.cc 
b/gcc/auto-profile.cc index 5e85381ce12..088131ee834 100644 --- a/gcc/auto-profile.cc +++ b/gcc/auto-profile.cc @@ -49,6 +49,9 @@ along with GCC; see the file COPYING3. If not see #include "auto-profile.h" #include "tree-pretty-print.h" #include "gimple-pretty-print.h" +#include +#include +#include /* The following routines implements AutoFDO optimization. @@ -95,6 +98,8 @@ along with GCC; see the file COPYING3. If not see */ #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo" +#define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov" +#define DEFAULT_ADDITIONAL_PROFILE_FILE "addldata.gcov" #define AUTO_PROFILE_VERSION 2 namespace autofdo @@ -117,6 +122,14 @@ private: bool annotated_; }; +/* pair */ +static bool +event_count_cmp (std::pair &a, + std::pair &b) +{ + return a.second > b.second; +} + /* Represent a source location: (function_decl, lineno). */ typedef std::pair decl_lineno; @@ -311,6 +324,9 @@ public: /* Mark LOC as annotated. */ void mark_annotated (location_t loc); + /* Compute total count threshold of top functions in sampled data. */ + gcov_type calc_topn_function_total_count_thres (unsigned topn) const; + private: /* Map from function_instance name index (in string_table) to function_instance. */ @@ -338,6 +354,247 @@ static autofdo_source_profile *afdo_source_profile; /* gcov_summary structure to store the profile_info. */ static gcov_summary *afdo_profile_info; +/* Check opts->x_flags and put file name into EVENT_FILES. */ + +static bool +get_all_profile_names (const char **event_files) +{ + if (!(flag_auto_profile + || (flag_cache_misses_profile || flag_additional_profile))) + { + return false; + } + + event_files[INST_EXEC] = auto_profile_file; + + if (flag_cache_misses_profile) + { + if (cache_misses_profile_file == NULL) + { + if (additional_profile_file == NULL) + { + additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; + } + event_files[PMU_EVENT] = additional_profile_file; + } + event_files[CACHE_MISSES] = cache_misses_profile_file; + } + else if (flag_additional_profile) + { + if (additional_profile_file == NULL) + { + additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; + } + event_files[PMU_EVENT] = additional_profile_file; + } + + return true; +} + +static void +read_profile (void); + +/* Maintain multiple profile data of different events with event_loc_count_map + and event_func_count_map. */ + +class extend_auto_profile +{ +public: + bool auto_profile_exist (enum event_type type); + gcov_type get_loc_count (location_t, event_type); + gcov_type get_func_count (unsigned, event_type); + gcov_type get_topn_function_total_count_thres () const; + struct rank_info get_func_rank (unsigned, enum event_type); + /* There should be only one instance of class EXTEND_AUTO_PROFILE. */ + static extend_auto_profile *create () + { + extend_auto_profile *map = new extend_auto_profile (); + if (map->read ()) + { + return map; + } + delete map; + return NULL; + } + +private: + /* Basic maps of extend_auto_profile. */ + typedef std::map loc_count_map; + typedef std::map func_count_map; + + /* Map of function_uid to its descending order rank of counts. */ + typedef std::map rank_map; + + /* Mapping hardware events to corresponding basic maps. 
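+     Each event_type is keyed to its own location-count, function-count and rank map, so samples collected for different PMU events stay separate.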
*/ + typedef std::map event_loc_count_map; + typedef std::map event_func_count_map; + typedef std::map event_rank_map; + + extend_auto_profile () {} + bool read (); + void set_loc_count (); + void process_extend_source_profile (); + void read_extend_afdo_file (const char *, event_type); + void rank_all_func (); + void dump_event (); + event_loc_count_map event_loc_map; + event_func_count_map event_func_map; + event_rank_map func_rank; + event_type profile_type; + gcov_type topn_function_total_count_thres; +}; + +/* Member functions for extend_auto_profile. */ + +bool +extend_auto_profile::auto_profile_exist (enum event_type type) +{ + switch (type) + { + case INST_EXEC: + return event_func_map.count (INST_EXEC) != 0 + || event_loc_map.count (INST_EXEC) != 0; + case CACHE_MISSES: + return event_func_map.count (CACHE_MISSES) != 0 + || event_loc_map.count (CACHE_MISSES) != 0; + case PMU_EVENT: + return event_func_map.count (PMU_EVENT) != 0 + || event_loc_map.count (PMU_EVENT) != 0; + default: + return false; + } +} + +void +extend_auto_profile::dump_event () +{ + if (dump_file) + { + switch (profile_type) + { + case INST_EXEC: + fprintf (dump_file, "Processing event instruction execution.\n"); + break; + case CACHE_MISSES: + fprintf (dump_file, "Processing event cache misses.\n"); + break; + case PMU_EVENT: + fprintf (dump_file, "Processing other PMU events.\n"); + break; + default: + break; + } + } +} + +/* Return true if any profile data was read. */ + +bool +extend_auto_profile::read () +{ + const char *event_files[EVENT_NUMBER] = {NULL}; + if (!get_all_profile_names (event_files)) + { + return false; + } + + /* Backup AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE since we will create + new ones for each event_type. */ + autofdo::string_table *string_table_afdo = afdo_string_table; + autofdo::autofdo_source_profile *source_profile_afdo = afdo_source_profile; + + for (unsigned i = 0; i < EVENT_NUMBER; i++) + { + if (event_files[i] == NULL) + { + continue; + } + profile_type = (enum event_type) i; + dump_event (); + gcov_close (); + auto_profile_file = event_files[i]; + read_profile (); + gcov_close (); + + topn_function_total_count_thres + = param_llc_allocate_func_counts_threshold; + if (param_llc_allocate_func_topn > 0 && profile_type == PMU_EVENT) + { + topn_function_total_count_thres + = afdo_source_profile->calc_topn_function_total_count_thres ( + param_llc_allocate_func_topn); + } + + process_extend_source_profile (); + + delete afdo_source_profile; + delete afdo_string_table; + } + + /* Restore AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE. Function + END_AUTO_PROFILE will free them at the end of compilation. */ + afdo_string_table = string_table_afdo; + afdo_source_profile = source_profile_afdo; + return true; +} + +/* Helper functions. 
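+   These lookups return the count recorded for a location or function under the given event type, or 0 when nothing was sampled; the rank query returns a rank_info of {0, 0} in that case.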
*/ + +gcov_type +extend_auto_profile::get_loc_count (location_t loc, event_type type) +{ + event_loc_count_map::iterator event_iter = event_loc_map.find (type); + if (event_iter != event_loc_map.end ()) + { + loc_count_map::iterator loc_iter = event_iter->second.find (loc); + if (loc_iter != event_iter->second.end ()) + { + return loc_iter->second; + } + } + return 0; +} + +struct rank_info +extend_auto_profile::get_func_rank (unsigned decl_uid, enum event_type type) +{ + struct rank_info info = {0, 0}; + event_rank_map::iterator event_iter = func_rank.find (type); + if (event_iter != func_rank.end ()) + { + rank_map::iterator func_iter = event_iter->second.find (decl_uid); + if (func_iter != event_iter->second.end ()) + { + info.rank = func_iter->second; + info.total = event_iter->second.size (); + } + } + return info; +} + +gcov_type +extend_auto_profile::get_func_count (unsigned decl_uid, event_type type) +{ + event_func_count_map::iterator event_iter = event_func_map.find (type); + if (event_iter != event_func_map.end ()) + { + func_count_map::iterator func_iter = event_iter->second.find (decl_uid); + if (func_iter != event_iter->second.end ()) + { + return func_iter->second; + } + } + return 0; +} + +gcov_type +extend_auto_profile::get_topn_function_total_count_thres () const +{ + return topn_function_total_count_thres; +} + +static extend_auto_profile *extend_profile; + /* Helper functions. */ /* Return the original name of NAME: strip the suffix that starts @@ -483,7 +740,7 @@ string_table::get_index (const char *name) const return iter->second; } -/* Return the index of a given function DECL. Return -1 if DECL is not +/* Return the index of a given function DECL. Return -1 if DECL is not found in string table. */ int @@ -917,6 +1174,31 @@ autofdo_source_profile::get_function_instance_by_inline_stack ( return s; } +/* Compute total count threshold of top functions in sampled data. */ + +gcov_type +autofdo_source_profile::calc_topn_function_total_count_thres ( + unsigned topn) const +{ + std::set func_counts; + for (name_function_instance_map::const_iterator iter = map_.begin (); + iter != map_.end (); ++iter) + { + if (func_counts.size () < topn) + func_counts.insert (iter->second->total_count ()); + else if (*func_counts.begin () < iter->second->total_count ()) + { + func_counts.erase (func_counts.begin ()); + func_counts.insert (iter->second->total_count ()); + } + } + + gcov_type func_counts_topn = *func_counts.begin (); + if (func_counts.size () == topn + && param_llc_allocate_func_counts_threshold < func_counts_topn) + return func_counts_topn; +} + /* Module profile is only used by LIPO. Here we simply ignore it. */ static void @@ -1842,6 +2124,131 @@ auto_profile (void) return TODO_rebuild_cgraph_edges; } + +void +extend_auto_profile::rank_all_func () +{ + std::vector > func_sorted; + event_func_count_map::iterator event_iter + = event_func_map.find (profile_type); + if (event_iter != event_func_map.end ()) + { + func_count_map::iterator func_iter; + for (func_iter = event_iter->second.begin (); + func_iter != event_iter->second.end (); func_iter++) + { + func_sorted.push_back ( + std::make_pair (func_iter->first, func_iter->second)); + } + + std::sort (func_sorted.begin (), func_sorted.end (), event_count_cmp); + + for (unsigned i = 0; i < func_sorted.size (); ++i) + { + func_rank[profile_type][func_sorted[i].first] = i + 1; + } + } +} + +/* Iterate stmts in cfun and maintain its count to EVENT_LOC_MAP. 
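+   Clobber and debug statements are skipped; every other sampled statement adds its count to the entry for its source location.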
*/ + +void +extend_auto_profile::set_loc_count () +{ + basic_block bb; + FOR_EACH_BB_FN (bb, cfun) + { + gimple_stmt_iterator gsi; + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + count_info info; + gimple *stmt = gsi_stmt (gsi); + if (gimple_clobber_p (stmt) || is_gimple_debug (stmt)) + { + continue; + } + if (afdo_source_profile->get_count_info (stmt, &info)) + { + location_t loc = gimple_location (stmt); + event_loc_map[profile_type][loc] += info.count; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "stmt "); + print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM); + fprintf (dump_file, "counts %ld\n", + event_loc_map[profile_type][loc]); + } + } + } + } +} + +/* Process data in extend_auto_source_profile, save them into two maps. + 1. gimple_location to count. + 2. function_index to count. */ +void +extend_auto_profile::process_extend_source_profile () +{ + struct cgraph_node *node; + if (symtab->state == FINISHED) + { + return; + } + FOR_EACH_FUNCTION (node) + { + if (!gimple_has_body_p (node->decl) || node->inlined_to) + { + continue; + } + + /* Don't profile functions produced for builtin stuff. */ + if (DECL_SOURCE_LOCATION (node->decl) == BUILTINS_LOCATION) + { + continue; + } + + function *fn = DECL_STRUCT_FUNCTION (node->decl); + push_cfun (fn); + + const function_instance *s + = afdo_source_profile->get_function_instance_by_decl ( + current_function_decl); + + if (s == NULL) + { + pop_cfun (); + continue; + } + unsigned int decl_uid = DECL_UID (current_function_decl); + gcov_type count = s->total_count (); + if (dump_file) + { + fprintf (dump_file, "Extend auto-profile for function %s.\n", + node->dump_name ()); + } + event_func_map[profile_type][decl_uid] += count; + set_loc_count (); + pop_cfun (); + } + rank_all_func (); +} + +/* Main entry of extend_auto_profile. */ + +static void +extend_source_profile () +{ + extend_profile = autofdo::extend_auto_profile::create (); + if (dump_file) + { + if (extend_profile == NULL) + { + fprintf (dump_file, "No profile file is found.\n"); + return; + } + fprintf (dump_file, "Extend profile info generated.\n"); + } +} } /* namespace autofdo. */ /* Read the profile from the profile data file. */ @@ -1870,6 +2277,48 @@ end_auto_profile (void) profile_info = NULL; } +/* Extern function to get profile info in other passes. */ + +bool +profile_exist (enum event_type type) +{ + return autofdo::extend_profile != NULL + && autofdo::extend_profile->auto_profile_exist (type); +} + +gcov_type +event_get_loc_count (location_t loc, event_type type) +{ + return autofdo::extend_profile->get_loc_count (loc, type); +} + +gcov_type +event_get_func_count (unsigned decl_uid, event_type type) +{ + return autofdo::extend_profile->get_func_count (decl_uid, type); +} + +struct rank_info +event_get_func_rank (unsigned decl_uid, enum event_type type) +{ + return autofdo::extend_profile->get_func_rank (decl_uid, type); +} + +gcov_type +event_get_topn_function_total_count_thres () +{ + return autofdo::extend_profile->get_topn_function_total_count_thres (); +} + +void +free_extend_profile_info () +{ + if (autofdo::extend_profile != NULL) + { + delete autofdo::extend_profile; + } +} + /* Returns TRUE if EDGE is hot enough to be inlined early. 
*/ bool @@ -1931,8 +2380,47 @@ public: } // anon namespace +namespace { +const pass_data pass_data_ipa_extend_auto_profile = { + SIMPLE_IPA_PASS, /* type */ + "ex-afdo", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_IPA_EXTEND_AUTO_PROFILE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_ipa_extend_auto_profile : public simple_ipa_opt_pass +{ +public: + pass_ipa_extend_auto_profile (gcc::context *ctxt) + : simple_ipa_opt_pass (pass_data_ipa_extend_auto_profile, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) { return (flag_ipa_extend_auto_profile > 0); } + virtual unsigned int execute (function *); +}; + +unsigned int +pass_ipa_extend_auto_profile::execute (function *fun) +{ + autofdo::extend_source_profile (); + return 0; +} +} // namespace + simple_ipa_opt_pass * make_pass_ipa_auto_profile (gcc::context *ctxt) { return new pass_ipa_auto_profile (ctxt); } + +simple_ipa_opt_pass * +make_pass_ipa_extend_auto_profile (gcc::context *ctxt) +{ + return new pass_ipa_extend_auto_profile (ctxt); +} diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h index bf3f90f2fff..31d5c309bfe 100644 --- a/gcc/auto-profile.h +++ b/gcc/auto-profile.h @@ -21,6 +21,14 @@ along with GCC; see the file COPYING3. If not see #ifndef AUTO_PROFILE_H #define AUTO_PROFILE_H +enum event_type +{ + INST_EXEC = 0, + CACHE_MISSES, + PMU_EVENT, + EVENT_NUMBER +}; + /* Read, process, finalize AutoFDO data structures. */ extern void read_autofdo_file (void); extern void end_auto_profile (void); @@ -28,4 +36,30 @@ extern void end_auto_profile (void); /* Returns TRUE if EDGE is hot enough to be inlined early. */ extern bool afdo_callsite_hot_enough_for_early_inline (struct cgraph_edge *); +/* Chcek if profile exists before using this profile. */ +extern bool profile_exist (enum event_type); + +/* Given func decl_uid or gimple location and event_type, return count. + Count is 0 if function or gimple is not sampled. */ +extern gcov_type +event_get_func_count (unsigned, enum event_type); +extern gcov_type event_get_loc_count (location_t, enum event_type); +extern gcov_type +event_get_topn_function_total_count_thres (); + +struct rank_info +{ + unsigned total; + unsigned rank; +}; + +/* Given function decl_uid and event type, return rank_info. Rank_info + is {0, 0} if function was not sampled. */ +extern struct rank_info +event_get_func_rank (unsigned, enum event_type); + +/* Free memory allocated by autofdo::extern_profile. */ +extern void +free_extend_profile_info (); + #endif /* AUTO_PROFILE_H */ diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 57929a42bc4..622548e28e3 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -1352,6 +1352,85 @@ expand_builtin_prefetch (tree exp) emit_insn (op0); } +/* Expand a call to __builtin_prefetch_full. */ + +static void +expand_builtin_prefetch_full (tree exp) +{ + tree arg0, arg1, arg2; + int nargs; + rtx op0, op1, op2; + + if (!validate_arglist (exp, POINTER_TYPE, 0)) + return; + + arg0 = CALL_EXPR_ARG (exp, 0); + + /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to + zero (read) and argument 2 (locality) defaults to 3 (high degree of + locality). */ + nargs = call_expr_nargs (exp); + if (nargs > 1) + arg1 = CALL_EXPR_ARG (exp, 1); + else + arg1 = integer_zero_node; + if (nargs > 2) + arg2 = CALL_EXPR_ARG (exp, 2); + else + arg2 = integer_three_node; + + /* Argument 0 is an address. 
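+     It is expanded in Pmode below; the read/write flag and the prfop value are then validated and forced to zero if they are not suitable compile-time constants.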
*/ + op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); + + /* Argument 1 (read/write flag) must be a compile-time constant int. */ + if (TREE_CODE (arg1) != INTEGER_CST) + { + error ("second argument to %<__builtin_prefetch_full%> must be a " + "constant"); + arg1 = integer_zero_node; + } + op1 = expand_normal (arg1); + /* Argument 1 must be either zero or one. */ + if (INTVAL (op1) != 0 && INTVAL (op1) != 1) + { + warning (0, "invalid second argument to %<__builtin_prefetch_full%>;" + " using zero"); + op1 = const0_rtx; + } + + /* Argument 2 (locality) must be a compile-time constant int. */ + if (TREE_CODE (arg2) != INTEGER_CST) + { + error ("third argument to %<__builtin_prefetch_full%> must be a " + "constant"); + arg2 = integer_zero_node; + } + op2 = expand_normal (arg2); + /* Argument 2 must be 0-7. */ + if (INTVAL (op2) < 0 || INTVAL (op2) > 7) + { + warning (0, "invalid third argument to %<__builtin_prefetch_full%>; " + "using zero"); + op2 = const0_rtx; + } + + if (targetm.have_prefetch_full ()) + { + class expand_operand ops[3]; + + create_address_operand (&ops[0], op0); + create_integer_operand (&ops[1], INTVAL (op1)); + create_integer_operand (&ops[2], INTVAL (op2)); + if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops)) + return; + } + + /* Don't do anything with direct references to volatile memory, but + generate code to handle other side effects. */ + if (!MEM_P (op0) && side_effects_p (op0)) + emit_insn (op0); +} + /* Get a MEM rtx for expression EXP which is the address of an operand to be used in a string instruction (cmpstrsi, cpymemsi, ..). LEN is the maximum length of the block of memory that might be accessed or @@ -7598,6 +7677,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, case BUILT_IN_PREFETCH: expand_builtin_prefetch (exp); return const0_rtx; + case BUILT_IN_PREFETCH_FULL: + expand_builtin_prefetch_full (exp); + return const0_rtx; case BUILT_IN_INIT_TRAMPOLINE: return expand_builtin_init_trampoline (exp, true); diff --git a/gcc/builtins.def b/gcc/builtins.def index 005976f34e9..f2e0c357da6 100644 --- a/gcc/builtins.def +++ b/gcc/builtins.def @@ -924,6 +924,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) +DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index d2714e20cb0..794bc3ecc0b 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -272,6 +272,9 @@ public: the basic-block from being collected but its index can still be reused. */ basic_block former_header; + + /* Number of latch executions from vectorization. */ + tree vec_nb_iterations; }; /* Set if the loop is known to be infinite. 
*/ diff --git a/gcc/common.opt b/gcc/common.opt index 6ab7ba4ccb0..e6ffa1c5818 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1148,6 +1148,26 @@ Common Joined RejectNegative Var(auto_profile_file) Use sample profile information for call graph node weights. The profile file is specified in the argument. +fcache-misses-profile +Common Var(flag_cache_misses_profile) +Use sample profile information for source code cache miss count. The default +profile file is cmsdata.gcov in `pwd`. + +fcache-misses-profile= +Common Joined RejectNegative Var(cache_misses_profile_file) +Use sample profile information for source code cache miss count. The profile +file is specified in the argument. + +fadditional-profile +Common Var(flag_additional_profile) +Use additional PMU-event sample profile information for source code bb count. +The default profile file is addldata.gcov in `pwd`. + +fadditional-profile= +Common Joined RejectNegative Var(additional_profile_file) +Use additional PMU-event sample profile information for source code bb count. +The profile file is specified in the argument. + ; -fcheck-bounds causes gcc to generate array bounds checks. ; For C, C++ and ObjC: defaults off. ; For Java: defaults to on. @@ -2074,6 +2094,10 @@ fipa-struct-sfc-shadow Common Var(flag_ipa_struct_sfc_shadow) Init(0) Optimization Enable field shadowing optimization in static struct field compression. +fipa-extend-auto-profile +Common Var(flag_ipa_extend_auto_profile) +Use sample profile information for source code. + fipa-vrp Common Var(flag_ipa_vrp) Optimization Perform IPA Value Range Propagation. @@ -2424,6 +2448,10 @@ fipa-prefetch Common Var(flag_ipa_prefetch) Init(0) Optimization Generate prefetch instructions, if available, using IPA info. +fllc-allocate +Common Var(flag_llc_allocate) Init(-1) Optimization +Generate LLC hint instructions. + fprofile Common Var(profile_flag) Enable basic program profiling code. diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index cbb844fbc56..dd2e3abaf6f 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -695,19 +695,23 @@ extern struct tune_params aarch64_tune_params; T (ALL, all, 31) /* The available SVE prefetch operations, known in the ACLE as "svprfop". 
*/ -#define AARCH64_FOR_SVPRFOP(T) \ - T (PLDL1KEEP, pldl1keep, 0) \ - T (PLDL1STRM, pldl1strm, 1) \ - T (PLDL2KEEP, pldl2keep, 2) \ - T (PLDL2STRM, pldl2strm, 3) \ - T (PLDL3KEEP, pldl3keep, 4) \ - T (PLDL3STRM, pldl3strm, 5) \ - T (PSTL1KEEP, pstl1keep, 8) \ - T (PSTL1STRM, pstl1strm, 9) \ - T (PSTL2KEEP, pstl2keep, 10) \ - T (PSTL2STRM, pstl2strm, 11) \ - T (PSTL3KEEP, pstl3keep, 12) \ - T (PSTL3STRM, pstl3strm, 13) +#define AARCH64_FOR_SVPRFOP(T) \ + T (PLDL1KEEP, pldl1keep, 0) \ + T (PLDL1STRM, pldl1strm, 1) \ + T (PLDL2KEEP, pldl2keep, 2) \ + T (PLDL2STRM, pldl2strm, 3) \ + T (PLDL3KEEP, pldl3keep, 4) \ + T (PLDL3STRM, pldl3strm, 5) \ + T (PLDL4KEEP, pldl4keep, 6) \ + T (PLDL4STRM, pldl4strm, 7) \ + T (PSTL1KEEP, pstl1keep, 8) \ + T (PSTL1STRM, pstl1strm, 9) \ + T (PSTL2KEEP, pstl2keep, 10) \ + T (PSTL2STRM, pstl2strm, 11) \ + T (PSTL3KEEP, pstl3keep, 12) \ + T (PSTL3STRM, pstl3strm, 13) \ + T (PSTL4KEEP, pstl4keep, 14) \ + T (PSTL4STRM, pstl4strm, 15) #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, enum aarch64_svpattern { diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index a8a5dc3a2a6..7808abf70a8 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1952,7 +1952,7 @@ (define_insn "@aarch64_sve_prefetch" [(prefetch (unspec:DI [(match_operand: 0 "register_operand" "Upl") - (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") + (match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP") (match_operand:DI 2 "const_int_operand")] UNSPEC_SVE_PREFETCH) (match_operand:DI 3 "const_int_operand") @@ -1985,14 +1985,14 @@ ;; 6: the prefetch operator (an svprfop) ;; 7: the normal RTL prefetch rw flag ;; 8: the normal RTL prefetch locality value -(define_insn "@aarch64_sve_gather_prefetch" +(define_insn "@aarch64_sve_gather_prefetch" [(prefetch (unspec:DI [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") - (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2000,12 +2000,12 @@ "TARGET_SVE && TARGET_NON_STREAMING" { static const char *const insns[][2] = { - "prf", "%0, [%2.s]", - "prf", "%0, [%2.s, #%1]", + "prf", "%0, [%2.s]", + "prf", "%0, [%2.s, #%1]", "prfb", "%0, [%1, %2.s, sxtw]", "prfb", "%0, [%1, %2.s, uxtw]", - "prf", "%0, [%1, %2.s, sxtw %p4]", - "prf", "%0, [%1, %2.s, uxtw %p4]" + "prf", "%0, [%1, %2.s, sxtw %p4]", + "prf", "%0, [%1, %2.s, uxtw %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); @@ -2014,14 +2014,14 @@ ;; Predicated gather prefetches for 64-bit elements. The value of operand 3 ;; doesn't matter in this case. 
-(define_insn "@aarch64_sve_gather_prefetch" +(define_insn "@aarch64_sve_gather_prefetch" [(prefetch (unspec:DI [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") - (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2029,10 +2029,10 @@ "TARGET_SVE && TARGET_NON_STREAMING" { static const char *const insns[][2] = { - "prf", "%0, [%2.d]", - "prf", "%0, [%2.d, #%1]", + "prf", "%0, [%2.d]", + "prf", "%0, [%2.d, #%1]", "prfb", "%0, [%1, %2.d]", - "prf", "%0, [%1, %2.d, lsl %p4]" + "prf", "%0, [%1, %2.d, lsl %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); @@ -2040,7 +2040,7 @@ ) ;; Likewise, but with the offset being sign-extended from 32 bits. -(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" +(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" [(prefetch (unspec:DI [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") (match_operand:DI 1 "register_operand" "rk, rk") @@ -2051,8 +2051,8 @@ (match_operand:VNx2DI 2 "register_operand" "w, w")))] UNSPEC_PRED_X) (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2061,7 +2061,7 @@ { static const char *const insns[][2] = { "prfb", "%0, [%1, %2.d, sxtw]", - "prf", "%0, [%1, %2.d, sxtw %p4]" + "prf", "%0, [%1, %2.d, sxtw %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); @@ -2073,7 +2073,7 @@ ) ;; Likewise, but with the offset being zero-extended from 32 bits. 
-(define_insn "*aarch64_sve_gather_prefetch_uxtw" +(define_insn "*aarch64_sve_gather_prefetch_uxtw" [(prefetch (unspec:DI [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") (match_operand:DI 1 "register_operand" "rk, rk") @@ -2081,8 +2081,8 @@ (match_operand:VNx2DI 2 "register_operand" "w, w") (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") - (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") (match_operand:DI 6 "const_int_operand")] UNSPEC_SVE_PREFETCH_GATHER) (match_operand:DI 7 "const_int_operand") @@ -2091,7 +2091,7 @@ { static const char *const insns[][2] = { "prfb", "%0, [%1, %2.d, uxtw]", - "prf", "%0, [%1, %2.d, uxtw %p4]" + "prf", "%0, [%1, %2.d, uxtw %p4]" }; const char *const *parts = insns[which_alternative]; return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e9c387b2436..263e0cf6541 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -4408,6 +4408,13 @@ aarch64_sve_data_mode_p (machine_mode mode) return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; } +/* Return true if MODE is an full SVE data vector mode. */ +static bool +aarch64_full_sve_data_mode_p (machine_mode mode) +{ + return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA; +} + /* Return the number of defined bytes in one constituent vector of SVE mode MODE, which has vector flags VEC_FLAGS. */ static poly_int64 @@ -31796,6 +31803,16 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_ASM_FUNCTION_EPILOGUE #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks +#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH +#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch + +#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH +#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \ + code_for_aarch64_sve_gather_prefetch + +#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P +#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P aarch64_full_sve_data_mode_p + #undef TARGET_HAVE_SHADOW_CALL_STACK #define TARGET_HAVE_SHADOW_CALL_STACK true diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 2f46bc79319..69d29655674 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -925,6 +925,45 @@ [(set_attr "type" "load_4")] ) +(define_insn "prefetch_full" + [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp") + (match_operand:QI 1 "const_int_operand" "") + (match_operand:QI 2 "const_int_operand" ""))] + "" + { + const char * pftype[2][8] = + { + {"prfm\\tPLDL1KEEP, %0", + "prfm\\tPLDL1STRM, %0", + "prfm\\tPLDL2KEEP, %0", + "prfm\\tPLDL2STRM, %0", + "prfm\\tPLDL3KEEP, %0", + "prfm\\tPLDL3STRM, %0", + "prfm\\tPLDL4KEEP, %0", + "prfm\\tPLDL4STRM, %0"}, + {"prfm\\tPSTL1KEEP, %0", + "prfm\\tPSTL1STRM, %0", + "prfm\\tPSTL2KEEP, %0", + "prfm\\tPSTL2STRM, %0", + "prfm\\tPSTL3KEEP, %0", + "prfm\\tPSTL3STRM, %0", + "prfm\\tPSTL4KEEP, %0", + "prfm\\tPSTL4STRM, %0"}, + }; + + int prfop = INTVAL (operands[2]); + + gcc_assert (IN_RANGE (prfop, 0, 7)); + + /* PRFM accepts the same addresses as a 64-bit LDR so wrap + the address into a DImode MEM so that aarch64_print_operand knows + how to print it. 
*/ operands[0] = gen_rtx_MEM (DImode, operands[0]); + return pftype[INTVAL (operands[1])][prfop]; + } + [(set_attr "type" "load_4")] +) + (define_insn "trap" [(trap_if (const_int 1) (const_int 8))] "" diff --git a/gcc/dce.cc b/gcc/dce.cc index 6676cbcd429..964a0a6d0b7 100644 --- a/gcc/dce.cc +++ b/gcc/dce.cc @@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body) switch (GET_CODE (body)) { case PREFETCH: + case PREFETCH_FULL: case TRAP_IF: /* The UNSPEC case was added here because the ia-64 claims that USEs do not work after reload and generates UNSPECS rather diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 50bbbbc4250..16ada7aae42 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6278,6 +6278,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter stores. @end deftypefn +@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg}) +This hook should return the @code{insn_code} of the target pattern that +implements a masked contiguous vector prefetch for vector mode @var{arg}. +It is only used for modes for which +@code{TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P} returns true. The +default is @code{NULL}, which means that no such pattern is available. +@end deftypefn + +@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_from}) +This hook should return the @code{insn_code} of the target pattern that +implements a masked gather prefetch, where @var{mode_to} is the mode of +the data being prefetched and @var{mode_from} is the mode of the vector +of offsets. The default is @code{NULL}, which means that no such +pattern is available. +@end deftypefn + +@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg}) +This hook should return true if the target hardware architecture +supports a full SVE data vector mode. +@end deftypefn + @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}) This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float} fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index cfda603042a..88db8752ed4 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4190,6 +4190,12 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_VECTORIZE_BUILTIN_SCATTER +@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH + +@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH + +@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P + @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN @hook TARGET_SIMD_CLONE_ADJUST diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 8b1733e20c4..20b52e27da8 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -107,11 +107,19 @@ init_internal_fns () direct_internal_fn.
*/ #define not_direct { -2, -2, false } #define mask_load_direct { -1, 2, false } +#define mask_prefetch_direct \ + { \ + -1, 2, false \ + } #define load_lanes_direct { -1, -1, false } #define mask_load_lanes_direct { -1, -1, false } #define gather_load_direct { 3, 1, false } #define len_load_direct { -1, -1, false } #define mask_store_direct { 3, 2, false } +#define gather_prefetch_direct \ + { \ + 3, 1, false \ + } #define store_lanes_direct { 0, 0, false } #define mask_store_lanes_direct { 0, 0, false } #define vec_cond_mask_direct { 1, 0, false } @@ -2745,6 +2753,53 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn #define expand_len_load_optab_fn expand_partial_load_optab_fn +/* Expand MASK_PREFETCH call STMT using optab OPTAB. + .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); + .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); +*/ + +static void +expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) +{ + if (targetm.vectorize.code_for_prefetch == NULL + || targetm.vectorize.prefetch_handleable_mode_p == NULL) + return; + + tree base = gimple_call_arg (stmt, 0); + if (base == NULL_TREE) + return; + + tree maskt = gimple_call_arg (stmt, 2); + tree target = gimple_call_arg (stmt, 3); + tree prfop = gimple_call_arg (stmt, 4); + HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); + /* Bit 3 of the prfop selects stores over loads. */ + HOST_WIDE_INT access = prfop_int & 8; + /* Bits 1 and 2 specify the locality; 0-based for svprfop but + 1-based for PREFETCH. */ + HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; + + machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); + if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) + return; + insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); + + rtx mask = expand_normal (maskt); + rtx base_rtx = expand_normal (base); + /* Convert ptr_mode value X to Pmode. */ + if (ptr_mode == SImode) + base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); + + unsigned i = 0; + class expand_operand ops[5]; + create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); + create_address_operand (&ops[i++], base_rtx); + create_integer_operand (&ops[i++], prfop_int); + create_integer_operand (&ops[i++], access); + create_integer_operand (&ops[i++], locality); + expand_insn (icode, i, ops); +} + /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB. */ static void @@ -3402,6 +3457,71 @@ contains_call_div_mod (rtx_insn *insn) return false; } + /* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. + vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, + loop_mask_87); .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, + loop_mask_87, vect_patt_97.14_77, 4); + */ + + static void + expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) + { + if (targetm.vectorize.code_for_gather_prefetch == NULL + || targetm.vectorize.prefetch_handleable_mode_p == NULL) + return; + + /* Extracting tree nodes, only expand for scalar base and vector index. 
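+      Calls with a vector base or a scalar offset do not match the gather pattern below and are simply not expanded.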
*/ + tree base = gimple_call_arg (stmt, 0); + if (VECTOR_TYPE_P (TREE_TYPE (base))) + return; + tree offset = gimple_call_arg (stmt, 1); + if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) + return; + + tree scale = gimple_call_arg (stmt, 2); + tree mask = gimple_call_arg (stmt, 4); + tree target = gimple_call_arg (stmt, 5); + tree prfop = gimple_call_arg (stmt, 6); + + /* Convert to the rtx node. */ + rtx base_rtx = expand_normal (base); + /* Convert ptr_mode value X to Pmode. */ + if (ptr_mode == SImode) + base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); + rtx offset_rtx = expand_normal (offset); + rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); + rtx mask_rtx = expand_normal (mask); + HOST_WIDE_INT scale_int = tree_to_shwi (scale); + HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); + /* Bit 3 of the prfop selects stores over loads. */ + HOST_WIDE_INT access = prfop_int & 8; + /* Bits 1 and 2 specify the locality; 0-based for svprfop but + 1-based for PREFETCH. */ + HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; + + /* add operand. */ + unsigned int i = 0; + class expand_operand ops[9]; + create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); + create_address_operand (&ops[i++], base_rtx); + create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); + /* Check whether the index has unsigned. */ + create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); + create_integer_operand (&ops[i++], scale_int); + create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); + create_integer_operand (&ops[i++], prfop_int); + create_integer_operand (&ops[i++], access); + create_integer_operand (&ops[i++], locality); + + machine_mode reg_mode = GET_MODE (offset_rtx); + machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); + if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) + return; + insn_code icode + = targetm.vectorize.code_for_gather_prefetch (m_mode, reg_mode); + expand_insn (icode, i, ops); + } + /* Expand DIVMOD() using: a) optab handler for udivmod/sdivmod if it is available. b) If optab_handler doesn't exist, generate call to @@ -3767,10 +3887,12 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_cond_binary_optab_supported_p direct_optab_supported_p #define direct_cond_ternary_optab_supported_p direct_optab_supported_p #define direct_mask_load_optab_supported_p convert_optab_supported_p +#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_gather_load_optab_supported_p convert_optab_supported_p #define direct_len_load_optab_supported_p direct_optab_supported_p +#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p #define direct_mask_store_optab_supported_p convert_optab_supported_p #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index d2d550d3586..05fc5032851 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -121,6 +121,8 @@ along with GCC; see the file COPYING3. 
If not see #endif DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) +DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, + maskprefetch, mask_prefetch) DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, vec_mask_load_lanes, mask_load_lanes) @@ -128,6 +130,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, mask_gather_load, gather_load) +DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, + mask_gather_prefetch, gather_prefetch) DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load) diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc index 2642df91e63..222fe646529 100644 --- a/gcc/ipa-pure-const.cc +++ b/gcc/ipa-pure-const.cc @@ -534,6 +534,7 @@ builtin_safe_for_const_function_p (bool *looping, tree callee) *looping = false; return true; case BUILT_IN_PREFETCH: + case BUILT_IN_PREFETCH_FULL: *looping = true; return true; default: diff --git a/gcc/optabs.def b/gcc/optabs.def index dbf529434e0..8ca25a5cc64 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") OPTAB_CD(maskload_optab, "maskload$a$b") +OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") OPTAB_CD(maskstore_optab, "maskstore$a$b") OPTAB_CD(gather_load_optab, "gather_load$a$b") OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") +OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") OPTAB_CD(scatter_store_optab, "scatter_store$a$b") OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") OPTAB_CD(vec_extract_optab, "vec_extract$a$b") diff --git a/gcc/opts.cc b/gcc/opts.cc index 2433ace06d2..7c86a0ca7b6 100644 --- a/gcc/opts.cc +++ b/gcc/opts.cc @@ -2108,6 +2108,13 @@ enable_fdo_optimizations (struct gcc_options *opts, SET_OPTION_IF_UNSET (opts, opts_set, flag_tree_loop_distribution, value); } +static void +set_cache_misses_profile_params (struct gcc_options *opts, + struct gcc_options *opts_set) +{ + SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); +} + /* Enable cfgo-related flags. */ static void @@ -3143,12 +3150,22 @@ common_handle_option (struct gcc_options *opts, /* FALLTHRU */ case OPT_fauto_profile: enable_fdo_optimizations (opts, opts_set, value); - /* 2 is special and means flag_profile_correction trun on by - -fauto-profile. */ + /* 2 is special and means flag_profile_correction trun on by + -fauto-profile. */ SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, (value ? 2 : 0)); break; + case OPT_fadditional_profile_: + opts->x_additional_profile_file = xstrdup (arg); + opts->x_flag_additional_profile = true; + value = true; + /* No break here - do -fadditional-profile processing. */ + /* FALLTHRU */ + case OPT_fadditional_profile: + opts->x_flag_ipa_extend_auto_profile = value; + break; + case OPT_fipa_struct_reorg_: /* No break here - do -fipa-struct-reorg processing. */ /* FALLTHRU. */ @@ -3166,6 +3183,25 @@ common_handle_option (struct gcc_options *opts, SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_reorg, value); break; + case OPT_fipa_extend_auto_profile: + opts->x_flag_ipa_extend_auto_profile + = opts->x_flag_cache_misses_profile ? 
true : value; + break; + + case OPT_fcache_misses_profile_: + opts->x_cache_misses_profile_file = xstrdup (arg); + opts->x_flag_cache_misses_profile = true; + value = true; + /* No break here - do -fcache-misses-profile processing. */ + /* FALLTHRU */ + case OPT_fcache_misses_profile: + opts->x_flag_ipa_extend_auto_profile = value; + if (value) + { + set_cache_misses_profile_params (opts, opts_set); + } + break; + case OPT_fcfgo_profile_generate_: opts->x_profile_data_prefix = xstrdup (arg); value = true; diff --git a/gcc/params.opt b/gcc/params.opt index e5472dfc87e..8ca0bb8e163 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1262,4 +1262,84 @@ Range for depended ldp search in split-ldp-stp path. Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . +-param=mem-access-ratio= +Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization +Memory access ratio (in percent). + +-param=mem-access-num= +Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization +Memory access num. + +-param=prefetch-offset= +Common Joined UInteger Var(param_prefetch_offset) Init(1024) +IntegerRange(1, 999999) Param Optimization +Prefetch Offset, which is usually a power of two due to cache line size. + +-param=branch-prob-threshold= +Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) +Param Optimization +High Execution Rate Branch Threshold. + +-param=issue-topn= +Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization +Issue topn LLC mem_ref hint. + +-param=force-issue= +Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param +Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. + +-param=llc-capacity-per-core= +Common Joined UInteger Var(param_llc_capacity_per_core) Init(107) IntegerRange(0, 999999) Param +LLC capacity per core. + +-param=target-variables= +Common Joined Var(param_target_variables) Init("") Param Optimization +--param=target-variables=[,,...] Target variables for prefetching, separated by comma, +without space. The representation of a variable can be complex and containing space, please surround +it by quotation marks and escape special characters in Linux. The input length should be no more +than 512 characters. + +-param=use-ref-group-index= +Common Joined UInteger Var(param_use_ref_group_index) Init(0) IntegerRange(0, 1) Param Optimization +Prefetch the target variables by their indices in sorted ref_groups, use together with parameter +target-variables. + +-param=mem-ref-index= +Common Joined Var(param_mem_ref_index) Init("") Param Optimization +--param=mem-ref-index=[,,...] Prefetch the target variable at the memory reference +location with the index of customized order, separated by comma, without space. The input length +should be no more than 512 characters. + +-param=filter-kernels= +Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param +Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks +through edges with branch probability no less than param_branch_prob_threshold. + +-param=outer-loop-nums= +Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param +Maximum number of outer loops allowed to extend outer loops for loops that +cannot recognize inner loop boundaries. 
+ +-param=llc-level= +Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) +Param Optimization +Specifies the HBM cache level. + +-param=filter-mode= +Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param +Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. + +-param=transfer-footprint= +Common Joined UInteger Var(param_transfer_footprint) Init(1) IntegerRange(0, 1) Param +Allow transferring the firstly calculated footprint expression to the target memory reference +from which it is impossible to retrieve the foortprint. + +-param=llc-allocate-func-topn= +Common Joined UInteger Var(param_llc_allocate_func_topn) Init(0) Param Optimization +TopN functions of pmu counts to be analyzed in LLC allocation. + +-param=llc-allocate-func-counts-threshold= +Common Joined UInteger Var(param_llc_allocate_func_counts_threshold) Init(1) Param Optimization +Threshold functions of pmu counts to be analyzed in LLC allocation. + ; This comment is to ensure we retain the blank line above. diff --git a/gcc/passes.def b/gcc/passes.def index 90643d53325..49001adde7c 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -141,6 +141,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_target_clone); NEXT_PASS (pass_ipa_auto_profile); + NEXT_PASS (pass_ipa_extend_auto_profile); NEXT_PASS (pass_ipa_tree_profile); PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile) NEXT_PASS (pass_feedback_split_functions); @@ -325,6 +326,7 @@ along with GCC; see the file COPYING3. If not see /* Run IVOPTs after the last pass that uses data-reference analysis as that doesn't handle TARGET_MEM_REFs. */ NEXT_PASS (pass_iv_optimize); + NEXT_PASS (pass_llc_allocate); NEXT_PASS (pass_lim); NEXT_PASS (pass_tree_loop_done); POP_INSERT_PASSES () diff --git a/gcc/print-rtl.cc b/gcc/print-rtl.cc index 636113d5b97..b7506514a58 100644 --- a/gcc/print-rtl.cc +++ b/gcc/print-rtl.cc @@ -1579,6 +1579,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose) op[1] = XEXP (x, 1); op[2] = XEXP (x, 2); break; + case PREFETCH_FULL: + fun = "prefetch_full"; + op[0] = XEXP (x, 0); + op[1] = XEXP (x, 1); + op[2] = XEXP (x, 2); + break; case UNSPEC: case UNSPEC_VOLATILE: { diff --git a/gcc/rtl.def b/gcc/rtl.def index 08e31fa3544..78ec1a021a9 100644 --- a/gcc/rtl.def +++ b/gcc/rtl.def @@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA) whose prefetch instructions do not support them. */ DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA) +/* Memory prefetch, with attributes supported on some targets. + Operand 1 is the address of the memory to fetch. + Operand 2 is 1 for a write access, 0 otherwise. + Operand 3 is the level of prfop. + + The attributes specified by operands 2 and 3 are ignored for targets + whose prefetch instructions do not support them. */ +DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA) + /* ---------------------------------------------------------------------- At the top level of an instruction (perhaps under PARALLEL). ---------------------------------------------------------------------- */ diff --git a/gcc/rtl.h b/gcc/rtl.h index a0db225cb2e..12cee5e8aad 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -2814,6 +2814,11 @@ do { \ #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \ (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil) +/* True if RTX is flagged to be a scheduling barrier. 
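+   This is the PREFETCH_FULL counterpart of PREFETCH_SCHEDULE_BARRIER_P and is likewise stored in the volatil flag.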
*/ +#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \ + (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL) \ + ->volatil) + + /* Indicate whether the machine has any sort of auto increment addressing. If not, we can avoid checking for REG_INC notes. */ diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc index c436c640c37..7f5646ce777 100644 --- a/gcc/rtlanal.cc +++ b/gcc/rtlanal.cc @@ -1198,6 +1198,7 @@ reg_referenced_p (const_rtx x, const_rtx body) return reg_overlap_mentioned_p (x, TRAP_CONDITION (body)); case PREFETCH: + case PREFETCH_FULL: return reg_overlap_mentioned_p (x, XEXP (body, 0)); case UNSPEC: @@ -2042,6 +2043,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data) return; case PREFETCH: + case PREFETCH_FULL: (*fun) (&XEXP (body, 0), data); return; diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc index 948aa0c3b60..d8a90afb51f 100644 --- a/gcc/sched-deps.cc +++ b/gcc/sched-deps.cc @@ -2705,7 +2705,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn) break; case PREFETCH: - if (PREFETCH_SCHEDULE_BARRIER_P (x)) + case PREFETCH_FULL: + if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x)) + || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x))) reg_pending_barrier = TRUE_BARRIER; /* Prefetch insn contains addresses only. So if the prefetch address has no registers, there will be no dependencies on diff --git a/gcc/target-insns.def b/gcc/target-insns.def index de8c0092f98..9cfa194753f 100644 --- a/gcc/target-insns.def +++ b/gcc/target-insns.def @@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) +DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2)) DEF_TARGET_INSN (probe_stack, (rtx x0)) DEF_TARGET_INSN (probe_stack_address, (rtx x0)) DEF_TARGET_INSN (prologue, (void)) diff --git a/gcc/target.def b/gcc/target.def index 142858fa342..64648954000 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2064,6 +2064,37 @@ it is for the vector version.", (vec_info *vinfo, bool costing_for_scalar), default_vectorize_create_costs) +/* Function returning the instruction code for a vector prefetch operation. */ +DEFHOOK +(code_for_prefetch, + "This hook should return the instruction code (@code{insn_code}) of the\n\ +target pattern used to emit a prefetch for data of machine mode @var{arg}.", + insn_code, (machine_mode arg), + NULL) + +/* Function returning the instruction code for a vector gather prefetch operation. */ +DEFHOOK +(code_for_gather_prefetch, + "This hook should return the instruction code (@code{insn_code}) of the\n\ +gather prefetch pattern for the machine modes @var{mode_to} and\n\ +@var{mode_from}.", + insn_code, (machine_mode mode_to, machine_mode mode_from), + NULL) + +/* Function to check whether the target hardware architecture supports + a full SVE data vector mode.
*/ +DEFHOOK +(prefetch_handleable_mode_p, + "This hook should return true if the target hardware architecture\n\ +supports a full SVE data vector mode.", + bool, (machine_mode arg), + NULL) + HOOK_VECTOR_END (vectorize) #undef HOOK_PREFIX diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp new file mode 100644 index 00000000000..1793ba9d177 --- /dev/null +++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp @@ -0,0 +1,27 @@ +# Copyright (C) 1997-2022 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +load_lib g++-dg.exp +load_lib target-supports.exp + +# Initialize `dg'. +dg-init + +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \ + "" "-fllc-allocate" + +# All done. +dg-finish \ No newline at end of file diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C new file mode 100644 index 00000000000..b5bf69510a0 --- /dev/null +++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C @@ -0,0 +1,52 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */ +#include "multidim_array.h" + +class Input +{ + public: + int metadata_offset = 13; + int exp_nr_images = 1; + MultidimArray exp_Mweight; + void convertAllSquaredDifferencesToWeights(); +}; + +int main() +{ + clock_t start = clock(); + Input input; + int testIter = 2; + + for (int i = 0; i < testIter; ++i) + { + input.convertAllSquaredDifferencesToWeights(); + } + return 0; +} + +void Input::convertAllSquaredDifferencesToWeights() +{ + for (int img_id = 0; img_id < exp_nr_images; img_id++) + { + int my_metadata_offset = metadata_offset + img_id; + MultidimArray sorted_weight; + + exp_Mweight.getRow(img_id, sorted_weight); + long int np = 0; + FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight) + { + if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.) 
+ { + DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \ + sorted_weight, n); + np++; + } + } + } +} + + + +/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h new file mode 100644 index 00000000000..682f24703d8 --- /dev/null +++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h @@ -0,0 +1,186 @@ +#ifndef MULTIDIM_ARRAY_H +#define MULTIDIM_ARRAY_H + +#include + +#define RELION_ALIGNED_MALLOC malloc +#define RELION_ALIGNED_FREE free + +#define STARTINGX(v) ((v).xinit) +#define STARTINGY(v) ((v).yinit) +#define NZYXSIZE(v) ((v).nzyxdim) + +#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)]) +#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \ + for (long int n=0; n +class MultidimArray +{ +public: + T* data; + bool destroyData; + long int ndim; + long int zdim; + long int ydim; + long int xdim; + long int yxdim; + long int zyxdim; + long int nzyxdim; + long int zinit; + long int yinit; + long int xinit; + long int nzyxdimAlloc; + +public: + void clear() + { + coreDeallocate(); + coreInit(); + } + + void coreInit() + { + xdim=0; + yxdim=0; + zyxdim=0; + nzyxdim=0; + ydim=1; + zdim=1; + ndim=1; + zinit=0; + yinit=0; + xinit=0; + data=NULL; + nzyxdimAlloc = 0; + destroyData=true; + } + + void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim) + { + if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0) + { + clear(); + return; + } + + ndim=_ndim; + zdim=_zdim; + ydim=_ydim; + xdim=_xdim; + yxdim=ydim*xdim; + zyxdim=zdim*yxdim; + nzyxdim=ndim*zyxdim; + + coreAllocate(); + } + + void coreAllocate() + { + data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim); + nzyxdimAlloc = nzyxdim; + } + + void coreDeallocate() + { + if (data != NULL && destroyData) + { + RELION_ALIGNED_FREE(data); + } + data=NULL; + nzyxdimAlloc = 0; + } + + void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim) + { + if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL) + { + ndim = Ndim; + xdim = Xdim; + ydim = Ydim; + zdim = Zdim; + yxdim = Ydim * Xdim; + zyxdim = Zdim * yxdim; + nzyxdim = Ndim * zyxdim; + nzyxdimAlloc = nzyxdim; + return; + } + + if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0) + { + clear(); + return; + } + + if (NZYXSIZE(*this) > 0 && data == NULL) + { + coreAllocate(); + return; + } + + size_t YXdim=Ydim*Xdim; + size_t ZYXdim=Zdim*YXdim; + size_t NZYXdim=Ndim*ZYXdim; + + T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim); + for (long int l = 0; l < Ndim; l++) + for (long int k = 0; k < Zdim; k++) + for (long int i = 0; i < Ydim; i++) + for (long int j = 0; j < Xdim; j++) + { + T val; + new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val; + } + coreDeallocate(); + + data = new_data; + ndim = Ndim; + xdim = Xdim; + ydim = Ydim; + zdim = Zdim; + yxdim = Ydim * Xdim; + zyxdim = Zdim * yxdim; + nzyxdim = Ndim * zyxdim; + nzyxdimAlloc = nzyxdim; + } + + void resize(long int Xdim) + { + resize(1, 1, 1, Xdim); + } + + inline T& operator()(long int i, long int j) const + { + return A2D_ELEM(*this, i, j); + } + + inline T& operator()(long int i) const + { + return A1D_ELEM(*this, i); + } + + void getRow(long int i, MultidimArray& v) const + { + if (xdim == 0 || ydim == 0) + { + v.clear(); + return; + } + + v.resize(xdim); + for (long int j = 
0; j < xdim; j++) + v(j) = (*this)(i, j); + } +}; + +#endif /* MULTIDIM_ARRAY_H */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c new file mode 100644 index 00000000000..091e654f9b6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c @@ -0,0 +1,61 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2 --param branch-prob-threshold=50 --param filter-mode=0" } */ + +#include + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 100000 + +int A_i[N]; +int A_j[N]; +double A_data[N]; +double x_data[N]; +double y_data[N]; +int num_rows = N; + +void +MatMult (int *A_i, int *A_j, double *A_data, double *x_data, + int num_rows, double *y_data) +{ + int i = 0; + int j = 0; + double temp = 0; + for (i = 0; i < num_rows; i++) + { + temp = y_data[i]; + for (j = A_i[i]; j < A_i[i+1]; j++) + temp += A_data[j] * x_data[A_j[j]]; + y_data[i] = temp; + } +} + +int +main (int argc, char *argv[]) +{ + int testIter = 2; + + for (int i = 0; i < testIter; i++) + MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\\d\\tx_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\\d\\tA_j\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "\\d\\tA_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp new file mode 100644 index 00000000000..05a3bf84287 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp @@ -0,0 +1,27 @@ +# Copyright (C) 2022-2023 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +load_lib gcc-dg.exp +load_lib target-supports.exp + +# Initialize `dg'. +dg-init + +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ + "" "-fllc-allocate" + +# All done. +dg-finish \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c new file mode 100644 index 00000000000..113acbceb77 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c @@ -0,0 +1,36 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c --param=mem-access-ratio=1 --param=mem-access-num=0" } */ + +/* In this deja test case, we test how Phase 2 & 3 of llc-allocate pass deals + with an indirect memory access in a nested loop where the use-block for the + induction variable of this memory access is a child/descendent of its + def-block (we make it by defining the induction variable in the outer loop). + Therefore, the reference can be successfully traced after outer-loop + analysis. */ +#include +#include + +void cross_bb_indir_mem_acc (int *arr1, int *arr2, int *arr3, int *arr4, int n) { + srand (time (NULL)); + + int j_s; + int j_e = arr1[0]; + int k; + + for (int i = 0; i < n; i++) + { + j_s = j_e; + j_e = arr1[i + 1]; + + k = arr3[i]; + + for (int j = j_s; j < j_e; j++) + { + arr4[j] -= arr2[k]; + } + + } +} + +/* { dg-final { scan-tree-dump "Unhandled indirect memory access tracing." "llc_allocate" } } */ +/* { dg-final { scan-tree-dump "Retrace indirect memory access after outer loop analysis:" "llc_allocate" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c new file mode 100644 index 00000000000..a2e7f66a410 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c @@ -0,0 +1,61 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ +#include +#define N 131590 +#define F 384477 + +int ownStartPtr[F]; +double bPrimePtr[N]; +double diagPtr[N]; +double psiPtr[N]; +double upperPtr[F]; +double lowerPtr[F]; +int uPtr[F]; + +void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); + +int main(int argc, char *argv[]) +{ + int nCells = N; + int nFaces = F; + int testIter = 2; + for (int i = 0; i < testIter; i++) + { + SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); + } + return 0; +} + + +void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) +{ + double psii; + int fStart; + int fEnd = ownStartPtr[0]; + + for (int celli = 0; celli < nCells; celli++) + { + fStart = fEnd; + fEnd = ownStartPtr[celli + 1]; + psii = bPrimePtr[celli]; + + for (int facei = fStart; facei + +#define N 131590 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; + +void +branch_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) +{ + for (int cell=0; cell 0) + 
ApsiPtr[cell] = 0; + else + ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; + } +} + +int +main (int argc, char *argv[]) +{ + int nCells = N; + int testIter = 100; + + for (int i=0; i + +#define N 131590 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; + +void +break_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) +{ + for (int cell=0; cell 0) + break; + ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; + } +} + +int +main (int argc, char *argv[]) +{ + int nCells = N; + int testIter = 2; + + for (int i=0; i + +#define N 131 + +double diagPtr[N]; +int psiPtr[N]; +double ApsiPtr[N]; + +void +goto_in_loop (double *diagPtr, int *psiPtr, double *ApsiPtr, int nCells) +{ + for (int cell=0; cellnodes; + while (v > 1) + { + basic_block bb = di->dfs_to_bb[v]; + edge e; + + par = di->dfs_parent[v]; + k = v; + + ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); + + if (reverse) + { + /* If this block has a fake edge to exit, process that first. */ + if (bitmap_bit_p (di->fake_exit_edge, bb->index)) + { + einext = ei; + einext.index = 0; + goto do_fake_exit_edge; + } + } + + /* Search all direct predecessors for the smallest node with a path + to them. That way we have the smallest node with also a path to + us only over nodes behind us. In effect we search for our + semidominator. */ + while (!ei_end_p (ei)) + { + basic_block b; + TBB k1; + + e = ei_edge (ei); + b = (reverse) ? e->dest : e->src; + einext = ei; + ei_next (&einext); + + if (b == en_block) + { + do_fake_exit_edge: + k1 = di->dfs_order[last_basic_block]; + } + else + k1 = di->dfs_order[b->index]; + + /* Call eval() only if really needed. If k1 is above V in DFS tree, + then we know, that eval(k1) == k1 and key[k1] == k1. */ + if (k1 > v) + k1 = di->key[eval (di, k1)]; + if (k1 < k) + k = k1; + + ei = einext; + } + + di->key[v] = k; + link_roots (di, par, v); + di->next_bucket[v] = di->bucket[k]; + di->bucket[k] = v; + + /* Transform semidominators into dominators. */ + for (w = di->bucket[par]; w; w = di->next_bucket[w]) + { + k = eval (di, w); + if (di->key[k] < di->key[w]) + di->dom[w] = k; + else + di->dom[w] = par; + } + /* We don't need to cleanup next_bucket[]. */ + di->bucket[par] = 0; + v--; + } + + /* Explicitly define the dominators. */ + di->dom[1] = 0; + for (v = 2; v <= di->nodes; v++) + if (di->dom[v] != di->key[v]) + di->dom[v] = di->dom[di->dom[v]]; +} + +/* { dg-final { scan-tree-dump-times "Warning: Find cycle at bb index" 2 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump "static issue" "llc_allocate" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c new file mode 100644 index 00000000000..e18725f607d --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c @@ -0,0 +1,50 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ + +#include + +typedef struct stack_def +{ + int top; /* index to top stack element */ + unsigned long reg_set; /* set of live registers */ + unsigned char reg[128]; /* register - stack mapping */ +} *stack; + +typedef struct block_info_def +{ + struct stack_def stack_in; /* Input stack configuration. */ + struct stack_def stack_out; /* Output stack configuration. */ + unsigned long out_reg_set; /* Stack regs live on output. */ + int done; /* True if block already converted. 
*/ + int predecessors; /* Number of predecessors that need + to be visited. */ +} *block_info; + +typedef struct basic_block_def +{ + void *aux; +} *basic_block; + +unsigned char +convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +{ + stack output_stack; + + output_stack = &(((block_info) bb->aux)->stack_in); + if (value_reg_low == -1) + output_stack->top = -1; + else + { + int reg; + output_stack->top = value_reg_high - value_reg_low; + for (reg = value_reg_low; reg <= value_reg_high; ++reg) + { + (output_stack->reg + 16)[value_reg_high - reg] = reg; + output_stack->reg_set |= (unsigned long) 1 << reg; + } + } + return output_stack->reg[0]; +} + +/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c new file mode 100644 index 00000000000..328dc57bcf1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,0); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL1KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c new file mode 100644 index 00000000000..d9c9198690a --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,1); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL1STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c new file mode 100644 index 00000000000..806366b5b55 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,2); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL2KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c new file mode 100644 index 00000000000..00e2f877a18 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno 
--param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,3); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL2STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c new file mode 100644 index 00000000000..c2815065433 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,4); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL3KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c new file mode 100644 index 00000000000..e8d9c86938f --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,5); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL3STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c new file mode 100644 index 00000000000..b0281882fc6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,6); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL4KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c new file mode 100644 index 00000000000..26807556f2d --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],0,7); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PLDL4STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c new file mode 100644 index 00000000000..4f2def13d36 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c @@ -0,0 +1,14 @@ + +/* { dg-do 
compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,0); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL1KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c new file mode 100644 index 00000000000..ecc501f1f86 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,1); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL1STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c new file mode 100644 index 00000000000..d140f1ed1b8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,2); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL2KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c new file mode 100644 index 00000000000..d6f1702533e --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,3); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL2STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c new file mode 100644 index 00000000000..8da092b369e --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,4); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL3KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c new file mode 
100644 index 00000000000..4cf65188a37 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,5); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL3STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c new file mode 100644 index 00000000000..36f4a3aa008 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,6); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL4KEEP" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c new file mode 100644 index 00000000000..43d2d41d554 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c @@ -0,0 +1,14 @@ + +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ + + +int val[100000]; +int main(){ + for(int i=0;i<100000;i++){ + __builtin_prefetch_full(&val[i],1,7); + val[i]=i+1; + } +} + +/* { dg-final { scan-assembler "PSTL4STRM" } } */ diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c new file mode 100644 index 00000000000..ba90e7ea43f --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c @@ -0,0 +1,62 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ + +#include +#include + +#define N 1000 + +long a[N] = {0}; +long b[N] = {0}; +long c[N] = {0}; + +double +referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) +{ + double sum; + for (int cell = 0; cell < nCells; cell++) + { + // Multi-layer pointer + sum += psiPtr[lPtr[cell]]; + psiPtr[uPtr[cell]] = sum; + + // Outer pointer, inner array + sum += psiPtr[b[cell]]; + psiPtr[a[cell]] = sum; + + // Multi-layer array + sum += a[b[cell]]; + c[a[cell]] = sum; + + // Outer array, inner pointer + sum += a[lPtr[cell]]; + c[lPtr[cell]] = sum; + } + return sum; +} + +int +main (int argc, char *argv[]) +{ + int testIter = 2; + + double *psiPtr = NULL; + int *lPtr = NULL; + int *uPtr = NULL; + psiPtr = (double *) calloc (N, sizeof(double)); + lPtr = (int *) calloc (N, sizeof(int)); + uPtr = (int *) calloc (N, sizeof(int)); + + for (int i = 0; i < testIter; i++) + referenceTrace (psiPtr, lPtr, uPtr, N); + + free (psiPtr); + free (lPtr); + free (uPtr); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ 
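Taken together, the llc-prefetch-full-* tests above pin down the argument encoding of the new __builtin_prefetch_full builtin: the second argument selects a load prefetch (0, PLD*) or a store prefetch (1, PST*), and the third argument 0-7 selects the prfop variant L1KEEP, L1STRM, L2KEEP, L2STRM, L3KEEP, L3STRM, L4KEEP or L4STRM, which is lowered through the new PREFETCH_FULL RTX and prefetch_full target pattern to the matching prefetch operation (PLDL1KEEP ... PSTL4STRM) in the generated AArch64 assembly. A minimal usage sketch (the helper name below is illustrative only and not taken from the patch sources):

    /* __builtin_prefetch_full (addr, rw, prfop):
         rw    : 0 -> PLD (prefetch for load), 1 -> PST (prefetch for store)
         prfop : 0..7 -> L1KEEP, L1STRM, L2KEEP, L2STRM, L3KEEP, L3STRM, L4KEEP, L4STRM.  */
    static inline void
    prefetch_store_l3_keep (void *addr)
    {
      /* With the patched compiler this emits a PSTL3KEEP prefetch (store, L3, keep).  */
      __builtin_prefetch_full (addr, 1, 4);
    }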
+/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c new file mode 100644 index 00000000000..276781c4f40 --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c @@ -0,0 +1,48 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr" } */ + +#include + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cell + +#define N 131590 +#define F 384477 + +double diagPtr[N]; +double psiPtr[N]; +double ApsiPtr[N]; +int lPtr[F]; +int uPtr[F]; +double lowerPtr[F]; +double upperPtr[F]; + +void +AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, + int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +{ + for (int cell=0; cellaux\"" } */ + +#include + +typedef struct stack_def +{ + int top; /* index to top stack element */ + unsigned long reg_set; /* set of live registers */ + unsigned char reg[128]; /* register - stack mapping */ +} *stack; + +typedef struct block_info_def +{ + struct stack_def stack_in; /* Input stack configuration. */ + struct stack_def stack_out; /* Output stack configuration. */ + unsigned long out_reg_set; /* Stack regs live on output. */ + int done; /* True if block already converted. */ + int predecessors; /* Number of predecessors that need + to be visited. 
*/ +} *block_info; + +typedef struct basic_block_def +{ + void *aux; +} *basic_block; + +unsigned char +convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +{ + stack output_stack; + + output_stack = &(((block_info) bb->aux)->stack_in); + if (value_reg_low == -1) + output_stack->top = -1; + else + { + int reg; + output_stack->top = value_reg_high - value_reg_low; + for (reg = value_reg_low; reg <= value_reg_high; ++reg) + { + (output_stack->reg + 16)[value_reg_high - reg] = reg; + output_stack->reg_set |= (unsigned long) 1 << reg; + } + } + return output_stack->reg[0]; +} + +/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" + "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c new file mode 100644 index 00000000000..23eadb83afa --- /dev/null +++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c @@ -0,0 +1,54 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=tmp_var_0" } */ + +#include + +typedef struct stack_def +{ + int top; /* index to top stack element */ + unsigned long reg_set; /* set of live registers */ + unsigned char reg[128]; /* register - stack mapping */ +} *stack; + +typedef struct block_info_def +{ + struct stack_def stack_in; /* Input stack configuration. */ + struct stack_def stack_out; /* Output stack configuration. */ + unsigned long out_reg_set; /* Stack regs live on output. */ + int done; /* True if block already converted. */ + int predecessors; /* Number of predecessors that need + to be visited. */ +} *block_info; + +typedef struct basic_block_def +{ + void *aux; +} *basic_block; + +unsigned char +convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +{ + stack output_stack; + + output_stack = &(((block_info) bb->aux)->stack_in); + if (value_reg_low == -1) + output_stack->top = -1; + else + { + int reg; + output_stack->top = value_reg_high - value_reg_low; + for (reg = value_reg_low; reg <= value_reg_high; ++reg) + { + (output_stack->reg + 16)[value_reg_high - reg] = reg; + output_stack->reg_set |= (unsigned long) 1 << reg; + } + } + return output_stack->reg[0]; +} + +/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" + "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \"" + " bb_16(D)->aux \"" 1 "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ \ No newline at end of file diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 new file mode 100644 index 00000000000..645be38f171 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 @@ -0,0 +1,216 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! 
{ dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-mode=0" } + +program main + + IMPLICIT NONE + INTEGER :: ids,ide, jds,jde, kds,kde + INTEGER,parameter :: ims=-4,kms=1,jms=-4 + INTEGER,parameter :: ime=210,kme=36,jme=192 + INTEGER :: its,ite, jts,jte, kts,kte + INTEGER :: number_of_small_timesteps,rk_step, rk_order, step + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt + + + REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts + + REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu + + REAL :: rdx,rdy + REAL :: dts, t0, smdiv + REAL :: random1,time_begin,time_end,total_time + + INTEGER :: i, j, k + INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end + INTEGER :: i_endu, j_endv + INTEGER :: interval=1 + INTEGER :: epoch,iter + + LOGICAL :: non_hydrostatic + + data ids, jds, kds, its, jts, kts /6*1/ + data ide, ite /2*205/ + data jde, jte /2*187/ + data kde, kte /2*36/ + + number_of_small_timesteps = 1 + rk_step = 3 + rk_order = 1 + dts = 1. + + rdx = 1. + rdy = 1. + + t0 = 0. + smdiv = 1. + step = 1 + non_hydrostatic = .true. + + call random_number(random1) + interval = random1*100 + interval=1 + + call random_seed(put=(/(i,i=1,10000,interval)/)) + + call random_number(alt) + call random_number(c2a) + call random_number(ph) + call random_number(pm1) + call random_number(mu) + call random_number(muts) + call random_number(dnw) + call random_number(rdnw) + call random_number(znu) + + do iter=1,2 + call calc_p_rho( al, p, ph, & + alt, t_2, t_1, c2a, pm1, & + mu, muts, znu, t0, & + rdnw, dnw, smdiv, & + non_hydrostatic, step, & + ids, ide, jds, jde, kds, kde, & + ims, ime, jms, jme, kms, kme, & + its,ite, jts,jte, kts,kte ) + + enddo + +end program + + +SUBROUTINE calc_p_rho( al, p, ph, & + alt, t_2, t_1, c2a, pm1, & + mu, muts, znu, t0, & + rdnw, dnw, smdiv, & + non_hydrostatic, step, & + ids, ide, jds, jde, kds, kde, & + ims, ime, jms, jme, kms, kme, & + its,ite, jts,jte, kts,kte ) + + IMPLICIT NONE ! religion first + !asb +! declarations for the stuff coming in + + INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde + INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme + INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte + + INTEGER, INTENT(IN ) :: step + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & + p + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & + t_2, & + t_1, & + c2a + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 + + REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & + muts + + REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & + rdnw, & + znu + + REAL, INTENT(IN ) :: t0, smdiv + + LOGICAL, INTENT(IN ) :: non_hydrostatic + +! local variables + + INTEGER :: i, j, k + INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end + REAL :: ptmp + + i_start = its + i_end = min(ite,ide-1) + j_start = jts + j_end = min(jte,jde-1) + k_start = kts + k_end = min(kte,kde-1) + + IF (non_hydrostatic) THEN + DO j=j_start, j_end + DO k=k_start, k_end + DO i=i_start, i_end + +! al computation is all dry, so ok with moisture + + al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & + +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) + +! this is temporally linearized p, no moisture correction needed + + p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & + /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) + + ENDDO + ENDDO + ENDDO + + ELSE ! 
hydrostatic calculation + + DO j=j_start, j_end + DO k=k_start, k_end + DO i=i_start, i_end + p(i,k,j)=mu(i,j)*znu(k) + al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & + /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) + ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & + +mu(i,j)*alt(i,k,j)) + ENDDO + ENDDO + ENDDO + + END IF + +! divergence damping setup + + IF (step == 0) then ! we're initializing small timesteps + DO j=j_start, j_end + DO k=k_start, k_end + DO i=i_start, i_end + pm1(i,k,j)=p(i,k,j) + ENDDO + ENDDO + ENDDO + ELSE ! we're in the small timesteps + DO j=j_start, j_end ! and adding div damping component + DO k=k_start, k_end + DO i=i_start, i_end + ptmp = p(i,k,j) + p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) + pm1(i,k,j) = ptmp + ENDDO + ENDDO + ENDDO + END IF + +END SUBROUTINE calc_p_rho + +! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "Tracing succeeded" 46 "llc_allocate" } } +! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } +! { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } +! { dg-final { scan-tree-dump-times ", size: 0\.000000" 22 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 3, 0\\)" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tpm1\\t\\(0.000000, 3, 2, 0\\)" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tph\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tal\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\talt\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tmu\\t\\(0.000000, 2, 1, 0\\)" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "\\d\\tmuts\\t\\(0.000000, 2, 1, 0\\)" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } \ No newline at end of file diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp new file mode 100644 index 00000000000..a9d8e9d5486 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp @@ -0,0 +1,29 @@ +# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
+ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# GCC testsuite that uses the `dg.exp' driver. + +load_lib gfortran-dg.exp + +# Initialize `dg'. +dg-init + +# Main loop. +gfortran-dg-runtest [lsort \ + [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" + +# All done. +dg-finish \ No newline at end of file diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 new file mode 100644 index 00000000000..501e6e74cdf --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 @@ -0,0 +1,62 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } + +MODULE INPUT + IMPLICIT NONE + + INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 + + INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 + REAL(wp), DIMENSION(jpi, jpj) :: e12t + REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n + REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta + +END MODULE INPUT + +PROGRAM MAIN + USE INPUT + + IMPLICIT NONE + + INTEGER :: EPOCH + +! Initialize arrays + + e12t = 1 + fse3t_n = 1 + pta = 1 +! + + DO EPOCH=1,2 + CALL tra_ldf_iso + ENDDO + +END PROGRAM MAIN + +SUBROUTINE tra_ldf_iso + USE INPUT + + IMPLICIT NONE + ! + INTEGER :: ji, jj, jk, jn ! dummy loop indices + REAL(wp) :: zbtr, ztra ! - - + REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw + + DO jn = 1, kjpt + ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 + + DO jk = 1, jpkm1 + DO jj = 2, jpjm1 + DO ji = fs_2, fs_jpim1 ! vector opt. + zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) + ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr + pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra + END DO + END DO + END DO + ! + END DO + ! +END SUBROUTINE tra_ldf_iso + +! { dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 new file mode 100644 index 00000000000..7345759db68 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 @@ -0,0 +1,58 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! 
{ dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } + +Module module_domain + IMPLICIT NONE + + REAL, PARAMETER :: g = 9.8 + TYPE :: grid_type + REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) + REAL, POINTER :: fnm(:), fnp(:) + END TYPE +END Module + +SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) + + USE module_domain + !USE module_model_constants + + IMPLICIT NONE + + + !TYPE (domain), INTENT(IN) :: grid + INTEGER, INTENT(IN) :: k_start, k_end, ix, iy + REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w + + + INTEGER :: k + REAL :: z0, z1, z2, w1, w2 + REAL, DIMENSION(k_start:k_end) :: z_at_w + REAL, DIMENSION(k_start:k_end-1) :: z + TYPE (grid_type), POINTER :: grid + + + DO k = k_start, k_end + z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g + END DO + + DO k = k_start, k_end-1 + z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) + END DO + + DO k = k_start+1, k_end-1 + p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & + grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) + END DO + + z0 = z_at_w(k_start) + z1 = z(k_start) + z2 = z(k_start+1) + w1 = (z0 - z2)/(z1 - z2) + w2 = 1. - w1 + p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & + w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) + +END SUBROUTINE calc_p8w + +! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } +! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } \ No newline at end of file diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 new file mode 100644 index 00000000000..f79df5d2656 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 @@ -0,0 +1,320 @@ +! { dg-do compile { target { aarch64*-*-linux* } } } +! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=branch-prob-threshold=50 --param=filter-kernels=0 --param=mem-access-num=2 --param=issue-topn=2 --param=force-issue=1 --param=outer-loop-nums=3" } +!include "module_small_step_em.F90" + +Module add_type + IMPLICIT NONE + + TYPE :: grid_config_rec_type + LOGICAL :: open_xs + LOGICAL :: open_ys + LOGICAL :: open_xe + LOGICAL :: open_ye + LOGICAL :: symmetric_xs + LOGICAL :: symmetric_xe + LOGICAL :: symmetric_ys + LOGICAL :: symmetric_ye + LOGICAL :: polar + LOGICAL :: nested + LOGICAL :: periodic_x + LOGICAL :: specified + END TYPE +END Module + +program main + + +! include "module_small_step_em_modify.F90" + +! use module_small_step_em +! use module_small_step_em_modify + + use add_type + + IMPLICIT NONE + INTEGER :: ids,ide, jds,jde, kds,kde + INTEGER,parameter :: ims=-4,kms=1,jms=-4 + INTEGER,parameter :: ime=210,kme=36,jme=192 + INTEGER :: its,ite, jts,jte, kts,kte + INTEGER :: number_of_small_timesteps,rk_step, rk_order, step, spec_zone + + REAL, DIMENSION(ims:ime, kms:kme, jms:jme, 1:8) :: llcRefresh + REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u, v, u_1, v_1, t_1, ww_1, ft!u, v, u_1, v_1, w_1, t_1, ww1, ww_1,ph_1, ft + REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_save, v_save, w_save, t_save, ph_save,h_diabatic + ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_2, v_2, w_2, t_2, ph_2 + ! 
REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: c2a, ww_save, cqw, cqu, cqv, alpha, gamma, a + REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ww!pb, p, ph, php, pm1, al, alt, ww, random_array + ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ru_tend, rv_tend + REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t, t_ave, uam, vam, wwam + + REAL, DIMENSION(ims:ime, jms:jme) :: mu_1,mu_2, mu + REAL, DIMENSION(ims:ime, jms:jme) :: mub, muu, muv, mut, & + msfux, msfuy, & + msfvx, msfvx_inv, msfvy, & + msftx, msfty + + REAL, DIMENSION(ims:ime, jms:jme) :: muus, muvs, muts, mudf, muave + REAL, DIMENSION(ims:ime, jms:jme) :: mu_save, mu_tend + + REAL, DIMENSION(kms:kme) :: rdn, rdnw,dnw, fnm, fnp, znu + + REAL :: rdx,rdy + REAL :: dts, cf1, cf2, cf3, t0, emdiv, smdiv, epssm, g + REAL :: random1,time_begin,time_end,total_time + + INTEGER :: i, j, k + INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end + INTEGER :: i_endu, j_endv + INTEGER :: interval=1 + INTEGER :: epoch + + LOGICAL :: non_hydrostatic, top_lid + + + TYPE (grid_config_rec_type) :: config_flags + config_flags%open_xs = .true. + config_flags%open_ys = .true. + config_flags%open_xe = .true. + config_flags%open_ye = .true. + config_flags%symmetric_xs = .true. + config_flags%symmetric_xe = .true. + config_flags%symmetric_ys = .true. + config_flags%symmetric_ye = .true. + config_flags%polar = .true. + config_flags%nested = .true. + config_flags%periodic_x = .true. + config_flags%specified = .true. + + data ids, jds, kds, its, jts, kts /6*1/ + data ide, ite /2*205/ + data jde, jte /2*187/ + data kde, kte /2*98/ + + number_of_small_timesteps = 1 + rk_step = 1 + rk_order = 1 + dts = 1. + epssm = 1. + g = 1. + + rdx = 1. + rdy = 1. + dts = 1. + cf1 = 1. + cf2 = 1. + cf3 = 1. + + t0 = 0. + smdiv = 1. + emdiv = 1. + step = 1 + spec_zone = 1 + + non_hydrostatic = .true. + top_lid = .true. 
+ + interval=1 + + + total_time=0 + + call random_seed(put=(/(i,i=1,10000,interval)/)) + + call random_number(u) + call random_number(v) + call random_number(u_1) + call random_number(v_1) + call random_number(t_1) + call random_number(ft) + + call random_number(ww) + call random_number(ww_1) + call random_number(t) + call random_number(t_ave) + call random_number(uam) + call random_number(vam) + call random_number(wwam) + + call random_number(muu) + call random_number(muv) + call random_number(mut) + call random_number(msfux) + call random_number(msfuy) + call random_number(msfvx) + call random_number(msfvx_inv) + call random_number(msfvy) + call random_number(msftx) + call random_number(msfty) + call random_number(mu_tend) + + call random_number(muave) + call random_number(muts) + call random_number(mudf) + call random_number(mu) + + call random_number(fnm) + call random_number(fnp) + call random_number(dnw) + call random_number(rdnw) + + DO j=jms, jme + DO k=kms, kme + DO i=ims, ime + + llcRefresh(i,k,j,1)=i+k+j+7 + + ENDDO + ENDDO + ENDDO + + do epoch = 1,2 + call advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & + mu, mut, muave, muts, muu, muv, & + mudf, uam, vam, wwam, t, t_1, & + t_ave, ft, mu_tend, & + rdx, rdy, dts, epssm, & + dnw, fnm, fnp, rdnw, & + msfux, msfuy, msfvx, msfvx_inv, & + msfvy, msftx, msfty, & + step, config_flags, & + ids, ide, jds, jde, kds, kde, & + ims, ime, jms, jme, kms, kme, & + its, ite, jts, jte, kts, kte ) + enddo +end program + + + +SUBROUTINE advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & + mu, mut, muave, muts, muu, muv, & + mudf, uam, vam, wwam, t, t_1, & + t_ave, ft, mu_tend, & + rdx, rdy, dts, epssm, & + dnw, fnm, fnp, rdnw, & + msfux, msfuy, msfvx, msfvx_inv, & + msfvy, msftx, msfty, & + step, config_flags, & + ids, ide, jds, jde, kds, kde, & + ims, ime, jms, jme, kms, kme, & + its, ite, jts, jte, kts, kte ) + use add_type + + IMPLICIT NONE ! religion first + + ! stuff coming in + + TYPE(grid_config_rec_type), INTENT(IN ) :: config_flags + INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde + INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme + INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte + + INTEGER, INTENT(IN ) :: step + + REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & + INTENT(IN ) :: & + u, & + v, & + u_1, & + v_1, & + t_1, & + ft + + REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & + INTENT(INOUT) :: & + ww, & + ww_1, & + t, & + t_ave, & + uam, & + vam, & + wwam + + REAL, DIMENSION( ims:ime , jms:jme ), INTENT(IN ) :: muu, & + muv, & + mut, & + msfux,& + msfuy,& + msfvx,& + msfvx_inv,& + msfvy,& + msftx,& + msfty,& + mu_tend + + REAL, DIMENSION( ims:ime , jms:jme ), INTENT( INOUT) :: muave, & + muts, & + mudf + + REAL, DIMENSION( ims:ime , jms:jme ), INTENT(INOUT) :: mu + + REAL, DIMENSION( kms:kme ), INTENT(IN ) :: fnm, & + fnp, & + dnw, & + rdnw + + + REAL, INTENT(IN ) :: rdx, & + rdy, & + dts, & + epssm + + REAL, DIMENSION (its:ite, kts:kte) :: wdtn, dvdxi + REAL, DIMENSION (its:ite) :: dmdt + + INTEGER :: i,j,k, i_start, i_end, j_start, j_end, k_start, k_end + INTEGER :: i_endu, j_endv + REAL :: acc + + INTEGER :: ubv, lbv, t1, t2, t3, t4, ceild, floord + + ceild(t1, t2) = ceiling(REAL(t1)/REAL(t2)) + floord(t1, t2) = floor(REAL(t1)/REAL(t2)) + i_start = its + i_end = min(ite,ide-1) + j_start = jts + j_end = min(jte,jde-1) + k_start = kts + k_end = kte-1 + IF ( .NOT. config_flags%periodic_x )THEN + IF ( config_flags%specified .or. 
config_flags%nested ) then + i_start = max(its,ids+1) + i_end = min(ite,ide-2) + ENDIF + ENDIF + IF ( config_flags%specified .or. config_flags%nested ) then + j_start = max(jts,jds+1) + j_end = min(jte,jde-2) + ENDIF + + i_endu = ite + j_endv = jte + + DO j = j_start, j_end + + DO i=i_start, i_end + dmdt(i) = 0. + ENDDO + + DO k=k_start, k_end + DO i=i_start, i_end + dvdxi(i,k) = msftx(i,j)*msfty(i,j)*( & + rdy*((v(i,k,j+1)+muv(i,j+1)*v_1(i,k,j+1)*msfvx_inv(i,j+1)) & + -(v(i,k,j )+muv(i,j )*v_1(i,k,j)*msfvx_inv(i,j ))) & + +rdx*((u(i+1,k,j)+muu(i+1,j)*u_1(i+1,k,j)/msfuy(i+1,j)) & + -(u(i,k,j )+muu(i ,j)*u_1(i,k,j )/msfuy(i,j)) )) + dmdt(i) = dmdt(i) + dnw(k)*dvdxi(i,k) + ENDDO + ENDDO + DO i=i_start, i_end + muave(i,j) = mu(i,j) + mu(i,j) = mu(i,j)+dts*(dmdt(i)+mu_tend(i,j)) + mudf(i,j) = (dmdt(i)+mu_tend(i,j)) ! save tendency for div dampfilter + muts(i,j) = mut(i,j)+mu(i,j) + muave(i,j) =.5*((1.+epssm)*mu(i,j)+(1.-epssm)*muave(i,j)) + ENDDO + ENDDO +END SUBROUTINE advance_mu_t_fortran_plu + +! { dg-final { scan-tree-dump "issue_llc_hint" "llc_allocate" } } +! { dg-final { scan-tree-dump-times "analyze_nested_kernels" 2 "llc_allocate" } } +! { dg-final { scan-tree-dump "Stop tracing the outer loop depth" "llc_allocate" } } \ No newline at end of file diff --git a/gcc/timevar.def b/gcc/timevar.def index 36c3e7d5af3..14129a500b1 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -84,6 +84,7 @@ DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") +DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream decompression") DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream compression") @@ -215,6 +216,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") +DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") DEFTIMEVAR (TV_PREDCOM , "predictive commoning") DEFTIMEVAR (TV_TREE_CH , "tree copy headers") DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") diff --git a/gcc/toplev.cc b/gcc/toplev.cc index f00a166df2b..bdbd4de63e7 100644 --- a/gcc/toplev.cc +++ b/gcc/toplev.cc @@ -567,6 +567,12 @@ compile_file (void) targetm.asm_out.output_ident (ident_str); } + /* Extend auto profile finalization. */ + if (flag_ipa_extend_auto_profile) + { + free_extend_profile_info (); + } + /* Auto profile finalization. */ if (flag_auto_profile) end_auto_profile (); diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index d33aaec8ce4..40f67a8ed14 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -8476,6 +8476,17 @@ print_loops (FILE *file, int verbosity) print_loop_and_siblings (file, bb->loop_father, 0, verbosity); } +/* Dump a loop to file. */ + +void +loop_dump (FILE *file, class loop *loop) +{ + print_loop (file, loop, 0, 0); + fprintf (file, "vec_niter = "); + print_generic_expr (file, loop->vec_nb_iterations); + fprintf (file, "\n"); +} + /* Dump a loop. 
*/ DEBUG_FUNCTION void diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h index bfe44c07344..3eb57721ed4 100644 --- a/gcc/tree-cfg.h +++ b/gcc/tree-cfg.h @@ -83,6 +83,8 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); extern void debug_function (tree, dump_flags_t); extern void print_loops_bb (FILE *, basic_block, int, int); extern void print_loops (FILE *, int); +extern void +loop_dump (FILE *file, class loop *loop); extern void debug (class loop &ref); extern void debug (class loop *ptr); extern void debug_verbose (class loop &ref); diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index a98f8439787..bee8028055e 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -395,6 +395,8 @@ extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt); extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); +extern gimple_opt_pass * +make_pass_llc_allocate (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); @@ -536,6 +538,8 @@ extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * ctxt); extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); +extern simple_ipa_opt_pass * +make_pass_ipa_extend_auto_profile (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc index 44157265ce8..e888d129b36 100644 --- a/gcc/tree-scalar-evolution.cc +++ b/gcc/tree-scalar-evolution.cc @@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) the loop body has been executed 6 times. */ tree -number_of_latch_executions (class loop *loop) +number_of_latch_executions (class loop *loop, bool guarantee) { edge exit; class tree_niter_desc niter_desc; @@ -2810,7 +2810,9 @@ number_of_latch_executions (class loop *loop) res = chrec_dont_know; exit = single_exit (loop); - if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) + if (exit + && number_of_iterations_exit (loop, exit, &niter_desc, false, true, NULL, + guarantee)) { may_be_zero = niter_desc.may_be_zero; res = niter_desc.niter; @@ -2836,7 +2838,8 @@ number_of_latch_executions (class loop *loop) fprintf (dump_file, "))\n"); } - loop->nb_iterations = res; + if (guarantee) + loop->nb_iterations = res; return res; } diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h index 0f90207bc73..005ffe7a5af 100644 --- a/gcc/tree-scalar-evolution.h +++ b/gcc/tree-scalar-evolution.h @@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. 
If not see #ifndef GCC_TREE_SCALAR_EVOLUTION_H #define GCC_TREE_SCALAR_EVOLUTION_H -extern tree number_of_latch_executions (class loop *); +extern tree +number_of_latch_executions (class loop *, bool guarantee = true); extern gcond *get_loop_exit_condition (const class loop *); extern void scev_initialize (void); diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc new file mode 100644 index 00000000000..97dbe2c2556 --- /dev/null +++ b/gcc/tree-ssa-llc-allocate.cc @@ -0,0 +1,4408 @@ +/* LLC allocate. + Copyright (C) 2022-2023 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +GCC is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#define INCLUDE_MAP +#define INCLUDE_SET +#define INCLUDE_VECTOR +#define INCLUDE_LIST +#define INCLUDE_ALGORITHM +#define INCLUDE_STRING +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "gimple.h" +#include "predict.h" +#include "tree-pass.h" +#include "gimple-ssa.h" +#include "optabs-query.h" +#include "tree-pretty-print.h" +#include "fold-const.h" +#include "stor-layout.h" +#include "gimplify.h" +#include "gimple-iterator.h" +#include "gimplify-me.h" +#include "tree-ssa-loop-ivopts.h" +#include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-niter.h" +#include "tree-ssa-loop.h" +#include "ssa.h" +#include "tree-into-ssa.h" +#include "cfgloop.h" +#include "tree-scalar-evolution.h" +#include "langhooks.h" +#include "tree-inline.h" +#include "tree-data-ref.h" +#include "diagnostic-core.h" +#include "dbgcnt.h" +#include "gimple-pretty-print.h" +#include "internal-fn.h" +#include "tree-cfg.h" +#include "profile-count.h" +#include "auto-profile.h" + +/* Number of parallel cores. */ +const unsigned int PARALLEL_NUM = 304; + +/* Indirect access weight. */ +const unsigned int INDIRECT_ACCESS_VALUE = 3; + +/* Write memory weight. */ +const unsigned int WRITE_COST = 4; + +/* Maximum ratio of total prefetch data size to cache size. */ +const double PREFETCH_CACHE_SIZE_RATIO = 0.8; + +/* Prefetch tool input max length. */ +#ifndef PREFETCH_TOOL_INPUT_MAX_LEN +#define PREFETCH_TOOL_INPUT_MAX_LEN 512 +#endif + +/* Prefetch tool number max length. */ +#ifndef PREFETCH_TOOL_NUM_MAX_LEN +#define PREFETCH_TOOL_NUM_MAX_LEN 9 +#endif + +#ifndef PREFETCH_FUNC_TOPN +#define PREFETCH_FUNC_TOPN param_llc_allocate_func_topn +#endif + +namespace { + +/* loop bound info of the memory reference located. */ +struct loop_bound +{ + /* iv tree_node. */ + tree iv; + + /* define stmt of iv. */ + gimple *def_stmt; + + /* loop where stmt is located. */ + class loop *loop; + + /* loop unroll factor. */ + unsigned int unroll; + + /* Number of iterations of loop. */ + tree niters; + + loop_bound (tree t, gimple *stmt) + { + iv = t; + def_stmt = stmt; + loop = loop_containing_stmt (stmt); + unroll = 1; + niters = chrec_dont_know; + } +}; + +/* method of calculating the data size. 
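+   UNHANDLE_CALC: the footprint cannot be determined;
+   RUNTIME_CALC: it must be computed at run time from the traced loop bounds;
+   STATIC_CALC: it is a compile-time constant.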
*/ + +enum calc_type +{ + UNHANDLE_CALC = 0, + RUNTIME_CALC, + STATIC_CALC +}; + +/* Describes a info of a memory reference. */ + +struct data_ref +{ + /* The memory reference. */ + tree ref; + + /* Statement where the ref is located. */ + gimple *stmt; + + /* var_decl or param_decl, used for the ref_group. */ + tree var; + + /* Base of the reference. */ + tree base; + + /* Constant offset of the reference. */ + tree offset; + + /* index of the reference. */ + tree index; + + /* Constant step of the reference. */ + tree step; + + /* loop boundary info of each dimension. */ + std::vector loop_bounds; + + /* memory data size, Unit: MB. */ + double data_size; + + /* method of calculating the data size. */ + calc_type calc_by; + + /* True if the info of ref is traced, and then record it. */ + unsigned int trace_status_p : 1; + + /* True if the loop is vectorized. */ + unsigned int vectorize_p : 1; + + /* True if the memory reference is shared. */ + unsigned int parallel_p : 1; + + /* True if the memory reference is regular. */ + unsigned int regular_p : 1; + + /* True if the memory reference is read. */ + unsigned int read_p : 1; + + /* loop father depth. */ + unsigned int loop_depth; + + /* bb index. */ + int bb_idx; + + /* loop index. */ + int loop_idx; + + data_ref () + { + ref = NULL_TREE; + stmt = NULL; + var = NULL_TREE; + base = NULL_TREE; + offset = NULL_TREE; + index = NULL_TREE; + step = NULL_TREE; + data_size = 0; + calc_by = UNHANDLE_CALC; + trace_status_p = false; + vectorize_p = false; + parallel_p = false; + regular_p = true; + read_p = true; + loop_depth = 0; + bb_idx = 0; + loop_idx = 0; + } +}; + +/* ================ phase 1 get_dense_memory_kernels ================ */ + +/* Add ref node and print. */ + +void +add_ref (std::vector &references, tree op, gimple *stmt, + bool vectorize_p, bool read_p) +{ + data_ref ref; + ref.ref = op; + ref.stmt = stmt; + ref.vectorize_p = vectorize_p; + ref.read_p = read_p; + ref.loop_depth = loop_depth (stmt->bb->loop_father); + ref.bb_idx = stmt->bb->index; + ref.loop_idx = stmt->bb->loop_father->num; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, ref.ref, TDF_LINENO); + fprintf (dump_file, "\n"); + } + references.push_back (ref); +} + +/* Get the references from the simple call (vectorization type). */ + +void +get_references_in_gimple_call (gimple *stmt, std::vector &references) +{ + if (gimple_code (stmt) != GIMPLE_CALL) + return; + + if (gimple_call_internal_p (stmt)) + { + bool read_p = false; + switch (gimple_call_internal_fn (stmt)) + { + case IFN_MASK_GATHER_LOAD: + case IFN_MASK_LOAD: + { + if (gimple_call_lhs (stmt) == NULL_TREE) + return; + read_p = true; + // FALLTHRU + } + case IFN_MASK_STORE: + { + /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; + vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); + + _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; + .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); + + _1 = (sizetype) a_2(D); + vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, + { 0.0, ... 
}, loop_mask_5); + */ + tree op1 = gimple_call_arg (stmt, 0); + if (TREE_CODE (op1) != SSA_NAME) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "get_references_in_gimple_call: "); + fprintf (dump_file, "find base that not ssa_name: "); + print_generic_expr (dump_file, op1, TDF_LINENO); + fprintf (dump_file, "\n"); + } + return; + } + gimple *op1_def = SSA_NAME_DEF_STMT (op1); + if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN) + { + /* &MEM[base: xx] */ + tree rhs1 = gimple_assign_rhs1 (op1_def); + /* If the definition stmt of the operation is memory + reference type, read it directly. */ + if (TREE_CODE (rhs1) == ADDR_EXPR + && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF) + op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */ + } + + add_ref (references, op1, stmt, true, read_p); + return; + } + default: + return; + } + } +} + +/* Check whether memory reference is located exactly in main function. + There are some other unexpected scenarios where mem ref or function is + tracing failed without loc info (newly generated gimple/function). */ + +bool +is_reference_in_main_p (gimple *stmt) +{ + expanded_location xloc = expand_location (stmt->location); + if (DECL_NAME (cfun->decl) && MAIN_NAME_P (DECL_NAME (cfun->decl))) + { + /* NEXT STEP: Check why some functions have no end_locus. */ + if (!(DECL_SOURCE_LOCATION (current_function_decl) + && cfun->function_end_locus)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Cannot find function start-end location.\n"); + return true; + } + else if (!(xloc.file && xloc.line)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Cannot find gimple statement location.\n"); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + return false; + } + int fn_start = expand_location ( + DECL_SOURCE_LOCATION (current_function_decl)).line; + int fn_end = expand_location (cfun->function_end_locus).line; + + if (xloc.line >= fn_start && xloc.line <= fn_end) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Memory access in main function: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + return true; + } + } + return false; +} + +/* Stores the locations of memory references in STMT to REFERENCES. */ + +void +get_references_in_stmt (gimple *stmt, std::vector &references) +{ + if (!gimple_vuse (stmt)) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "gimple_vuse: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + + /* Filter out memory references located in main function. This is a + experimental filtering scheme ONLY for HPC case verification as + some HPC cases assign values for variables (mem ref) in main function. */ + if (is_reference_in_main_p (stmt)) + return; + + if (gimple_code (stmt) == GIMPLE_ASSIGN) + { + tree op0 = gimple_assign_lhs (stmt); + tree op1 = gimple_assign_rhs1 (stmt); + tree base = NULL_TREE; + + /* _1 = MEM[base: a, index: i, step: 8, offset: 0B]; */ + if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1)) + && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base)) + add_ref (references, op1, stmt, false, true); + + if (REFERENCE_CLASS_P (op0) && get_base_address (op0)) + add_ref (references, op0, stmt, false, false); + } + else if (gimple_code (stmt) == GIMPLE_CALL) + get_references_in_gimple_call (stmt, references); + + return; +} + +/* flag of loop filter out. */ + +struct loop_filter_out_flag +{ + /* Use external call. 
*/ + bool use_ext_call; + + /* Use external node. */ + bool use_ext_node; + + /* Use loop defined in macros. */ + bool use_macro_loop; + + /* Use external node. */ + bool use_cond_func; +}; + +/* Check whether an external node is used. */ + +bool use_ext_node_p (const std::vector &references, + unsigned int &start) +{ + expanded_location cfun_xloc + = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + + unsigned i = start; + start = references.size (); + for (; i < references.size (); i++) + { + data_ref ref = references[i]; + expanded_location xloc = expand_location (ref.stmt->location); + if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "use_ext_node\n\n"); + return true; + } + } + return false; +} + +/* Determine whether to filter out loops by stmt. */ + +bool +filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + const std::vector &references, + unsigned int &start) +{ + expanded_location xloc = expand_location (stmt->location); + /* check use_ext_call. */ + if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "use_ext_call: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + loop_filter.use_ext_call = true; + return true; + } + + /* check use_macro_loop. */ + if (xloc.file && xloc.column != 1) + loop_filter.use_macro_loop = false; + + /* check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */ + if (gimple_code (stmt) == GIMPLE_ASSIGN) + { + enum tree_code rhs_code = gimple_assign_rhs_code (stmt); + if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR + || rhs_code == MAX_EXPR) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "use_cond_func: "); + print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); + } + loop_filter.use_cond_func = true; + return true; + } + } + + /* check use_ext_node. */ + if (use_ext_node_p (references, start)) + { + loop_filter.use_ext_node = true; + return true; + } + + return false; +} + +/* Dump the flag type of the loop is filtered out. */ + +void +dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) +{ + if (loop_filter.use_ext_call) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_ext_call\n"); + } + + if (loop_filter.use_ext_node) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_ext_node\n"); + } + + if (loop_filter.use_macro_loop) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); + } + + if (loop_filter.use_cond_func) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "non-dense mem access: use_cond_func\n"); + } +} + +/* Get references in loop. */ + +bool +get_references_in_loop (std::vector &references, + loop_filter_out_flag &loop_filter, + class loop *loop) +{ + unsigned int start = 0; + bool filter_out_loop = true; + + /* Analyze each bb in the loop. 
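+     Blocks that belong to nested loops are skipped, and scanning stops as
+     soon as a statement triggers one of the filter-out conditions.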
*/ + basic_block *body = get_loop_body_in_dom_order (loop); + for (unsigned i = 0; i < loop->num_nodes; i++) + { + basic_block bb = body[i]; + if (bb->loop_father != loop) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); + gimple_dump_bb (dump_file, bb, 0, dump_flags); + fprintf (dump_file, "\n"); + } + + gimple_stmt_iterator bsi; + for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) + { + gimple *stmt = gsi_stmt (bsi); + get_references_in_stmt (stmt, references); + filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, + references, start); + if (filter_out_loop) + break; + } + if (filter_out_loop) + break; + } + free (body); + return !filter_out_loop; +} + +/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. + Assume that the HPC data reading and calculation process does not involve + adding branches in loops. Therefore, all bbs of loops are directly used for + calculation (excluding embedded loops) without considering branch weighting. +*/ + +unsigned +estimate_loop_insns (class loop *loop, eni_weights *weights) +{ + basic_block *body = get_loop_body (loop); + gimple_stmt_iterator gsi; + unsigned size = 0, i; + + for (i = 0; i < loop->num_nodes; i++) + { + basic_block bb = body[i]; + if (bb->loop_father != loop) + { + continue; + } + for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) + size += estimate_num_insns (gsi_stmt (gsi), weights); + } + free (body); + + return size; +} + +/* Check whether the memory access is dense. */ + +bool +dense_memory_p (const std::vector &references, class loop *loop) +{ + int ref_count = references.size (); + unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); + float mem_to_insn_ratio = (float)ref_count / (float)ninsns; + + /* The number of cores to be run and DDR bandwidth information can be + transferred to flexibly adjust the threshold. */ + bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) + && ref_count >= param_mem_access_num); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); + + /* Dump dense memory source code location. */ + if (ref_count && references[0].stmt->location) + { + expanded_location xloc = expand_location + (references[0].stmt->location); + int fn_start = 0; + if (DECL_SOURCE_LOCATION (current_function_decl)) + fn_start = expand_location ( + DECL_SOURCE_LOCATION (current_function_decl)).line; + int fn_end = fn_start; + if (cfun->function_end_locus) + fn_end = expand_location (cfun->function_end_locus).line; + if (xloc.file) + fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ", + xloc.file, fn_name, fn_start, fn_end, + xloc.line, xloc.column); + } + + /* Dump memory dense information. */ + if (dense_mem) + fprintf (dump_file, "dense memory access: "); + else + fprintf (dump_file, "non-dense mem access: "); + fprintf (dump_file, + "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n", + ref_count, ninsns, mem_to_insn_ratio); + } + + return dense_mem; +} + +/* Analyze the inner loop and get the loop with dense memory access. 
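+   Loops with more than one exit edge are rejected; the remaining loops
+   qualify only when the ratio of memory references to instructions and the
+   reference count reach the thresholds given by param_mem_access_ratio and
+   param_mem_access_num.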
*/
+
+void
+analyze_loop_dense_memory (std::vector<class loop *> &kernels,
+			    std::map<class loop *, std::vector<data_ref> > &kernels_refs,
+			    class loop *loop)
+{
+  std::vector<data_ref> references;
+  number_of_latch_executions (loop);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "\n========== Processing loop %d: ==========\n",
+	       loop->num);
+      loop_dump (dump_file, loop);
+      flow_loop_dump (loop, dump_file, NULL, 1);
+      fprintf (dump_file, "loop unroll: %d\n", loop->unroll);
+    }
+
+  if (get_loop_exit_edges (loop).length () != 1)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "non-dense mem access: loop_multiple_exits\n");
+      return;
+    }
+
+  loop_filter_out_flag loop_filter = {false, false, true, false};
+
+  if (!get_references_in_loop (references, loop_filter, loop))
+    {
+      dump_loop_filter_out_flag (loop_filter);
+      return;
+    }
+
+  if (dense_memory_p (references, loop))
+    {
+      kernels_refs[loop] = references;
+      kernels.push_back (loop);
+    }
+}
+
+/* Walk all innermost loops and collect those with dense memory access.  */
+
+bool
+get_dense_memory_kernels (std::vector<class loop *> &kernels,
+			  std::map<class loop *, std::vector<data_ref> > &kernels_refs)
+{
+  if (dump_file)
+    fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n");
+  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
+    analyze_loop_dense_memory (kernels, kernels_refs, loop);
+  return kernels.size () > 0;
+}
+
+/* ================ phase 2 trace_data_refs_info ================ */
+
+/* Determine whether EXPR is a non-vectorized declaration or component
+   reference (VAR_DECL, PARM_DECL, or COMPONENT_REF).  */
+
+bool
+generic_decl_p (tree expr)
+{
+  if (expr == NULL_TREE)
+    return false;
+  enum tree_code expr_code = TREE_CODE (expr);
+  if (expr_code != VAR_DECL && expr_code != PARM_DECL
+      && expr_code != COMPONENT_REF)
+    return false;
+  return true;
+}
+
+/* Initial worklist preparation for source variable tracing.
+   Add different initial nodes based on the kind of gimple statement.
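+   For a PHI, every argument is queued; for supported assignments, the
+   relevant rhs operands are queued; other statements are only dumped for
+   debugging.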
*/ + +void +add_worklist (std::vector &worklist, std::set &walked, + gimple *def_stmt) +{ + if (gimple_code (def_stmt) == GIMPLE_PHI) + { + for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) + { + tree node = gimple_phi_arg_def (def_stmt, i); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + } + } + else if (is_gimple_assign (def_stmt)) + { + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR + || rhs_code == NOP_EXPR || rhs_code == SSA_NAME + || rhs_code == COMPONENT_REF) + { + tree node = gimple_assign_rhs1 (def_stmt); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + } + else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) + { + tree node = gimple_assign_rhs1 (def_stmt); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + node = gimple_assign_rhs2 (def_stmt); + if (!walked.count (node)) + { + worklist.push_back (node); + walked.insert (node); + } + } + else if (rhs_code == TARGET_MEM_REF || rhs_code == MEM_REF) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "possibly unnested indirect memory access: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); + fprintf (dump_file, "\n"); + } + } + else + { + /* unhandled assign rhs_code: _219 = _17 * _70; + _17 = *grid_56(D).sst.span; + _70 = *grid_56(D).sst.dim[0].stride; + */ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "unhandled assign rhs_code: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); + fprintf (dump_file, "\n"); + } + } + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "unsupported tracing stmt: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); + fprintf (dump_file, "\n"); + } + } +} + + +/* Tracing source variables: + vectp.1 = a_2(D) + _3; + _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; + vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); + + _1 = (sizetype) b_2(D); + vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, + loop_mask_5); + ... + Due to previous pass optimizations, the current tracing method can find + several source variable candidates. We decide to record them in a map and + later filter out the true base variable by some criteria. +*/ + +void +trace_base_var_helper (tree arg, std::set &walked, + std::map& base_var_candid, bool is_vect_type) +{ + if (arg == NULL) + return; + + /* Var_decl type: base address extracted from ARRAY_REF. */ + if (TREE_CODE (TREE_TYPE (arg)) == ARRAY_TYPE && TREE_CODE (arg) == VAR_DECL + && generic_decl_p (arg)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "var_decl type\n"); + base_var_candid[arg] += 1; + return; + } + + /* Array type. */ + tree op0 = NULL; + if (TREE_CODE (arg) == ADDR_EXPR + && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "array type\n"); + base_var_candid[op0] += 1; + return; + } + + /* Pointer type. */ + if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "pointer type\n"); + base_var_candid[arg] += 1; + return; + } + + /* SSA_NAME type. 
*/ + if (TREE_CODE (arg) != SSA_NAME) + return; + + tree tmp_var = SSA_NAME_VAR (arg); + if (tmp_var && !is_vect_type && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ssa pointer type\n"); + base_var_candid[tmp_var] += 1; + return; + } + + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + if (def_stmt == NULL) + return; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, arg, TDF_SLIM); + fprintf (dump_file, "\t\t: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + + if (gimple_code (def_stmt) == GIMPLE_NOP) + { + if (!walked.count (tmp_var)) + walked.insert (tmp_var); + trace_base_var_helper (tmp_var, walked, base_var_candid, is_vect_type); + } + else + { + std::vector worklist; + add_worklist (worklist, walked, def_stmt); + for (unsigned i = 0; i < worklist.size (); ++i) + trace_base_var_helper (worklist[i], walked, base_var_candid, is_vect_type); + } +} + +/* Identify the base variable traced from base address of memory reference. + We recognize that current method could detect several base variable + candidates and the temporary criteria for base variable determination + is that either one of the following statement is true: + 1) The number of base variable candidates is 1; + 2) The number of detected gimple statements for some variable is 1. + We may use other criteria or relax the current criteria + (e.g., criterion 2: 1 -> any odd number). */ + +bool +trace_base_var (data_ref &mem_ref, std::set &walked) +{ + tree &var = mem_ref.var; + tree arg = mem_ref.base; + std::map base_var_candid; + bool is_vect_type = TREE_CODE (TREE_TYPE (mem_ref.ref)) == VECTOR_TYPE; + trace_base_var_helper (arg, walked, base_var_candid, is_vect_type); + bool is_tracing_unusual = false; + if (base_var_candid.size () == 1) + var = base_var_candid.begin ()->first; + else + { + is_tracing_unusual = true; + for (std::map::iterator it = base_var_candid.begin (); + it != base_var_candid.end (); ++it) + var = it->second == 1 ? it->first : var; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Traced variables at "); + print_generic_expr (dump_file, arg, TDF_SLIM); + fprintf (dump_file, ":\n"); + for (std::map::iterator it = base_var_candid.begin (); + it != base_var_candid.end (); ++it) + fprintf (dump_file, "%s:%d, ", get_name (it->first), it->second); + fprintf (dump_file, "\n"); + + if (var == NULL_TREE) + fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); + else if (is_tracing_unusual && var != NULL_TREE) + fprintf (dump_file, "Tracing unusual number or occurrences of base " + "variables. Choose %s.\n", + get_name (var)); + } + return var != NULL_TREE; +} + +/* Recursively trace and check whether the definition stmt of the + index operand is a recorded stmt in direct access tracing. + Return 0 if ref is a direct access a[]. + Return 1 if ref is a non-nested indirect access a[b[]]. + Return 2 if ref is a complex indirect memory access, such as a[f(b[])]. */ + +int +trace_indirect_operand (tree arg, std::set &traced_ref_stmt) +{ + /* Return 0 if tree `arg` is not an SSA for further tracing. */ + if (TREE_CODE (arg) != SSA_NAME) + return 0; + + gimple *def_stmt = SSA_NAME_DEF_STMT (arg); + + /* Return 1 if `index` has been detected as a traced direct memory access + before. */ + if (traced_ref_stmt.count (def_stmt)) + return 1; + + /* Return 0 if def stmt of `arg` is not in gimple assign type. 
Stop tracing + index operand and currently no memory access operand is detected. */ + if (!def_stmt || !is_gimple_assign (def_stmt)) + return 0; + + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array + type indirect memory access. */ + if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR + && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR) + { + /* Return 2 if tree code has any type representing references to storge, + implying a complex indirect memory access scenario for future + analysis. */ + if (rhs_code == MEM_REF || rhs_code == TARGET_MEM_REF + || rhs_code == ARRAY_REF || rhs_code == ARRAY_RANGE_REF + || rhs_code == COMPONENT_REF || rhs_code == ADDR_EXPR + || rhs_code == INDIRECT_REF) + return 2; + + /* Return 0 and stop tracing if tree code is not a common tracing + operand, but still reflected as a non-reference type. + Caveats: if we never deal with this tree code before, maybe it is + more suitable to treat this scenario strictly. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "unknown tracing tree code: %s\n", + get_tree_code_name (rhs_code)); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return 0; + } + + tree op = NULL_TREE; + ssa_op_iter iter; + FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) + { + int trace_indir_p = trace_indirect_operand (op, traced_ref_stmt); + if (trace_indir_p != 0) + return trace_indir_p; + } + return 0; +} + +/* Trace the pointer of the direct/indirect memory access: + 1) Obtain the base address of the memory access. + 2) If index variable is formed by another memory access operation (i.e., an + indication of indirect memory access), ensure that the index has been + traced in an already discovered direct memory access. + 3) Otherwise, the memory access is in a more complex scenario and we need to + postpone the analysis later. For example, the indirect memory access is + nested, a[b[c[...]]], or the index variable (formed in another memory + access) has not been recorded/traced yet. + e.g., + _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; + _4 = (long unsigned int) _1; + _5 = _4 * 8; + _6 = p(D) + _5; // get base + _7 = *_6; // start tracing +*/ + +bool +trace_ptr_mem_ref (data_ref &mem_ref, std::set &traced_ref_stmt, + std::vector &unresolved_refs) +{ + /* Simple scenario: + _2208 = np.120_2207 * 8; + _1921 = sorted_weight$data_381 + _2208; + *_1921 = _2206; + + Complex scenario: + MEM[base: _3235, index: ivtmp.2768_3189, step: 4, offset: 0B] = _105; + _3236 = (sizetype) _214; + _3237 = _3236 * 4; + _3238 = _857 + _3237; // base + index * step + _3239 = _3238 + 4; // offset + MEM[base: _3239, index: ivtmp.2768_3189, step: 4, offset: 0B] = 0.0; + */ + tree pointer = TREE_OPERAND (mem_ref.ref, 0); + tree offset = TREE_OPERAND (mem_ref.ref, 1); + if (TREE_CODE (offset) != INTEGER_CST) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Unhandled scenario for non-constant offset.\n"); + + return false; + } + if (TREE_CODE (pointer) != SSA_NAME) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Unhandled scenario for non-ssa pointer.\n"); + + return false; + } + + /* Tracing back base address from SSA. 
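+     The pointer must be defined by a POINTER_PLUS_EXPR of the form
+     base + index * step; any other definition stops the trace.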
*/ + gimple *ptr_def_stmt = SSA_NAME_DEF_STMT (pointer); + if (ptr_def_stmt == NULL || gimple_code (ptr_def_stmt) != GIMPLE_ASSIGN + || gimple_assign_rhs_code (ptr_def_stmt) != POINTER_PLUS_EXPR) + return false; + tree base = gimple_assign_rhs1 (ptr_def_stmt); + /* index_offset = index * step. */ + tree index_offset = gimple_assign_rhs2 (ptr_def_stmt); + + /* Tracing back index from SSA. */ + if (TREE_CODE (index_offset) != SSA_NAME) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + if (TREE_CODE (index_offset) == INTEGER_CST) + fprintf (dump_file, "Constant index for memory access.\n"); + else + fprintf (dump_file, "Unhandled scenario for index tracing.\n"); + } + return false; + } + + gimple *idx_def_stmt = SSA_NAME_DEF_STMT (index_offset); + if (idx_def_stmt == NULL || gimple_code (idx_def_stmt) != GIMPLE_ASSIGN + || gimple_assign_rhs_code (idx_def_stmt) != MULT_EXPR) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Unhandled scenario for index tracing.\n"); + return false; + } + + /* Split array index from total offset of index, `index * step`. */ + mem_ref.base = base; + mem_ref.offset = offset; + mem_ref.index = gimple_assign_rhs1 (idx_def_stmt); + mem_ref.step = gimple_assign_rhs2 (idx_def_stmt); + if (TREE_CODE (gimple_assign_rhs1 (idx_def_stmt)) == INTEGER_CST) + { + mem_ref.index = gimple_assign_rhs2 (idx_def_stmt); + mem_ref.step = gimple_assign_rhs1 (idx_def_stmt); + } + + int trace_index_indir_p = trace_indirect_operand (mem_ref.index, + traced_ref_stmt); + if (trace_index_indir_p == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Direct memory access tracing succeeded.\n"); + } + else if (trace_index_indir_p == 1) + { + mem_ref.regular_p = false; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); + } + else + { + /* Record indirect memory access with complex scenarios for future + analysis. */ + unresolved_refs.push_back (mem_ref); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Unhandled indirect memory access tracing.\n"); + return false; + } + + return true; +} + +/* Tracing direct memory reference information. */ + +bool +trace_direct_mem_ref (data_ref &mem_ref) +{ + /* Direct memory access, regardless of whether it is in vectorized form, + can be determined through TARGET_MEM_REF: + address = base + index * step + offset. + MASK_LOAD example: + _43 = &MEM[base: _42, index: ivtmp_140, step: 8, offset: 0B]; + vect__42.11_160 = .MASK_LOAD (_43, 64B, loop_mask_163); + + In some cases (2D-array or complex-index 1D array), mem_ref's `base` + may actually represent `base + index * step` when `base` address updates + by a PHI operation, e.g., + MEM[base: _51, offset: 0B] + _51 = (void *) ivtmp.18_11; + ivtmp.18_11 = PHI + ivtmp.18_43 = ivtmp.18_11 + 16; + ivtmp.18_52 = (unsigned long) _10; + _10 = arr2D_29(D) + _9; + */ + mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); + mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); + mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); + mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Direct memory access tracing succeeded.\n"); + + return true; +} + +/* Tracing vectorized indirect memory reference information. 
+ MASK_GATHER_LOAD example: + vect__45.13_146 = .MASK_LOAD (_41, 32B, loop_mask_153); + vect__46.14_145 = (vector([2,2]) long unsigned int) vect__45.13_146; + vect_patt_163.15_143 = .MASK_GATHER_LOAD (_144, vect__46.14_145, 8, + { 0.0, ... }, loop_mask_153); */ + +bool +trace_indirect_mem_ref_vectorized (data_ref &mem_ref, + std::set &traced_ref_stmt) +{ + /* Processing of vectorization types. */ + if (mem_ref.vectorize_p) + { + tree op = gimple_call_arg (mem_ref.stmt, 1); + if (trace_indirect_operand (op, traced_ref_stmt)) + { + mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); + mem_ref.index = gimple_call_arg (mem_ref.stmt, 1); + mem_ref.step = gimple_call_arg (mem_ref.stmt, 2); + mem_ref.regular_p = false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); + return true; + } + } + return false; +} + +/* Trace the array of the indirect memory access: + 1) Obtain the base address of the indirect memory access. + 2) Ensure that the index has been traced in the direct memory access. + e.g., + _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; + _4 = (integer(kind=8)) _1; + _5 = _4 + 135; + _6 = p[_5]; // start tracing +*/ + +bool +trace_indirect_array (data_ref &mem_ref, std::set &traced_ref_stmt) +{ + tree base = TREE_OPERAND (mem_ref.ref, 0); + tree index = TREE_OPERAND (mem_ref.ref, 1); + if (trace_indirect_operand (index, traced_ref_stmt)) + { + /* ARRAY_REF, The first operand is the array; + the second is the index. */ + mem_ref.base = base; + mem_ref.index = index; + mem_ref.regular_p = false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); + + return true; + } + + return false; +} + +/* Trace memory references base info: + 1) Memory access rule analysis and reference info tracing + 2) Source variable tracing, along base address of memory reference + We will extend parallel analysis later. +*/ + +void +trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt, + std::vector &unresolved_refs) +{ + enum tree_code ref_code = TREE_CODE (mem_ref.ref); + /* 1) Direct and indirect access traces. */ + switch (ref_code) + { + case MEM_REF: + /* Non-vectorized direct/indirect access by pointer. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "MEM_REF\n"); + if (!trace_ptr_mem_ref (mem_ref, traced_ref_stmt, unresolved_refs)) + return; + break; + case TARGET_MEM_REF: + /* Vectorized and non-vectorized direct access. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "TARGET_MEM_REF\n"); + if (!trace_direct_mem_ref (mem_ref)) + return; + break; + case SSA_NAME: + /* Vectorized indirect memory access. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "SSA_NAME\n"); + if (!trace_indirect_mem_ref_vectorized (mem_ref, traced_ref_stmt)) + return; + break; + case ARRAY_REF: + /* Non-vectorized indirect memory access. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ARRAY_REF\n"); + if (!trace_indirect_array (mem_ref, traced_ref_stmt)) + return; + break; + default: + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ref is another tree-code: "); + fprintf (dump_file, "stmt: "); + print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); + fprintf (dump_file, "ref: "); + print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); + fprintf (dump_file, "\n"); + } + return; + } + + /* 2) Source variable tracing. 
*/ + std::set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref, walked)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Source variable tracing failed.\n\n"); + return; + } + + if (mem_ref.regular_p) + traced_ref_stmt.insert (mem_ref.stmt); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Tracing succeeded.\n\n"); + mem_ref.trace_status_p = true; +} + +/* Trace all references in the loop. */ + +void +trace_loop_refs_info (std::vector &refs, + std::set &traced_ref_stmt, + std::vector &unresolved_refs) +{ + for (unsigned i = 0; i < refs.size (); ++i) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "trace_references_base_info %d:\n", i); + print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } + trace_ref_info (refs[i], traced_ref_stmt, unresolved_refs); + } +} + +/* Tracing and sorting reference groups. */ + +void +trace_data_refs_info (std::vector &kernels, + std::map > &loop_refs, + std::set &traced_ref_stmt, + std::vector &unresolved_refs) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); + + for (unsigned i = 0; i < kernels.size (); ++i) + { + class loop *loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); + trace_loop_refs_info (loop_refs[loop], traced_ref_stmt, unresolved_refs); + } +} + +/* Retrace references base info for complex scenarios in indirect memory access + after Phase 3. */ + +void +retrace_ref_info_unresolved (data_ref &mem_ref, + std::set &traced_ref_stmt) +{ + /* 1) Indirect access traces. */ + int trace_index_indir_p = trace_indirect_operand (mem_ref.index, + traced_ref_stmt); + if (trace_index_indir_p == 1) + { + mem_ref.regular_p = false; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); + } + + /* 2) Source variable tracing. */ + std::set walked; + if (mem_ref.var == NULL_TREE + && !trace_base_var (mem_ref, walked)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Source variable tracing failed.\n\n"); + return; + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Tracing succeeded.\n\n"); + mem_ref.trace_status_p = true; +} + +/* Retrace all unresolved references. */ + +void +retrace_loop_refs_info_unresolved (std::vector &unresolved_refs, + std::set &traced_ref_stmt) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "\nRetrace indirect memory access after outer loop analysis:\n"); + for (unsigned i = 0; i < unresolved_refs.size (); ++i) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "trace_references_base_info %d:\n", i); + print_generic_expr (dump_file, unresolved_refs[i].ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } + retrace_ref_info_unresolved (unresolved_refs[i], traced_ref_stmt); + } +} + +/* ================ phase 3 analyze_nested_kernels ================ */ + +/* Return the inner most type for arrays and pointers of TYPE. */ + +tree +inner_type (tree type) +{ + while (POINTER_TYPE_P (type) + || TREE_CODE (type) == ARRAY_TYPE) + type = TREE_TYPE (type); + return type; +} + +/* Check whether the input iv is the loop dimension boundary. 
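+   That is, check whether T (after stripping NOP conversions) is defined by
+   a two-argument PHI with one input from the current loop and one from the
+   immediately enclosing loop; the outer-loop input is returned through
+   OUTER_LOOP_T.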
*/ + +bool +loop_bound_iv_p (tree t, tree &outer_loop_t) +{ + if (t == NULL || TREE_CODE (t) != SSA_NAME + || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE) + return false; + + gimple *def_stmt = SSA_NAME_DEF_STMT (t); + + /* NOP_EXPR convertion between PHI node and memory reference due to MACRO. + n_898 = PHI + _757 = (sizetype) n_898; + _900 = MEM[base: _726, index: _757, step: 8, offset: 0B]; + */ + while (gimple_code (def_stmt) == GIMPLE_ASSIGN + && gimple_assign_rhs_code (def_stmt) == NOP_EXPR) + def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt)); + + if (gimple_code (def_stmt) != GIMPLE_PHI) + return false; + + /* Filter scenarios with only two phi inputs. */ + if (gimple_phi_num_args (def_stmt) != 2) + return false; + + gphi *phi_stmt = as_a (def_stmt); + basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src; + basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src; + + class loop *loop = loop_containing_stmt (def_stmt); + bool res = false; + /* Two phi inputs, one from the current loop and one from the outer loop. */ + if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop))) + { + outer_loop_t = gimple_phi_arg_def (def_stmt, 1); + res = true; + } + else if ((src1->loop_father == loop) + && (src0->loop_father == loop_outer (loop))) + { + outer_loop_t = gimple_phi_arg_def (def_stmt, 0); + res = true; + } + + if (res) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "===> "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + return true; + } + return false; +} + +/* add worklist and walked list. */ + +void +add_worklist_walked (std::vector &worklist, std::set &walked, + tree node) +{ + if (!walked.count (node)) + { + worklist.push_back (node); + /* Avoid phi node cycle introduction, which makes the worklist unable + to end. */ + walked.insert (node); + } +} + +/* check bound iv and add worklist. */ + +void +check_bound_iv_and_add_worklist (std::vector &worklist, + std::set &walked, + std::set &walked_loop, + tree t, data_ref &mem_ref) +{ + if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) + return; + + gimple *def_stmt = SSA_NAME_DEF_STMT (t); + if (def_stmt == NULL) + return; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, t, TDF_SLIM); + fprintf (dump_file, "\t\t: "); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); + } + + if (gimple_code (def_stmt) == GIMPLE_PHI) + { + tree out_loop_t = NULL_TREE; + if (loop_bound_iv_p (t, out_loop_t)) + { + basic_block bb = gimple_bb (def_stmt); + if (!walked_loop.count (bb)) + { + mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); + walked_loop.insert (bb); + } + add_worklist_walked (worklist, walked, out_loop_t); + } + } + else if (is_gimple_assign (def_stmt)) + { + tree_code rhs_code = gimple_assign_rhs_code (def_stmt); + + /* unary. */ + if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) + add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); + else if (rhs_code == POINTER_PLUS_EXPR) + add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); + + /* binary. */ + else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR + || rhs_code == MULT_EXPR) + { + add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); + add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); + } + } +} + +/* DFS trace the loop bound of iv. */ + +bool +trace_loop_bound_iv (data_ref &mem_ref) +{ + /* In indirect memory access, the size cannot be determined based on the + loop boundary. 
However, we can take advantage of loop bound as an upper
+     bound (unrepeated memory access) to predict the variable footprint
+     involved in the specific loop dimension.  */
+
+  /* Determine and record the boundary iv of the current index,
+     but do not trace it.  */
+  tree outer_loop_t = NULL_TREE;
+  /* Indirect access example, mem_ref.index = _64
+     _62 = MEM[symbol: uPtr, index: ivtmp.22_96, step: 4, offset: 0B];
+     _63 = (long unsigned int) _62;
+     _64 = _63 * 8;
+     _65 = [openfoam_smooth.c:28:28] &bPrimePtr + _64;
+     _66 = *_65;  */
+  if (loop_bound_iv_p (mem_ref.index, outer_loop_t) || !mem_ref.regular_p)
+    {
+      mem_ref.loop_bounds.push_back (
+	loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index)));
+      if (!mem_ref.regular_p)
+	return false;
+    }
+
+  std::vector<tree> worklist;
+  worklist.push_back (mem_ref.base);
+  std::set<tree> walked;
+  std::set<basic_block> walked_loop;
+
+  while (worklist.size ())
+    {
+      tree t = worklist.back ();
+      worklist.pop_back ();
+
+      /* Add candidate bound ivs to the worklist.  */
+      check_bound_iv_and_add_worklist (worklist, walked, walked_loop, t,
+				       mem_ref);
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "\nmem_ref access dimension: %ld\n",
+	       mem_ref.loop_bounds.size ());
+      fprintf (dump_file, "Traced variables: ");
+      print_generic_expr (dump_file, mem_ref.base, TDF_SLIM);
+      fprintf (dump_file, "\n");
+    }
+
+  return mem_ref.loop_bounds.size () > 0;
+}
+
+/* Dump a loop bound.  */
+
+void
+loop_bound_dump (FILE *file, loop_bound &lb)
+{
+  class loop *loop = lb.loop;
+  fprintf (file, "loop_bound: loop_%d (", loop->num);
+  if (loop->header)
+    fprintf (file, "header = %d", loop->header->index);
+  else
+    {
+      fprintf (file, "deleted)\n");
+      return;
+    }
+  if (loop->latch)
+    fprintf (file, ", latch = %d", loop->latch->index);
+  fprintf (file, ", lb_niters = ");
+  print_generic_expr (file, lb.niters);
+  fprintf (file, ")\n\n");
+}
+
+/* Statically calculate the data size.  */
+
+void
+static_calculate_data_size (data_ref &mem_ref)
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "\nstatic_calculate_data_size\n");
+
+  tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
+  unsigned HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0;
+  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
+    {
+      unsigned HOST_WIDE_INT est_niter = tree_to_uhwi
+	(mem_ref.loop_bounds[i].niters);
+      unsigned int unroll = mem_ref.loop_bounds[i].unroll;
+      if (i == 0)
+	{
+	  /* The conversion factor between bytes, kilobytes, and megabytes
+	     is 1024.  */
+	  mem_ref.data_size = double (type_size
+				      * est_niter * unroll) / 1024 / 1024;
+	}
+      else
+	mem_ref.data_size *= est_niter * unroll;
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size);
+    }
+}
+
+/* Recursively trace and create dominating nodes.
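+   Follow the defining statements of EXPR until reaching nodes whose
+   definitions dominate the header of OUTERMOST, rebuilding the unary and
+   binary expressions seen along the way; return NULL_TREE when this is
+   not possible.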
*/ + +tree +trace_and_create_dominate_expr (tree expr, class loop *outermost) +{ + if (expr == NULL_TREE || is_gimple_constant (expr)) + return expr; + + if (TREE_CODE (expr) != SSA_NAME) + return NULL_TREE; + + if (SSA_NAME_IS_DEFAULT_DEF (expr)) + return expr; + + gimple *stmt = SSA_NAME_DEF_STMT (expr); + basic_block def_bb = gimple_bb (stmt); + if (def_bb == NULL || def_bb->loop_father == NULL) + return NULL_TREE; + + if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb)) + return expr; + + if (gimple_code (stmt) != GIMPLE_ASSIGN) + return NULL_TREE; + + enum tree_code rhs_code = gimple_assign_rhs_code (stmt); + tree_code_class code_class = TREE_CODE_CLASS (rhs_code); + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); + tree rhs1 = trace_and_create_dominate_expr (gimple_assign_rhs1 (stmt), + outermost); + if (rhs1 == NULL_TREE) + return NULL_TREE; + + if (code_class == tcc_unary) + { + tree expr_new = build1 (rhs_code, type, rhs1); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + else if (code_class == tcc_binary) + { + tree rhs2 = trace_and_create_dominate_expr (gimple_assign_rhs2 (stmt), + outermost); + if (rhs2 == NULL_TREE) + return NULL_TREE; + + tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + + return NULL_TREE; +} + +/* Recursive parsing and craating of nodes in expr expressions. */ + +tree +parse_and_create_expr (tree expr, class loop *outermost) +{ + if (expr == NULL_TREE || expr == chrec_dont_know + || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR) + { + /* tcc_expression (e.g., &q) situation combined with tcc_unary. */ + if (TREE_CODE (expr) == ADDR_EXPR && dump_file + && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "tcc_expression case in ADDR_EXPR: "); + print_generic_expr (dump_file, expr, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr; + } + + if (TREE_CODE (expr) == SSA_NAME) + return trace_and_create_dominate_expr (expr, outermost); + else if (EXPR_P (expr)) + { + enum tree_code tree_code = TREE_CODE (expr); + tree_code_class code_class = TREE_CODE_CLASS (tree_code); + tree type = TREE_TYPE (expr); + tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost); + if (op1 == NULL_TREE) + return NULL_TREE; + + if (code_class == tcc_unary) + { + tree expr_new = build1 (tree_code, type, op1); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + else if (code_class == tcc_binary) + { + tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost); + if (op2 == NULL_TREE) + return NULL_TREE; + + tree expr_new = fold_build2 (tree_code, type, op1, op2); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "expr_new = "); + print_generic_expr (dump_file, expr_new, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return expr_new; + } + } + return NULL_TREE; +} + +/* Trace and creat dominate loop bounds. */ + +void +trace_and_create_dominate_loop_bounds (data_ref &mem_ref) +{ + /* Check whether the niters is a loop dominant. + If not, trace and determine whether the result is dominant. 
If yes,
+     create the expr of the dominant node.
+  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n");
+
+  /* Determine whether each loop bound expression is dominated by the
+     outermost traced loop, and rebuild it at that level if necessary.  */
+  loop_bound &outermost = mem_ref.loop_bounds.back ();
+  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
+    {
+      loop_bound &current = mem_ref.loop_bounds[i];
+      tree &niters = current.niters;
+      if (TREE_CODE (niters) == COND_EXPR)
+	niters = TREE_OPERAND (niters, 1);
+
+      niters = parse_and_create_expr (niters, outermost.loop);
+
+      if (niters == NULL_TREE)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
+	      fprintf (dump_file, "Tracing loop bound failed at dimension %d\n",
+		       i);
+	    }
+	  mem_ref.calc_by = UNHANDLE_CALC;
+	  break;
+	}
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	loop_bound_dump (dump_file, mem_ref.loop_bounds[i]);
+    }
+}
+
+/* Trace the dimension and corresponding loop bounds of mem_ref.
+   This function is used to supplement the information of mem_ref.loop_bounds.
+*/
+
+void
+trace_ref_dimension_and_loop_bounds (data_ref &mem_ref)
+{
+  /* In the same loop, some memory access dimensions are different.  Remove
+     variables with fewer dimensions.
+     Previous cyclic filtering conditions and memory access node records and
+     tracing.
+     The false result is also processed.
+  */
+  if (dump_file)
+    fprintf (dump_file, "\ncalculate_data_size\n");
+
+  /* Trace the loop bound iv of ref to determine the dimension.  */
+  /* Record data from the loop perspective to avoid repeated tracing.  */
+  if (!trace_loop_bound_iv (mem_ref))
+    return;
+
+  /* The traced mem_ref may have multiple dimensions, which correspond to
+     multiple loops.  */
+  /* During the dimension-by-dimension analysis, the calculation method can
+     only be downgraded (STATIC_CALC -> RUNTIME_CALC -> UNHANDLE_CALC).  */
+  mem_ref.calc_by = STATIC_CALC;
+  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
+    {
+      class loop *loop = mem_ref.loop_bounds[i].loop;
+      tree &niters = mem_ref.loop_bounds[i].niters;
+
+      /* Set NULL_TREE to ensure that nb_iterations are retraced and
+	 vec_nb_iterations are also extracted.  */
+      loop->nb_iterations = NULL_TREE;
+      niters = number_of_latch_executions (loop, false);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	loop_dump (dump_file, loop);
+
+      if (loop->unroll)
+	{
+	  if (loop->unroll == USHRT_MAX && dump_file
+	      && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX);
+	  mem_ref.loop_bounds[i].unroll = loop->unroll;
+	}
+
+      if ((niters == chrec_dont_know) && loop->vec_nb_iterations
+	  && (loop->vec_nb_iterations != chrec_dont_know))
+	niters = loop->vec_nb_iterations;
+
+      if (niters == chrec_dont_know)
+	{
+	  /* We derive est_loop_niters from function
+	     `estimated_loop_iterations_int`.  Usually only the innermost loop
+	     is vectorized, so vec_nb_iterations can be 4 or 8 times as large
+	     as `est_loop_niters` due to vectorization.  However, function
+	     `estimated_loop_iterations_int` only returns an integer instead of
+	     a tree node expression, so it cannot substitute
+	     function `number_of_latch_executions` in runtime computation.
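+	     The estimate is therefore only used as a constant fallback when
+	     the symbolic niter expression is unknown.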
*/ + HOST_WIDE_INT est_loop_niters = estimated_loop_iterations_int (loop); + if (est_loop_niters >= 0 && est_loop_niters < INT_MAX) + /* e.g., loop iterations from `estimated_loop_iterations_int`: (-1) + loop_144 (header = 519, latch = 625, niter = scev_not_known, + upper_bound = 1073741823, likely_upper_bound = 1073741823, + unroll = 1) */ + /* variable `niters` from `loop->vec_nb_iterations` + constant 34> */ + niters = build_int_cst (integer_type_node, (int) est_loop_niters); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); + + if (niters == NULL_TREE || niters == chrec_dont_know) + mem_ref.calc_by = std::min (mem_ref.calc_by, UNHANDLE_CALC); + else if (TREE_CODE (niters) != INTEGER_CST) + mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); + else + mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + if (mem_ref.calc_by == 2) + { + fprintf (dump_file, "\nniters: "); + print_generic_expr (dump_file, niters, TDF_SLIM); + fprintf (dump_file, "\nSTATIC_CALC.\n"); + } + else if (mem_ref.calc_by == 1) + { + fprintf (dump_file, "\nniters: "); + print_generic_expr (dump_file, niters, TDF_SLIM); + fprintf (dump_file, "\nRUNTIME_CALC.\n"); + } + else + fprintf (dump_file, "\nUNHANDLE_CALC.\n"); + } + } + + if (mem_ref.calc_by == RUNTIME_CALC) + trace_and_create_dominate_loop_bounds (mem_ref); + else if (mem_ref.calc_by == STATIC_CALC) + static_calculate_data_size (mem_ref); +} + +/* Get the loop's niters tree. + Return NULL_TREE if not found. */ + +tree +get_cur_loop_niters (std::map > &loop_refs, + class loop *loop) +{ + if (loop_refs.count (loop) == 0) + return NULL_TREE; + std::vector bounds = loop_refs[loop][0].loop_bounds; + return bounds.size () ? bounds[0].niters : NULL_TREE; +} + +/* Trace the sources of the niters tree and return the + outermost depth of the loops containing them. + Return start_depth if not found. + + example: + niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) + operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) + operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 + operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 + operand_num: 1, subtree:(int) i_end_417 + SSA_NAME of niters: i_end_417 + gimple of SSA: i_end_417 = PHI + return gimple depth; +*/ + +unsigned +trace_outer_loop_depth (tree niters, unsigned start_depth) +{ + /* If niter does not exist or the type is INTEGER_CST, + the loop bound is determined and return start_depth. */ + if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) + return start_depth; + + gimple *def_stmt = NULL; + /* niters examples: i_start_452, fEnd_35, fEnd_100. */ + enum tree_code niter_code = TREE_CODE (niters); + if (niter_code == SSA_NAME) + { + /* Trace the SSA that define this niter. */ + def_stmt = SSA_NAME_DEF_STMT (niters); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ssa_name of niters: "); + print_generic_expr (dump_file, niters); + fprintf (dump_file, "\ngimple of ssa: \n"); + print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); + fprintf (dump_file, "\n"); + } + /* Termination condition of dfs. Return the depth of the bb block. 
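+ A PHI or NOP definition cannot be traced any further, so the loop + depth of its defining bb is taken as the traced result.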
*/ + if (gimple_code (def_stmt) == GIMPLE_PHI + || gimple_code (def_stmt) == GIMPLE_NOP) + { + basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); + if (def_bb == NULL || def_bb->loop_father == NULL) + return start_depth; + unsigned ret_depth = loop_depth (def_bb->loop_father); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Stop tracing the outer loop depth, "); + fprintf (dump_file, "current depth: %d, current bb: %d\n", + ret_depth, def_bb->index); + } + return ret_depth; + } + /* 'ASSIGN': Use dfs to trace the rhs of the assignment statement. */ + else if (gimple_code (def_stmt) == GIMPLE_ASSIGN) + { + tree rhs = gimple_assign_rhs1 (def_stmt); + if (TREE_CODE (rhs) == TARGET_MEM_REF) + /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, + offset: 0B] */ + return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); + else + { + /* M.218_658 = MIN_EXPR <_631, _657> */ + unsigned min_depth = start_depth; + unsigned operand_num = gimple_num_ops (def_stmt); + /* 'ASSIGN': start from 1 because op[0] is the lhs. */ + for (unsigned i = 1; i < operand_num; i++) + { + tree subtree = dyn_cast(def_stmt)->op[i]; + if (subtree == NULL) + continue; + unsigned depth = trace_outer_loop_depth (subtree, \ + start_depth); + min_depth = MIN (min_depth, depth); + } + return min_depth; + } + } + else + { + /* Adding termination conditions: + 1) Niters is MEM variable; + 2) Niters is a runtime value (smooth_uPtr), and consider + finding footprint in other mem_ref; + 3) Niters is loop variable (i_start/i_end), and the boundary in + the outer loop depends on the variable j_start/j_end. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "The loop termination condition is " + "extended.\n"); + } + return start_depth; + } + } + /* The operand nums can be obtained when the tree code is as follows. */ + else if (niter_code == NOP_EXPR || niter_code == MEM_REF + || niter_code == ARRAY_REF || niter_code == COND_EXPR + || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR + || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) + { + /* operand_num is the operand in the niters statement. + example: In the following niter statement, operand_num = 3. + (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ + unsigned operand_num = TREE_OPERAND_LENGTH (niters); + unsigned min_depth = start_depth; + for (unsigned i = 0; i < operand_num; i++) + { + tree subtree = TREE_OPERAND (niters, i); + if (subtree == NULL) + continue; + unsigned depth = trace_outer_loop_depth (subtree, start_depth); + min_depth = MIN (min_depth, depth); + } + return min_depth; + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "niters is another tree code: %s\n", + get_tree_code_name (niter_code)); + print_generic_expr (dump_file, niters, TDF_SLIM); + fprintf (dump_file, "\n"); + } + return start_depth; + } +} + +/* Traces the ref dimension information in each loop. 
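+ Refs whose trace_status_p is not set are skipped.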
*/ + +void +analyze_loop_refs_dimension (std::vector &refs) +{ + for (unsigned i = 0; i < refs.size (); ++i) + { + if (refs[i].trace_status_p == false) + continue; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "trace_reference_dimension %d:\n", i); + print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } + trace_ref_dimension_and_loop_bounds (refs[i]); + } +} + +/* analyze nested kernels + 1) multidimension loop analyze + 2) extended outer loop analyze +*/ + +bool +analyze_nested_kernels (std::vector &kernels, + std::map > &loop_refs, + std::set &traced_ref_stmt, + std::vector &unresolved_refs) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); + + /* `kernels` may be added in during outer loop extension phase, + thus using initial size to avoid repeatedly analyzing. */ + unsigned init_kernels_size = kernels.size (); + for (unsigned i = 0; i < init_kernels_size; ++i) + { + class loop *loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); + analyze_loop_refs_dimension (loop_refs[loop]); + + unsigned depth = loop_depth (loop); + unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters + (loop_refs, loop), depth); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", + depth, outer_depth); + /* param_outer_loop_num: number of loops of the extended outer loop. + Outermost loop should not be extended when outer_depth = 0. + `outer_depth == depth` means the current loop is the loop which + boundary is known, so there is no need to extend the outer loop. */ + if (outer_depth == 0 || outer_depth == depth + || depth > outer_depth + param_outer_loop_num) + continue; + + /* Extend outer loop. */ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nStart extending outer loop\n"); + /* Superloops of the loop, start from the loop closest to the + current loop in the outermost loop. */ + for (int j = 0; j < param_outer_loop_num && --depth; ++j) + { + class loop *outer_loop = (*loop->superloops)[depth]; + /* The outer loop may be added when analyzing previous inner loops, + i.e. the outer loop contains two or more inner loops. */ + if (loop_refs.count (outer_loop)) + continue; + /* phase1 ~ phase3 analysis on the extended outer loop. */ + analyze_loop_dense_memory (kernels, loop_refs, outer_loop); + if (loop_refs.count (outer_loop) == 0) + continue; + for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); + print_generic_expr (dump_file, loop_refs[outer_loop][k].ref, + TDF_SLIM); + fprintf (dump_file, "\n"); + } + } + trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt, + unresolved_refs); + analyze_loop_refs_dimension (loop_refs[outer_loop]); + outer_depth = trace_outer_loop_depth (get_cur_loop_niters + (loop_refs, outer_loop), depth); + /* `outer_depth == depth` means the current loop is the loop which + boundary is known, so there is no need to extend the outer loop. */ + if (outer_depth == depth) + break; + else + /* The outer loop cannot find the current loop boundary, + Remove the record of outer_loop from the loop_refs. 
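+ The next, further out, superloop is then tried in the following + iteration, as long as the extension limits allow.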
*/ + loop_refs.erase (outer_loop); + } + } + return true; +} + +/* ================ phase 4 filter_and_sort_kernels ================ */ + +/* Get the edge probability information of each basic block in the loop. */ + +float +get_edge_prob (edge e, float minimum) +{ + float fvalue = 0; + + profile_probability probability = e->probability; + if (probability.initialized_p ()) + { + fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); + if (fvalue < minimum && probability.to_reg_br_prob_base ()) + fvalue = minimum; + } + return fvalue; +} + +/* Get the next bb with a high branch probability. */ + +basic_block +next_high_probability_bb (basic_block bb) +{ + if (bb == NULL) + return NULL; + + /* Limit the minimum probability value. */ + const float MINNUM_PROB = 0.00001f; + float minimum = MINNUM_PROB; + + gimple *stmt = last_stmt (bb); + if (stmt && gimple_code (stmt) == GIMPLE_COND) + { + edge true_edge = NULL; + edge false_edge = NULL; + extract_true_false_edges_from_block (bb, &true_edge, &false_edge); + + float true_edge_prob = get_edge_prob (true_edge, minimum); + float false_edge_prob = get_edge_prob (false_edge, minimum); + /* If the content of the branch does not include the candidate + kernel, the branch probability may not be limited. */ + /* The edge_prob may have precision error during static prediction, + so we need to relax the limit before comparison. */ + if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) + && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) + return true_edge->dest; + else if ((false_edge_prob + >= (param_branch_prob_threshold / 100.0) - minimum) + && flow_bb_inside_loop_p (bb->loop_father, false_edge->dest)) + return false_edge->dest; + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "No high probability bb:"); + fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", + bb->index, true_edge_prob, false_edge_prob); + } + return NULL; + } + } + else + { + edge e = find_fallthru_edge (bb->succs); + if (e) + return e->dest; + } + return NULL; +} + + +/* Dump loop header bb. */ + +void +dump_loop_headers (const char *name, std::vector &loops) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\n\n%s:\n", name); + fprintf (dump_file, "{ "); + for (unsigned int i = 0; i < loops.size (); i++) + fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); + fprintf (dump_file, "}\n\n"); + } +} + +/* Combine and sort candidate loops. */ + +bool +filter_and_sort_kernels (std::vector &sorted_kernels, + std::vector &kernels) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); + + std::set end_bb; + std::list walked_header_bb; /* Used to record nested loops. */ + std::set walked_non_header_bb_idx; + + for (unsigned i = 0; i < kernels.size (); ++i) + { + if (kernels[i]->inner == NULL) + end_bb.insert (kernels[i]->header); + } + + dump_loop_headers ("kernels", kernels); + + if (!param_filter_kernels) + { + for (std::vector::iterator it = kernels.begin (); + it != kernels.end (); ++it) + sorted_kernels.push_back (*it); + } + else + { + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); + + while (bb) + { + if (bb == NULL) + return false; + if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) + break; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d ", bb->index); + + /* bb is not the head of the loop, go to the next. 
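+ Revisiting a non-header bb here means the high-probability walk has + entered a cycle that does not pass a loop header, so the filtering + is aborted.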
*/ + if (bb != bb->loop_father->header) + { + if (walked_non_header_bb_idx.count (bb->index)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Find same-loop cycle. " + "Abort filtering process.\n"); + return false; + } + walked_non_header_bb_idx.insert (bb->index); + bb = next_high_probability_bb (bb); + continue; + } + + /* bb is the head of the loop. */ + if (bb != walked_header_bb.back ()) + { + if (end_bb.count (bb)) + { + sorted_kernels.push_back (bb->loop_father); + bb = single_exit (bb->loop_father)->dest; + continue; + } + if (loop_outer (bb->loop_father) != NULL + && get_loop_exit_edges (bb->loop_father).length () != 1) + return false; + walked_header_bb.push_back (bb); + bb = next_high_probability_bb (bb); + continue; + } + else + { + walked_header_bb.pop_back (); + bb = single_exit (bb->loop_father)->dest; + continue; + } + } + } + + dump_loop_headers ("sorted_kernels", sorted_kernels); + return true; +} + +/* Check whether the given bb is null. */ + +bool +check_null_bb (basic_block bb) +{ + if (bb == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Unexpected error at null bb.\n"); + return true; + } + return false; +} + +/* Check whether the loop father of the given bb is null. */ + +bool +check_null_loop_father (basic_block bb) +{ + if (check_null_bb (bb)) + return true; + + if (bb->loop_father == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "bb %d's loop father is null.\n", bb->index); + return true; + } + return false; +} + +/* States for bb during path traversal. */ + +enum bb_traversal_state +{ + NOT_TRAVERSED = 0, + UNDER_TRAVERSAL, + FULLY_TRAVERSED +}; + +/* Detect abnormal revisit for bb during path traversal where bb is + 1) fully traversed, + 2) non-loop-header bb but currently under traversal. */ + +bool +revisit_bb_abnormal_p (basic_block bb, std::vector &bb_visited, + const std::set &header_bb_idx_set, + std::set > &unused_edges, + int src_bb_idx) +{ + /* If the header bb has been already fully traversed, early exit + the function. */ + if (bb_visited[bb->index] == FULLY_TRAVERSED) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Already visited bb index %d. Abort.\n", + bb->index); + return true; + } + + /* If we revisit a non-header bb during next-bb traversal, we detect + an inner-loop cycle and dump warning info. Record this abnormal edge + in `unused_edges` for special treatment in path weight update. */ + if (!header_bb_idx_set.count (bb->index) + && bb_visited[bb->index] == UNDER_TRAVERSAL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Warning: Find cycle at bb index %d. Abort.\n", + bb->index); + unused_edges.insert (std::make_pair (src_bb_idx, bb->index)); + return true; + } + + return false; +} + +/* Check successor bb through edge e. Return true if successor bb is NULL or + out of loop. */ + +bool +check_succ_bb_abnormal_p (basic_block bb, edge e) +{ + if (check_null_bb (e->dest)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Null bb connected to src bb %d.\n", bb->index); + + return true; + } + + /* If bb is within one loop and the edge is pointing to the + outer loop, skip edge processing until a backedge to header + bb. `loop->num = 0` represents function body. 
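+ Such exit edges are handled later, when the loop header is revisited + and all of the loop's exit edges are traversed at once.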
*/ + if (bb->loop_father->num != 0 + && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Find edges to the outer loop at bb " + "index %d to bb index %d. Abort.\n", + bb->index, e->dest->index); + + return true; + } + + return false; +} + +/* Criteria for retrieving the next bb in modified control-flow graph, which + creates a topological order for the bb traversal. */ + +void +get_next_toposort_bb (basic_block bb, std::vector &bb_visited, + std::list &bb_topo_order, + const std::set &header_bb_idx_set, + std::set > &unused_edges, + int src_bb_idx) +{ + /* 1) Before bb returns to the loop header, bb will not go to the outer loop. + 2) After returning to the loop header, traverse all exit_bbs. + NEXT STEP: + 1) If goto jumps out of 2 loops, goto has to traverse smaller jumps first. + 2) If path length is the same => choose higher depth traversal path. */ + if (check_null_bb (bb) || check_null_loop_father (bb)) + return; + + /* Find last bb of function. */ + if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) + return; + + if (revisit_bb_abnormal_p (bb, bb_visited, header_bb_idx_set, unused_edges, + src_bb_idx)) + return; + + /* If we revisit the header bb of a loop, traverse all exit bbs. */ + if (header_bb_idx_set.count (bb->index) + && bb_visited[bb->index] == UNDER_TRAVERSAL) + { + unsigned i; + edge e; + auto_vec exits = get_loop_exit_edges (bb->loop_father); + + if (exits.length () > 1 && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Detect multiple exits at loop %d.\n", + bb->loop_father->num); + + FOR_EACH_VEC_ELT (exits, i, e) + { + get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, + header_bb_idx_set, unused_edges, src_bb_idx); + } + return; + } + + /* Post-order traversal for normal bb. */ + bb_visited[bb->index] = UNDER_TRAVERSAL; + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, bb->succs) + { + if (check_succ_bb_abnormal_p (bb, e)) + continue; + + get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, + header_bb_idx_set, unused_edges, bb->index); + } + + /* bb is marked as fully traversed and all its descendents have been + fully traversed due to post-order traversal. */ + bb_visited[bb->index] = FULLY_TRAVERSED; + bb_topo_order.push_back (bb); +} + +/* A struct that represents the longest path weight at each bb. */ + +struct weight +{ + /* Longest path weight at current bb. */ + gcov_type bb_count; + + /* Prev bb from the current longest path. */ + int prev_bb_idx; +}; + +/* A helper function for checking whether overflow will occur when adding two + gcov_type weights. */ + +bool +check_weight_overflow (gcov_type a, gcov_type b) +{ + if ((a > 0 && b > INT64_MAX - a) || (a < 0 && b < INT64_MIN - a)) + return true; + + return false; +} + +/* A helper function that update the weight of the current longest path to + bb_idx_dst and a new path pointing from bb_idx_src to bb_idx_dst. 
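+ This is the relaxation step of a longest-path computation: the weight at + bb_idx_dst is only replaced when the path through bb_idx_src is heavier.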
*/ + +void +update_path_weight (std::vector &bb_weights, int bb_idx_src, + int bb_idx_dst, gcov_type weight_dst) +{ + if (check_weight_overflow (bb_weights[bb_idx_src].bb_count, weight_dst) + && dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "WARNING: Path weight overflow at src bb %d " + "and dest bb %d.\n", + bb_idx_src, bb_idx_dst); + } + if (bb_weights[bb_idx_dst].bb_count + < bb_weights[bb_idx_src].bb_count + weight_dst) + { + bb_weights[bb_idx_dst].bb_count + = bb_weights[bb_idx_src].bb_count + weight_dst; + bb_weights[bb_idx_dst].prev_bb_idx = bb_idx_src; + } +} + +/* Check whether the required bb/loop info for path update is null. */ + +bool +check_null_info_in_path_update (basic_block bb, edge e) +{ + if (check_null_bb (e->dest)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Null bb detected for edge connected " + "to src bb %d.\n", + bb->index); + return true; + } + + if (check_null_loop_father (bb) || check_null_loop_father (e->dest)) + return true; + + return false; +} + +/* Update path weight to loop exit bbs where the current source bb is connected + to header bb using a backedge. */ + +void +update_backedge_path_weight (std::vector &bb_weights, basic_block bb, + const std::set > &unused_edges) +{ + unsigned i; + edge e_exit; + auto_vec exits = get_loop_exit_edges (bb->loop_father); + FOR_EACH_VEC_ELT (exits, i, e_exit) + { + if (check_null_bb (e_exit->dest)) + { + if (e_exit->src != NULL && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Null bb detected for exiting edge " + "connected to src bb %d.\n", + e_exit->src->index); + continue; + } + + if (unused_edges.count (std::make_pair (bb->index, e_exit->dest->index))) + { + /* Inner-loop-cycle backedge case. */ + continue; + } + update_path_weight (bb_weights, bb->index, e_exit->dest->index, + e_exit->dest->count.to_gcov_type ()); + } +} + +/* Update the longest length of the path through control flow graph. */ + +void +update_max_length_of_path (std::vector &bb_weights, + std::list &bb_topo_order, + const std::set &header_bb_idx_set, + const std::set > &unused_edges) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Start update weight traversal:\n"); + + while (!bb_topo_order.empty ()) + { + basic_block bb = bb_topo_order.back (); + bb_topo_order.pop_back (); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d ", bb->index); + + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->succs) + { + if (check_null_info_in_path_update (bb, e)) + continue; + + if (unused_edges.count (std::make_pair (bb->index, e->dest->index))) + { + /* Inner-loop-cycle backedge case. */ + continue; + } + else if (bb->loop_father->num != 0 + && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) + { + /* Outer-loop edge case. */ + continue; + } + else if (header_bb_idx_set.count (e->dest->index) + && bb->loop_father == e->dest->loop_father) + { + /* Backedge case. */ + update_backedge_path_weight (bb_weights, bb, unused_edges); + } + else + { + /* Normal edge case. */ + update_path_weight (bb_weights, bb->index, e->dest->index, + e->dest->count.to_gcov_type ()); + } + } + } +} + +/* Collect all header bb of loops in the function beforehand. 
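+ The header indices are used to recognize backedges and loop revisits + during the topological traversal and the path weight update.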
*/ + +void +collect_header_bb_for_fn (std::set &header_bb_idx_set) +{ + for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) + header_bb_idx_set.insert (loop->header->index); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nCheck header bbs:\n"); + for (std::set::iterator it = header_bb_idx_set.begin (); + it != header_bb_idx_set.end (); ++it) + fprintf (dump_file, "%d ", *it); + fprintf (dump_file, "\n"); + } +} + +/* Record loop executing order and bb high-executing path. */ + +void +record_high_execution_path (std::vector &sorted_kernel, + std::vector &bb_path, int bb_num_max) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nPATH FOR %s: ", get_name (cfun->decl)); + + std::set loop_set; + for (int i = bb_path.size() - 1; i >= 0; --i) + { + int bb_idx = bb_path[i]; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d ", bb_idx); + gcc_assert (bb_idx < bb_num_max); + + class loop *loop = BASIC_BLOCK_FOR_FN (cfun, bb_idx)->loop_father; + if (!loop_set.count (loop->num)) + { + loop_set.insert (loop->num); + sorted_kernel.push_back (loop); + } + } + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n"); +} + +/* Combine and sort candidate loops using feedback information. */ + +bool +filter_and_sort_kernels_feedback (std::vector &sorted_kernel, + std::set &bb_pathset) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); + + std::set header_bb_idx_set; + std::list bb_topo_order; + + /* Quoted from GCC internal, Chapter 15.1, "the index for any block should + never be greater than `last_basic_block`." Therefore, we use this + variable for retrieving the max bb index of a function. */ + /* Since the pass does not add/remove/merge basic blocks until Phase 6 + and previous passes will update ssa accordingly, we do not need to + `compact_blocks` to update bb indices currently. */ + int bb_num_max = last_basic_block_for_fn (cfun) + 1; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nMaximal number of possible bbs in the " + "function: %d\n", + bb_num_max); + std::vector bb_visited = std::vector(bb_num_max, 0); + + collect_header_bb_for_fn (header_bb_idx_set); + basic_block bb_start = ENTRY_BLOCK_PTR_FOR_FN (cfun); + + /* Step 1: Get topological order of bb during traversal. */ + std::set > unused_edges; + get_next_toposort_bb (bb_start, bb_visited, bb_topo_order, header_bb_idx_set, + unused_edges, -1); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nCheck bbs in topological order:\n"); + for (std::list::iterator it = bb_topo_order.begin (); + it != bb_topo_order.end (); ++it) + fprintf (dump_file, "%d ", (*it)->index); + fprintf (dump_file, "\n"); + } + + /* Step 2: Update weights of nodes and path. */ + weight weight_init = {-1, -1}; + std::vector bb_weights = std::vector(bb_num_max, weight_init); + bb_weights[0].bb_count = 0; /* ENTRY bb has count 0 and prev bb as -1. */ + update_max_length_of_path (bb_weights, bb_topo_order, header_bb_idx_set, + unused_edges); + + /* Step 3: Backtrack a path from EXIT bb to ENTRY bb. 
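+ Starting from bb index 1 (the EXIT block), the prev_bb_idx links are + followed backwards; record_high_execution_path then walks the recorded + path from the back to restore execution order.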
*/ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nCheck counts for each bb:\n"); + + std::vector bb_path; + int tmp_bb_idx = 1; + bb_pathset.insert (tmp_bb_idx); + bb_path.push_back (tmp_bb_idx); + tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; + while (tmp_bb_idx > 0 && tmp_bb_idx < bb_num_max) + { + if (bb_pathset.count (tmp_bb_idx)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf(dump_file, "ERROR: already seen bb index %d\n", + tmp_bb_idx); + return false; + } + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d: %ld, ", tmp_bb_idx, + bb_weights[tmp_bb_idx].bb_count); + bb_pathset.insert (tmp_bb_idx); + bb_path.push_back (tmp_bb_idx); + tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; + } + /* It is possible that the function exit code is wrapped around as an + variable, and thus, EXIT_BB in cfg is not connected to any bb. */ + if (tmp_bb_idx < 0 || tmp_bb_idx >= bb_num_max) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "unhandled scenario at backtracking highly " + "executed path with tmp_bb_idx %d", + tmp_bb_idx); + } + return false; + } + + record_high_execution_path (sorted_kernel, bb_path, bb_num_max); + + return true; +} + + +/* ================ phase 5 record_and_sort_ref_groups ================ */ +/* Memory reference score, different aspects of one memory reference. */ + +struct ref_score +{ + /* certain memory reference. */ + data_ref d_ref; + + /* local count for bb where memory reference is located. */ + gcov_type bb_count; + + /* line-location of memory reference. */ + int line; +}; + +/* Memory reference group, different reference of the same variable. */ + +struct ref_group +{ + /* source variables. */ + tree var; + + /* variable size, Unit: MB. */ + double var_size; + + /* first ref for insert hint. */ + data_ref first_use; + + /* first ref with the highest-order CALC. */ + data_ref first_calc_use; + + /* reuse scores of variables. */ + float reuse_level; + + /* method of calculating the var size. */ + calc_type calc_by; + + /* memory reference index for specific variable. */ + unsigned int mem_ref_index; + + /* variable dimension. */ + unsigned int dim; + + /* True if first_calc_use's footprint replaces that of first_use. */ + unsigned int transfer_ft; + + /* Accessing Reference Records in Different Modes (key_index): + 000: write, random, non-parallel + 001: write, random, parallel + 010: write, regular, non-parallel + 011: write, regular, parallel + 100: read, random, non-parallel + 101: read, random, parallel + 110: read, regular, non-parallel + 111: read, regular, parallel + */ + std::map > ref_use; + + /* scores for different memory references. */ + std::vector ref_scores; + + ref_group () + { + var = NULL_TREE; + var_size = 0; + reuse_level = 0; + calc_by = UNHANDLE_CALC; + mem_ref_index = 0; + dim = 1; + transfer_ft = 0; + } +}; + +/* Get the integer part for log(x) with the given base. */ + +static unsigned int +flog (float x, float base) +{ + unsigned int res = 0; + while (x >= base) + { + ++res; + x /= base; + } + return res; +} + +/* Calculate reuse time for a memory reference in ref_group. 
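+ A reuse inside an already-visited bb contributes at most 0.1 (shared + across the group), a reuse inside an already-visited loop contributes + 0.5, and a reuse in a new loop contributes 1.0; each contribution is + further scaled by the used dimension and the loop depth.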
*/ + +float +calculate_reuse_times (std::vector &mem_refs, std::set &loop_set, + std::set &bb_set, unsigned int var_dim) +{ + const float SAME_BB_REUSE_WEIGHT = 0.1; + const float SAME_LOOP_REUSE_WEIGHT = 0.5; + const float NORMAL_REUSE_WEIGHT = 1.; + + float reuse_time_sum = 0.; + for (std::vector::iterator it = mem_refs.begin (); + it != mem_refs.end (); ++it) + { + const data_ref &mem_ref = *it; + float reuse_time = 0.; + if (bb_set.count (mem_ref.bb_idx)) + { + /* If the two mem_ref belong to the same bb, the new reuse + weight will not exceed 0.1 divided by the mem_ref mode group + size. + NEXT STEP: The following equation may hold and cause commutative + property of read and write op not holding: + write + (reused) read != read + (reused) write. + However, it seems that write mem_ref is always before read mem_ref, + so the above comparison does not show up in calculation due to + intrinsic in-order property of tree map, but this condition is + quite fragile anyway. */ + reuse_time = SAME_BB_REUSE_WEIGHT / mem_refs.size (); + } + else + { + bb_set.insert (mem_ref.bb_idx); + if (loop_set.count (mem_ref.loop_idx)) + { + /* If the mem_ref belongs to a loop where any other mem_ref is in, + the new reuse weight will be 0.5. */ + reuse_time = SAME_LOOP_REUSE_WEIGHT; + } + else + { + /* If the mem_ref is reused but not in the same group with any + other mem_ref, the new reuse weight will be 1. */ + loop_set.insert (mem_ref.loop_idx); + reuse_time = NORMAL_REUSE_WEIGHT; + } + } + unsigned int used_dim = std::min (mem_ref.loop_depth, var_dim); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "used_dim : %u, loop_depth : %u\n", used_dim, + mem_ref.loop_depth); + unsigned int power = flog (std::max (0u, mem_ref.loop_depth - used_dim) + + 2, 2.); + reuse_time_sum += reuse_time * (used_dim * used_dim / 2.) * (power); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "(%f * (%u * %u / 2) * (%u) = %f\n", + reuse_time, used_dim, used_dim, power, + reuse_time * (used_dim * used_dim / 2.) * (power)); + } + return reuse_time_sum; +} + +/* Calculate reuse level. */ + +float +calculate_reuse_level (std::map > &var_use, + unsigned int var_dim, double var_size) +{ + const float VAR_SIZE_CACHE_CAPACITY = 1 / 4.; + const int WITHIN_CACHE_SIZE_COST = 4; + const float BYTE_CONVERT_RATIO = 1024.; + + float level = 0.; + std::set loop_set; + std::set bb_set; + bool has_write_op = false; + for (std::map >::iterator it = var_use.begin (); + it != var_use.end (); ++it) + { + unsigned int parallel = 1; + unsigned int regular = 1; + + if ((*it).second[0].parallel_p) + parallel = PARALLEL_NUM; + if (!(*it).second[0].regular_p) + regular = INDIRECT_ACCESS_VALUE; + if (!(*it).second[0].read_p) + has_write_op = true; + + /* In serial reuse, we will later check whether they are in the + same cacheline. If yes, delete the reuse. For details, see the + reuse analysis of prefetching and eliminate redundancy. */ + float reuse_times = calculate_reuse_times ((*it).second, loop_set, + bb_set, var_dim); + float add = parallel * reuse_times * regular; + level += add; + if (add && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "%d : %d * %f * %d = %f\n", + (*it).first, parallel, reuse_times, regular, add); + } + + bool within_llc_size = var_size > param_l2_cache_size / BYTE_CONVERT_RATIO + && var_size < VAR_SIZE_CACHE_CAPACITY + * param_llc_capacity_per_core; + + float final_level = has_write_op ? (level * WRITE_COST) : level; + final_level = within_llc_size ? 
(final_level * WITHIN_CACHE_SIZE_COST) + : final_level; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "final level : %d * %f * %d = %f\n", + has_write_op ? WRITE_COST : 1, level, + within_llc_size ? WITHIN_CACHE_SIZE_COST : 1, final_level); + return final_level; +} + +/* Comparison of reference reuse level. */ + +bool +ref_group_reuse_cmp (const ref_group &a, const ref_group &b) +{ + if (a.reuse_level != b.reuse_level) + return a.reuse_level > b.reuse_level; + else + return get_name (a.var) < get_name (b.var); +} + +/* Dump key information of reference group and memory access for llc hint. */ + +void +dump_key_info_for_llc_hint (std::vector &ref_groups) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nLLC hint info:\n"); + fprintf (dump_file, "rank\tvar\t(lineno, direct, vectorized, write)\n"); + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { + fprintf (dump_file, "%d\t", i); + print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); + data_ref &mem_ref = ref_groups[i].first_use; + fprintf (dump_file, "\t(%d, %u, %u, %u)", + expand_location (mem_ref.stmt->location).line, + mem_ref.regular_p, mem_ref.vectorize_p, 1 - mem_ref.read_p); + fprintf (dump_file, "\n"); + } + fprintf (dump_file, "\n"); + } +} + +/* Sort reference groups. */ + +void +sort_ref_groups (std::vector &ref_groups, + std::map &ref_groups_map) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); + + for (std::map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { + (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use, + (*it).second.dim, + (*it).second.var_size); + ref_groups.push_back ((*it).second); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); + fprintf (dump_file, " : %f\n\n", (*it).second.reuse_level); + } + } + + std::sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nsorted ref_groups:\n"); + fprintf (dump_file, "rank\tvar\t(data_size, dim, num_of_mem_ref, " + "need_tmp_name): reuse_level_score\n"); + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { + fprintf (dump_file, "%d\t", i); + print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); + int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0; + fprintf (dump_file, "\t(%lf, %u, %lu, %d)", ref_groups[i].var_size, + ref_groups[i].dim, ref_groups[i].ref_scores.size (), + need_tmp_name); + fprintf (dump_file, " : %f\n", ref_groups[i].reuse_level); + } + fprintf (dump_file, "\n"); + + fprintf (dump_file, "first_use:\n"); + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { + fprintf (dump_file, "%d ", i); + print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); + fprintf (dump_file, " : "); + if (!ref_groups[i].first_use.vectorize_p) + print_generic_expr (dump_file, ref_groups[i].first_use.ref, + TDF_SLIM); + else + print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, + TDF_SLIM); + fprintf (dump_file, "\n"); + } + fprintf (dump_file, "\n"); + } + dump_key_info_for_llc_hint (ref_groups); +} + +/* Attributes of variable data. */ + +enum data_attribute +{ + DA_PARALLEL = 0, + DA_REGULAR, + DA_READ +}; + +/* Record memory reference by use mode. + If the reference group is not found, create a group. 
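+ For example, a parallel regular read is recorded under key 0b111 (7) + and a non-parallel random write under key 0b000 (0), following the + key_index table in ref_group.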
*/ + +void +record_mem_ref (std::map &ref_groups, data_ref &mem_ref) +{ + unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) + + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); + + if (!ref_groups.count (mem_ref.var)) + { + ref_group ref_group; + ref_group.var = mem_ref.var; + ref_group.first_use = mem_ref; + ref_group.first_calc_use = mem_ref; + ref_groups[mem_ref.var] = ref_group; + } + + /* Ref_groups' calc_by reflects the highest order of calc_by that can be + achieved by all mem_ref of ref_groups. The first mem_ref that achieves + this order is defined to be `first_calc_use`. Later after sorting + mem_refs, calc_by will be replaced by the calc_by of `first_use`, and + even by the calc_by of `first_calc_use`. */ + if (mem_ref.calc_by > ref_groups[mem_ref.var].calc_by) + { + ref_groups[mem_ref.var].calc_by = mem_ref.calc_by; + ref_groups[mem_ref.var].first_calc_use = mem_ref; + } + ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, + mem_ref.data_size); + ref_groups[mem_ref.var].dim = std::max (ref_groups[mem_ref.var].dim, + (unsigned int) mem_ref.loop_bounds.size ()); + ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); + + ref_score ref_level = { mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), + expand_location (mem_ref.stmt->location).line }; + ref_groups[mem_ref.var].ref_scores.push_back (ref_level); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "recorded in: "); + print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); + fprintf (dump_file, ":%d:%ld\n", index, + ref_groups[mem_ref.var].ref_use[index].size () - 1); + + fprintf (dump_file, "base: "); + print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); + + fprintf (dump_file, ", index: "); + print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); + + fprintf (dump_file, ", step: "); + if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) + fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, + int_cst_value (mem_ref.step)); + else + print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); + + fprintf (dump_file, ", offset: "); + if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) + fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, + int_cst_value (mem_ref.offset)); + else + print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); + fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); + + fprintf (dump_file, ", size: %lf", mem_ref.data_size); + fprintf (dump_file, "\n\n"); + } +} + +/* Rank data reference index level. */ + +bool +best_insert_cmp (const ref_score &a, const ref_score &b) +{ + /* NEXT STEP: We can also calculate gap using static/feedback info inferred + from historical maximum bb count: + gap = hist_max_bb_ct / (alpha * max (a.bb_ct, b.bb_ct)) + 1. + Also, bb count needs to be smoothed and scaled as divisor can be 0. + history maximum bb count can be obtained in Phase 4. 
*/ + const float gap = 1; + if (a.d_ref.loop_depth != b.d_ref.loop_depth) + return a.d_ref.loop_depth > b.d_ref.loop_depth; + else if (a.d_ref.regular_p != b.d_ref.regular_p) + return a.d_ref.regular_p > b.d_ref.regular_p; + else if (abs (double (std::max (a.bb_count, b.bb_count) + 1) + / double (std::min (a.bb_count, b.bb_count) + 1) - 1) > gap) + return a.bb_count > b.bb_count; + else if (a.line != b.line) + return a.line < b.line; + else if (a.d_ref.read_p != b.d_ref.read_p) + return a.d_ref.read_p < b.d_ref.read_p; + else + return a.d_ref.vectorize_p > b.d_ref.vectorize_p; +} + +/* Sort data reference index level within one reference group in non-decreasing + order of the customized sorting scheme. */ + +void +sort_mem_ref_in_ref_group (std::map &ref_groups_map) +{ + if (dump_file) + fprintf (dump_file, "\nsorted data_references:\n"); + for (std::map::iterator it = ref_groups_map.begin (); + it != ref_groups_map.end (); ++it) + { + ref_group &curr_ref_group = (*it).second; + std::vector &ref_scores = curr_ref_group.ref_scores; + std::stable_sort (ref_scores.begin (), ref_scores.end (), + best_insert_cmp); + /* Update ref_group's first_use and calc_by with the first mem_ref after + sorting. */ + curr_ref_group.first_use = curr_ref_group.ref_scores[0].d_ref; + curr_ref_group.calc_by = curr_ref_group.first_use.calc_by; + + /* When transferring footprint is enabled, it is allowed to transfer + the statically-calculated footprint of a mem_ref from the same + ref_group to `first_use` mem_ref. */ + if (param_transfer_footprint + && curr_ref_group.first_use.calc_by == UNHANDLE_CALC) + { + if (curr_ref_group.first_calc_use.calc_by > RUNTIME_CALC) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).first, TDF_SLIM); + fprintf (dump_file, "\nfirst_use: "); + print_gimple_stmt (dump_file, curr_ref_group.first_use.stmt, + 0, TDF_LINENO); + fprintf (dump_file, "first_calc_use: "); + print_gimple_stmt (dump_file, + curr_ref_group.first_calc_use.stmt, + 0, TDF_LINENO); + } + + curr_ref_group.calc_by = curr_ref_group.first_calc_use.calc_by; + curr_ref_group.transfer_ft = 1; + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).first, TDF_SLIM); + fprintf (dump_file, ": cannot transfer footprint to " + "first use mem_ref.\n"); + } + } + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, (*it).first, TDF_SLIM); + fprintf (dump_file, " : %lu\n", ref_scores.size ()); + for (unsigned int i = 0; i < ref_scores.size (); ++i) + { + fprintf (dump_file, "mem_ref_index %u: ", i); + print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, + TDF_LINENO); + fprintf (dump_file, "bb-%d ", + ref_scores[i].d_ref.stmt->bb->index); + fprintf (dump_file, "count %ld\n", ref_scores[i].bb_count); + } + fprintf (dump_file, "\n\n"); + } + } +} + +/* Tracing and sorting reference groups. 
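+ Traced mem_refs (restricted to the high-execution path when + param_filter_mode is set) are grouped by their source variable, the refs + inside each group are ranked to choose first_use, and the groups + themselves are ranked by reuse level.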
*/ + +bool +record_and_sort_ref_groups (std::vector &ref_groups, + std::vector &kernels, + std::map > &loop_refs, + std::set bb_pathset) +{ + if (dump_file) + fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); + + std::map ref_groups_map; + + for (unsigned i = 0; i < kernels.size (); ++i) + { + class loop *loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); + for (unsigned j = 0; j < loop_refs[loop].size (); ++j) + { + data_ref &mem_ref = loop_refs[loop][j]; + if (mem_ref.trace_status_p) + { + if (!param_filter_mode || (param_filter_mode + && bb_pathset.count (mem_ref.stmt->bb->index))) + record_mem_ref (ref_groups_map, mem_ref); + } + } + } + + /* Sort mem_ref within ref_group by local count and update first_use's + data_ref, stable sort. */ + sort_mem_ref_in_ref_group (ref_groups_map); + sort_ref_groups (ref_groups, ref_groups_map); + + return ref_groups.size () > 0; +} + +/* ================ phase 6 issue_llc_hint ================ */ + +/* Issue vectorized mask prefetch gimple. */ + +void +issue_mask_prefetch (gimple *stmt) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert svprfd.\n"); + + /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); + .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); + */ + tree dataref_ptr = gimple_call_arg (stmt, 0); + tree scale = gimple_call_arg (stmt, 1); + tree final_mask = gimple_call_arg (stmt, 2); + tree target = NULL_TREE; + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) + target = gimple_call_arg (stmt, 3); + else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) + target = gimple_call_lhs (stmt); + tree prfop = NULL_TREE; + if (param_llc_level == 3) + /* for simulation, 4: PLDL3KEEP. */ + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); + else if (param_llc_level == 4) + /* 6: PLDL4KEEP. */ + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "LLC cache levels are illegal.\n"); + return; + } + + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + /* target: vector_type - XXX_type. */ + if (target == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "unhandled scene: target vect is null"); + return; + } + unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi + (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); + tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + + gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, 5, addr, scale, + final_mask, target, prfop); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); +} + +/* Issue vectorized mask gather prefetch gimple. */ + +void +issue_mask_gather_prefetch (gimple *stmt) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); + + /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... 
}, + loop_mask_4); */ + tree dataref_ptr = gimple_call_arg (stmt, 0); + tree vec_offset = gimple_call_arg (stmt, 1); + tree scale = gimple_call_arg (stmt, 2); + tree zero = gimple_call_arg (stmt, 3); + tree final_mask = gimple_call_arg (stmt, 4); + tree prfop = NULL_TREE; + if (param_llc_level == 3) // for simulation + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP + else if (param_llc_level == 4) + prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "LLC cache levels are illegal.\n"); + return; + } + + tree target = gimple_call_lhs (stmt); + /* add offset. */ + gimple_stmt_iterator si = gsi_for_stmt (stmt); + if (target == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "unhandled scene: target vect is null"); + return; + } + unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi + (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); + tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + + gcall *call = gimple_build_call_internal (IFN_MASK_GATHER_PREFETCH, 7, addr, + vec_offset, scale, zero, + final_mask, target, prfop); + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); +} + +/* Issue builtin prefetch gimple. */ + +void +issue_builtin_prefetch (data_ref &mem_ref) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "insert prfm.\n"); + /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ + gimple *stmt = mem_ref.stmt; + tree ref = mem_ref.ref; + + tree scale = mem_ref.step; + gimple_stmt_iterator si = gsi_for_stmt (stmt); + if (scale == NULL_TREE) + { + /* _190 = (void *) ivtmp.444_221; + Cannot detect size unit at (void *). */ + scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); + if (scale == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " + "variable. Stop builtin_prefetch.\n\n"); + return; + } + } + + tree addr = build_fold_addr_expr_with_type (ref, ptr_type_node); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), + true, NULL, true, GSI_SAME_STMT); + unsigned HOST_WIDE_INT distance = param_prefetch_offset + * tree_to_uhwi (scale); + + addr = fold_build_pointer_plus_hwi (addr, distance); + addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, + NULL, true, GSI_SAME_STMT); + /* __builtin_prefetch (_68, 0, 1); + 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality + (high means strong locality) */ + gcall *call = NULL; + if (param_llc_level == 3) + { + /* for simulation. + BUILT_IN_PREFETCH (addr, rw, locality). */ + call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), + 3, addr, integer_zero_node, integer_one_node); + } + else if (param_llc_level == 4) + { + tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); + call = gimple_build_call ( + builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), + 3, addr, integer_zero_node, prfop); + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "LLC cache levels are illegal.\n"); + return; + } + + gsi_insert_after (&si, call, GSI_SAME_STMT); + update_ssa (TODO_update_ssa_only_virtuals); +} + +/* Retrieve memory reference at the specific index. 
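+ If param_mem_ref_index is empty, the group's first_use is returned; an + out-of-range user index falls back to the top-ranked mem_ref.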
*/ + +data_ref +get_data_ref_at_idx (ref_group &var_ref_group) +{ + unsigned int mem_ref_size = static_cast( + var_ref_group.ref_scores.size ()); + if (strlen (param_mem_ref_index) == 0) + return var_ref_group.first_use; + else + { + /* Insert prefetch hint at highly-likely-used location with the given + index. */ + if (var_ref_group.mem_ref_index >= mem_ref_size) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: The target data_ref index is out " + "of range. Use top index instead!\n"); + return var_ref_group.ref_scores[0].d_ref; + } + return var_ref_group.ref_scores[var_ref_group.mem_ref_index].d_ref; + } +} + +/* Static form insertion and issue instruction. We may check the + determination of the ARM SVE architecture before SVE hint insertion. */ + +void +static_issue (std::vector &ref_groups, int num_issue_var) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "static issue\n"); + + for (int i = 0; i < num_issue_var; ++i) + { + data_ref mem_ref = get_data_ref_at_idx (ref_groups[i]); + if (mem_ref.vectorize_p) + { + enum internal_fn ifn_code = gimple_call_internal_fn (mem_ref.stmt); + if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) + issue_mask_prefetch (mem_ref.stmt); + else if (ifn_code == IFN_MASK_GATHER_LOAD) + issue_mask_gather_prefetch (mem_ref.stmt); + else + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "other vectorized internal function\n"); + } + else + issue_builtin_prefetch (mem_ref); + } +} + +/* Check whether all loop bounds (niters) used for calculating the footprints + of previously-executed ref_groups are defined in a dominated bb to the + currentbranch bb, where the conditional expression requires the loop bound + info. */ + +bool +check_def_use_chain (std::vector &ref_groups, + basic_block &branch_header_bb, + std::vector &ref_group_idx) +{ + for (std::vector::iterator it = ref_group_idx.begin (); + it != ref_group_idx.end (); ++it) + { + /* Transferring mem_ref only takes place during footprint calculation. */ + ref_group &ref_group_curr = ref_groups[*it]; + data_ref mem_ref = ref_group_curr.transfer_ft + ? ref_group_curr.first_calc_use + : ref_group_curr.first_use; + for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) + { + tree niters = mem_ref.loop_bounds[j].niters; + gimple *def_stmt = SSA_NAME_DEF_STMT (niters); + basic_block def_bb = gimple_bb (def_stmt); + /* Check dominator relationship of def bb and branch bb. */ + /* Case 1: Check whether the def bb is the single predecessor block + of header bb. */ + if (single_pred_p (branch_header_bb)) + { + basic_block branch_bb_prev = single_pred (branch_header_bb); + if (branch_bb_prev->index == def_bb->index) + continue; + } + /* Case 2: Check whether the branch bb is dominated by the def + bb. */ + if (!dominated_by_p (CDI_DOMINATORS, branch_header_bb, def_bb)) + return false; + } + } + return true; +} + +/* Generate the stmts for calculating the size. Later we will consider nested + multi-branches scenarios and check more information of niters when it is + a COND_EXPR. */ + +tree +calc_stmts_gen (std::vector &ref_groups, + gimple_seq &cond_expr_stmt_list, + basic_block branch_header_bb, + std::vector &ref_group_idx_curr, + std::vector &ref_group_idx_prev, tree &cumul_size) +{ + /* Check whether the bbs of def stmt for footprint loop bounds dominates + the bb of new runtime branching conditional. 
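+ If any loop bound is defined in a bb that does not dominate the branch, + the runtime size expression cannot be placed there and the runtime + issue is abandoned.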
*/ + if (!check_def_use_chain (ref_groups, branch_header_bb, ref_group_idx_prev)) + return NULL_TREE; + + /* Accumulated allocation size. */ + for (std::vector::iterator it = ref_group_idx_curr.begin (); + it != ref_group_idx_curr.end (); ++it) + { + /* Transferring mem_ref only takes place during footprint calculation. */ + ref_group &ref_group_curr = ref_groups[*it]; + data_ref mem_ref = ref_group_curr.transfer_ft + ? ref_group_curr.first_calc_use + : ref_group_curr.first_use; + tree var = mem_ref.var; + tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); + /* _190 = (void *) ivtmp.444_221; + Cannot detect size unit at (void *). */ + if (unit == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "WARNING: Cannot detect size unit " + "(use 1 byte) for variable %s: ", + get_name (var)); + print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); + fprintf (dump_file, "\n"); + } + unit = size_one_node; + } + tree size = NULL_TREE; + for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) + { + tree niters = mem_ref.loop_bounds[j].niters; + + /* COND_EXPR. */ + if (TREE_CODE (niters) == COND_EXPR) + niters = TREE_OPERAND (niters, 1); + if (size == NULL_TREE) + { + size = niters; + } else { + size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, + size); + } + } + unit = build1 (NOP_EXPR, TREE_TYPE (size), unit); + size = fold_build2 (MULT_EXPR, TREE_TYPE (size), size, unit); + size = build1 (FLOAT_EXPR, double_type_node, size); + cumul_size = fold_build2 (PLUS_EXPR, double_type_node, cumul_size, + size); + ref_group_idx_prev.push_back (*it); + } + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "cumul_size = "); + print_generic_expr (dump_file, cumul_size, TDF_SLIM); + fprintf (dump_file, "\n"); + } + /* Create a stmt list for size calculation. */ + tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); + div = build1 (NOP_EXPR, double_type_node, div); + tree total_size = fold_build2 (RDIV_EXPR, double_type_node, cumul_size, div); + + tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), + param_llc_capacity_per_core / 2); + threshold = build_real_from_int_cst (double_type_node, threshold); + tree cond_expr = fold_build2 (LE_EXPR, boolean_type_node, total_size, + threshold); + + /* Convert cond_expr to stmt list. */ + cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), + &cond_expr_stmt_list, is_gimple_condexpr, + NULL_TREE); + return cond_expr; +} + +/* Retrieve the least number of loops that cover all target mem_refs. + Try to merge loops that the mem_refs reside to a common superloop and + maintain a worklist which relates NEED-TO-COPY loops with the target mem + refs inside using the following criteria: + 1) If loop A is a superloop of loop B in the worklist, replace loop B with + loop A in the worklist, and attach all target mem_refs of loop B, + together with loop A's, to loop A. + 2) If loop B in the worklist is a superloop of loop A, attach loop A's + target mem_ref to loop B. + 3) If loop A is not a superloop/subloop of loop B in the worklist, replace + loop B with their lowest common superloop C in the worklist, and attach + all target mem_refs of loop A and loop B to loop C. + 4) If loop A and loop B's lowest common superloop is function body + (loop 0), stop merging and maintain loop independence. 
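+ For example (loop numbers are illustrative): if ref_group 0 lives in + loop 3 and ref_group 1 in loop 5, and their lowest common superloop is + loop 2, the worklist maps loop 2 to {0, 1}; if their only common + superloop is loop 0, both loops remain in the worklist independently.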
*/ + +void +get_loop_worklist (std::vector &ref_groups, int num_issue_var, + std::map > &loop_worklist) +{ + for (int i = 0; i < num_issue_var; ++i) + { + data_ref &mem_ref = ref_groups[i].first_use; + class loop *loop_new = mem_ref.loop_bounds.front ().loop; + class loop *common_superloop = loop_new; + bool add_loop_worklist = false; + + /* Use greedy algorithm to merge loops to a common superloop that can + contain the current mem_refs. */ + std::map >::iterator it_tmp; + std::vector ref_group_idx_tmp; + std::map >::iterator it; + for (it = loop_worklist.begin (); it != loop_worklist.end ();) + { + class loop *loop_old = it->first; + common_superloop = find_common_loop (loop_new, loop_old); + if (common_superloop == NULL || common_superloop->num == 0) + { + /* Stop merging two loops if there is no common superloop for + them except function body (loop 0). */ + if (common_superloop != NULL + && dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "ref_group %d's loop %d has no common " + "superloop with existing loop %d\n", + i, loop_new->num, loop_old->num); + } + ++it; + continue; + } + + if (common_superloop->num == loop_old->num) + { + /* If loop_old is the superloop of loop_new, add current + ref_group index to loop's worklist. */ + loop_worklist[common_superloop].push_back (i); + ++it; + } + else + { + /* If loop_old is not a superloop of loop_new, replace + loop_old with the common superloop. */ + it_tmp = it; + ++it_tmp; + ref_group_idx_tmp = it->second; + loop_worklist.erase (it); + it = it_tmp; + add_loop_worklist = true; + } + } + + if (loop_worklist.empty () || add_loop_worklist) + { + /* Update the new common superloop in loop_worklist. */ + std::vector &ref_groups_tmp = loop_worklist[common_superloop]; + ref_groups_tmp.push_back (i); + for (std::vector::iterator it = ref_group_idx_tmp.begin (); + it != ref_group_idx_tmp.end (); ++it) + ref_groups_tmp.push_back (*it); + std::sort (ref_groups_tmp.begin (), ref_groups_tmp.end ()); + } + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "runtime loop list:\n"); + std::map >::iterator it; + for (it = loop_worklist.begin (); it != loop_worklist.end (); ++it) + { + fprintf (dump_file, "loop %d:", it->first->num); + for (std::vector::iterator idx_it = it->second.begin (); + idx_it != it->second.end (); ++idx_it) + { + fprintf (dump_file, " %d", *idx_it); + } + fprintf (dump_file, "\n"); + } + } +} + +/* Runtime form insertion and issue instruction. */ + +void +runtime_issue (std::vector &ref_groups, int num_issue_var, + std::vector &sorted_kernels) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "runtime issue\n"); + + /* It is possible that the loop father of some mem_ref's bb may contain the + loop fathers of the others. Therefore, we intend to only copy loops + without inclusion relationship. */ + std::map > loop_worklist; + get_loop_worklist (ref_groups, num_issue_var, loop_worklist); + bool get_first_ref_group = false; + std::vector ref_group_idx_prev; + + /* NEXT STEP: Multiple loop copies (possibly nested within one loop can cost + front-end bound due to branching within loop), we need to set up a + threshold such that we may compensate this time cost by space cost + in binary (copying outer loop). */ + tree cumul_size = build_real_from_int_cst (double_type_node, + integer_zero_node); + for (std::vector::iterator it = sorted_kernels.begin (); + it != sorted_kernels.end (); ++it) + { + /* Start runtime branching until finding the first ref_group's loop. 
+ Skip any ref_groups if their `first_use` mem_refs are executed + before the mem_ref of the first ref_group. */ + class loop *loop = *it; + if (!loop_worklist.count (loop) + || (!get_first_ref_group && loop_worklist[loop][0] != 0)) + continue; + + std::vector ref_group_idx_curr = loop_worklist[loop]; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "copy loop num: %d\n", loop->num); + } + /* If the exit edge points to bb with multiple inputs, split the exit + edge and create a new bb, make the exit edge point to bb with only + single input. */ + edge e = single_exit (loop); + if (e == NULL) + return; + if (!single_pred_p (e->dest)) + { + split_loop_exit_edge (e, true); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "split exit edge\n"); + } + + /* After updating SSA, we are not sure whether the gimple_seq stmt list + is initialized and unchanged during iterations. Therefore, we need to + recreate this stmt list for every loop copy. */ + gimple_seq cond_expr_stmt_list = NULL; + tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list, + loop->header, ref_group_idx_curr, + ref_group_idx_prev, cumul_size); + if (cond_expr == NULL_TREE) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "incalculable variables for conditional\n"); + return; + } + + /* Use the previous cond and generate a new branch and copy loop. */ + basic_block condition_bb = NULL; + profile_probability prob = profile_probability::likely (); + initialize_original_copy_tables (); + class loop *nloop = loop_version (loop, cond_expr, &condition_bb, + prob, prob.invert (), prob, + prob.invert (), true); + free_original_copy_tables (); + + /* Insert the generated stmt list before cond_expr. */ + gimple_stmt_iterator cond_exp_gsi; + if (cond_expr_stmt_list) + { + /* Function `gsi_insert_seq_before` will insert `cond_expr` (1st + stmt) of `condition_bb` to the end of `cond_expr_stmt_list`. */ + cond_exp_gsi = gsi_last_bb (condition_bb); + gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, + GSI_SAME_STMT); + } + } + + update_ssa (TODO_update_ssa); + + /* Perform hint issue for branches that meet conditions. */ + static_issue (ref_groups, num_issue_var); +} + +/* Issue llc hints through prefetch instructions. */ + +void +issue_llc_hint (std::vector &ref_groups, + std::vector &sorted_kernels) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "issue_llc_hint:\n"); + + /* 1) If the issue-topn and force-issue options are available, top N var is + forcibly allocated then no runtime branch is generated. + 2) If the issue-topn option is available and the size of top N var is + statically known, top N is statically allocated and no runtime branch + is generated. + 3) If the issue-topn option is available and the size of the top N var is + unknown, but them is dynamically known, the top N is dynamically + allocated and generate runtime branches. (also depends on the screening + of the innermost variable boundary type) + 4) If the dynamic runtime cannot know the size, such as indirect access, + optimization is skipped. 
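   Condensed, the decision below is roughly (a simplified sketch; the exact
   calc_type ordering used by std::min is defined elsewhere in this file):

     n = min (param_issue_topn, ref_groups.size ());
     if (param_force_issue)
       static_issue for the top n groups, or for all parsed groups when
       target variables were given by the user;
     else if (every top-n group is statically calculable)
       static_issue for the top n groups, provided their summed footprint
       fits in the per-core LLC budget;
     else if (the weakest top-n group is runtime-calculable)
       runtime_issue for the top n groups, i.e. loop versioning plus a
       runtime size branch;
     else
       skip the optimization.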
+ */ + int num_issue_var = std::min (param_issue_topn, (int) ref_groups.size ()); + if (num_issue_var == 0) + return; + + if (num_issue_var < param_issue_topn + && dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "WARNING: Only %u (less than param_issue_topn = %d) " + "ref_group(s) is found for llc hint.\n", + num_issue_var, param_issue_topn); + } + if (param_force_issue) + { + if (strlen (param_target_variables) > 0) + static_issue (ref_groups, static_cast(ref_groups.size ())); + else + static_issue (ref_groups, num_issue_var); + return; + } + calc_type topn_calc_type = STATIC_CALC; + for (int i = 0; i < num_issue_var; ++i) + topn_calc_type = std::min (topn_calc_type, ref_groups[i].calc_by); + + if (topn_calc_type == STATIC_CALC) + { + /* Before static issue, we still need to collect data size of all target + variables and compare the summation with LLC cache size. */ + double prefetch_data_size = 0.; + for (int i = 0; i < num_issue_var; ++i) + prefetch_data_size += ref_groups[i].var_size; + + if (prefetch_data_size <= (double) param_llc_capacity_per_core + * PREFETCH_CACHE_SIZE_RATIO) + static_issue (ref_groups, num_issue_var); + else + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache " + "size: %lf > %lf.\n", + prefetch_data_size, + (double) param_llc_capacity_per_core + * PREFETCH_CACHE_SIZE_RATIO); + } + else if (topn_calc_type == RUNTIME_CALC) + runtime_issue (ref_groups, num_issue_var, sorted_kernels); + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "unhandled issue scene\n"); + } +} + +/* ==================== phase entry ==================== */ +/* Check whether a string can be converted to an unsigned integer. */ + +bool is_unsigned_int (const std::string &s) +{ + if (s.empty () || s.size () > PREFETCH_TOOL_NUM_MAX_LEN) + return false; + + for (unsigned int i = 0; i < s.size (); ++i) + { + if (s[i] < '0' || s[i] > '9') + return false; + } + return true; +} + +/* Parse a substring separated by comma. If the substring is valid and + non-empty, store it as a parsed element. */ + +bool +parse_string_helper (const std::string &substr, std::vector& str_elts, + bool check_unsigned, size_t start, size_t end) +{ + if (substr == "" && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: The input string from %lu to %lu is " + "empty.\n", start, end); + else if (check_unsigned && !is_unsigned_int (substr)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "ERROR: not an unsigned integer: %s\n", + substr.c_str ()); + str_elts.clear (); + return false; + } + else + str_elts.push_back (substr); + return true; +} + +/* Parse a user input string, separated by comma. */ + +void +parse_string (const std::string &s, std::vector& str_elts, + bool check_unsigned = false) +{ + std::string delim = ","; + size_t start = 0; + size_t end = s.find (delim); + std::string substr = s.substr (start, end - start); + while (end != std::string::npos) + { + if (!parse_string_helper (substr, str_elts, check_unsigned, start, end)) + return; + start = end + delim.size (); + end = s.find (delim, start); + substr = s.substr (start, end - start); + } + parse_string_helper (substr, str_elts, check_unsigned, start, end); +} + +/* Parse user input of target variables and memory indices and create a map + that assigns a target variable to a memory index. 
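   For instance (illustrative values only), parse_string ("a,b,c", elts)
   yields {"a", "b", "c"}, while parse_string ("3,x", elts, true) rejects "x"
   and clears elts, since every element must then be an unsigned integer.
   With target variables "a,b,c" and memory reference indices "0,2,1" the
   resulting map is { a:0, b:2, c:1 }; if fewer indices than variables are
   supplied, index 0 is used for every variable.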
*/ + +void +parse_param_inputs (std::map &var2mem_idx) +{ + /* The user input length should have an input length limit. */ + if ((strlen (param_target_variables) >= PREFETCH_TOOL_INPUT_MAX_LEN + || strlen (param_mem_ref_index) >= PREFETCH_TOOL_INPUT_MAX_LEN) + && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "INVALID INPUT: The user inputs for target variables " + "and/or memory reference indices are too long for parsing.\n"); + + std::vector var_names; + std::string target_variables = param_target_variables; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Start parsing target variables:\n"); + if (param_use_ref_group_index) + parse_string (target_variables, var_names, true); + else + parse_string (target_variables, var_names, false); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Finish parsing target variables.\n\n"); + + std::vector var_mem_indices; + std::string mem_indices = param_mem_ref_index; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Start parsing memory reference indices:\n"); + parse_string (mem_indices, var_mem_indices, true); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Finish parsing memory reference indices.\n\n"); + + /* Construct a map of var_name: var_mem_index. */ + if (var_names.size () > 0) + { + if (var_mem_indices.size () < var_names.size ()) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: The number of provided memory " + "reference indices is less than that of target " + "variables.\nUse the top index for all variables " + "instead.\n"); + for (std::string& var_name : var_names) + var2mem_idx[var_name] = 0; + } + else + { + if (var_mem_indices.size () > var_names.size () + && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: The number of target variables is " + "less than that of memory reference indices.\n"); + for (unsigned int i = 0; i < var_names.size (); ++i) + { + var2mem_idx[var_names[i]] = static_cast( + atoi (var_mem_indices[i].c_str ())); + } + } + } +} + +/* Filter reference groups by only selecting target variables from the user + input. There are two options for prefetching target variables: + 1. Specify variable name parsed by the pass, which you can double-check at + "sorted ref_groups" section in the dump file. + 2. Specify variable rank exhibited at "sorted ref_groups" section in the + dump file. +*/ + +void +prefetch_variables (const std::vector& ref_groups, + std::vector& reduced_ref_groups) +{ + std::map ref_group2mem_idx; + + std::map var2mem_idx; /* externally defined. */ + parse_param_inputs (var2mem_idx); + + if (param_use_ref_group_index) + { + /* Use ref_group index at "sorted ref_groups" section to specify + variable. */ + /* Collect the variables in "reduced_ref_group" only if their indices + show up at "sorted ref_groups" section. */ + for (const std::pair &var_mem_idx : var2mem_idx) + { + unsigned int var_idx = static_cast(atoi ( + var_mem_idx.first.c_str ())); + if (var_idx < ref_groups.size ()) + ref_group2mem_idx[var_idx] = var_mem_idx.second; + else if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: The index \"%u\" does not show " + "up in the ref_groups.\n", var_idx); + } + } + else + { + /* Use variable name shown up at "sorted ref_groups" section to specify + variable: + var2ref_group_idx + var2mem_idx -> ref_group2mem_idx. */ + /* Create a map that assigns the variable name to its corresponding + ref_group index. 
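   For example (hypothetical names), if the sorted ref_groups are 0:"phi" and
   1:"u" and the user passed target variable "u" with memory reference index
   "2", then var2ref_group_idx = { phi:0, u:1 } and var2mem_idx = { u:2 },
   which compose to ref_group2mem_idx = { 1:2 }, i.e. prefetch ref_group 1 at
   memory reference index 2.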
*/ + std::map var2ref_group_idx; /* internally detected. */ + for (unsigned int i = 0; i < ref_groups.size (); ++i) + { + const ref_group &curr_ref_group = ref_groups[i]; + const int UINT_MAX_DIGIT = 10; + /* Unrecognizable variable name related to ref_group. */ + if (!get_name (curr_ref_group.var)) + { + /* If the variable name does not have a string representation, + we can rename it by "tmp_var_" + . */ + char group_idx[UINT_MAX_DIGIT]; + sprintf (group_idx, "%u", i); + std::string tmp_var_name = "tmp_var_" + std::string (group_idx); + fprintf (dump_file, "Unrecognizable variable name at ref_group " + "index %u.\nThe tree expression for variable is: ", i); + print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM); + fprintf (dump_file, "\n"); + var2ref_group_idx[tmp_var_name] = i; + } + else + var2ref_group_idx[std::string (get_name (curr_ref_group.var))] = i; + } + /* Collect the variables in "reduced_ref_group" only if they show up in + the ref_groups. */ + for (const std::pair &var_mem_idx : var2mem_idx) + { + if (var2ref_group_idx.count (var_mem_idx.first)) + { + unsigned int ref_group_idx = var2ref_group_idx[var_mem_idx.first]; + ref_group2mem_idx[ref_group_idx] = var_mem_idx.second; + } + else if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "WARNING: Target variable \" %s \" does " + "not show up in the ref_groups. Check whether it needs " + "temporary variable name.\n", + var_mem_idx.first.c_str ()); + } + } + + for (const std::pair &ref_group_mem_idx : + ref_group2mem_idx) + { + ref_group curr_ref_group = ref_groups[ref_group_mem_idx.first]; + curr_ref_group.mem_ref_index = ref_group_mem_idx.second; + reduced_ref_groups.push_back (curr_ref_group); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nNOTICE: Prefetching target variable \" "); + print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM); + fprintf (dump_file, " \" at ref_group index %u and memory location " + "index %u.\n", ref_group_mem_idx.first, + ref_group_mem_idx.second); + } + } + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n\n"); +} + +/* The LLC intelligent allocation consists of 6 steps. */ + +void +llc_allocate (void) +{ + std::map > kernels_refs; + std::vector kernels; + if (!get_dense_memory_kernels (kernels, kernels_refs)) + return; + + std::set traced_ref_stmt; + std::vector unresolved_refs; + trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt, + unresolved_refs); + + if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt, + unresolved_refs)) + return; + + retrace_loop_refs_info_unresolved (unresolved_refs, traced_ref_stmt); + + std::vector sorted_kernels; + std::vector ref_groups; + if (param_filter_mode) + { + /* AutoFDO mode: include ENTRY bb and EXIT bb indices. */ + std::set bb_pathset; + bb_pathset.insert (0); + bb_pathset.insert (1); + if (!filter_and_sort_kernels_feedback (sorted_kernels, bb_pathset)) + return; + + if (!record_and_sort_ref_groups (ref_groups, kernels, kernels_refs, + bb_pathset)) + return; + } + else + { + /* static mode. */ + std::set bb_pathset; + if (!filter_and_sort_kernels (sorted_kernels, kernels)) + return; + + if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs, + bb_pathset)) + return; + } + + if (strlen (param_target_variables) > 0) + { + /* If "param_target_variables" is not empty, we will issue parsed target + variables compulsorily. 
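   In that case param_force_issue is set below, so issue_llc_hint emits hints
   for every reduced ref_group statically, skipping the size threshold check
   and the runtime branch generation.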
*/ + param_force_issue = true; + std::vector reduced_ref_groups; + prefetch_variables (ref_groups, reduced_ref_groups); + issue_llc_hint (reduced_ref_groups, sorted_kernels); + } + else + issue_llc_hint (ref_groups, sorted_kernels); +} + +/* Check whether the function is an operator reloading function. */ + +bool +operator_func_p (function *fn) +{ + const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); + + if (fn_name && strncmp (fn_name, "operator", 8) == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "operator_func: %s ", fn_name); + + return true; + } + return false; +} + +/* Check whether the function file location is known. */ + +bool +func_location_p (function *fn) +{ + expanded_location fn_decl_xloc + = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + expanded_location fn_xloc + = expand_location (fn->function_start_locus); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "fn->function_start_locus = %d \n", + fn->function_start_locus); + fprintf (dump_file, "fn_xloc.file = %s \n", + fn_xloc.file ? fn_xloc.file : "NULL"); + fprintf (dump_file, "fn_decl_xloc.file = %s \n", + fn_decl_xloc.file ? fn_decl_xloc.file : "NULL"); + fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n", + LOCATION_FILE (input_location) ? LOCATION_FILE (input_location) + : "NULL"); + } + if (fn_decl_xloc.file == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Function location unknown, skip analysis \n"); + return false; + } + /* Newly generated functions are filtered out, such as function constant + propagation func.constprop (). */ + if (LOCATION_FILE (input_location) != fn_decl_xloc.file) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Function location non-local, skip analysis \n"); + return false; + } + return true; +} + +/* Dump function information. */ + +void +dump_function_info (function *fn) +{ + const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nfn_name: %s\n", fn_name); + expanded_location cfun_xloc + = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + if (cfun_xloc.line) + { + if (cfun_xloc.file) + fprintf (dump_file, "[%s:%d:%d]\n", + cfun_xloc.file, cfun_xloc.line, cfun_xloc.column); + } + fprintf (dump_file, "\n"); + flow_loops_dump (dump_file, NULL, 1); + fprintf (dump_file, "\n"); + } +} + +/* dump param. */ + +void +dump_param (void) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "LLC allocate parameters:\n"); + fprintf (dump_file, " block size: %d\n", param_l1_cache_line_size); + fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", + param_l1_cache_size * 1024 / param_l1_cache_line_size, + param_l1_cache_size); + fprintf (dump_file, " L1 cache line size: %d\n", + param_l1_cache_line_size); + fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size); + fprintf (dump_file, " min mem_access_ratio: %d \n", + param_mem_access_ratio); + fprintf (dump_file, " min mem_access_num: %d \n", + param_mem_access_num); + fprintf (dump_file, "\n"); + } +} + +/* Determine whether to analyze the function according to + the ordering of functions containing cycle counts. 
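   Roughly (a sketch of the check below): a function is analyzed only if its
   PMU cycle count is available and no smaller than the count of the
   PREFETCH_FUNC_TOPN-th hottest function recorded in the profile:

     count = event_get_func_count (DECL_UID (current_function_decl),
                                   PMU_EVENT);
     analyze = count != 0
               && count >= event_get_topn_function_total_count_thres ();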
*/ + +static bool +should_analyze_func_p (void) +{ + gcov_type decl_uid = DECL_UID (current_function_decl); + gcov_type func_count = event_get_func_count (decl_uid, PMU_EVENT); + if (func_count == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "function uid %ld cannot find profile data " + "and skip prefetch analysis\n", + decl_uid); + } + return false; + } + if (func_count < event_get_topn_function_total_count_thres ()) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "function uid %ld total counts is %lu: " + "counts %lu < perf's top %d threshold %lu, " + "skip prefetch analysis\n", + decl_uid, func_count, func_count, + PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ()); + } + return false; + } + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "function uid %ld total counts is %lu: " + "counts %lu >= perf's top %d threshold %lu, " + "continue prefetch analysis\n", + decl_uid, func_count, func_count, + PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ()); + } + return true; +} + +const pass_data pass_data_llc_allocate = +{ + GIMPLE_PASS, /* type. */ + "llc_allocate", /* name. */ + OPTGROUP_LOOP, /* optinfo_flags. */ + TV_TREE_PREFETCH, /* tv_id. */ + (PROP_cfg | PROP_ssa), /* properties_required. */ + 0, /* properties_provided. */ + 0, /* properties_destroyed. */ + 0, /* todo_flags_start. */ + 0, /* todo_flags_finish. */ +}; + +class pass_llc_allocate : public gimple_opt_pass +{ +public: + pass_llc_allocate (gcc::context *ctxt) + : gimple_opt_pass (pass_data_llc_allocate, ctxt) + {} + + /* opt_pass methods. */ + virtual bool gate (function *) + { + return (optimize >= 2 && flag_llc_allocate > 0); + } + virtual unsigned int execute (function *); + +}; // class pass_llc_allocate + +unsigned int +pass_llc_allocate::execute (function *fn) +{ + unsigned int ret = 0; + + if (!targetm.have_prefetch () + || targetm.vectorize.code_for_prefetch == NULL + || targetm.vectorize.prefetch_handleable_mode_p == NULL + || targetm.vectorize.code_for_gather_prefetch == NULL) + return 0; + + if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) + { + tree type = build_function_type_list (void_type_node, + const_ptr_type_node, NULL_TREE); + tree decl = add_builtin_function ("__builtin_prefetch", type, + BUILT_IN_PREFETCH, BUILT_IN_NORMAL, + NULL, NULL_TREE); + DECL_IS_NOVOPS (decl) = true; + set_builtin_decl (BUILT_IN_PREFETCH, decl, false); + } + if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_FULL)) + { + tree type = build_function_type_list (void_type_node, + const_ptr_type_node, NULL_TREE); + tree decl = add_builtin_function ("__builtin_prefetch_full", type, + BUILT_IN_PREFETCH_FULL, BUILT_IN_NORMAL, + NULL, NULL_TREE); + DECL_IS_NOVOPS (decl) = true; + set_builtin_decl (BUILT_IN_PREFETCH_FULL, decl, false); + } + + dump_param (); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "llc_allocate: %s\n", + IDENTIFIER_POINTER (DECL_NAME (fn->decl))); + + if (number_of_loops (fn) <= 1 || !func_location_p (fn) + || operator_func_p (fn)) + return ret; + + /* Filter only when combined with PMU event. When the should_analyze_func_p + analysis fails (for example, the function without PMU-event count), + in order to ensure the accuracy of the LLC allocation analysis, the + function does not perform native allocation processing. 
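   Overall, llc_allocate () below is reached only when the target provides
   the prefetch patterns and the vectorizer prefetch hooks, the function has
   more than one loop, a known local source location and is not an operator
   function (operator_func_p), and, when additional profile feedback is
   enabled (flag_additional_profile), PMU-event profile data exists and
   should_analyze_func_p () holds.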
*/ + if (flag_additional_profile) + { + if (!profile_exist (PMU_EVENT) || !should_analyze_func_p ()) + { + return 0; + } + } + + dump_function_info (fn); + + llc_allocate (); + + return ret; +} + +} // anon namespace + +gimple_opt_pass * +make_pass_llc_allocate (gcc::context *ctxt) +{ + return new pass_llc_allocate (ctxt); +} diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc index 0353ffd3022..fdb84f57775 100644 --- a/gcc/tree-ssa-loop-niter.cc +++ b/gcc/tree-ssa-loop-niter.cc @@ -2489,6 +2489,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit) return true; } +/* Returns whether the number of vectorized iterations for the loop can be + estimated from the given IR and update the corresponding loop attribute, + e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... }); */ + +bool +number_of_iterations_vect (class loop *loop, tree lhs, tree rhs) +{ + loop->vec_nb_iterations = chrec_dont_know; + + if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME) + || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME)) + return false; + + tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs; + gimple *def_stmt = SSA_NAME_DEF_STMT (ssa); + + if (gimple_code (def_stmt) != GIMPLE_CALL + || !gimple_call_internal_p (def_stmt)) + return false; + + internal_fn ifn = gimple_call_internal_fn (def_stmt); + if (ifn != IFN_WHILE_ULT) + return false; + + gcall *call = dyn_cast (def_stmt); + tree niters = gimple_call_arg (call, 1); + loop->vec_nb_iterations = niters; + + return true; +} + /* Stores description of number of iterations of LOOP derived from EXIT (an exit edge of the LOOP) in NITER. Returns true if some useful information could be derived (and fields of NITER have meaning described @@ -2559,6 +2590,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit, op1 = gimple_cond_rhs (stmt); type = TREE_TYPE (op0); + if (TREE_CODE (type) == VECTOR_TYPE) + number_of_iterations_vect (loop, op0, op1); + if (TREE_CODE (type) != INTEGER_TYPE && !POINTER_TYPE_P (type)) return false; @@ -2850,16 +2884,16 @@ number_of_iterations_popcount (loop_p loop, edge exit, bool number_of_iterations_exit (class loop *loop, edge exit, - class tree_niter_desc *niter, - bool warn, bool every_iteration, - basic_block *body) + class tree_niter_desc *niter, bool warn, + bool every_iteration, basic_block *body, + bool guarantee) { gcond *stmt; if (!number_of_iterations_exit_assumptions (loop, exit, niter, &stmt, every_iteration, body)) return false; - if (integer_nonzerop (niter->assumptions)) + if (integer_nonzerop (niter->assumptions) || guarantee == false) return true; if (warn && dump_enabled_p ()) diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h index ceaf65e072b..1a7a4e8f753 100644 --- a/gcc/tree-ssa-loop-niter.h +++ b/gcc/tree-ssa-loop-niter.h @@ -24,10 +24,10 @@ extern tree expand_simple_operations (tree, tree = NULL); extern tree simplify_using_initial_conditions (class loop *, tree); extern bool loop_only_exit_p (const class loop *, basic_block *body, const_edge); -extern bool number_of_iterations_exit (class loop *, edge, - class tree_niter_desc *niter, bool, - bool every_iteration = true, - basic_block * = NULL); +extern bool +number_of_iterations_exit (class loop *, edge, class tree_niter_desc *niter, + bool, bool every_iteration = true, + basic_block * = NULL, bool guarantee = true); extern bool number_of_iterations_exit_assumptions (class loop *, edge, class tree_niter_desc *, gcond **, bool = true, diff --git 
a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 9d21e6d0310..32c81e47dfd 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -3447,34 +3447,13 @@ vect_loop_versioning (loop_vec_info loop_vinfo, cond_expr = expr; } - tree cost_name = NULL_TREE; - profile_probability prob2 = profile_probability::uninitialized (); - if (cond_expr - && EXPR_P (cond_expr) - && (version_niter - || version_align - || version_alias - || version_simd_if_cond)) - { - cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), - &cond_expr_stmt_list, - is_gimple_val, NULL_TREE); - /* Split prob () into two so that the overall probability of passing - both the cost-model and versioning checks is the orig prob. */ - prob2 = prob.split (prob); - } - if (version_niter) vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); if (cond_expr) - { - gimple_seq tem = NULL; - cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), - &tem, - is_gimple_condexpr, NULL_TREE); - gimple_seq_add_seq (&cond_expr_stmt_list, tem); - } + cond_expr + = force_gimple_operand_1 (unshare_expr (cond_expr), &cond_expr_stmt_list, + is_gimple_condexpr, NULL_TREE); if (version_align) vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, @@ -3503,10 +3482,10 @@ vect_loop_versioning (loop_vec_info loop_vinfo, tree c = fold_build2 (NE_EXPR, boolean_type_node, version_simd_if_cond, zero); if (cond_expr) - cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, - c, cond_expr); + cond_expr + = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, c, cond_expr); else - cond_expr = c; + cond_expr = c; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "created versioning for simd if condition check.\n"); @@ -3547,8 +3526,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo, outermost = superloop_at_depth (loop, 1); /* And avoid applying versioning on non-perfect nests. */ while (loop_to_version != outermost - && (e = single_exit (loop_outer (loop_to_version))) - && !(e->flags & EDGE_COMPLEX) + && single_exit (loop_outer (loop_to_version)) && (!loop_outer (loop_to_version)->inner->next || vect_loop_vectorized_call (loop_to_version)) && (!loop_outer (loop_to_version)->inner->next @@ -3583,7 +3561,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo, te->probability = prob; fe->probability = prob.invert (); /* We can scale loops counts immediately but have to postpone - scaling the scalar loop because we re-use it during peeling. */ + scaling the scalar loop because we re-use it during peeling. */ scale_loop_frequencies (loop_to_version, te->probability); LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability; @@ -3601,31 +3579,14 @@ vect_loop_versioning (loop_vec_info loop_vinfo, "applying loop versioning to outer loop %d\n", loop_to_version->num); - unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx; - initialize_original_copy_tables (); nloop = loop_version (loop_to_version, cond_expr, &condition_bb, prob, prob.invert (), prob, prob.invert (), true); gcc_assert (nloop); nloop = get_loop_copy (loop); - /* For cycle vectorization with SLP we rely on the PHI arguments - appearing in the same order as the SLP node operands which for the - loop PHI nodes means the preheader edge dest index needs to remain - the same for the analyzed loop which also becomes the vectorized one. - Make it so in case the state after versioning differs by redirecting - the first edge into the header to the same destination which moves - it last. 
*/ - if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx) - { - edge e = EDGE_PRED (loop->header, 0); - ssa_redirect_edge (e, e->dest); - flush_pending_stmts (e); - } - gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx); - /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will - reap those otherwise; they also refer to the original + reap those otherwise; they also refer to the original loops. */ class loop *l = loop; while (gimple *call = vect_loop_vectorized_call (l)) @@ -3678,62 +3639,29 @@ vect_loop_versioning (loop_vec_info loop_vinfo, update_ssa (TODO_update_ssa); } - /* Split the cost model check off to a separate BB. Costing assumes - this is the only thing we perform when we enter the scalar loop - from a failed cost decision. */ - if (cost_name && TREE_CODE (cost_name) == SSA_NAME) - { - gimple *def = SSA_NAME_DEF_STMT (cost_name); - gcc_assert (gimple_bb (def) == condition_bb); - /* All uses of the cost check are 'true' after the check we - are going to insert. */ - replace_uses_by (cost_name, boolean_true_node); - /* And we're going to build the new single use of it. */ - gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node, - NULL_TREE, NULL_TREE); - edge e = split_block (gimple_bb (def), def); - gimple_stmt_iterator gsi = gsi_for_stmt (def); - gsi_insert_after (&gsi, cond, GSI_NEW_STMT); - edge true_e, false_e; - extract_true_false_edges_from_block (e->dest, &true_e, &false_e); - e->flags &= ~EDGE_FALLTHRU; - e->flags |= EDGE_TRUE_VALUE; - edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE); - e->probability = prob2; - e2->probability = prob2.invert (); - set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src); - auto_vec adj; - for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest); - son; - son = next_dom_son (CDI_DOMINATORS, son)) - if (EDGE_COUNT (son->preds) > 1) - adj.safe_push (son); - for (auto son : adj) - set_immediate_dominator (CDI_DOMINATORS, son, e->src); - } - if (version_niter) { /* The versioned loop could be infinite, we need to clear existing niter information which is copied from the original loop. */ gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE)); vect_free_loop_info_assumptions (nloop); + /* And set constraint LOOP_C_INFINITE for niter analyzer. */ + loop_constraint_set (loop, LOOP_C_INFINITE); } if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION && dump_enabled_p ()) { if (version_alias) - dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, vect_location, - "loop versioned for vectorization because of " + "loop versioned for vectorization because of " "possible aliasing\n"); if (version_align) - dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, vect_location, - "loop versioned for vectorization to enhance " + "loop versioned for vectorization to enhance " "alignment\n"); - } return nloop; diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 7f75779519a..19eed9319d5 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -9989,7 +9989,6 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) niters_vector_mult_vf, !niters_no_overflow); unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); - scale_profile_for_vect_loop (loop, assumed_vf); /* True if the final iteration might not handle a full vector's worth of scalar iterations. 
*/ -- Gitee