diff --git a/0356-add-llc-allocate-feature.patch b/0356-add-llc-allocate-feature.patch new file mode 100644 index 0000000000000000000000000000000000000000..e7c8e126ae3f9ac4549925bbfdd56e90571815c6 --- /dev/null +++ b/0356-add-llc-allocate-feature.patch @@ -0,0 +1,8452 @@ +From 43e93c6df874a0bf78675fb4d3586d9ad1cb7dac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Tue, 25 Feb 2025 16:27:36 +0800 +Subject: [PATCH 1/2] add llc allocate feature + +--- + gcc/Makefile.in | 1 + + gcc/auto-profile.cc | 491 +- + gcc/auto-profile.h | 30 + + gcc/builtins.cc | 82 + + gcc/builtins.def | 1 + + gcc/cfgloop.h | 3 + + gcc/common.opt | 28 + + gcc/config/aarch64/aarch64-protos.h | 6 +- + gcc/config/aarch64/aarch64-sve.md | 48 +- + gcc/config/aarch64/aarch64.cc | 18 + + gcc/config/aarch64/aarch64.md | 39 + + gcc/dce.cc | 1 + + gcc/doc/tm.texi | 21 + + gcc/doc/tm.texi.in | 6 + + gcc/internal-fn.cc | 115 + + gcc/internal-fn.def | 4 + + gcc/ipa-pure-const.cc | 1 + + gcc/optabs.def | 2 + + gcc/opts.cc | 52 +- + gcc/params.opt | 62 + + gcc/passes.def | 2 + + gcc/print-rtl.cc | 6 + + gcc/rtl.def | 9 + + gcc/rtl.h | 4 + + gcc/rtlanal.cc | 2 + + gcc/sched-deps.cc | 4 +- + gcc/target-insns.def | 1 + + gcc/target.def | 31 + + .../g++.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-relion-expand-kernels.C | 52 + + .../g++.dg/llc-allocate/multidim_array.h | 186 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-cross-bb-indir-mem-acc.c | 36 + + .../llc-allocate/llc-extend-outer-loop.c | 61 + + .../llc-feedback-branch-in-loop.c | 39 + + .../llc-allocate/llc-feedback-break-in-loop.c | 41 + + .../llc-allocate/llc-feedback-goto-in-loop.c | 50 + + .../llc-feedback-same-loop-cycle.c | 129 + + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + + .../llc-prefetch-full-pldl1keep.c | 14 + + .../llc-prefetch-full-pldl1strm.c | 14 + + .../llc-prefetch-full-pldl2keep.c | 14 + + .../llc-prefetch-full-pldl2strm.c | 16 + + .../llc-prefetch-full-pldl3keep.c | 14 + + .../llc-prefetch-full-pldl3strm.c | 14 + + .../llc-prefetch-full-pldl4keep.c | 14 + + .../llc-prefetch-full-pldl4strm.c | 14 + + .../llc-prefetch-full-pstl1keep.c | 14 + + .../llc-prefetch-full-pstl1strm.c | 14 + + .../llc-prefetch-full-pstl2keep.c | 14 + + .../llc-prefetch-full-pstl2strm.c | 14 + + .../llc-prefetch-full-pstl3keep.c | 14 + + .../llc-prefetch-full-pstl3strm.c | 14 + + .../llc-prefetch-full-pstl4keep.c | 14 + + .../llc-prefetch-full-pstl4strm.c | 14 + + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + + .../gfortran.dg/llc-allocate/llc-3.f90 | 211 + + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + + .../llc-trace-multiple-base-var.f90 | 62 + + .../llc-unknown-type-size-unit.f90 | 58 + + .../llc-allocate/llc-wrf-4-outer-loop-num.f90 | 320 ++ + gcc/timevar.def | 2 + + gcc/toplev.cc | 6 + + gcc/tree-cfg.cc | 11 + + gcc/tree-cfg.h | 1 + + gcc/tree-pass.h | 3 + + gcc/tree-scalar-evolution.cc | 8 +- + gcc/tree-scalar-evolution.h | 3 +- + gcc/tree-ssa-llc-allocate.cc | 4150 +++++++++++++++++ + gcc/tree-ssa-loop-niter.cc | 38 +- + gcc/tree-ssa-loop-niter.h | 3 +- + gcc/tree-vect-loop-manip.cc | 266 ++ + gcc/tree-vect-loop.cc | 10 +- + gcc/tree-vectorizer.h | 1 + + 76 files changed, 7308 insertions(+), 45 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C + create mode 100644 
gcc/testsuite/g++.dg/llc-allocate/multidim_array.h
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-break-in-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-goto-in-loop.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-same-loop-cycle.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c
+ create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90
+ create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90
+ create mode 100644 gcc/tree-ssa-llc-allocate.cc
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 65f683bbd..ef7733580 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1659,6 +1659,7 @@ OBJS = \
+ 	tree-ssa-loop-niter.o \
+ 	tree-ssa-loop-array-widen-compare.o \
+ 	tree-ssa-loop-prefetch.o \
++	tree-ssa-llc-allocate.o \
+ 	tree-ssa-loop-split.o \
+ 	tree-ssa-loop-unswitch.o \
+ 	tree-ssa-loop.o \
+diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
+index 5e85381ce..97c3bafd5 100644
+--- a/gcc/auto-profile.cc
++++ b/gcc/auto-profile.cc
+@@ -49,6 +49,9 @@ along with GCC; see the file COPYING3.  If not see
+ #include "auto-profile.h"
+ #include "tree-pretty-print.h"
+ #include "gimple-pretty-print.h"
++#include <map>
++#include <set>
++#include <vector>
+ 
+ /* The following routines implements AutoFDO optimization.
+ 
+@@ -95,6 +98,8 @@ along with GCC; see the file COPYING3.  If not see
+  */
+ 
+ #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo"
++#define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov"
++#define DEFAULT_ADDITIONAL_PROFILE_FILE "addldata.gcov"
+ #define AUTO_PROFILE_VERSION 2
+ 
+ namespace autofdo
+ {
+@@ -117,6 +122,14 @@ private:
+   bool annotated_;
+ };
+ 
++/* Compare two <decl_uid, count> pairs by count, in descending order.  */
++static bool
++event_count_cmp (std::pair<unsigned, gcov_type> &a,
++		 std::pair<unsigned, gcov_type> &b)
++{
++  return a.second > b.second;
++}
++
+ /* Represent a source location: (function_decl, lineno).  */
+ typedef std::pair<tree, int> decl_lineno;
+ 
+@@ -311,6 +324,9 @@ public:
+   /* Mark LOC as annotated.  */
+   void mark_annotated (location_t loc);
+ 
++  /* Compute total count threshold of top functions in sampled data.  */
++  gcov_type calc_topn_function_total_count_thres (unsigned topn) const;
++
+ private:
+   /* Map from function_instance name index (in string_table) to
+      function_instance.  */
+@@ -338,6 +354,244 @@ static autofdo_source_profile *afdo_source_profile;
+ /* gcov_summary structure to store the profile_info.  */
+ static gcov_summary *afdo_profile_info;
+ 
++/* Check the profile flags and put the profile file names into EVENT_FILES.  */
++
++static bool
++get_all_profile_names (const char **event_files)
++{
++  if (!(flag_auto_profile
++	|| (flag_cache_misses_profile || flag_additional_profile)))
++    {
++      return false;
++    }
++
++  event_files[INST_EXEC] = auto_profile_file;
++
++  if (flag_cache_misses_profile)
++    {
++      if (cache_misses_profile_file == NULL)
++	{
++	  if (additional_profile_file == NULL)
++	    {
++	      additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE;
++	    }
++	  event_files[PMU_EVENT] = additional_profile_file;
++	}
++      event_files[CACHE_MISSES] = cache_misses_profile_file;
++    }
++  else if (flag_additional_profile)
++    {
++      if (additional_profile_file == NULL)
++	{
++	  additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE;
++	}
++      event_files[PMU_EVENT] = additional_profile_file;
++    }
++
++  return true;
++}
++
++static void read_profile (void);
++
++/* Maintain multiple profile data of different events with event_loc_count_map
++   and event_func_count_map.  */
++
++class extend_auto_profile
++{
++public:
++  bool auto_profile_exist (enum event_type type);
++  gcov_type get_loc_count (location_t, event_type);
++  gcov_type get_func_count (unsigned, event_type);
++  gcov_type get_topn_function_total_count_thres () const;
++  struct rank_info get_func_rank (unsigned, enum event_type);
++  /* There should be only one instance of class EXTEND_AUTO_PROFILE.  */
++  static extend_auto_profile *create ()
++  {
++    extend_auto_profile *map = new extend_auto_profile ();
++    if (map->read ())
++      {
++	return map;
++      }
++    delete map;
++    return NULL;
++  }
++private:
++  /* Basic maps of extend_auto_profile.  */
++  typedef std::map<location_t, gcov_type> loc_count_map;
++  typedef std::map<unsigned, gcov_type> func_count_map;
++
++  /* Map of function_uid to its descending order rank of counts.  */
++  typedef std::map<unsigned, unsigned> rank_map;
++
++  /* Mapping hardware events to corresponding basic maps.  */
++  typedef std::map<enum event_type, loc_count_map> event_loc_count_map;
++  typedef std::map<enum event_type, func_count_map> event_func_count_map;
++  typedef std::map<enum event_type, rank_map> event_rank_map;
++
++  extend_auto_profile () {}
++  bool read ();
++  void set_loc_count ();
++  void process_extend_source_profile ();
++  void read_extend_afdo_file (const char*, event_type);
++  void rank_all_func ();
++  void dump_event ();
++  event_loc_count_map event_loc_map;
++  event_func_count_map event_func_map;
++  event_rank_map func_rank;
++  event_type profile_type;
++  gcov_type topn_function_total_count_thres;
++};
++
++/* Member functions for extend_auto_profile. 
*/ ++ ++bool ++extend_auto_profile::auto_profile_exist (enum event_type type) ++{ ++ switch (type) ++ { ++ case INST_EXEC: ++ return event_func_map.count (INST_EXEC) != 0 ++ || event_loc_map.count (INST_EXEC) != 0; ++ case CACHE_MISSES: ++ return event_func_map.count (CACHE_MISSES) != 0 ++ || event_loc_map.count (CACHE_MISSES) != 0; ++ case PMU_EVENT: ++ return event_func_map.count (PMU_EVENT) != 0 ++ || event_loc_map.count (PMU_EVENT) != 0; ++ default: ++ return false; ++ } ++} ++ ++void ++extend_auto_profile::dump_event () ++{ ++ if (dump_file) ++ { ++ switch (profile_type) ++ { ++ case INST_EXEC: ++ fprintf (dump_file, "Processing event instruction execution.\n"); ++ break; ++ case CACHE_MISSES: ++ fprintf (dump_file, "Processing event cache misses.\n"); ++ break; ++ case PMU_EVENT: ++ fprintf (dump_file, "Processing other PMU events.\n"); ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++/* Return true if any profile data was read. */ ++ ++bool ++extend_auto_profile::read () ++{ ++ const char *event_files[EVENT_NUMBER] = {NULL}; ++ if (!get_all_profile_names (event_files)) ++ { ++ return false; ++ } ++ ++ /* Backup AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE since we will create ++ new ones for each event_type. */ ++ autofdo::string_table *string_table_afdo = afdo_string_table; ++ autofdo::autofdo_source_profile *source_profile_afdo = afdo_source_profile; ++ ++ for (unsigned i = 0; i < EVENT_NUMBER; i++) ++ { ++ if (event_files[i] == NULL) ++ { ++ continue; ++ } ++ profile_type = (enum event_type) i; ++ dump_event (); ++ gcov_close (); ++ auto_profile_file = event_files[i]; ++ read_profile (); ++ gcov_close (); ++ ++ topn_function_total_count_thres = param_llc_allocate_func_counts_threshold; ++ if (param_llc_allocate_func_topn > 0 && profile_type == PMU_EVENT) ++ { ++ topn_function_total_count_thres ++ = afdo_source_profile->calc_topn_function_total_count_thres ( ++ param_llc_allocate_func_topn); ++ } ++ ++ process_extend_source_profile (); ++ ++ delete afdo_source_profile; ++ delete afdo_string_table; ++ } ++ ++ /* Restore AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE. Function ++ END_AUTO_PROFILE will free them at the end of compilation. */ ++ afdo_string_table = string_table_afdo; ++ afdo_source_profile = source_profile_afdo; ++ return true; ++} ++ ++/* Helper functions. 
*/
++
++gcov_type
++extend_auto_profile::get_loc_count (location_t loc, event_type type)
++{
++  event_loc_count_map::iterator event_iter = event_loc_map.find (type);
++  if (event_iter != event_loc_map.end ())
++    {
++      loc_count_map::iterator loc_iter = event_iter->second.find (loc);
++      if (loc_iter != event_iter->second.end ())
++	{
++	  return loc_iter->second;
++	}
++    }
++  return 0;
++}
++
++struct rank_info
++extend_auto_profile::get_func_rank (unsigned decl_uid, enum event_type type)
++{
++  struct rank_info info = {0, 0};
++  event_rank_map::iterator event_iter = func_rank.find (type);
++  if (event_iter != func_rank.end ())
++    {
++      rank_map::iterator func_iter = event_iter->second.find (decl_uid);
++      if (func_iter != event_iter->second.end ())
++	{
++	  info.rank = func_iter->second;
++	  info.total = event_iter->second.size ();
++	}
++    }
++  return info;
++}
++
++gcov_type
++extend_auto_profile::get_func_count (unsigned decl_uid, event_type type)
++{
++  event_func_count_map::iterator event_iter = event_func_map.find (type);
++  if (event_iter != event_func_map.end ())
++    {
++      func_count_map::iterator func_iter = event_iter->second.find (decl_uid);
++      if (func_iter != event_iter->second.end ())
++	{
++	  return func_iter->second;
++	}
++    }
++  return 0;
++}
++
++gcov_type
++extend_auto_profile::get_topn_function_total_count_thres () const
++{
++  return topn_function_total_count_thres;
++}
++
++static extend_auto_profile *extend_profile;
++
+ /* Helper functions.  */
+ 
+ /* Return the original name of NAME: strip the suffix that starts
+@@ -483,7 +737,7 @@ string_table::get_index (const char *name) const
+   return iter->second;
+ }
+ 
+-/* Return the index of a given function DECL. Return -1 if DECL is not
++/* Return the index of a given function DECL.  Return -1 if DECL is not
+    found in string table.  */
+ 
+ int
+@@ -917,6 +1171,31 @@ autofdo_source_profile::get_function_instance_by_inline_stack (
+   return s;
+ }
+ 
++/* Compute total count threshold of top functions in sampled data.  */
++
++gcov_type
++autofdo_source_profile::calc_topn_function_total_count_thres (
++  unsigned topn) const
++{
++  std::set<gcov_type> func_counts;
++  for (name_function_instance_map::const_iterator iter = map_.begin ();
++       iter != map_.end (); ++iter)
++    {
++      if (func_counts.size () < topn)
++	func_counts.insert (iter->second->total_count ());
++      else if (*func_counts.begin () < iter->second->total_count ())
++	{
++	  func_counts.erase (func_counts.begin ());
++	  func_counts.insert (iter->second->total_count ());
++	}
++    }
++
++  if (topn > 0 && func_counts.size () == topn
++      && param_llc_allocate_func_counts_threshold < *func_counts.begin ())
++    return *func_counts.begin ();
++  return param_llc_allocate_func_counts_threshold;
++}
++
+ /* Module profile is only used by LIPO.  Here we simply ignore it.  */
+ 
+ static void
+@@ -1842,6 +2121,132 @@ auto_profile (void)
+ 
+   return TODO_rebuild_cgraph_edges;
+ }
++
++
++void
++extend_auto_profile::rank_all_func ()
++{
++  std::vector<std::pair<unsigned, gcov_type> > func_sorted;
++  event_func_count_map::iterator event_iter
++    = event_func_map.find (profile_type);
++  if (event_iter != event_func_map.end ())
++    {
++      func_count_map::iterator func_iter;
++      for (func_iter = event_iter->second.begin ();
++	   func_iter != event_iter->second.end (); func_iter++)
++	{
++	  func_sorted.push_back (std::make_pair (func_iter->first,
++						 func_iter->second));
++	}
++
++      std::sort (func_sorted.begin (), func_sorted.end (), event_count_cmp);
++
++      for (unsigned i = 0; i < func_sorted.size (); ++i)
++	{
++	  func_rank[profile_type][func_sorted[i].first] = i + 1;
++	}
++    }
++}
++
++/* Iterate over stmts in cfun and record their counts in EVENT_LOC_MAP.  */
++
++void
++extend_auto_profile::set_loc_count ()
++{
++  basic_block bb;
++  FOR_EACH_BB_FN (bb, cfun)
++    {
++      gimple_stmt_iterator gsi;
++      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
++	{
++	  count_info info;
++	  gimple *stmt = gsi_stmt (gsi);
++	  if (gimple_clobber_p (stmt) || is_gimple_debug (stmt))
++	    {
++	      continue;
++	    }
++	  if (afdo_source_profile->get_count_info (stmt, &info))
++	    {
++	      location_t loc = gimple_location (stmt);
++	      event_loc_map[profile_type][loc] += info.count;
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "stmt ");
++		  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
++		  fprintf (dump_file, "counts %ld\n",
++			   event_loc_map[profile_type][loc]);
++		}
++	    }
++	}
++    }
++}
++
++/* Process data in afdo_source_profile and save it into two maps:
++   1. gimple_location to count.
++   2. function_index to count.  */
++void
++extend_auto_profile::process_extend_source_profile ()
++{
++  struct cgraph_node *node;
++  if (symtab->state == FINISHED)
++    {
++      return;
++    }
++  FOR_EACH_FUNCTION (node)
++    {
++      if (!gimple_has_body_p (node->decl) || node->inlined_to)
++	{
++	  continue;
++	}
++
++      /* Don't profile functions produced for builtin stuff.  */
++      if (DECL_SOURCE_LOCATION (node->decl) == BUILTINS_LOCATION)
++	{
++	  continue;
++	}
++
++      function *fn = DECL_STRUCT_FUNCTION (node->decl);
++      push_cfun (fn);
++
++      const function_instance *s
++	= afdo_source_profile->get_function_instance_by_decl (
++	  current_function_decl);
++
++      if (s == NULL)
++	{
++	  pop_cfun ();
++	  continue;
++	}
++      unsigned int decl_uid = DECL_UID (current_function_decl);
++      gcov_type count = s->total_count ();
++      if (dump_file)
++	{
++	  fprintf (dump_file, "Extend auto-profile for function %s.\n",
++		   node->dump_name ());
++	}
++      event_func_map[profile_type][decl_uid] += count;
++      set_loc_count ();
++      pop_cfun ();
++    }
++  rank_all_func ();
++}
++
++/* Main entry of extend_auto_profile.  */
++
++static void
++extend_source_profile ()
++{
++  extend_profile = autofdo::extend_auto_profile::create ();
++  if (dump_file)
++    {
++      if (extend_profile == NULL)
++	{
++	  fprintf (dump_file, "No profile file is found.\n");
++	  return;
++	}
++      fprintf (dump_file, "Extend profile info generated.\n");
++    }
++}
+ } /* namespace autofdo.  */
+ 
+ /* Read the profile from the profile data file.  */
+@@ -1870,6 +2275,48 @@ end_auto_profile (void)
+   profile_info = NULL;
+ }
+ 
++/* Extern functions to get profile info in other passes.  */
++
++bool
++profile_exist (enum event_type type)
++{
++  return autofdo::extend_profile != NULL
++	 && autofdo::extend_profile->auto_profile_exist (type);
++}
++
++gcov_type
++event_get_loc_count (location_t loc, event_type type)
++{
++  return autofdo::extend_profile->get_loc_count (loc, type);
++}
++
++gcov_type
++event_get_func_count (unsigned decl_uid, event_type type)
++{
++  return autofdo::extend_profile->get_func_count (decl_uid, type);
++}
++
++struct rank_info
++event_get_func_rank (unsigned decl_uid, enum event_type type)
++{
++  return autofdo::extend_profile->get_func_rank (decl_uid, type);
++}
++
++gcov_type
++event_get_topn_function_total_count_thres ()
++{
++  return autofdo::extend_profile->get_topn_function_total_count_thres ();
++}
++
++void
++free_extend_profile_info ()
++{
++  if (autofdo::extend_profile != NULL)
++    {
++      delete autofdo::extend_profile;
++    }
++}
++
+ /* Returns TRUE if EDGE is hot enough to be inlined early.  */
+ 
+ bool
+@@ -1931,8 +2378,50 @@ public:
+ 
+ } // anon namespace
+ 
++namespace
++{
++const pass_data pass_data_ipa_extend_auto_profile =
++{
++  SIMPLE_IPA_PASS, /* type */
++  "ex-afdo", /* name */
++  OPTGROUP_NONE, /* optinfo_flags */
++  TV_IPA_EXTEND_AUTO_PROFILE, /* tv_id */
++  0, /* properties_required */
++  0, /* properties_provided */
++  0, /* properties_destroyed */
++  0, /* todo_flags_start */
++  0, /* todo_flags_finish */
++};
++
++class pass_ipa_extend_auto_profile : public simple_ipa_opt_pass
++{
++public:
++  pass_ipa_extend_auto_profile (gcc::context *ctxt)
++    : simple_ipa_opt_pass (pass_data_ipa_extend_auto_profile, ctxt)
++  {}
++
++  /* opt_pass methods: */
++  virtual bool gate (function *) { return (flag_ipa_extend_auto_profile > 0); }
++  virtual unsigned int execute (function *);
++
++};
++
++unsigned int
++pass_ipa_extend_auto_profile::execute (function *fun)
++{
++  autofdo::extend_source_profile ();
++  return 0;
++}
++} // anon namespace
++
+ simple_ipa_opt_pass *
+ make_pass_ipa_auto_profile (gcc::context *ctxt)
+ {
+   return new pass_ipa_auto_profile (ctxt);
+ }
++
++simple_ipa_opt_pass *
++make_pass_ipa_extend_auto_profile (gcc::context *ctxt)
++{
++  return new pass_ipa_extend_auto_profile (ctxt);
++}
+diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h
+index bf3f90f2f..dea0b18e6 100644
+--- a/gcc/auto-profile.h
++++ b/gcc/auto-profile.h
+@@ -21,6 +21,14 @@ along with GCC; see the file COPYING3.  If not see
+ #ifndef AUTO_PROFILE_H
+ #define AUTO_PROFILE_H
+ 
++enum event_type
++{
++  INST_EXEC = 0,
++  CACHE_MISSES,
++  PMU_EVENT,
++  EVENT_NUMBER
++};
++
+ /* Read, process, finalize AutoFDO data structures.  */
+ extern void read_autofdo_file (void);
+ extern void end_auto_profile (void);
+@@ -28,4 +36,26 @@ extern void end_auto_profile (void);
+ /* Returns TRUE if EDGE is hot enough to be inlined early.  */
+ extern bool afdo_callsite_hot_enough_for_early_inline (struct cgraph_edge *);
+ 
++/* Check if profile exists before using this profile.  */
++extern bool profile_exist (enum event_type);
++
++/* Given func decl_uid or gimple location and event_type, return count.
++   Count is 0 if function or gimple is not sampled.  */
++extern gcov_type event_get_func_count (unsigned, enum event_type);
++extern gcov_type event_get_loc_count (location_t, enum event_type);
++extern gcov_type event_get_topn_function_total_count_thres ();
++
++struct rank_info
++{
++  unsigned total;
++  unsigned rank;
++};
++
++/* Given function decl_uid and event type, return rank_info.  Rank_info
++   is {0, 0} if function was not sampled.  */
++extern struct rank_info event_get_func_rank (unsigned, enum event_type);
++
++/* Free memory allocated by autofdo::extend_profile.  */
++extern void free_extend_profile_info ();
++
+ #endif /* AUTO_PROFILE_H */
+diff --git a/gcc/builtins.cc b/gcc/builtins.cc
+index 57929a42b..dc2e9c3f3 100644
+--- a/gcc/builtins.cc
++++ b/gcc/builtins.cc
+@@ -1352,6 +1352,85 @@ expand_builtin_prefetch (tree exp)
+   emit_insn (op0);
+ }
+ 
++/* Expand a call to __builtin_prefetch_full.  */
++
++static void
++expand_builtin_prefetch_full (tree exp)
++{
++  tree arg0, arg1, arg2;
++  int nargs;
++  rtx op0, op1, op2;
++
++  if (!validate_arglist (exp, POINTER_TYPE, 0))
++    return;
++
++  arg0 = CALL_EXPR_ARG (exp, 0);
++
++  /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to
++     zero (read) and argument 2 (locality) defaults to 3 (high degree of
++     locality).  */
++  nargs = call_expr_nargs (exp);
++  if (nargs > 1)
++    arg1 = CALL_EXPR_ARG (exp, 1);
++  else
++    arg1 = integer_zero_node;
++  if (nargs > 2)
++    arg2 = CALL_EXPR_ARG (exp, 2);
++  else
++    arg2 = integer_three_node;
++
++  /* Argument 0 is an address.  */
++  op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL);
++
++  /* Argument 1 (read/write flag) must be a compile-time constant int.  */
++  if (TREE_CODE (arg1) != INTEGER_CST)
++    {
++      error ("second argument to %<__builtin_prefetch_full%> must be a "
++	     "constant");
++      arg1 = integer_zero_node;
++    }
++  op1 = expand_normal (arg1);
++  /* Argument 1 must be either zero or one.  */
++  if (INTVAL (op1) != 0 && INTVAL (op1) != 1)
++    {
++      warning (0, "invalid second argument to %<__builtin_prefetch_full%>;"
++	       " using zero");
++      op1 = const0_rtx;
++    }
++
++  /* Argument 2 (locality) must be a compile-time constant int.  */
++  if (TREE_CODE (arg2) != INTEGER_CST)
++    {
++      error ("third argument to %<__builtin_prefetch_full%> must be a "
++	     "constant");
++      arg2 = integer_zero_node;
++    }
++  op2 = expand_normal (arg2);
++  /* Argument 2 must be 0-7.  */
++  if (INTVAL (op2) < 0 || INTVAL (op2) > 7)
++    {
++      warning (0, "invalid third argument to %<__builtin_prefetch_full%>; "
++	       "using zero");
++      op2 = const0_rtx;
++    }
++
++  if (targetm.have_prefetch_full ())
++    {
++      class expand_operand ops[3];
++
++      create_address_operand (&ops[0], op0);
++      create_integer_operand (&ops[1], INTVAL (op1));
++      create_integer_operand (&ops[2], INTVAL (op2));
++      if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops))
++	return;
++    }
++
++  /* Don't do anything with direct references to volatile memory, but
++     generate code to handle other side effects.  */
++  if (!MEM_P (op0) && side_effects_p (op0))
++    emit_insn (op0);
++}
++
+ /* Get a MEM rtx for expression EXP which is the address of an operand
+    to be used in a string instruction (cmpstrsi, cpymemsi, ..). 
LEN is + the maximum length of the block of memory that might be accessed or +@@ -7598,6 +7677,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + case BUILT_IN_PREFETCH: + expand_builtin_prefetch (exp); + return const0_rtx; ++ case BUILT_IN_PREFETCH_FULL: ++ expand_builtin_prefetch_full (exp); ++ return const0_rtx; + + case BUILT_IN_INIT_TRAMPOLINE: + return expand_builtin_init_trampoline (exp, true); +diff --git a/gcc/builtins.def b/gcc/builtins.def +index 005976f34..f2e0c357d 100644 +--- a/gcc/builtins.def ++++ b/gcc/builtins.def +@@ -924,6 +924,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C + DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) + DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) + DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) ++DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) + DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index d2714e20c..794bc3ecc 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -272,6 +272,9 @@ public: + the basic-block from being collected but its index can still be + reused. */ + basic_block former_header; ++ ++ /* Number of latch executions from vectorization. */ ++ tree vec_nb_iterations; + }; + + /* Set if the loop is known to be infinite. */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 6ab7ba4cc..e6ffa1c58 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1148,6 +1148,26 @@ Common Joined RejectNegative Var(auto_profile_file) + Use sample profile information for call graph node weights. The profile + file is specified in the argument. + ++fcache-misses-profile ++Common Var(flag_cache_misses_profile) ++Use sample profile information for source code cache miss count. The default ++profile file is cmsdata.gcov in `pwd`. ++ ++fcache-misses-profile= ++Common Joined RejectNegative Var(cache_misses_profile_file) ++Use sample profile information for source code cache miss count. The profile ++file is specified in the argument. ++ ++fadditional-profile ++Common Var(flag_additional_profile) ++Use additional PMU-event sample profile information for source code bb count. ++The default profile file is addldata.gcov in `pwd`. ++ ++fadditional-profile= ++Common Joined RejectNegative Var(additional_profile_file) ++Use additional PMU-event sample profile information for source code bb count. ++The profile file is specified in the argument. ++ + ; -fcheck-bounds causes gcc to generate array bounds checks. + ; For C, C++ and ObjC: defaults off. + ; For Java: defaults to on. +@@ -2074,6 +2094,10 @@ fipa-struct-sfc-shadow + Common Var(flag_ipa_struct_sfc_shadow) Init(0) Optimization + Enable field shadowing optimization in static struct field compression. + ++fipa-extend-auto-profile ++Common Var(flag_ipa_extend_auto_profile) ++Use sample profile information for source code. ++ + fipa-vrp + Common Var(flag_ipa_vrp) Optimization + Perform IPA Value Range Propagation. 
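As a usage sketch for the __builtin_prefetch_full builtin registered in builtins.def above (illustrative only, not part of the patch; the names `buf' and `walk' and the prefetch distance 64 are invented here):

    /* Illustrative only.  Argument 1 is the read/write flag (0 or 1) and
       argument 2 is the prfop level 0-7; the aarch64.md table later in
       this patch maps levels 0-7 to PLDL1KEEP..PLDL4STRM for reads and
       PSTL1KEEP..PSTL4STRM for writes, so (0, 4) requests PLDL3KEEP.  */
    extern double buf[];

    void
    walk (long n)
    {
      for (long i = 0; i < n; i++)
        {
          /* Hint that buf[i + 64] should be kept in L3 (rw = 0, level = 4).  */
          __builtin_prefetch_full (&buf[i + 64], 0, 4);
          buf[i] *= 2.0;
        }
    }

Both the rw flag and the level must be compile-time constants, as enforced by expand_builtin_prefetch_full above.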
+@@ -2424,6 +2448,10 @@ fipa-prefetch
+ Common Var(flag_ipa_prefetch) Init(0) Optimization
+ Generate prefetch instructions, if available, using IPA info.
+ 
++fllc-allocate
++Common Var(flag_llc_allocate) Init(-1) Optimization
++Generate LLC hint instructions.
++
+ fprofile
+ Common Var(profile_flag)
+ Enable basic program profiling code.
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index cbb844fbc..af0881f7a 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -702,12 +702,16 @@ extern struct tune_params aarch64_tune_params;
+   T (PLDL2STRM, pldl2strm, 3) \
+   T (PLDL3KEEP, pldl3keep, 4) \
+   T (PLDL3STRM, pldl3strm, 5) \
++  T (PLDL4KEEP, pldl4keep, 6) \
++  T (PLDL4STRM, pldl4strm, 7) \
+   T (PSTL1KEEP, pstl1keep, 8) \
+   T (PSTL1STRM, pstl1strm, 9) \
+   T (PSTL2KEEP, pstl2keep, 10) \
+   T (PSTL2STRM, pstl2strm, 11) \
+   T (PSTL3KEEP, pstl3keep, 12) \
+-  T (PSTL3STRM, pstl3strm, 13)
++  T (PSTL3STRM, pstl3strm, 13) \
++  T (PSTL4KEEP, pstl4keep, 14) \
++  T (PSTL4STRM, pstl4strm, 15)
+ 
+ #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE,
+ enum aarch64_svpattern {
+diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
+index a8a5dc3a2..7808abf70 100644
+--- a/gcc/config/aarch64/aarch64-sve.md
++++ b/gcc/config/aarch64/aarch64-sve.md
+@@ -1952,7 +1952,7 @@
+ (define_insn "@aarch64_sve_prefetch<mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:<VPRED> 0 "register_operand" "Upl")
+-		(match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP")
++		(match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP")
+ 		(match_operand:DI 2 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH)
+ 	     (match_operand:DI 3 "const_int_operand")
+@@ -1985,14 +1985,14 @@
+ ;; 6: the prefetch operator (an svprfop)
+ ;; 7: the normal RTL prefetch rw flag
+ ;; 8: the normal RTL prefetch locality value
+-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx4SI_ONLY:mode>"
++(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx4SI_ONLY:mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
+-		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg, rk, rk, rk, rk")
++		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg, rk, rk, rk, rk")
+ 		(match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w")
+ 		(match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2000,12 +2000,12 @@
+   "TARGET_SVE && TARGET_NON_STREAMING"
+ {
+     static const char *const insns[][2] = {
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.s]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.s, #%1]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.s]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.s, #%1]",
+       "prfb", "%0, [%1, %2.s, sxtw]",
+       "prfb", "%0, [%1, %2.s, uxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, sxtw %p4]",
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.s, uxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2014,14 +2014,14 @@
+ 
+ ;; Predicated gather prefetches for 64-bit elements.  The value of operand 3
+ ;; doesn't matter in this case.
+-(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>"
++(define_insn "@aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl")
+-		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL_I:Vesize>" "Z, vg, rk, rk")
++		(match_operand:DI 1 "aarch64_sve_gather_offset_<SVE_FULL:Vesize>" "Z, vg, rk, rk")
+ 		(match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w")
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, Ui1, Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, Ui1, Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2029,10 +2029,10 @@
+   "TARGET_SVE && TARGET_NON_STREAMING"
+ {
+     static const char *const insns[][2] = {
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.d]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%2.d, #%1]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.d]",
++      "prf<SVE_FULL:Vesize>", "%0, [%2.d, #%1]",
+       "prfb", "%0, [%1, %2.d]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, lsl %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, lsl %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2040,7 +2040,7 @@
+ )
+ 
+ ;; Likewise, but with the offset being sign-extended from 32 bits.
+-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_sxtw"
++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_sxtw"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
+ 		(match_operand:DI 1 "register_operand" "rk, rk")
+@@ -2051,8 +2051,8 @@
+ 		      (match_operand:VNx2DI 2 "register_operand" "w, w")))]
+ 		  UNSPEC_PRED_X)
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2061,7 +2061,7 @@
+ {
+     static const char *const insns[][2] = {
+       "prfb", "%0, [%1, %2.d, sxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, sxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+@@ -2073,7 +2073,7 @@
+ )
+ 
+ ;; Likewise, but with the offset being zero-extended from 32 bits.
+-(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL_I:mode><VNx2DI_ONLY:mode>_uxtw"
++(define_insn "*aarch64_sve_gather_prefetch<SVE_FULL:mode><VNx2DI_ONLY:mode>_uxtw"
+   [(prefetch (unspec:DI
+ 	       [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl")
+ 		(match_operand:DI 1 "register_operand" "rk, rk")
+@@ -2081,8 +2081,8 @@
+ 		  (match_operand:VNx2DI 2 "register_operand" "w, w")
+ 		  (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate"))
+ 		(match_operand:DI 3 "const_int_operand")
+-		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL_I:Vesize>" "Ui1, i")
+-		(match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero")
++		(match_operand:DI 4 "aarch64_gather_scale_operand_<SVE_FULL:Vesize>" "Ui1, i")
++		(match_operand:SVE_FULL 5 "aarch64_simd_imm_zero")
+ 		(match_operand:DI 6 "const_int_operand")]
+ 	       UNSPEC_SVE_PREFETCH_GATHER)
+ 	     (match_operand:DI 7 "const_int_operand")
+@@ -2091,7 +2091,7 @@
+ {
+     static const char *const insns[][2] = {
+       "prfb", "%0, [%1, %2.d, uxtw]",
+-      "prf<SVE_FULL_I:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
++      "prf<SVE_FULL:Vesize>", "%0, [%1, %2.d, uxtw %p4]"
+     };
+     const char *const *parts = insns[which_alternative];
+     return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]);
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index e9c387b24..a06c2c515 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -4408,6 +4408,13 @@ aarch64_sve_data_mode_p (machine_mode mode)
+   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
+ }
+ 
++/* Return true if MODE is a full SVE data vector mode.  */
++static bool
++aarch64_full_sve_data_mode_p (machine_mode mode)
++{
++  return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA;
++}
++
+ /* Return the number of defined bytes in one constituent vector of
+    SVE mode MODE, which has vector flags VEC_FLAGS.  */
+ static poly_int64
+@@ -31796,6 +31803,17 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_ASM_FUNCTION_EPILOGUE
+ #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
+ 
++#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH
++#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch
++
++#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
++#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \
++  code_for_aarch64_sve_gather_prefetch
++
++#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
++#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \
++  aarch64_full_sve_data_mode_p
++
+ #undef TARGET_HAVE_SHADOW_CALL_STACK
+ #define TARGET_HAVE_SHADOW_CALL_STACK true
+ 
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index 2f46bc793..69d296556 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -925,6 +925,45 @@
+   [(set_attr "type" "load_4")]
+ )
+ 
++(define_insn "prefetch_full"
++  [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp")
++		  (match_operand:QI 1 "const_int_operand" "")
++		  (match_operand:QI 2 "const_int_operand" ""))]
++  ""
++  {
++    const char * pftype[2][8] =
++    {
++      {"prfm\\tPLDL1KEEP, %0",
++       "prfm\\tPLDL1STRM, %0",
++       "prfm\\tPLDL2KEEP, %0",
++       "prfm\\tPLDL2STRM, %0",
++       "prfm\\tPLDL3KEEP, %0",
++       "prfm\\tPLDL3STRM, %0",
++       "prfm\\tPLDL4KEEP, %0",
++       "prfm\\tPLDL4STRM, %0"},
++      {"prfm\\tPSTL1KEEP, %0",
++       "prfm\\tPSTL1STRM, %0",
++       "prfm\\tPSTL2KEEP, %0",
++       "prfm\\tPSTL2STRM, %0",
++       "prfm\\tPSTL3KEEP, %0",
++       "prfm\\tPSTL3STRM, %0",
++       "prfm\\tPSTL4KEEP, %0",
++       "prfm\\tPSTL4STRM, %0"},
++    };
++
++    int prfop = INTVAL (operands[2]);
++
++    gcc_assert (IN_RANGE (prfop, 0, 7));
++
++    /* PRFM accepts the same addresses as a 64-bit LDR so wrap
++       the address into a DImode MEM so that aarch64_print_operand knows
++       how to print it.  */
++    operands[0] = gen_rtx_MEM (DImode, operands[0]);
++    return pftype[INTVAL (operands[1])][prfop];
++  }
++  [(set_attr "type" "load_4")]
++)
++
+ (define_insn "trap"
+   [(trap_if (const_int 1) (const_int 8))]
+   ""
+diff --git a/gcc/dce.cc b/gcc/dce.cc
+index 6676cbcd4..964a0a6d0 100644
+--- a/gcc/dce.cc
++++ b/gcc/dce.cc
+@@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body)
+   switch (GET_CODE (body))
+     {
+     case PREFETCH:
++    case PREFETCH_FULL:
+     case TRAP_IF:
+       /* The UNSPEC case was added here because the ia-64 claims that
+ 	 USEs do not work after reload and generates UNSPECS rather
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index 50bbbbc42..16ada7aae 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -6278,6 +6278,25 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
+ stores.
+ @end deftypefn
+ 
++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg})
++This hook should return the instruction code of the target pattern that
++implements a predicated (masked) vector prefetch for mode @var{arg}.
++The hook is @code{NULL} if no such pattern is provided.
++@end deftypefn
++
++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_from})
++This hook should return the instruction code of the target pattern that
++implements a predicated gather prefetch for data mode @var{mode_to} and
++offset vector mode @var{mode_from}.  The hook is @code{NULL} if no such
++pattern is provided.
++@end deftypefn
++
++@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg})
++This hook should return true if vector prefetches can be generated for
++mode @var{arg}; on AArch64 this means @var{arg} is a full SVE data
++vector mode.
++@end deftypefn
++
+ @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int})
+ This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
+ fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index cfda60304..88db8752e 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -4190,6 +4190,12 @@ address; but often a machine-dependent strategy can generate better code.
+ 
+ @hook TARGET_VECTORIZE_BUILTIN_SCATTER
+ 
++@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH
++
++@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH
++
++@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P
++
+ @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
+ 
+ @hook TARGET_SIMD_CLONE_ADJUST
+diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
+index 8b1733e20..19811106f 100644
+--- a/gcc/internal-fn.cc
++++ b/gcc/internal-fn.cc
+@@ -107,11 +107,13 @@ init_internal_fns ()
+    direct_internal_fn. 
*/ + #define not_direct { -2, -2, false } + #define mask_load_direct { -1, 2, false } ++#define mask_prefetch_direct { -1, 2, false } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { 3, 1, false } + #define len_load_direct { -1, -1, false } + #define mask_store_direct { 3, 2, false } ++#define gather_prefetch_direct { 3, 1, false } + #define store_lanes_direct { 0, 0, false } + #define mask_store_lanes_direct { 0, 0, false } + #define vec_cond_mask_direct { 1, 0, false } +@@ -2745,6 +2747,53 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + #define expand_len_load_optab_fn expand_partial_load_optab_fn + ++/* Expand MASK_PREFETCH call STMT using optab OPTAB. ++ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); ++ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); ++*/ ++ ++static void ++expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ tree base = gimple_call_arg (stmt, 0); ++ if (base == NULL_TREE) ++ return; ++ ++ tree maskt = gimple_call_arg (stmt, 2); ++ tree target = gimple_call_arg (stmt, 3); ++ tree prfop = gimple_call_arg (stmt, 4); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); ++ ++ rtx mask = expand_normal (maskt); ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ ++ unsigned i = 0; ++ class expand_operand ops[5]; ++ create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB. */ + + static void +@@ -3402,6 +3451,70 @@ contains_call_div_mod (rtx_insn *insn) + return false; + } + ++/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. ++ vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87); ++ .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4); ++*/ ++ ++static void ++expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_gather_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ /* Extracting tree nodes, only expand for scalar base and vector index. 
*/ ++ tree base = gimple_call_arg (stmt, 0); ++ if (VECTOR_TYPE_P (TREE_TYPE (base))) ++ return; ++ tree offset = gimple_call_arg (stmt, 1); ++ if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) ++ return; ++ ++ tree scale = gimple_call_arg (stmt, 2); ++ tree mask = gimple_call_arg (stmt, 4); ++ tree target = gimple_call_arg (stmt, 5); ++ tree prfop = gimple_call_arg (stmt, 6); ++ ++ /* Convert to the rtx node. */ ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ rtx offset_rtx = expand_normal (offset); ++ rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); ++ rtx mask_rtx = expand_normal (mask); ++ HOST_WIDE_INT scale_int = tree_to_shwi (scale); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ /* add operand. */ ++ unsigned int i = 0; ++ class expand_operand ops[9]; ++ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); ++ /* Check whether the index has unsigned. */ ++ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); ++ create_integer_operand (&ops[i++], scale_int); ++ create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ ++ machine_mode reg_mode = GET_MODE (offset_rtx); ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_gather_prefetch ++ (m_mode, reg_mode); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand DIVMOD() using: + a) optab handler for udivmod/sdivmod if it is available. + b) If optab_handler doesn't exist, generate call to +@@ -3767,10 +3880,12 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, + #define direct_cond_binary_optab_supported_p direct_optab_supported_p + #define direct_cond_ternary_optab_supported_p direct_optab_supported_p + #define direct_mask_load_optab_supported_p convert_optab_supported_p ++#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p + #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_gather_load_optab_supported_p convert_optab_supported_p + #define direct_len_load_optab_supported_p direct_optab_supported_p ++#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p + #define direct_mask_store_optab_supported_p convert_optab_supported_p + #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p +diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def +index d2d550d35..05fc50328 100644 +--- a/gcc/internal-fn.def ++++ b/gcc/internal-fn.def +@@ -121,6 +121,8 @@ along with GCC; see the file COPYING3. 
If not see + #endif + + DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) ++DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ maskprefetch, mask_prefetch) + DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) + DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + vec_mask_load_lanes, mask_load_lanes) +@@ -128,6 +130,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) + DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, + mask_gather_load, gather_load) ++DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ mask_gather_prefetch, gather_prefetch) + + DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load) + +diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc +index 2642df91e..222fe6465 100644 +--- a/gcc/ipa-pure-const.cc ++++ b/gcc/ipa-pure-const.cc +@@ -534,6 +534,7 @@ builtin_safe_for_const_function_p (bool *looping, tree callee) + *looping = false; + return true; + case BUILT_IN_PREFETCH: ++ case BUILT_IN_PREFETCH_FULL: + *looping = true; + return true; + default: +diff --git a/gcc/optabs.def b/gcc/optabs.def +index dbf529434..8ca25a5cc 100644 +--- a/gcc/optabs.def ++++ b/gcc/optabs.def +@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") + OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") + OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") + OPTAB_CD(maskload_optab, "maskload$a$b") ++OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") + OPTAB_CD(maskstore_optab, "maskstore$a$b") + OPTAB_CD(gather_load_optab, "gather_load$a$b") + OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") ++OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") + OPTAB_CD(scatter_store_optab, "scatter_store$a$b") + OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") + OPTAB_CD(vec_extract_optab, "vec_extract$a$b") +diff --git a/gcc/opts.cc b/gcc/opts.cc +index 2433ace06..432b822e8 100644 +--- a/gcc/opts.cc ++++ b/gcc/opts.cc +@@ -2108,6 +2108,13 @@ enable_fdo_optimizations (struct gcc_options *opts, + SET_OPTION_IF_UNSET (opts, opts_set, flag_tree_loop_distribution, value); + } + ++static void ++set_cache_misses_profile_params (struct gcc_options *opts, ++ struct gcc_options *opts_set) ++{ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); ++} ++ + /* Enable cfgo-related flags. */ + + static void +@@ -3143,10 +3150,20 @@ common_handle_option (struct gcc_options *opts, + /* FALLTHRU */ + case OPT_fauto_profile: + enable_fdo_optimizations (opts, opts_set, value); +- /* 2 is special and means flag_profile_correction trun on by +- -fauto-profile. */ ++ /* 2 is special and means flag_profile_correction trun on by ++ -fauto-profile. */ + SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, +- (value ? 2 : 0)); ++ (value ? 2 : 0)); ++ break; ++ ++ case OPT_fadditional_profile_: ++ opts->x_additional_profile_file = xstrdup (arg); ++ opts->x_flag_additional_profile = true; ++ value = true; ++ /* No break here - do -fadditional-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fadditional_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; + break; + + case OPT_fipa_struct_reorg_: +@@ -3155,17 +3172,36 @@ common_handle_option (struct gcc_options *opts, + case OPT_fipa_struct_reorg: + opts->x_flag_ipa_struct_reorg = value; + if (value && !opts->x_struct_layout_optimize_level) +- { +- /* Using the -fipa-struct-reorg option is equivalent to using +- -fipa-struct-reorg=1. 
*/ +- opts->x_struct_layout_optimize_level = 1; +- } ++ { ++ /* Using the -fipa-struct-reorg option is equivalent to using ++ -fipa-struct-reorg=1. */ ++ opts->x_struct_layout_optimize_level = 1; ++ } + break; + + case OPT_fipa_reorder_fields: + SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_reorg, value); + break; + ++ case OPT_fipa_extend_auto_profile: ++ opts->x_flag_ipa_extend_auto_profile = opts->x_flag_cache_misses_profile ++ ? true : value; ++ break; ++ ++ case OPT_fcache_misses_profile_: ++ opts->x_cache_misses_profile_file = xstrdup (arg); ++ opts->x_flag_cache_misses_profile = true; ++ value = true; ++ /* No break here - do -fcache-misses-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fcache_misses_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; ++ if (value) ++ { ++ set_cache_misses_profile_params (opts, opts_set); ++ } ++ break; ++ + case OPT_fcfgo_profile_generate_: + opts->x_profile_data_prefix = xstrdup (arg); + value = true; +diff --git a/gcc/params.opt b/gcc/params.opt +index e5472dfc8..e06e50611 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1262,4 +1262,66 @@ Range for depended ldp search in split-ldp-stp path. + Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization + Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . + ++-param=mem-access-ratio= ++Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization ++Memory access ratio (in percent). ++ ++-param=mem-access-num= ++Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization ++Memory access num. ++ ++-param=prefetch-offset= ++Common Joined UInteger Var(param_prefetch_offset) Init(1024) ++IntegerRange(1, 999999) Param Optimization ++Prefetch Offset, which is usually a power of two due to cache line size. ++ ++-param=branch-prob-threshold= ++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) ++Param Optimization ++High Execution Rate Branch Threshold. ++ ++-param=issue-topn= ++Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization ++Issue topn LLC mem_ref hint. ++ ++-param=force-issue= ++Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. ++ ++-param=llc-capacity-per-core= ++Common Joined UInteger Var(param_llc_capacity_per_core) Init(107) IntegerRange(0, 999999) Param ++LLC capacity per core. ++ ++-param=filter-kernels= ++Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param ++Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks ++through edges with branch probability no less than param_branch_prob_threshold. ++ ++-param=outer-loop-nums= ++Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param ++Maximum number of outer loops allowed to extend outer loops for loops that ++cannot recognize inner loop boundaries. ++ ++-param=llc-level= ++Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) ++Param Optimization ++Specifies the HBM cache level. ++ ++-param=filter-mode= ++Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param ++Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. 
++
++-param=transfer-footprint=
++Common Joined UInteger Var(param_transfer_footprint) Init(1) IntegerRange(0, 1) Param
++Allow transferring the first calculated footprint expression to a target memory
++reference from which the footprint cannot be retrieved.
++
++-param=llc-allocate-func-topn=
++Common Joined UInteger Var(param_llc_allocate_func_topn) Init(0) Param Optimization
++Number of top functions by PMU counts to be analyzed in LLC allocation.
++
++-param=llc-allocate-func-counts-threshold=
++Common Joined UInteger Var(param_llc_allocate_func_counts_threshold) Init(1) Param Optimization
++Threshold of per-function PMU counts to be analyzed in LLC allocation.
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/passes.def b/gcc/passes.def
+index 90643d533..49001adde 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -141,6 +141,7 @@ along with GCC; see the file COPYING3.  If not see
+ 
+   NEXT_PASS (pass_target_clone);
+   NEXT_PASS (pass_ipa_auto_profile);
++  NEXT_PASS (pass_ipa_extend_auto_profile);
+   NEXT_PASS (pass_ipa_tree_profile);
+   PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile)
+       NEXT_PASS (pass_feedback_split_functions);
+@@ -325,6 +326,7 @@ along with GCC; see the file COPYING3.  If not see
+       /* Run IVOPTs after the last pass that uses data-reference analysis
+ 	 as that doesn't handle TARGET_MEM_REFs.  */
+       NEXT_PASS (pass_iv_optimize);
++      NEXT_PASS (pass_llc_allocate);
+       NEXT_PASS (pass_lim);
+       NEXT_PASS (pass_tree_loop_done);
+   POP_INSERT_PASSES ()
+diff --git a/gcc/print-rtl.cc b/gcc/print-rtl.cc
+index 636113d5b..b7506514a 100644
+--- a/gcc/print-rtl.cc
++++ b/gcc/print-rtl.cc
+@@ -1579,6 +1579,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose)
+       op[1] = XEXP (x, 1);
+       op[2] = XEXP (x, 2);
+       break;
++    case PREFETCH_FULL:
++      fun = "prefetch_full";
++      op[0] = XEXP (x, 0);
++      op[1] = XEXP (x, 1);
++      op[2] = XEXP (x, 2);
++      break;
+     case UNSPEC:
+     case UNSPEC_VOLATILE:
+       {
+diff --git a/gcc/rtl.def b/gcc/rtl.def
+index 08e31fa35..78ec1a021 100644
+--- a/gcc/rtl.def
++++ b/gcc/rtl.def
+@@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA)
+    whose prefetch instructions do not support them.  */
+ DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA)
+ 
++/* Memory prefetch, with attributes supported on some targets.
++   Operand 1 is the address of the memory to fetch.
++   Operand 2 is 1 for a write access, 0 otherwise.
++   Operand 3 is the prfop level (0-7).
++
++   The attributes specified by operands 2 and 3 are ignored for targets
++   whose prefetch instructions do not support them.  */
++DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA)
++
+ /* ----------------------------------------------------------------------
+    At the top level of an instruction (perhaps under PARALLEL).
+    ---------------------------------------------------------------------- */
+diff --git a/gcc/rtl.h b/gcc/rtl.h
+index a0db225cb..844e1a7c3 100644
+--- a/gcc/rtl.h
++++ b/gcc/rtl.h
+@@ -2814,6 +2814,10 @@ do { \
+ #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \
+   (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil)
+ 
++/* True if RTX is flagged to be a scheduling barrier.  */
++#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \
++  (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL)->volatil)
++
+ /* Indicate whether the machine has any sort of auto increment addressing.
+    If not, we can avoid checking for REG_INC notes.  */
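To make the interaction of --param llc-allocate-func-topn and --param llc-allocate-func-counts-threshold (added to params.opt above) concrete, here is a standalone sketch of the cutoff computation performed by calc_topn_function_total_count_thres earlier in this patch. The sketch is illustrative only: the function name is invented, and it uses a std::multiset so duplicate totals are kept, whereas the patch itself uses std::set.

    #include <cstdint>
    #include <set>
    #include <vector>

    /* Keep the TOPN largest per-function sample totals; the smallest of
       them becomes the cutoff, unless fewer than TOPN functions were
       sampled or the cutoff does not exceed the fixed threshold.  */
    int64_t
    topn_cutoff (const std::vector<int64_t> &counts, unsigned topn,
                 int64_t fixed_threshold)
    {
      std::multiset<int64_t> best;
      for (int64_t c : counts)
        {
          if (best.size () < topn)
            best.insert (c);
          else if (!best.empty () && *best.begin () < c)
            {
              best.erase (best.begin ());  /* Drop the smallest of the N.  */
              best.insert (c);
            }
        }
      if (!best.empty () && best.size () == topn
          && fixed_threshold < *best.begin ())
        return *best.begin ();
      return fixed_threshold;
    }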
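Since the rtl.def entry above only names the prfop operand, here is a short restatement (matching the comments in expand_mask_prefetch_optab_fn and expand_gather_prefetch_optab_fn earlier in this patch, not new logic) of how an svprfop-style encoding splits into the RTL access and locality attributes:

    #include <stdio.h>

    int
    main (void)
    {
      /* Bit 3 of the prfop selects stores over loads; bits 1 and 2 hold
         the 0-based svprfop locality, which RTL PREFETCH expresses
         1-based.  E.g. PLDL1KEEP (0) -> load, locality 1; PSTL3STRM (13)
         -> store, locality 3.  */
      for (int prfop = 0; prfop <= 13; prfop++)
        {
          int is_store = (prfop & 8) != 0;
          int locality = ((prfop >> 1) & 3) + 1;
          printf ("prfop %2d -> %s, locality %d\n",
                  prfop, is_store ? "store" : "load", locality);
        }
      return 0;
    }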
*/
+
+diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc
+index c436c640c..7f5646ce7 100644
+--- a/gcc/rtlanal.cc
++++ b/gcc/rtlanal.cc
+@@ -1198,6 +1198,7 @@ reg_referenced_p (const_rtx x, const_rtx body)
+      return reg_overlap_mentioned_p (x, TRAP_CONDITION (body));
+
+    case PREFETCH:
++    case PREFETCH_FULL:
+      return reg_overlap_mentioned_p (x, XEXP (body, 0));
+
+    case UNSPEC:
+@@ -2042,6 +2043,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data)
+      return;
+
+    case PREFETCH:
++    case PREFETCH_FULL:
+      (*fun) (&XEXP (body, 0), data);
+      return;
+
+diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
+index 948aa0c3b..db453fb9b 100644
+--- a/gcc/sched-deps.cc
++++ b/gcc/sched-deps.cc
+@@ -2705,7 +2705,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn)
+      break;
+
+    case PREFETCH:
+-      if (PREFETCH_SCHEDULE_BARRIER_P (x))
++    case PREFETCH_FULL:
++      if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x))
++	  || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x)))
+	reg_pending_barrier = TRUE_BARRIER;
+      /* Prefetch insn contains addresses only.  So if the prefetch
+	 address has no registers, there will be no dependencies on
+diff --git a/gcc/target-insns.def b/gcc/target-insns.def
+index de8c0092f..9cfa19475 100644
+--- a/gcc/target-insns.def
++++ b/gcc/target-insns.def
+@@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1))
+ DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2))
+ DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2))
+ DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2))
++DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2))
+ DEF_TARGET_INSN (probe_stack, (rtx x0))
+ DEF_TARGET_INSN (probe_stack_address, (rtx x0))
+ DEF_TARGET_INSN (prologue, (void))
+diff --git a/gcc/target.def b/gcc/target.def
+index 142858fa3..646489540 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2064,6 +2064,37 @@ it is for the vector version.",
+ (vec_info *vinfo, bool costing_for_scalar),
+ default_vectorize_create_costs)
+
++/* Function for vector prefetch operation.  */
++DEFHOOK
++(code_for_prefetch,
++ "This hook should return the @code{insn_code} of the target instruction\n\
++pattern that implements a contiguous prefetch for data of mode @var{arg}.\n\
++It is consulted when LLC prefetch hints are issued.  A target that\n\
++provides no such pattern should leave the hook undefined; its default\n\
++is @code{NULL}.",
++ insn_code, (machine_mode arg),
++ NULL)
++
++/* Function for vector gather prefetch operation.  */
++DEFHOOK
++(code_for_gather_prefetch,
++ "This hook should return the @code{insn_code} of the target instruction\n\
++pattern that implements a gather prefetch, prefetching elements of mode\n\
++@var{mode_to} addressed by offset vectors of mode @var{mode_form}.  A\n\
++target that provides no such pattern should leave the hook undefined;\n\
++its default is @code{NULL}.",
++ insn_code, (machine_mode mode_to, machine_mode mode_form),
++ NULL)
++
++/* Function to check whether the target hardware architecture supports
++   a full SVE data vector mode.
*/ ++DEFHOOK ++(prefetch_handleable_mode_p, ++ "This hook should return true if the target hardware architecture\n\ ++supports a full SVE data vector mode.", ++ bool, (machine_mode arg), ++ NULL) ++ + HOOK_VECTOR_END (vectorize) + + #undef HOOK_PREFIX +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..1793ba9d1 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 1997-2022 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib g++-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +new file mode 100644 +index 000000000..b5bf69510 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +@@ -0,0 +1,52 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */ ++#include "multidim_array.h" ++ ++class Input ++{ ++ public: ++ int metadata_offset = 13; ++ int exp_nr_images = 1; ++ MultidimArray exp_Mweight; ++ void convertAllSquaredDifferencesToWeights(); ++}; ++ ++int main() ++{ ++ clock_t start = clock(); ++ Input input; ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; ++i) ++ { ++ input.convertAllSquaredDifferencesToWeights(); ++ } ++ return 0; ++} ++ ++void Input::convertAllSquaredDifferencesToWeights() ++{ ++ for (int img_id = 0; img_id < exp_nr_images; img_id++) ++ { ++ int my_metadata_offset = metadata_offset + img_id; ++ MultidimArray sorted_weight; ++ ++ exp_Mweight.getRow(img_id, sorted_weight); ++ long int np = 0; ++ FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight) ++ { ++ if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.) 
++ { ++ DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \ ++ sorted_weight, n); ++ np++; ++ } ++ } ++ } ++} ++ ++ ++ ++/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +new file mode 100644 +index 000000000..682f24703 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +@@ -0,0 +1,186 @@ ++#ifndef MULTIDIM_ARRAY_H ++#define MULTIDIM_ARRAY_H ++ ++#include ++ ++#define RELION_ALIGNED_MALLOC malloc ++#define RELION_ALIGNED_FREE free ++ ++#define STARTINGX(v) ((v).xinit) ++#define STARTINGY(v) ((v).yinit) ++#define NZYXSIZE(v) ((v).nzyxdim) ++ ++#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)]) ++#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \ ++ for (long int n=0; n ++class MultidimArray ++{ ++public: ++ T* data; ++ bool destroyData; ++ long int ndim; ++ long int zdim; ++ long int ydim; ++ long int xdim; ++ long int yxdim; ++ long int zyxdim; ++ long int nzyxdim; ++ long int zinit; ++ long int yinit; ++ long int xinit; ++ long int nzyxdimAlloc; ++ ++public: ++ void clear() ++ { ++ coreDeallocate(); ++ coreInit(); ++ } ++ ++ void coreInit() ++ { ++ xdim=0; ++ yxdim=0; ++ zyxdim=0; ++ nzyxdim=0; ++ ydim=1; ++ zdim=1; ++ ndim=1; ++ zinit=0; ++ yinit=0; ++ xinit=0; ++ data=NULL; ++ nzyxdimAlloc = 0; ++ destroyData=true; ++ } ++ ++ void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim) ++ { ++ if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0) ++ { ++ clear(); ++ return; ++ } ++ ++ ndim=_ndim; ++ zdim=_zdim; ++ ydim=_ydim; ++ xdim=_xdim; ++ yxdim=ydim*xdim; ++ zyxdim=zdim*yxdim; ++ nzyxdim=ndim*zyxdim; ++ ++ coreAllocate(); ++ } ++ ++ void coreAllocate() ++ { ++ data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim); ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void coreDeallocate() ++ { ++ if (data != NULL && destroyData) ++ { ++ RELION_ALIGNED_FREE(data); ++ } ++ data=NULL; ++ nzyxdimAlloc = 0; ++ } ++ ++ void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim) ++ { ++ if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL) ++ { ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ return; ++ } ++ ++ if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0) ++ { ++ clear(); ++ return; ++ } ++ ++ if (NZYXSIZE(*this) > 0 && data == NULL) ++ { ++ coreAllocate(); ++ return; ++ } ++ ++ size_t YXdim=Ydim*Xdim; ++ size_t ZYXdim=Zdim*YXdim; ++ size_t NZYXdim=Ndim*ZYXdim; ++ ++ T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim); ++ for (long int l = 0; l < Ndim; l++) ++ for (long int k = 0; k < Zdim; k++) ++ for (long int i = 0; i < Ydim; i++) ++ for (long int j = 0; j < Xdim; j++) ++ { ++ T val; ++ new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val; ++ } ++ coreDeallocate(); ++ ++ data = new_data; ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void resize(long int Xdim) ++ { ++ resize(1, 1, 1, Xdim); ++ } ++ ++ inline T& operator()(long int i, long int j) const ++ { ++ return A2D_ELEM(*this, i, j); ++ } ++ ++ inline T& operator()(long int i) const ++ { ++ return 
A1D_ELEM(*this, i); ++ } ++ ++ void getRow(long int i, MultidimArray& v) const ++ { ++ if (xdim == 0 || ydim == 0) ++ { ++ v.clear(); ++ return; ++ } ++ ++ v.resize(xdim); ++ for (long int j = 0; j < xdim; j++) ++ v(j) = (*this)(i, j); ++ } ++}; ++ ++#endif /* MULTIDIM_ARRAY_H */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +new file mode 100644 +index 000000000..091e654f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2 --param branch-prob-threshold=50 --param filter-mode=0" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 100000 ++ ++int A_i[N]; ++int A_j[N]; ++double A_data[N]; ++double x_data[N]; ++double y_data[N]; ++int num_rows = N; ++ ++void ++MatMult (int *A_i, int *A_j, double *A_data, double *x_data, ++ int num_rows, double *y_data) ++{ ++ int i = 0; ++ int j = 0; ++ double temp = 0; ++ for (i = 0; i < num_rows; i++) ++ { ++ temp = y_data[i]; ++ for (j = A_i[i]; j < A_i[i+1]; j++) ++ temp += A_data[j] * x_data[A_j[j]]; ++ y_data[i] = temp; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; i++) ++ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tx_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_j\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..05a3bf842 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3.  If not see
++# <http://www.gnu.org/licenses/>.
++
++load_lib gcc-dg.exp
++load_lib target-supports.exp
++
++# Initialize `dg'.
++dg-init
++
++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
++	"" "-fllc-allocate"
++
++# All done.
++dg-finish
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+new file mode 100644
+index 000000000..113acbceb
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+@@ -0,0 +1,36 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c --param=mem-access-ratio=1 --param=mem-access-num=0" } */
++
++/* In this DejaGnu test case, we test how Phases 2 and 3 of the llc-allocate
++   pass deal with an indirect memory access in a nested loop where the
++   use-block of the induction variable of this memory access is a
++   child/descendant of its def-block (we arrange this by defining the
++   induction variable in the outer loop).  Therefore, the reference can be
++   successfully traced after outer-loop analysis.  */
++#include <stdlib.h>
++#include <time.h>
++
++void cross_bb_indir_mem_acc (int *arr1, int *arr2, int *arr3, int *arr4, int n) {
++  srand (time (NULL));
++
++  int j_s;
++  int j_e = arr1[0];
++  int k;
++
++  for (int i = 0; i < n; i++)
++    {
++      j_s = j_e;
++      j_e = arr1[i + 1];
++
++      k = arr3[i];
++
++      for (int j = j_s; j < j_e; j++)
++	{
++	  arr4[j] -= arr2[k];
++	}
++
++    }
++}
++
++/* { dg-final { scan-tree-dump "Unhandled indirect memory access tracing."
"llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "Retrace indirect memory access after outer loop analysis:" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +new file mode 100644 +index 000000000..a2e7f66a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++#include ++#define N 131590 ++#define F 384477 ++ ++int ownStartPtr[F]; ++double bPrimePtr[N]; ++double diagPtr[N]; ++double psiPtr[N]; ++double upperPtr[F]; ++double lowerPtr[F]; ++int uPtr[F]; ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); ++ ++int main(int argc, char *argv[]) ++{ ++ int nCells = N; ++ int nFaces = F; ++ int testIter = 2; ++ for (int i = 0; i < testIter; i++) ++ { ++ SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); ++ } ++ return 0; ++} ++ ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) ++{ ++ double psii; ++ int fStart; ++ int fEnd = ownStartPtr[0]; ++ ++ for (int celli = 0; celli < nCells; celli++) ++ { ++ fStart = fEnd; ++ fEnd = ownStartPtr[celli + 1]; ++ psii = bPrimePtr[celli]; ++ ++ for (int facei = fStart; facei ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++branch_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ ApsiPtr[cell] = 0; ++ else ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 100; ++ ++ for (int i=0; i ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++break_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ break; ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 2; ++ ++ for (int i=0; i ++ ++#define N 131 ++ ++double diagPtr[N]; ++int psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++goto_in_loop (double *diagPtr, int *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cellnodes; ++ while (v > 1) ++ { ++ basic_block bb = di->dfs_to_bb[v]; ++ edge e; ++ ++ par = di->dfs_parent[v]; ++ k = v; ++ ++ ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); ++ ++ if (reverse) ++ { ++ /* If this block has a fake edge to exit, process that first. */ ++ if (bitmap_bit_p (di->fake_exit_edge, bb->index)) ++ { ++ einext = ei; ++ einext.index = 0; ++ goto do_fake_exit_edge; ++ } ++ } ++ ++ /* Search all direct predecessors for the smallest node with a path ++ to them. That way we have the smallest node with also a path to ++ us only over nodes behind us. In effect we search for our ++ semidominator. */ ++ while (!ei_end_p (ei)) ++ { ++ basic_block b; ++ TBB k1; ++ ++ e = ei_edge (ei); ++ b = (reverse) ? 
e->dest : e->src; ++ einext = ei; ++ ei_next (&einext); ++ ++ if (b == en_block) ++ { ++ do_fake_exit_edge: ++ k1 = di->dfs_order[last_basic_block]; ++ } ++ else ++ k1 = di->dfs_order[b->index]; ++ ++ /* Call eval() only if really needed. If k1 is above V in DFS tree, ++ then we know, that eval(k1) == k1 and key[k1] == k1. */ ++ if (k1 > v) ++ k1 = di->key[eval (di, k1)]; ++ if (k1 < k) ++ k = k1; ++ ++ ei = einext; ++ } ++ ++ di->key[v] = k; ++ link_roots (di, par, v); ++ di->next_bucket[v] = di->bucket[k]; ++ di->bucket[k] = v; ++ ++ /* Transform semidominators into dominators. */ ++ for (w = di->bucket[par]; w; w = di->next_bucket[w]) ++ { ++ k = eval (di, w); ++ if (di->key[k] < di->key[w]) ++ di->dom[w] = k; ++ else ++ di->dom[w] = par; ++ } ++ /* We don't need to cleanup next_bucket[]. */ ++ di->bucket[par] = 0; ++ v--; ++ } ++ ++ /* Explicitly define the dominators. */ ++ di->dom[1] = 0; ++ for (v = 2; v <= di->nodes; v++) ++ if (di->dom[v] != di->key[v]) ++ di->dom[v] = di->dom[di->dom[v]]; ++} ++ ++/* { dg-final { scan-tree-dump-times "Warning: Find cycle at bb index" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "static issue" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +new file mode 100644 +index 000000000..e18725f60 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +new file mode 100644 +index 000000000..328dc57bc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +new file mode 100644 +index 000000000..d9c919869 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +new file mode 100644 +index 000000000..806366b5b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +new file mode 100644 +index 000000000..91567d1e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +@@ -0,0 +1,16 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate 
-fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main() ++{ ++ for(int i = 0; i < 100000; i++) ++ { ++ __builtin_prefetch_full(&val[i], 0, 3); ++ val[i] = i + 1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +new file mode 100644 +index 000000000..c28150654 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,4); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +new file mode 100644 +index 000000000..e8d9c8693 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +new file mode 100644 +index 000000000..b0281882f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +new file mode 100644 +index 000000000..26807556f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +new file mode 100644 +index 
000000000..4f2def13d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +new file mode 100644 +index 000000000..ecc501f1f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +new file mode 100644 +index 000000000..d140f1ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +new file mode 100644 +index 000000000..d6f170253 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,3); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +new file mode 100644 +index 000000000..8da092b36 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,4); ++ val[i]=i+1; ++ } 
++} ++ ++/* { dg-final { scan-assembler "PSTL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +new file mode 100644 +index 000000000..4cf65188a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +new file mode 100644 +index 000000000..36f4a3aa0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +new file mode 100644 +index 000000000..43d2d41d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +new file mode 100644 +index 000000000..ba90e7ea4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++#include ++ ++#define N 1000 ++ ++long a[N] = {0}; ++long b[N] = {0}; ++long c[N] = {0}; ++ ++double ++referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) ++{ ++ double sum; ++ for (int cell = 0; cell < nCells; cell++) ++ { ++ // Multi-layer pointer ++ sum += psiPtr[lPtr[cell]]; ++ psiPtr[uPtr[cell]] = sum; ++ ++ // Outer pointer, inner array ++ sum += psiPtr[b[cell]]; ++ psiPtr[a[cell]] = sum; ++ ++ // Multi-layer array ++ sum += a[b[cell]]; ++ c[a[cell]] = sum; ++ ++ // Outer array, inner pointer ++ sum += a[lPtr[cell]]; ++ c[lPtr[cell]] = sum; ++ } ++ return sum; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ double *psiPtr = NULL; ++ int *lPtr = NULL; ++ int *uPtr = NULL; ++ psiPtr = (double *) calloc (N, sizeof(double)); ++ 
lPtr = (int *) calloc (N, sizeof(int)); ++ uPtr = (int *) calloc (N, sizeof(int)); ++ ++ for (int i = 0; i < testIter; i++) ++ referenceTrace (psiPtr, lPtr, uPtr, N); ++ ++ free (psiPtr); ++ free (lPtr); ++ free (uPtr); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +new file mode 100644 +index 000000000..b0f68ebe3 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -0,0 +1,211 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-mode=0" } ++ ++program main ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt ++ ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts ++ ++ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, t0, smdiv ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch,iter ++ ++ LOGICAL :: non_hydrostatic ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*36/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 3 ++ rk_order = 1 ++ dts = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ step = 1 ++ non_hydrostatic = .true. ++ ++ call random_number(random1) ++ interval = random1*100 ++ interval=1 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(alt) ++ call random_number(c2a) ++ call random_number(ph) ++ call random_number(pm1) ++ call random_number(mu) ++ call random_number(muts) ++ call random_number(dnw) ++ call random_number(rdnw) ++ call random_number(znu) ++ ++ do iter=1,2 ++ call calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ enddo ++ ++end program ++ ++ ++SUBROUTINE calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ IMPLICIT NONE ! religion first ++ !asb ++! 
declarations for the stuff coming in ++ ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & ++ p ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & ++ t_2, & ++ t_1, & ++ c2a ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 ++ ++ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & ++ muts ++ ++ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & ++ rdnw, & ++ znu ++ ++ REAL, INTENT(IN ) :: t0, smdiv ++ ++ LOGICAL, INTENT(IN ) :: non_hydrostatic ++ ++! local variables ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ REAL :: ptmp ++ ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = min(kte,kde-1) ++ ++ IF (non_hydrostatic) THEN ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ++! al computation is all dry, so ok with moisture ++ ++ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & ++ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) ++ ++! this is temporally linearized p, no moisture correction needed ++ ++ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ ELSE ! hydrostatic calculation ++ ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ p(i,k,j)=mu(i,j)*znu(k) ++ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) ++ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & ++ +mu(i,j)*alt(i,k,j)) ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ END IF ++ ++! divergence damping setup ++ ++ IF (step == 0) then ! we're initializing small timesteps ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ pm1(i,k,j)=p(i,k,j) ++ ENDDO ++ ENDDO ++ ENDDO ++ ELSE ! we're in the small timesteps ++ DO j=j_start, j_end ! and adding div damping component ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ptmp = p(i,k,j) ++ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) ++ pm1(i,k,j) = ptmp ++ ENDDO ++ ENDDO ++ ENDDO ++ END IF ++ ++END SUBROUTINE calc_p_rho ++ ++! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing succeeded" 46 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 3, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tpm1\\t\\(0.000000, 3, 2, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tph\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tal\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\talt\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..13d225f35 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,29 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++load_lib gfortran-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Main loop. ++gfortran-dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +new file mode 100644 +index 000000000..501e6e74c +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +@@ -0,0 +1,62 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } ++ ++MODULE INPUT ++ IMPLICIT NONE ++ ++ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 ++ ++ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 ++ REAL(wp), DIMENSION(jpi, jpj) :: e12t ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n ++ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta ++ ++END MODULE INPUT ++ ++PROGRAM MAIN ++ USE INPUT ++ ++ IMPLICIT NONE ++ ++ INTEGER :: EPOCH ++ ++! Initialize arrays ++ ++ e12t = 1 ++ fse3t_n = 1 ++ pta = 1 ++! ++ ++ DO EPOCH=1,2 ++ CALL tra_ldf_iso ++ ENDDO ++ ++END PROGRAM MAIN ++ ++SUBROUTINE tra_ldf_iso ++ USE INPUT ++ ++ IMPLICIT NONE ++ ! ++ INTEGER :: ji, jj, jk, jn ! dummy loop indices ++ REAL(wp) :: zbtr, ztra ! - - ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw ++ ++ DO jn = 1, kjpt ++ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 ++ ++ DO jk = 1, jpkm1 ++ DO jj = 2, jpjm1 ++ DO ji = fs_2, fs_jpim1 ! vector opt. ++ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) ++ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr ++ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra ++ END DO ++ END DO ++ END DO ++ ! ++ END DO ++ ! ++END SUBROUTINE tra_ldf_iso ++ ++! 
{ dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +new file mode 100644 +index 000000000..7345759db +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -0,0 +1,58 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } ++ ++Module module_domain ++ IMPLICIT NONE ++ ++ REAL, PARAMETER :: g = 9.8 ++ TYPE :: grid_type ++ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) ++ REAL, POINTER :: fnm(:), fnp(:) ++ END TYPE ++END Module ++ ++SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) ++ ++ USE module_domain ++ !USE module_model_constants ++ ++ IMPLICIT NONE ++ ++ ++ !TYPE (domain), INTENT(IN) :: grid ++ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy ++ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w ++ ++ ++ INTEGER :: k ++ REAL :: z0, z1, z2, w1, w2 ++ REAL, DIMENSION(k_start:k_end) :: z_at_w ++ REAL, DIMENSION(k_start:k_end-1) :: z ++ TYPE (grid_type), POINTER :: grid ++ ++ ++ DO k = k_start, k_end ++ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g ++ END DO ++ ++ DO k = k_start, k_end-1 ++ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) ++ END DO ++ ++ DO k = k_start+1, k_end-1 ++ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & ++ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) ++ END DO ++ ++ z0 = z_at_w(k_start) ++ z1 = z(k_start) ++ z2 = z(k_start+1) ++ w1 = (z0 - z2)/(z1 - z2) ++ w2 = 1. - w1 ++ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & ++ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) ++ ++END SUBROUTINE calc_p8w ++ ++! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +new file mode 100644 +index 000000000..f79df5d26 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +@@ -0,0 +1,320 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=branch-prob-threshold=50 --param=filter-kernels=0 --param=mem-access-num=2 --param=issue-topn=2 --param=force-issue=1 --param=outer-loop-nums=3" } ++!include "module_small_step_em.F90" ++ ++Module add_type ++ IMPLICIT NONE ++ ++ TYPE :: grid_config_rec_type ++ LOGICAL :: open_xs ++ LOGICAL :: open_ys ++ LOGICAL :: open_xe ++ LOGICAL :: open_ye ++ LOGICAL :: symmetric_xs ++ LOGICAL :: symmetric_xe ++ LOGICAL :: symmetric_ys ++ LOGICAL :: symmetric_ye ++ LOGICAL :: polar ++ LOGICAL :: nested ++ LOGICAL :: periodic_x ++ LOGICAL :: specified ++ END TYPE ++END Module ++ ++program main ++ ++ ++! include "module_small_step_em_modify.F90" ++ ++! use module_small_step_em ++! 
use module_small_step_em_modify ++ ++ use add_type ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step, spec_zone ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme, 1:8) :: llcRefresh ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u, v, u_1, v_1, t_1, ww_1, ft!u, v, u_1, v_1, w_1, t_1, ww1, ww_1,ph_1, ft ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_save, v_save, w_save, t_save, ph_save,h_diabatic ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_2, v_2, w_2, t_2, ph_2 ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: c2a, ww_save, cqw, cqu, cqv, alpha, gamma, a ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ww!pb, p, ph, php, pm1, al, alt, ww, random_array ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ru_tend, rv_tend ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t, t_ave, uam, vam, wwam ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_1,mu_2, mu ++ REAL, DIMENSION(ims:ime, jms:jme) :: mub, muu, muv, mut, & ++ msfux, msfuy, & ++ msfvx, msfvx_inv, msfvy, & ++ msftx, msfty ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: muus, muvs, muts, mudf, muave ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_save, mu_tend ++ ++ REAL, DIMENSION(kms:kme) :: rdn, rdnw,dnw, fnm, fnp, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, cf1, cf2, cf3, t0, emdiv, smdiv, epssm, g ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch ++ ++ LOGICAL :: non_hydrostatic, top_lid ++ ++ ++ TYPE (grid_config_rec_type) :: config_flags ++ config_flags%open_xs = .true. ++ config_flags%open_ys = .true. ++ config_flags%open_xe = .true. ++ config_flags%open_ye = .true. ++ config_flags%symmetric_xs = .true. ++ config_flags%symmetric_xe = .true. ++ config_flags%symmetric_ys = .true. ++ config_flags%symmetric_ye = .true. ++ config_flags%polar = .true. ++ config_flags%nested = .true. ++ config_flags%periodic_x = .true. ++ config_flags%specified = .true. ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*98/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 1 ++ rk_order = 1 ++ dts = 1. ++ epssm = 1. ++ g = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ dts = 1. ++ cf1 = 1. ++ cf2 = 1. ++ cf3 = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ emdiv = 1. ++ step = 1 ++ spec_zone = 1 ++ ++ non_hydrostatic = .true. ++ top_lid = .true. 
++ ++ interval=1 ++ ++ ++ total_time=0 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(u) ++ call random_number(v) ++ call random_number(u_1) ++ call random_number(v_1) ++ call random_number(t_1) ++ call random_number(ft) ++ ++ call random_number(ww) ++ call random_number(ww_1) ++ call random_number(t) ++ call random_number(t_ave) ++ call random_number(uam) ++ call random_number(vam) ++ call random_number(wwam) ++ ++ call random_number(muu) ++ call random_number(muv) ++ call random_number(mut) ++ call random_number(msfux) ++ call random_number(msfuy) ++ call random_number(msfvx) ++ call random_number(msfvx_inv) ++ call random_number(msfvy) ++ call random_number(msftx) ++ call random_number(msfty) ++ call random_number(mu_tend) ++ ++ call random_number(muave) ++ call random_number(muts) ++ call random_number(mudf) ++ call random_number(mu) ++ ++ call random_number(fnm) ++ call random_number(fnp) ++ call random_number(dnw) ++ call random_number(rdnw) ++ ++ DO j=jms, jme ++ DO k=kms, kme ++ DO i=ims, ime ++ ++ llcRefresh(i,k,j,1)=i+k+j+7 ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ do epoch = 1,2 ++ call advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ enddo ++end program ++ ++ ++ ++SUBROUTINE advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ use add_type ++ ++ IMPLICIT NONE ! religion first ++ ++ ! 
stuff coming in ++ ++ TYPE(grid_config_rec_type), INTENT(IN ) :: config_flags ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(IN ) :: & ++ u, & ++ v, & ++ u_1, & ++ v_1, & ++ t_1, & ++ ft ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(INOUT) :: & ++ ww, & ++ ww_1, & ++ t, & ++ t_ave, & ++ uam, & ++ vam, & ++ wwam ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(IN ) :: muu, & ++ muv, & ++ mut, & ++ msfux,& ++ msfuy,& ++ msfvx,& ++ msfvx_inv,& ++ msfvy,& ++ msftx,& ++ msfty,& ++ mu_tend ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT( INOUT) :: muave, & ++ muts, & ++ mudf ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(INOUT) :: mu ++ ++ REAL, DIMENSION( kms:kme ), INTENT(IN ) :: fnm, & ++ fnp, & ++ dnw, & ++ rdnw ++ ++ ++ REAL, INTENT(IN ) :: rdx, & ++ rdy, & ++ dts, & ++ epssm ++ ++ REAL, DIMENSION (its:ite, kts:kte) :: wdtn, dvdxi ++ REAL, DIMENSION (its:ite) :: dmdt ++ ++ INTEGER :: i,j,k, i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ REAL :: acc ++ ++ INTEGER :: ubv, lbv, t1, t2, t3, t4, ceild, floord ++ ++ ceild(t1, t2) = ceiling(REAL(t1)/REAL(t2)) ++ floord(t1, t2) = floor(REAL(t1)/REAL(t2)) ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = kte-1 ++ IF ( .NOT. config_flags%periodic_x )THEN ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ i_start = max(its,ids+1) ++ i_end = min(ite,ide-2) ++ ENDIF ++ ENDIF ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ j_start = max(jts,jds+1) ++ j_end = min(jte,jde-2) ++ ENDIF ++ ++ i_endu = ite ++ j_endv = jte ++ ++ DO j = j_start, j_end ++ ++ DO i=i_start, i_end ++ dmdt(i) = 0. ++ ENDDO ++ ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ dvdxi(i,k) = msftx(i,j)*msfty(i,j)*( & ++ rdy*((v(i,k,j+1)+muv(i,j+1)*v_1(i,k,j+1)*msfvx_inv(i,j+1)) & ++ -(v(i,k,j )+muv(i,j )*v_1(i,k,j)*msfvx_inv(i,j ))) & ++ +rdx*((u(i+1,k,j)+muu(i+1,j)*u_1(i+1,k,j)/msfuy(i+1,j)) & ++ -(u(i,k,j )+muu(i ,j)*u_1(i,k,j )/msfuy(i,j)) )) ++ dmdt(i) = dmdt(i) + dnw(k)*dvdxi(i,k) ++ ENDDO ++ ENDDO ++ DO i=i_start, i_end ++ muave(i,j) = mu(i,j) ++ mu(i,j) = mu(i,j)+dts*(dmdt(i)+mu_tend(i,j)) ++ mudf(i,j) = (dmdt(i)+mu_tend(i,j)) ! save tendency for div dampfilter ++ muts(i,j) = mut(i,j)+mu(i,j) ++ muave(i,j) =.5*((1.+epssm)*mu(i,j)+(1.-epssm)*muave(i,j)) ++ ENDDO ++ ENDDO ++END SUBROUTINE advance_mu_t_fortran_plu ++ ++! { dg-final { scan-tree-dump "issue_llc_hint" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "analyze_nested_kernels" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump "Stop tracing the outer loop depth" "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 36c3e7d5a..14129a500 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -84,6 +84,7 @@ DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") + DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") + DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") ++DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") + DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream decompression") + DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream compression") +@@ -215,6 +216,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") + DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") + DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") + DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") ++DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") + DEFTIMEVAR (TV_PREDCOM , "predictive commoning") + DEFTIMEVAR (TV_TREE_CH , "tree copy headers") + DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") +diff --git a/gcc/toplev.cc b/gcc/toplev.cc +index f00a166df..bdbd4de63 100644 +--- a/gcc/toplev.cc ++++ b/gcc/toplev.cc +@@ -567,6 +567,12 @@ compile_file (void) + targetm.asm_out.output_ident (ident_str); + } + ++ /* Extend auto profile finalization. */ ++ if (flag_ipa_extend_auto_profile) ++ { ++ free_extend_profile_info (); ++ } ++ + /* Auto profile finalization. */ + if (flag_auto_profile) + end_auto_profile (); +diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc +index d33aaec8c..40f67a8ed 100644 +--- a/gcc/tree-cfg.cc ++++ b/gcc/tree-cfg.cc +@@ -8476,6 +8476,17 @@ print_loops (FILE *file, int verbosity) + print_loop_and_siblings (file, bb->loop_father, 0, verbosity); + } + ++/* Dump a loop to file. */ ++ ++void ++loop_dump (FILE *file, class loop *loop) ++{ ++ print_loop (file, loop, 0, 0); ++ fprintf (file, "vec_niter = "); ++ print_generic_expr (file, loop->vec_nb_iterations); ++ fprintf (file, "\n"); ++} ++ + /* Dump a loop. 
*/ + + DEBUG_FUNCTION void +diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h +index bfe44c073..0982fa7cf 100644 +--- a/gcc/tree-cfg.h ++++ b/gcc/tree-cfg.h +@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); + extern void debug_function (tree, dump_flags_t); + extern void print_loops_bb (FILE *, basic_block, int, int); + extern void print_loops (FILE *, int); ++extern void loop_dump (FILE *file, class loop *loop); + extern void debug (class loop &ref); + extern void debug (class loop *ptr); + extern void debug_verbose (class loop &ref); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index a98f84397..468353d13 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -395,6 +395,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); +@@ -536,6 +537,8 @@ extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * + ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context ++ *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); +diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc +index 44157265c..4c014fb23 100644 +--- a/gcc/tree-scalar-evolution.cc ++++ b/gcc/tree-scalar-evolution.cc +@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) + the loop body has been executed 6 times. */ + + tree +-number_of_latch_executions (class loop *loop) ++number_of_latch_executions (class loop *loop, bool guarantee) + { + edge exit; + class tree_niter_desc niter_desc; +@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) + res = chrec_dont_know; + exit = single_exit (loop); + +- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) ++ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, ++ true, NULL, guarantee)) + { + may_be_zero = niter_desc.may_be_zero; + res = niter_desc.niter; +@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) + fprintf (dump_file, "))\n"); + } + +- loop->nb_iterations = res; ++ if (guarantee) ++ loop->nb_iterations = res; + return res; + } + +diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h +index 0f90207bc..dc27d9545 100644 +--- a/gcc/tree-scalar-evolution.h ++++ b/gcc/tree-scalar-evolution.h +@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. 
If not see + #ifndef GCC_TREE_SCALAR_EVOLUTION_H + #define GCC_TREE_SCALAR_EVOLUTION_H + +-extern tree number_of_latch_executions (class loop *); ++extern tree number_of_latch_executions (class loop *, ++ bool guarantee = true); + extern gcond *get_loop_exit_condition (const class loop *); + + extern void scev_initialize (void); +diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc +new file mode 100644 +index 000000000..da6d72b94 +--- /dev/null ++++ b/gcc/tree-ssa-llc-allocate.cc +@@ -0,0 +1,4150 @@ ++/* LLC allocate. ++ Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_VECTOR ++#define INCLUDE_LIST ++#define INCLUDE_ALGORITHM ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "gimple.h" ++#include "predict.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "optabs-query.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "stor-layout.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "gimplify-me.h" ++#include "tree-ssa-loop-ivopts.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop-niter.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfgloop.h" ++#include "tree-scalar-evolution.h" ++#include "langhooks.h" ++#include "tree-inline.h" ++#include "tree-data-ref.h" ++#include "diagnostic-core.h" ++#include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "internal-fn.h" ++#include "tree-cfg.h" ++#include "profile-count.h" ++#include "auto-profile.h" ++ ++/* Number of parallel cores. */ ++const unsigned int PARALLEL_NUM = 304; ++ ++/* Indirect access weight. */ ++const unsigned int INDIRECT_ACCESS_VALUE = 3; ++ ++/* Write memory weight. */ ++const unsigned int WRITE_COST = 4; ++ ++/* Maximum ratio of total prefetch data size to cache size. */ ++const double PREFETCH_CACHE_SIZE_RATIO = 0.8; ++ ++/* Prefetch tool input max length. */ ++#ifndef PREFETCH_TOOL_INPUT_MAX_LEN ++#define PREFETCH_TOOL_INPUT_MAX_LEN 512 ++#endif ++ ++/* Prefetch tool number max length. */ ++#ifndef PREFETCH_TOOL_NUM_MAX_LEN ++#define PREFETCH_TOOL_NUM_MAX_LEN 9 ++#endif ++ ++#ifndef PREFETCH_FUNC_TOPN ++#define PREFETCH_FUNC_TOPN param_llc_allocate_func_topn ++#endif ++ ++namespace { ++ ++/* loop bound info of the memory reference located. */ ++struct loop_bound ++{ ++ /* iv tree_node. */ ++ tree iv; ++ ++ /* define stmt of iv. */ ++ gimple *def_stmt; ++ ++ /* loop where stmt is located. */ ++ class loop *loop; ++ ++ /* loop unroll factor. */ ++ unsigned int unroll; ++ ++ /* Number of iterations of loop. 
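++     Initialized to chrec_dont_know until the loop bound is traced
++     (see the constructor below).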
*/ ++ tree niters; ++ ++ loop_bound (tree t, gimple *stmt) ++ { ++ iv = t; ++ def_stmt = stmt; ++ loop = loop_containing_stmt (stmt); ++ unroll = 1; ++ niters = chrec_dont_know; ++ } ++}; ++ ++/* method of calculating the data size. */ ++ ++enum calc_type ++{ ++ UNHANDLE_CALC = 0, ++ RUNTIME_CALC, ++ STATIC_CALC ++}; ++ ++/* Describes a info of a memory reference. */ ++ ++struct data_ref ++{ ++ /* The memory reference. */ ++ tree ref; ++ ++ /* Statement where the ref is located. */ ++ gimple *stmt; ++ ++ /* var_decl or param_decl, used for the ref_group. */ ++ tree var; ++ ++ /* Base of the reference. */ ++ tree base; ++ ++ /* Constant offset of the reference. */ ++ tree offset; ++ ++ /* index of the reference. */ ++ tree index; ++ ++ /* Constant step of the reference. */ ++ tree step; ++ ++ /* loop boundary info of each dimension. */ ++ std::vector loop_bounds; ++ ++ /* memory data size, Unit: MB. */ ++ double data_size; ++ ++ /* method of calculating the data size. */ ++ calc_type calc_by; ++ ++ /* True if the info of ref is traced, and then record it. */ ++ unsigned int trace_status_p : 1; ++ ++ /* True if the loop is vectorized. */ ++ unsigned int vectorize_p : 1; ++ ++ /* True if the memory reference is shared. */ ++ unsigned int parallel_p : 1; ++ ++ /* True if the memory reference is regular. */ ++ unsigned int regular_p : 1; ++ ++ /* True if the memory reference is read. */ ++ unsigned int read_p : 1; ++ ++ /* loop father depth. */ ++ unsigned int loop_depth; ++ ++ /* bb index. */ ++ int bb_idx; ++ ++ /* loop index. */ ++ int loop_idx; ++ ++ data_ref () ++ { ++ ref = NULL_TREE; ++ stmt = NULL; ++ var = NULL_TREE; ++ base = NULL_TREE; ++ offset = NULL_TREE; ++ index = NULL_TREE; ++ step = NULL_TREE; ++ data_size = 0; ++ calc_by = UNHANDLE_CALC; ++ trace_status_p = false; ++ vectorize_p = false; ++ parallel_p = false; ++ regular_p = true; ++ read_p = true; ++ loop_depth = 0; ++ bb_idx = 0; ++ loop_idx = 0; ++ } ++}; ++ ++/* ================ phase 1 get_dense_memory_kernels ================ */ ++ ++/* Add ref node and print. */ ++ ++void ++add_ref (std::vector &references, tree op, gimple *stmt, ++ bool vectorize_p, bool read_p) ++{ ++ data_ref ref; ++ ref.ref = op; ++ ref.stmt = stmt; ++ ref.vectorize_p = vectorize_p; ++ ref.read_p = read_p; ++ ref.loop_depth = loop_depth (stmt->bb->loop_father); ++ ref.bb_idx = stmt->bb->index; ++ ref.loop_idx = stmt->bb->loop_father->num; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ references.push_back (ref); ++} ++ ++/* Get the references from the simple call (vectorization type). */ ++ ++void ++get_references_in_gimple_call (gimple *stmt, std::vector &references) ++{ ++ if (gimple_code (stmt) != GIMPLE_CALL) ++ return; ++ ++ if (gimple_call_internal_p (stmt)) ++ { ++ bool read_p = false; ++ switch (gimple_call_internal_fn (stmt)) ++ { ++ case IFN_MASK_GATHER_LOAD: ++ case IFN_MASK_LOAD: ++ { ++ if (gimple_call_lhs (stmt) == NULL_TREE) ++ return; ++ read_p = true; ++ // FALLTHRU ++ } ++ case IFN_MASK_STORE: ++ { ++ /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); ++ ++ _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); ++ ++ _1 = (sizetype) a_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, ++ { 0.0, ... 
}, loop_mask_5);
++         */
++          tree op1 = gimple_call_arg (stmt, 0);
++          if (TREE_CODE (op1) != SSA_NAME)
++            {
++              if (dump_file && (dump_flags & TDF_DETAILS))
++                {
++                  fprintf (dump_file, "get_references_in_gimple_call: ");
++                  fprintf (dump_file, "find base that not ssa_name: ");
++                  print_generic_expr (dump_file, op1, TDF_LINENO);
++                  fprintf (dump_file, "\n");
++                }
++              return;
++            }
++          gimple *op1_def = SSA_NAME_DEF_STMT (op1);
++          if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN)
++            {
++              /* &MEM[base: xx] */
++              tree rhs1 = gimple_assign_rhs1 (op1_def);
++              /* If the definition stmt of the operand is a memory
++                 reference type, read it directly.  */
++              if (TREE_CODE (rhs1) == ADDR_EXPR
++                  && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF)
++                op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */
++            }
++
++          add_ref (references, op1, stmt, true, read_p);
++          return;
++        }
++      default:
++        return;
++      }
++    }
++}
++
++/* Check whether the memory reference is located exactly in the main
++   function.  There are some other unexpected scenarios where tracing fails
++   because the mem ref or function has no loc info (newly generated
++   gimple/function).  */
++
++bool
++is_reference_in_main_p (gimple *stmt)
++{
++  expanded_location xloc = expand_location (stmt->location);
++  if (DECL_NAME (cfun->decl) && MAIN_NAME_P (DECL_NAME (cfun->decl)))
++    {
++      /* NEXT STEP: Check why some functions have no end_locus.  */
++      if (!(DECL_SOURCE_LOCATION (current_function_decl)
++            && cfun->function_end_locus))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "Cannot find function start-end location.\n");
++          return true;
++        }
++      else if (!(xloc.file && xloc.line))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "Cannot find gimple statement location.\n");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          return false;
++        }
++      int fn_start = expand_location (
++        DECL_SOURCE_LOCATION (current_function_decl)).line;
++      int fn_end = expand_location (cfun->function_end_locus).line;
++
++      if (xloc.line >= fn_start && xloc.line <= fn_end)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "Memory access in main function: ");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          return true;
++        }
++    }
++  return false;
++}
++
++/* Stores the locations of memory references in STMT to REFERENCES.  */
++
++void
++get_references_in_stmt (gimple *stmt, std::vector<data_ref> &references)
++{
++  if (!gimple_vuse (stmt))
++    return;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "gimple_vuse: ");
++      print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++    }
++
++  /* Filter out memory references located in the main function.  This is an
++     experimental filtering scheme ONLY for HPC case verification, as
++     some HPC cases assign values for variables (mem ref) in the main
++     function.
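++     e.g., an initialization loop in main such as
++       for (i = 0; i < n; i++) a[i] = 0.0;
++     would otherwise be reported as a dense kernel (illustrative example).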
++  */
++  if (is_reference_in_main_p (stmt))
++    return;
++
++  if (gimple_code (stmt) == GIMPLE_ASSIGN)
++    {
++      tree op0 = gimple_assign_lhs (stmt);
++      tree op1 = gimple_assign_rhs1 (stmt);
++      tree base = NULL_TREE;
++
++      /* _1 = MEM[base: a, index: i, step: 8, offset: 0B];  */
++      if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1))
++          && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base))
++        add_ref (references, op1, stmt, false, true);
++
++      if (REFERENCE_CLASS_P (op0) && get_base_address (op0))
++        add_ref (references, op0, stmt, false, false);
++    }
++  else if (gimple_code (stmt) == GIMPLE_CALL)
++    get_references_in_gimple_call (stmt, references);
++
++  return;
++}
++
++/* Flags describing why a loop is filtered out.  */
++
++struct loop_filter_out_flag
++{
++  /* Use external call.  */
++  bool use_ext_call;
++
++  /* Use external node.  */
++  bool use_ext_node;
++
++  /* Use loop defined in macros.  */
++  bool use_macro_loop;
++
++  /* Use conditional function.  */
++  bool use_cond_func;
++};
++
++/* Check whether an external node is used.  */
++
++bool use_ext_node_p (const std::vector<data_ref> &references,
++                     unsigned int &start)
++{
++  expanded_location cfun_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++
++  unsigned i = start;
++  start = references.size ();
++  for (; i < references.size (); i++)
++    {
++      data_ref ref = references[i];
++      expanded_location xloc = expand_location (ref.stmt->location);
++      if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "use_ext_node\n\n");
++          return true;
++        }
++    }
++  return false;
++}
++
++/* Determine whether to filter out loops by stmt.  */
++
++bool
++filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt,
++                           const std::vector<data_ref> &references,
++                           unsigned int &start)
++{
++  expanded_location xloc = expand_location (stmt->location);
++  /* Check use_ext_call.  */
++  if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "use_ext_call: ");
++          print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++        }
++      loop_filter.use_ext_call = true;
++      return true;
++    }
++
++  /* Check use_macro_loop.  */
++  if (xloc.file && xloc.column != 1)
++    loop_filter.use_macro_loop = false;
++
++  /* Check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR.  */
++  if (gimple_code (stmt) == GIMPLE_ASSIGN)
++    {
++      enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
++      if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR
++          || rhs_code == MAX_EXPR)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "use_cond_func: ");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          loop_filter.use_cond_func = true;
++          return true;
++        }
++    }
++
++  /* Check use_ext_node.  */
++  if (use_ext_node_p (references, start))
++    {
++      loop_filter.use_ext_node = true;
++      return true;
++    }
++
++  return false;
++}
++
++/* Dump which flag caused the loop to be filtered out.
*/ ++ ++void ++dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) ++{ ++ if (loop_filter.use_ext_call) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_call\n"); ++ } ++ ++ if (loop_filter.use_ext_node) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_node\n"); ++ } ++ ++ if (loop_filter.use_macro_loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); ++ } ++ ++ if (loop_filter.use_cond_func) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_cond_func\n"); ++ } ++} ++ ++/* Get references in loop. */ ++ ++bool ++get_references_in_loop (std::vector &references, ++ loop_filter_out_flag &loop_filter, ++ class loop *loop) ++{ ++ unsigned int start = 0; ++ bool filter_out_loop = true; ++ ++ /* Analyze each bb in the loop. */ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ gimple_stmt_iterator bsi; ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ gimple *stmt = gsi_stmt (bsi); ++ get_references_in_stmt (stmt, references); ++ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, ++ references, start); ++ if (filter_out_loop) ++ break; ++ } ++ if (filter_out_loop) ++ break; ++ } ++ free (body); ++ return !filter_out_loop; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. ++ Assume that the HPC data reading and calculation process does not involve ++ adding branches in loops. Therefore, all bbs of loops are directly used for ++ calculation (excluding embedded loops) without considering branch weighting. ++*/ ++ ++unsigned ++estimate_loop_insns (class loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body (loop); ++ gimple_stmt_iterator gsi; ++ unsigned size = 0, i; ++ ++ for (i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ size += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ free (body); ++ ++ return size; ++} ++ ++/* Check whether the memory access is dense. */ ++ ++bool ++dense_memory_p (const std::vector &references, class loop *loop) ++{ ++ int ref_count = references.size (); ++ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); ++ float mem_to_insn_ratio = (float)ref_count / (float)ninsns; ++ ++ /* The number of cores to be run and DDR bandwidth information can be ++ transferred to flexibly adjust the threshold. */ ++ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) ++ && ref_count >= param_mem_access_num); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ ++ /* Dump dense memory source code location. 
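++         The dump format is
++           [file:function(fn_start-fn_end):line:column]
++         e.g., [foo.c:kernel(10-42):18:7] (file and function names are
++         illustrative).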
++         */
++      if (ref_count && references[0].stmt->location)
++        {
++          expanded_location xloc = expand_location
++            (references[0].stmt->location);
++          int fn_start = 0;
++          if (DECL_SOURCE_LOCATION (current_function_decl))
++            fn_start = expand_location (
++              DECL_SOURCE_LOCATION (current_function_decl)).line;
++          int fn_end = fn_start;
++          if (cfun->function_end_locus)
++            fn_end = expand_location (cfun->function_end_locus).line;
++          if (xloc.file)
++            fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ",
++                     xloc.file, fn_name, fn_start, fn_end,
++                     xloc.line, xloc.column);
++        }
++
++      /* Dump memory dense information.  */
++      if (dense_mem)
++        fprintf (dump_file, "dense memory access: ");
++      else
++        fprintf (dump_file, "non-dense mem access: ");
++      fprintf (dump_file,
++               "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n",
++               ref_count, ninsns, mem_to_insn_ratio);
++    }
++
++  return dense_mem;
++}
++
++/* Analyze the inner loop and get the loop with dense memory access.  */
++
++void
++analyze_loop_dense_memory (std::vector<class loop *> &kernels,
++                           std::map<class loop *, std::vector<data_ref> >
++                             &kernels_refs,
++                           class loop *loop)
++{
++  std::vector<data_ref> references;
++  number_of_latch_executions (loop);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "\n========== Processing loop %d: ==========\n",
++               loop->num);
++      loop_dump (dump_file, loop);
++      flow_loop_dump (loop, dump_file, NULL, 1);
++      fprintf (dump_file, "loop unroll: %d\n", loop->unroll);
++    }
++
++  if (get_loop_exit_edges (loop).length () != 1)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "non-dense mem access: loop_multiple_exits\n");
++      return;
++    }
++
++  loop_filter_out_flag loop_filter = {false, false, true, false};
++
++  if (!get_references_in_loop (references, loop_filter, loop))
++    {
++      dump_loop_filter_out_flag (loop_filter);
++      return;
++    }
++
++  if (dense_memory_p (references, loop))
++    {
++      kernels_refs[loop] = references;
++      kernels.push_back (loop);
++    }
++}
++
++/* Walk all innermost loops and collect those with dense memory access.  */
++
++bool
++get_dense_memory_kernels (std::vector<class loop *> &kernels,
++                          std::map<class loop *, std::vector<data_ref> >
++                            &kernels_refs)
++{
++  if (dump_file)
++    fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n");
++  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
++    analyze_loop_dense_memory (kernels, kernels_refs, loop);
++  return kernels.size () > 0;
++}
++
++/* ================ phase 2 trace_data_refs_info ================ */
++
++/* Determine whether the declaration is a non-vectorized declaration.  */
++
++bool
++generic_decl_p (tree expr)
++{
++  if (expr == NULL_TREE)
++    return false;
++  enum tree_code expr_code = TREE_CODE (expr);
++  if (expr_code != VAR_DECL && expr_code != PARM_DECL
++      && expr_code != COMPONENT_REF)
++    return false;
++  return true;
++}
++
++/* Initial worklist preparation for source variable tracing.
++   Add different initial nodes based on the kind of gimple statement.
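++   e.g., for a PHI node `p_1 = PHI <p_2(3), p_3(5)>' every argument is
++   pushed; for `_1 = _2 + _3' both operands are pushed; for a
++   POINTER_PLUS_EXPR only the pointer operand (rhs1) is pushed.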
*/ ++ ++void ++add_worklist (std::vector &worklist, std::set &walked, ++ gimple *def_stmt) ++{ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) ++ { ++ tree node = gimple_phi_arg_def (def_stmt, i); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR ++ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME ++ || rhs_code == COMPONENT_REF) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ node = gimple_assign_rhs2 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == TARGET_MEM_REF || rhs_code == MEM_REF) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "possibly unnested indirect memory access: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ else ++ { ++ /* unhandled assign rhs_code: _219 = _17 * _70; ++ _17 = *grid_56(D).sst.span; ++ _70 = *grid_56(D).sst.dim[0].stride; ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled assign rhs_code: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unsupported tracing stmt: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++ ++/* Tracing source variables: ++ vectp.1 = a_2(D) + _3; ++ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; ++ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); ++ ++ _1 = (sizetype) b_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, ++ loop_mask_5); ++ ... ++ Due to previous pass optimizations, the current tracing method can find ++ several source variable candidates. We decide to record them in a map and ++ later filter out the true base variable by some criteria. ++*/ ++ ++void ++trace_base_var_helper (tree arg, std::set &walked, ++ std::map& base_var_candid, bool is_vect_type) ++{ ++ if (arg == NULL) ++ return; ++ ++ /* Var_decl type: base address extracted from ARRAY_REF. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == ARRAY_TYPE && TREE_CODE (arg) == VAR_DECL ++ && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "var_decl type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* Array type. */ ++ tree op0 = NULL; ++ if (TREE_CODE (arg) == ADDR_EXPR ++ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "array type\n"); ++ base_var_candid[op0] += 1; ++ return; ++ } ++ ++ /* Pointer type. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "pointer type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* SSA_NAME type. 
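++     e.g., a_2(D): look through SSA_NAME_VAR to reach the underlying
++     PARM_DECL or VAR_DECL.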
*/ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return; ++ ++ tree tmp_var = SSA_NAME_VAR (arg); ++ if (tmp_var && !is_vect_type && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ssa pointer type\n"); ++ base_var_candid[tmp_var] += 1; ++ return; ++ } ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ if (!walked.count (tmp_var)) ++ walked.insert (tmp_var); ++ trace_base_var_helper (tmp_var, walked, base_var_candid, is_vect_type); ++ } ++ else ++ { ++ std::vector worklist; ++ add_worklist (worklist, walked, def_stmt); ++ for (unsigned i = 0; i < worklist.size (); ++i) ++ trace_base_var_helper (worklist[i], walked, base_var_candid, is_vect_type); ++ } ++} ++ ++/* Identify the base variable traced from base address of memory reference. ++ We recognize that current method could detect several base variable ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1) The number of base variable candidates is 1; ++ 2) The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ ++ ++bool ++trace_base_var (data_ref &mem_ref, std::set &walked) ++{ ++ tree &var = mem_ref.var; ++ tree arg = mem_ref.base; ++ std::map base_var_candid; ++ bool is_vect_type = TREE_CODE (TREE_TYPE (mem_ref.ref)) == VECTOR_TYPE; ++ trace_base_var_helper (arg, walked, base_var_candid, is_vect_type); ++ bool is_tracing_unusual = false; ++ if (base_var_candid.size () == 1) ++ var = base_var_candid.begin ()->first; ++ else ++ { ++ is_tracing_unusual = true; ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ var = it->second == 1 ? it->first : var; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Traced variables at "); ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, ":\n"); ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ fprintf (dump_file, "%s:%d, ", get_name (it->first), it->second); ++ fprintf (dump_file, "\n"); ++ ++ if (var == NULL_TREE) ++ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); ++ else if (is_tracing_unusual && var != NULL_TREE) ++ fprintf (dump_file, "Tracing unusual number or occurrences of base " ++ "variables. Choose %s.\n", ++ get_name (var)); ++ } ++ return var != NULL_TREE; ++} ++ ++/* Recursively trace and check whether the definition stmt of the ++ index operand is a recorded stmt in direct access tracing. ++ Return 0 if ref is a direct access a[]. ++ Return 1 if ref is a non-nested indirect access a[b[]]. ++ Return 2 if ref is a complex indirect memory access, such as a[f(b[])]. */ ++ ++int ++trace_indirect_operand (tree arg, std::set &traced_ref_stmt) ++{ ++ /* Return 0 if tree `arg` is not an SSA for further tracing. */ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return 0; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ /* Return 1 if `index` has been detected as a traced direct memory access ++ before. 
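++     e.g., if `_1 = MEM[base: b_2(D), ...]' was recorded in
++     TRACED_REF_STMT, an index derived from _1 marks the enclosing access
++     as indirect (the a[b[i]] pattern).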
++     */
++  if (traced_ref_stmt.count (def_stmt))
++    return 1;
++
++  /* Return 0 if the def stmt of `arg` is not a gimple assign.  Stop tracing
++     the index operand; currently no memory access operand is detected.  */
++  if (!def_stmt || !is_gimple_assign (def_stmt))
++    return 0;
++
++  tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
++  /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array
++     type indirect memory access.  */
++  if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR
++      && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR)
++    {
++      /* Return 2 if the tree code has any type representing references to
++         storage, implying a complex indirect memory access scenario for
++         future analysis.  */
++      if (rhs_code == MEM_REF || rhs_code == TARGET_MEM_REF
++          || rhs_code == ARRAY_REF || rhs_code == ARRAY_RANGE_REF
++          || rhs_code == COMPONENT_REF || rhs_code == ADDR_EXPR
++          || rhs_code == INDIRECT_REF)
++        return 2;
++
++      /* Return 0 and stop tracing if the tree code is not a common tracing
++         operand but is still reflected as a non-reference type.
++         Caveat: if we have never dealt with this tree code before, it may
++         be safer to treat this scenario strictly.  */
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "unknown tracing tree code: %s\n",
++                   get_tree_code_name (rhs_code));
++          print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return 0;
++    }
++
++  tree op = NULL_TREE;
++  ssa_op_iter iter;
++  FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE)
++    {
++      int trace_indir_p = trace_indirect_operand (op, traced_ref_stmt);
++      if (trace_indir_p != 0)
++        return trace_indir_p;
++    }
++  return 0;
++}
++
++/* Trace the pointer of the direct/indirect memory access:
++   1) Obtain the base address of the memory access.
++   2) If the index variable is formed by another memory access operation
++      (i.e., an indication of indirect memory access), ensure that the index
++      has been traced in an already discovered direct memory access.
++   3) Otherwise, the memory access is in a more complex scenario and we need
++      to postpone the analysis.  For example, the indirect memory access is
++      nested, a[b[c[...]]], or the index variable (formed in another memory
++      access) has not been recorded/traced yet.
++   e.g.,
++     _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B];
++     _4 = (long unsigned int) _1;
++     _5 = _4 * 8;
++     _6 = p(D) + _5;  // get base
++     _7 = *_6;        // start tracing
++*/
++
++bool
++trace_ptr_mem_ref (data_ref &mem_ref, std::set<gimple *> &traced_ref_stmt,
++                   std::vector<data_ref> &unresolved_refs)
++{
++  /* Simple scenario:
++       _2208 = np.120_2207 * 8;
++       _1921 = sorted_weight$data_381 + _2208;
++       *_1921 = _2206;
++
++     Complex scenario:
++       MEM[base: _3235, index: ivtmp.2768_3189, step: 4, offset: 0B] = _105;
++       _3236 = (sizetype) _214;
++       _3237 = _3236 * 4;
++       _3238 = _857 + _3237;  // base + index * step
++       _3239 = _3238 + 4;     // offset
++       MEM[base: _3239, index: ivtmp.2768_3189, step: 4, offset: 0B] = 0.0;
++  */
++  tree pointer = TREE_OPERAND (mem_ref.ref, 0);
++  tree offset = TREE_OPERAND (mem_ref.ref, 1);
++  if (TREE_CODE (offset) != INTEGER_CST)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Unhandled scenario for non-constant offset.\n");
++
++      return false;
++    }
++  if (TREE_CODE (pointer) != SSA_NAME)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Unhandled scenario for non-ssa pointer.\n");
++
++      return false;
++    }
++
++  /* Tracing back the base address from SSA.
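++     In the simple scenario above, `_1921 = sorted_weight$data_381 + _2208'
++     yields base `sorted_weight$data_381' and index_offset `_2208'
++     (= np.120_2207 * 8).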
*/ ++ gimple *ptr_def_stmt = SSA_NAME_DEF_STMT (pointer); ++ if (ptr_def_stmt == NULL || gimple_code (ptr_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (ptr_def_stmt) != POINTER_PLUS_EXPR) ++ return false; ++ tree base = gimple_assign_rhs1 (ptr_def_stmt); ++ /* index_offset = index * step. */ ++ tree index_offset = gimple_assign_rhs2 (ptr_def_stmt); ++ ++ /* Tracing back index from SSA. */ ++ if (TREE_CODE (index_offset) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (TREE_CODE (index_offset) == INTEGER_CST) ++ fprintf (dump_file, "Constant index for memory access.\n"); ++ else ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ } ++ return false; ++ } ++ ++ gimple *idx_def_stmt = SSA_NAME_DEF_STMT (index_offset); ++ if (idx_def_stmt == NULL || gimple_code (idx_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (idx_def_stmt) != MULT_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ return false; ++ } ++ ++ /* Split array index from total offset of index, `index * step`. */ ++ mem_ref.base = base; ++ mem_ref.offset = offset; ++ mem_ref.index = gimple_assign_rhs1 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs2 (idx_def_stmt); ++ if (TREE_CODE (gimple_assign_rhs1 (idx_def_stmt)) == INTEGER_CST) ++ { ++ mem_ref.index = gimple_assign_rhs2 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs1 (idx_def_stmt); ++ } ++ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ } ++ else if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ else ++ { ++ /* Record indirect memory access with complex scenarios for future ++ analysis. */ ++ unresolved_refs.push_back (mem_ref); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled indirect memory access tracing.\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Tracing direct memory reference information. */ ++ ++bool ++trace_direct_mem_ref (data_ref &mem_ref) ++{ ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF: ++ address = base + index * step + offset. ++ MASK_LOAD example: ++ _43 = &MEM[base: _42, index: ivtmp_140, step: 8, offset: 0B]; ++ vect__42.11_160 = .MASK_LOAD (_43, 64B, loop_mask_163); ++ ++ In some cases (2D-array or complex-index 1D array), mem_ref's `base` ++ may actually represent `base + index * step` when `base` address updates ++ by a PHI operation, e.g., ++ MEM[base: _51, offset: 0B] ++ _51 = (void *) ivtmp.18_11; ++ ivtmp.18_11 = PHI ++ ivtmp.18_43 = ivtmp.18_11 + 16; ++ ivtmp.18_52 = (unsigned long) _10; ++ _10 = arr2D_29(D) + _9; ++ */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ ++ return true; ++} ++ ++/* Tracing vectorized indirect memory reference information. 
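++   The first three call arguments give the base address, the vector of
++   indices and the scale; they are recorded as mem_ref.base/index/step
++   below.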
++ MASK_GATHER_LOAD example: ++ vect__45.13_146 = .MASK_LOAD (_41, 32B, loop_mask_153); ++ vect__46.14_145 = (vector([2,2]) long unsigned int) vect__45.13_146; ++ vect_patt_163.15_143 = .MASK_GATHER_LOAD (_144, vect__46.14_145, 8, ++ { 0.0, ... }, loop_mask_153); */ ++ ++bool ++trace_indirect_mem_ref_vectorized (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* Processing of vectorization types. */ ++ if (mem_ref.vectorize_p) ++ { ++ tree op = gimple_call_arg (mem_ref.stmt, 1); ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ { ++ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.index = gimple_call_arg (mem_ref.stmt, 1); ++ mem_ref.step = gimple_call_arg (mem_ref.stmt, 2); ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Trace the array of the indirect memory access: ++ 1) Obtain the base address of the indirect memory access. ++ 2) Ensure that the index has been traced in the direct memory access. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (data_ref &mem_ref, std::set &traced_ref_stmt) ++{ ++ tree base = TREE_OPERAND (mem_ref.ref, 0); ++ tree index = TREE_OPERAND (mem_ref.ref, 1); ++ if (trace_indirect_operand (index, traced_ref_stmt)) ++ { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. */ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Trace memory references base info: ++ 1) Memory access rule analysis and reference info tracing ++ 2) Source variable tracing, along base address of memory reference ++ We will extend parallel analysis later. ++*/ ++ ++void ++trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ enum tree_code ref_code = TREE_CODE (mem_ref.ref); ++ /* 1) Direct and indirect access traces. */ ++ switch (ref_code) ++ { ++ case MEM_REF: ++ /* Non-vectorized direct/indirect access by pointer. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "MEM_REF\n"); ++ if (!trace_ptr_mem_ref (mem_ref, traced_ref_stmt, unresolved_refs)) ++ return; ++ break; ++ case TARGET_MEM_REF: ++ /* Vectorized and non-vectorized direct access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "TARGET_MEM_REF\n"); ++ if (!trace_direct_mem_ref (mem_ref)) ++ return; ++ break; ++ case SSA_NAME: ++ /* Vectorized indirect memory access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "SSA_NAME\n"); ++ if (!trace_indirect_mem_ref_vectorized (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ case ARRAY_REF: ++ /* Non-vectorized indirect memory access. 
*/ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ARRAY_REF\n"); ++ if (!trace_indirect_array (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ default: ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref is another tree-code: "); ++ fprintf (dump_file, "stmt: "); ++ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "ref: "); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (mem_ref.regular_p) ++ traced_ref_stmt.insert (mem_ref.stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Trace all references in the loop. */ ++ ++void ++trace_loop_refs_info (std::vector &refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (refs[i], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++void ++trace_data_refs_info (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ trace_loop_refs_info (loop_refs[loop], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Retrace references base info for complex scenarios in indirect memory access ++ after Phase 3. */ ++ ++void ++retrace_ref_info_unresolved (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* 1) Indirect access traces. */ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Retrace all unresolved references. 
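++   These are the complex indirect accesses (e.g., a[f(b[i])]) whose index
++   tracing was postponed until more direct accesses had been recorded.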
++ */
++
++void
++retrace_loop_refs_info_unresolved (std::vector<data_ref> &unresolved_refs,
++                                   std::set<gimple *> &traced_ref_stmt)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file,
++             "\nRetrace indirect memory access after outer loop analysis:\n");
++  for (unsigned i = 0; i < unresolved_refs.size (); ++i)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "trace_references_base_info %d:\n", i);
++          print_generic_expr (dump_file, unresolved_refs[i].ref, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      retrace_ref_info_unresolved (unresolved_refs[i], traced_ref_stmt);
++    }
++}
++
++/* ================ phase 3 analyze_nested_kernels ================ */
++
++/* Return the innermost type for arrays and pointers of TYPE.  */
++
++tree
++inner_type (tree type)
++{
++  while (POINTER_TYPE_P (type)
++         || TREE_CODE (type) == ARRAY_TYPE)
++    type = TREE_TYPE (type);
++  return type;
++}
++
++/* Check whether the input IV is the loop dimension boundary.  */
++
++bool
++loop_bound_iv_p (tree t, tree &outer_loop_t)
++{
++  if (t == NULL || TREE_CODE (t) != SSA_NAME
++      || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE)
++    return false;
++
++  gimple *def_stmt = SSA_NAME_DEF_STMT (t);
++
++  /* NOP_EXPR conversion between the PHI node and the memory reference due
++     to a MACRO:
++       n_898 = PHI <...>
++       _757 = (sizetype) n_898;
++       _900 = MEM[base: _726, index: _757, step: 8, offset: 0B];
++  */
++  while (gimple_code (def_stmt) == GIMPLE_ASSIGN
++         && gimple_assign_rhs_code (def_stmt) == NOP_EXPR)
++    def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt));
++
++  if (gimple_code (def_stmt) != GIMPLE_PHI)
++    return false;
++
++  /* Filter scenarios with only two phi inputs.  */
++  if (gimple_phi_num_args (def_stmt) != 2)
++    return false;
++
++  gphi *phi_stmt = as_a <gphi *> (def_stmt);
++  basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src;
++  basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src;
++
++  class loop *loop = loop_containing_stmt (def_stmt);
++  bool res = false;
++  /* Two phi inputs, one from the current loop and one from the outer
++     loop.  */
++  if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop)))
++    {
++      outer_loop_t = gimple_phi_arg_def (def_stmt, 1);
++      res = true;
++    }
++  else if ((src1->loop_father == loop)
++           && (src0->loop_father == loop_outer (loop)))
++    {
++      outer_loop_t = gimple_phi_arg_def (def_stmt, 0);
++      res = true;
++    }
++
++  if (res)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "===> ");
++          print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
++        }
++      return true;
++    }
++  return false;
++}
++
++/* Add a node to the worklist and the walked set.  */
++
++void
++add_worklist_walked (std::vector<tree> &worklist, std::set<tree> &walked,
++                     tree node)
++{
++  if (!walked.count (node))
++    {
++      worklist.push_back (node);
++      /* Avoid introducing phi node cycles, which would keep the worklist
++         from ever draining.  */
++      walked.insert (node);
++    }
++}
++
++/* Check the bound IV and add trace candidates to the worklist.
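++   If T is defined by a two-input PHI that qualifies as a loop-bound IV
++   (see loop_bound_iv_p), record it as a new dimension in
++   MEM_REF.loop_bounds and continue tracing from the PHI argument that
++   comes from the outer loop.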
*/ ++ ++void ++check_bound_iv_and_add_worklist (std::vector &worklist, ++ std::set &walked, ++ std::set &walked_loop, ++ tree t, data_ref &mem_ref) ++{ ++ if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) ++ return; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, t, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ tree out_loop_t = NULL_TREE; ++ if (loop_bound_iv_p (t, out_loop_t)) ++ { ++ basic_block bb = gimple_bb (def_stmt); ++ if (!walked_loop.count (bb)) ++ { ++ mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); ++ walked_loop.insert (bb); ++ } ++ add_worklist_walked (worklist, walked, out_loop_t); ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ ++ /* unary. */ ++ if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ else if (rhs_code == POINTER_PLUS_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ ++ /* binary. */ ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR ++ || rhs_code == MULT_EXPR) ++ { ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ } ++ } ++} ++ ++/* DFS trace the loop bound of iv. */ ++ ++bool ++trace_loop_bound_iv (data_ref &mem_ref) ++{ ++ /* In indirect memory access, the size cannot be determined based on the ++ loop boundary. However, we can take advantage of loop bound as an upper ++ bound (unrepeated memory access) to predict the variable footprint ++ involved in the specific loop dimension. */ ++ ++ /* Determine and record the boundary iv of the current index, ++ but do not trace it. */ ++ tree outer_loop_t = NULL_TREE; ++ /* indirect access example, mem_ref.index = _64 ++ _62 = MEM[symbol: uPtr, index: ivtmp.22_96, step: 4, offset: 0B]; ++ _63 = (long unsigned int) _62; ++ _64 = _63 * 8; ++ _65 = [openfoam_smooth.c:28:28] &bPrimePtr + _64; ++ _66 = *_65; */ ++ if (loop_bound_iv_p (mem_ref.index, outer_loop_t) || !mem_ref.regular_p) ++ { ++ mem_ref.loop_bounds.push_back ( ++ loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); ++ if (!mem_ref.regular_p) ++ return false; ++ } ++ ++ std::vector worklist; ++ worklist.push_back (mem_ref.base); ++ std::set walked; ++ std::set walked_loop; ++ ++ while (worklist.size ()) ++ { ++ tree t = worklist.back (); ++ worklist.pop_back (); ++ ++ /* add worklist. */ ++ check_bound_iv_and_add_worklist (worklist, walked, walked_loop, t, mem_ref); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nmem_ref access dimension: %ld\n", ++ mem_ref.loop_bounds.size ()); ++ fprintf (dump_file, "Traced variables: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ ++ return mem_ref.loop_bounds.size () > 0; ++} ++ ++/* dump loop bound. 
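++   Output has the form
++     loop_bound: loop_N (header = H, latch = L, lb_niters = <expr>).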
++ */
++
++void
++loop_bound_dump (FILE *file, loop_bound &lb)
++{
++  class loop *loop = lb.loop;
++  fprintf (file, "loop_bound: loop_%d (", loop->num);
++  if (loop->header)
++    fprintf (file, "header = %d", loop->header->index);
++  else
++    {
++      fprintf (file, "deleted)\n");
++      return;
++    }
++  if (loop->latch)
++    fprintf (file, ", latch = %d", loop->latch->index);
++  fprintf (file, ", lb_niters = ");
++  print_generic_expr (file, lb.niters);
++  fprintf (file, ")\n\n");
++}
++
++/* Statically calculate the data size.  */
++
++void
++static_calculate_data_size (data_ref &mem_ref)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nstatic_calculate_data_size\n");
++
++  tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
++  unsigned HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0;
++  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
++    {
++      unsigned HOST_WIDE_INT est_niter = tree_to_uhwi
++        (mem_ref.loop_bounds[i].niters);
++      unsigned int unroll = mem_ref.loop_bounds[i].unroll;
++      if (i == 0)
++        {
++          /* The unit conversion factor between bytes, kilobytes, and
++             megabytes is 1024.  */
++          mem_ref.data_size = double (type_size
++                                      * est_niter * unroll) / 1024 / 1024;
++        }
++      else
++        mem_ref.data_size *= est_niter * unroll;
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size);
++    }
++}
++
++/* Recursive tracing and creation of dominant nodes.  */
++
++tree
++trace_and_create_dominate_expr (tree expr, class loop *outermost)
++{
++  if (expr == NULL_TREE || is_gimple_constant (expr))
++    return expr;
++
++  if (TREE_CODE (expr) != SSA_NAME)
++    return NULL_TREE;
++
++  if (SSA_NAME_IS_DEFAULT_DEF (expr))
++    return expr;
++
++  gimple *stmt = SSA_NAME_DEF_STMT (expr);
++  basic_block def_bb = gimple_bb (stmt);
++  if (def_bb == NULL || def_bb->loop_father == NULL)
++    return NULL_TREE;
++
++  if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb))
++    return expr;
++
++  if (gimple_code (stmt) != GIMPLE_ASSIGN)
++    return NULL_TREE;
++
++  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
++  tree_code_class code_class = TREE_CODE_CLASS (rhs_code);
++  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
++  tree rhs1 = trace_and_create_dominate_expr (gimple_assign_rhs1 (stmt),
++                                              outermost);
++  if (rhs1 == NULL_TREE)
++    return NULL_TREE;
++
++  if (code_class == tcc_unary)
++    {
++      tree expr_new = build1 (rhs_code, type, rhs1);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "expr_new = ");
++          print_generic_expr (dump_file, expr_new, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr_new;
++    }
++  else if (code_class == tcc_binary)
++    {
++      tree rhs2 = trace_and_create_dominate_expr (gimple_assign_rhs2 (stmt),
++                                                  outermost);
++      if (rhs2 == NULL_TREE)
++        return NULL_TREE;
++
++      tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "expr_new = ");
++          print_generic_expr (dump_file, expr_new, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr_new;
++    }
++
++  return NULL_TREE;
++}
++
++/* Recursive parsing and creation of nodes in expr expressions.  */
++
++tree
++parse_and_create_expr (tree expr, class loop *outermost)
++{
++  if (expr == NULL_TREE || expr == chrec_dont_know
++      || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR)
++    {
++      /* tcc_expression (e.g., &q) situation combined with tcc_unary.
++       */
++      if (expr != NULL_TREE && TREE_CODE (expr) == ADDR_EXPR && dump_file
++          && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "tcc_expression case in ADDR_EXPR: ");
++          print_generic_expr (dump_file, expr, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr;
++    }
++
++  if (TREE_CODE (expr) == SSA_NAME)
++    return trace_and_create_dominate_expr (expr, outermost);
++  else if (EXPR_P (expr))
++    {
++      enum tree_code tree_code = TREE_CODE (expr);
++      tree_code_class code_class = TREE_CODE_CLASS (tree_code);
++      tree type = TREE_TYPE (expr);
++      tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost);
++      if (op1 == NULL_TREE)
++        return NULL_TREE;
++
++      if (code_class == tcc_unary)
++        {
++          tree expr_new = build1 (tree_code, type, op1);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "expr_new = ");
++              print_generic_expr (dump_file, expr_new, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          return expr_new;
++        }
++      else if (code_class == tcc_binary)
++        {
++          tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1),
++                                            outermost);
++          if (op2 == NULL_TREE)
++            return NULL_TREE;
++
++          tree expr_new = fold_build2 (tree_code, type, op1, op2);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "expr_new = ");
++              print_generic_expr (dump_file, expr_new, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          return expr_new;
++        }
++    }
++  return NULL_TREE;
++}
++
++/* Trace and create dominant loop bounds.  */
++
++void
++trace_and_create_dominate_loop_bounds (data_ref &mem_ref)
++{
++  /* Check whether the niters expression is dominated by the loop.  If not,
++     trace it and determine whether the result is dominant; if so, create
++     the expr of the dominant node.  */
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n");
++
++  /* Determine the relationship between the boundary of the innermost loop
++     and the dominant node of the outer loop, and process it.  */
++  loop_bound &outermost = mem_ref.loop_bounds.back ();
++  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
++    {
++      loop_bound &current = mem_ref.loop_bounds[i];
++      tree &niters = current.niters;
++      if (TREE_CODE (niters) == COND_EXPR)
++        niters = TREE_OPERAND (niters, 1);
++
++      niters = parse_and_create_expr (niters, outermost.loop);
++
++      if (niters == NULL_TREE)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
++              fprintf (dump_file,
++                       "Tracing loop bound failed at dimension %d\n", i);
++            }
++          mem_ref.calc_by = UNHANDLE_CALC;
++          break;
++        }
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        loop_bound_dump (dump_file, mem_ref.loop_bounds[i]);
++    }
++}
++
++/* Trace the dimension and the corresponding loop bounds of mem_ref.
++   This function supplements the information in mem_ref.loop_bounds.  */
++
++void
++trace_ref_dimension_and_loop_bounds (data_ref &mem_ref)
++{
++  /* In the same loop, some memory accesses have different dimensions;
++     variables with fewer dimensions are removed.  The loop filtering
++     conditions and the memory access nodes were recorded and traced
++     earlier; a failed tracing result is also handled here.  */
++  if (dump_file)
++    fprintf (dump_file, "\ncalculate_data_size\n");
++
++  /* Trace the loop bound IV of the ref to determine the dimension.  */
++  /* Record data from the loop perspective to avoid repeated tracing.  */
++  if (!trace_loop_bound_iv (mem_ref))
++    return;
++
++  /* The traced mem_ref may have multiple dimensions, which correspond to
++     multiple loops.
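++     e.g., an access a[j][i] inside a j/i loop nest contributes one
++     loop_bounds entry per dimension, the innermost dimension first.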
*/ ++ /* And in the dimension-by-dimensional analysis, the computable way is ++ continuously reduced. */ ++ mem_ref.calc_by = STATIC_CALC; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ class loop *loop = mem_ref.loop_bounds[i].loop; ++ tree &niters = mem_ref.loop_bounds[i].niters; ++ ++ /* Set NULL_TREE to ensure that nb_iterations are retraced and ++ vec_nb_iterations are also extracted. */ ++ loop->nb_iterations = NULL_TREE; ++ niters = number_of_latch_executions (loop, false); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_dump (dump_file, loop); ++ ++ if (loop->unroll) ++ { ++ if (loop->unroll == USHRT_MAX && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX); ++ mem_ref.loop_bounds[i].unroll = loop->unroll; ++ } ++ ++ if ((niters == chrec_dont_know) && loop->vec_nb_iterations ++ && (loop->vec_nb_iterations != chrec_dont_know)) ++ niters = loop->vec_nb_iterations; ++ ++ if (niters == chrec_dont_know) ++ { ++ /* We derive est_loop_niters from function ++ `estimated_loop_iterations_int`. Usually only the innermost loop is ++ vectorized, so vec_nb_iterations can be 4 or 8 times as large as ++ `est_loop_niters` due to vectorization. However, function ++ `estimated_loop_iterations_int` only returns an integer instead of ++ a tree node expression, so it cannot substitute ++ function `number_of_latch_executions` in runtime computation. */ ++ HOST_WIDE_INT est_loop_niters = estimated_loop_iterations_int (loop); ++ if (est_loop_niters >= 0 && est_loop_niters < INT_MAX) ++ /* e.g., loop iterations from `estimated_loop_iterations_int`: (-1) ++ loop_144 (header = 519, latch = 625, niter = scev_not_known, ++ upper_bound = 1073741823, likely_upper_bound = 1073741823, ++ unroll = 1) */ ++ /* variable `niters` from `loop->vec_nb_iterations` ++ constant 34> */ ++ niters = build_int_cst (integer_type_node, (int) est_loop_niters); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ ++ if (niters == NULL_TREE || niters == chrec_dont_know) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, UNHANDLE_CALC); ++ else if (TREE_CODE (niters) != INTEGER_CST) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); ++ else ++ mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (mem_ref.calc_by == 2) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nSTATIC_CALC.\n"); ++ } ++ else if (mem_ref.calc_by == 1) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nRUNTIME_CALC.\n"); ++ } ++ else ++ fprintf (dump_file, "\nUNHANDLE_CALC.\n"); ++ } ++ } ++ ++ if (mem_ref.calc_by == RUNTIME_CALC) ++ trace_and_create_dominate_loop_bounds (mem_ref); ++ else if (mem_ref.calc_by == STATIC_CALC) ++ static_calculate_data_size (mem_ref); ++} ++ ++/* Get the loop's niters tree. ++ Return NULL_TREE if not found. */ ++ ++tree ++get_cur_loop_niters (std::map > &loop_refs, ++ class loop *loop) ++{ ++ if (loop_refs.count (loop) == 0) ++ return NULL_TREE; ++ std::vector bounds = loop_refs[loop][0].loop_bounds; ++ return bounds.size () ? bounds[0].niters : NULL_TREE; ++} ++ ++/* Trace the sources of the niters tree and return the ++ outermost depth of the loops containing them. ++ Return start_depth if not found. 
++ ++ example: ++ niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 ++ operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 ++ operand_num: 1, subtree:(int) i_end_417 ++ SSA_NAME of niters: i_end_417 ++ gimple of SSA: i_end_417 = PHI <...> ++ return gimple depth; ++*/ ++ ++unsigned ++trace_outer_loop_depth (tree niters, unsigned start_depth) ++{ ++ /* If niters does not exist or it is an INTEGER_CST, the loop bound is ++ already determined, so return start_depth. */ ++ if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) ++ return start_depth; ++ ++ gimple *def_stmt = NULL; ++ /* niters examples: i_start_452, fEnd_35, fEnd_100. */ ++ enum tree_code niter_code = TREE_CODE (niters); ++ if (niter_code == SSA_NAME) ++ { ++ /* Trace the SSA name that defines this niter. */ ++ def_stmt = SSA_NAME_DEF_STMT (niters); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ssa_name of niters: "); ++ print_generic_expr (dump_file, niters); ++ fprintf (dump_file, "\ngimple of ssa: \n"); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ /* Termination condition of the DFS: return the loop depth of the ++ defining bb. */ ++ if (gimple_code (def_stmt) == GIMPLE_PHI ++ || gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return start_depth; ++ unsigned ret_depth = loop_depth (def_bb->loop_father); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Stop tracing the outer loop depth, "); ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", ++ ret_depth, def_bb->index); ++ } ++ return ret_depth; ++ } ++ /* GIMPLE_ASSIGN: use DFS to trace the rhs of the assignment ++ statement. */ ++ else if (gimple_code (def_stmt) == GIMPLE_ASSIGN) ++ { ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == TARGET_MEM_REF) ++ /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, ++ offset: 0B] */ ++ return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); ++ else ++ { ++ /* M.218_658 = MIN_EXPR <_631, _657> */ ++ unsigned min_depth = start_depth; ++ unsigned operand_num = gimple_num_ops (def_stmt); ++ /* GIMPLE_ASSIGN: start from 1 because op[0] is the lhs. */ ++ for (unsigned i = 1; i < operand_num; i++) ++ { ++ tree subtree = dyn_cast <gassign *> (def_stmt)->op[i]; ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, ++ start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ } ++ else ++ { ++ /* Additional termination conditions: ++ 1) Niters is a MEM variable; ++ 2) Niters is a runtime value (smooth_uPtr); consider finding ++ its footprint in other mem_refs; ++ 3) Niters is a loop variable (i_start/i_end) whose bound in ++ the outer loop depends on the variable j_start/j_end. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The loop termination condition is " ++ "extended.\n"); ++ } ++ return start_depth; ++ } ++ } ++ /* The operand count can be obtained directly when the tree code is one ++ of the following.
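++ Each operand subtree is then traced recursively, and the minimum depth ++ found among all operands is returned.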
*/ ++ else if (niter_code == NOP_EXPR || niter_code == MEM_REF ++ || niter_code == ARRAY_REF || niter_code == COND_EXPR ++ || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR ++ || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) ++ { ++ /* operand_num is the operand in the niters statement. ++ example: In the following niter statement, operand_num = 3. ++ (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ ++ unsigned operand_num = TREE_OPERAND_LENGTH (niters); ++ unsigned min_depth = start_depth; ++ for (unsigned i = 0; i < operand_num; i++) ++ { ++ tree subtree = TREE_OPERAND (niters, i); ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "niters is another tree code: %s\n", ++ get_tree_code_name (niter_code)); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return start_depth; ++ } ++} ++ ++/* Traces the ref dimension information in each loop. */ ++ ++void ++analyze_loop_refs_dimension (std::vector &refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (refs[i].trace_status_p == false) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_reference_dimension %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (refs[i]); ++ } ++} ++ ++/* analyze nested kernels ++ 1) multidimension loop analyze ++ 2) extended outer loop analyze ++*/ ++ ++bool ++analyze_nested_kernels (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); ++ ++ /* `kernels` may be added in during outer loop extension phase, ++ thus using initial size to avoid repeatedly analyzing. */ ++ unsigned init_kernels_size = kernels.size (); ++ for (unsigned i = 0; i < init_kernels_size; ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ analyze_loop_refs_dimension (loop_refs[loop]); ++ ++ unsigned depth = loop_depth (loop); ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, loop), depth); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", ++ depth, outer_depth); ++ /* param_outer_loop_num: number of loops of the extended outer loop. ++ Outermost loop should not be extended when outer_depth = 0. ++ `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == 0 || outer_depth == depth ++ || depth > outer_depth + param_outer_loop_num) ++ continue; ++ ++ /* Extend outer loop. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nStart extending outer loop\n"); ++ /* Superloops of the loop, start from the loop closest to the ++ current loop in the outermost loop. */ ++ for (int j = 0; j < param_outer_loop_num && --depth; ++j) ++ { ++ class loop *outer_loop = (*loop->superloops)[depth]; ++ /* The outer loop may be added when analyzing previous inner loops, ++ i.e. 
the outer loop contains two or more inner loops. */ ++ if (loop_refs.count (outer_loop)) ++ continue; ++ /* phase1 ~ phase3 analysis on the extended outer loop. */ ++ analyze_loop_dense_memory (kernels, loop_refs, outer_loop); ++ if (loop_refs.count (outer_loop) == 0) ++ continue; ++ for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt, ++ unresolved_refs); ++ analyze_loop_refs_dimension (loop_refs[outer_loop]); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, outer_loop), depth); ++ /* `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == depth) ++ break; ++ else ++ /* The outer loop cannot find the current loop boundary, ++ Remove the record of outer_loop from the loop_refs. */ ++ loop_refs.erase (outer_loop); ++ } ++ } ++ return true; ++} ++ ++/* ================ phase 4 filter_and_sort_kernels ================ */ ++ ++/* Get the edge probability information of each basic block in the loop. */ ++ ++float ++get_edge_prob (edge e, float minimum) ++{ ++ float fvalue = 0; ++ ++ profile_probability probability = e->probability; ++ if (probability.initialized_p ()) ++ { ++ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); ++ if (fvalue < minimum && probability.to_reg_br_prob_base ()) ++ fvalue = minimum; ++ } ++ return fvalue; ++} ++ ++/* Get the next bb with a high branch probability. */ ++ ++basic_block ++next_high_probability_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ return NULL; ++ ++ /* Limit the minimum probability value. */ ++ const float MINNUM_PROB = 0.00001f; ++ float minimum = MINNUM_PROB; ++ ++ gimple *stmt = last_stmt (bb); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ float true_edge_prob = get_edge_prob (true_edge, minimum); ++ float false_edge_prob = get_edge_prob (false_edge, minimum); ++ /* If the content of the branch does not include the candidate ++ kernel, the branch probability may not be limited. */ ++ /* The edge_prob may have precision error during static prediction, ++ so we need to relax the limit before comparison. */ ++ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) ++ return true_edge->dest; ++ else if ((false_edge_prob ++ >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, false_edge->dest)) ++ return false_edge->dest; ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No high probability bb:"); ++ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", ++ bb->index, true_edge_prob, false_edge_prob); ++ } ++ return NULL; ++ } ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ return e->dest; ++ } ++ return NULL; ++} ++ ++ ++/* Dump loop header bb. 
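++ Each kernel is printed as loop_num(header_bb_index).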
*/ ++ ++void ++dump_loop_headers (const char *name, std::vector<class loop *> &loops) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } ++} ++ ++/* Combine and sort candidate loops. */ ++ ++bool ++filter_and_sort_kernels (std::vector<class loop *> &sorted_kernels, ++ std::vector<class loop *> &kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set<basic_block> end_bb; ++ std::list<basic_block> walked_header_bb; /* Used to record nested loops. */ ++ std::set<int> walked_non_header_bb_idx; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ if (kernels[i]->inner == NULL) ++ end_bb.insert (kernels[i]->header); ++ } ++ ++ dump_loop_headers ("kernels", kernels); ++ ++ if (!param_filter_kernels) ++ { ++ for (std::vector<class loop *>::iterator it = kernels.begin (); ++ it != kernels.end (); ++it) ++ sorted_kernels.push_back (*it); ++ } ++ else ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ while (bb) ++ { ++ if (bb == NULL) ++ return false; ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ /* bb is not the head of the loop, go to the next. */ ++ if (bb != bb->loop_father->header) ++ { ++ if (walked_non_header_bb_idx.count (bb->index)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find same-loop cycle. " "Abort filtering process.\n"); ++ return false; ++ } ++ walked_non_header_bb_idx.insert (bb->index); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ ++ /* bb is the head of the loop. */ ++ if (walked_header_bb.empty () || bb != walked_header_bb.back ()) ++ { ++ if (end_bb.count (bb)) ++ { ++ sorted_kernels.push_back (bb->loop_father); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ if (loop_outer (bb->loop_father) != NULL ++ && get_loop_exit_edges (bb->loop_father).length () != 1) ++ return false; ++ walked_header_bb.push_back (bb); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ else ++ { ++ walked_header_bb.pop_back (); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ } ++ } ++ ++ dump_loop_headers ("sorted_kernels", sorted_kernels); ++ return true; ++} ++ ++/* Check whether the given bb is null. */ ++ ++bool ++check_null_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unexpected error at null bb.\n"); ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the loop father of the given bb is null. */ ++ ++bool ++check_null_loop_father (basic_block bb) ++{ ++ if (check_null_bb (bb)) ++ return true; ++ ++ if (bb->loop_father == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "bb %d's loop father is null.\n", bb->index); ++ return true; ++ } ++ return false; ++} ++ ++/* States for bb during path traversal. */ ++ ++enum bb_traversal_state ++{ ++ NOT_TRAVERSED = 0, ++ UNDER_TRAVERSAL, ++ FULLY_TRAVERSED ++}; ++ ++/* Detect abnormal revisit for bb during path traversal where bb is ++ 1) fully traversed, ++ 2) non-loop-header bb but currently under traversal.
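++ Case 2) indicates a cycle that does not pass through a loop header; the ++ offending edge is recorded in UNUSED_EDGES so that the later path-weight ++ update can skip it.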
*/ ++ ++bool ++revisit_bb_abnormal_p (basic_block bb, std::vector &bb_visited, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* If the header bb has been already fully traversed, early exit ++ the function. */ ++ if (bb_visited[bb->index] == FULLY_TRAVERSED) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Already visited bb index %d. Abort.\n", ++ bb->index); ++ return true; ++ } ++ ++ /* If we revisit a non-header bb during next-bb traversal, we detect ++ an inner-loop cycle and dump warning info. Record this abnormal edge ++ in `unused_edges` for special treatment in path weight update. */ ++ if (!header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Warning: Find cycle at bb index %d. Abort.\n", ++ bb->index); ++ unused_edges.insert (std::make_pair (src_bb_idx, bb->index)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Check successor bb through edge e. Return true if successor bb is NULL or ++ out of loop. */ ++ ++bool ++check_succ_bb_abnormal_p (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb connected to src bb %d.\n", bb->index); ++ ++ return true; ++ } ++ ++ /* If bb is within one loop and the edge is pointing to the ++ outer loop, skip edge processing until a backedge to header ++ bb. `loop->num = 0` represents function body. */ ++ if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find edges to the outer loop at bb " ++ "index %d to bb index %d. Abort.\n", ++ bb->index, e->dest->index); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Criteria for retrieving the next bb in modified control-flow graph, which ++ creates a topological order for the bb traversal. */ ++ ++void ++get_next_toposort_bb (basic_block bb, std::vector &bb_visited, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* 1) Before bb returns to the loop header, bb will not go to the outer loop. ++ 2) After returning to the loop header, traverse all exit_bbs. ++ NEXT STEP: ++ 1) If goto jumps out of 2 loops, goto has to traverse smaller jumps first. ++ 2) If path length is the same => choose higher depth traversal path. */ ++ if (check_null_bb (bb) || check_null_loop_father (bb)) ++ return; ++ ++ /* Find last bb of function. */ ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ return; ++ ++ if (revisit_bb_abnormal_p (bb, bb_visited, header_bb_idx_set, unused_edges, ++ src_bb_idx)) ++ return; ++ ++ /* If we revisit the header bb of a loop, traverse all exit bbs. */ ++ if (header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ unsigned i; ++ edge e; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ ++ if (exits.length () > 1 && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Detect multiple exits at loop %d.\n", ++ bb->loop_father->num); ++ ++ FOR_EACH_VEC_ELT (exits, i, e) ++ { ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, src_bb_idx); ++ } ++ return; ++ } ++ ++ /* Post-order traversal for normal bb. 
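++ Successors are visited first, and bb is appended to BB_TOPO_ORDER only ++ after all of them are finished, so reading the list from the back yields ++ a topological order of the pruned graph.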
*/ ++ bb_visited[bb->index] = UNDER_TRAVERSAL; ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_succ_bb_abnormal_p (bb, e)) ++ continue; ++ ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, bb->index); ++ } ++ ++ /* bb is marked as fully traversed and all its descendents have been ++ fully traversed due to post-order traversal. */ ++ bb_visited[bb->index] = FULLY_TRAVERSED; ++ bb_topo_order.push_back (bb); ++} ++ ++/* A struct that represents the longest path weight at each bb. */ ++ ++struct weight ++{ ++ /* Longest path weight at current bb. */ ++ gcov_type bb_count; ++ ++ /* Prev bb from the current longest path. */ ++ int prev_bb_idx; ++}; ++ ++/* A helper function for checking whether overflow will occur when adding two ++ gcov_type weights. */ ++ ++bool ++check_weight_overflow (gcov_type a, gcov_type b) ++{ ++ if ((a > 0 && b > INT64_MAX - a) || (a < 0 && b < INT64_MIN - a)) ++ return true; ++ ++ return false; ++} ++ ++/* A helper function that update the weight of the current longest path to ++ bb_idx_dst and a new path pointing from bb_idx_src to bb_idx_dst. */ ++ ++void ++update_path_weight (std::vector &bb_weights, int bb_idx_src, ++ int bb_idx_dst, gcov_type weight_dst) ++{ ++ if (check_weight_overflow (bb_weights[bb_idx_src].bb_count, weight_dst) ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Path weight overflow at src bb %d " ++ "and dest bb %d.\n", ++ bb_idx_src, bb_idx_dst); ++ } ++ if (bb_weights[bb_idx_dst].bb_count ++ < bb_weights[bb_idx_src].bb_count + weight_dst) ++ { ++ bb_weights[bb_idx_dst].bb_count ++ = bb_weights[bb_idx_src].bb_count + weight_dst; ++ bb_weights[bb_idx_dst].prev_bb_idx = bb_idx_src; ++ } ++} ++ ++/* Check whether the required bb/loop info for path update is null. */ ++ ++bool ++check_null_info_in_path_update (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for edge connected " ++ "to src bb %d.\n", ++ bb->index); ++ return true; ++ } ++ ++ if (check_null_loop_father (bb) || check_null_loop_father (e->dest)) ++ return true; ++ ++ return false; ++} ++ ++/* Update path weight to loop exit bbs where the current source bb is connected ++ to header bb using a backedge. */ ++ ++void ++update_backedge_path_weight (std::vector &bb_weights, basic_block bb, ++ const std::set > &unused_edges) ++{ ++ unsigned i; ++ edge e_exit; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ FOR_EACH_VEC_ELT (exits, i, e_exit) ++ { ++ if (check_null_bb (e_exit->dest)) ++ { ++ if (e_exit->src != NULL && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for exiting edge " ++ "connected to src bb %d.\n", ++ e_exit->src->index); ++ continue; ++ } ++ ++ if (unused_edges.count (std::make_pair (bb->index, e_exit->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ update_path_weight (bb_weights, bb->index, e_exit->dest->index, ++ e_exit->dest->count.to_gcov_type ()); ++ } ++} ++ ++/* Update the longest length of the path through control flow graph. 
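++ This is the standard longest-path dynamic programming on a DAG: bbs are ++ processed in topological order, so bb_weights[src] is final before any ++ successor dst is relaxed via bb_count[dst] = MAX (bb_count[dst], ++ bb_count[src] + count (dst)).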
*/ ++ ++void ++update_max_length_of_path (std::vector &bb_weights, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ const std::set > &unused_edges) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start update weight traversal:\n"); ++ ++ while (!bb_topo_order.empty ()) ++ { ++ basic_block bb = bb_topo_order.back (); ++ bb_topo_order.pop_back (); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_null_info_in_path_update (bb, e)) ++ continue; ++ ++ if (unused_edges.count (std::make_pair (bb->index, e->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ else if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ /* Outer-loop edge case. */ ++ continue; ++ } ++ else if (header_bb_idx_set.count (e->dest->index) ++ && bb->loop_father == e->dest->loop_father) ++ { ++ /* Backedge case. */ ++ update_backedge_path_weight (bb_weights, bb, unused_edges); ++ } ++ else ++ { ++ /* Normal edge case. */ ++ update_path_weight (bb_weights, bb->index, e->dest->index, ++ e->dest->count.to_gcov_type ()); ++ } ++ } ++ } ++} ++ ++/* Collect all header bb of loops in the function beforehand. */ ++ ++void ++collect_header_bb_for_fn (std::set &header_bb_idx_set) ++{ ++ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) ++ header_bb_idx_set.insert (loop->header->index); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck header bbs:\n"); ++ for (std::set::iterator it = header_bb_idx_set.begin (); ++ it != header_bb_idx_set.end (); ++it) ++ fprintf (dump_file, "%d ", *it); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Record loop executing order and bb high-executing path. */ ++ ++void ++record_high_execution_path (std::vector &sorted_kernel, ++ std::vector &bb_path, int bb_num_max) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPATH FOR %s: ", get_name (cfun->decl)); ++ ++ std::set loop_set; ++ for (int i = bb_path.size() - 1; i >= 0; --i) ++ { ++ int bb_idx = bb_path[i]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb_idx); ++ gcc_assert (bb_idx < bb_num_max); ++ ++ class loop *loop = BASIC_BLOCK_FOR_FN (cfun, bb_idx)->loop_father; ++ if (!loop_set.count (loop->num)) ++ { ++ loop_set.insert (loop->num); ++ sorted_kernel.push_back (loop); ++ } ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n"); ++} ++ ++/* Combine and sort candidate loops using feedback information. */ ++ ++bool ++filter_and_sort_kernels_feedback (std::vector &sorted_kernel, ++ std::set &bb_pathset) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set header_bb_idx_set; ++ std::list bb_topo_order; ++ ++ /* Quoted from GCC internal, Chapter 15.1, "the index for any block should ++ never be greater than `last_basic_block`." Therefore, we use this ++ variable for retrieving the max bb index of a function. */ ++ /* Since the pass does not add/remove/merge basic blocks until Phase 6 ++ and previous passes will update ssa accordingly, we do not need to ++ `compact_blocks` to update bb indices currently. 
*/ ++ int bb_num_max = last_basic_block_for_fn (cfun) + 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nMaximal number of possible bbs in the " ++ "function: %d\n", ++ bb_num_max); ++ std::vector bb_visited = std::vector(bb_num_max, 0); ++ ++ collect_header_bb_for_fn (header_bb_idx_set); ++ basic_block bb_start = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ /* Step 1: Get topological order of bb during traversal. */ ++ std::set > unused_edges; ++ get_next_toposort_bb (bb_start, bb_visited, bb_topo_order, header_bb_idx_set, ++ unused_edges, -1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck bbs in topological order:\n"); ++ for (std::list::iterator it = bb_topo_order.begin (); ++ it != bb_topo_order.end (); ++it) ++ fprintf (dump_file, "%d ", (*it)->index); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Step 2: Update weights of nodes and path. */ ++ weight weight_init = {-1, -1}; ++ std::vector bb_weights = std::vector(bb_num_max, weight_init); ++ bb_weights[0].bb_count = 0; /* ENTRY bb has count 0 and prev bb as -1. */ ++ update_max_length_of_path (bb_weights, bb_topo_order, header_bb_idx_set, ++ unused_edges); ++ ++ /* Step 3: Backtrack a path from EXIT bb to ENTRY bb. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nCheck counts for each bb:\n"); ++ ++ std::vector bb_path; ++ int tmp_bb_idx = 1; ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ while (tmp_bb_idx > 0 && tmp_bb_idx < bb_num_max) ++ { ++ if (bb_pathset.count (tmp_bb_idx)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf(dump_file, "ERROR: already seen bb index %d\n", ++ tmp_bb_idx); ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d: %ld, ", tmp_bb_idx, ++ bb_weights[tmp_bb_idx].bb_count); ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ } ++ /* It is possible that the function exit code is wrapped around as an ++ variable, and thus, EXIT_BB in cfg is not connected to any bb. */ ++ if (tmp_bb_idx < 0 || tmp_bb_idx >= bb_num_max) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled scenario at backtracking highly " ++ "executed path with tmp_bb_idx %d", ++ tmp_bb_idx); ++ } ++ return false; ++ } ++ ++ record_high_execution_path (sorted_kernel, bb_path, bb_num_max); ++ ++ return true; ++} ++ ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; ++ ++/* Memory reference group, different reference of the same variable. */ ++ ++struct ref_group ++{ ++ /* source variables. */ ++ tree var; ++ ++ /* variable size, Unit: MB. */ ++ double var_size; ++ ++ /* first ref for insert hint. */ ++ data_ref first_use; ++ ++ /* first ref with the highest-order CALC. */ ++ data_ref first_calc_use; ++ ++ /* reuse scores of variables. */ ++ float reuse_level; ++ ++ /* method of calculating the var size. */ ++ calc_type calc_by; ++ ++ /* memory reference index for specific variable. */ ++ unsigned int mem_ref_index; ++ ++ /* variable dimension. 
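++ (the maximum loop_bounds.size () observed among the group's mem_refs).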
*/ ++ unsigned int dim; ++ ++ /* True if first_calc_use's footprint replaces that of first_use. */ ++ unsigned int transfer_ft; ++ ++ /* Accessing Reference Records in Different Modes (key_index): ++ 000: write, random, non-parallel ++ 001: write, random, parallel ++ 010: write, regular, non-parallel ++ 011: write, regular, parallel ++ 100: read, random, non-parallel ++ 101: read, random, parallel ++ 110: read, regular, non-parallel ++ 111: read, regular, parallel ++ */ ++ std::map > ref_use; ++ ++ /* scores for different memory references. */ ++ std::vector ref_scores; ++ ++ ref_group () ++ { ++ var = NULL_TREE; ++ var_size = 0; ++ reuse_level = 0; ++ calc_by = UNHANDLE_CALC; ++ mem_ref_index = 0; ++ dim = 1; ++ transfer_ft = 0; ++ } ++}; ++ ++/* Get the integer part for log(x) with the given base. */ ++ ++static unsigned int ++flog (float x, float base) ++{ ++ unsigned int res = 0; ++ while (x >= base) ++ { ++ ++res; ++ x /= base; ++ } ++ return res; ++} ++ ++/* Calculate reuse time for a memory reference in ref_group. */ ++ ++float ++calculate_reuse_times (std::vector &mem_refs, std::set &loop_set, ++ std::set &bb_set, unsigned int var_dim) ++{ ++ const float SAME_BB_REUSE_WEIGHT = 0.1; ++ const float SAME_LOOP_REUSE_WEIGHT = 0.5; ++ const float NORMAL_REUSE_WEIGHT = 1.; ++ ++ float reuse_time_sum = 0.; ++ for (std::vector::iterator it = mem_refs.begin (); ++ it != mem_refs.end (); ++it) ++ { ++ const data_ref &mem_ref = *it; ++ float reuse_time = 0.; ++ if (bb_set.count (mem_ref.bb_idx)) ++ { ++ /* If the two mem_ref belong to the same bb, the new reuse ++ weight will not exceed 0.1 divided by the mem_ref mode group ++ size. ++ NEXT STEP: The following equation may hold and cause commutative ++ property of read and write op not holding: ++ write + (reused) read != read + (reused) write. ++ However, it seems that write mem_ref is always before read mem_ref, ++ so the above comparison does not show up in calculation due to ++ intrinsic in-order property of tree map, but this condition is ++ quite fragile anyway. */ ++ reuse_time = SAME_BB_REUSE_WEIGHT / mem_refs.size (); ++ } ++ else ++ { ++ bb_set.insert (mem_ref.bb_idx); ++ if (loop_set.count (mem_ref.loop_idx)) ++ { ++ /* If the mem_ref belongs to a loop where any other mem_ref is in, ++ the new reuse weight will be 0.5. */ ++ reuse_time = SAME_LOOP_REUSE_WEIGHT; ++ } ++ else ++ { ++ /* If the mem_ref is reused but not in the same group with any ++ other mem_ref, the new reuse weight will be 1. */ ++ loop_set.insert (mem_ref.loop_idx); ++ reuse_time = NORMAL_REUSE_WEIGHT; ++ } ++ } ++ unsigned int used_dim = std::min (mem_ref.loop_depth, var_dim); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "used_dim : %u, loop_depth : %u\n", used_dim, ++ mem_ref.loop_depth); ++ unsigned int power = flog (std::max (0u, mem_ref.loop_depth - used_dim) ++ + 2, 2.); ++ reuse_time_sum += reuse_time * (used_dim * used_dim / 2.) * (power); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "(%f * (%u * %u / 2) * (%u) = %f\n", ++ reuse_time, used_dim, used_dim, power, ++ reuse_time * (used_dim * used_dim / 2.) * (power)); ++ } ++ return reuse_time_sum; ++} ++ ++/* Calculate reuse level. 
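++ For each access mode the contribution is parallel * reuse_times * ++ regular; the sum is then multiplied by WRITE_COST if any write is ++ present, and by WITHIN_CACHE_SIZE_COST if the footprint lies between the ++ L2 size and a quarter of the per-core LLC capacity. For illustration, ++ an extra reference in an already-seen loop but a new bb, with ++ used_dim = 2 and loop_depth = 2, adds 0.5 * (2 * 2 / 2) * flog (0 + 2, 2) ++ = 1.0 to its reuse time.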
*/ ++ ++float ++calculate_reuse_level (std::map > &var_use, ++ unsigned int var_dim, double var_size) ++{ ++ const float VAR_SIZE_CACHE_CAPACITY = 1 / 4.; ++ const int WITHIN_CACHE_SIZE_COST = 4; ++ const float BYTE_CONVERT_RATIO = 1024.; ++ ++ float level = 0.; ++ std::set loop_set; ++ std::set bb_set; ++ bool has_write_op = false; ++ for (std::map >::iterator it = var_use.begin (); ++ it != var_use.end (); ++it) ++ { ++ unsigned int parallel = 1; ++ unsigned int regular = 1; ++ ++ if ((*it).second[0].parallel_p) ++ parallel = PARALLEL_NUM; ++ if (!(*it).second[0].regular_p) ++ regular = INDIRECT_ACCESS_VALUE; ++ if (!(*it).second[0].read_p) ++ has_write_op = true; ++ ++ /* In serial reuse, we will later check whether they are in the ++ same cacheline. If yes, delete the reuse. For details, see the ++ reuse analysis of prefetching and eliminate redundancy. */ ++ float reuse_times = calculate_reuse_times ((*it).second, loop_set, ++ bb_set, var_dim); ++ float add = parallel * reuse_times * regular; ++ level += add; ++ if (add && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d : %d * %f * %d = %f\n", ++ (*it).first, parallel, reuse_times, regular, add); ++ } ++ ++ bool within_llc_size = var_size > param_l2_cache_size / BYTE_CONVERT_RATIO ++ && var_size < VAR_SIZE_CACHE_CAPACITY ++ * param_llc_capacity_per_core; ++ ++ float final_level = has_write_op ? (level * WRITE_COST) : level; ++ final_level = within_llc_size ? (final_level * WITHIN_CACHE_SIZE_COST) ++ : final_level; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "final level : %d * %f * %d = %f\n", ++ has_write_op ? WRITE_COST : 1, level, ++ within_llc_size ? WITHIN_CACHE_SIZE_COST : 1, final_level); ++ return final_level; ++} ++ ++/* Comparison of reference reuse level. */ ++ ++bool ++ref_group_reuse_cmp (const ref_group &a, const ref_group &b) ++{ ++ if (a.reuse_level != b.reuse_level) ++ return a.reuse_level > b.reuse_level; ++ else ++ return get_name (a.var) < get_name (b.var); ++} ++ ++/* Dump key information of reference group and memory access for llc hint. */ ++ ++void ++dump_key_info_for_llc_hint (std::vector &ref_groups) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nLLC hint info:\n"); ++ fprintf (dump_file, "rank\tvar\t(lineno, direct, vectorized, write)\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ data_ref &mem_ref = ref_groups[i].first_use; ++ fprintf (dump_file, "\t(%d, %u, %u, %u)", ++ expand_location (mem_ref.stmt->location).line, ++ mem_ref.regular_p, mem_ref.vectorize_p, 1 - mem_ref.read_p); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Sort reference groups. 
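++ Groups are ranked by descending reuse_level (ties broken by variable ++ name) so that issue_llc_hint can simply take the top param_issue_topn ++ entries.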
*/ ++ ++void ++sort_ref_groups (std::vector &ref_groups, ++ std::map &ref_groups_map) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); ++ ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use, ++ (*it).second.dim, ++ (*it).second.var_size); ++ ref_groups.push_back ((*it).second); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); ++ fprintf (dump_file, " : %f\n\n", (*it).second.reuse_level); ++ } ++ } ++ ++ std::sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nsorted ref_groups:\n"); ++ fprintf (dump_file, "rank\tvar\t(data_size, dim, num_of_mem_ref, " ++ "need_tmp_name): reuse_level_score\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0; ++ fprintf (dump_file, "\t(%lf, %u, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].dim, ref_groups[i].ref_scores.size (), ++ need_tmp_name); ++ fprintf (dump_file, " : %f\n", ref_groups[i].reuse_level); ++ } ++ fprintf (dump_file, "\n"); ++ ++ fprintf (dump_file, "first_use:\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ fprintf (dump_file, " : "); ++ if (!ref_groups[i].first_use.vectorize_p) ++ print_generic_expr (dump_file, ref_groups[i].first_use.ref, ++ TDF_SLIM); ++ else ++ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ dump_key_info_for_llc_hint (ref_groups); ++} ++ ++/* Attributes of variable data. */ ++ ++enum data_attribute ++{ ++ DA_PARALLEL = 0, ++ DA_REGULAR, ++ DA_READ ++}; ++ ++/* Record memory reference by use mode. ++ If the reference group is not found, create a group. */ ++ ++void ++record_mem_ref (std::map &ref_groups, data_ref &mem_ref) ++{ ++ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) ++ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); ++ ++ if (!ref_groups.count (mem_ref.var)) ++ { ++ ref_group ref_group; ++ ref_group.var = mem_ref.var; ++ ref_group.first_use = mem_ref; ++ ref_group.first_calc_use = mem_ref; ++ ref_groups[mem_ref.var] = ref_group; ++ } ++ ++ /* Ref_groups' calc_by reflects the highest order of calc_by that can be ++ achieved by all mem_ref of ref_groups. The first mem_ref that achieves ++ this order is defined to be `first_calc_use`. Later after sorting ++ mem_refs, calc_by will be replaced by the calc_by of `first_use`, and ++ even by the calc_by of `first_calc_use`. 
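++ For example, when the sorted first_use mem_ref stays UNHANDLE_CALC but ++ another mem_ref of the same variable is STATIC_CALC, enabling ++ param_transfer_footprint lets the group borrow the latter's statically ++ computed footprint.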
*/ ++ if (mem_ref.calc_by > ref_groups[mem_ref.var].calc_by) ++ { ++ ref_groups[mem_ref.var].calc_by = mem_ref.calc_by; ++ ref_groups[mem_ref.var].first_calc_use = mem_ref; ++ } ++ ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].dim = std::max (ref_groups[mem_ref.var].dim, ++ (unsigned int) mem_ref.loop_bounds.size ()); ++ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); ++ ++ ref_score ref_level = { mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ expand_location (mem_ref.stmt->location).line }; ++ ref_groups[mem_ref.var].ref_scores.push_back (ref_level); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "recorded in: "); ++ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); ++ fprintf (dump_file, ":%d:%ld\n", index, ++ ref_groups[mem_ref.var].ref_use[index].size () - 1); ++ ++ fprintf (dump_file, "base: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ ++ fprintf (dump_file, ", index: "); ++ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); ++ ++ fprintf (dump_file, ", step: "); ++ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.step)); ++ else ++ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); ++ ++ fprintf (dump_file, ", offset: "); ++ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.offset)); ++ else ++ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); ++ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); ++ ++ fprintf (dump_file, ", size: %lf", mem_ref.data_size); ++ fprintf (dump_file, "\n\n"); ++ } ++} ++ ++/* Rank data reference index level. */ ++ ++bool ++best_insert_cmp (const ref_score &a, const ref_score &b) ++{ ++ /* NEXT STEP: We can also calculate gap using static/feedback info inferred ++ from historical maximum bb count: ++ gap = hist_max_bb_ct / (alpha * max (a.bb_ct, b.bb_ct)) + 1. ++ Also, bb count needs to be smoothed and scaled as divisor can be 0. ++ history maximum bb count can be obtained in Phase 4. */ ++ const float gap = 1; ++ if (a.d_ref.loop_depth != b.d_ref.loop_depth) ++ return a.d_ref.loop_depth > b.d_ref.loop_depth; ++ else if (a.d_ref.regular_p != b.d_ref.regular_p) ++ return a.d_ref.regular_p > b.d_ref.regular_p; ++ else if (abs (double (std::max (a.bb_count, b.bb_count) + 1) ++ / double (std::min (a.bb_count, b.bb_count) + 1) - 1) > gap) ++ return a.bb_count > b.bb_count; ++ else if (a.line != b.line) ++ return a.line < b.line; ++ else if (a.d_ref.read_p != b.d_ref.read_p) ++ return a.d_ref.read_p < b.d_ref.read_p; ++ else ++ return a.d_ref.vectorize_p > b.d_ref.vectorize_p; ++} ++ ++/* Sort data reference index level within one reference group in non-decreasing ++ order of the customized sorting scheme. */ ++ ++void ++sort_mem_ref_in_ref_group (std::map &ref_groups_map) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nsorted data_references:\n"); ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ ref_group &curr_ref_group = (*it).second; ++ std::vector &ref_scores = curr_ref_group.ref_scores; ++ std::stable_sort (ref_scores.begin (), ref_scores.end (), ++ best_insert_cmp); ++ /* Update ref_group's first_use and calc_by with the first mem_ref after ++ sorting. 
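++ first_use is thus the insertion point preferred by best_insert_cmp: ++ deeper loops first, then regular accesses, hotter bbs, earlier source ++ lines, writes before reads, and vectorized references as the final ++ tie-breaker.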
*/ ++ curr_ref_group.first_use = curr_ref_group.ref_scores[0].d_ref; ++ curr_ref_group.calc_by = curr_ref_group.first_use.calc_by; ++ ++ /* When transferring footprint is enabled, it is allowed to transfer ++ the statically-calculated footprint of a mem_ref from the same ++ ref_group to `first_use` mem_ref. */ ++ if (param_transfer_footprint ++ && curr_ref_group.first_use.calc_by == UNHANDLE_CALC) ++ { ++ if (curr_ref_group.first_calc_use.calc_by > RUNTIME_CALC) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, "\nfirst_use: "); ++ print_gimple_stmt (dump_file, curr_ref_group.first_use.stmt, ++ 0, TDF_LINENO); ++ fprintf (dump_file, "first_calc_use: "); ++ print_gimple_stmt (dump_file, ++ curr_ref_group.first_calc_use.stmt, ++ 0, TDF_LINENO); ++ } ++ ++ curr_ref_group.calc_by = curr_ref_group.first_calc_use.calc_by; ++ curr_ref_group.transfer_ft = 1; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, ": cannot transfer footprint to " ++ "first use mem_ref.\n"); ++ } ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, " : %lu\n", ref_scores.size ()); ++ for (unsigned int i = 0; i < ref_scores.size (); ++i) ++ { ++ fprintf (dump_file, "mem_ref_index %u: ", i); ++ print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, ++ TDF_LINENO); ++ fprintf (dump_file, "bb-%d ", ++ ref_scores[i].d_ref.stmt->bb->index); ++ fprintf (dump_file, "count %ld\n", ref_scores[i].bb_count); ++ } ++ fprintf (dump_file, "\n\n"); ++ } ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++bool ++record_and_sort_ref_groups (std::vector &ref_groups, ++ std::vector &kernels, ++ std::map > &loop_refs, ++ std::set bb_pathset) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); ++ ++ std::map ref_groups_map; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ data_ref &mem_ref = loop_refs[loop][j]; ++ if (mem_ref.trace_status_p) ++ { ++ if (!param_filter_mode || (param_filter_mode ++ && bb_pathset.count (mem_ref.stmt->bb->index))) ++ record_mem_ref (ref_groups_map, mem_ref); ++ } ++ } ++ } ++ ++ /* Sort mem_ref within ref_group by local count and update first_use's ++ data_ref, stable sort. */ ++ sort_mem_ref_in_ref_group (ref_groups_map); ++ sort_ref_groups (ref_groups, ref_groups_map); ++ ++ return ref_groups.size () > 0; ++} ++ ++/* ================ phase 6 issue_llc_hint ================ */ ++ ++/* Issue vectorized mask prefetch gimple. 
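++ E.g., for vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3) this emits ++ .MASK_PREFETCH (_2 + dist, 32B, loop_mask_3, vect__1.1, prfop) right ++ after the statement, where dist is param_prefetch_offset elements.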
*/ ++ ++void ++issue_mask_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd.\n"); ++ ++ /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); ++ .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); ++ */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree scale = gimple_call_arg (stmt, 1); ++ tree final_mask = gimple_call_arg (stmt, 2); ++ tree target = NULL_TREE; ++ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) ++ target = gimple_call_arg (stmt, 3); ++ else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) ++ target = gimple_call_lhs (stmt); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) ++ /* for simulation, 4: PLDL3KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ else if (param_llc_level == 4) ++ /* 6: PLDL4KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ /* target: vector_type - XXX_type. */ ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, 5, addr, scale, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue vectorized mask gather prefetch gimple. */ ++ ++void ++issue_mask_gather_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); ++ ++ /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... }, ++ loop_mask_4); */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree vec_offset = gimple_call_arg (stmt, 1); ++ tree scale = gimple_call_arg (stmt, 2); ++ tree zero = gimple_call_arg (stmt, 3); ++ tree final_mask = gimple_call_arg (stmt, 4); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) // for simulation ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP ++ else if (param_llc_level == 4) ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ tree target = gimple_call_lhs (stmt); ++ /* add offset. 
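++ The prefetch address is advanced param_prefetch_offset elements ahead of ++ the gather's base pointer.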
*/ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_GATHER_PREFETCH, 7, addr, ++ vec_offset, scale, zero, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue builtin prefetch gimple. */ ++ ++void ++issue_builtin_prefetch (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert prfm.\n"); ++ /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ ++ gimple *stmt = mem_ref.stmt; ++ tree ref = mem_ref.ref; ++ ++ tree scale = mem_ref.step; ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (scale == NULL_TREE) ++ { ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). */ ++ scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ if (scale == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " ++ "variable. Stop builtin_prefetch.\n\n"); ++ return; ++ } ++ } ++ ++ tree addr = build_fold_addr_expr_with_type (ref, ptr_type_node); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), ++ true, NULL, true, GSI_SAME_STMT); ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset ++ * tree_to_uhwi (scale); ++ ++ addr = fold_build_pointer_plus_hwi (addr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ /* __builtin_prefetch (_68, 0, 1); ++ 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality ++ (high means strong locality) */ ++ gcall *call = NULL; ++ if (param_llc_level == 3) ++ { ++ /* for simulation. ++ BUILT_IN_PREFETCH (addr, rw, locality). */ ++ call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, addr, integer_zero_node, integer_one_node); ++ } ++ else if (param_llc_level == 4) ++ { ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ call = gimple_build_call ( ++ builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), ++ 3, addr, integer_zero_node, prfop); ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Static form insertion and issue instruction. We may check the ++ determination of the ARM SVE architecture before SVE hint insertion. 
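++ Vectorized first_use references are issued through the mask (gather) ++ prefetch IFNs above, while scalar references fall back to ++ __builtin_prefetch (LLC level 3) or __builtin_prefetch_full (level 4).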
*/ ++ ++void ++static_issue (std::vector &ref_groups, int num_issue_var) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue\n"); ++ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref mem_ref = ref_groups[i].first_use; ++ if (mem_ref.vectorize_p) ++ { ++ enum internal_fn ifn_code = gimple_call_internal_fn (mem_ref.stmt); ++ if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) ++ issue_mask_prefetch (mem_ref.stmt); ++ else if (ifn_code == IFN_MASK_GATHER_LOAD) ++ issue_mask_gather_prefetch (mem_ref.stmt); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "other vectorized internal function\n"); ++ } ++ else ++ issue_builtin_prefetch (mem_ref); ++ } ++} ++ ++/* Check whether all loop bounds (niters) used for calculating the footprints ++ of previously-executed ref_groups are defined in a dominated bb to the ++ currentbranch bb, where the conditional expression requires the loop bound ++ info. */ ++ ++bool ++check_def_use_chain (std::vector &ref_groups, ++ basic_block &branch_header_bb, ++ std::vector &ref_group_idx) ++{ ++ for (std::vector::iterator it = ref_group_idx.begin (); ++ it != ref_group_idx.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (niters); ++ basic_block def_bb = gimple_bb (def_stmt); ++ /* Check dominator relationship of def bb and branch bb. */ ++ /* Case 1: Check whether the def bb is the single predecessor block ++ of header bb. */ ++ if (single_pred_p (branch_header_bb)) ++ { ++ basic_block branch_bb_prev = single_pred (branch_header_bb); ++ if (branch_bb_prev->index == def_bb->index) ++ continue; ++ } ++ /* Case 2: Check whether the branch bb is dominated by the def ++ bb. */ ++ if (!dominated_by_p (CDI_DOMINATORS, branch_header_bb, def_bb)) ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Generate the stmts for calculating the size. Later we will consider nested ++ multi-branches scenarios and check more information of niters when it is ++ a COND_EXPR. */ ++ ++tree ++calc_stmts_gen (std::vector &ref_groups, ++ gimple_seq &cond_expr_stmt_list, ++ basic_block branch_header_bb, ++ std::vector &ref_group_idx_curr, ++ std::vector &ref_group_idx_prev, tree &cumul_size) ++{ ++ /* Check whether the bbs of def stmt for footprint loop bounds dominates ++ the bb of new runtime branching conditional. */ ++ if (!check_def_use_chain (ref_groups, branch_header_bb, ref_group_idx_prev)) ++ return NULL_TREE; ++ ++ /* Accumulated allocation size. */ ++ for (std::vector::iterator it = ref_group_idx_curr.begin (); ++ it != ref_group_idx_curr.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ tree var = mem_ref.var; ++ tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). 
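++ Fall back to a conservative 1-byte unit so that the footprint remains ++ computable.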
*/ ++ if (unit == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Cannot detect size unit " ++ "(use 1 byte) for variable %s: ", ++ get_name (var)); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ unit = size_one_node; ++ } ++ tree size = NULL_TREE; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ ++ /* COND_EXPR. */ ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ if (size == NULL_TREE) ++ { ++ size = niters; ++ } else { ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, ++ size); ++ } ++ } ++ unit = build1 (NOP_EXPR, TREE_TYPE (size), unit); ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (size), size, unit); ++ size = build1 (FLOAT_EXPR, double_type_node, size); ++ cumul_size = fold_build2 (PLUS_EXPR, double_type_node, cumul_size, ++ size); ++ ref_group_idx_prev.push_back (*it); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "cumul_size = "); ++ print_generic_expr (dump_file, cumul_size, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ /* Create a stmt list for size calculation. */ ++ tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); ++ div = build1 (NOP_EXPR, double_type_node, div); ++ tree total_size = fold_build2 (RDIV_EXPR, double_type_node, cumul_size, div); ++ ++ tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), ++ param_llc_capacity_per_core / 2); ++ threshold = build_real_from_int_cst (double_type_node, threshold); ++ tree cond_expr = fold_build2 (LE_EXPR, boolean_type_node, total_size, ++ threshold); ++ ++ /* Convert cond_expr to stmt list. */ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, is_gimple_condexpr, ++ NULL_TREE); ++ return cond_expr; ++} ++ ++/* Retrieve the least number of loops that cover all target mem_refs. ++ Try to merge loops that the mem_refs reside to a common superloop and ++ maintain a worklist which relates NEED-TO-COPY loops with the target mem ++ refs inside using the following criteria: ++ 1) If loop A is a superloop of loop B in the worklist, replace loop B with ++ loop A in the worklist, and attach all target mem_refs of loop B, ++ together with loop A's, to loop A. ++ 2) If loop B in the worklist is a superloop of loop A, attach loop A's ++ target mem_ref to loop B. ++ 3) If loop A is not a superloop/subloop of loop B in the worklist, replace ++ loop B with their lowest common superloop C in the worklist, and attach ++ all target mem_refs of loop A and loop B to loop C. ++ 4) If loop A and loop B's lowest common superloop is function body ++ (loop 0), stop merging and maintain loop independence. */ ++ ++void ++get_loop_worklist (std::vector &ref_groups, int num_issue_var, ++ std::map > &loop_worklist) ++{ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref &mem_ref = ref_groups[i].first_use; ++ class loop *loop_new = mem_ref.loop_bounds.front ().loop; ++ class loop *common_superloop = loop_new; ++ bool add_loop_worklist = false; ++ ++ /* Use greedy algorithm to merge loops to a common superloop that can ++ contain the current mem_refs. 
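++ E.g., starting from worklist {A: [0]} and a new mem_ref in sibling loop ++ B, criterion 3) replaces A with their lowest common superloop C, giving ++ {C: [0, 1]}; if the only common ancestor is loop 0, the loops stay ++ independent.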
*/ ++ std::map >::iterator it_tmp; ++ std::vector ref_group_idx_tmp; ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end ();) ++ { ++ class loop *loop_old = it->first; ++ common_superloop = find_common_loop (loop_new, loop_old); ++ if (common_superloop == NULL || common_superloop->num == 0) ++ { ++ /* Stop merging two loops if there is no common superloop for ++ them except function body (loop 0). */ ++ if (common_superloop != NULL ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref_group %d's loop %d has no common " ++ "superloop with existing loop %d\n", ++ i, loop_new->num, loop_old->num); ++ } ++ ++it; ++ continue; ++ } ++ ++ if (common_superloop->num == loop_old->num) ++ { ++ /* If loop_old is the superloop of loop_new, add current ++ ref_group index to loop's worklist. */ ++ loop_worklist[common_superloop].push_back (i); ++ ++it; ++ } ++ else ++ { ++ /* If loop_old is not a superloop of loop_new, replace ++ loop_old with the common superloop. */ ++ it_tmp = it; ++ ++it_tmp; ++ ref_group_idx_tmp = it->second; ++ loop_worklist.erase (it); ++ it = it_tmp; ++ add_loop_worklist = true; ++ } ++ } ++ ++ if (loop_worklist.empty () || add_loop_worklist) ++ { ++ /* Update the new common superloop in loop_worklist. */ ++ std::vector &ref_groups_tmp = loop_worklist[common_superloop]; ++ ref_groups_tmp.push_back (i); ++ for (std::vector::iterator it = ref_group_idx_tmp.begin (); ++ it != ref_group_idx_tmp.end (); ++it) ++ ref_groups_tmp.push_back (*it); ++ std::sort (ref_groups_tmp.begin (), ref_groups_tmp.end ()); ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "runtime loop list:\n"); ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end (); ++it) ++ { ++ fprintf (dump_file, "loop %d:", it->first->num); ++ for (std::vector::iterator idx_it = it->second.begin (); ++ idx_it != it->second.end (); ++idx_it) ++ { ++ fprintf (dump_file, " %d", *idx_it); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++/* Runtime form insertion and issue instruction. */ ++ ++void ++runtime_issue (std::vector &ref_groups, int num_issue_var, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "runtime issue\n"); ++ ++ /* It is possible that the loop father of some mem_ref's bb may contain the ++ loop fathers of the others. Therefore, we intend to only copy loops ++ without inclusion relationship. */ ++ std::map > loop_worklist; ++ get_loop_worklist (ref_groups, num_issue_var, loop_worklist); ++ bool get_first_ref_group = false; ++ std::vector ref_group_idx_prev; ++ ++ /* NEXT STEP: Multiple loop copies (possibly nested within one loop can cost ++ front-end bound due to branching within loop), we need to set up a ++ threshold such that we may compensate this time cost by space cost ++ in binary (copying outer loop). */ ++ tree cumul_size = build_real_from_int_cst (double_type_node, ++ integer_zero_node); ++ for (std::vector::iterator it = sorted_kernels.begin (); ++ it != sorted_kernels.end (); ++it) ++ { ++ /* Start runtime branching until finding the first ref_group's loop. ++ Skip any ref_groups if their `first_use` mem_refs are executed ++ before the mem_ref of the first ref_group. 
++
++/* Runtime form insertion and issue instruction.  */
++
++void
++runtime_issue (std::vector<ref_group> &ref_groups, int num_issue_var,
++               std::vector<class loop *> &sorted_kernels)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "runtime issue\n");
++
++  /* It is possible that the loop father of one mem_ref's bb contains the
++     loop fathers of the others.  Therefore, we intend to copy only loops
++     without an inclusion relationship.  */
++  std::map<class loop *, std::vector<int> > loop_worklist;
++  get_loop_worklist (ref_groups, num_issue_var, loop_worklist);
++  bool get_first_ref_group = false;
++  std::vector<int> ref_group_idx_prev;
++
++  /* NEXT STEP: multiple loop copies (possibly nested within one loop) can
++     become front-end bound due to branching within the loop; we need to
++     set up a threshold such that this time cost may be compensated by the
++     space cost in the binary (copying the outer loop).  */
++  tree cumul_size = build_real_from_int_cst (double_type_node,
++                                             integer_zero_node);
++  for (std::vector<class loop *>::iterator it = sorted_kernels.begin ();
++       it != sorted_kernels.end (); ++it)
++    {
++      /* Do not start runtime branching until the first ref_group's loop is
++         found.  Skip any ref_groups whose `first_use` mem_refs are executed
++         before the mem_ref of the first ref_group.  */
++      class loop *loop = *it;
++      if (!loop_worklist.count (loop)
++          || (!get_first_ref_group && loop_worklist[loop][0] != 0))
++        continue;
++
++      std::vector<int> ref_group_idx_curr = loop_worklist[loop];
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "copy loop num: %d\n", loop->num);
++        }
++      /* If the exit edge points to a bb with multiple predecessors, split
++         the exit edge and create a new bb, so that the exit edge points to
++         a bb with a single predecessor.  */
++      edge e = single_exit (loop);
++      if (e == NULL)
++        return;
++      if (!single_pred_p (e->dest))
++        {
++          split_loop_exit_edge (e, true);
++          if (dump_enabled_p ())
++            dump_printf (MSG_NOTE, "split exit edge\n");
++        }
++
++      /* After updating SSA, we are not sure whether the gimple_seq stmt list
++         is initialized and unchanged during iterations.  Therefore, we need to
++         recreate this stmt list for every loop copy.  */
++      gimple_seq cond_expr_stmt_list = NULL;
++      tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list,
++                                       loop->header, ref_group_idx_curr,
++                                       ref_group_idx_prev, cumul_size);
++      if (cond_expr == NULL_TREE)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "incalculable variables for conditional\n");
++          return;
++        }
++
++      /* Use the previous cond and generate a new branch and loop copy.  */
++      basic_block condition_bb = NULL;
++      profile_probability prob = profile_probability::likely ();
++      initialize_original_copy_tables ();
++      class loop *nloop = loop_version (loop, cond_expr, &condition_bb,
++                                        prob, prob.invert (), prob,
++                                        prob.invert (), true);
++      free_original_copy_tables ();
++
++      /* Insert the generated stmt list before cond_expr.  */
++      gimple_stmt_iterator cond_exp_gsi;
++      if (cond_expr_stmt_list)
++        {
++          /* `gsi_insert_seq_before` inserts `cond_expr_stmt_list` ahead of
++             `cond_expr`, the GIMPLE_COND at the end of `condition_bb`.  */
++          cond_exp_gsi = gsi_last_bb (condition_bb);
++          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++                                 GSI_SAME_STMT);
++        }
++    }
++
++  update_ssa (TODO_update_ssa);
++
++  /* Perform hint issue for branches that meet the conditions.  */
++  static_issue (ref_groups, num_issue_var);
++}
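Schematically, the transformation above turns each selected kernel into a runtime-dispatched pair of loops. The pseudo-C below is illustrative only: the actual condition is the gimple sequence produced by calc_stmts_gen, and the hints are attached afterwards by static_issue.

    /* Before:                 After loop_version in runtime_issue:
     *
     *   kernel_loop;          if (total_size_mb <= llc_capacity / 2)
     *                           kernel_loop;   // llc-hinted version
     *                         else
     *                           kernel_loop;   // plain copy
     */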
++
++/* Issue llc hints through prefetch instructions.  */
++
++void
++issue_llc_hint (std::vector<ref_group> &ref_groups,
++                std::vector<class loop *> &sorted_kernels)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "issue_llc_hint:\n");
++
++  /* 1) If the issue-topn and force-issue options are given, the top N vars
++        are forcibly allocated and no runtime branch is generated.
++     2) If the issue-topn option is given and the size of the top N vars is
++        statically known, the top N are statically allocated and no runtime
++        branch is generated.
++     3) If the issue-topn option is given and the size of the top N vars is
++        statically unknown but dynamically known, the top N are dynamically
++        allocated and runtime branches are generated (this also depends on
++        the screening of the innermost variable boundary type).
++     4) If even the dynamic runtime cannot know the size, e.g., for an
++        indirect access, the optimization is skipped.
++   */
++  int num_issue_var = std::min (param_issue_topn, (int) ref_groups.size ());
++  if (num_issue_var == 0)
++    return;
++
++  if (num_issue_var < param_issue_topn
++      && dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "WARNING: Only %d (less than param_issue_topn = %d) "
++               "ref_group(s) are found for llc hint.\n",
++               num_issue_var, param_issue_topn);
++    }
++  if (param_force_issue)
++    {
++      static_issue (ref_groups, num_issue_var);
++      return;
++    }
++  calc_type topn_calc_type = STATIC_CALC;
++  for (int i = 0; i < num_issue_var; ++i)
++    topn_calc_type = std::min (topn_calc_type, ref_groups[i].calc_by);
++
++  if (topn_calc_type == STATIC_CALC)
++    {
++      /* Before static issue, we still need to collect the data size of all
++         target variables and compare the sum with the LLC cache size.  */
++      double prefetch_data_size = 0.;
++      for (int i = 0; i < num_issue_var; ++i)
++        prefetch_data_size += ref_groups[i].var_size;
++
++      if (prefetch_data_size <= (double) param_llc_capacity_per_core
++                                * PREFETCH_CACHE_SIZE_RATIO)
++        static_issue (ref_groups, num_issue_var);
++      else
++        if (dump_file && (dump_flags & TDF_DETAILS))
++          fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache "
++                   "size: %lf > %lf.\n",
++                   prefetch_data_size,
++                   (double) param_llc_capacity_per_core
++                   * PREFETCH_CACHE_SIZE_RATIO);
++    }
++  else if (topn_calc_type == RUNTIME_CALC)
++    runtime_issue (ref_groups, num_issue_var, sorted_kernels);
++  else
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "unhandled issue scene\n");
++    }
++}
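The std::min reduction above treats calc_by as a "weakest link": the strategy chosen for the top-N groups is the one every group can still support. A standalone model follows; the numeric ordering of the enum values is an assumption (only RUNTIME_CALC and STATIC_CALC appear in the patch, UNHANDLED_CALC is invented for the sketch).

    /* Smaller calc_type value = less statically known about the size.  */
    enum calc_type { UNHANDLED_CALC = 0, RUNTIME_CALC = 1, STATIC_CALC = 2 };

    static calc_type
    weakest_calc_type (const calc_type *calc_by, int n)
    {
      calc_type t = STATIC_CALC;
      for (int i = 0; i < n; ++i)
        if (calc_by[i] < t)
          t = calc_by[i];
      return t;
    }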
++
++/* ==================== phase entry ==================== */
++
++/* The LLC intelligent allocation consists of 6 steps:
++   1) detect memory-access-dense kernels;
++   2) trace the data references in those kernels;
++   3) analyze nested kernels;
++   4) retrace the data references left unresolved;
++   5) filter and sort the kernels (statically or by feedback);
++   6) record and sort the ref_groups, then issue the llc hints.  */
++
++void
++llc_allocate (void)
++{
++  std::map<class loop *, std::vector<data_ref> > kernels_refs;
++  std::vector<class loop *> kernels;
++  if (!get_dense_memory_kernels (kernels, kernels_refs))
++    return;
++
++  std::set<gimple *> traced_ref_stmt;
++  std::vector<data_ref> unresolved_refs;
++  trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt,
++                        unresolved_refs);
++
++  if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt,
++                               unresolved_refs))
++    return;
++
++  retrace_loop_refs_info_unresolved (unresolved_refs, traced_ref_stmt);
++
++  std::vector<class loop *> sorted_kernels;
++  std::vector<ref_group> ref_groups;
++  if (param_filter_mode)
++    {
++      /* AutoFDO mode: include the ENTRY bb and EXIT bb indices.  */
++      std::set<int> bb_pathset;
++      bb_pathset.insert (0);
++      bb_pathset.insert (1);
++      if (!filter_and_sort_kernels_feedback (sorted_kernels, bb_pathset))
++        return;
++
++      if (!record_and_sort_ref_groups (ref_groups, kernels, kernels_refs,
++                                       bb_pathset))
++        return;
++    }
++  else
++    {
++      /* Static mode.  */
++      std::set<int> bb_pathset;
++      if (!filter_and_sort_kernels (sorted_kernels, kernels))
++        return;
++
++      if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs,
++                                       bb_pathset))
++        return;
++    }
++
++  issue_llc_hint (ref_groups, sorted_kernels);
++}
++
++/* Check whether the function is an operator overloading function.  */
++
++bool
++operator_func_p (function *fn)
++{
++  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
++
++  if (fn_name && strncmp (fn_name, "operator", 8) == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "operator_func: %s ", fn_name);
++
++      return true;
++    }
++  return false;
++}
++
++/* Check whether the function file location is known.  */
++
++bool
++func_location_p (function *fn)
++{
++  expanded_location fn_decl_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++  expanded_location fn_xloc
++    = expand_location (fn->function_start_locus);
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "fn->function_start_locus = %d \n",
++               fn->function_start_locus);
++      fprintf (dump_file, "fn_xloc.file = %s \n",
++               fn_xloc.file ? fn_xloc.file : "NULL");
++      fprintf (dump_file, "fn_decl_xloc.file = %s \n",
++               fn_decl_xloc.file ? fn_decl_xloc.file : "NULL");
++      fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n",
++               LOCATION_FILE (input_location) ? LOCATION_FILE (input_location)
++                                              : "NULL");
++    }
++  if (fn_decl_xloc.file == NULL)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Function location unknown, skip analysis \n");
++      return false;
++    }
++  /* Newly generated functions are filtered out, such as the constant
++     propagation clone func.constprop ().  */
++  if (LOCATION_FILE (input_location) != fn_decl_xloc.file)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Function location non-local, skip analysis \n");
++      return false;
++    }
++  return true;
++}
++
++/* Dump function information.  */
++
++void
++dump_function_info (function *fn)
++{
++  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl));
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "\nfn_name: %s\n", fn_name);
++      expanded_location cfun_xloc
++        = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++      if (cfun_xloc.line)
++        {
++          if (cfun_xloc.file)
++            fprintf (dump_file, "[%s:%d:%d]\n",
++                     cfun_xloc.file, cfun_xloc.line, cfun_xloc.column);
++        }
++      fprintf (dump_file, "\n");
++      flow_loops_dump (dump_file, NULL, 1);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Dump the parameters of the pass.  */
++
++void
++dump_param (void)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "LLC allocate parameters:\n");
++      fprintf (dump_file, "    block size: %d\n", param_l1_cache_line_size);
++      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
++               param_l1_cache_size * 1024 / param_l1_cache_line_size,
++               param_l1_cache_size);
++      fprintf (dump_file, "    L1 cache line size: %d\n",
++               param_l1_cache_line_size);
++      fprintf (dump_file, "    L2 cache size: %d kB\n", param_l2_cache_size);
++      fprintf (dump_file, "    min mem_access_ratio: %d \n",
++               param_mem_access_ratio);
++      fprintf (dump_file, "    min mem_access_num: %d \n",
++               param_mem_access_num);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Determine whether to analyze the function according to
++   the ordering of functions containing cycle counts.  */
++
++static bool
++should_analyze_func_p (void)
++{
++  gcov_type decl_uid = DECL_UID (current_function_decl);
++  gcov_type func_count = event_get_func_count (decl_uid, PMU_EVENT);
++  if (func_count == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "function uid %ld cannot find profile data "
++                              "and skip prefetch analysis\n",
++                   decl_uid);
++        }
++      return false;
++    }
++  if (func_count < event_get_topn_function_total_count_thres ())
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "function uid %ld total counts is %lu: "
++                              "counts %lu < perf's top %d threshold %lu, "
++                              "skip prefetch analysis\n",
++                   decl_uid, func_count, func_count,
++                   PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ());
++        }
++      return false;
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "function uid %ld total counts is %lu: "
++                          "counts %lu >= perf's top %d threshold %lu, "
++                          "continue prefetch analysis\n",
++               decl_uid, func_count, func_count,
++               PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ());
++    }
++  return true;
++}
++
++const pass_data pass_data_llc_allocate =
++{
++  GIMPLE_PASS,            /* type.  */
++  "llc_allocate",         /* name.  */
++  OPTGROUP_LOOP,          /* optinfo_flags.  */
++  TV_TREE_PREFETCH,       /* tv_id.  */
++  (PROP_cfg | PROP_ssa),  /* properties_required.  */
++  0,                      /* properties_provided.  */
++  0,                      /* properties_destroyed.  */
++  0,                      /* todo_flags_start.  */
++  0,                      /* todo_flags_finish.  */
++};
++
++class pass_llc_allocate : public gimple_opt_pass
++{
++public:
++  pass_llc_allocate (gcc::context *ctxt)
++    : gimple_opt_pass (pass_data_llc_allocate, ctxt)
++  {}
++
++  /* opt_pass methods.  */
++  virtual bool gate (function *)
++  {
++    return (optimize >= 2 && flag_llc_allocate > 0);
++  }
++  virtual unsigned int execute (function *);
++
++}; // class pass_llc_allocate
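Given the gate just above, the pass is active only at -O2 or higher together with the new flag. An illustrative invocation follows; the option spelling -fllc-allocate is inferred from flag_llc_allocate and this patch's common.opt changes, and the dump switch follows the usual -fdump-tree-<passname>-details convention for the pass name "llc_allocate":

    gcc -O2 -fllc-allocate -fdump-tree-llc_allocate-details test.c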
++
++unsigned int
++pass_llc_allocate::execute (function *fn)
++{
++  unsigned int ret = 0;
++
++  if (!targetm.have_prefetch ()
++      || targetm.vectorize.code_for_prefetch == NULL
++      || targetm.vectorize.prefetch_handleable_mode_p == NULL
++      || targetm.vectorize.code_for_gather_prefetch == NULL)
++    return 0;
++
++  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
++    {
++      tree type = build_function_type_list (void_type_node,
++                                            const_ptr_type_node, NULL_TREE);
++      tree decl = add_builtin_function ("__builtin_prefetch", type,
++                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
++                                        NULL, NULL_TREE);
++      DECL_IS_NOVOPS (decl) = true;
++      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
++    }
++  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_FULL))
++    {
++      tree type = build_function_type_list (void_type_node,
++                                            const_ptr_type_node, NULL_TREE);
++      tree decl = add_builtin_function ("__builtin_prefetch_full", type,
++                                        BUILT_IN_PREFETCH_FULL, BUILT_IN_NORMAL,
++                                        NULL, NULL_TREE);
++      DECL_IS_NOVOPS (decl) = true;
++      set_builtin_decl (BUILT_IN_PREFETCH_FULL, decl, false);
++    }
++
++  dump_param ();
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "llc_allocate: %s\n",
++             IDENTIFIER_POINTER (DECL_NAME (fn->decl)));
++
++  if (number_of_loops (fn) <= 1 || !func_location_p (fn)
++      || operator_func_p (fn))
++    return ret;
++
++  /* Filter only when combined with a PMU event.  If the
++     should_analyze_func_p analysis fails (for example, the function has
++     no PMU-event count), skip the function rather than falling back to
++     native allocation processing, so as to keep the LLC allocation
++     analysis accurate.  */
++  if (flag_additional_profile
++      && (!profile_exist (PMU_EVENT) || !should_analyze_func_p ()))
++    {
++      return 0;
++    }
++
++  dump_function_info (fn);
++
++  llc_allocate ();
++
++  return ret;
++}
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_llc_allocate (gcc::context *ctxt)
++{
++  return new pass_llc_allocate (ctxt);
++}
+diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
+index 0353ffd30..0492dc6fd 100644
+--- a/gcc/tree-ssa-loop-niter.cc
++++ b/gcc/tree-ssa-loop-niter.cc
+@@ -2489,6 +2489,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit)
+   return true;
+ }
+ 
++/* Returns whether the number of vectorized iterations for the loop can be
++   estimated from the given IR, and updates the corresponding loop attribute,
++   e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... });  */
++
++bool
++number_of_iterations_vect (class loop *loop, tree lhs, tree rhs)
++{
++  loop->vec_nb_iterations = chrec_dont_know;
++
++  if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME)
++      || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME))
++    return false;
++
++  tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs;
++  gimple *def_stmt = SSA_NAME_DEF_STMT (ssa);
++
++  if (gimple_code (def_stmt) != GIMPLE_CALL
++      || !gimple_call_internal_p (def_stmt))
++    return false;
++
++  internal_fn ifn = gimple_call_internal_fn (def_stmt);
++  if (ifn != IFN_WHILE_ULT)
++    return false;
++
++  gcall *call = dyn_cast <gcall *> (def_stmt);
++  tree niters = gimple_call_arg (call, 1);
++  loop->vec_nb_iterations = niters;
++
++  return true;
++}
++
+ /* Stores description of number of iterations of LOOP derived from
+    EXIT (an exit edge of the LOOP) in NITER.  Returns true if some useful
+    information could be derived (and fields of NITER have meaning described
+@@ -2559,6 +2590,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
+   op1 = gimple_cond_rhs (stmt);
+   type = TREE_TYPE (op0);
+ 
++  if (TREE_CODE (type) == VECTOR_TYPE)
++    number_of_iterations_vect (loop, op0, op1);
++
+   if (TREE_CODE (type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (type))
+     return false;
+@@ -2852,14 +2886,14 @@ bool
+ number_of_iterations_exit (class loop *loop, edge exit,
+                            class tree_niter_desc *niter,
+                            bool warn, bool every_iteration,
+-                           basic_block *body)
++                           basic_block *body, bool guarantee)
+ {
+   gcond *stmt;
+   if (!number_of_iterations_exit_assumptions (loop, exit, niter,
+                                               &stmt, every_iteration, body))
+     return false;
+ 
+-  if (integer_nonzerop (niter->assumptions))
++  if (integer_nonzerop (niter->assumptions) || guarantee == false)
+     return true;
+ 
+   if (warn && dump_enabled_p ())
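For orientation, the IR shape that number_of_iterations_vect matches in a fully-masked vector loop looks roughly like this (illustrative GIMPLE; the .WHILE_ULT line is taken from the comment above, the exit test is schematic):

    next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... });
    ...
    if (next_mask_114 != { 0, ... })   /* vector-typed exit condition */

Starting from the vector-typed exit condition, the function requires exactly one of the two operands to be an SSA name, follows its definition to the IFN_WHILE_ULT call, and records the call's second argument (the scalar niters) in loop->vec_nb_iterations.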
+diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
+index ceaf65e07..8f03458f7 100644
+--- a/gcc/tree-ssa-loop-niter.h
++++ b/gcc/tree-ssa-loop-niter.h
+@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body,
+ extern bool number_of_iterations_exit (class loop *, edge,
+                                        class tree_niter_desc *niter, bool,
+                                        bool every_iteration = true,
+-                                       basic_block * = NULL);
++                                       basic_block * = NULL,
++                                       bool guarantee = true);
+ extern bool number_of_iterations_exit_assumptions (class loop *, edge,
+                                                    class tree_niter_desc *,
+                                                    gcond **, bool = true,
+diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
+index 9d21e6d03..6e61f7140 100644
+--- a/gcc/tree-vect-loop-manip.cc
++++ b/gcc/tree-vect-loop-manip.cc
+@@ -3738,3 +3738,269 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
+ 
+   return nloop;
+ }
++
++class loop *
++vect_loop_versioning_2 (loop_vec_info loop_vinfo,
++                        gimple *loop_vectorized_call)
++{
++  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
++  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
++  basic_block condition_bb;
++  gphi_iterator gsi;
++  gimple_stmt_iterator cond_exp_gsi;
++  basic_block merge_bb;
++  basic_block new_exit_bb;
++  edge new_exit_e, e;
++  gphi *orig_phi, *new_phi;
++  tree cond_expr = NULL_TREE;
++  gimple_seq cond_expr_stmt_list = NULL;
++  tree arg;
++  profile_probability prob = profile_probability::likely ();
++  gimple_seq gimplify_stmt_list = NULL;
++  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
++  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
++  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
++  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
++  poly_uint64 versioning_threshold
++    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
++  tree version_simd_if_cond
++    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
++  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
++
++  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
++      && !ordered_p (th, versioning_threshold))
++    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
++                             build_int_cst (TREE_TYPE (scalar_loop_iters),
++                                            th - 1));
++  if (maybe_ne (versioning_threshold, 0U))
++    {
++      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
++                               build_int_cst (TREE_TYPE (scalar_loop_iters),
++                                              versioning_threshold - 1));
++      if (cond_expr)
++        cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
++                                 expr, cond_expr);
++      else
++        cond_expr = expr;
++    }
++
++  if (version_niter)
++    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
++
++  if (cond_expr)
++    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
++                                        &cond_expr_stmt_list,
++                                        is_gimple_condexpr, NULL_TREE);
++
++  if (version_align)
++    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
++                                       &cond_expr_stmt_list);
++
++  if (version_alias)
++    {
++      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
++      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
++      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
++    }
++
++  if (version_simd_if_cond)
++    {
++      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
++      if (flag_checking)
++        if (basic_block bb
++            = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
++          gcc_assert (bb != loop->header
++                      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
++                      && (scalar_loop == NULL
++                          || (bb != scalar_loop->header
++                              && dominated_by_p (CDI_DOMINATORS,
++                                                 scalar_loop->header, bb))));
++      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
++      tree c = fold_build2 (NE_EXPR, boolean_type_node,
++                            version_simd_if_cond, zero);
++      if (cond_expr)
++        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
++                                 c, cond_expr);
++      else
++        cond_expr = c;
++      if (dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "created versioning for simd if condition check.\n");
++    }
++
++  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
++                                      &gimplify_stmt_list,
++                                      is_gimple_condexpr, NULL_TREE);
++  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
++
++  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
++     invariant in.  */
++  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
++  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
++       !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      gimple *stmt = gsi_stmt (gsi);
++      update_stmt (stmt);
++      ssa_op_iter iter;
++      use_operand_p use_p;
++      basic_block def_bb;
++      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
++        if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
++            && flow_bb_inside_loop_p (outermost, def_bb))
++          outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
++    }
++
++  /* Search for the outermost loop we can version.  Avoid versioning of
++     non-perfect nests but allow if-conversion versioned loops inside.  */
++  class loop *loop_to_version = loop;
++  if (flow_loop_nested_p (outermost, loop))
++    {
++      if (dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "trying to apply versioning to outer loop %d\n",
++                         outermost->num);
++      if (outermost->num == 0)
++        outermost = superloop_at_depth (loop, 1);
++      /* And avoid applying versioning on non-perfect nests.  */
++      while (loop_to_version != outermost
++             && single_exit (loop_outer (loop_to_version))
++             && (!loop_outer (loop_to_version)->inner->next
++                 || vect_loop_vectorized_call (loop_to_version))
++             && (!loop_outer (loop_to_version)->inner->next
++                 || !loop_outer (loop_to_version)->inner->next->next))
++        loop_to_version = loop_outer (loop_to_version);
++    }
++
++  /* Apply versioning.  If there is already a scalar version created by
++     if-conversion re-use that.  Note we cannot re-use the copy of
++     an if-converted outer-loop when vectorizing the inner loop only.  */
++  gcond *cond;
++  if ((!loop_to_version->inner || loop == loop_to_version)
++      && loop_vectorized_call)
++    {
++      gcc_assert (scalar_loop);
++      condition_bb = gimple_bb (loop_vectorized_call);
++      cond = as_a <gcond *> (last_stmt (condition_bb));
++      gimple_cond_set_condition_from_tree (cond, cond_expr);
++      update_stmt (cond);
++
++      if (cond_expr_stmt_list)
++        {
++          cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
++          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++                                 GSI_SAME_STMT);
++        }
++
++      /* if-conversion uses profile_probability::always () for both paths,
++         reset the paths probabilities appropriately.  */
++      edge te, fe;
++      extract_true_false_edges_from_block (condition_bb, &te, &fe);
++      te->probability = prob;
++      fe->probability = prob.invert ();
++      /* We can scale loops counts immediately but have to postpone
++         scaling the scalar loop because we re-use it during peeling.  */
++      scale_loop_frequencies (loop_to_version, te->probability);
++      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
++
++      nloop = scalar_loop;
++      if (dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "reusing %sloop version created by if conversion\n",
++                         loop_to_version != loop ? "outer " : "");
++    }
++  else
++    {
++      if (loop_to_version != loop
++          && dump_enabled_p ())
++        dump_printf_loc (MSG_NOTE, vect_location,
++                         "applying loop versioning to outer loop %d\n",
++                         loop_to_version->num);
++
++      initialize_original_copy_tables ();
++      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
++                            prob, prob.invert (), prob, prob.invert (), true);
++      gcc_assert (nloop);
++      nloop = get_loop_copy (loop);
++
++      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
++         reap those otherwise; they also refer to the original
++         loops.  */
++      class loop *l = loop;
++      while (gimple *call = vect_loop_vectorized_call (l))
++        {
++          call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
++          fold_loop_internal_call (call, boolean_false_node);
++          l = loop_outer (l);
++        }
++      free_original_copy_tables ();
++
++      if (cond_expr_stmt_list)
++        {
++          cond_exp_gsi = gsi_last_bb (condition_bb);
++          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
++                                 GSI_SAME_STMT);
++        }
++
++      /* Loop versioning violates an assumption we try to maintain during
++         vectorization - that the loop exit block has a single predecessor.
++         After versioning, the exit block of both loop versions is the same
++         basic block (i.e. it has two predecessors).  Just in order to simplify
++         following transformations in the vectorizer, we fix this situation
++         here by adding a new (empty) block on the exit-edge of the loop,
++         with the proper loop-exit phis to maintain loop-closed-form.
++         If loop versioning wasn't done from loop, but scalar_loop instead,
++         merge_bb will have already just a single successor.  */
++
++      merge_bb = single_exit (loop_to_version)->dest;
++      if (EDGE_COUNT (merge_bb->preds) >= 2)
++        {
++          gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
++          new_exit_bb = split_edge (single_exit (loop_to_version));
++          new_exit_e = single_exit (loop_to_version);
++          e = EDGE_SUCC (new_exit_bb, 0);
++
++          for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
++               gsi_next (&gsi))
++            {
++              tree new_res;
++              orig_phi = gsi.phi ();
++              new_res = copy_ssa_name (PHI_RESULT (orig_phi));
++              new_phi = create_phi_node (new_res, new_exit_bb);
++              arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
++              add_phi_arg (new_phi, arg, new_exit_e,
++                           gimple_phi_arg_location_from_edge (orig_phi, e));
++              adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
++            }
++        }
++
++      update_ssa (TODO_update_ssa);
++    }
++
++  if (version_niter)
++    {
++      /* The versioned loop could be infinite, we need to clear existing
++         niter information which is copied from the original loop.  */
++      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
++      vect_free_loop_info_assumptions (nloop);
++      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
++      loop_constraint_set (loop, LOOP_C_INFINITE);
++    }
++
++  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
++      && dump_enabled_p ())
++    {
++      if (version_alias)
++        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
++                         vect_location,
++                         "loop versioned for vectorization because of "
++                         "possible aliasing\n");
++      if (version_align)
++        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
++                         vect_location,
++                         "loop versioned for vectorization to enhance "
++                         "alignment\n");
++
++    }
++
++  return nloop;
++}
+diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
+index 7f7577951..023a83c38 100644
+--- a/gcc/tree-vect-loop.cc
++++ b/gcc/tree-vect-loop.cc
+@@ -9735,8 +9735,11 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
+ 
+   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+     {
+-      class loop *sloop
+-        = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
++      class loop *sloop;
++      if (!(optimize >= 2 && flag_llc_allocate > 0))
++        sloop = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
++      else
++        sloop = vect_loop_versioning_2 (loop_vinfo, loop_vectorized_call);
+       sloop->force_vectorize = false;
+       check_profitability = false;
+     }
+@@ -9989,7 +9992,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
+                              niters_vector_mult_vf, !niters_no_overflow);
+ 
+   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
+-  scale_profile_for_vect_loop (loop, assumed_vf);
++  if (!(optimize >= 2 && flag_llc_allocate > 0))
++    scale_profile_for_vect_loop (loop, assumed_vf);
+ 
+   /* True if the final iteration might not handle a full vector's
+      worth of scalar iterations.  */
+diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+index e13bc6c99..85018f250 100644
+--- a/gcc/tree-vectorizer.h
++++ b/gcc/tree-vectorizer.h
+@@ -2177,6 +2177,7 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge);
+ class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *,
+                                                     class loop *, edge);
+ class loop *vect_loop_versioning (loop_vec_info, gimple *);
++class loop *vect_loop_versioning_2 (loop_vec_info, gimple *);
+ extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
+                                     tree *, tree *, tree *, int, bool, bool,
+                                     tree *);
+-- 
+2.44.0.windows.1
+
diff --git a/0357-Enhancing-BOLT-Optimization-with-AI.patch b/0357-Enhancing-BOLT-Optimization-with-AI.patch
new file mode 100644
index 0000000000000000000000000000000000000000..64f2239301274bb00c931403aa62cbd22717ec6e
--- /dev/null
+++ b/0357-Enhancing-BOLT-Optimization-with-AI.patch
@@ -0,0 +1,72 @@
+From 3dd233c1a7b20de2182ae4e98909ddace6612a0a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E9=99=88=E9=B8=BF?=
+Date: Tue, 25 Feb 2025 16:32:39 +0800
+Subject: [PATCH 2/2] Enhancing BOLT Optimization with AI.
+ +--- + gcc/ipa-hardware-detection.cc | 2 +- + gcc/onnx.fdata | 2 +- + gcc/opts.cc | 13 ++++++++++++- + 3 files changed, 14 insertions(+), 3 deletions(-) + +diff --git a/gcc/ipa-hardware-detection.cc b/gcc/ipa-hardware-detection.cc +index 75b74aa03..6b36d685c 100644 +--- a/gcc/ipa-hardware-detection.cc ++++ b/gcc/ipa-hardware-detection.cc +@@ -89,7 +89,7 @@ create_part_bb (basic_block last_bb, tree part_base) + &gsi, PLUS_EXPR, unsigned_type_node, part_base, + build_int_cst (unsigned_type_node, 4294963967)); + gcond *cond = gimple_build_cond (LE_EXPR, part_cond, +- build_int_cst (unsigned_type_node, 2), ++ build_int_cst (unsigned_type_node, 128), + NULL_TREE, NULL_TREE); + gimple_set_location (cond, input_location); + gsi_insert_before (&gsi, cond, GSI_SAME_STMT); +diff --git a/gcc/onnx.fdata b/gcc/onnx.fdata +index 234b1a045..77f4d9b1d 100644 +--- a/gcc/onnx.fdata ++++ b/gcc/onnx.fdata +@@ -1 +1 @@ +-316365613139376535626535626234666331363163303835336362393535613530636234643633626364386566396132333232373733633230393865663664633761393137633266616431663436343236613231663865636236346133616662623761373633663830623231393063616534633032316538626436633731643237666333386462313164333630303936336137323863313634613031393931613164363237643262353162376133643935373036306336346161376563383862613138666663393538363731333639396239666362393336373737643238636639643761343231346131333463353261623633343633343866663966663365346231356532663139306164303361383836396333393339616236383439363661313661303665643535633961666563613431303466333534346564633533373862323031396339626536613030383761623236663432633564653130353935353135313736656235373632373739343662663034343334633035626465356237633439313164313338373637383365326138366162363234323765393736616438656463343339613031316630643031613465386464326334383565343838366435313137313166383433396531626137353932616538333330653164326438656166343339363262366264326632376564396434396333356565343733383164363264633937356663663338666530336166316634623264393031393536333863383165616536656238346462656337333638323338646535303638363933646565616264363966356566323465346538613762623864303766646338666264643466666537303263623162326539653435643130313061386235623631306630636163303536343164663364383738353266386330376562343962393037306133383363326138393238376435613332353933663235313030326664366166373632343532613130323237303265373433623362623162633661633363303235613236383166313465396162353938363931613765316565313864313038cd68834331701041d1d21041f17c20432483a94386647e4157c8e33b5f3d5d3ec5275e3ea689863c435a0f3a76acd63d5d9b803b24467c3baf847c3b67b89e3b852a313b2127853900000000d58ac23b200ab53a000000807d3119bc22f7a63a81549f3b93b5013baee4a33b62c1153b9ae08b3a6929a33b20038f399475983b430ab53a73fc0b3a2daa0ebad595953bc2f1e0bb33e9ccbbb978d83a5e77a53b41e4c93adf10a73bdf36643ad7fd983a61e8d93bc04a283a30c072382f942c3b5b3cc73a4392e43a422b093c79bc61b9a5309e3b00000000757baa3a03d8a93c3c31e33af526ebbb000000006431d43a1d0ae73aa450783b8c57afb9b8eae939ec8fab3b9581d83920d7a1ba0fc1af38b6aece3ab50bafbbd50db63a26aba33bcdeda33b00d9493ac22dac3cf8c4233bc2966e3bdf1bca3a8fb4d13af9b0983b2cbda73bdae2aa3bc93bae3b39e1ba380857953be8e7a73b49e9df3b20b0233b9fe3d43a0dbcaa3bd10cf0b978eea53b761ebe3b0a50a23b70bd47b79a7720bc6cd4ee3ae0d0f93a9c333ebb5098dfbbbf8fa53b445efebac7b9993b6182b93aef267c3a4aa09e3b46d9a83b9f95983a379e913c6516123a1b2ebd3aaf943c3a0b90803becba92bce68f673be723253c5d7f813ad779613800000080af3c65ba6999743900000080957a003d82f2fe39baab4d3b7f348c39b8d3323b3c1e253ace952dbbc9d364bc3aafaf373d0a633be8fdee3968b0fa39eb70a83a7cba4e3bdf2407bc40f50f3
d94f4c3b9a828573b3f2bc3b99a5763bcccb838bb24f011bae3400dbdc3074fba30a829bb3dde6e3ad7c2caba2b2aa7b8d479a7bbebe2603a7025583b00000000017414ba680386bc9b365e3aaacb03bc000000006afd90b9a64e263980eb223c80a48ebcca9703392310573b1fd419bbf7368abc17a2083a3ceafab95eb11cbcf29995b9a64264bc8bae403bc1dc6139631c88bc12e3373c07cf0c3cdc93a6b97edbc0b917754d3b5cdc143c61ef393b40a809baf3861dbbafce623be550513b828382bc359d513afa4a25ba31394c3bb013da3a9835553bf3d9553bec2b65bcee09bab9f6343e3c03a59f39fb11053a078e7cbc5bd006bcfe23363b08d12cbb3cfb533bb98a8fbadcb99139cbd1573b24725e3b01014fb6dcbc45ba6ee024bb318db1baf39ce9b952d625bc41afddb91d7dffbbc0ba163b0387b93b2594623b00000000f60cf9ba483c983b0000008015e6c6bcbd45983b77d62ebcfbb69f3b7b5752bcc334ab3b4f9806bc9d89063cc0675a3b807426bca81a9f3b7ef56f3b6a96a13a045937bcd4a2f33cb92173bc40af783b26ac40bc5fef6b3beba6fe3b8c7207bc5e25443bfd99a33be7e7403b4c2508bc0c87bb3bb95dcd3abe228b3bac03deb91a2ab03add753bbc000000002e04703be98f1fbccef2af3b17ebe93c0000000020e37a3b46ba913b1fd7003b1f3f133df85d423bacc843bc5fada7bbc8680d3d8423503b2afc6c3b4e43033dcfcc7c3bcece053cdbb44ebc4151823ba14426bc6e942c3b3bdc4d3a34967f3b7687783bd0cd3ebcfc75053ade324ebcd10c32bc9ff9fbbb0b7430bcf60e4abcd6b6e03b295db43b25c75d3b88334fbc8d95883ac9c73ebcddf941bc2b18083c43044c3b405414bd7617963b9910a03bd5e70c3d9356f23c3a2750bc472107bce47d47bc0125243b3c41953b0f6134bc8c403bbc8fb3873ba5e218bcae5d06bc2dfe103b758a493b43cef63cd7438d3c2bf1eb3b2d4a833cf13a43bc5d14c4bd000000002932a7bc3191e4bb00000080224e753dea87dfbb41e28a3cbeb44b3d731d8f3c1312d2bb54e44dbc232b84bc74f9d9bd033bcb3cdda410bbeeeb47bdd7e44e3b3c3e21bb435712bdb3e6413c82e770393f20a53cc6642dbc325484bc410c4e3dcb49823dc262bd3c204a563d032393bb0887753c0cad943d3946abbbcb77b3bc9151c6ba860dc0bd00000080e5880d3a2f960ebd1bba99bcce3910bd0000008037acde3be98a983bd60b7c3c66ee27bd2431aab98b2b95bded06813bc17429bdf5a9e9bc4ff297bafad924bdc14d53bc901784bcad96073cd34989bc84580fbd1e276b3ca48e513c189796bbe15f8cbb39fa473cce9c693cbdd0843a4f07443dbf40c03c38a1893c3790ab3cb48c58bcc5e9863b684448bcb5c32abc0726a6ba1def9ebb57ce273d772b84bc1925c63d2e26d8bc24460cbcb0f807bc8dd0a5bc9ba312bd6ed5393c32e1f43cf3c58bbc8a5334bc8e0c53bbf78cb13c7805793c8d5800bbc4a5c2bcfc2c85bba79d3c3df00f493db55cb73cce71c43dc030f03cc953823c79c1f13d614db73d0000000074e98f3ca415183c000000801104803d9afea83ddf9ff93d835f9bbce8d8623cd67e093c453d143d7d8c90bc1434e23d24580b3e00711d3c729b903d81a0253c82e9b53cba65123ca564a23d7a53003c2c82ec3de139f93c58f58ebce101813ba5782d3d4e198e3dbaa40fbb58e2bc3bbf92943c98421e3df32c0c3cbc235ebcc2fe443c2789033e00000080b94ca73be81815bd1758e53c5df7053b00000080f9f63f3cc7a9893cb846823c65d2143c9bb50e3cced60c3e92fb983b583593bbbbfe263e390bdf3b696887bbd13e823c207890bc1cf0c23cd688163dd14e16bdd3cb813c95a6593c70d7083cd6c6e43b6d4d9b3c9455683c876e1f3e599ff83c4b377f3c2afd953cbeedd43ccbdb163d2d78fd3bcc84363c5c7fa63c22fedf3c3318e83d0ecdba3d0ea690bc462e9a3d0b11013cf19f503da4f8813db249c0bba7300f3c2d6c223dd1d7663b56b4c43d56e5f93c4799e43b0702a73d4e15ae3de8040e3cdfad72bc0ab6593d1fb7c9bb6f90b43dbfcab83b4cd802bbbd3c993be1a91c3c8f677cbaa83420bb0000008084bf263b6336adba00000000373bd13b521cffba733ac83bee8c9bba1306f73bbf5471ba8651773bc863ac3a6ed119bb926fc43b9368e5ba34f319bba9c8ebbaa74acb3b39169ebc812d573b4764beba5815ea3b5211caba956ec23a9e107d3b64dbc4ba674ac73be88107bb5354493b688c5cbaaf4571bab3d6b3bae566603b11b0b0ba6bd1d03b000000005b6cc1ba0720833c5210c7ba85cd97bc000000003c1fc4bab35dbbba30b6fe3b389ea2bc97eb8eba37bae43b697a293b87969abc5c9e04bb83acc2ba5f8fadbcf872c5bab03daf3ad509fe3b2f81f9ba4317863cd808bb3b0177f03b02dabdbab2efbdba3b03d83bc09f223c6030ec3b01
37c13b29f5663bf195ce3bc10eff3b18cda93b35486cba7dd7c8ba0d51003c34dc93ba891be33bb785ea3bbb75a73ae04f2abba21d8f3b9065c3ba8892bbbac37d96bc6a0c9dbc596cef3bce5a063bd64cec3be62fb0baaa5cbbba1acbd03b5cdfe13b0f37e9ba48bb653cc513733bc352a0bacba0ffba469ababdf17dae3c939271bd7af718be283c113ed15fbb3d00000080920e90badfa4d63d00000080df70d63c18f5c03dbbb677bde981e83d5a78d2bd0985093e663cffbd9a12803d3fc0b33d65b388bd50a0dd3d011acc3d2df0203ed04095bd5c9a8abd7294323dd404b33dabc5bdbd8042a33d93cb6f3d3b81f7bd4e2e823dfdba273d83ea863d7d0f3cbeaff2133e565a3d3d0d66ca3d035bea398af8073ec3b79abd00000000d078ad3dbe475c3c9267013e1db874bd000000809e1db03d23f9cb3d902b14be16ea52bdc41ac33db52abbbdf07981bed9442ebd4d94a83de0f7a93d745d41bd4cebb33d01f57c3da16adbbd8ae0b63d2d6c763cc1f40ebe098cbebddc59b83d256bb13d359fa9bdf886e5bd0dfcbbbdd68d0dbd726807be83579fbd05c9dcbdd8e8983ccd620d3edcf69f3dc980e8bdcb6c923e1cdcb3bdcc6ec1bde600823d030d993d9b3a1d3d420bcd3d754dde3df8132cbdfabb85bdce89c3bdefd054beafc6c8bd931e7c3d30fed83dbef795bd7fb1b3bd6d9eb73de344993cceb603bebd216b3de45c803d70a1953e1dfb62bea91d2a3ed284113f544008bfc50498be00000000d460e13b0d05b2be0000000066f28ebe9adb9ebe4ee6323efd01c7be63c7ac3e888dfabe44e5e23ec6f631bed07890beb0fe4a3e6217babee93daabecf501abfd1b9573e7bd53a3f805fe2be10a48cbecac09a3e52c67bbef48823be9a9dd83eb74340be75deeabd3fbf42be9d1c3c3f74e10abf923b05beaec4a4be66229fbbb35ff6be8a146c3e00000080638d84be04cb14be4dcfe7be53ec253f00000000208c89be7039a8bef1170c3f70ac0d3fdcd59bbe68f4973e6b1f903f16a3eb3e6d0a86be361a81be4cba023f82c88cbe01132ebea3feb73e79f391beeb6826be5130063fc4d9953e4a6690becd328abe13b7803efa6fc23e9823993efea5b83d60e0f43eff276c3ed453ba3e374b63bdd41502bf17e36dbe5034c53ea2ac9dbfff04903e64879e3e4c9d35bede8b6cbe435ed2be4e73abbe3020bfbef8c6e83e8630343ff9de9e3e78d65c3f3659a63eee6f35bed729b4be3954623ef8778f3ef4758fbe75fd4ebed9e3ed3e770a23be6b403ebed7e596bd656a093d6e463cbddfa103be2213f73d8a4c973d0000000050dc0bbb9d64b03d00000080d5e9283d3e429c3d1d0b3ebd35e5c03d101cacbd51b0e83dd890d5bd1378483dcc6a903d431d54bda0cab63d6376a63df1ab0a3e1a586bbd6bcfdbbd5cfb8d3db01a903dced598bdc300823d43fc3a3db75ccebdfea14a3d8f27fe3cf98c533de7d924bef762fc3df248103dde32a53d8d067e3a17bee53d5b6573bd00000080285b8b3d5cd0283d3447d93dd249c3bd00000000317f8d3db47da63df06ffebd933ca7bd3da29e3d0f9496bd7f1c6dbe2feb8abdd761863d8d3f883dc33d9abdecee903dd3ed453dc93cb4bd785f933d08c8393d5eb3f4bdef249bbd4ffb943d98be8e3d7b5487bd365bbebd103397bd19bdd2bcbc53e4bd9eb77cbd7d6fb5bdcc15713c4f00f03d54fc7e3d1c98c0bd05dd853e0dea8fbd5a2d9cbd35aa4b3d879a713df19b783d214da73d821bb73dc52889bdb707d4bdfd3d9ebdb6e53cbe1ee9a2bdc740443d3594b23de3046bbd4acb8fbdbf54943d23de5e3dbd36ddbddae5363d5155473de961aa3ed3a432be5d8e583e1d9a173f318110bfbc47aabe00000000a572363c9755c8be00000000c5235abe14fbb1be3f59573e57f8dbbe0adcc13e40d705bf1517f43eb1d467be07b3a2be40216f3eb6d9cfbe84b1bbbeba571cbf5c30843e96700c3f700eb6beee52a3bedcd6ab3eb39b93be9fd159be7524eb3efdd668beee2a1abe9dd171be0acf3d3f227e12bfbb312dbe3966bbbee3f99bbcec2f01bf2790883e0000008047ae9dbe59a438be26a4f9bedaf7f93e000000004b62a0bea849bdbe00ac123fa905d63e3e96b2bed43da93eef558d3f0c73b23e65bf96be67359abe8cc9c53e372ba4be184265becf66cb3edd11a7bed6d249be506f0d3f7429af3e0ba1a8bed2aca1becacc973e154ad73e9df5a93e53dd003e4cbd023fedc88d3e01e5cc3eff3eb3bd194d0abfa27090beb4a9d93ed0bf9dbf7c95a13e7bb5af3eb12a6bbeace488beb5419fbe86c8bebedea8d1beb735b03e6889073fb9f1b13e34835e3f7d7bb73ef8e961be08c5cabe6ff4833e956fa13e33e0a7befd926ebe6009fe3e8bc653be843b65be6a18a0bdd534f93c01fc52bd6b6f09be77e3fd3dc1c59f3d00000000e97d92bbf664b83d00000000972f193da896a43d65e754bd5e81c83d269cb
4bd1d84ef3d1f11ddbd55385e3d020b993d4fdf69bd12a5be3ddd95ae3d62cb0d3e7ff87fbd7418c7bdf8ca803d1ed5983d13dea1bd480c8b3df3be503d2202d6bd540d5e3dc93e133d530d673d1b012bbe8f7c013eb79d243d9072ad3d772a273acfa6ec3d0ce383bd00000000fd56943de9119e3c0f64e03dd0cdb0bd00000080274f963deba6ae3d1c0a05be683297bd1f18a73d49b19fbdc64270be58657bbd263a8f3d374f913d63948bbda1b0993ddab55b3d7485bcbd00fc9b3df3edb03c892c00be9350a6bd25af9d3d968b973d79fe90bdeca4c9bdba46a0bddb1004bdbc84ebbde25788bd3ab0bdbda690963ccab6f63d60cc883d918ec8bd51ba873e9a3e99bdd01da5bde567613d50fd813d6981613d7163af3ddbdcbe3d473578bd8c07c0bd6924a7bd21c53fbe8ca6abbdaaf9573d158dba3d86c37fbd572399bd2a109d3d6994dc3c1e95e4bd4e1d4b3d7eca5a3d032771bdc8f5403d4b1a09bde53ceabdbfd4db3de573753d000000004909b9ba32b88f3d00000000c1c16c3d8b42803d994910bd8ba8a03dac6f8bbd482cca3db803b7bd3e900f3dbd44693d09b223bd2e39963db96c893d58fbf83dd2092ebd01b411be3149c43d891f633dd5b879bd93444b3d60e4033dfcc3aebd6c231b3de078bd3c57241d3dc16d17bed815e03d92dad63cc208853d82c1823a3acdc63d6d833ebd00000000ad06563de91bf63ca912bb3d854001be00000000941b5e3d5cd1873d744ce1bd91fce3bd57a17b3d804275bd64d465be11f8bdbdae6a583d0b73503dfc7bd1bd7155633d3e6a0c3db57194bd8eae6b3d6798093d58a9d7bdc3db71bdbd2b693d602c5f3d20c14fbd73e59cbdd53e77bdf28694bc1c71c5bd7f773ebd704a96bdd115383cf8ebd13df406403d27019fbde07f823eef7668bd4fcf7fbd6184123d9a113f3ddd37ad3dfe6a8a3dda459a3d3a30bbbdafb00cbe663c80bd0e3832be203886bdf262123d1972913df4a036bd978267bd99a5673d02f7283d6ae8bfbdd981033d3e82193df70172bdf1b33b3d2c8309bdf837ebbd4944dc3d875a763d000000001630b3baef2e903d000000008d6c663da8b4803d58ca10bd9221a13d74e08bbd7fa6ca3d338bb7bd700e103d71236a3dc13924bd00b2963d3de8893d2c6cf93d4ab12ebd894d11be9e58c13d81f0633dae8e7abd69054c3d5b51043d5d42afbdc1b21b3db199bd3cdab61d3d61aa17be7e84e03d2c57d73c907b853d18e27a3ac747c73d073e3fbd0000000013d0563d4ba4ec3cf98ebb3db3d400be0000000068ea5e3df445883d2b22e2bded78e2bd97817c3d232376bdb02f66becefebbbd053d593d6b38513d5d50cfbdab27643d1ce40c3d1ce994bdc0866c3dc167043d7161d8bd9abb72bd8c016a3ddcfa5f3d6e8650bdf66e9dbda51c78bd8b6494bc14dcc5bd07203fbd87b396bd8478363cab61d23dd8be403d137e9fbda19d833edb4969bdf35780bd9506133d4ac83f3d4157a93d2be18a3d44c19a3db553b9bd422c0cbe29ae80bd8b1f32be2ab086bd7fe6123d5ce9913d1b4c37bd253c68bdf37a683dda1d253d1866c0bd1fef033d630f1a3d55a78ebd9363e43c877813bdf06300be01e5f03d82788b3d00000000c3734fba035aa53d00000000e511873d7b68913d4e6e1cbd7718b73d813fa9bdcc68e13d5460cdbdbca5213daa5f843dade935bd383bac3d31919b3d2bf8073e69c848bd95ef2cbe5d65d83dba17833d49ee93bddd1f693de61c123dab7ac5bdd1dd2d3d5cf9643c6e50333d290923bebe38f63d2111e33cba73993d0a4b1c3aff26de3de00c5abd0000008092d2793d9b92fc3c1457d13de3451bbe00000080993e803dcd659b3d9082f7bd691808bef0f6913d51b390bdbab66cbe5acce9bda06e743dd62e733d250efbbd92ab833d46cc1d3d0c53b4bdea20873d8341113dbc72edbda26d8cbd369a873d164c813d395d75bdbe4ab3bd341392bd2d3f81bc05cadcbd939c5ebdc34eb6bd0175de3b5116e93d1b04603d00c2c2bda59a863e93b488bd09f997bda483253d75f4573dfd1ad83d23ed9c3d4bbfad3dc5f9e5bdf05c27be036399bd72e93bbe8204a0bd71d8243d7988a73d352350bdb95188bdcbd0863df73d3e3d0cbed5bd59aa133d40402b3d52f370bd85b63b3d0fd708bd723aeabdcdb1db3d87df743d000000806cedb3bad56b8f3d00000000d46d663d35e87f3d67e50fbd8859a03da22e8bbd43f6c93d37deb6bd811c0f3daeb1683d765223bdb9ef953d1c24893da7e7f83d15a22dbdd3ac11bee609c13d9f83623d703979bd21b74a3d7b7b033def92aebd70b81a3d730bbd3c5cbd1c3d0ecd17bef7f0df3d8140d63cd8b9843d1d757e3ac997c63d11023ebd00000000937d553d0e57ef3c0edaba3dc82f01be000000807c885d3ded82873d247ae1bd3436e3bd170e7b3d5ab774bd32eb68bef817bdbd6edc573ddbec4f3d4721d1bde3c1623d
5ffa0b3df23994bd8a176b3d63e0053deec1d7bd7a5971bd5899683d57945e3d72434fbd92a59cbd4c9e76bde50b94bcc364c5bdab1a3ebd841a96bd88b5363cb8bbd13d548a3f3dd1e69ebd72ce833edbe867bd6f517fbda50d123dc88a3e3d8f2ba93dd51b8a3d83fc993d10a3babddc870cbe05e47fbdc04732be17f885bdfdfe113d7b27913dd82436bd100667bddb16673d808c263dbfbcbfbdfa2c033d6118193dc7b077bde7d4433ddcae0dbdc3a7efbd20dfe03dbb5f7c3d00000000603dbdbab68b933d00000080e852703d98cf833d651615bd4fbfa43dce3a8fbdd0edce3d2297bbbdfe4c143dbdf46f3de7d728bd892b9a3d092a8d3d635efe3dce9033bdcff113be186ac83d49a3693d124780bd8e56513d255a083d402eb3bde733203df262c43c6343223d04aa1abe2d29e53d0392de3cb1a9883d8ea7833ab384cb3d0e5744bd00000000d2555c3d8ed7f63caaa1bf3d462903be000000801f8e643d88808b3d81a9e6bd24a3e8bd4854813dee347cbdeafb6bbe1474c1bd62c75e3d92a6563d5559d5bda6e2693dbb14113dd06d98bd8065723d3e040a3d071cddbdc1d478bd83d76f3d8d9e653d47f855bde307a1bdaf257ebdce119abce91fcabd9f3a44bdee319abd37ff3e3ccecad63d6be6453d101fa3bd177f853e88256fbdab7483bd6e52173d12e6443da4ecaf3d32268e3d324b9e3d0603bfbd44ea0ebe44cb83bdad6235be57ee89bd933e173da14e953dd84a3cbd7c026ebd384d6e3d9d5f2b3de293c4bd3104083dbe891e3d97b16fbd8d903a3dc1e207bd7219eabdc854db3df7f4733d00000000e850b1ba61f98e3d000000005023653d9b007f3dc20d0fbd7beb9f3dfab88abdd293c93d1e74b6bdb4490e3d3bc7673d755122bd3e7e953d89af883dc493f83d8dbc2cbdcd0711be5457c03d389b613d165078bd9bcf493d41ae023d0226aebd2ddc193d9366bb3c94df1b3d008d17be2995df3d3dbcd43c3446843d9367763ad233c63da91b3dbd000000805b93543d7659ea3c8b72ba3d3a4100be00000080009f5c3dc40f873d922ce1bd32cce0bdcc237a3df4cb73bd988d68bef1ddbabdc7f2563d12034f3d497ccebd01d8613dde280b3d17c693bdf72d6a3dd434033d60b6d7bdb06b70bddbae673d96ab5d3d24584ebdc2339cbdb0b675bd189c92bc81e0c4bd3e2b3dbdbd8e95bd1ac1343c405bd13dbca23e3d396f9ebda2b7833e68ff66bd3d687ebd9a39113d03a53d3df37fa83d74a9893df58c993d4e6db8bdf6030cbedcfa7ebd142532be818285bd7426113d02b5903d4d4035bd28f265bdd02b663da385233d2056bfbd115b023db03c183db67f6bbd2238373dd9f904bd22b7e6bd6a08d83dee556f3d00000000ade0acba75698c3d00000000f81e613d4b3c7a3de7e50bbd952b9d3da23788bdd76dc63d2d7cb3bd59220b3d7e4f633d7cf91ebde1db923d2333863dd90ef53dfb1b29bd309d0fbe4f9dbd3d9c365d3d57a273bd67be453d2576ff3c0b43abbde685163d9d25b73c9e81183dc2bb15be5940dc3dd0c4cf3cd1d5813da770743ad315c33df03939bd00000080845d503d6a53e83ccd6fb73da62cfebd00000080394c583db596843d6201debd1f8fdfbd2971753d652e6fbd538966be80adb9bdc0b3523dd9e04a3ddf6dcdbdbf735d3d780f083d5d2a91bd6ead653d6e14023d8e3fd4bd8fda6bbd9f37633dc054593d64394abd3a8099bd5d0f71bd21308fbccde4c1bdbb5139bd100693bdb242303c8a24ce3d22bc3a3dc7bb9bbdb39e823e728962bd89a779bd5f050e3d61c0393db4f7a53d2828873d21de963dd6f4b6bd38820abe1b387abd071b30be521083bd6bf50d3d5f208e3d217c31bd92a961bd07ba613dbd25223d3f48bcbd38d9fe3c41ed143dd523d63e2f30a6be5ce7723ed5b3503f886143bf5a97d9be00000000c1361f3cccf6febe00000000c6a7ccbe1e6ae3be53717f3e548d0ebf056cf73e199933bf669a223feb077ebe93bfcebea700913e864805bf36c6f3be43625dbfe4309a3e1cd6843f05862bbfa03fc9be5676dd3e6810b4be955869be71381b3f575a89be84a727be4f278bbe140b873fb12947bf300e3ebed7eaebbe6f18e1bb9c9930bf87c0a83e00000080a1a9bdbe755354be942526bf6d1c6c3f0000008047d1c4be18e0f0be20e6483f35e1493fbe1adfbe1a73d93e563bcf3fc2fb273f6dc3bfbe63b5b8bec4753a3f2078c9be5a7578bef4c2033f06e3d0beea966dbe5368403f7778d63e0baccebe57bfc5be121fb83eee420b3f4424db3e763c033e51892f3ffbd8a83e686e053fdedaa1bd5d843abfda1faabe45450d3f5488eabf600cce3e4ce3e23ed6a181be523aa9be643b16bf1a7ff5be76e208bf7cdd253f4b25803f6a67e33e789a9e3fe61fee3e8b9781bea90601bfbac0a13ead42cd3e5d54cdbe8fe093be127f2a3fccd368be1ee887be106a0c3eeee1893e82029f3d572
5303e5642663f3d48ee3e04d172bdd485bc3d3547ff3ebf8421bebc6bdd3e023dcb3e23feb53e08b1363fd7ad833e92133b3f3091903e0fafc13f51adf03e4b2e0c3e6612e43e92e1dd3e0d091e3fb063833e89ea713e2698003ff850313fb951543ede510b3feed7bd3ff97d843ee29fd73e0161653eeff1ad3e7755773e09023e3ffd34bd3ebdd90e3f0fa2503eee46033feb2b833eef03febd8727d83e8ae5a83f665d0a3f17b35d3ed825d5bba70bf43e5126033f1900253e8a569a3e88b7ba3e7c70703eb6557e3ec9ba7d3e7c13ae3e5830d03ecc7b683e5adeee3ec9c7b83f2190693e9937bc3e9a92b03f24c51b3e101df53d7e77e33eff50233fd3666b3edf57163eee32983e5416253eee1e513e11fe1f3ecdef4e3ef340053e45e4273f99bccd3eb76e623e886d9e3f8fbd5a3eb44f543e27dbc33fd1e4d63eea1a123f46521a3f75a2083fdbf8533e77cf6c3e0bdb8a3eec4fc83e2d98653ec5310c3f5ec9ea3e3bf2513e3fb13f3e7277c13edbe3bf3ea6c69e3ea869d03eee4ba73eb83d76be0000803f +\ No newline at end of file ++6561373564626664633463643333616230353961393233666432623837643637633235306136316538616536346666303330303165623236623461336335663132333264323661393834653463383762663931323634383337656564633662356134613134343461393333343962653061633337316465373764306438343236646238633661633433636431303134356365656238343633613161333932303639376538356537623535346264396631336335386230623534396462376166303332376661366634333863346266396434653031636538323330626438633336643366303762316439643162316639336563333863386562623037343761373135656439633965356531313037636465333932343337356133336331323530613935313339356235396438343732663038616337393738623666633763653832333261393839393635663730613733616139393365373666313533343465633130613733656636356333326634376531363832353436356232343664303733303663363632376239626564653732333461313432643131376538386433346164303463393637326132376238666363643132326139343432616432313865313564303439653033303063323262663366346261633334616431396539623265663030643433336230373137623039346263363635376163393436373066336330663332316130636233393335393764613166326532343539386161336164636235343562323461356662616163396466383730313566333061616264656436656330666162646239656561386132333533313032363635656161333231316539356433633538323666636334346262363766633863356663643335303364333532343830316362643537616166386630316132633537383264386562656232366533383232323865656264643930346339623738353632646566646164393533366234623761393331346566623730336261353361383331366430326364306533343038616164393337363063633838623064396239626464353839316131616533346236663132363663663739623565366564343962313033386462653236663864613664303962623535363134336361326534666230613461643036356361623365363830623066373064386262326466363934623535633664373865313164633832393733616432306435663337366633306565386434613131613061636663616630643839623737366361623235653838653366303334613439393362313664373862653234396263323361663762626238373531633765333530666363393132333237616137663336333434326233313735303764323765346238313333393839646334396539663036633034396237393461336263306463333738313939303164393262336465656237616466643562303362333632303838336162666434636265363361333466646564333736303437383432623738636461313736336539396364306164393937313064353136643231663135306364643936646137386434613564373335643430363162623534623133366633353137633236613635643433306239653638666165343231636163376139643564653333336637393138356233633164623465396435396364353031663066643661353130633336303365313365323461343062346266323135653337393561396134306436306138343536666237623633633264313165383137303161323331653835616664346235646238316162633164643536643237313733326564303234356638363931626463356461643334373664366636333536303737623161613164363334613937633431303962353962373335343766393430
63316463386432316330663138373338386632643361386565636665653766643836373930326664346163333162613163396664663531626161326134333762666330393261316637333265326138663036363736333734326230373730613665323665333266326265356534376133373330393466336639623431383863383433643563346265613561326134616334626262646637666163376365333962323036323537333737313534313066323364623937303534623665623237306566636366623763623431373666383236383134326365613136653238623231646339353938373030656631646238653961643434653765313834316231346231323563636633356131376538636666613866303638323362643436636638353665346166323633623861616639363862613835306234643961666662656334393366613066653061383333653965343633653837613263636530633863303861313363636361346132613431363962336539613463313366633934633761363761623039396365393263633638656134376162616631363838616238613137323032663864613035313363353335396432373530363233663234343136346339663435333834656630366537353336306137366434646264303466303633663630386363636337383066316631323836353135393134646139373035323637346164303965323536666335393864636364346433383564616435383862366464326130336536313934636139386438373462333162366230623165323533336263313430386430643661386261633061313631613639313232313734383136336464636231323130336231613131373336636238656635353635626666303535663331363332353338313363396334313631333034656133666365353561643830356565633137346638643739636136376432303761633436666465336232356236653164353163346461656165333038653266336161383966633961323462376262386430363536323932373263343731313562356139336265383765336139643837333966623265386131666561366161333138353261663139303338393733346335313934363761393137633266616431663436343236613231663865636236346133616662623761373633663830623231393063616534633032316538626436633731643261313866666339353836373133363939623966636239333637373764323863663964376134323134613133346335326162363334363334386666396666336534646565616264363966356566323465346538613762623864303766646338666264643466666537303263623162326539653435643130313061386235623631306232303139633962653661303038376162323666343263356465313035393535313531373665623537363237373934366266303434333463303562646535623762313565326631393061643033613838363963333933396162363834393636613136613036656435356339616665636134313034663335343465646335333738663063616330353634316466336438373835326638633037656234396239303730613338336332613839323837643561333235393366323531303032666436616435313137313166383433396531626137353932616538333330653164326438656166343339363262366264326632376564396434396333356565343733383137666333386462313164333630303936336137323863313634613031393931613164363237643262353162376133643935373036306336346161376563383862316365613139376535626535626234666331363163303835336362393535613530636234643633626364386566396132333232373733633230393865663664636334393131643133383736373833653261383661623632343237653937366164386564633433396130313166306430316134653864643263343835653438383687144d4395a52c41e3842b41b03f3743f207264565084e41e0fbdf3bd429b13da15eb33d7e47b93b309daa3890d2d93d0000803f2bd5bbb95024b1399db4d33b2956c2392e99b93969b868ba90d4eb3bd611a9b92d66c739a2115b3b1257b6b9ed46b2b917deac3b5b0d2a3c14887e3ce74b3eba00000000386ea33945cf5abb9cbce9ba3c4ff0ba5cc05b3bb4bc9eb91e09b03a6113e9ba3e17b539267dab397fdea9b906eea5397713973bfc7a2c3b51ff733b50c313bb000000003ea5d239842c8c3b6d275e3af5c58b394139b2391f56403bf47e56bb357a81bbc956923b57d2cc39909aaf3944807e3c4a78cbba9938a9b9d84ba7b99796bcb9d7bea5b925e37a39ff19a73ba25dc1397357b1b9c13cd6bb2d36423b6da35a3b90e500bbd402a0b95f930bbbbddfaeb9071fa9395281a6b934e3a23b0f238a3b93e1973964d9a9b98b2f273ce
b4a9aba3ccab6b9e3ba033b48220ebb19fcaf3b6290a93b034407bb782fa1b89335943bb388a7b9655cc0b96a361d3c67b9b0b99e28cf39076f2dbaf05afdbafa6cb43962bc2e3bd590bebc30339f3b4476a23963e6663bd7ad183ca5c00cbb8a69c13956f574ba97dcf93b97098d3bd471ae3bcbb06dbbfddca9b9d7c803bc3eb703bc9bdaa8bb1c6718bc8e5bcebb224c013bcb7684bb4d85943b2ef403bcc090893bf736aa3beb62a13b3c1cb6bbb821a7bbaee808bbfe3dd33c0000008027b0f5bbd6e09d3ce2db183d576fb7bb3b9761bc4111813be54cbcbbab65043c93ce1cbce0b3c6bbca4e953bb347fabbc466ff3a88f532bc11bf4c3b72636dbc00000080850afebbaaba5e3b476f973c7ae755bbde44dbbbfac1633bab7793bbf3ae90bb010a84bb3a1c2cbc6597d6badcd277bbddbbfe3b5898933b6063903b0d8fb33b028d8e3b2adcdcba9f1585bb2e91b9bb582aa03b6334f8bb6e31333b5569253b1e71533b8e977e3bcaf945bba6b59c3b603ab2bb02c88e3b0ae4493bfc457ebb2ae3ddbbe8f9943b0c38a4bb160ed73b73adab3bb6100e3bf28fd83baa70603bad2683bb9d90033c52703a3c9546693b7d34903b63a6b83bf1af9cbb3b19a23bedd0f8bb58efb13b105b72bb17300abc025488bb0a5b543e33573a3b24eff2bb286f75bb5d55a9bbd8e4803beea116bca9cc2d3b1e1205bc1adf75bbfcaa413bf8b491bb3050903b6537a83c1736213cc6271abaf7b6293c98bae33be7b710bb07e333ba55389dbb41441f3c8bf89abbb9acb6bbdeb9acbb416e053be7a63fba124e36bc396d97bb000000809a5a083c15d242bca07d0abca36bba3c09ac8e3cf4ae85bb51f5cd3b8abb18bccc0f383c4a58dc3b335a9ebba65c0b3c2324d9bb35bbb83c748069bba62961bc000000003e13193c863f71bb752b1cbc578e6b3b5807f33b633e9abb9e8dac3c78dcb03c7b35193ac5df483cca37e83ad74a50bcbfd529bc33469cbb786498bbafc6c1bbe0f895bba7a7ed3a8ef7a339cb5ccd3be140abbbc2041a3ce9e950bb828f5ebbd63a6fbbdd1d85bb9d07a93cdc24a7bb765ac53b467b96bbe863b4bb083ea73a7869f83b4cf49dbb874003ba2440dfbb356db8bbf63d2abb79b4d5bb3b1bafbb27487139a45115bc45e7a8bcf34775bbfb9e98bb68e4c7bb1e930fb736e8acbb6c4b753cce27b5bb830ba83c91ce1b3cdc1def3ae198133ef5e688bb6b67073cdd1ac53a27b589ba81469fbbe150293c3c2884bba35bdd3b1583ba3ac3a194bb3debb13c495799bb078c2ebc384b7d3c97a8493cebb6853c73bb4d3c024dd83c696ca13b1cc702bc6847243ce3ac5dbcfa771abc96221fbc6147fabb3641023cb496023d28f266bb00000000bf65693ccfff0d3cbf68a8bb3b268bbc15de51bca098e1bb9c5980bdec61ee3c66cf983c1024423c51e0febb12336c3c25bd1c3dcebf50bc88eb4cbc0eaea73c00000000c56c163c597c5fbcaa7b593cb7d2e33b7b235a3c0c2447bc95c9edbba8320abc75b98b3cb7d19a3cf51d623bcd0a023d7e91ef3c725604bc91fafdbbdf1f21bc2f3bfcbb274a493b7c78393cdb0e213c01a719bc0a0368bd6ad23cbc210741bc0288df3c6303d1bb8c93bbbb97420fbcbe16353cc8bc18bcf9a61c3d62882a3c8a25683cff2e01bcb0580e3c5b11f33c47dd10bc5b522abc4230df3c6ca21c3d13c0103c8129e33c199e5c3c05752dbc9515f0bba0b724bcac0b383c1f2b12bcbe768d3c80baf23cd6fbc9bb1b04843c84da173cae2dbabc64e63dbc08d8613cbd872b3c029c2c3ce63be03c5e41853c608888bd90f692bd68bf2c3cd0f843bc4497bcbb3161e4bb4ede983c42e72bbc0283dd3b176925bcd6d71bbc7ba42cbb911a6a3dadca0b3c68582fbcc664f23c629c1b3c4f33133cd046413c7eb83c3ddbe0d1baf307ad3b00000080101215bcc0708e3ca925dd3b7899f33cfdb1493dc7bbf43bfb8b5dbb6ee9d2bb584924bcb3cf0abc0b6c0c3ce93e14bccd51943a3302dd3acfabb03ca9c22cbc000000005cb22abcaef7f33cdf6aae3ba949d6bb1ef214bc878db23c3863783b62c5363b67d0eb3c5e192dbc75b57cbb3d43ecbae55cd1bb0cd10a3c00d4083c8efd223cccb5063c1b2d70bb3abcbd3c883c14bc0f01133cee04963c6d7bad3c7480353cba707fbbc16cf63bcdd4a939ad1f123cde2f0cbc2c76053c59ff09bb009bba3cb24a0cbca7750b3c64f9493d982287bbbe111e3c7f48483cd59b5bbbea43e1baf906243d99f3e1bbbd01863c4f95d73c25a8073cfa81273c42548f3cef4b153c800841bc318aa9bb7c98873a55c218bc1338373dfa26bb3c22f2c53c6bbc0dbcc7900e3d0819da3ccec359bb9a671fbcfee714bcbb3f74bb8851ce3c1c76ac3cd7fef039437e0a3ccb42bcba11ea103a388af63b94700f3a4ef6293ad86816b900078b3bd1c62cba4ee1213a7a4b2a3bb98b30ba291c30ba4d97813b71c3
033c9815a33b627549ba000000804646103a39a742bb747c0dbbe3ff41bb1186043b9aad27ba947610bad46e52bac8a9053acb9e223a86122dbafce80f3ae1144cba610bc73a34b4543be4f2bbbb00000000dbcd253ac4727d3bcf8cf33a44811d3ac114213ab973fd3a78684ebbfd3384bb29a1ed3a4a70063a11fb103a3f3aa83b629d2aba74f52cba6d1d2cba7aad31bab36e2bba566e013a79481d3b5c663c3a09cf2fba731f29bcc28e1c3b7067313b8bbaa6b9454d28ba15bab6bad4e82eba107a263a51992bbabaa220babbdc023bfd62103a28172dba1db3003cd7ef11b9c15330bae79bbc3a25ae20ba21b1feb95d37183be0206fba67882bbb784a883bba292cba034132ba4df5f43b60222fbaf5b10a3a75e20a3996f8f1bad29f0e3adb20b8397d2257bc2faa893ba3d90f3ab059bf3abba4db3b7757cdb9cde80d3a934d85bb4b8be639fadd0e3b2bb99b3b6dc792bb95cf2dbae68660be8b1d7f3e2632363e6a588c3e0dc7453e8a4979bd1d81593e87b001beae177e3e4f7449be4d9119be437e10bed7162dbe5e43333e3d94acbe80726ebd000000008bb1683edfc6573ebafeb0bdda2e62be36b673bed771d6bd2c5f453e93cd52be9a55963e74b73f3e90e102be14af6c3e2db5afbe2920f4bec3a4fabdc533b53e00000080be5a753e34650dbe4d70473f2edcce3d45ca513ebb4615be2b554bbf9d8900bf1c142f3e7350a23efc363f3dce2ea5bed70f5dbe30da00be4a45fabd5c2c24bebf70f5bdec31433dac40363ef82f333e4d140fbe16dca83e1aabccbd39e8c2bd7f85c2bd7d4bd7bd62b629bf3e2d0bbe0a0a2d3edcbcf6bd11c8c3be4ca7123e92f7543ea87e02be0c1d2d3e5f362abebc491bbe7f4c95bd614426bee96ed8bed1ba3c3e666850bee6ddd53eda2819be1c5efbbd970e2abed2e41c3e043410be6a7d983ecb6910be90d301bf8248823e5c5f1f3e50043c3ed31cfbbdb4c6663e1b1d083e27e14b3eee2aecbd72008c3eb518e03e444e783e3fe10a3e59940dbe1c03e6be9f11fdbd845eda3dbbd702beb486b7bdce1210be4662cabd997aff3ca8acd8bd5d06843d914d02beb963c83d9ca89c3d954a933d7154ab3da863b4bd1ba5283ef949f03c000000004c7ceebddd4ad1bdb34f333d5226db3d8e4ee83d5ec3593d8c34cabd488ed83de8691abe0e1ec4bd1041853dd79cf2bd503b2f3e7b316f3e4cdf7b3d0f623abe000000807d95fbbd06008d3ddcd1c6be1f1152bd17cdd6bd0a1d983d8d68d03ea043833eba4ab0bdf7d026be8359c0bc3967213e041ce33d3e29833d14a97e3d139ca73da1ae793d8060c4bc8356b7bde92cb7bd2ed5913d1c2d2abe50a24f3ddcf4463d94d5473d83a45a3dfce4b13ed5ce8d3dd2d1b0bdde047b3d6462433e65aa94bdaa15dabdfbda843ddf6caebd139fae3db36f9e3d67e8183d6bf4aa3d48e8573e985cbdbdc40fd63dbacb50be5309983d43cb7f3ddaafad3d4ad59ebd6bfe923d7fa51cbe58b4933de72d873e13a905bede95a0bd7d6cbabdcc197e3d5e80ecbd63418abd023cccbd87c7723da8b70fbe2a2867be97f7febde8028dbd3f9c8e3d8e4a6b3e19c6803d64c364be492c7e3ea28b373e97c28b3e8750453edc6c7ebdc7ad5a3ebebd01bead267d3ef16349beeb7919beaf7510be7b8e2dbe0fc8333e5bb5a0bea8de6ebd00000080e2f0673e49ed5b3e0ff5b0bdadf362be411c79be6dc5d6bd38f5473e5cfe52be16a8953ea54c3f3efbec02bec1e56b3ea8e6a6beceeefcbe360af8bde438b13e00000000fe7f743e219a0bbe578b503f0c26cf3d7a3a513e365516be9dbc4bbfd3f500bf66902f3e5686a13efa6a403dd3e599be7b1e5dbe9ce800be3b6cfabddc0224be459ef5bd6a68443da0f7363ee9db323ee40d0fbe9c41a63e8c16cbbd8073c2bdc758c5bd759dd7bd9bd829bf9d2c0bbe6ac32c3e52e9f6bd1dcdb9be9ba9123eaa61543eaa8a02beaf7d2d3ebbb72abe432f1bbe509694bd083127be2a22cdbec25e3d3e8c9950be2984e23e170217be3d83fbbd95da29be504f1d3ebd2b10bec5ca973e92d110be045402bf55c9813e1ceb1f3ed27c403ef27ffabd3b0a663e5ed4073e34104d3ecfe2eebde06a8b3e4098ea3e7f7e7c3ea2ba0a3ead3e0dbe56e6e7be4d34fdbda585d93d565202be9adfb6bdd9830fbe879ac9bde754003dfad4d7bd69c8833d9cc801bea9a2c73def389c3ddaed923dddc2aa3d3cc1b3bdc90c283ebfadf13c00000000f887edbd1c7dd0bd6f8b333d324cda3da364e73d3ead593d736fc9bd35b7d73d87d519bedf5fc3bd5c00853d67a4f1bd58a12e3e929a6e3ecf6f7b3da3c539be000000006f93fabd0aa88c3d88b0c6bebaf651bd98f4d5bd99ae973df84dd03e3efc823e46afafbd813726bec6c4c1bc63d1203e0f38e23d56ed823d85417e3dfb17a73ddb51793d9acec5bc5aafb6bd5682b6bd407b913d899329be5d954f3dedfb463
d3dd9473dd18b5a3d75b6b13ebc7c8d3d7832b0bdc9a57a3d02c6423e454294bde339d9bd419b843d8dd4adbdce06ae3d31fc9d3d5952193d8662aa3d264d573eafabbcbdf03bd53d532f50be229b973dad607f3dfe20ad3d86599ebd9fa1923d940f1cbe814e933d2fe8863eda2105be8016a0bd11c0b9bd30a57d3d8f8eebbd6eee89bd1e75cbbdf16a723d0f290fbe186166be26f5fdbd51aa8cbd14418e3da3b26a3ef78e803db37c55be8869803e551f343ee1688d3ef091463e67047abd32cf543e448a01beb9c97f3e5edf43be38bb19bef68410be0e5427be2b1c313eda8aa5be8b506bbd000000001f106a3ea4754d3e08b6afbd824756be411864be609dd5bd781b453ed7a654beea8f973e7373403ecebe02be091e6e3e4d0aacbe8d43eabed2b3f5bd1d09b73e00000080f3ed763e3d9d09be7ae8433fd0eccd3d6ccb523eb88314bec41f4cbfad7a00bfb3112d3eafc0a33e9b7c3a3d956b9ebe160a5fbeb9af00beb8d9f9bd597c24bebbf7f4bd1724403d29fd333e8996323ec9160fbee170a63ea862cabde5e5c1bd27dfc3bdf772d6bd58af2cbf06240bbe0b7d2d3eae46f6bdbadcbfbe0ae7113eb604563e525a02be213f2b3e9a632bbe177c1bbef8df94bd6ac827be0811d4be4cee393eff3152bee817cd3eac6e14be13f6fabd0d742abe89e31b3e3a3e10be16c5993ea6e910be3fe703bfac2f833ef7a21d3e7909373e8ee3f7bdd21e683eddac073e9689483e3d17eebdeb118d3edf18e23eadd2783e23610a3e06310bbef644e6be7ea1fcbdd5fc54be10f67e3ee44a323e40788c3e81c4443ef9a975bd46e1523e93c9ffbdb4e77d3ea3f442be80fe17be65d30ebeed6f26be7f4a2f3efb84a4be77da67bd00000000242e683e481d4c3e10a2adbdd0c855be48bc62bef58fd2bdbc95443e4eb652be269f963eb89e3e3ea51801bec03a6c3ea7feaabef522eabedd0ff4bdd458b63e000000001c0c753eadc408beb11d433f38f3ca3d58f1503e7ba413beaa0b4cbff38300bfe6452b3e46cfa23e204c383d95699dbe50115dbe5e19febd0d9cf6bd6fb722be14bef1bdc62d3c3d292d323ee5ef313e1d660dbe7c24a63e8ad5c8bdf659c0bd7e0dc1bd856cd3bd5f882cbf457609be5eb42b3e130df3bd07cabebe3a36103e2128543ed9b400be3f72293eb79729be3bbc19bea27493bd7b0026be49f4d2be4216383e044450be0965cc3e0f9213bee4b7f7bd54ab28be262a1a3ef8880ebeb615993e2f3b0fbe2ec103bf733d823e33e41b3e43a3353ea33af6bd793b663e5506063e46aa463e6cffeabd171e8c3e1e40e23e3653783e79b8083e09560abe8942e6be3d6ff9bdbd38d93d863e02bebf73b6bd4a890fbed80bc9bdf4adfb3c5db3d7bd6f6a823d34b501be4a36c73d35f69a3d6f9f913d6f17aa3dd251b3bd7c66283ede43ec3c00000080903fedbd9345d0bdc3a9303d1506da3d815ee73d29ce563d1dc8c7bdcc47d73ddbee19beeecac2bdf8a3833d3564f1bd010b2f3e814a6f3e1579793d78ea39be00000080c45dfabda5c78b3d4464c7be72ac4fbd4780d5bde0c0963d090bd13e3866833e4436afbd9f6326be3986bdbc511e213eb3dbe13d858e813d097d7b3dc9e3a53d9088763de970c1bcde44b6bd13e0b5bd472b903d739129bee15d4d3dd9b3443d1167453db9af573d3f56b23eca288c3d9c79afbdb1db773d2c51433e6f8f93bd58cfd8bd553e833d5e58adbdba47ad3d43bc9c3d72ed163d379ea93dedf6573e1a4fbcbd4fc8d43dae9a50be7ecf963d489e7c3d47f5ab3d02bc9dbd1253913d5c271cbef35d923de557873e491805be1b819fbdf021b9bd76a77b3d5f44ebbdc82689bd8737cbbd642b703d59300fbe636d65beae3efcbd3fe88bbd665d8d3d2b616b3e6f5e7e3d33c4ef3d5a4ff3bd7221abbd56eb05be0ea3bcbdde3df03cb7dfc9bdeb15773d1eb0f2bd5dc1ba3dd366923d99b6893d81cd9f3d1c33a8bd0aed1c3e4f05e13c00000080dacfddbd5efae8bdeae7273d86fcec3d8ad5023e45194c3db68abbbdffc7c93d53820fbe25d1b6bddc5d793dd2a2e1bdba0d233ec440853ea3646b3d857249be000000007043eabd30ab833d07e5cfbe76a644bdd424c8bdbff98d3d7c0be53e2564933ec465a4bd3e031bbe2e56b5bcca32163e3192d33daa7b753da05e6e3def8e9c3d62bf693d4910b9bc23efaabdd2cfaabd035c883d22501ebef256423d6b493a3d002e3b3dc8e94c3de1f9c33ea59f843d07eda4bdcdfd6a3daec8353e0fc58abd45e9cabd99a0783d9faaa2bd18e4a23d8b0c943d8d7d0f3db27d9f3d48dd483e4686b0bdc977c73d3ebb61beade78d3d7a6b6f3da931a23d523394bd726f893d4b9311be6fee893da5ab933ee588f8bdb6d395bdc374c6bda6756d3d41f9dbbdec1c81bde952bebd800b633d339705bec81c70be3e29ecbd59ab83bdd329853df8b67e3eb00a713ddfe7da3d302f03bede
f5b7bd617710be3adecabd9925003dac34d9bdff59843d94a402be1adfc83dc9099d3d93a6933da9bbab3dbad0b4bd7523293e061df13c000000803316efbdebccd1bd67cf333d53b0db3d8ee3e83d1c555a3dfab0cabd6816d93d61d91abe7895c4bd4295853d833af3bdefc02f3e6800703e197b7c3d20f43abe00000000ea3afcbd76558d3dc78ec7becc9652bde652d7bd9478983d8d2ed13eacb8833e0bb5b0bda74d27be3905c1bcc5dd213ed4ace33d817c833dbf4b7f3deb03a83de74e7a3d9a0fc5bc92c5b7bdcf9ab7bd5030923de4ac2abe712a503d8179473d7a5a483d66365b3d318db23ea6278e3d893bb1bd31a67b3db0fe433ee10395bd989edabd042f853df9d5aebd7b08af3dc5d19e3dd85a193d715bab3d3f9c583ee0cfbdbde795d63d457751bed464983d2137803d951bae3da5349fbd095a933dee161dbe810d943d77a7873e8b0306be42f6a0bd9eddbabd9fb67e3db718edbd44958abd3cbaccbd105f733df81b10bef0ee67be7fa0ffbd3c588dbd8ff28e3def146c3e0718813de067dc3d06c0edbd469cb0bdeadf02be9afab7bdf09be93cdb30d3bd137f703d05c5ecbd0b1cc03da3928e3dee15863dd94aa73dd501adbd8925193ee9f7db3c0000008069b6d8bd8bd8d8bd54ad233dc8f8de3d90f3ef3d7c8c463dcc09b7bd9ce5c43de6400cbe894cb2bd5ab9723da474dcbd0a1c1f3e7f24853e7081683d7f3629be000000806996e4bd6029833d2114cebebb803fbda83cc3bd89748e3d60d6fe3ef831963e57f5a8bdeb7f17be8437b0bc0497123e6177ce3db5ee6e3d23fd673d627f983d127a633d12e6b3bc091ab0bd9d90a6bd6dc3843dcf6f1abe97e73d3d42b3353db840363d7958473d2e08d53e711d813d78cea0bde6b0643d7357313eab9c8cbda337c6bd9000723d57d4a6bd08de9e3db62e903d2adc0a3dc98a9b3da3e4433e9761b6bda3a2c23d88e95cbe44258e3dc703693d85019e3d260797bd4ed0853dad460ebec774863d7b2d9a3e4bddf2bdaf889cbd62bdbebdd3b36a3d1eead6bd600882bd10dac5bd982e5d3d5b8d02be3cc450be27a8e6bdc1da84bd8da8843d7dbe813ec89a6a3d5d4fd93d504a02bef881b6bdda840fbe6651c9bd08e5fc3c039fd7bde720833dfabf01be1a5ac73de7b49b3d255c923dbf54aa3d0961b3bd2b1b283e27d3ed3c00000000bf63edbd2f3ed0bd8cc5313d5e17da3d593de73d7e1b583daf29c9bd3481d73d4fdc19bef20ec3bdf15a843dc583f1bd42b22e3e17b56e3ef0137a3d86d939be00000080327bfabd7f108c3d1eafc6be445850bdf2b8d5bd3027973d7c49d03e1809833e8d49afbd494426beb813bebcb2dc203eb50ce23d8f44823d19e57c3d21a3a63dc8ed773d3b12c2bc7b52b6bd2322b6bd84e7903d63a229be96fa4d3d0855453df235463de0fc583de3bab13e86e38c3d66c9afbd4844793d62dc423e51b693bdd100d9bd36f5833da96cadbd4d9fad3d067b9d3de17a173d2ef6a93d9f66573ea456bcbd4703d53dde4750be7513973da7067e3d68b4ac3dbbdb9dbdc20f923d03181cbe9ac1923db0f4863e731b05bec99b9fbd8f67b9bdef4c7c3d4e68ebbd955389bd3531cbbdf603713db5290fbeafa966be4ce2fdbd4e138cbdb0ab8d3d20cd6a3ebec67f3d9ef957bec152813e51cb353e30558e3ebd52483e7b4a7fbd8d69563e912703beafcb803e766246beae671bbe1e3012bed9ca29be58b7323e5333a6be425070bd0000000002d56b3e9e104f3eb773b2bd96c058be00b365bea4b5d8bd05fb473e264856be637f983e1c29423e675d04be1de46f3e63e3acbeebabebbed539fabdc7e5b73e0000008056b7783e41f20bbe89cb433f31f4d03db489543e23e316be25e44cbf124b01bfc8ae2e3e2cb1a43e2ba9403df0169fbee4a860be1f4e02be7910fdbd392e26befc28f8bd68ad443d439d353ed46c353e7ac010be6a06a83ec8a1cebdf813c6bd4ed7c6bd5a93d9bd51652dbfe8c90cbe592c2f3e997af9bd79a1c0beb47e133eabc4573e0df903be26d92c3ea30c2dbe23271dbe1b9e98bde47029bea0c8d4bef08a3b3e45d453be30e9cd3e5fcf16beb92dfebdf1282cbe84801d3e71e411bedab19a3e8e8e12be2aad04bfee17843efd391f3e6195383e086cfcbde1e1693e263f093eca2b4a3edc29f1bd7efb8d3e0590e33ef7c97b3ebbf50b3e55880dbe9ddbe7be46e8ffbd1e25dc3d43edebbdb496b2bde2cb01be45cbb6bd770cdf3c01c6d8bdc1256f3d43f5eabd8362c53d3cc28d3d4c54853dd29ba53d9770afbd10b4173ecd44da3c000000007929d7bda647d6bdc5a7223d5f05df3dcf97f03d8766453d51a8aebd044bb93dab090bbe092cb1bd0b5d713d14dcdabd85401f3ef69e863e5642673d834627be0000000027e2e2bd9f81833d66c6cfbe2b5e3ebd45efc1bd2f718f3db1c6033fe13b983e9b6eabbd711b16bea4adaebcd0d0103e190bc
23db6976d3d9daf663d159c973d4532623dd556b2bc09a2b2bde48ba5bdf103843d55930ebecdbc3a3dfc6c323d04ba2c3d4a32463d0351da3ec763803dded69fbdad67633de850323e85e68cbd5ce2c4bd54a5703dbedaa8bd0bf1953d535b8f3da31c073d67eb923d5e27463e09a1b9bd6537b73d18d55dbe873e8f3d0bb5673df5129d3d348197bdfb0e853dd2060dbeb05e7d3daf629c3e4dfaf0bd170c9abdae6abcbde6c6693d0463d5bdf8c081bd8700cabdae54513df47a01be929b4abeac77dabd8ba984bd5f2e853dc5af833e124a693da068db3d435203beb583b8bd3e9310be5542cbbd5318013dc8b9d9bd6eef843d55bf02beff62c93d6e9d9d3d903b943d7c4bac3d1b60b5bdac48293e54fff23c000000800d6aefbdb152d2bd28e5343d122edc3df060e93d657b5b3d5716cbbda297d93decde1abedbfcc4bd882a863d5e8cf3bd52e42f3e28f76f3e889f7d3df5ae3abe000000000d91fcbd8ce78d3dd882c7be65b053bd17b2d7bd7b08993d5422d13e67b2833e2a45b1bdc15b27beeca6c2bc9806223ef729e43df711843d263b803d8595a83d58797b3d59b0c6bcfd53b8bd1a0fb8bd67c5923d5abe2abe1d48513d1a93483da975493de55b5c3de780b23ee0bc8e3dbaa9b1bda4d07c3d8618443e2c9895bd10fddabd50c4853da666afbd4696af3def649f3d875f1a3d24eaab3de3ac583eff5cbebd1d18d73de03451bef2f5983d2acc803dc3abae3dd1c79fbde8ee933da02a1dbe939f943df69f873ee62506be5d89a1bdf268bbbdc9d87f3de56dedbd112a8bbdd742cdbde882743d663810be419467be8de4ffbdfcec8dbd77838f3dfa106c3effac813dbb9bd83df8e801be3be5b5bd5c1f0fbe06acc8bdc9bcfc3cc8ecd6bd0ab4823d0c5f01be68b1c63dba2d9b3d33de913d7bc1a93d9ec6b2bd76ae273e0ec5ed3c00000000c2aaecbd1590cfbd9163313d7863d93de182e63df77c573d8680c8bddaced63d3f7419bedf6dc2bd69ec833df5c8f0bdb5442e3e68416e3e4a4b793dc16a39be00000000a3bcf9bd9f9a8b3dbc70c6befed24fbd1c0cd5bdaea5963d030ad03ee3ce823e66b2aebdb3d925be5c59bebc4671203edb54e13d8bd8813d0d167c3dea11a63d5025773d6255c2bcd7b5b5bd8f8ab5bd1b6b903d653529bedd6c4d3df0d3443de5b2453dab5b583d587eb13e516b8c3dc436afbdba79783d666c423e6c3893bd4e52d8bd1787833dfcd6acbd3f09ad3d1ff29c3d1640173d2f63a93dfdf4563e91b5bbbd4c52d43d74d64fbe0592963dcf357d3de31dac3dd9539dbd6592913d44af1bbe5244923d61ba863e42b904be32129fbd88c8b8bd8a817b3d3fb0eabdcee088bd3986cabd3046703d5bc40ebe001b66be521ffdbd859d8bbd10348d3dc4596a3e52f37e3dd45312bfaa602f3fe11df63ebb01413fd9d0073f7a322cbe4829113ff6b1b0be71a42e3fb04c06bf5589d1be8e02c5be4ccee5be0ce2f13e93a161bf7fa221be0000008066e21f3fe5430c3f2b1371be2dd712bf8b9d1bbfebcd91be8376073f3e0611bfeac64e3f7c9e033f4456b2be03a1223f78686abf9a0ca0bf4028a9bed568793f0000000054a3283f4649bdbe97d3044098a78d3eb917103f8020ccbea7380bc00f93afbf6e6cec3e44555f3fdc9c023e1ef757bfe81018bf1187afbe5e6baabecc24e0bec01aa7bed44b053ef6d5f53e55fcf53e6312c3bed4af633f98978bbe09c985be245586bef96492be41baedbf6cb8bdbe2880ed3e13f7a7be76a582bfc688c73edc3f123fd4cbb1be33ebe93eab18eabe37eed3be14f64dbe0e34e5be785690bfe6ddfd3ea95b0fbf14af8b3f3207ccbef42fabbe9c3ee8be041ed53e8aa7c4bea8cd513faa28c6beffc3b4bf8917333f1f72d73eb7f2f93ebba5aabe9d8a1e3fcd9fb93ef1dd083f7006a3bef684403fd48d9a3fe9ab2a3f1f4fbd3e516fbfbe72649dbf5459acbe3ce2d63ea5fae63e06cfa93e0415b53e83fcd73e8a1c973ee8499d3e09299f3e10e6c63e0965983e4656b23e3343b03e9e43b23e57bd953ee7f3f33e658b873e941a7bbe04d2bb3efa54463f9b0d723ede9fb43e4aa5163f555c8c3e7521803ed201e23e9267873eb546b83e07bcb03e8701af3ef7e2da3e2b1a333fcc08b23e71089b3e6536c2bdf12ce43ec3c4a43ec9deee3e6c04af3e90f0c53e19aa783ed7025a3f4a48103f9794d23ef147973ed7634a3e9084ee3ef8e9e73e1837a83e1fe2ac3e7ea6bb3e24ec9b3e3dc32f3e27f7bd3e2315cd3e3a67af3e5dde993e6209bd3e991da23e0112953eeb4cc73e4ea7373f4389b63e53e5bd3e38aba33ea7beed3e2dded33eadb68e3ecaeeb23ec79a963ef6c5d53ef7deb93e4ef4a53e4b2ec33e7825013f4d50bf3efc34e03e747f973e4718a23eeadfc03ea089cb3eab6da43e8e25a33ebef7e43ea9b3ea3eebbb0f3fcbaa8f3ef829043fcc263e3f917b973e8b8aa43ea938eb3e
f7ec9a3e9acda03e25379f3ef927b13e73b5b23eedf0d13e6f15993ef928033fdae3d63e81e4963d
+\ No newline at end of file
+diff --git a/gcc/opts.cc b/gcc/opts.cc
+index 432b822e8..7508fc817 100644
+--- a/gcc/opts.cc
++++ b/gcc/opts.cc
+@@ -3486,12 +3486,19 @@ common_handle_option (struct gcc_options *opts,
+ break;
+ 
+ case OPT_fauto_bolt_:
+- opts->x_flag_auto_bolt = true;
++ if (get_optimize_decision_from_ai4c ())
++ {
++ opts->x_flag_auto_bolt = true;
++ }
+ /* FALLTHRU */
+ case OPT_fauto_bolt:
+ if (opts->x_flag_bolt_use)
+ error_at (loc,
+ "-fauto-bolt conflicts with -fbolt-use.");
++ if (get_optimize_decision_from_ai4c ())
++ {
++ opts->x_flag_auto_bolt = true;
++ }
+ break;
+ 
+ case OPT_fbolt_use_:
+@@ -3499,6 +3506,10 @@ common_handle_option (struct gcc_options *opts,
+ if (opts->x_flag_auto_bolt)
+ error_at (loc,
+ "-fauto-bolt conflicts with -fbolt-use.");
++ if (get_optimize_decision_from_ai4c ())
++ {
++ opts->x_flag_bolt_use = true;
++ }
+ break;
+ 
+ case OPT_fbolt_target_:
+-- 
+2.44.0.windows.1
+
diff --git a/gcc.spec b/gcc.spec
index 5712046ad5db3f3a033db29a3fe0c6df5abf2a4f..940b51688c13604f01b4aa55541ac4dc90c6d795 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 73
+%global gcc_release 74
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -461,6 +461,8 @@ Patch352: 0352-Add-hip10c-machine-discription.patch
 Patch353: 0353-Add-hip10a-machine-discription.patch
 Patch354: 0354-Fix-for-hip11-and-hip10c-addrcost_table.patch
 Patch355: 0355-Fix-errors-in-ipa-struct-sfc-IBMY84-IBN2JO-IBN42Q.patch
+Patch356: 0356-add-llc-allocate-feature.patch
+Patch357: 0357-Enhancing-BOLT-Optimization-with-AI.patch
 # Part 1001-1999
 %ifarch sw_64
@@ -1602,6 +1604,8 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch -P353 -p1
 %patch -P354 -p1
 %patch -P355 -p1
+%patch -P356 -p1
+%patch -P357 -p1
 %ifarch sw_64
 %patch -P1001 -p1
@@ -4239,6 +4243,12 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 %changelog
+* Mon Feb 24 2025 chenhong - 12.3.1-74
+- Type:Sync
+- ID:NA
+- SUG:NA
+- DESC:Add feedback-based llc allocate feature and support llc prefetch instructions
+
* Mon Feb 24 2025 huzife <634763349@qq.com> - 12.3.1-73
- Type:Bugfix
- ID:NA