diff --git a/0365-add-llc-allocate-feature.patch b/0365-add-llc-allocate-feature.patch new file mode 100644 index 0000000000000000000000000000000000000000..e7c8e126ae3f9ac4549925bbfdd56e90571815c6 --- /dev/null +++ b/0365-add-llc-allocate-feature.patch @@ -0,0 +1,8452 @@ +From 43e93c6df874a0bf78675fb4d3586d9ad1cb7dac Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Tue, 25 Feb 2025 16:27:36 +0800 +Subject: [PATCH 1/2] add llc allocate feature + +--- + gcc/Makefile.in | 1 + + gcc/auto-profile.cc | 491 +- + gcc/auto-profile.h | 30 + + gcc/builtins.cc | 82 + + gcc/builtins.def | 1 + + gcc/cfgloop.h | 3 + + gcc/common.opt | 28 + + gcc/config/aarch64/aarch64-protos.h | 6 +- + gcc/config/aarch64/aarch64-sve.md | 48 +- + gcc/config/aarch64/aarch64.cc | 18 + + gcc/config/aarch64/aarch64.md | 39 + + gcc/dce.cc | 1 + + gcc/doc/tm.texi | 21 + + gcc/doc/tm.texi.in | 6 + + gcc/internal-fn.cc | 115 + + gcc/internal-fn.def | 4 + + gcc/ipa-pure-const.cc | 1 + + gcc/optabs.def | 2 + + gcc/opts.cc | 52 +- + gcc/params.opt | 62 + + gcc/passes.def | 2 + + gcc/print-rtl.cc | 6 + + gcc/rtl.def | 9 + + gcc/rtl.h | 4 + + gcc/rtlanal.cc | 2 + + gcc/sched-deps.cc | 4 +- + gcc/target-insns.def | 1 + + gcc/target.def | 31 + + .../g++.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-relion-expand-kernels.C | 52 + + .../g++.dg/llc-allocate/multidim_array.h | 186 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-cross-bb-indir-mem-acc.c | 36 + + .../llc-allocate/llc-extend-outer-loop.c | 61 + + .../llc-feedback-branch-in-loop.c | 39 + + .../llc-allocate/llc-feedback-break-in-loop.c | 41 + + .../llc-allocate/llc-feedback-goto-in-loop.c | 50 + + .../llc-feedback-same-loop-cycle.c | 129 + + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + + .../llc-prefetch-full-pldl1keep.c | 14 + + .../llc-prefetch-full-pldl1strm.c | 14 + + .../llc-prefetch-full-pldl2keep.c | 14 + + .../llc-prefetch-full-pldl2strm.c | 16 + + .../llc-prefetch-full-pldl3keep.c | 14 + + .../llc-prefetch-full-pldl3strm.c | 14 + + .../llc-prefetch-full-pldl4keep.c | 14 + + .../llc-prefetch-full-pldl4strm.c | 14 + + .../llc-prefetch-full-pstl1keep.c | 14 + + .../llc-prefetch-full-pstl1strm.c | 14 + + .../llc-prefetch-full-pstl2keep.c | 14 + + .../llc-prefetch-full-pstl2strm.c | 14 + + .../llc-prefetch-full-pstl3keep.c | 14 + + .../llc-prefetch-full-pstl3strm.c | 14 + + .../llc-prefetch-full-pstl4keep.c | 14 + + .../llc-prefetch-full-pstl4strm.c | 14 + + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + + .../gfortran.dg/llc-allocate/llc-3.f90 | 211 + + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + + .../llc-trace-multiple-base-var.f90 | 62 + + .../llc-unknown-type-size-unit.f90 | 58 + + .../llc-allocate/llc-wrf-4-outer-loop-num.f90 | 320 ++ + gcc/timevar.def | 2 + + gcc/toplev.cc | 6 + + gcc/tree-cfg.cc | 11 + + gcc/tree-cfg.h | 1 + + gcc/tree-pass.h | 3 + + gcc/tree-scalar-evolution.cc | 8 +- + gcc/tree-scalar-evolution.h | 3 +- + gcc/tree-ssa-llc-allocate.cc | 4150 +++++++++++++++++ + gcc/tree-ssa-loop-niter.cc | 38 +- + gcc/tree-ssa-loop-niter.h | 3 +- + gcc/tree-vect-loop-manip.cc | 266 ++ + gcc/tree-vect-loop.cc | 10 +- + gcc/tree-vectorizer.h | 1 + + 76 files changed, 7308 insertions(+), 45 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C + create mode 100644 
gcc/testsuite/g++.dg/llc-allocate/multidim_array.h + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-branch-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-break-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-goto-in-loop.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-feedback-same-loop-cycle.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 + create mode 100644 gcc/tree-ssa-llc-allocate.cc + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 65f683bbd..ef7733580 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1659,6 +1659,7 @@ OBJS = \ + tree-ssa-loop-niter.o \ + tree-ssa-loop-array-widen-compare.o \ + tree-ssa-loop-prefetch.o \ ++ tree-ssa-llc-allocate.o \ + tree-ssa-loop-split.o \ + tree-ssa-loop-unswitch.o \ + tree-ssa-loop.o \ +diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc +index 5e85381ce..97c3bafd5 100644 +--- a/gcc/auto-profile.cc ++++ b/gcc/auto-profile.cc +@@ -49,6 +49,9 @@ along with GCC; see the file COPYING3. If not see + #include "auto-profile.h" + #include "tree-pretty-print.h" + #include "gimple-pretty-print.h" ++#include ++#include ++#include + + /* The following routines implements AutoFDO optimization. + +@@ -95,6 +98,8 @@ along with GCC; see the file COPYING3. 
If not see + */ + + #define DEFAULT_AUTO_PROFILE_FILE "fbdata.afdo" ++#define DEFAULT_CACHE_MISSES_PROFILE_FILE "cmsdata.gcov" ++#define DEFAULT_ADDITIONAL_PROFILE_FILE "addldata.gcov" + #define AUTO_PROFILE_VERSION 2 + + namespace autofdo +@@ -117,6 +122,14 @@ private: + bool annotated_; + }; + ++/* pair */ ++static bool ++event_count_cmp (std::pair &a, ++ std::pair &b) ++{ ++ return a.second > b.second; ++} ++ + /* Represent a source location: (function_decl, lineno). */ + typedef std::pair decl_lineno; + +@@ -311,6 +324,9 @@ public: + /* Mark LOC as annotated. */ + void mark_annotated (location_t loc); + ++ /* Compute total count threshold of top functions in sampled data. */ ++ gcov_type calc_topn_function_total_count_thres (unsigned topn) const; ++ + private: + /* Map from function_instance name index (in string_table) to + function_instance. */ +@@ -338,6 +354,244 @@ static autofdo_source_profile *afdo_source_profile; + /* gcov_summary structure to store the profile_info. */ + static gcov_summary *afdo_profile_info; + ++/* Check opts->x_flags and put file name into EVENT_FILES. */ ++ ++static bool ++get_all_profile_names (const char **event_files) ++{ ++ if (!(flag_auto_profile ++ || (flag_cache_misses_profile || flag_additional_profile))) ++ { ++ return false; ++ } ++ ++ event_files[INST_EXEC] = auto_profile_file; ++ ++ if (flag_cache_misses_profile) ++ { ++ if (cache_misses_profile_file == NULL) ++ { ++ if (additional_profile_file == NULL) ++ { ++ additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; ++ } ++ event_files[PMU_EVENT] = additional_profile_file; ++ } ++ event_files[CACHE_MISSES] = cache_misses_profile_file; ++ } ++ else if (flag_additional_profile) ++ { ++ if (additional_profile_file == NULL) ++ { ++ additional_profile_file = DEFAULT_ADDITIONAL_PROFILE_FILE; ++ } ++ event_files[PMU_EVENT] = additional_profile_file; ++ } ++ ++ return true; ++} ++ ++static void read_profile (void); ++ ++/* Maintain multiple profile data of different events with event_loc_count_map ++ and event_func_count_map. */ ++ ++class extend_auto_profile ++{ ++public: ++ bool auto_profile_exist (enum event_type type); ++ gcov_type get_loc_count (location_t, event_type); ++ gcov_type get_func_count (unsigned, event_type); ++ gcov_type get_topn_function_total_count_thres () const; ++ struct rank_info get_func_rank (unsigned, enum event_type); ++ /* There should be only one instance of class EXTEND_AUTO_PROFILE. */ ++ static extend_auto_profile *create () ++ { ++ extend_auto_profile *map = new extend_auto_profile (); ++ if (map->read ()) ++ { ++ return map; ++ } ++ delete map; ++ return NULL; ++ } ++private: ++ /* Basic maps of extend_auto_profile. */ ++ typedef std::map loc_count_map; ++ typedef std::map func_count_map; ++ ++ /* Map of function_uid to its descending order rank of counts. */ ++ typedef std::map rank_map; ++ ++ /* Mapping hardware events to corresponding basic maps. */ ++ typedef std::map event_loc_count_map; ++ typedef std::map event_func_count_map; ++ typedef std::map event_rank_map; ++ ++ extend_auto_profile () {} ++ bool read (); ++ void set_loc_count (); ++ void process_extend_source_profile (); ++ void read_extend_afdo_file (const char*, event_type); ++ void rank_all_func (); ++ void dump_event (); ++ event_loc_count_map event_loc_map; ++ event_func_count_map event_func_map; ++ event_rank_map func_rank; ++ event_type profile_type; ++ gcov_type topn_function_total_count_thres; ++}; ++ ++/* Member functions for extend_auto_profile. 
*/ ++ ++bool ++extend_auto_profile::auto_profile_exist (enum event_type type) ++{ ++ switch (type) ++ { ++ case INST_EXEC: ++ return event_func_map.count (INST_EXEC) != 0 ++ || event_loc_map.count (INST_EXEC) != 0; ++ case CACHE_MISSES: ++ return event_func_map.count (CACHE_MISSES) != 0 ++ || event_loc_map.count (CACHE_MISSES) != 0; ++ case PMU_EVENT: ++ return event_func_map.count (PMU_EVENT) != 0 ++ || event_loc_map.count (PMU_EVENT) != 0; ++ default: ++ return false; ++ } ++} ++ ++void ++extend_auto_profile::dump_event () ++{ ++ if (dump_file) ++ { ++ switch (profile_type) ++ { ++ case INST_EXEC: ++ fprintf (dump_file, "Processing event instruction execution.\n"); ++ break; ++ case CACHE_MISSES: ++ fprintf (dump_file, "Processing event cache misses.\n"); ++ break; ++ case PMU_EVENT: ++ fprintf (dump_file, "Processing other PMU events.\n"); ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++/* Return true if any profile data was read. */ ++ ++bool ++extend_auto_profile::read () ++{ ++ const char *event_files[EVENT_NUMBER] = {NULL}; ++ if (!get_all_profile_names (event_files)) ++ { ++ return false; ++ } ++ ++ /* Backup AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE since we will create ++ new ones for each event_type. */ ++ autofdo::string_table *string_table_afdo = afdo_string_table; ++ autofdo::autofdo_source_profile *source_profile_afdo = afdo_source_profile; ++ ++ for (unsigned i = 0; i < EVENT_NUMBER; i++) ++ { ++ if (event_files[i] == NULL) ++ { ++ continue; ++ } ++ profile_type = (enum event_type) i; ++ dump_event (); ++ gcov_close (); ++ auto_profile_file = event_files[i]; ++ read_profile (); ++ gcov_close (); ++ ++ topn_function_total_count_thres = param_llc_allocate_func_counts_threshold; ++ if (param_llc_allocate_func_topn > 0 && profile_type == PMU_EVENT) ++ { ++ topn_function_total_count_thres ++ = afdo_source_profile->calc_topn_function_total_count_thres ( ++ param_llc_allocate_func_topn); ++ } ++ ++ process_extend_source_profile (); ++ ++ delete afdo_source_profile; ++ delete afdo_string_table; ++ } ++ ++ /* Restore AFDO_STRING_TABLE and AFDO_SOURCE_PROFILE. Function ++ END_AUTO_PROFILE will free them at the end of compilation. */ ++ afdo_string_table = string_table_afdo; ++ afdo_source_profile = source_profile_afdo; ++ return true; ++} ++ ++/* Helper functions. 
*/ ++ ++gcov_type ++extend_auto_profile::get_loc_count (location_t loc, event_type type) ++{ ++ event_loc_count_map::iterator event_iter = event_loc_map.find (type); ++ if (event_iter != event_loc_map.end ()) ++ { ++ loc_count_map::iterator loc_iter = event_iter->second.find (loc); ++ if (loc_iter != event_iter->second.end ()) ++ { ++ return loc_iter->second; ++ } ++ } ++ return 0; ++} ++ ++struct rank_info ++extend_auto_profile::get_func_rank (unsigned decl_uid, enum event_type type) ++{ ++ struct rank_info info = {0, 0}; ++ event_rank_map::iterator event_iter = func_rank.find (type); ++ if (event_iter != func_rank.end ()) ++ { ++ rank_map::iterator func_iter = event_iter->second.find (decl_uid); ++ if (func_iter != event_iter->second.end ()) ++ { ++ info.rank = func_iter->second; ++ info.total = event_iter->second.size (); ++ } ++ } ++ return info; ++} ++ ++gcov_type ++extend_auto_profile::get_func_count (unsigned decl_uid, event_type type) ++{ ++ event_func_count_map::iterator event_iter = event_func_map.find (type); ++ if (event_iter != event_func_map.end ()) ++ { ++ func_count_map::iterator func_iter = event_iter->second.find (decl_uid); ++ if (func_iter != event_iter->second.end ()) ++ { ++ return func_iter->second; ++ } ++ } ++ return 0; ++} ++ ++gcov_type ++extend_auto_profile::get_topn_function_total_count_thres () const ++{ ++ return topn_function_total_count_thres; ++} ++ ++static extend_auto_profile *extend_profile; ++ + /* Helper functions. */ + + /* Return the original name of NAME: strip the suffix that starts +@@ -483,7 +737,7 @@ string_table::get_index (const char *name) const + return iter->second; + } + +-/* Return the index of a given function DECL. Return -1 if DECL is not ++/* Return the index of a given function DECL. Return -1 if DECL is not + found in string table. */ + + int +@@ -917,6 +1171,31 @@ autofdo_source_profile::get_function_instance_by_inline_stack ( + return s; + } + ++/* Compute total count threshold of top functions in sampled data. */ ++ ++gcov_type ++autofdo_source_profile::calc_topn_function_total_count_thres ( ++ unsigned topn) const ++{ ++ std::set func_counts; ++ for (name_function_instance_map::const_iterator iter = map_.begin (); ++ iter != map_.end (); ++iter) ++ { ++ if (func_counts.size () < topn) ++ func_counts.insert (iter->second->total_count ()); ++ else if (*func_counts.begin () < iter->second->total_count ()) ++ { ++ func_counts.erase (func_counts.begin ()); ++ func_counts.insert (iter->second->total_count ()); ++ } ++ } ++ ++ gcov_type func_counts_topn = *func_counts.begin (); ++ if (func_counts.size () == topn ++ && param_llc_allocate_func_counts_threshold < func_counts_topn) ++ return func_counts_topn; ++} ++ + /* Module profile is only used by LIPO. Here we simply ignore it. 
*/ + + static void +@@ -1842,6 +2121,132 @@ auto_profile (void) + + return TODO_rebuild_cgraph_edges; + } ++ ++ ++void ++extend_auto_profile::rank_all_func () ++{ ++ std::vector > func_sorted; ++ event_func_count_map::iterator event_iter ++ = event_func_map.find (profile_type); ++ if (event_iter != event_func_map.end ()) ++ { ++ func_count_map::iterator func_iter; ++ for (func_iter = event_iter->second.begin (); ++ func_iter != event_iter->second.end (); func_iter++) ++ { ++ func_sorted.push_back (std::make_pair (func_iter->first, ++ func_iter->second)); ++ } ++ ++ std::sort (func_sorted.begin (), func_sorted.end (), event_count_cmp); ++ ++ for (unsigned i = 0; i < func_sorted.size (); ++i) ++ { ++ func_rank[profile_type][func_sorted[i].first] = i + 1; ++ } ++ } ++} ++ ++/* Iterate stmts in cfun and maintain its count to EVENT_LOC_MAP. */ ++ ++void ++extend_auto_profile::set_loc_count () ++{ ++ basic_block bb; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ gimple_stmt_iterator gsi; ++ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ count_info info; ++ gimple *stmt = gsi_stmt (gsi); ++ if (gimple_clobber_p (stmt) || is_gimple_debug (stmt)) ++ { ++ continue; ++ } ++ if (afdo_source_profile->get_count_info (stmt, &info)) ++ { ++ location_t loc = gimple_location (stmt); ++ event_loc_map[profile_type][loc] += info.count; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "stmt "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM); ++ fprintf (dump_file, "counts %ld\n", ++ event_loc_map[profile_type][loc]); ++ } ++ } ++ } ++ } ++} ++ ++/* Process data in extend_auto_source_profile, save them into two maps. ++ 1. gimple_location to count. ++ 2. function_index to count. */ ++void ++extend_auto_profile::process_extend_source_profile () ++{ ++ struct cgraph_node *node; ++ if (symtab->state == FINISHED) ++ { ++ return; ++ } ++ FOR_EACH_FUNCTION (node) ++ { ++ if (!gimple_has_body_p (node->decl) || node->inlined_to) ++ { ++ continue; ++ } ++ ++ /* Don't profile functions produced for builtin stuff. */ ++ if (DECL_SOURCE_LOCATION (node->decl) == BUILTINS_LOCATION) ++ { ++ continue; ++ } ++ ++ function *fn = DECL_STRUCT_FUNCTION (node->decl); ++ push_cfun (fn); ++ ++ const function_instance *s ++ = afdo_source_profile->get_function_instance_by_decl ( ++ current_function_decl); ++ ++ if (s == NULL) ++ { ++ pop_cfun (); ++ continue; ++ } ++ unsigned int decl_uid = DECL_UID (current_function_decl); ++ gcov_type count = s->total_count (); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Extend auto-profile for function %s.\n", ++ node->dump_name ()); ++ } ++ event_func_map[profile_type][decl_uid] += count; ++ set_loc_count (); ++ pop_cfun (); ++ } ++ rank_all_func (); ++} ++ ++/* Main entry of extend_auto_profile. */ ++ ++static void ++extend_source_profile () ++{ ++ extend_profile = autofdo::extend_auto_profile::create (); ++ if (dump_file) ++ { ++ if (extend_profile == NULL) ++ { ++ fprintf (dump_file, "No profile file is found.\n"); ++ return; ++ } ++ fprintf (dump_file, "Extend profile info generated.\n"); ++ } ++} + } /* namespace autofdo. */ + + /* Read the profile from the profile data file. */ +@@ -1870,6 +2275,48 @@ end_auto_profile (void) + profile_info = NULL; + } + ++/* Extern function to get profile info in other passes. 
*/ ++ ++bool ++profile_exist (enum event_type type) ++{ ++ return autofdo::extend_profile != NULL ++ && autofdo::extend_profile->auto_profile_exist (type); ++} ++ ++gcov_type ++event_get_loc_count (location_t loc, event_type type) ++{ ++ return autofdo::extend_profile->get_loc_count (loc, type); ++} ++ ++gcov_type ++event_get_func_count (unsigned decl_uid, event_type type) ++{ ++ return autofdo::extend_profile->get_func_count (decl_uid, type); ++} ++ ++struct rank_info ++event_get_func_rank (unsigned decl_uid, enum event_type type) ++{ ++ return autofdo::extend_profile->get_func_rank (decl_uid, type); ++} ++ ++gcov_type ++event_get_topn_function_total_count_thres () ++{ ++ return autofdo::extend_profile->get_topn_function_total_count_thres (); ++} ++ ++void ++free_extend_profile_info () ++{ ++ if (autofdo::extend_profile != NULL) ++ { ++ delete autofdo::extend_profile; ++ } ++} ++ + /* Returns TRUE if EDGE is hot enough to be inlined early. */ + + bool +@@ -1931,8 +2378,50 @@ public: + + } // anon namespace + ++namespace ++{ ++const pass_data pass_data_ipa_extend_auto_profile = ++{ ++ SIMPLE_IPA_PASS, /* type */ ++ "ex-afdo", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_IPA_EXTEND_AUTO_PROFILE, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ 0, /* todo_flags_finish */ ++}; ++ ++class pass_ipa_extend_auto_profile : public simple_ipa_opt_pass ++{ ++public: ++ pass_ipa_extend_auto_profile (gcc::context *ctxt) ++ : simple_ipa_opt_pass (pass_data_ipa_extend_auto_profile, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) {return (flag_ipa_extend_auto_profile > 0);} ++ virtual unsigned int execute (function *); ++ ++}; ++ ++unsigned int ++pass_ipa_extend_auto_profile::execute (function *fun) ++{ ++ autofdo::extend_source_profile (); ++ return 0; ++} ++} // anon namespace ++ + simple_ipa_opt_pass * + make_pass_ipa_auto_profile (gcc::context *ctxt) + { + return new pass_ipa_auto_profile (ctxt); + } ++ ++simple_ipa_opt_pass * ++make_pass_ipa_extend_auto_profile (gcc::context *ctxt) ++{ ++ return new pass_ipa_extend_auto_profile (ctxt); ++} +diff --git a/gcc/auto-profile.h b/gcc/auto-profile.h +index bf3f90f2f..dea0b18e6 100644 +--- a/gcc/auto-profile.h ++++ b/gcc/auto-profile.h +@@ -21,6 +21,14 @@ along with GCC; see the file COPYING3. If not see + #ifndef AUTO_PROFILE_H + #define AUTO_PROFILE_H + ++enum event_type ++{ ++ INST_EXEC = 0, ++ CACHE_MISSES, ++ PMU_EVENT, ++ EVENT_NUMBER ++}; ++ + /* Read, process, finalize AutoFDO data structures. */ + extern void read_autofdo_file (void); + extern void end_auto_profile (void); +@@ -28,4 +36,26 @@ extern void end_auto_profile (void); + /* Returns TRUE if EDGE is hot enough to be inlined early. */ + extern bool afdo_callsite_hot_enough_for_early_inline (struct cgraph_edge *); + ++/* Chcek if profile exists before using this profile. */ ++extern bool profile_exist (enum event_type); ++ ++/* Given func decl_uid or gimple location and event_type, return count. ++ Count is 0 if function or gimple is not sampled. */ ++extern gcov_type event_get_func_count (unsigned, enum event_type); ++extern gcov_type event_get_loc_count (location_t, enum event_type); ++extern gcov_type event_get_topn_function_total_count_thres (); ++ ++struct rank_info ++{ ++ unsigned total; ++ unsigned rank; ++}; ++ ++/* Given function decl_uid and event type, return rank_info. Rank_info ++ is {0, 0} if function was not sampled. 
*/ ++extern struct rank_info event_get_func_rank (unsigned, enum event_type); ++ ++/* Free memory allocated by autofdo::extern_profile. */ ++extern void free_extend_profile_info (); ++ + #endif /* AUTO_PROFILE_H */ +diff --git a/gcc/builtins.cc b/gcc/builtins.cc +index 57929a42b..dc2e9c3f3 100644 +--- a/gcc/builtins.cc ++++ b/gcc/builtins.cc +@@ -1352,6 +1352,85 @@ expand_builtin_prefetch (tree exp) + emit_insn (op0); + } + ++/* Expand a call to __builtin_prefetch_full. */ ++ ++static void ++expand_builtin_prefetch_full (tree exp) ++{ ++ tree arg0, arg1, arg2; ++ int nargs; ++ rtx op0, op1, op2; ++ ++ if (!validate_arglist (exp, POINTER_TYPE, 0)) ++ return; ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ ++ /* Arguments 1 and 2 are optional; argument 1 (read/write) defaults to ++ zero (read) and argument 2 (locality) defaults to 3 (high degree of ++ locality). */ ++ nargs = call_expr_nargs (exp); ++ if (nargs > 1) ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ else ++ arg1 = integer_zero_node; ++ if (nargs > 2) ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ else ++ arg2 = integer_three_node; ++ ++ /* Argument 0 is an address. */ ++ op0 = expand_expr (arg0, NULL_RTX, Pmode, EXPAND_NORMAL); ++ ++ /* Argument 1 (read/write flag) must be a compile-time constant int. */ ++ if (TREE_CODE (arg1) != INTEGER_CST) ++ { ++ error ("second argument to %<__builtin_prefetch_full%> must be a " ++ "constant"); ++ arg1 = integer_zero_node; ++ } ++ op1 = expand_normal (arg1); ++ /* Argument 1 must be either zero or one. */ ++ if (INTVAL (op1) != 0 && INTVAL (op1) != 1) ++ { ++ warning (0, "invalid second argument to %<__builtin_prefetch_full%>;" ++ " using zero"); ++ op1 = const0_rtx; ++ } ++ ++ /* Argument 2 (locality) must be a compile-time constant int. */ ++ if (TREE_CODE (arg2) != INTEGER_CST) ++ { ++ error ("third argument to %<__builtin_prefetch_full%> must be a " ++ "constant"); ++ arg2 = integer_zero_node; ++ } ++ op2 = expand_normal (arg2); ++ /* Argument 2 must be 0-7. */ ++ if (INTVAL (op2) < 0 || INTVAL (op2) > 7) ++ { ++ warning (0, "invalid third argument to %<__builtin_prefetch_full%>; " ++ "using zero"); ++ op2 = const0_rtx; ++ } ++ ++ if (targetm.have_prefetch_full ()) ++ { ++ class expand_operand ops[3]; ++ ++ create_address_operand (&ops[0], op0); ++ create_integer_operand (&ops[1], INTVAL (op1)); ++ create_integer_operand (&ops[2], INTVAL (op2)); ++ if (maybe_expand_insn (targetm.code_for_prefetch_full, 3, ops)) ++ return; ++ } ++ ++ /* Don't do anything with direct references to volatile memory, but ++ generate code to handle other side effects. */ ++ if (!MEM_P (op0) && side_effects_p (op0)) ++ emit_insn (op0); ++} ++ + /* Get a MEM rtx for expression EXP which is the address of an operand + to be used in a string instruction (cmpstrsi, cpymemsi, ..). 
LEN is + the maximum length of the block of memory that might be accessed or +@@ -7598,6 +7677,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + case BUILT_IN_PREFETCH: + expand_builtin_prefetch (exp); + return const0_rtx; ++ case BUILT_IN_PREFETCH_FULL: ++ expand_builtin_prefetch_full (exp); ++ return const0_rtx; + + case BUILT_IN_INIT_TRAMPOLINE: + return expand_builtin_init_trampoline (exp, true); +diff --git a/gcc/builtins.def b/gcc/builtins.def +index 005976f34..f2e0c357d 100644 +--- a/gcc/builtins.def ++++ b/gcc/builtins.def +@@ -924,6 +924,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C + DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) + DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) + DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) ++DEF_GCC_BUILTIN (BUILT_IN_PREFETCH_FULL, "prefetch_full", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) + DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) + DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index d2714e20c..794bc3ecc 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ -272,6 +272,9 @@ public: + the basic-block from being collected but its index can still be + reused. */ + basic_block former_header; ++ ++ /* Number of latch executions from vectorization. */ ++ tree vec_nb_iterations; + }; + + /* Set if the loop is known to be infinite. */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 6ab7ba4cc..e6ffa1c58 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1148,6 +1148,26 @@ Common Joined RejectNegative Var(auto_profile_file) + Use sample profile information for call graph node weights. The profile + file is specified in the argument. + ++fcache-misses-profile ++Common Var(flag_cache_misses_profile) ++Use sample profile information for source code cache miss count. The default ++profile file is cmsdata.gcov in `pwd`. ++ ++fcache-misses-profile= ++Common Joined RejectNegative Var(cache_misses_profile_file) ++Use sample profile information for source code cache miss count. The profile ++file is specified in the argument. ++ ++fadditional-profile ++Common Var(flag_additional_profile) ++Use additional PMU-event sample profile information for source code bb count. ++The default profile file is addldata.gcov in `pwd`. ++ ++fadditional-profile= ++Common Joined RejectNegative Var(additional_profile_file) ++Use additional PMU-event sample profile information for source code bb count. ++The profile file is specified in the argument. ++ + ; -fcheck-bounds causes gcc to generate array bounds checks. + ; For C, C++ and ObjC: defaults off. + ; For Java: defaults to on. +@@ -2074,6 +2094,10 @@ fipa-struct-sfc-shadow + Common Var(flag_ipa_struct_sfc_shadow) Init(0) Optimization + Enable field shadowing optimization in static struct field compression. + ++fipa-extend-auto-profile ++Common Var(flag_ipa_extend_auto_profile) ++Use sample profile information for source code. ++ + fipa-vrp + Common Var(flag_ipa_vrp) Optimization + Perform IPA Value Range Propagation. 
+@@ -2424,6 +2448,10 @@ fipa-prefetch + Common Var(flag_ipa_prefetch) Init(0) Optimization + Generate prefetch instructions, if available, using IPA info. + ++fllc-allocate ++Common Var(flag_llc_allocate) Init(-1) Optimization ++Generate LLC hint instructions. ++ + fprofile + Common Var(profile_flag) + Enable basic program profiling code. +diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +index cbb844fbc..af0881f7a 100644 +--- a/gcc/config/aarch64/aarch64-protos.h ++++ b/gcc/config/aarch64/aarch64-protos.h +@@ -702,12 +702,16 @@ extern struct tune_params aarch64_tune_params; + T (PLDL2STRM, pldl2strm, 3) \ + T (PLDL3KEEP, pldl3keep, 4) \ + T (PLDL3STRM, pldl3strm, 5) \ ++ T (PLDL4KEEP, pldl4keep, 6) \ ++ T (PLDL4STRM, pldl4strm, 7) \ + T (PSTL1KEEP, pstl1keep, 8) \ + T (PSTL1STRM, pstl1strm, 9) \ + T (PSTL2KEEP, pstl2keep, 10) \ + T (PSTL2STRM, pstl2strm, 11) \ + T (PSTL3KEEP, pstl3keep, 12) \ +- T (PSTL3STRM, pstl3strm, 13) ++ T (PSTL3STRM, pstl3strm, 13) \ ++ T (PSTL4KEEP, pstl4keep, 14) \ ++ T (PSTL4STRM, pstl4strm, 15) + + #define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, + enum aarch64_svpattern { +diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md +index a8a5dc3a2..7808abf70 100644 +--- a/gcc/config/aarch64/aarch64-sve.md ++++ b/gcc/config/aarch64/aarch64-sve.md +@@ -1952,7 +1952,7 @@ + (define_insn "@aarch64_sve_prefetch" + [(prefetch (unspec:DI + [(match_operand: 0 "register_operand" "Upl") +- (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") ++ (match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP") + (match_operand:DI 2 "const_int_operand")] + UNSPEC_SVE_PREFETCH) + (match_operand:DI 3 "const_int_operand") +@@ -1985,14 +1985,14 @@ + ;; 6: the prefetch operator (an svprfop) + ;; 7: the normal RTL prefetch rw flag + ;; 8: the normal RTL prefetch locality value +-(define_insn "@aarch64_sve_gather_prefetch" ++(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2000,12 +2000,12 @@ + "TARGET_SVE && TARGET_NON_STREAMING" + { + static const char *const insns[][2] = { +- "prf", "%0, [%2.s]", +- "prf", "%0, [%2.s, #%1]", ++ "prf", "%0, [%2.s]", ++ "prf", "%0, [%2.s, #%1]", + "prfb", "%0, [%1, %2.s, sxtw]", + "prfb", "%0, [%1, %2.s, uxtw]", +- "prf", "%0, [%1, %2.s, sxtw %p4]", +- "prf", "%0, [%1, %2.s, uxtw %p4]" ++ "prf", "%0, [%1, %2.s, sxtw %p4]", ++ "prf", "%0, [%1, %2.s, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2014,14 +2014,14 @@ + + ;; Predicated gather prefetches for 64-bit elements. The value of operand 3 + ;; doesn't matter in this case. 
+-(define_insn "@aarch64_sve_gather_prefetch" ++(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2029,10 +2029,10 @@ + "TARGET_SVE && TARGET_NON_STREAMING" + { + static const char *const insns[][2] = { +- "prf", "%0, [%2.d]", +- "prf", "%0, [%2.d, #%1]", ++ "prf", "%0, [%2.d]", ++ "prf", "%0, [%2.d, #%1]", + "prfb", "%0, [%1, %2.d]", +- "prf", "%0, [%1, %2.d, lsl %p4]" ++ "prf", "%0, [%1, %2.d, lsl %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2040,7 +2040,7 @@ + ) + + ;; Likewise, but with the offset being sign-extended from 32 bits. +-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" ++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") +@@ -2051,8 +2051,8 @@ + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2061,7 +2061,7 @@ + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, sxtw]", +- "prf", "%0, [%1, %2.d, sxtw %p4]" ++ "prf", "%0, [%1, %2.d, sxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2073,7 +2073,7 @@ + ) + + ;; Likewise, but with the offset being zero-extended from 32 bits. 
+-(define_insn "*aarch64_sve_gather_prefetch_uxtw" ++(define_insn "*aarch64_sve_gather_prefetch_uxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") +@@ -2081,8 +2081,8 @@ + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2091,7 +2091,7 @@ + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, uxtw]", +- "prf", "%0, [%1, %2.d, uxtw %p4]" ++ "prf", "%0, [%1, %2.d, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index e9c387b24..a06c2c515 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -4408,6 +4408,13 @@ aarch64_sve_data_mode_p (machine_mode mode) + return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; + } + ++/* Return true if MODE is an full SVE data vector mode. */ ++static bool ++aarch64_full_sve_data_mode_p (machine_mode mode) ++{ ++ return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA; ++} ++ + /* Return the number of defined bytes in one constituent vector of + SVE mode MODE, which has vector flags VEC_FLAGS. */ + static poly_int64 +@@ -31796,6 +31803,17 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_ASM_FUNCTION_EPILOGUE + #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks + ++#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH ++#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch ++ ++#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \ ++ code_for_aarch64_sve_gather_prefetch ++ ++#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \ ++ aarch64_full_sve_data_mode_p ++ + #undef TARGET_HAVE_SHADOW_CALL_STACK + #define TARGET_HAVE_SHADOW_CALL_STACK true + +diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +index 2f46bc793..69d296556 100644 +--- a/gcc/config/aarch64/aarch64.md ++++ b/gcc/config/aarch64/aarch64.md +@@ -925,6 +925,45 @@ + [(set_attr "type" "load_4")] + ) + ++(define_insn "prefetch_full" ++ [(prefetch_full (match_operand:DI 0 "aarch64_prefetch_operand" "Dp") ++ (match_operand:QI 1 "const_int_operand" "") ++ (match_operand:QI 2 "const_int_operand" ""))] ++ "" ++ { ++ const char * pftype[2][8] = ++ { ++ {"prfm\\tPLDL1KEEP, %0", ++ "prfm\\tPLDL1STRM, %0", ++ "prfm\\tPLDL2KEEP, %0", ++ "prfm\\tPLDL2STRM, %0", ++ "prfm\\tPLDL3KEEP, %0", ++ "prfm\\tPLDL3STRM, %0", ++ "prfm\\tPLDL4KEEP, %0", ++ "prfm\\tPLDL4STRM, %0"}, ++ {"prfm\\tPSTL1KEEP, %0", ++ "prfm\\tPSTL1STRM, %0", ++ "prfm\\tPSTL2KEEP, %0", ++ "prfm\\tPSTL2STRM, %0", ++ "prfm\\tPSTL3KEEP, %0", ++ "prfm\\tPSTL3STRM, %0", ++ "prfm\\tPSTL4KEEP, %0", ++ "prfm\\tPSTL4STRM, %0"}, ++ }; ++ ++ int prfop = INTVAL (operands[2]); ++ ++ gcc_assert (IN_RANGE (prfop, 0, 7)); ++ ++ /* PRFM accepts the same addresses as a 64-bit LDR so wrap ++ the address into a DImode MEM so that 
aarch64_print_operand knows ++ how to print it. */ ++ operands[0] = gen_rtx_MEM (DImode, operands[0]); ++ return pftype[INTVAL (operands[1])][prfop]; ++ } ++ [(set_attr "type" "load_4")] ++) ++ + (define_insn "trap" + [(trap_if (const_int 1) (const_int 8))] + "" +diff --git a/gcc/dce.cc b/gcc/dce.cc +index 6676cbcd4..964a0a6d0 100644 +--- a/gcc/dce.cc ++++ b/gcc/dce.cc +@@ -72,6 +72,7 @@ deletable_insn_p_1 (rtx body) + switch (GET_CODE (body)) + { + case PREFETCH: ++ case PREFETCH_FULL: + case TRAP_IF: + /* The UNSPEC case was added here because the ia-64 claims that + USEs do not work after reload and generates UNSPECS rather +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 50bbbbc42..16ada7aae 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -6278,6 +6278,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter + stores. + @end deftypefn + ++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg}) ++This hook should return the decl of a function that implements the ++vectorized variant of the function with the @code{combined_fn} code ++@var{code} or @code{NULL_TREE} if such a function is not available. ++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_form}) ++This hook should return the decl of a function that implements the ++vectorized variant of the function with the @code{combined_fn} code ++@var{code} or @code{NULL_TREE} if such a function is not available. ++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg}) ++This hook should return true if the target hardware architecture ++supports a full SVE data vector mode. ++@end deftypefn ++ + @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}) + This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float} + fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index cfda60304..88db8752e 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -4190,6 +4190,12 @@ address; but often a machine-dependent strategy can generate better code. + + @hook TARGET_VECTORIZE_BUILTIN_SCATTER + ++@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH ++ ++@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++ ++@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++ + @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN + + @hook TARGET_SIMD_CLONE_ADJUST +diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc +index 8b1733e20..19811106f 100644 +--- a/gcc/internal-fn.cc ++++ b/gcc/internal-fn.cc +@@ -107,11 +107,13 @@ init_internal_fns () + direct_internal_fn. 
*/ + #define not_direct { -2, -2, false } + #define mask_load_direct { -1, 2, false } ++#define mask_prefetch_direct { -1, 2, false } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { 3, 1, false } + #define len_load_direct { -1, -1, false } + #define mask_store_direct { 3, 2, false } ++#define gather_prefetch_direct { 3, 1, false } + #define store_lanes_direct { 0, 0, false } + #define mask_store_lanes_direct { 0, 0, false } + #define vec_cond_mask_direct { 1, 0, false } +@@ -2745,6 +2747,53 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + #define expand_len_load_optab_fn expand_partial_load_optab_fn + ++/* Expand MASK_PREFETCH call STMT using optab OPTAB. ++ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); ++ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); ++*/ ++ ++static void ++expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ tree base = gimple_call_arg (stmt, 0); ++ if (base == NULL_TREE) ++ return; ++ ++ tree maskt = gimple_call_arg (stmt, 2); ++ tree target = gimple_call_arg (stmt, 3); ++ tree prfop = gimple_call_arg (stmt, 4); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); ++ ++ rtx mask = expand_normal (maskt); ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ ++ unsigned i = 0; ++ class expand_operand ops[5]; ++ create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB. */ + + static void +@@ -3402,6 +3451,70 @@ contains_call_div_mod (rtx_insn *insn) + return false; + } + ++/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. ++ vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87); ++ .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4); ++*/ ++ ++static void ++expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_gather_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ /* Extracting tree nodes, only expand for scalar base and vector index. 
*/ ++ tree base = gimple_call_arg (stmt, 0); ++ if (VECTOR_TYPE_P (TREE_TYPE (base))) ++ return; ++ tree offset = gimple_call_arg (stmt, 1); ++ if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) ++ return; ++ ++ tree scale = gimple_call_arg (stmt, 2); ++ tree mask = gimple_call_arg (stmt, 4); ++ tree target = gimple_call_arg (stmt, 5); ++ tree prfop = gimple_call_arg (stmt, 6); ++ ++ /* Convert to the rtx node. */ ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ rtx offset_rtx = expand_normal (offset); ++ rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); ++ rtx mask_rtx = expand_normal (mask); ++ HOST_WIDE_INT scale_int = tree_to_shwi (scale); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ /* add operand. */ ++ unsigned int i = 0; ++ class expand_operand ops[9]; ++ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); ++ /* Check whether the index has unsigned. */ ++ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); ++ create_integer_operand (&ops[i++], scale_int); ++ create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ ++ machine_mode reg_mode = GET_MODE (offset_rtx); ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_gather_prefetch ++ (m_mode, reg_mode); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand DIVMOD() using: + a) optab handler for udivmod/sdivmod if it is available. + b) If optab_handler doesn't exist, generate call to +@@ -3767,10 +3880,12 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, + #define direct_cond_binary_optab_supported_p direct_optab_supported_p + #define direct_cond_ternary_optab_supported_p direct_optab_supported_p + #define direct_mask_load_optab_supported_p convert_optab_supported_p ++#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p + #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_gather_load_optab_supported_p convert_optab_supported_p + #define direct_len_load_optab_supported_p direct_optab_supported_p ++#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p + #define direct_mask_store_optab_supported_p convert_optab_supported_p + #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p +diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def +index d2d550d35..05fc50328 100644 +--- a/gcc/internal-fn.def ++++ b/gcc/internal-fn.def +@@ -121,6 +121,8 @@ along with GCC; see the file COPYING3. 
If not see + #endif + + DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) ++DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ maskprefetch, mask_prefetch) + DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) + DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + vec_mask_load_lanes, mask_load_lanes) +@@ -128,6 +130,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) + DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, + mask_gather_load, gather_load) ++DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ mask_gather_prefetch, gather_prefetch) + + DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load) + +diff --git a/gcc/ipa-pure-const.cc b/gcc/ipa-pure-const.cc +index 2642df91e..222fe6465 100644 +--- a/gcc/ipa-pure-const.cc ++++ b/gcc/ipa-pure-const.cc +@@ -534,6 +534,7 @@ builtin_safe_for_const_function_p (bool *looping, tree callee) + *looping = false; + return true; + case BUILT_IN_PREFETCH: ++ case BUILT_IN_PREFETCH_FULL: + *looping = true; + return true; + default: +diff --git a/gcc/optabs.def b/gcc/optabs.def +index dbf529434..8ca25a5cc 100644 +--- a/gcc/optabs.def ++++ b/gcc/optabs.def +@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") + OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") + OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") + OPTAB_CD(maskload_optab, "maskload$a$b") ++OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") + OPTAB_CD(maskstore_optab, "maskstore$a$b") + OPTAB_CD(gather_load_optab, "gather_load$a$b") + OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") ++OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") + OPTAB_CD(scatter_store_optab, "scatter_store$a$b") + OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") + OPTAB_CD(vec_extract_optab, "vec_extract$a$b") +diff --git a/gcc/opts.cc b/gcc/opts.cc +index 2433ace06..432b822e8 100644 +--- a/gcc/opts.cc ++++ b/gcc/opts.cc +@@ -2108,6 +2108,13 @@ enable_fdo_optimizations (struct gcc_options *opts, + SET_OPTION_IF_UNSET (opts, opts_set, flag_tree_loop_distribution, value); + } + ++static void ++set_cache_misses_profile_params (struct gcc_options *opts, ++ struct gcc_options *opts_set) ++{ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1); ++} ++ + /* Enable cfgo-related flags. */ + + static void +@@ -3143,10 +3150,20 @@ common_handle_option (struct gcc_options *opts, + /* FALLTHRU */ + case OPT_fauto_profile: + enable_fdo_optimizations (opts, opts_set, value); +- /* 2 is special and means flag_profile_correction trun on by +- -fauto-profile. */ ++ /* 2 is special and means flag_profile_correction trun on by ++ -fauto-profile. */ + SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, +- (value ? 2 : 0)); ++ (value ? 2 : 0)); ++ break; ++ ++ case OPT_fadditional_profile_: ++ opts->x_additional_profile_file = xstrdup (arg); ++ opts->x_flag_additional_profile = true; ++ value = true; ++ /* No break here - do -fadditional-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fadditional_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; + break; + + case OPT_fipa_struct_reorg_: +@@ -3155,17 +3172,36 @@ common_handle_option (struct gcc_options *opts, + case OPT_fipa_struct_reorg: + opts->x_flag_ipa_struct_reorg = value; + if (value && !opts->x_struct_layout_optimize_level) +- { +- /* Using the -fipa-struct-reorg option is equivalent to using +- -fipa-struct-reorg=1. 
*/ +- opts->x_struct_layout_optimize_level = 1; +- } ++ { ++ /* Using the -fipa-struct-reorg option is equivalent to using ++ -fipa-struct-reorg=1. */ ++ opts->x_struct_layout_optimize_level = 1; ++ } + break; + + case OPT_fipa_reorder_fields: + SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_struct_reorg, value); + break; + ++ case OPT_fipa_extend_auto_profile: ++ opts->x_flag_ipa_extend_auto_profile = opts->x_flag_cache_misses_profile ++ ? true : value; ++ break; ++ ++ case OPT_fcache_misses_profile_: ++ opts->x_cache_misses_profile_file = xstrdup (arg); ++ opts->x_flag_cache_misses_profile = true; ++ value = true; ++ /* No break here - do -fcache-misses-profile processing. */ ++ /* FALLTHRU */ ++ case OPT_fcache_misses_profile: ++ opts->x_flag_ipa_extend_auto_profile = value; ++ if (value) ++ { ++ set_cache_misses_profile_params (opts, opts_set); ++ } ++ break; ++ + case OPT_fcfgo_profile_generate_: + opts->x_profile_data_prefix = xstrdup (arg); + value = true; +diff --git a/gcc/params.opt b/gcc/params.opt +index e5472dfc8..e06e50611 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1262,4 +1262,66 @@ Range for depended ldp search in split-ldp-stp path. + Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization + Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . + ++-param=mem-access-ratio= ++Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization ++Memory access ratio (in percent). ++ ++-param=mem-access-num= ++Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization ++Memory access num. ++ ++-param=prefetch-offset= ++Common Joined UInteger Var(param_prefetch_offset) Init(1024) ++IntegerRange(1, 999999) Param Optimization ++Prefetch Offset, which is usually a power of two due to cache line size. ++ ++-param=branch-prob-threshold= ++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) ++Param Optimization ++High Execution Rate Branch Threshold. ++ ++-param=issue-topn= ++Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization ++Issue topn LLC mem_ref hint. ++ ++-param=force-issue= ++Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. ++ ++-param=llc-capacity-per-core= ++Common Joined UInteger Var(param_llc_capacity_per_core) Init(107) IntegerRange(0, 999999) Param ++LLC capacity per core. ++ ++-param=filter-kernels= ++Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param ++Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks ++through edges with branch probability no less than param_branch_prob_threshold. ++ ++-param=outer-loop-nums= ++Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param ++Maximum number of outer loops allowed to extend outer loops for loops that ++cannot recognize inner loop boundaries. ++ ++-param=llc-level= ++Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) ++Param Optimization ++Specifies the HBM cache level. ++ ++-param=filter-mode= ++Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param ++Set kernel filtering mode. Use basic block count by default; use branch probability mode when filter mode is turned off. 
++ ++-param=transfer-footprint= ++Common Joined UInteger Var(param_transfer_footprint) Init(1) IntegerRange(0, 1) Param ++Allow transferring the firstly calculated footprint expression to the target memory reference ++from which it is impossible to retrieve the foortprint. ++ ++-param=llc-allocate-func-topn= ++Common Joined UInteger Var(param_llc_allocate_func_topn) Init(0) Param Optimization ++TopN functions of pmu counts to be analyzed in LLC allocation. ++ ++-param=llc-allocate-func-counts-threshold= ++Common Joined UInteger Var(param_llc_allocate_func_counts_threshold) Init(1) Param Optimization ++Threshold functions of pmu counts to be analyzed in LLC allocation. ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/passes.def b/gcc/passes.def +index 90643d533..49001adde 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -141,6 +141,7 @@ along with GCC; see the file COPYING3. If not see + + NEXT_PASS (pass_target_clone); + NEXT_PASS (pass_ipa_auto_profile); ++ NEXT_PASS (pass_ipa_extend_auto_profile); + NEXT_PASS (pass_ipa_tree_profile); + PUSH_INSERT_PASSES_WITHIN (pass_ipa_tree_profile) + NEXT_PASS (pass_feedback_split_functions); +@@ -325,6 +326,7 @@ along with GCC; see the file COPYING3. If not see + /* Run IVOPTs after the last pass that uses data-reference analysis + as that doesn't handle TARGET_MEM_REFs. */ + NEXT_PASS (pass_iv_optimize); ++ NEXT_PASS (pass_llc_allocate); + NEXT_PASS (pass_lim); + NEXT_PASS (pass_tree_loop_done); + POP_INSERT_PASSES () +diff --git a/gcc/print-rtl.cc b/gcc/print-rtl.cc +index 636113d5b..b7506514a 100644 +--- a/gcc/print-rtl.cc ++++ b/gcc/print-rtl.cc +@@ -1579,6 +1579,12 @@ print_exp (pretty_printer *pp, const_rtx x, int verbose) + op[1] = XEXP (x, 1); + op[2] = XEXP (x, 2); + break; ++ case PREFETCH_FULL: ++ fun = "prefetch_full"; ++ op[0] = XEXP (x, 0); ++ op[1] = XEXP (x, 1); ++ op[2] = XEXP (x, 2); ++ break; + case UNSPEC: + case UNSPEC_VOLATILE: + { +diff --git a/gcc/rtl.def b/gcc/rtl.def +index 08e31fa35..78ec1a021 100644 +--- a/gcc/rtl.def ++++ b/gcc/rtl.def +@@ -282,6 +282,15 @@ DEF_RTL_EXPR(ADDR_DIFF_VEC, "addr_diff_vec", "eEee0", RTX_EXTRA) + whose prefetch instructions do not support them. */ + DEF_RTL_EXPR(PREFETCH, "prefetch", "eee", RTX_EXTRA) + ++/* Memory prefetch, with attributes supported on some targets. ++ Operand 1 is the address of the memory to fetch. ++ Operand 2 is 1 for a write access, 0 otherwise. ++ Operand 3 is the level of prfop. ++ ++ The attributes specified by operands 2 and 3 are ignored for targets ++ whose prefetch instructions do not support them. */ ++DEF_RTL_EXPR(PREFETCH_FULL, "prefetch_full", "eee", RTX_EXTRA) ++ + /* ---------------------------------------------------------------------- + At the top level of an instruction (perhaps under PARALLEL). + ---------------------------------------------------------------------- */ +diff --git a/gcc/rtl.h b/gcc/rtl.h +index a0db225cb..844e1a7c3 100644 +--- a/gcc/rtl.h ++++ b/gcc/rtl.h +@@ -2814,6 +2814,10 @@ do { \ + #define PREFETCH_SCHEDULE_BARRIER_P(RTX) \ + (RTL_FLAG_CHECK1 ("PREFETCH_SCHEDULE_BARRIER_P", (RTX), PREFETCH)->volatil) + ++/* True if RTX is flagged to be a scheduling barrier. */ ++#define PREFETCH_FULL_SCHEDULE_BARRIER_P(RTX) \ ++ (RTL_FLAG_CHECK1 ("PREFETCH_FULL_SCHEDULE_BARRIER_P", (RTX), PREFETCH_FULL)->volatil) ++ + /* Indicate whether the machine has any sort of auto increment addressing. + If not, we can avoid checking for REG_INC notes. 
*/ + +diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc +index c436c640c..7f5646ce7 100644 +--- a/gcc/rtlanal.cc ++++ b/gcc/rtlanal.cc +@@ -1198,6 +1198,7 @@ reg_referenced_p (const_rtx x, const_rtx body) + return reg_overlap_mentioned_p (x, TRAP_CONDITION (body)); + + case PREFETCH: ++ case PREFETCH_FULL: + return reg_overlap_mentioned_p (x, XEXP (body, 0)); + + case UNSPEC: +@@ -2042,6 +2043,7 @@ note_uses (rtx *pbody, void (*fun) (rtx *, void *), void *data) + return; + + case PREFETCH: ++ case PREFETCH_FULL: + (*fun) (&XEXP (body, 0), data); + return; + +diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc +index 948aa0c3b..db453fb9b 100644 +--- a/gcc/sched-deps.cc ++++ b/gcc/sched-deps.cc +@@ -2705,7 +2705,9 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn *insn) + break; + + case PREFETCH: +- if (PREFETCH_SCHEDULE_BARRIER_P (x)) ++ case PREFETCH_FULL: ++ if ((code == PREFETCH && PREFETCH_SCHEDULE_BARRIER_P (x)) ++ || (code == PREFETCH_FULL && PREFETCH_FULL_SCHEDULE_BARRIER_P (x))) + reg_pending_barrier = TRUE_BARRIER; + /* Prefetch insn contains addresses only. So if the prefetch + address has no registers, there will be no dependencies on +diff --git a/gcc/target-insns.def b/gcc/target-insns.def +index de8c0092f..9cfa19475 100644 +--- a/gcc/target-insns.def ++++ b/gcc/target-insns.def +@@ -77,6 +77,7 @@ DEF_TARGET_INSN (omp_simt_vote_any, (rtx x0, rtx x1)) + DEF_TARGET_INSN (omp_simt_xchg_bfly, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (omp_simt_xchg_idx, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (prefetch, (rtx x0, rtx x1, rtx x2)) ++DEF_TARGET_INSN (prefetch_full, (rtx x0, rtx x1, rtx x2)) + DEF_TARGET_INSN (probe_stack, (rtx x0)) + DEF_TARGET_INSN (probe_stack_address, (rtx x0)) + DEF_TARGET_INSN (prologue, (void)) +diff --git a/gcc/target.def b/gcc/target.def +index 142858fa3..646489540 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2064,6 +2064,37 @@ it is for the vector version.", + (vec_info *vinfo, bool costing_for_scalar), + default_vectorize_create_costs) + ++/* Function for vector prefetch operation. */ ++DEFHOOK ++(code_for_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode arg), ++ NULL) ++ ++/* Function for vector gather prefetch operation. */ ++DEFHOOK ++(code_for_gather_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode mode_to, machine_mode mode_form), ++ NULL) ++ ++/* Function to check whether the target hardware architecture supports ++ a full SVE data vector mode. 
*/ ++DEFHOOK ++(prefetch_handleable_mode_p, ++ "This hook should return true if the target hardware architecture\n\ ++supports a full SVE data vector mode.", ++ bool, (machine_mode arg), ++ NULL) ++ + HOOK_VECTOR_END (vectorize) + + #undef HOOK_PREFIX +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..1793ba9d1 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 1997-2022 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib g++-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +new file mode 100644 +index 000000000..b5bf69510 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +@@ -0,0 +1,52 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */ ++#include "multidim_array.h" ++ ++class Input ++{ ++ public: ++ int metadata_offset = 13; ++ int exp_nr_images = 1; ++ MultidimArray exp_Mweight; ++ void convertAllSquaredDifferencesToWeights(); ++}; ++ ++int main() ++{ ++ clock_t start = clock(); ++ Input input; ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; ++i) ++ { ++ input.convertAllSquaredDifferencesToWeights(); ++ } ++ return 0; ++} ++ ++void Input::convertAllSquaredDifferencesToWeights() ++{ ++ for (int img_id = 0; img_id < exp_nr_images; img_id++) ++ { ++ int my_metadata_offset = metadata_offset + img_id; ++ MultidimArray sorted_weight; ++ ++ exp_Mweight.getRow(img_id, sorted_weight); ++ long int np = 0; ++ FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight) ++ { ++ if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.) 
++ { ++ DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \ ++ sorted_weight, n); ++ np++; ++ } ++ } ++ } ++} ++ ++ ++ ++/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +new file mode 100644 +index 000000000..682f24703 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +@@ -0,0 +1,186 @@ ++#ifndef MULTIDIM_ARRAY_H ++#define MULTIDIM_ARRAY_H ++ ++#include ++ ++#define RELION_ALIGNED_MALLOC malloc ++#define RELION_ALIGNED_FREE free ++ ++#define STARTINGX(v) ((v).xinit) ++#define STARTINGY(v) ((v).yinit) ++#define NZYXSIZE(v) ((v).nzyxdim) ++ ++#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)]) ++#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \ ++ for (long int n=0; n ++class MultidimArray ++{ ++public: ++ T* data; ++ bool destroyData; ++ long int ndim; ++ long int zdim; ++ long int ydim; ++ long int xdim; ++ long int yxdim; ++ long int zyxdim; ++ long int nzyxdim; ++ long int zinit; ++ long int yinit; ++ long int xinit; ++ long int nzyxdimAlloc; ++ ++public: ++ void clear() ++ { ++ coreDeallocate(); ++ coreInit(); ++ } ++ ++ void coreInit() ++ { ++ xdim=0; ++ yxdim=0; ++ zyxdim=0; ++ nzyxdim=0; ++ ydim=1; ++ zdim=1; ++ ndim=1; ++ zinit=0; ++ yinit=0; ++ xinit=0; ++ data=NULL; ++ nzyxdimAlloc = 0; ++ destroyData=true; ++ } ++ ++ void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim) ++ { ++ if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0) ++ { ++ clear(); ++ return; ++ } ++ ++ ndim=_ndim; ++ zdim=_zdim; ++ ydim=_ydim; ++ xdim=_xdim; ++ yxdim=ydim*xdim; ++ zyxdim=zdim*yxdim; ++ nzyxdim=ndim*zyxdim; ++ ++ coreAllocate(); ++ } ++ ++ void coreAllocate() ++ { ++ data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim); ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void coreDeallocate() ++ { ++ if (data != NULL && destroyData) ++ { ++ RELION_ALIGNED_FREE(data); ++ } ++ data=NULL; ++ nzyxdimAlloc = 0; ++ } ++ ++ void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim) ++ { ++ if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL) ++ { ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ return; ++ } ++ ++ if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0) ++ { ++ clear(); ++ return; ++ } ++ ++ if (NZYXSIZE(*this) > 0 && data == NULL) ++ { ++ coreAllocate(); ++ return; ++ } ++ ++ size_t YXdim=Ydim*Xdim; ++ size_t ZYXdim=Zdim*YXdim; ++ size_t NZYXdim=Ndim*ZYXdim; ++ ++ T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim); ++ for (long int l = 0; l < Ndim; l++) ++ for (long int k = 0; k < Zdim; k++) ++ for (long int i = 0; i < Ydim; i++) ++ for (long int j = 0; j < Xdim; j++) ++ { ++ T val; ++ new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val; ++ } ++ coreDeallocate(); ++ ++ data = new_data; ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void resize(long int Xdim) ++ { ++ resize(1, 1, 1, Xdim); ++ } ++ ++ inline T& operator()(long int i, long int j) const ++ { ++ return A2D_ELEM(*this, i, j); ++ } ++ ++ inline T& operator()(long int i) const ++ { ++ return 
A1D_ELEM(*this, i); ++ } ++ ++ void getRow(long int i, MultidimArray& v) const ++ { ++ if (xdim == 0 || ydim == 0) ++ { ++ v.clear(); ++ return; ++ } ++ ++ v.resize(xdim); ++ for (long int j = 0; j < xdim; j++) ++ v(j) = (*this)(i, j); ++ } ++}; ++ ++#endif /* MULTIDIM_ARRAY_H */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +new file mode 100644 +index 000000000..091e654f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2 --param branch-prob-threshold=50 --param filter-mode=0" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 100000 ++ ++int A_i[N]; ++int A_j[N]; ++double A_data[N]; ++double x_data[N]; ++double y_data[N]; ++int num_rows = N; ++ ++void ++MatMult (int *A_i, int *A_j, double *A_data, double *x_data, ++ int num_rows, double *y_data) ++{ ++ int i = 0; ++ int j = 0; ++ double temp = 0; ++ for (i = 0; i < num_rows; i++) ++ { ++ temp = y_data[i]; ++ for (j = A_i[i]; j < A_i[i+1]; j++) ++ temp += A_data[j] * x_data[A_j[j]]; ++ y_data[i] = temp; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; i++) ++ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tx_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_j\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d\\tA_data\\t\\(0.000000, 1, 1, 0\\)" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..05a3bf842 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3. If not see
++# <http://www.gnu.org/licenses/>.
++
++load_lib gcc-dg.exp
++load_lib target-supports.exp
++
++# Initialize `dg'.
++dg-init
++
++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \
++ "" "-fllc-allocate"
++
++# All done.
++dg-finish
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+new file mode 100644
+index 000000000..113acbceb
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-cross-bb-indir-mem-acc.c
+@@ -0,0 +1,36 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno -c --param=mem-access-ratio=1 --param=mem-access-num=0" } */
++
++/* In this DejaGnu test case, we test how Phases 2 and 3 of the llc-allocate
++   pass deal with an indirect memory access in a nested loop where the use-block
++   for the induction variable of this memory access is a child/descendant of its
++   def-block (we arrange this by defining the induction variable in the outer loop).
++   Therefore, the reference can be successfully traced after outer-loop
++   analysis. */
++#include <stdlib.h>
++#include <time.h>
++
++void cross_bb_indir_mem_acc (int *arr1, int *arr2, int *arr3, int *arr4, int n) {
++ srand (time (NULL));
++
++ int j_s;
++ int j_e = arr1[0];
++ int k;
++
++ for (int i = 0; i < n; i++)
++ {
++ j_s = j_e;
++ j_e = arr1[i + 1];
++
++ k = arr3[i];
++
++ for (int j = j_s; j < j_e; j++)
++ {
++ arr4[j] -= arr2[k];
++ }
++
++ }
++}
++
++/* { dg-final { scan-tree-dump "Unhandled indirect memory access tracing."
"llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "Retrace indirect memory access after outer loop analysis:" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +new file mode 100644 +index 000000000..a2e7f66a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++#include ++#define N 131590 ++#define F 384477 ++ ++int ownStartPtr[F]; ++double bPrimePtr[N]; ++double diagPtr[N]; ++double psiPtr[N]; ++double upperPtr[F]; ++double lowerPtr[F]; ++int uPtr[F]; ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); ++ ++int main(int argc, char *argv[]) ++{ ++ int nCells = N; ++ int nFaces = F; ++ int testIter = 2; ++ for (int i = 0; i < testIter; i++) ++ { ++ SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); ++ } ++ return 0; ++} ++ ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) ++{ ++ double psii; ++ int fStart; ++ int fEnd = ownStartPtr[0]; ++ ++ for (int celli = 0; celli < nCells; celli++) ++ { ++ fStart = fEnd; ++ fEnd = ownStartPtr[celli + 1]; ++ psii = bPrimePtr[celli]; ++ ++ for (int facei = fStart; facei ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++branch_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ ApsiPtr[cell] = 0; ++ else ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 100; ++ ++ for (int i=0; i ++ ++#define N 131590 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++break_in_loop (double *diagPtr, double *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cell 0) ++ break; ++ ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell]; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int nCells = N; ++ int testIter = 2; ++ ++ for (int i=0; i ++ ++#define N 131 ++ ++double diagPtr[N]; ++int psiPtr[N]; ++double ApsiPtr[N]; ++ ++void ++goto_in_loop (double *diagPtr, int *psiPtr, double *ApsiPtr, int nCells) ++{ ++ for (int cell=0; cellnodes; ++ while (v > 1) ++ { ++ basic_block bb = di->dfs_to_bb[v]; ++ edge e; ++ ++ par = di->dfs_parent[v]; ++ k = v; ++ ++ ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); ++ ++ if (reverse) ++ { ++ /* If this block has a fake edge to exit, process that first. */ ++ if (bitmap_bit_p (di->fake_exit_edge, bb->index)) ++ { ++ einext = ei; ++ einext.index = 0; ++ goto do_fake_exit_edge; ++ } ++ } ++ ++ /* Search all direct predecessors for the smallest node with a path ++ to them. That way we have the smallest node with also a path to ++ us only over nodes behind us. In effect we search for our ++ semidominator. */ ++ while (!ei_end_p (ei)) ++ { ++ basic_block b; ++ TBB k1; ++ ++ e = ei_edge (ei); ++ b = (reverse) ? 
e->dest : e->src; ++ einext = ei; ++ ei_next (&einext); ++ ++ if (b == en_block) ++ { ++ do_fake_exit_edge: ++ k1 = di->dfs_order[last_basic_block]; ++ } ++ else ++ k1 = di->dfs_order[b->index]; ++ ++ /* Call eval() only if really needed. If k1 is above V in DFS tree, ++ then we know, that eval(k1) == k1 and key[k1] == k1. */ ++ if (k1 > v) ++ k1 = di->key[eval (di, k1)]; ++ if (k1 < k) ++ k = k1; ++ ++ ei = einext; ++ } ++ ++ di->key[v] = k; ++ link_roots (di, par, v); ++ di->next_bucket[v] = di->bucket[k]; ++ di->bucket[k] = v; ++ ++ /* Transform semidominators into dominators. */ ++ for (w = di->bucket[par]; w; w = di->next_bucket[w]) ++ { ++ k = eval (di, w); ++ if (di->key[k] < di->key[w]) ++ di->dom[w] = k; ++ else ++ di->dom[w] = par; ++ } ++ /* We don't need to cleanup next_bucket[]. */ ++ di->bucket[par] = 0; ++ v--; ++ } ++ ++ /* Explicitly define the dominators. */ ++ di->dom[1] = 0; ++ for (v = 2; v <= di->nodes; v++) ++ if (di->dom[v] != di->key[v]) ++ di->dom[v] = di->dom[di->dom[v]]; ++} ++ ++/* { dg-final { scan-tree-dump-times "Warning: Find cycle at bb index" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump "static issue" "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +new file mode 100644 +index 000000000..e18725f60 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +new file mode 100644 +index 000000000..328dc57bc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options " -S -O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +new file mode 100644 +index 000000000..d9c919869 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +new file mode 100644 +index 000000000..806366b5b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +new file mode 100644 +index 000000000..91567d1e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl2strm.c +@@ -0,0 +1,16 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate 
-fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main() ++{ ++ for(int i = 0; i < 100000; i++) ++ { ++ __builtin_prefetch_full(&val[i], 0, 3); ++ val[i] = i + 1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +new file mode 100644 +index 000000000..c28150654 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,4); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +new file mode 100644 +index 000000000..e8d9c8693 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +new file mode 100644 +index 000000000..b0281882f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +new file mode 100644 +index 000000000..26807556f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pldl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],0,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PLDL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +new file mode 100644 +index 
000000000..4f2def13d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,0); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +new file mode 100644 +index 000000000..ecc501f1f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl1strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,1); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL1STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +new file mode 100644 +index 000000000..d140f1ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,2); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +new file mode 100644 +index 000000000..d6f170253 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl2strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,3); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL2STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +new file mode 100644 +index 000000000..8da092b36 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,4); ++ val[i]=i+1; ++ } 
++} ++ ++/* { dg-final { scan-assembler "PSTL3KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +new file mode 100644 +index 000000000..4cf65188a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl3strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,5); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL3STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +new file mode 100644 +index 000000000..36f4a3aa0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4keep.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,6); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4KEEP" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +new file mode 100644 +index 000000000..43d2d41d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-prefetch-full-pstl4strm.c +@@ -0,0 +1,14 @@ ++ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++ ++ ++int val[100000]; ++int main(){ ++ for(int i=0;i<100000;i++){ ++ __builtin_prefetch_full(&val[i],1,7); ++ val[i]=i+1; ++ } ++} ++ ++/* { dg-final { scan-assembler "PSTL4STRM" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +new file mode 100644 +index 000000000..ba90e7ea4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++#include ++ ++#define N 1000 ++ ++long a[N] = {0}; ++long b[N] = {0}; ++long c[N] = {0}; ++ ++double ++referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) ++{ ++ double sum; ++ for (int cell = 0; cell < nCells; cell++) ++ { ++ // Multi-layer pointer ++ sum += psiPtr[lPtr[cell]]; ++ psiPtr[uPtr[cell]] = sum; ++ ++ // Outer pointer, inner array ++ sum += psiPtr[b[cell]]; ++ psiPtr[a[cell]] = sum; ++ ++ // Multi-layer array ++ sum += a[b[cell]]; ++ c[a[cell]] = sum; ++ ++ // Outer array, inner pointer ++ sum += a[lPtr[cell]]; ++ c[lPtr[cell]] = sum; ++ } ++ return sum; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ double *psiPtr = NULL; ++ int *lPtr = NULL; ++ int *uPtr = NULL; ++ psiPtr = (double *) calloc (N, sizeof(double)); ++ 
lPtr = (int *) calloc (N, sizeof(int)); ++ uPtr = (int *) calloc (N, sizeof(int)); ++ ++ for (int i = 0; i < testIter; i++) ++ referenceTrace (psiPtr, lPtr, uPtr, N); ++ ++ free (psiPtr); ++ free (lPtr); ++ free (uPtr); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +new file mode 100644 +index 000000000..b0f68ebe3 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -0,0 +1,211 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-mode=0" } ++ ++program main ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt ++ ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts ++ ++ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, t0, smdiv ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch,iter ++ ++ LOGICAL :: non_hydrostatic ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*36/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 3 ++ rk_order = 1 ++ dts = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ step = 1 ++ non_hydrostatic = .true. ++ ++ call random_number(random1) ++ interval = random1*100 ++ interval=1 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(alt) ++ call random_number(c2a) ++ call random_number(ph) ++ call random_number(pm1) ++ call random_number(mu) ++ call random_number(muts) ++ call random_number(dnw) ++ call random_number(rdnw) ++ call random_number(znu) ++ ++ do iter=1,2 ++ call calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ enddo ++ ++end program ++ ++ ++SUBROUTINE calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ IMPLICIT NONE ! religion first ++ !asb ++! 
declarations for the stuff coming in ++ ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & ++ p ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & ++ t_2, & ++ t_1, & ++ c2a ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 ++ ++ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & ++ muts ++ ++ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & ++ rdnw, & ++ znu ++ ++ REAL, INTENT(IN ) :: t0, smdiv ++ ++ LOGICAL, INTENT(IN ) :: non_hydrostatic ++ ++! local variables ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ REAL :: ptmp ++ ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = min(kte,kde-1) ++ ++ IF (non_hydrostatic) THEN ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ++! al computation is all dry, so ok with moisture ++ ++ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & ++ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) ++ ++! this is temporally linearized p, no moisture correction needed ++ ++ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ ELSE ! hydrostatic calculation ++ ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ p(i,k,j)=mu(i,j)*znu(k) ++ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) ++ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & ++ +mu(i,j)*alt(i,k,j)) ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ END IF ++ ++! divergence damping setup ++ ++ IF (step == 0) then ! we're initializing small timesteps ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ pm1(i,k,j)=p(i,k,j) ++ ENDDO ++ ENDDO ++ ENDDO ++ ELSE ! we're in the small timesteps ++ DO j=j_start, j_end ! and adding div damping component ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ptmp = p(i,k,j) ++ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) ++ pm1(i,k,j) = ptmp ++ ENDDO ++ ENDDO ++ ENDDO ++ END IF ++ ++END SUBROUTINE calc_p_rho ++ ++! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing succeeded" 46 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tp\\t\\(0.000000, 3, 3, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tpm1\\t\\(0.000000, 3, 2, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tph\\t\\(0.000000, 3, 2, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tal\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\talt\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..13d225f35 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,29 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++load_lib gfortran-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Main loop. ++gfortran-dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +new file mode 100644 +index 000000000..501e6e74c +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +@@ -0,0 +1,62 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } ++ ++MODULE INPUT ++ IMPLICIT NONE ++ ++ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 ++ ++ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 ++ REAL(wp), DIMENSION(jpi, jpj) :: e12t ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n ++ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta ++ ++END MODULE INPUT ++ ++PROGRAM MAIN ++ USE INPUT ++ ++ IMPLICIT NONE ++ ++ INTEGER :: EPOCH ++ ++! Initialize arrays ++ ++ e12t = 1 ++ fse3t_n = 1 ++ pta = 1 ++! ++ ++ DO EPOCH=1,2 ++ CALL tra_ldf_iso ++ ENDDO ++ ++END PROGRAM MAIN ++ ++SUBROUTINE tra_ldf_iso ++ USE INPUT ++ ++ IMPLICIT NONE ++ ! ++ INTEGER :: ji, jj, jk, jn ! dummy loop indices ++ REAL(wp) :: zbtr, ztra ! - - ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw ++ ++ DO jn = 1, kjpt ++ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 ++ ++ DO jk = 1, jpkm1 ++ DO jj = 2, jpjm1 ++ DO ji = fs_2, fs_jpim1 ! vector opt. ++ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) ++ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr ++ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra ++ END DO ++ END DO ++ END DO ++ ! ++ END DO ++ ! ++END SUBROUTINE tra_ldf_iso ++ ++! 
{ dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +new file mode 100644 +index 000000000..7345759db +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -0,0 +1,58 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } ++ ++Module module_domain ++ IMPLICIT NONE ++ ++ REAL, PARAMETER :: g = 9.8 ++ TYPE :: grid_type ++ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) ++ REAL, POINTER :: fnm(:), fnp(:) ++ END TYPE ++END Module ++ ++SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) ++ ++ USE module_domain ++ !USE module_model_constants ++ ++ IMPLICIT NONE ++ ++ ++ !TYPE (domain), INTENT(IN) :: grid ++ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy ++ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w ++ ++ ++ INTEGER :: k ++ REAL :: z0, z1, z2, w1, w2 ++ REAL, DIMENSION(k_start:k_end) :: z_at_w ++ REAL, DIMENSION(k_start:k_end-1) :: z ++ TYPE (grid_type), POINTER :: grid ++ ++ ++ DO k = k_start, k_end ++ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g ++ END DO ++ ++ DO k = k_start, k_end-1 ++ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) ++ END DO ++ ++ DO k = k_start+1, k_end-1 ++ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & ++ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) ++ END DO ++ ++ z0 = z_at_w(k_start) ++ z1 = z(k_start) ++ z2 = z(k_start+1) ++ w1 = (z0 - z2)/(z1 - z2) ++ w2 = 1. - w1 ++ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & ++ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) ++ ++END SUBROUTINE calc_p8w ++ ++! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +new file mode 100644 +index 000000000..f79df5d26 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-wrf-4-outer-loop-num.f90 +@@ -0,0 +1,320 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=branch-prob-threshold=50 --param=filter-kernels=0 --param=mem-access-num=2 --param=issue-topn=2 --param=force-issue=1 --param=outer-loop-nums=3" } ++!include "module_small_step_em.F90" ++ ++Module add_type ++ IMPLICIT NONE ++ ++ TYPE :: grid_config_rec_type ++ LOGICAL :: open_xs ++ LOGICAL :: open_ys ++ LOGICAL :: open_xe ++ LOGICAL :: open_ye ++ LOGICAL :: symmetric_xs ++ LOGICAL :: symmetric_xe ++ LOGICAL :: symmetric_ys ++ LOGICAL :: symmetric_ye ++ LOGICAL :: polar ++ LOGICAL :: nested ++ LOGICAL :: periodic_x ++ LOGICAL :: specified ++ END TYPE ++END Module ++ ++program main ++ ++ ++! include "module_small_step_em_modify.F90" ++ ++! use module_small_step_em ++! 
use module_small_step_em_modify ++ ++ use add_type ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step, spec_zone ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme, 1:8) :: llcRefresh ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u, v, u_1, v_1, t_1, ww_1, ft!u, v, u_1, v_1, w_1, t_1, ww1, ww_1,ph_1, ft ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_save, v_save, w_save, t_save, ph_save,h_diabatic ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: u_2, v_2, w_2, t_2, ph_2 ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: c2a, ww_save, cqw, cqu, cqv, alpha, gamma, a ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ww!pb, p, ph, php, pm1, al, alt, ww, random_array ++ ! REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: ru_tend, rv_tend ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t, t_ave, uam, vam, wwam ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_1,mu_2, mu ++ REAL, DIMENSION(ims:ime, jms:jme) :: mub, muu, muv, mut, & ++ msfux, msfuy, & ++ msfvx, msfvx_inv, msfvy, & ++ msftx, msfty ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: muus, muvs, muts, mudf, muave ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu_save, mu_tend ++ ++ REAL, DIMENSION(kms:kme) :: rdn, rdnw,dnw, fnm, fnp, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, cf1, cf2, cf3, t0, emdiv, smdiv, epssm, g ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch ++ ++ LOGICAL :: non_hydrostatic, top_lid ++ ++ ++ TYPE (grid_config_rec_type) :: config_flags ++ config_flags%open_xs = .true. ++ config_flags%open_ys = .true. ++ config_flags%open_xe = .true. ++ config_flags%open_ye = .true. ++ config_flags%symmetric_xs = .true. ++ config_flags%symmetric_xe = .true. ++ config_flags%symmetric_ys = .true. ++ config_flags%symmetric_ye = .true. ++ config_flags%polar = .true. ++ config_flags%nested = .true. ++ config_flags%periodic_x = .true. ++ config_flags%specified = .true. ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*98/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 1 ++ rk_order = 1 ++ dts = 1. ++ epssm = 1. ++ g = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ dts = 1. ++ cf1 = 1. ++ cf2 = 1. ++ cf3 = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ emdiv = 1. ++ step = 1 ++ spec_zone = 1 ++ ++ non_hydrostatic = .true. ++ top_lid = .true. 
++ ++ interval=1 ++ ++ ++ total_time=0 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(u) ++ call random_number(v) ++ call random_number(u_1) ++ call random_number(v_1) ++ call random_number(t_1) ++ call random_number(ft) ++ ++ call random_number(ww) ++ call random_number(ww_1) ++ call random_number(t) ++ call random_number(t_ave) ++ call random_number(uam) ++ call random_number(vam) ++ call random_number(wwam) ++ ++ call random_number(muu) ++ call random_number(muv) ++ call random_number(mut) ++ call random_number(msfux) ++ call random_number(msfuy) ++ call random_number(msfvx) ++ call random_number(msfvx_inv) ++ call random_number(msfvy) ++ call random_number(msftx) ++ call random_number(msfty) ++ call random_number(mu_tend) ++ ++ call random_number(muave) ++ call random_number(muts) ++ call random_number(mudf) ++ call random_number(mu) ++ ++ call random_number(fnm) ++ call random_number(fnp) ++ call random_number(dnw) ++ call random_number(rdnw) ++ ++ DO j=jms, jme ++ DO k=kms, kme ++ DO i=ims, ime ++ ++ llcRefresh(i,k,j,1)=i+k+j+7 ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ do epoch = 1,2 ++ call advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ enddo ++end program ++ ++ ++ ++SUBROUTINE advance_mu_t_fortran_plu( ww, ww_1, u, u_1, v, v_1, & ++ mu, mut, muave, muts, muu, muv, & ++ mudf, uam, vam, wwam, t, t_1, & ++ t_ave, ft, mu_tend, & ++ rdx, rdy, dts, epssm, & ++ dnw, fnm, fnp, rdnw, & ++ msfux, msfuy, msfvx, msfvx_inv, & ++ msfvy, msftx, msfty, & ++ step, config_flags, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its, ite, jts, jte, kts, kte ) ++ use add_type ++ ++ IMPLICIT NONE ! religion first ++ ++ ! 
stuff coming in ++ ++ TYPE(grid_config_rec_type), INTENT(IN ) :: config_flags ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(IN ) :: & ++ u, & ++ v, & ++ u_1, & ++ v_1, & ++ t_1, & ++ ft ++ ++ REAL, DIMENSION( ims:ime , kms:kme, jms:jme ), & ++ INTENT(INOUT) :: & ++ ww, & ++ ww_1, & ++ t, & ++ t_ave, & ++ uam, & ++ vam, & ++ wwam ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(IN ) :: muu, & ++ muv, & ++ mut, & ++ msfux,& ++ msfuy,& ++ msfvx,& ++ msfvx_inv,& ++ msfvy,& ++ msftx,& ++ msfty,& ++ mu_tend ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT( INOUT) :: muave, & ++ muts, & ++ mudf ++ ++ REAL, DIMENSION( ims:ime , jms:jme ), INTENT(INOUT) :: mu ++ ++ REAL, DIMENSION( kms:kme ), INTENT(IN ) :: fnm, & ++ fnp, & ++ dnw, & ++ rdnw ++ ++ ++ REAL, INTENT(IN ) :: rdx, & ++ rdy, & ++ dts, & ++ epssm ++ ++ REAL, DIMENSION (its:ite, kts:kte) :: wdtn, dvdxi ++ REAL, DIMENSION (its:ite) :: dmdt ++ ++ INTEGER :: i,j,k, i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ REAL :: acc ++ ++ INTEGER :: ubv, lbv, t1, t2, t3, t4, ceild, floord ++ ++ ceild(t1, t2) = ceiling(REAL(t1)/REAL(t2)) ++ floord(t1, t2) = floor(REAL(t1)/REAL(t2)) ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = kte-1 ++ IF ( .NOT. config_flags%periodic_x )THEN ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ i_start = max(its,ids+1) ++ i_end = min(ite,ide-2) ++ ENDIF ++ ENDIF ++ IF ( config_flags%specified .or. config_flags%nested ) then ++ j_start = max(jts,jds+1) ++ j_end = min(jte,jde-2) ++ ENDIF ++ ++ i_endu = ite ++ j_endv = jte ++ ++ DO j = j_start, j_end ++ ++ DO i=i_start, i_end ++ dmdt(i) = 0. ++ ENDDO ++ ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ dvdxi(i,k) = msftx(i,j)*msfty(i,j)*( & ++ rdy*((v(i,k,j+1)+muv(i,j+1)*v_1(i,k,j+1)*msfvx_inv(i,j+1)) & ++ -(v(i,k,j )+muv(i,j )*v_1(i,k,j)*msfvx_inv(i,j ))) & ++ +rdx*((u(i+1,k,j)+muu(i+1,j)*u_1(i+1,k,j)/msfuy(i+1,j)) & ++ -(u(i,k,j )+muu(i ,j)*u_1(i,k,j )/msfuy(i,j)) )) ++ dmdt(i) = dmdt(i) + dnw(k)*dvdxi(i,k) ++ ENDDO ++ ENDDO ++ DO i=i_start, i_end ++ muave(i,j) = mu(i,j) ++ mu(i,j) = mu(i,j)+dts*(dmdt(i)+mu_tend(i,j)) ++ mudf(i,j) = (dmdt(i)+mu_tend(i,j)) ! save tendency for div dampfilter ++ muts(i,j) = mut(i,j)+mu(i,j) ++ muave(i,j) =.5*((1.+epssm)*mu(i,j)+(1.-epssm)*muave(i,j)) ++ ENDDO ++ ENDDO ++END SUBROUTINE advance_mu_t_fortran_plu ++ ++! { dg-final { scan-tree-dump "issue_llc_hint" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "analyze_nested_kernels" 2 "llc_allocate" } } ++! 
{ dg-final { scan-tree-dump "Stop tracing the outer loop depth" "llc_allocate" } } +\ No newline at end of file +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 36c3e7d5a..14129a500 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -84,6 +84,7 @@ DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") + DEFTIMEVAR (TV_IPA_HARDWARE_DETECTION, "ipa detection") + DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") ++DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") + DEFTIMEVAR (TV_IPA_LTO_DECOMPRESS , "lto stream decompression") + DEFTIMEVAR (TV_IPA_LTO_COMPRESS , "lto stream compression") +@@ -215,6 +216,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") + DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") + DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") + DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") ++DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") + DEFTIMEVAR (TV_PREDCOM , "predictive commoning") + DEFTIMEVAR (TV_TREE_CH , "tree copy headers") + DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") +diff --git a/gcc/toplev.cc b/gcc/toplev.cc +index f00a166df..bdbd4de63 100644 +--- a/gcc/toplev.cc ++++ b/gcc/toplev.cc +@@ -567,6 +567,12 @@ compile_file (void) + targetm.asm_out.output_ident (ident_str); + } + ++ /* Extend auto profile finalization. */ ++ if (flag_ipa_extend_auto_profile) ++ { ++ free_extend_profile_info (); ++ } ++ + /* Auto profile finalization. */ + if (flag_auto_profile) + end_auto_profile (); +diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc +index d33aaec8c..40f67a8ed 100644 +--- a/gcc/tree-cfg.cc ++++ b/gcc/tree-cfg.cc +@@ -8476,6 +8476,17 @@ print_loops (FILE *file, int verbosity) + print_loop_and_siblings (file, bb->loop_father, 0, verbosity); + } + ++/* Dump a loop to file. */ ++ ++void ++loop_dump (FILE *file, class loop *loop) ++{ ++ print_loop (file, loop, 0, 0); ++ fprintf (file, "vec_niter = "); ++ print_generic_expr (file, loop->vec_nb_iterations); ++ fprintf (file, "\n"); ++} ++ + /* Dump a loop. 
*/ + + DEBUG_FUNCTION void +diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h +index bfe44c073..0982fa7cf 100644 +--- a/gcc/tree-cfg.h ++++ b/gcc/tree-cfg.h +@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); + extern void debug_function (tree, dump_flags_t); + extern void print_loops_bb (FILE *, basic_block, int, int); + extern void print_loops (FILE *, int); ++extern void loop_dump (FILE *file, class loop *loop); + extern void debug (class loop &ref); + extern void debug (class loop *ptr); + extern void debug_verbose (class loop &ref); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index a98f84397..468353d13 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -395,6 +395,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); +@@ -536,6 +537,8 @@ extern simple_ipa_opt_pass *make_pass_ipa_hardware_detection (gcc::context * + ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context ++ *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_target_clone (gcc::context *ctxt); +diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc +index 44157265c..4c014fb23 100644 +--- a/gcc/tree-scalar-evolution.cc ++++ b/gcc/tree-scalar-evolution.cc +@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) + the loop body has been executed 6 times. */ + + tree +-number_of_latch_executions (class loop *loop) ++number_of_latch_executions (class loop *loop, bool guarantee) + { + edge exit; + class tree_niter_desc niter_desc; +@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) + res = chrec_dont_know; + exit = single_exit (loop); + +- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) ++ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, ++ true, NULL, guarantee)) + { + may_be_zero = niter_desc.may_be_zero; + res = niter_desc.niter; +@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) + fprintf (dump_file, "))\n"); + } + +- loop->nb_iterations = res; ++ if (guarantee) ++ loop->nb_iterations = res; + return res; + } + +diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h +index 0f90207bc..dc27d9545 100644 +--- a/gcc/tree-scalar-evolution.h ++++ b/gcc/tree-scalar-evolution.h +@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. 
If not see + #ifndef GCC_TREE_SCALAR_EVOLUTION_H + #define GCC_TREE_SCALAR_EVOLUTION_H + +-extern tree number_of_latch_executions (class loop *); ++extern tree number_of_latch_executions (class loop *, ++ bool guarantee = true); + extern gcond *get_loop_exit_condition (const class loop *); + + extern void scev_initialize (void); +diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc +new file mode 100644 +index 000000000..da6d72b94 +--- /dev/null ++++ b/gcc/tree-ssa-llc-allocate.cc +@@ -0,0 +1,4150 @@ ++/* LLC allocate. ++ Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_VECTOR ++#define INCLUDE_LIST ++#define INCLUDE_ALGORITHM ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "gimple.h" ++#include "predict.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "optabs-query.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "stor-layout.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "gimplify-me.h" ++#include "tree-ssa-loop-ivopts.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop-niter.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfgloop.h" ++#include "tree-scalar-evolution.h" ++#include "langhooks.h" ++#include "tree-inline.h" ++#include "tree-data-ref.h" ++#include "diagnostic-core.h" ++#include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "internal-fn.h" ++#include "tree-cfg.h" ++#include "profile-count.h" ++#include "auto-profile.h" ++ ++/* Number of parallel cores. */ ++const unsigned int PARALLEL_NUM = 304; ++ ++/* Indirect access weight. */ ++const unsigned int INDIRECT_ACCESS_VALUE = 3; ++ ++/* Write memory weight. */ ++const unsigned int WRITE_COST = 4; ++ ++/* Maximum ratio of total prefetch data size to cache size. */ ++const double PREFETCH_CACHE_SIZE_RATIO = 0.8; ++ ++/* Prefetch tool input max length. */ ++#ifndef PREFETCH_TOOL_INPUT_MAX_LEN ++#define PREFETCH_TOOL_INPUT_MAX_LEN 512 ++#endif ++ ++/* Prefetch tool number max length. */ ++#ifndef PREFETCH_TOOL_NUM_MAX_LEN ++#define PREFETCH_TOOL_NUM_MAX_LEN 9 ++#endif ++ ++#ifndef PREFETCH_FUNC_TOPN ++#define PREFETCH_FUNC_TOPN param_llc_allocate_func_topn ++#endif ++ ++namespace { ++ ++/* loop bound info of the memory reference located. */ ++struct loop_bound ++{ ++ /* iv tree_node. */ ++ tree iv; ++ ++ /* define stmt of iv. */ ++ gimple *def_stmt; ++ ++ /* loop where stmt is located. */ ++ class loop *loop; ++ ++ /* loop unroll factor. */ ++ unsigned int unroll; ++ ++ /* Number of iterations of loop. 
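++    Initialized to chrec_dont_know by the constructor and filled in later
++    during dimension tracing.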
*/ ++ tree niters; ++ ++ loop_bound (tree t, gimple *stmt) ++ { ++ iv = t; ++ def_stmt = stmt; ++ loop = loop_containing_stmt (stmt); ++ unroll = 1; ++ niters = chrec_dont_know; ++ } ++}; ++ ++/* method of calculating the data size. */ ++ ++enum calc_type ++{ ++ UNHANDLE_CALC = 0, ++ RUNTIME_CALC, ++ STATIC_CALC ++}; ++ ++/* Describes a info of a memory reference. */ ++ ++struct data_ref ++{ ++ /* The memory reference. */ ++ tree ref; ++ ++ /* Statement where the ref is located. */ ++ gimple *stmt; ++ ++ /* var_decl or param_decl, used for the ref_group. */ ++ tree var; ++ ++ /* Base of the reference. */ ++ tree base; ++ ++ /* Constant offset of the reference. */ ++ tree offset; ++ ++ /* index of the reference. */ ++ tree index; ++ ++ /* Constant step of the reference. */ ++ tree step; ++ ++ /* loop boundary info of each dimension. */ ++ std::vector loop_bounds; ++ ++ /* memory data size, Unit: MB. */ ++ double data_size; ++ ++ /* method of calculating the data size. */ ++ calc_type calc_by; ++ ++ /* True if the info of ref is traced, and then record it. */ ++ unsigned int trace_status_p : 1; ++ ++ /* True if the loop is vectorized. */ ++ unsigned int vectorize_p : 1; ++ ++ /* True if the memory reference is shared. */ ++ unsigned int parallel_p : 1; ++ ++ /* True if the memory reference is regular. */ ++ unsigned int regular_p : 1; ++ ++ /* True if the memory reference is read. */ ++ unsigned int read_p : 1; ++ ++ /* loop father depth. */ ++ unsigned int loop_depth; ++ ++ /* bb index. */ ++ int bb_idx; ++ ++ /* loop index. */ ++ int loop_idx; ++ ++ data_ref () ++ { ++ ref = NULL_TREE; ++ stmt = NULL; ++ var = NULL_TREE; ++ base = NULL_TREE; ++ offset = NULL_TREE; ++ index = NULL_TREE; ++ step = NULL_TREE; ++ data_size = 0; ++ calc_by = UNHANDLE_CALC; ++ trace_status_p = false; ++ vectorize_p = false; ++ parallel_p = false; ++ regular_p = true; ++ read_p = true; ++ loop_depth = 0; ++ bb_idx = 0; ++ loop_idx = 0; ++ } ++}; ++ ++/* ================ phase 1 get_dense_memory_kernels ================ */ ++ ++/* Add ref node and print. */ ++ ++void ++add_ref (std::vector &references, tree op, gimple *stmt, ++ bool vectorize_p, bool read_p) ++{ ++ data_ref ref; ++ ref.ref = op; ++ ref.stmt = stmt; ++ ref.vectorize_p = vectorize_p; ++ ref.read_p = read_p; ++ ref.loop_depth = loop_depth (stmt->bb->loop_father); ++ ref.bb_idx = stmt->bb->index; ++ ref.loop_idx = stmt->bb->loop_father->num; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ references.push_back (ref); ++} ++ ++/* Get the references from the simple call (vectorization type). */ ++ ++void ++get_references_in_gimple_call (gimple *stmt, std::vector &references) ++{ ++ if (gimple_code (stmt) != GIMPLE_CALL) ++ return; ++ ++ if (gimple_call_internal_p (stmt)) ++ { ++ bool read_p = false; ++ switch (gimple_call_internal_fn (stmt)) ++ { ++ case IFN_MASK_GATHER_LOAD: ++ case IFN_MASK_LOAD: ++ { ++ if (gimple_call_lhs (stmt) == NULL_TREE) ++ return; ++ read_p = true; ++ // FALLTHRU ++ } ++ case IFN_MASK_STORE: ++ { ++ /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4); ++ ++ _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B]; ++ .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2); ++ ++ _1 = (sizetype) a_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, ++ { 0.0, ... 
}, loop_mask_5); ++ */ ++ tree op1 = gimple_call_arg (stmt, 0); ++ if (TREE_CODE (op1) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "get_references_in_gimple_call: "); ++ fprintf (dump_file, "find base that not ssa_name: "); ++ print_generic_expr (dump_file, op1, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ gimple *op1_def = SSA_NAME_DEF_STMT (op1); ++ if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN) ++ { ++ /* &MEM[base: xx] */ ++ tree rhs1 = gimple_assign_rhs1 (op1_def); ++ /* If the definition stmt of the operation is memory ++ reference type, read it directly. */ ++ if (TREE_CODE (rhs1) == ADDR_EXPR ++ && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF) ++ op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */ ++ } ++ ++ add_ref (references, op1, stmt, true, read_p); ++ return; ++ } ++ default: ++ return; ++ } ++ } ++} ++ ++/* Check whether memory reference is located exactly in main function. ++ There are some other unexpected scenarios where mem ref or function is ++ tracing failed without loc info (newly generated gimple/function). */ ++ ++bool ++is_reference_in_main_p (gimple *stmt) ++{ ++ expanded_location xloc = expand_location (stmt->location); ++ if (DECL_NAME (cfun->decl) && MAIN_NAME_P (DECL_NAME (cfun->decl))) ++ { ++ /* NEXT STEP: Check why some functions have no end_locus. */ ++ if (!(DECL_SOURCE_LOCATION (current_function_decl) ++ && cfun->function_end_locus)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Cannot find function start-end location.\n"); ++ return true; ++ } ++ else if (!(xloc.file && xloc.line)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Cannot find gimple statement location.\n"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ return false; ++ } ++ int fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = expand_location (cfun->function_end_locus).line; ++ ++ if (xloc.line >= fn_start && xloc.line <= fn_end) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Memory access in main function: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Stores the locations of memory references in STMT to REFERENCES. */ ++ ++void ++get_references_in_stmt (gimple *stmt, std::vector &references) ++{ ++ if (!gimple_vuse (stmt)) ++ return; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "gimple_vuse: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ ++ /* Filter out memory references located in main function. This is a ++ experimental filtering scheme ONLY for HPC case verification as ++ some HPC cases assign values for variables (mem ref) in main function. 
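++     Such statements are skipped below, so no data_ref is recorded for them.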
*/ ++ if (is_reference_in_main_p (stmt)) ++ return; ++ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ tree op0 = gimple_assign_lhs (stmt); ++ tree op1 = gimple_assign_rhs1 (stmt); ++ tree base = NULL_TREE; ++ ++ /* _1 = MEM[base: a, index: i, step: 8, offset: 0B]; */ ++ if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1)) ++ && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base)) ++ add_ref (references, op1, stmt, false, true); ++ ++ if (REFERENCE_CLASS_P (op0) && get_base_address (op0)) ++ add_ref (references, op0, stmt, false, false); ++ } ++ else if (gimple_code (stmt) == GIMPLE_CALL) ++ get_references_in_gimple_call (stmt, references); ++ ++ return; ++} ++ ++/* flag of loop filter out. */ ++ ++struct loop_filter_out_flag ++{ ++ /* Use external call. */ ++ bool use_ext_call; ++ ++ /* Use external node. */ ++ bool use_ext_node; ++ ++ /* Use loop defined in macros. */ ++ bool use_macro_loop; ++ ++ /* Use external node. */ ++ bool use_cond_func; ++}; ++ ++/* Check whether an external node is used. */ ++ ++bool use_ext_node_p (const std::vector &references, ++ unsigned int &start) ++{ ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ ++ unsigned i = start; ++ start = references.size (); ++ for (; i < references.size (); i++) ++ { ++ data_ref ref = references[i]; ++ expanded_location xloc = expand_location (ref.stmt->location); ++ if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "use_ext_node\n\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Determine whether to filter out loops by stmt. */ ++ ++bool ++filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, ++ const std::vector &references, ++ unsigned int &start) ++{ ++ expanded_location xloc = expand_location (stmt->location); ++ /* check use_ext_call. */ ++ if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_ext_call: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_ext_call = true; ++ return true; ++ } ++ ++ /* check use_macro_loop. */ ++ if (xloc.file && xloc.column != 1) ++ loop_filter.use_macro_loop = false; ++ ++ /* check use_cond_func, VEC_COND_EXPR/MIN_EXPR/MAX_EXPR. */ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN) ++ { ++ enum tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR ++ || rhs_code == MAX_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "use_cond_func: "); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); ++ } ++ loop_filter.use_cond_func = true; ++ return true; ++ } ++ } ++ ++ /* check use_ext_node. */ ++ if (use_ext_node_p (references, start)) ++ { ++ loop_filter.use_ext_node = true; ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Dump the flag type of the loop is filtered out. 
*/ ++ ++void ++dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) ++{ ++ if (loop_filter.use_ext_call) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_call\n"); ++ } ++ ++ if (loop_filter.use_ext_node) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_node\n"); ++ } ++ ++ if (loop_filter.use_macro_loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); ++ } ++ ++ if (loop_filter.use_cond_func) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_cond_func\n"); ++ } ++} ++ ++/* Get references in loop. */ ++ ++bool ++get_references_in_loop (std::vector &references, ++ loop_filter_out_flag &loop_filter, ++ class loop *loop) ++{ ++ unsigned int start = 0; ++ bool filter_out_loop = true; ++ ++ /* Analyze each bb in the loop. */ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ gimple_stmt_iterator bsi; ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ gimple *stmt = gsi_stmt (bsi); ++ get_references_in_stmt (stmt, references); ++ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, ++ references, start); ++ if (filter_out_loop) ++ break; ++ } ++ if (filter_out_loop) ++ break; ++ } ++ free (body); ++ return !filter_out_loop; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. ++ Assume that the HPC data reading and calculation process does not involve ++ adding branches in loops. Therefore, all bbs of loops are directly used for ++ calculation (excluding embedded loops) without considering branch weighting. ++*/ ++ ++unsigned ++estimate_loop_insns (class loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body (loop); ++ gimple_stmt_iterator gsi; ++ unsigned size = 0, i; ++ ++ for (i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ size += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ free (body); ++ ++ return size; ++} ++ ++/* Check whether the memory access is dense. */ ++ ++bool ++dense_memory_p (const std::vector &references, class loop *loop) ++{ ++ int ref_count = references.size (); ++ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); ++ float mem_to_insn_ratio = (float)ref_count / (float)ninsns; ++ ++ /* The number of cores to be run and DDR bandwidth information can be ++ transferred to flexibly adjust the threshold. */ ++ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) ++ && ref_count >= param_mem_access_num); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ ++ /* Dump dense memory source code location. 
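++	 The location is printed as [file:function(start-end):line:column]
++	 ahead of the density statistics.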
*/ ++ if (ref_count && references[0].stmt->location) ++ { ++ expanded_location xloc = expand_location ++ (references[0].stmt->location); ++ int fn_start = 0; ++ if (DECL_SOURCE_LOCATION (current_function_decl)) ++ fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = fn_start; ++ if (cfun->function_end_locus) ++ fn_end = expand_location (cfun->function_end_locus).line; ++ if (xloc.file) ++ fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ", ++ xloc.file, fn_name, fn_start, fn_end, ++ xloc.line, xloc.column); ++ } ++ ++ /* Dump memory dense information. */ ++ if (dense_mem) ++ fprintf (dump_file, "dense memory access: "); ++ else ++ fprintf (dump_file, "non-dense mem access: "); ++ fprintf (dump_file, ++ "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n", ++ ref_count, ninsns, mem_to_insn_ratio); ++ } ++ ++ return dense_mem; ++} ++ ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ ++void ++analyze_loop_dense_memory (std::vector &kernels, ++ std::map > &kernels_refs, ++ class loop *loop) ++{ ++ std::vector references; ++ number_of_latch_executions (loop); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n========== Processing loop %d: ==========\n", ++ loop->num); ++ loop_dump (dump_file, loop); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "loop unroll: %d\n", loop->unroll); ++ } ++ ++ if (get_loop_exit_edges (loop).length () != 1) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: loop_multiple_exits\n"); ++ return; ++ } ++ ++ loop_filter_out_flag loop_filter = {false, false, true, false}; ++ ++ if (!get_references_in_loop (references, loop_filter, loop)) ++ { ++ dump_loop_filter_out_flag (loop_filter); ++ return; ++ } ++ ++ if (dense_memory_p (references, loop)) ++ { ++ kernels_refs[loop] = references; ++ kernels.push_back (loop); ++ } ++} ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ ++bool ++get_dense_memory_kernels (std::vector &kernels, ++ std::map > &kernels_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); ++ for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) ++ analyze_loop_dense_memory (kernels, kernels_refs, loop); ++ return kernels.size () > 0; ++} ++ ++/* ================ phase 2 trace_data_refs_info ================ */ ++ ++/* Determine whether the declaration is a non-vectorized. */ ++ ++bool ++generic_decl_p (tree expr) ++{ ++ if (expr == NULL_TREE) ++ return false; ++ enum tree_code expr_code = TREE_CODE (expr); ++ if (expr_code != VAR_DECL && expr_code != PARM_DECL ++ && expr_code != COMPONENT_REF) ++ return false; ++ return true; ++} ++ ++/* Initial worklist preparation for source variable tracing. ++ Add different initial node based on different gimple statements. 
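++   For a PHI every argument is queued; for PLUS_EXPR/MINUS_EXPR both operands
++   are queued; for single-operand codes (SSA_NAME, NOP_EXPR, NEGATE_EXPR,
++   POINTER_PLUS_EXPR, COMPONENT_REF) only the first operand is queued.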
*/ ++ ++void ++add_worklist (std::vector &worklist, std::set &walked, ++ gimple *def_stmt) ++{ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) ++ { ++ tree node = gimple_phi_arg_def (def_stmt, i); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR ++ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME ++ || rhs_code == COMPONENT_REF) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ node = gimple_assign_rhs2 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == TARGET_MEM_REF || rhs_code == MEM_REF) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "possibly unnested indirect memory access: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ else ++ { ++ /* unhandled assign rhs_code: _219 = _17 * _70; ++ _17 = *grid_56(D).sst.span; ++ _70 = *grid_56(D).sst.dim[0].stride; ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled assign rhs_code: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unsupported tracing stmt: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++ ++/* Tracing source variables: ++ vectp.1 = a_2(D) + _3; ++ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; ++ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); ++ ++ _1 = (sizetype) b_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, ++ loop_mask_5); ++ ... ++ Due to previous pass optimizations, the current tracing method can find ++ several source variable candidates. We decide to record them in a map and ++ later filter out the true base variable by some criteria. ++*/ ++ ++void ++trace_base_var_helper (tree arg, std::set &walked, ++ std::map& base_var_candid, bool is_vect_type) ++{ ++ if (arg == NULL) ++ return; ++ ++ /* Var_decl type: base address extracted from ARRAY_REF. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == ARRAY_TYPE && TREE_CODE (arg) == VAR_DECL ++ && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "var_decl type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* Array type. */ ++ tree op0 = NULL; ++ if (TREE_CODE (arg) == ADDR_EXPR ++ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "array type\n"); ++ base_var_candid[op0] += 1; ++ return; ++ } ++ ++ /* Pointer type. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "pointer type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* SSA_NAME type. 
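++     Non-vectorized pointer-typed SSA names that still carry their variable
++     are recorded directly; otherwise the defining statement is traced
++     recursively through the worklist.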
*/ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return; ++ ++ tree tmp_var = SSA_NAME_VAR (arg); ++ if (tmp_var && !is_vect_type && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ssa pointer type\n"); ++ base_var_candid[tmp_var] += 1; ++ return; ++ } ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ if (!walked.count (tmp_var)) ++ walked.insert (tmp_var); ++ trace_base_var_helper (tmp_var, walked, base_var_candid, is_vect_type); ++ } ++ else ++ { ++ std::vector worklist; ++ add_worklist (worklist, walked, def_stmt); ++ for (unsigned i = 0; i < worklist.size (); ++i) ++ trace_base_var_helper (worklist[i], walked, base_var_candid, is_vect_type); ++ } ++} ++ ++/* Identify the base variable traced from base address of memory reference. ++ We recognize that current method could detect several base variable ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1) The number of base variable candidates is 1; ++ 2) The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ ++ ++bool ++trace_base_var (data_ref &mem_ref, std::set &walked) ++{ ++ tree &var = mem_ref.var; ++ tree arg = mem_ref.base; ++ std::map base_var_candid; ++ bool is_vect_type = TREE_CODE (TREE_TYPE (mem_ref.ref)) == VECTOR_TYPE; ++ trace_base_var_helper (arg, walked, base_var_candid, is_vect_type); ++ bool is_tracing_unusual = false; ++ if (base_var_candid.size () == 1) ++ var = base_var_candid.begin ()->first; ++ else ++ { ++ is_tracing_unusual = true; ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ var = it->second == 1 ? it->first : var; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Traced variables at "); ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, ":\n"); ++ for (std::map::iterator it = base_var_candid.begin (); ++ it != base_var_candid.end (); ++it) ++ fprintf (dump_file, "%s:%d, ", get_name (it->first), it->second); ++ fprintf (dump_file, "\n"); ++ ++ if (var == NULL_TREE) ++ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); ++ else if (is_tracing_unusual && var != NULL_TREE) ++ fprintf (dump_file, "Tracing unusual number or occurrences of base " ++ "variables. Choose %s.\n", ++ get_name (var)); ++ } ++ return var != NULL_TREE; ++} ++ ++/* Recursively trace and check whether the definition stmt of the ++ index operand is a recorded stmt in direct access tracing. ++ Return 0 if ref is a direct access a[]. ++ Return 1 if ref is a non-nested indirect access a[b[]]. ++ Return 2 if ref is a complex indirect memory access, such as a[f(b[])]. */ ++ ++int ++trace_indirect_operand (tree arg, std::set &traced_ref_stmt) ++{ ++ /* Return 0 if tree `arg` is not an SSA for further tracing. */ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return 0; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ /* Return 1 if `index` has been detected as a traced direct memory access ++ before. 
*/ ++ if (traced_ref_stmt.count (def_stmt)) ++ return 1; ++ ++ /* Return 0 if def stmt of `arg` is not in gimple assign type. Stop tracing ++ index operand and currently no memory access operand is detected. */ ++ if (!def_stmt || !is_gimple_assign (def_stmt)) ++ return 0; ++ ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array ++ type indirect memory access. */ ++ if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR ++ && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR) ++ { ++ /* Return 2 if tree code has any type representing references to storge, ++ implying a complex indirect memory access scenario for future ++ analysis. */ ++ if (rhs_code == MEM_REF || rhs_code == TARGET_MEM_REF ++ || rhs_code == ARRAY_REF || rhs_code == ARRAY_RANGE_REF ++ || rhs_code == COMPONENT_REF || rhs_code == ADDR_EXPR ++ || rhs_code == INDIRECT_REF) ++ return 2; ++ ++ /* Return 0 and stop tracing if tree code is not a common tracing ++ operand, but still reflected as a non-reference type. ++ Caveats: if we never deal with this tree code before, maybe it is ++ more suitable to treat this scenario strictly. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unknown tracing tree code: %s\n", ++ get_tree_code_name (rhs_code)); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return 0; ++ } ++ ++ tree op = NULL_TREE; ++ ssa_op_iter iter; ++ FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) ++ { ++ int trace_indir_p = trace_indirect_operand (op, traced_ref_stmt); ++ if (trace_indir_p != 0) ++ return trace_indir_p; ++ } ++ return 0; ++} ++ ++/* Trace the pointer of the direct/indirect memory access: ++ 1) Obtain the base address of the memory access. ++ 2) If index variable is formed by another memory access operation (i.e., an ++ indication of indirect memory access), ensure that the index has been ++ traced in an already discovered direct memory access. ++ 3) Otherwise, the memory access is in a more complex scenario and we need to ++ postpone the analysis later. For example, the indirect memory access is ++ nested, a[b[c[...]]], or the index variable (formed in another memory ++ access) has not been recorded/traced yet. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (long unsigned int) _1; ++ _5 = _4 * 8; ++ _6 = p(D) + _5; // get base ++ _7 = *_6; // start tracing ++*/ ++ ++bool ++trace_ptr_mem_ref (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ /* Simple scenario: ++ _2208 = np.120_2207 * 8; ++ _1921 = sorted_weight$data_381 + _2208; ++ *_1921 = _2206; ++ ++ Complex scenario: ++ MEM[base: _3235, index: ivtmp.2768_3189, step: 4, offset: 0B] = _105; ++ _3236 = (sizetype) _214; ++ _3237 = _3236 * 4; ++ _3238 = _857 + _3237; // base + index * step ++ _3239 = _3238 + 4; // offset ++ MEM[base: _3239, index: ivtmp.2768_3189, step: 4, offset: 0B] = 0.0; ++ */ ++ tree pointer = TREE_OPERAND (mem_ref.ref, 0); ++ tree offset = TREE_OPERAND (mem_ref.ref, 1); ++ if (TREE_CODE (offset) != INTEGER_CST) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for non-constant offset.\n"); ++ ++ return false; ++ } ++ if (TREE_CODE (pointer) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for non-ssa pointer.\n"); ++ ++ return false; ++ } ++ ++ /* Tracing back base address from SSA. 
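++     The pointer must be defined by a POINTER_PLUS_EXPR of the form
++     base + index * step; any other defining statement stops the trace.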
*/ ++ gimple *ptr_def_stmt = SSA_NAME_DEF_STMT (pointer); ++ if (ptr_def_stmt == NULL || gimple_code (ptr_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (ptr_def_stmt) != POINTER_PLUS_EXPR) ++ return false; ++ tree base = gimple_assign_rhs1 (ptr_def_stmt); ++ /* index_offset = index * step. */ ++ tree index_offset = gimple_assign_rhs2 (ptr_def_stmt); ++ ++ /* Tracing back index from SSA. */ ++ if (TREE_CODE (index_offset) != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (TREE_CODE (index_offset) == INTEGER_CST) ++ fprintf (dump_file, "Constant index for memory access.\n"); ++ else ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ } ++ return false; ++ } ++ ++ gimple *idx_def_stmt = SSA_NAME_DEF_STMT (index_offset); ++ if (idx_def_stmt == NULL || gimple_code (idx_def_stmt) != GIMPLE_ASSIGN ++ || gimple_assign_rhs_code (idx_def_stmt) != MULT_EXPR) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled scenario for index tracing.\n"); ++ return false; ++ } ++ ++ /* Split array index from total offset of index, `index * step`. */ ++ mem_ref.base = base; ++ mem_ref.offset = offset; ++ mem_ref.index = gimple_assign_rhs1 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs2 (idx_def_stmt); ++ if (TREE_CODE (gimple_assign_rhs1 (idx_def_stmt)) == INTEGER_CST) ++ { ++ mem_ref.index = gimple_assign_rhs2 (idx_def_stmt); ++ mem_ref.step = gimple_assign_rhs1 (idx_def_stmt); ++ } ++ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ } ++ else if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ else ++ { ++ /* Record indirect memory access with complex scenarios for future ++ analysis. */ ++ unresolved_refs.push_back (mem_ref); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unhandled indirect memory access tracing.\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Tracing direct memory reference information. */ ++ ++bool ++trace_direct_mem_ref (data_ref &mem_ref) ++{ ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF: ++ address = base + index * step + offset. ++ MASK_LOAD example: ++ _43 = &MEM[base: _42, index: ivtmp_140, step: 8, offset: 0B]; ++ vect__42.11_160 = .MASK_LOAD (_43, 64B, loop_mask_163); ++ ++ In some cases (2D-array or complex-index 1D array), mem_ref's `base` ++ may actually represent `base + index * step` when `base` address updates ++ by a PHI operation, e.g., ++ MEM[base: _51, offset: 0B] ++ _51 = (void *) ivtmp.18_11; ++ ivtmp.18_11 = PHI ++ ivtmp.18_43 = ivtmp.18_11 + 16; ++ ivtmp.18_52 = (unsigned long) _10; ++ _10 = arr2D_29(D) + _9; ++ */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Direct memory access tracing succeeded.\n"); ++ ++ return true; ++} ++ ++/* Tracing vectorized indirect memory reference information. 
++ MASK_GATHER_LOAD example: ++ vect__45.13_146 = .MASK_LOAD (_41, 32B, loop_mask_153); ++ vect__46.14_145 = (vector([2,2]) long unsigned int) vect__45.13_146; ++ vect_patt_163.15_143 = .MASK_GATHER_LOAD (_144, vect__46.14_145, 8, ++ { 0.0, ... }, loop_mask_153); */ ++ ++bool ++trace_indirect_mem_ref_vectorized (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* Processing of vectorization types. */ ++ if (mem_ref.vectorize_p) ++ { ++ tree op = gimple_call_arg (mem_ref.stmt, 1); ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ { ++ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.index = gimple_call_arg (mem_ref.stmt, 1); ++ mem_ref.step = gimple_call_arg (mem_ref.stmt, 2); ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ return true; ++ } ++ } ++ return false; ++} ++ ++/* Trace the array of the indirect memory access: ++ 1) Obtain the base address of the indirect memory access. ++ 2) Ensure that the index has been traced in the direct memory access. ++ e.g., ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (data_ref &mem_ref, std::set &traced_ref_stmt) ++{ ++ tree base = TREE_OPERAND (mem_ref.ref, 0); ++ tree index = TREE_OPERAND (mem_ref.ref, 1); ++ if (trace_indirect_operand (index, traced_ref_stmt)) ++ { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. */ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Trace memory references base info: ++ 1) Memory access rule analysis and reference info tracing ++ 2) Source variable tracing, along base address of memory reference ++ We will extend parallel analysis later. ++*/ ++ ++void ++trace_ref_info (data_ref &mem_ref, std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ enum tree_code ref_code = TREE_CODE (mem_ref.ref); ++ /* 1) Direct and indirect access traces. */ ++ switch (ref_code) ++ { ++ case MEM_REF: ++ /* Non-vectorized direct/indirect access by pointer. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "MEM_REF\n"); ++ if (!trace_ptr_mem_ref (mem_ref, traced_ref_stmt, unresolved_refs)) ++ return; ++ break; ++ case TARGET_MEM_REF: ++ /* Vectorized and non-vectorized direct access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "TARGET_MEM_REF\n"); ++ if (!trace_direct_mem_ref (mem_ref)) ++ return; ++ break; ++ case SSA_NAME: ++ /* Vectorized indirect memory access. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "SSA_NAME\n"); ++ if (!trace_indirect_mem_ref_vectorized (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ case ARRAY_REF: ++ /* Non-vectorized indirect memory access. 
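++	 e.g. _6 = p[_5], where the index _5 is derived from an already
++	 traced load.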
*/ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ARRAY_REF\n"); ++ if (!trace_indirect_array (mem_ref, traced_ref_stmt)) ++ return; ++ break; ++ default: ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref is another tree-code: "); ++ fprintf (dump_file, "stmt: "); ++ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "ref: "); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (mem_ref.regular_p) ++ traced_ref_stmt.insert (mem_ref.stmt); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Trace all references in the loop. */ ++ ++void ++trace_loop_refs_info (std::vector &refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (refs[i], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++void ++trace_data_refs_info (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ trace_loop_refs_info (loop_refs[loop], traced_ref_stmt, unresolved_refs); ++ } ++} ++ ++/* Retrace references base info for complex scenarios in indirect memory access ++ after Phase 3. */ ++ ++void ++retrace_ref_info_unresolved (data_ref &mem_ref, ++ std::set &traced_ref_stmt) ++{ ++ /* 1) Indirect access traces. */ ++ int trace_index_indir_p = trace_indirect_operand (mem_ref.index, ++ traced_ref_stmt); ++ if (trace_index_indir_p == 1) ++ { ++ mem_ref.regular_p = false; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Indirect memory access tracing succeeded.\n"); ++ } ++ ++ /* 2) Source variable tracing. */ ++ std::set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref, walked)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Source variable tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Retrace all unresolved references. 
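++   Called after the outer-loop analysis, giving complex indirect accesses
++   that were postponed earlier another chance to resolve against the
++   statements recorded in traced_ref_stmt.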
*/ ++ ++void ++retrace_loop_refs_info_unresolved (std::vector &unresolved_refs, ++ std::set &traced_ref_stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "\nRetrace indirect memory access after outer loop analysis:\n"); ++ for (unsigned i = 0; i < unresolved_refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, unresolved_refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ retrace_ref_info_unresolved (unresolved_refs[i], traced_ref_stmt); ++ } ++} ++ ++/* ================ phase 3 analyze_nested_kernels ================ */ ++ ++/* Return the inner most type for arrays and pointers of TYPE. */ ++ ++tree ++inner_type (tree type) ++{ ++ while (POINTER_TYPE_P (type) ++ || TREE_CODE (type) == ARRAY_TYPE) ++ type = TREE_TYPE (type); ++ return type; ++} ++ ++/* Check whether the input iv is the loop dimension boundary. */ ++ ++bool ++loop_bound_iv_p (tree t, tree &outer_loop_t) ++{ ++ if (t == NULL || TREE_CODE (t) != SSA_NAME ++ || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE) ++ return false; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ ++ /* NOP_EXPR convertion between PHI node and memory reference due to MACRO. ++ n_898 = PHI ++ _757 = (sizetype) n_898; ++ _900 = MEM[base: _726, index: _757, step: 8, offset: 0B]; ++ */ ++ while (gimple_code (def_stmt) == GIMPLE_ASSIGN ++ && gimple_assign_rhs_code (def_stmt) == NOP_EXPR) ++ def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (def_stmt)); ++ ++ if (gimple_code (def_stmt) != GIMPLE_PHI) ++ return false; ++ ++ /* Filter scenarios with only two phi inputs. */ ++ if (gimple_phi_num_args (def_stmt) != 2) ++ return false; ++ ++ gphi *phi_stmt = as_a (def_stmt); ++ basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src; ++ basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src; ++ ++ class loop *loop = loop_containing_stmt (def_stmt); ++ bool res = false; ++ /* Two phi inputs, one from the current loop and one from the outer loop. */ ++ if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop))) ++ { ++ outer_loop_t = gimple_phi_arg_def (def_stmt, 1); ++ res = true; ++ } ++ else if ((src1->loop_father == loop) ++ && (src0->loop_father == loop_outer (loop))) ++ { ++ outer_loop_t = gimple_phi_arg_def (def_stmt, 0); ++ res = true; ++ } ++ ++ if (res) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "===> "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ return true; ++ } ++ return false; ++} ++ ++/* add worklist and walked list. */ ++ ++void ++add_worklist_walked (std::vector &worklist, std::set &walked, ++ tree node) ++{ ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ /* Avoid phi node cycle introduction, which makes the worklist unable ++ to end. */ ++ walked.insert (node); ++ } ++} ++ ++/* check bound iv and add worklist. 
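++   If T is defined by a two-input PHI that qualifies as a loop bound IV,
++   record it in MEM_REF's loop_bounds and continue from the outer-loop
++   argument; for assignments, queue the operands that may lead to further
++   bound IVs.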
*/ ++ ++void ++check_bound_iv_and_add_worklist (std::vector &worklist, ++ std::set &walked, ++ std::set &walked_loop, ++ tree t, data_ref &mem_ref) ++{ ++ if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) ++ return; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (t); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, t, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ tree out_loop_t = NULL_TREE; ++ if (loop_bound_iv_p (t, out_loop_t)) ++ { ++ basic_block bb = gimple_bb (def_stmt); ++ if (!walked_loop.count (bb)) ++ { ++ mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt)); ++ walked_loop.insert (bb); ++ } ++ add_worklist_walked (worklist, walked, out_loop_t); ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ ++ /* unary. */ ++ if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ else if (rhs_code == POINTER_PLUS_EXPR) ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ ++ /* binary. */ ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR ++ || rhs_code == MULT_EXPR) ++ { ++ add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt)); ++ add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt)); ++ } ++ } ++} ++ ++/* DFS trace the loop bound of iv. */ ++ ++bool ++trace_loop_bound_iv (data_ref &mem_ref) ++{ ++ /* In indirect memory access, the size cannot be determined based on the ++ loop boundary. However, we can take advantage of loop bound as an upper ++ bound (unrepeated memory access) to predict the variable footprint ++ involved in the specific loop dimension. */ ++ ++ /* Determine and record the boundary iv of the current index, ++ but do not trace it. */ ++ tree outer_loop_t = NULL_TREE; ++ /* indirect access example, mem_ref.index = _64 ++ _62 = MEM[symbol: uPtr, index: ivtmp.22_96, step: 4, offset: 0B]; ++ _63 = (long unsigned int) _62; ++ _64 = _63 * 8; ++ _65 = [openfoam_smooth.c:28:28] &bPrimePtr + _64; ++ _66 = *_65; */ ++ if (loop_bound_iv_p (mem_ref.index, outer_loop_t) || !mem_ref.regular_p) ++ { ++ mem_ref.loop_bounds.push_back ( ++ loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index))); ++ if (!mem_ref.regular_p) ++ return false; ++ } ++ ++ std::vector worklist; ++ worklist.push_back (mem_ref.base); ++ std::set walked; ++ std::set walked_loop; ++ ++ while (worklist.size ()) ++ { ++ tree t = worklist.back (); ++ worklist.pop_back (); ++ ++ /* add worklist. */ ++ check_bound_iv_and_add_worklist (worklist, walked, walked_loop, t, mem_ref); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nmem_ref access dimension: %ld\n", ++ mem_ref.loop_bounds.size ()); ++ fprintf (dump_file, "Traced variables: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ ++ return mem_ref.loop_bounds.size () > 0; ++} ++ ++/* dump loop bound. 
*/ ++ ++void ++loop_bound_dump (FILE *file, loop_bound &lb) ++{ ++ class loop *loop = lb.loop; ++ fprintf (file, "loop_bound: loop_%d (", loop->num); ++ if (loop->header) ++ fprintf (file, "header = %d", loop->header->index); ++ else ++ { ++ fprintf (file, "deleted)\n"); ++ return; ++ } ++ if (loop->latch) ++ fprintf (file, ", latch = %d", loop->latch->index); ++ fprintf (file, ", lb_niters = "); ++ print_generic_expr (file, lb.niters); ++ fprintf (file, ")\n\n"); ++} ++ ++/* static calculate data size. */ ++ ++void ++static_calculate_data_size (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nstatic_calculate_data_size\n"); ++ ++ tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ unsigned HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ unsigned HOST_WIDE_INT est_niter = tree_to_uhwi ++ (mem_ref.loop_bounds[i].niters); ++ unsigned int unroll = mem_ref.loop_bounds[i].unroll; ++ if (i == 0) ++ { ++ /* The unit conversion between byte, kilobytes, and megabytes is ++ 1024. */ ++ mem_ref.data_size = double (type_size ++ * est_niter * unroll) / 1024 / 1024; ++ } ++ else ++ mem_ref.data_size *= est_niter * unroll; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size); ++ } ++} ++ ++/* Recursive tracing and creating of dominant nodes. */ ++ ++tree ++trace_and_create_dominate_expr (tree expr, class loop *outermost) ++{ ++ if (expr == NULL_TREE || is_gimple_constant (expr)) ++ return expr; ++ ++ if (TREE_CODE (expr) != SSA_NAME) ++ return NULL_TREE; ++ ++ if (SSA_NAME_IS_DEFAULT_DEF (expr)) ++ return expr; ++ ++ gimple *stmt = SSA_NAME_DEF_STMT (expr); ++ basic_block def_bb = gimple_bb (stmt); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return NULL_TREE; ++ ++ if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb)) ++ return expr; ++ ++ if (gimple_code (stmt) != GIMPLE_ASSIGN) ++ return NULL_TREE; ++ ++ enum tree_code rhs_code = gimple_assign_rhs_code (stmt); ++ tree_code_class code_class = TREE_CODE_CLASS (rhs_code); ++ tree type = TREE_TYPE (gimple_assign_lhs (stmt)); ++ tree rhs1 = trace_and_create_dominate_expr (gimple_assign_rhs1 (stmt), ++ outermost); ++ if (rhs1 == NULL_TREE) ++ return NULL_TREE; ++ ++ if (code_class == tcc_unary) ++ { ++ tree expr_new = build1 (rhs_code, type, rhs1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ else if (code_class == tcc_binary) ++ { ++ tree rhs2 = trace_and_create_dominate_expr (gimple_assign_rhs2 (stmt), ++ outermost); ++ if (rhs2 == NULL_TREE) ++ return NULL_TREE; ++ ++ tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Recursive parsing and craating of nodes in expr expressions. */ ++ ++tree ++parse_and_create_expr (tree expr, class loop *outermost) ++{ ++ if (expr == NULL_TREE || expr == chrec_dont_know ++ || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR) ++ { ++ /* tcc_expression (e.g., &q) situation combined with tcc_unary. 
*/ ++ if (TREE_CODE (expr) == ADDR_EXPR && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "tcc_expression case in ADDR_EXPR: "); ++ print_generic_expr (dump_file, expr, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr; ++ } ++ ++ if (TREE_CODE (expr) == SSA_NAME) ++ return trace_and_create_dominate_expr (expr, outermost); ++ else if (EXPR_P (expr)) ++ { ++ enum tree_code tree_code = TREE_CODE (expr); ++ tree_code_class code_class = TREE_CODE_CLASS (tree_code); ++ tree type = TREE_TYPE (expr); ++ tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost); ++ if (op1 == NULL_TREE) ++ return NULL_TREE; ++ ++ if (code_class == tcc_unary) ++ { ++ tree expr_new = build1 (tree_code, type, op1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ else if (code_class == tcc_binary) ++ { ++ tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost); ++ if (op2 == NULL_TREE) ++ return NULL_TREE; ++ ++ tree expr_new = fold_build2 (tree_code, type, op1, op2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "expr_new = "); ++ print_generic_expr (dump_file, expr_new, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return expr_new; ++ } ++ } ++ return NULL_TREE; ++} ++ ++/* Trace and creat dominate loop bounds. */ ++ ++void ++trace_and_create_dominate_loop_bounds (data_ref &mem_ref) ++{ ++ /* Check whether the niters is a loop dominant. ++ If not, trace and determine whether the result is dominant. If yes, ++ create the expr of the dominant node. ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n"); ++ ++ /* Determine the relationship between the boundary of the innermost loop and ++ the dominant of the outer loop and the processing. */ ++ loop_bound &outermost = mem_ref.loop_bounds.back (); ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ loop_bound ¤t = mem_ref.loop_bounds[i]; ++ tree &niters = current.niters; ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ ++ niters = parse_and_create_expr (niters, outermost.loop); ++ ++ if (niters == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "Tracing loop bound failed at dimension %d\n", ++ i); ++ } ++ mem_ref.calc_by = UNHANDLE_CALC; ++ break; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ } ++} ++ ++/* trace the dimension and corresponding loop bounds of mem_ref. ++ This function is used to supplement the information of mem_ref.loop_bounds. ++*/ ++ ++void ++trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) ++{ ++ /* In the same loop, some memory access dimensions are different. Remove ++ variables with fewer dimensions. ++ Previous cyclic filtering conditions and memory access node records and ++ tracing. ++ The false result is also processed. ++ */ ++ if (dump_file) ++ fprintf (dump_file, "\ncalculate_data_size\n"); ++ ++ /* Trace the loop bound iv of ref to determine the dimension. */ ++ /* Record data from the loop perspective to avoid repeated tracing. */ ++ if (!trace_loop_bound_iv (mem_ref)) ++ return; ++ ++ /* The traced mem_ref may have multiple dimensions, which corresponds to ++ multiple loops. 
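++   Each dimension contributes one loop_bound entry whose niters is resolved
++   below.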
*/ ++ /* And in the dimension-by-dimensional analysis, the computable way is ++ continuously reduced. */ ++ mem_ref.calc_by = STATIC_CALC; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ class loop *loop = mem_ref.loop_bounds[i].loop; ++ tree &niters = mem_ref.loop_bounds[i].niters; ++ ++ /* Set NULL_TREE to ensure that nb_iterations are retraced and ++ vec_nb_iterations are also extracted. */ ++ loop->nb_iterations = NULL_TREE; ++ niters = number_of_latch_executions (loop, false); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_dump (dump_file, loop); ++ ++ if (loop->unroll) ++ { ++ if (loop->unroll == USHRT_MAX && dump_file ++ && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX); ++ mem_ref.loop_bounds[i].unroll = loop->unroll; ++ } ++ ++ if ((niters == chrec_dont_know) && loop->vec_nb_iterations ++ && (loop->vec_nb_iterations != chrec_dont_know)) ++ niters = loop->vec_nb_iterations; ++ ++ if (niters == chrec_dont_know) ++ { ++ /* We derive est_loop_niters from function ++ `estimated_loop_iterations_int`. Usually only the innermost loop is ++ vectorized, so vec_nb_iterations can be 4 or 8 times as large as ++ `est_loop_niters` due to vectorization. However, function ++ `estimated_loop_iterations_int` only returns an integer instead of ++ a tree node expression, so it cannot substitute ++ function `number_of_latch_executions` in runtime computation. */ ++ HOST_WIDE_INT est_loop_niters = estimated_loop_iterations_int (loop); ++ if (est_loop_niters >= 0 && est_loop_niters < INT_MAX) ++ /* e.g., loop iterations from `estimated_loop_iterations_int`: (-1) ++ loop_144 (header = 519, latch = 625, niter = scev_not_known, ++ upper_bound = 1073741823, likely_upper_bound = 1073741823, ++ unroll = 1) */ ++ /* variable `niters` from `loop->vec_nb_iterations` ++ constant 34> */ ++ niters = build_int_cst (integer_type_node, (int) est_loop_niters); ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ ++ if (niters == NULL_TREE || niters == chrec_dont_know) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, UNHANDLE_CALC); ++ else if (TREE_CODE (niters) != INTEGER_CST) ++ mem_ref.calc_by = std::min (mem_ref.calc_by, RUNTIME_CALC); ++ else ++ mem_ref.calc_by = std::min (mem_ref.calc_by, STATIC_CALC); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (mem_ref.calc_by == 2) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nSTATIC_CALC.\n"); ++ } ++ else if (mem_ref.calc_by == 1) ++ { ++ fprintf (dump_file, "\nniters: "); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\nRUNTIME_CALC.\n"); ++ } ++ else ++ fprintf (dump_file, "\nUNHANDLE_CALC.\n"); ++ } ++ } ++ ++ if (mem_ref.calc_by == RUNTIME_CALC) ++ trace_and_create_dominate_loop_bounds (mem_ref); ++ else if (mem_ref.calc_by == STATIC_CALC) ++ static_calculate_data_size (mem_ref); ++} ++ ++/* Get the loop's niters tree. ++ Return NULL_TREE if not found. */ ++ ++tree ++get_cur_loop_niters (std::map > &loop_refs, ++ class loop *loop) ++{ ++ if (loop_refs.count (loop) == 0) ++ return NULL_TREE; ++ std::vector bounds = loop_refs[loop][0].loop_bounds; ++ return bounds.size () ? bounds[0].niters : NULL_TREE; ++} ++ ++/* Trace the sources of the niters tree and return the ++ outermost depth of the loops containing them. ++ Return start_depth if not found. 
++ ++ example: ++ niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 ++ operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 ++ operand_num: 1, subtree:(int) i_end_417 ++ SSA_NAME of niters: i_end_417 ++ gimple of SSA: i_end_417 = PHI ++ return gimple depth; ++*/ ++ ++unsigned ++trace_outer_loop_depth (tree niters, unsigned start_depth) ++{ ++ /* If niter does not exist or the type is INTEGER_CST, ++ the loop bound is determined and return start_depth. */ ++ if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) ++ return start_depth; ++ ++ gimple *def_stmt = NULL; ++ /* niters examples: i_start_452, fEnd_35, fEnd_100. */ ++ enum tree_code niter_code = TREE_CODE (niters); ++ if (niter_code == SSA_NAME) ++ { ++ /* Trace the SSA that define this niter. */ ++ def_stmt = SSA_NAME_DEF_STMT (niters); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ssa_name of niters: "); ++ print_generic_expr (dump_file, niters); ++ fprintf (dump_file, "\ngimple of ssa: \n"); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ /* Termination condition of dfs. Return the depth of the bb block. */ ++ if (gimple_code (def_stmt) == GIMPLE_PHI ++ || gimple_code (def_stmt) == GIMPLE_NOP) ++ { ++ basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return start_depth; ++ unsigned ret_depth = loop_depth (def_bb->loop_father); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Stop tracing the outer loop depth, "); ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", ++ ret_depth, def_bb->index); ++ } ++ return ret_depth; ++ } ++ /* 'ASSIGN': Use dfs to trace the rhs of the assignment statement. */ ++ else if (gimple_code (def_stmt) == GIMPLE_ASSIGN) ++ { ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == TARGET_MEM_REF) ++ /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, ++ offset: 0B] */ ++ return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); ++ else ++ { ++ /* M.218_658 = MIN_EXPR <_631, _657> */ ++ unsigned min_depth = start_depth; ++ unsigned operand_num = gimple_num_ops (def_stmt); ++ /* 'ASSIGN': start from 1 because op[0] is the lhs. */ ++ for (unsigned i = 1; i < operand_num; i++) ++ { ++ tree subtree = dyn_cast(def_stmt)->op[i]; ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, \ ++ start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ } ++ else ++ { ++ /* Adding termination conditions: ++ 1) Niters is MEM variable; ++ 2) Niters is a runtime value (smooth_uPtr), and consider ++ finding footprint in other mem_ref; ++ 3) Niters is loop variable (i_start/i_end), and the boundary in ++ the outer loop depends on the variable j_start/j_end. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The loop termination condition is " ++ "extended.\n"); ++ } ++ return start_depth; ++ } ++ } ++ /* The operand nums can be obtained when the tree code is as follows. 
*/ ++ else if (niter_code == NOP_EXPR || niter_code == MEM_REF ++ || niter_code == ARRAY_REF || niter_code == COND_EXPR ++ || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR ++ || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) ++ { ++ /* operand_num is the operand in the niters statement. ++ example: In the following niter statement, operand_num = 3. ++ (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ ++ unsigned operand_num = TREE_OPERAND_LENGTH (niters); ++ unsigned min_depth = start_depth; ++ for (unsigned i = 0; i < operand_num; i++) ++ { ++ tree subtree = TREE_OPERAND (niters, i); ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "niters is another tree code: %s\n", ++ get_tree_code_name (niter_code)); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return start_depth; ++ } ++} ++ ++/* Traces the ref dimension information in each loop. */ ++ ++void ++analyze_loop_refs_dimension (std::vector &refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (refs[i].trace_status_p == false) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_reference_dimension %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (refs[i]); ++ } ++} ++ ++/* analyze nested kernels ++ 1) multidimension loop analyze ++ 2) extended outer loop analyze ++*/ ++ ++bool ++analyze_nested_kernels (std::vector &kernels, ++ std::map > &loop_refs, ++ std::set &traced_ref_stmt, ++ std::vector &unresolved_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); ++ ++ /* `kernels` may be added in during outer loop extension phase, ++ thus using initial size to avoid repeatedly analyzing. */ ++ unsigned init_kernels_size = kernels.size (); ++ for (unsigned i = 0; i < init_kernels_size; ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ analyze_loop_refs_dimension (loop_refs[loop]); ++ ++ unsigned depth = loop_depth (loop); ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, loop), depth); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", ++ depth, outer_depth); ++ /* param_outer_loop_num: number of loops of the extended outer loop. ++ Outermost loop should not be extended when outer_depth = 0. ++ `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == 0 || outer_depth == depth ++ || depth > outer_depth + param_outer_loop_num) ++ continue; ++ ++ /* Extend outer loop. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nStart extending outer loop\n"); ++ /* Superloops of the loop, start from the loop closest to the ++ current loop in the outermost loop. */ ++ for (int j = 0; j < param_outer_loop_num && --depth; ++j) ++ { ++ class loop *outer_loop = (*loop->superloops)[depth]; ++ /* The outer loop may be added when analyzing previous inner loops, ++ i.e. 
the outer loop contains two or more inner loops. */ ++ if (loop_refs.count (outer_loop)) ++ continue; ++ /* phase1 ~ phase3 analysis on the extended outer loop. */ ++ analyze_loop_dense_memory (kernels, loop_refs, outer_loop); ++ if (loop_refs.count (outer_loop) == 0) ++ continue; ++ for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt, ++ unresolved_refs); ++ analyze_loop_refs_dimension (loop_refs[outer_loop]); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters ++ (loop_refs, outer_loop), depth); ++ /* `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == depth) ++ break; ++ else ++ /* The outer loop cannot find the current loop boundary, ++ Remove the record of outer_loop from the loop_refs. */ ++ loop_refs.erase (outer_loop); ++ } ++ } ++ return true; ++} ++ ++/* ================ phase 4 filter_and_sort_kernels ================ */ ++ ++/* Get the edge probability information of each basic block in the loop. */ ++ ++float ++get_edge_prob (edge e, float minimum) ++{ ++ float fvalue = 0; ++ ++ profile_probability probability = e->probability; ++ if (probability.initialized_p ()) ++ { ++ fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE); ++ if (fvalue < minimum && probability.to_reg_br_prob_base ()) ++ fvalue = minimum; ++ } ++ return fvalue; ++} ++ ++/* Get the next bb with a high branch probability. */ ++ ++basic_block ++next_high_probability_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ return NULL; ++ ++ /* Limit the minimum probability value. */ ++ const float MINNUM_PROB = 0.00001f; ++ float minimum = MINNUM_PROB; ++ ++ gimple *stmt = last_stmt (bb); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ float true_edge_prob = get_edge_prob (true_edge, minimum); ++ float false_edge_prob = get_edge_prob (false_edge, minimum); ++ /* If the content of the branch does not include the candidate ++ kernel, the branch probability may not be limited. */ ++ /* The edge_prob may have precision error during static prediction, ++ so we need to relax the limit before comparison. */ ++ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) ++ return true_edge->dest; ++ else if ((false_edge_prob ++ >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, false_edge->dest)) ++ return false_edge->dest; ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No high probability bb:"); ++ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", ++ bb->index, true_edge_prob, false_edge_prob); ++ } ++ return NULL; ++ } ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ return e->dest; ++ } ++ return NULL; ++} ++ ++ ++/* Dump loop header bb. 
*/ ++ ++void ++dump_loop_headers (const char *name, std::vector &loops) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } ++} ++ ++/* Combine and sort candidate loops. */ ++ ++bool ++filter_and_sort_kernels (std::vector &sorted_kernels, ++ std::vector &kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set end_bb; ++ std::list walked_header_bb; /* Used to record nested loops. */ ++ std::set walked_non_header_bb_idx; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ if (kernels[i]->inner == NULL) ++ end_bb.insert (kernels[i]->header); ++ } ++ ++ dump_loop_headers ("kernels", kernels); ++ ++ if (!param_filter_kernels) ++ { ++ for (std::vector::iterator it = kernels.begin (); ++ it != kernels.end (); ++it) ++ sorted_kernels.push_back (*it); ++ } ++ else ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ while (bb) ++ { ++ if (bb == NULL) ++ return false; ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ /* bb is not the head of the loop, go to the next. */ ++ if (bb != bb->loop_father->header) ++ { ++ if (walked_non_header_bb_idx.count (bb->index)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find same-loop cycle. " ++ "Abort filtering process.\n"); ++ return false; ++ } ++ walked_non_header_bb_idx.insert (bb->index); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ ++ /* bb is the head of the loop. */ ++ if (bb != walked_header_bb.back ()) ++ { ++ if (end_bb.count (bb)) ++ { ++ sorted_kernels.push_back (bb->loop_father); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ if (loop_outer (bb->loop_father) != NULL ++ && get_loop_exit_edges (bb->loop_father).length () != 1) ++ return false; ++ walked_header_bb.push_back (bb); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ else ++ { ++ walked_header_bb.pop_back (); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ } ++ } ++ ++ dump_loop_headers ("sorted_kernels", sorted_kernels); ++ return true; ++} ++ ++/* Check whether the given bb is null. */ ++ ++bool ++check_null_bb (basic_block bb) ++{ ++ if (bb == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Unexpected error at null bb.\n"); ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the loop father of the given bb is null. */ ++ ++bool ++check_null_loop_father (basic_block bb) ++{ ++ if (check_null_bb (bb)) ++ return true; ++ ++ if (bb->loop_father == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "bb %d's loop father is null.\n", bb->index); ++ return true; ++ } ++ return false; ++} ++ ++/* States for bb during path traversal. */ ++ ++enum bb_traversal_state ++{ ++ NOT_TRAVERSED = 0, ++ UNDER_TRAVERSAL, ++ FULLY_TRAVERSED ++}; ++ ++/* Detect abnormal revisit for bb during path traversal where bb is ++ 1) fully traversed, ++ 2) non-loop-header bb but currently under traversal. 
*/ ++ ++bool ++revisit_bb_abnormal_p (basic_block bb, std::vector &bb_visited, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* If the header bb has been already fully traversed, early exit ++ the function. */ ++ if (bb_visited[bb->index] == FULLY_TRAVERSED) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Already visited bb index %d. Abort.\n", ++ bb->index); ++ return true; ++ } ++ ++ /* If we revisit a non-header bb during next-bb traversal, we detect ++ an inner-loop cycle and dump warning info. Record this abnormal edge ++ in `unused_edges` for special treatment in path weight update. */ ++ if (!header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Warning: Find cycle at bb index %d. Abort.\n", ++ bb->index); ++ unused_edges.insert (std::make_pair (src_bb_idx, bb->index)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Check successor bb through edge e. Return true if successor bb is NULL or ++ out of loop. */ ++ ++bool ++check_succ_bb_abnormal_p (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb connected to src bb %d.\n", bb->index); ++ ++ return true; ++ } ++ ++ /* If bb is within one loop and the edge is pointing to the ++ outer loop, skip edge processing until a backedge to header ++ bb. `loop->num = 0` represents function body. */ ++ if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find edges to the outer loop at bb " ++ "index %d to bb index %d. Abort.\n", ++ bb->index, e->dest->index); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Criteria for retrieving the next bb in modified control-flow graph, which ++ creates a topological order for the bb traversal. */ ++ ++void ++get_next_toposort_bb (basic_block bb, std::vector &bb_visited, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ std::set > &unused_edges, ++ int src_bb_idx) ++{ ++ /* 1) Before bb returns to the loop header, bb will not go to the outer loop. ++ 2) After returning to the loop header, traverse all exit_bbs. ++ NEXT STEP: ++ 1) If goto jumps out of 2 loops, goto has to traverse smaller jumps first. ++ 2) If path length is the same => choose higher depth traversal path. */ ++ if (check_null_bb (bb) || check_null_loop_father (bb)) ++ return; ++ ++ /* Find last bb of function. */ ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ return; ++ ++ if (revisit_bb_abnormal_p (bb, bb_visited, header_bb_idx_set, unused_edges, ++ src_bb_idx)) ++ return; ++ ++ /* If we revisit the header bb of a loop, traverse all exit bbs. */ ++ if (header_bb_idx_set.count (bb->index) ++ && bb_visited[bb->index] == UNDER_TRAVERSAL) ++ { ++ unsigned i; ++ edge e; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ ++ if (exits.length () > 1 && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Detect multiple exits at loop %d.\n", ++ bb->loop_father->num); ++ ++ FOR_EACH_VEC_ELT (exits, i, e) ++ { ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, src_bb_idx); ++ } ++ return; ++ } ++ ++ /* Post-order traversal for normal bb. 
*/ ++ bb_visited[bb->index] = UNDER_TRAVERSAL; ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_succ_bb_abnormal_p (bb, e)) ++ continue; ++ ++ get_next_toposort_bb (e->dest, bb_visited, bb_topo_order, ++ header_bb_idx_set, unused_edges, bb->index); ++ } ++ ++ /* bb is marked as fully traversed and all its descendents have been ++ fully traversed due to post-order traversal. */ ++ bb_visited[bb->index] = FULLY_TRAVERSED; ++ bb_topo_order.push_back (bb); ++} ++ ++/* A struct that represents the longest path weight at each bb. */ ++ ++struct weight ++{ ++ /* Longest path weight at current bb. */ ++ gcov_type bb_count; ++ ++ /* Prev bb from the current longest path. */ ++ int prev_bb_idx; ++}; ++ ++/* A helper function for checking whether overflow will occur when adding two ++ gcov_type weights. */ ++ ++bool ++check_weight_overflow (gcov_type a, gcov_type b) ++{ ++ if ((a > 0 && b > INT64_MAX - a) || (a < 0 && b < INT64_MIN - a)) ++ return true; ++ ++ return false; ++} ++ ++/* A helper function that update the weight of the current longest path to ++ bb_idx_dst and a new path pointing from bb_idx_src to bb_idx_dst. */ ++ ++void ++update_path_weight (std::vector &bb_weights, int bb_idx_src, ++ int bb_idx_dst, gcov_type weight_dst) ++{ ++ if (check_weight_overflow (bb_weights[bb_idx_src].bb_count, weight_dst) ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Path weight overflow at src bb %d " ++ "and dest bb %d.\n", ++ bb_idx_src, bb_idx_dst); ++ } ++ if (bb_weights[bb_idx_dst].bb_count ++ < bb_weights[bb_idx_src].bb_count + weight_dst) ++ { ++ bb_weights[bb_idx_dst].bb_count ++ = bb_weights[bb_idx_src].bb_count + weight_dst; ++ bb_weights[bb_idx_dst].prev_bb_idx = bb_idx_src; ++ } ++} ++ ++/* Check whether the required bb/loop info for path update is null. */ ++ ++bool ++check_null_info_in_path_update (basic_block bb, edge e) ++{ ++ if (check_null_bb (e->dest)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for edge connected " ++ "to src bb %d.\n", ++ bb->index); ++ return true; ++ } ++ ++ if (check_null_loop_father (bb) || check_null_loop_father (e->dest)) ++ return true; ++ ++ return false; ++} ++ ++/* Update path weight to loop exit bbs where the current source bb is connected ++ to header bb using a backedge. */ ++ ++void ++update_backedge_path_weight (std::vector &bb_weights, basic_block bb, ++ const std::set > &unused_edges) ++{ ++ unsigned i; ++ edge e_exit; ++ auto_vec exits = get_loop_exit_edges (bb->loop_father); ++ FOR_EACH_VEC_ELT (exits, i, e_exit) ++ { ++ if (check_null_bb (e_exit->dest)) ++ { ++ if (e_exit->src != NULL && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Null bb detected for exiting edge " ++ "connected to src bb %d.\n", ++ e_exit->src->index); ++ continue; ++ } ++ ++ if (unused_edges.count (std::make_pair (bb->index, e_exit->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ update_path_weight (bb_weights, bb->index, e_exit->dest->index, ++ e_exit->dest->count.to_gcov_type ()); ++ } ++} ++ ++/* Update the longest length of the path through control flow graph. 
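As a minimal standalone model of the relaxation performed here (the graph is assumed acyclic and visited in the topological order produced earlier; weights stand in for basic-block counts and every name is illustrative):

    #include <cstdint>
    #include <vector>

    struct toy_weight
    {
      int64_t count;   /* Best (heaviest) path weight ending at this node.  */
      int prev;        /* Predecessor on that path, -1 if none yet.  */
    };

    /* Relax one edge src -> dst whose destination contributes dst_count,
       keeping whichever path to dst is heavier.  */
    static void
    toy_relax (std::vector<toy_weight> &w, int src, int dst, int64_t dst_count)
    {
      if (w[dst].count < w[src].count + dst_count)
        {
          w[dst].count = w[src].count + dst_count;
          w[dst].prev = src;
        }
    }

Because nodes are taken in topological order, each source weight is final before its outgoing edges are relaxed, and the heaviest path is recovered afterwards by walking the prev indices backwards, which is what the backtracking step further down relies on.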
*/ ++ ++void ++update_max_length_of_path (std::vector &bb_weights, ++ std::list &bb_topo_order, ++ const std::set &header_bb_idx_set, ++ const std::set > &unused_edges) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start update weight traversal:\n"); ++ ++ while (!bb_topo_order.empty ()) ++ { ++ basic_block bb = bb_topo_order.back (); ++ bb_topo_order.pop_back (); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (check_null_info_in_path_update (bb, e)) ++ continue; ++ ++ if (unused_edges.count (std::make_pair (bb->index, e->dest->index))) ++ { ++ /* Inner-loop-cycle backedge case. */ ++ continue; ++ } ++ else if (bb->loop_father->num != 0 ++ && !flow_bb_inside_loop_p (bb->loop_father, e->dest)) ++ { ++ /* Outer-loop edge case. */ ++ continue; ++ } ++ else if (header_bb_idx_set.count (e->dest->index) ++ && bb->loop_father == e->dest->loop_father) ++ { ++ /* Backedge case. */ ++ update_backedge_path_weight (bb_weights, bb, unused_edges); ++ } ++ else ++ { ++ /* Normal edge case. */ ++ update_path_weight (bb_weights, bb->index, e->dest->index, ++ e->dest->count.to_gcov_type ()); ++ } ++ } ++ } ++} ++ ++/* Collect all header bb of loops in the function beforehand. */ ++ ++void ++collect_header_bb_for_fn (std::set &header_bb_idx_set) ++{ ++ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) ++ header_bb_idx_set.insert (loop->header->index); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck header bbs:\n"); ++ for (std::set::iterator it = header_bb_idx_set.begin (); ++ it != header_bb_idx_set.end (); ++it) ++ fprintf (dump_file, "%d ", *it); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Record loop executing order and bb high-executing path. */ ++ ++void ++record_high_execution_path (std::vector &sorted_kernel, ++ std::vector &bb_path, int bb_num_max) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPATH FOR %s: ", get_name (cfun->decl)); ++ ++ std::set loop_set; ++ for (int i = bb_path.size() - 1; i >= 0; --i) ++ { ++ int bb_idx = bb_path[i]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb_idx); ++ gcc_assert (bb_idx < bb_num_max); ++ ++ class loop *loop = BASIC_BLOCK_FOR_FN (cfun, bb_idx)->loop_father; ++ if (!loop_set.count (loop->num)) ++ { ++ loop_set.insert (loop->num); ++ sorted_kernel.push_back (loop); ++ } ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n"); ++} ++ ++/* Combine and sort candidate loops using feedback information. */ ++ ++bool ++filter_and_sort_kernels_feedback (std::vector &sorted_kernel, ++ std::set &bb_pathset) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ std::set header_bb_idx_set; ++ std::list bb_topo_order; ++ ++ /* Quoted from GCC internal, Chapter 15.1, "the index for any block should ++ never be greater than `last_basic_block`." Therefore, we use this ++ variable for retrieving the max bb index of a function. */ ++ /* Since the pass does not add/remove/merge basic blocks until Phase 6 ++ and previous passes will update ssa accordingly, we do not need to ++ `compact_blocks` to update bb indices currently. 
*/ ++ int bb_num_max = last_basic_block_for_fn (cfun) + 1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nMaximal number of possible bbs in the " ++ "function: %d\n", ++ bb_num_max); ++ std::vector bb_visited = std::vector(bb_num_max, 0); ++ ++ collect_header_bb_for_fn (header_bb_idx_set); ++ basic_block bb_start = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ /* Step 1: Get topological order of bb during traversal. */ ++ std::set > unused_edges; ++ get_next_toposort_bb (bb_start, bb_visited, bb_topo_order, header_bb_idx_set, ++ unused_edges, -1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nCheck bbs in topological order:\n"); ++ for (std::list::iterator it = bb_topo_order.begin (); ++ it != bb_topo_order.end (); ++it) ++ fprintf (dump_file, "%d ", (*it)->index); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Step 2: Update weights of nodes and path. */ ++ weight weight_init = {-1, -1}; ++ std::vector bb_weights = std::vector(bb_num_max, weight_init); ++ bb_weights[0].bb_count = 0; /* ENTRY bb has count 0 and prev bb as -1. */ ++ update_max_length_of_path (bb_weights, bb_topo_order, header_bb_idx_set, ++ unused_edges); ++ ++ /* Step 3: Backtrack a path from EXIT bb to ENTRY bb. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nCheck counts for each bb:\n"); ++ ++ std::vector bb_path; ++ int tmp_bb_idx = 1; ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ while (tmp_bb_idx > 0 && tmp_bb_idx < bb_num_max) ++ { ++ if (bb_pathset.count (tmp_bb_idx)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf(dump_file, "ERROR: already seen bb index %d\n", ++ tmp_bb_idx); ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d: %ld, ", tmp_bb_idx, ++ bb_weights[tmp_bb_idx].bb_count); ++ bb_pathset.insert (tmp_bb_idx); ++ bb_path.push_back (tmp_bb_idx); ++ tmp_bb_idx = bb_weights[tmp_bb_idx].prev_bb_idx; ++ } ++ /* It is possible that the function exit code is wrapped around as an ++ variable, and thus, EXIT_BB in cfg is not connected to any bb. */ ++ if (tmp_bb_idx < 0 || tmp_bb_idx >= bb_num_max) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled scenario at backtracking highly " ++ "executed path with tmp_bb_idx %d", ++ tmp_bb_idx); ++ } ++ return false; ++ } ++ ++ record_high_execution_path (sorted_kernel, bb_path, bb_num_max); ++ ++ return true; ++} ++ ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; ++ ++/* Memory reference group, different reference of the same variable. */ ++ ++struct ref_group ++{ ++ /* source variables. */ ++ tree var; ++ ++ /* variable size, Unit: MB. */ ++ double var_size; ++ ++ /* first ref for insert hint. */ ++ data_ref first_use; ++ ++ /* first ref with the highest-order CALC. */ ++ data_ref first_calc_use; ++ ++ /* reuse scores of variables. */ ++ float reuse_level; ++ ++ /* method of calculating the var size. */ ++ calc_type calc_by; ++ ++ /* memory reference index for specific variable. */ ++ unsigned int mem_ref_index; ++ ++ /* variable dimension. 
*/ ++ unsigned int dim; ++ ++ /* True if first_calc_use's footprint replaces that of first_use. */ ++ unsigned int transfer_ft; ++ ++ /* Accessing Reference Records in Different Modes (key_index): ++ 000: write, random, non-parallel ++ 001: write, random, parallel ++ 010: write, regular, non-parallel ++ 011: write, regular, parallel ++ 100: read, random, non-parallel ++ 101: read, random, parallel ++ 110: read, regular, non-parallel ++ 111: read, regular, parallel ++ */ ++ std::map > ref_use; ++ ++ /* scores for different memory references. */ ++ std::vector ref_scores; ++ ++ ref_group () ++ { ++ var = NULL_TREE; ++ var_size = 0; ++ reuse_level = 0; ++ calc_by = UNHANDLE_CALC; ++ mem_ref_index = 0; ++ dim = 1; ++ transfer_ft = 0; ++ } ++}; ++ ++/* Get the integer part for log(x) with the given base. */ ++ ++static unsigned int ++flog (float x, float base) ++{ ++ unsigned int res = 0; ++ while (x >= base) ++ { ++ ++res; ++ x /= base; ++ } ++ return res; ++} ++ ++/* Calculate reuse time for a memory reference in ref_group. */ ++ ++float ++calculate_reuse_times (std::vector &mem_refs, std::set &loop_set, ++ std::set &bb_set, unsigned int var_dim) ++{ ++ const float SAME_BB_REUSE_WEIGHT = 0.1; ++ const float SAME_LOOP_REUSE_WEIGHT = 0.5; ++ const float NORMAL_REUSE_WEIGHT = 1.; ++ ++ float reuse_time_sum = 0.; ++ for (std::vector::iterator it = mem_refs.begin (); ++ it != mem_refs.end (); ++it) ++ { ++ const data_ref &mem_ref = *it; ++ float reuse_time = 0.; ++ if (bb_set.count (mem_ref.bb_idx)) ++ { ++ /* If the two mem_ref belong to the same bb, the new reuse ++ weight will not exceed 0.1 divided by the mem_ref mode group ++ size. ++ NEXT STEP: The following equation may hold and cause commutative ++ property of read and write op not holding: ++ write + (reused) read != read + (reused) write. ++ However, it seems that write mem_ref is always before read mem_ref, ++ so the above comparison does not show up in calculation due to ++ intrinsic in-order property of tree map, but this condition is ++ quite fragile anyway. */ ++ reuse_time = SAME_BB_REUSE_WEIGHT / mem_refs.size (); ++ } ++ else ++ { ++ bb_set.insert (mem_ref.bb_idx); ++ if (loop_set.count (mem_ref.loop_idx)) ++ { ++ /* If the mem_ref belongs to a loop where any other mem_ref is in, ++ the new reuse weight will be 0.5. */ ++ reuse_time = SAME_LOOP_REUSE_WEIGHT; ++ } ++ else ++ { ++ /* If the mem_ref is reused but not in the same group with any ++ other mem_ref, the new reuse weight will be 1. */ ++ loop_set.insert (mem_ref.loop_idx); ++ reuse_time = NORMAL_REUSE_WEIGHT; ++ } ++ } ++ unsigned int used_dim = std::min (mem_ref.loop_depth, var_dim); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "used_dim : %u, loop_depth : %u\n", used_dim, ++ mem_ref.loop_depth); ++ unsigned int power = flog (std::max (0u, mem_ref.loop_depth - used_dim) ++ + 2, 2.); ++ reuse_time_sum += reuse_time * (used_dim * used_dim / 2.) * (power); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "(%f * (%u * %u / 2) * (%u) = %f\n", ++ reuse_time, used_dim, used_dim, power, ++ reuse_time * (used_dim * used_dim / 2.) * (power)); ++ } ++ return reuse_time_sum; ++} ++ ++/* Calculate reuse level. 
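To make the weighting above concrete before it is combined per group below, a small worked example under assumed inputs (every value here is illustrative, not a default):

    #include <algorithm>
    #include <cstdio>

    /* Same integer log as flog above, repeated so the example is
       self-contained.  */
    static unsigned
    toy_flog (float x, float base)
    {
      unsigned res = 0;
      while (x >= base)
        {
          ++res;
          x /= base;
        }
      return res;
    }

    int main ()
    {
      /* Hypothetical reference: depth-3 loop nest, 2-dimensional variable,
         first use in a loop not seen before (reuse weight 1.0).  */
      unsigned loop_depth = 3, var_dim = 2;
      unsigned used_dim = std::min (loop_depth, var_dim);
      unsigned power = toy_flog (std::max (0u, loop_depth - used_dim) + 2, 2.);
      float term = 1.0f * (used_dim * used_dim / 2.f) * power;
      printf ("reuse term = %f\n", term);   /* 1.0 * 2 * 1 = 2.0 */
      return 0;
    }

A second reference from the same bb in a two-element group would add only 0.1 / 2 * 2 * 1 = 0.1, reflecting how strongly same-bb reuse is discounted.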
*/ ++ ++float ++calculate_reuse_level (std::map > &var_use, ++ unsigned int var_dim, double var_size) ++{ ++ const float VAR_SIZE_CACHE_CAPACITY = 1 / 4.; ++ const int WITHIN_CACHE_SIZE_COST = 4; ++ const float BYTE_CONVERT_RATIO = 1024.; ++ ++ float level = 0.; ++ std::set loop_set; ++ std::set bb_set; ++ bool has_write_op = false; ++ for (std::map >::iterator it = var_use.begin (); ++ it != var_use.end (); ++it) ++ { ++ unsigned int parallel = 1; ++ unsigned int regular = 1; ++ ++ if ((*it).second[0].parallel_p) ++ parallel = PARALLEL_NUM; ++ if (!(*it).second[0].regular_p) ++ regular = INDIRECT_ACCESS_VALUE; ++ if (!(*it).second[0].read_p) ++ has_write_op = true; ++ ++ /* In serial reuse, we will later check whether they are in the ++ same cacheline. If yes, delete the reuse. For details, see the ++ reuse analysis of prefetching and eliminate redundancy. */ ++ float reuse_times = calculate_reuse_times ((*it).second, loop_set, ++ bb_set, var_dim); ++ float add = parallel * reuse_times * regular; ++ level += add; ++ if (add && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d : %d * %f * %d = %f\n", ++ (*it).first, parallel, reuse_times, regular, add); ++ } ++ ++ bool within_llc_size = var_size > param_l2_cache_size / BYTE_CONVERT_RATIO ++ && var_size < VAR_SIZE_CACHE_CAPACITY ++ * param_llc_capacity_per_core; ++ ++ float final_level = has_write_op ? (level * WRITE_COST) : level; ++ final_level = within_llc_size ? (final_level * WITHIN_CACHE_SIZE_COST) ++ : final_level; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "final level : %d * %f * %d = %f\n", ++ has_write_op ? WRITE_COST : 1, level, ++ within_llc_size ? WITHIN_CACHE_SIZE_COST : 1, final_level); ++ return final_level; ++} ++ ++/* Comparison of reference reuse level. */ ++ ++bool ++ref_group_reuse_cmp (const ref_group &a, const ref_group &b) ++{ ++ if (a.reuse_level != b.reuse_level) ++ return a.reuse_level > b.reuse_level; ++ else ++ return get_name (a.var) < get_name (b.var); ++} ++ ++/* Dump key information of reference group and memory access for llc hint. */ ++ ++void ++dump_key_info_for_llc_hint (std::vector &ref_groups) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nLLC hint info:\n"); ++ fprintf (dump_file, "rank\tvar\t(lineno, direct, vectorized, write)\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ data_ref &mem_ref = ref_groups[i].first_use; ++ fprintf (dump_file, "\t(%d, %u, %u, %u)", ++ expand_location (mem_ref.stmt->location).line, ++ mem_ref.regular_p, mem_ref.vectorize_p, 1 - mem_ref.read_p); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Sort reference groups. 
*/ ++ ++void ++sort_ref_groups (std::vector &ref_groups, ++ std::map &ref_groups_map) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); ++ ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use, ++ (*it).second.dim, ++ (*it).second.var_size); ++ ref_groups.push_back ((*it).second); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); ++ fprintf (dump_file, " : %f\n\n", (*it).second.reuse_level); ++ } ++ } ++ ++ std::sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nsorted ref_groups:\n"); ++ fprintf (dump_file, "rank\tvar\t(data_size, dim, num_of_mem_ref, " ++ "need_tmp_name): reuse_level_score\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d\t", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ int need_tmp_name = !get_name (ref_groups[i].var) ? 1 : 0; ++ fprintf (dump_file, "\t(%lf, %u, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].dim, ref_groups[i].ref_scores.size (), ++ need_tmp_name); ++ fprintf (dump_file, " : %f\n", ref_groups[i].reuse_level); ++ } ++ fprintf (dump_file, "\n"); ++ ++ fprintf (dump_file, "first_use:\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ fprintf (dump_file, " : "); ++ if (!ref_groups[i].first_use.vectorize_p) ++ print_generic_expr (dump_file, ref_groups[i].first_use.ref, ++ TDF_SLIM); ++ else ++ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ dump_key_info_for_llc_hint (ref_groups); ++} ++ ++/* Attributes of variable data. */ ++ ++enum data_attribute ++{ ++ DA_PARALLEL = 0, ++ DA_REGULAR, ++ DA_READ ++}; ++ ++/* Record memory reference by use mode. ++ If the reference group is not found, create a group. */ ++ ++void ++record_mem_ref (std::map &ref_groups, data_ref &mem_ref) ++{ ++ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) ++ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); ++ ++ if (!ref_groups.count (mem_ref.var)) ++ { ++ ref_group ref_group; ++ ref_group.var = mem_ref.var; ++ ref_group.first_use = mem_ref; ++ ref_group.first_calc_use = mem_ref; ++ ref_groups[mem_ref.var] = ref_group; ++ } ++ ++ /* Ref_groups' calc_by reflects the highest order of calc_by that can be ++ achieved by all mem_ref of ref_groups. The first mem_ref that achieves ++ this order is defined to be `first_calc_use`. Later after sorting ++ mem_refs, calc_by will be replaced by the calc_by of `first_use`, and ++ even by the calc_by of `first_calc_use`. 
*/ ++ if (mem_ref.calc_by > ref_groups[mem_ref.var].calc_by) ++ { ++ ref_groups[mem_ref.var].calc_by = mem_ref.calc_by; ++ ref_groups[mem_ref.var].first_calc_use = mem_ref; ++ } ++ ref_groups[mem_ref.var].var_size = std::max (ref_groups[mem_ref.var].var_size, ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].dim = std::max (ref_groups[mem_ref.var].dim, ++ (unsigned int) mem_ref.loop_bounds.size ()); ++ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); ++ ++ ref_score ref_level = { mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ expand_location (mem_ref.stmt->location).line }; ++ ref_groups[mem_ref.var].ref_scores.push_back (ref_level); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "recorded in: "); ++ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); ++ fprintf (dump_file, ":%d:%ld\n", index, ++ ref_groups[mem_ref.var].ref_use[index].size () - 1); ++ ++ fprintf (dump_file, "base: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ ++ fprintf (dump_file, ", index: "); ++ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); ++ ++ fprintf (dump_file, ", step: "); ++ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.step)); ++ else ++ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); ++ ++ fprintf (dump_file, ", offset: "); ++ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.offset)); ++ else ++ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); ++ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); ++ ++ fprintf (dump_file, ", size: %lf", mem_ref.data_size); ++ fprintf (dump_file, "\n\n"); ++ } ++} ++ ++/* Rank data reference index level. */ ++ ++bool ++best_insert_cmp (const ref_score &a, const ref_score &b) ++{ ++ /* NEXT STEP: We can also calculate gap using static/feedback info inferred ++ from historical maximum bb count: ++ gap = hist_max_bb_ct / (alpha * max (a.bb_ct, b.bb_ct)) + 1. ++ Also, bb count needs to be smoothed and scaled as divisor can be 0. ++ history maximum bb count can be obtained in Phase 4. */ ++ const float gap = 1; ++ if (a.d_ref.loop_depth != b.d_ref.loop_depth) ++ return a.d_ref.loop_depth > b.d_ref.loop_depth; ++ else if (a.d_ref.regular_p != b.d_ref.regular_p) ++ return a.d_ref.regular_p > b.d_ref.regular_p; ++ else if (abs (double (std::max (a.bb_count, b.bb_count) + 1) ++ / double (std::min (a.bb_count, b.bb_count) + 1) - 1) > gap) ++ return a.bb_count > b.bb_count; ++ else if (a.line != b.line) ++ return a.line < b.line; ++ else if (a.d_ref.read_p != b.d_ref.read_p) ++ return a.d_ref.read_p < b.d_ref.read_p; ++ else ++ return a.d_ref.vectorize_p > b.d_ref.vectorize_p; ++} ++ ++/* Sort data reference index level within one reference group in non-decreasing ++ order of the customized sorting scheme. */ ++ ++void ++sort_mem_ref_in_ref_group (std::map &ref_groups_map) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nsorted data_references:\n"); ++ for (std::map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ ref_group &curr_ref_group = (*it).second; ++ std::vector &ref_scores = curr_ref_group.ref_scores; ++ std::stable_sort (ref_scores.begin (), ref_scores.end (), ++ best_insert_cmp); ++ /* Update ref_group's first_use and calc_by with the first mem_ref after ++ sorting. 
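The ranking that chooses this first mem_ref treats two bb counts as equal when they are close; a standalone sketch of that tolerance test (the plus-one smoothing follows the comparison above, the names are illustrative):

    #include <cmath>
    #include <cstdint>

    /* Counts only decide the order when the larger exceeds the smaller by
       more than `gap` in relative terms; +1 keeps the divisor non-zero.  */
    static bool
    toy_counts_differ (int64_t a, int64_t b, double gap)
    {
      int64_t hi = a > b ? a : b;
      int64_t lo = a > b ? b : a;
      return std::fabs (double (hi + 1) / double (lo + 1) - 1) > gap;
    }

With the gap of 1 used above, counts are only ranked by magnitude when one is more than roughly twice the other; otherwise the tie falls through to the earlier source line.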
*/ ++ curr_ref_group.first_use = curr_ref_group.ref_scores[0].d_ref; ++ curr_ref_group.calc_by = curr_ref_group.first_use.calc_by; ++ ++ /* When transferring footprint is enabled, it is allowed to transfer ++ the statically-calculated footprint of a mem_ref from the same ++ ref_group to `first_use` mem_ref. */ ++ if (param_transfer_footprint ++ && curr_ref_group.first_use.calc_by == UNHANDLE_CALC) ++ { ++ if (curr_ref_group.first_calc_use.calc_by > RUNTIME_CALC) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, "\nfirst_use: "); ++ print_gimple_stmt (dump_file, curr_ref_group.first_use.stmt, ++ 0, TDF_LINENO); ++ fprintf (dump_file, "first_calc_use: "); ++ print_gimple_stmt (dump_file, ++ curr_ref_group.first_calc_use.stmt, ++ 0, TDF_LINENO); ++ } ++ ++ curr_ref_group.calc_by = curr_ref_group.first_calc_use.calc_by; ++ curr_ref_group.transfer_ft = 1; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, ": cannot transfer footprint to " ++ "first use mem_ref.\n"); ++ } ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).first, TDF_SLIM); ++ fprintf (dump_file, " : %lu\n", ref_scores.size ()); ++ for (unsigned int i = 0; i < ref_scores.size (); ++i) ++ { ++ fprintf (dump_file, "mem_ref_index %u: ", i); ++ print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0, ++ TDF_LINENO); ++ fprintf (dump_file, "bb-%d ", ++ ref_scores[i].d_ref.stmt->bb->index); ++ fprintf (dump_file, "count %ld\n", ref_scores[i].bb_count); ++ } ++ fprintf (dump_file, "\n\n"); ++ } ++ } ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++bool ++record_and_sort_ref_groups (std::vector &ref_groups, ++ std::vector &kernels, ++ std::map > &loop_refs, ++ std::set bb_pathset) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n"); ++ ++ std::map ref_groups_map; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ data_ref &mem_ref = loop_refs[loop][j]; ++ if (mem_ref.trace_status_p) ++ { ++ if (!param_filter_mode || (param_filter_mode ++ && bb_pathset.count (mem_ref.stmt->bb->index))) ++ record_mem_ref (ref_groups_map, mem_ref); ++ } ++ } ++ } ++ ++ /* Sort mem_ref within ref_group by local count and update first_use's ++ data_ref, stable sort. */ ++ sort_mem_ref_in_ref_group (ref_groups_map); ++ sort_ref_groups (ref_groups, ref_groups_map); ++ ++ return ref_groups.size () > 0; ++} ++ ++/* ================ phase 6 issue_llc_hint ================ */ ++ ++/* Issue vectorized mask prefetch gimple. 
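Two details of the insertion below are easy to state in isolation: the prefetch operation encoding picked from the configured LLC level, and the byte distance the hint runs ahead of the access. A standalone sketch (the element size and the element offset are hypothetical, not the parameter defaults):

    #include <cstdint>
    #include <cstdio>

    /* Map the configured LLC level to the prefetch operation encoding used
       here: 4 is PLDL3KEEP, 6 is PLDL4KEEP, anything else is rejected.  */
    static int
    toy_prfop_for_level (int llc_level)
    {
      if (llc_level == 3)
        return 4;
      if (llc_level == 4)
        return 6;
      return -1;
    }

    int main ()
    {
      /* e.g. hint 1024 elements ahead on an 8-byte element type.  */
      uint64_t distance = 1024 * 8;
      printf ("prfop %d, %llu bytes ahead\n", toy_prfop_for_level (4),
              (unsigned long long) distance);
      return 0;
    }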
*/ ++ ++void ++issue_mask_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd.\n"); ++ ++ /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3); ++ .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6); ++ */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree scale = gimple_call_arg (stmt, 1); ++ tree final_mask = gimple_call_arg (stmt, 2); ++ tree target = NULL_TREE; ++ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) ++ target = gimple_call_arg (stmt, 3); ++ else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) ++ target = gimple_call_lhs (stmt); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) ++ /* for simulation, 4: PLDL3KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ else if (param_llc_level == 4) ++ /* 6: PLDL4KEEP. */ ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ /* target: vector_type - XXX_type. */ ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, 5, addr, scale, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue vectorized mask gather prefetch gimple. */ ++ ++void ++issue_mask_gather_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); ++ ++ /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... }, ++ loop_mask_4); */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree vec_offset = gimple_call_arg (stmt, 1); ++ tree scale = gimple_call_arg (stmt, 2); ++ tree zero = gimple_call_arg (stmt, 3); ++ tree final_mask = gimple_call_arg (stmt, 4); ++ tree prfop = NULL_TREE; ++ if (param_llc_level == 3) // for simulation ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); // 4: PLDL3KEEP ++ else if (param_llc_level == 4) ++ prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); // 6: PLDL4KEEP ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ tree target = gimple_call_lhs (stmt); ++ /* add offset. 
*/ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_GATHER_PREFETCH, 7, addr, ++ vec_offset, scale, zero, ++ final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue builtin prefetch gimple. */ ++ ++void ++issue_builtin_prefetch (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert prfm.\n"); ++ /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ ++ gimple *stmt = mem_ref.stmt; ++ tree ref = mem_ref.ref; ++ ++ tree scale = mem_ref.step; ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (scale == NULL_TREE) ++ { ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). */ ++ scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ if (scale == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " ++ "variable. Stop builtin_prefetch.\n\n"); ++ return; ++ } ++ } ++ ++ tree addr = build_fold_addr_expr_with_type (ref, ptr_type_node); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), ++ true, NULL, true, GSI_SAME_STMT); ++ unsigned HOST_WIDE_INT distance = param_prefetch_offset ++ * tree_to_uhwi (scale); ++ ++ addr = fold_build_pointer_plus_hwi (addr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ /* __builtin_prefetch (_68, 0, 1); ++ 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality ++ (high means strong locality) */ ++ gcall *call = NULL; ++ if (param_llc_level == 3) ++ { ++ /* for simulation. ++ BUILT_IN_PREFETCH (addr, rw, locality). */ ++ call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, addr, integer_zero_node, integer_one_node); ++ } ++ else if (param_llc_level == 4) ++ { ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 6); ++ call = gimple_build_call ( ++ builtin_decl_explicit (BUILT_IN_PREFETCH_FULL), ++ 3, addr, integer_zero_node, prfop); ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "LLC cache levels are illegal.\n"); ++ return; ++ } ++ ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Static form insertion and issue instruction. We may check the ++ determination of the ARM SVE architecture before SVE hint insertion. 
*/ ++ ++void ++static_issue (std::vector &ref_groups, int num_issue_var) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue\n"); ++ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref mem_ref = ref_groups[i].first_use; ++ if (mem_ref.vectorize_p) ++ { ++ enum internal_fn ifn_code = gimple_call_internal_fn (mem_ref.stmt); ++ if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) ++ issue_mask_prefetch (mem_ref.stmt); ++ else if (ifn_code == IFN_MASK_GATHER_LOAD) ++ issue_mask_gather_prefetch (mem_ref.stmt); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "other vectorized internal function\n"); ++ } ++ else ++ issue_builtin_prefetch (mem_ref); ++ } ++} ++ ++/* Check whether all loop bounds (niters) used for calculating the footprints ++ of previously-executed ref_groups are defined in a dominated bb to the ++ currentbranch bb, where the conditional expression requires the loop bound ++ info. */ ++ ++bool ++check_def_use_chain (std::vector &ref_groups, ++ basic_block &branch_header_bb, ++ std::vector &ref_group_idx) ++{ ++ for (std::vector::iterator it = ref_group_idx.begin (); ++ it != ref_group_idx.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (niters); ++ basic_block def_bb = gimple_bb (def_stmt); ++ /* Check dominator relationship of def bb and branch bb. */ ++ /* Case 1: Check whether the def bb is the single predecessor block ++ of header bb. */ ++ if (single_pred_p (branch_header_bb)) ++ { ++ basic_block branch_bb_prev = single_pred (branch_header_bb); ++ if (branch_bb_prev->index == def_bb->index) ++ continue; ++ } ++ /* Case 2: Check whether the branch bb is dominated by the def ++ bb. */ ++ if (!dominated_by_p (CDI_DOMINATORS, branch_header_bb, def_bb)) ++ return false; ++ } ++ } ++ return true; ++} ++ ++/* Generate the stmts for calculating the size. Later we will consider nested ++ multi-branches scenarios and check more information of niters when it is ++ a COND_EXPR. */ ++ ++tree ++calc_stmts_gen (std::vector &ref_groups, ++ gimple_seq &cond_expr_stmt_list, ++ basic_block branch_header_bb, ++ std::vector &ref_group_idx_curr, ++ std::vector &ref_group_idx_prev, tree &cumul_size) ++{ ++ /* Check whether the bbs of def stmt for footprint loop bounds dominates ++ the bb of new runtime branching conditional. */ ++ if (!check_def_use_chain (ref_groups, branch_header_bb, ref_group_idx_prev)) ++ return NULL_TREE; ++ ++ /* Accumulated allocation size. */ ++ for (std::vector::iterator it = ref_group_idx_curr.begin (); ++ it != ref_group_idx_curr.end (); ++it) ++ { ++ /* Transferring mem_ref only takes place during footprint calculation. */ ++ ref_group &ref_group_curr = ref_groups[*it]; ++ data_ref mem_ref = ref_group_curr.transfer_ft ++ ? ref_group_curr.first_calc_use ++ : ref_group_curr.first_use; ++ tree var = mem_ref.var; ++ tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). 
*/ ++ if (unit == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Cannot detect size unit " ++ "(use 1 byte) for variable %s: ", ++ get_name (var)); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ unit = size_one_node; ++ } ++ tree size = NULL_TREE; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ ++ /* COND_EXPR. */ ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ if (size == NULL_TREE) ++ { ++ size = niters; ++ } else { ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, ++ size); ++ } ++ } ++ unit = build1 (NOP_EXPR, TREE_TYPE (size), unit); ++ size = fold_build2 (MULT_EXPR, TREE_TYPE (size), size, unit); ++ size = build1 (FLOAT_EXPR, double_type_node, size); ++ cumul_size = fold_build2 (PLUS_EXPR, double_type_node, cumul_size, ++ size); ++ ref_group_idx_prev.push_back (*it); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "cumul_size = "); ++ print_generic_expr (dump_file, cumul_size, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ /* Create a stmt list for size calculation. */ ++ tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); ++ div = build1 (NOP_EXPR, double_type_node, div); ++ tree total_size = fold_build2 (RDIV_EXPR, double_type_node, cumul_size, div); ++ ++ tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), ++ param_llc_capacity_per_core / 2); ++ threshold = build_real_from_int_cst (double_type_node, threshold); ++ tree cond_expr = fold_build2 (LE_EXPR, boolean_type_node, total_size, ++ threshold); ++ ++ /* Convert cond_expr to stmt list. */ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, is_gimple_condexpr, ++ NULL_TREE); ++ return cond_expr; ++} ++ ++/* Retrieve the least number of loops that cover all target mem_refs. ++ Try to merge loops that the mem_refs reside to a common superloop and ++ maintain a worklist which relates NEED-TO-COPY loops with the target mem ++ refs inside using the following criteria: ++ 1) If loop A is a superloop of loop B in the worklist, replace loop B with ++ loop A in the worklist, and attach all target mem_refs of loop B, ++ together with loop A's, to loop A. ++ 2) If loop B in the worklist is a superloop of loop A, attach loop A's ++ target mem_ref to loop B. ++ 3) If loop A is not a superloop/subloop of loop B in the worklist, replace ++ loop B with their lowest common superloop C in the worklist, and attach ++ all target mem_refs of loop A and loop B to loop C. ++ 4) If loop A and loop B's lowest common superloop is function body ++ (loop 0), stop merging and maintain loop independence. */ ++ ++void ++get_loop_worklist (std::vector &ref_groups, int num_issue_var, ++ std::map > &loop_worklist) ++{ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref &mem_ref = ref_groups[i].first_use; ++ class loop *loop_new = mem_ref.loop_bounds.front ().loop; ++ class loop *common_superloop = loop_new; ++ bool add_loop_worklist = false; ++ ++ /* Use greedy algorithm to merge loops to a common superloop that can ++ contain the current mem_refs. 
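The merge itself only needs the loop tree; a standalone model of finding the common superloop (parent pointers plus depths stand in for the real loop structures, and all names are illustrative):

    /* Toy loop-tree node; the function body acts as the depth-0 root.  */
    struct toy_loop
    {
      toy_loop *parent;
      unsigned depth;
    };

    /* Lowest common superloop of a and b, the analogue of find_common_loop;
       landing on the depth-0 root corresponds to criterion 4 above, where
       the merge is abandoned.  */
    static toy_loop *
    toy_common_superloop (toy_loop *a, toy_loop *b)
    {
      while (a->depth > b->depth)
        a = a->parent;
      while (b->depth > a->depth)
        b = b->parent;
      while (a != b)
        {
          a = a->parent;
          b = b->parent;
        }
      return a;
    }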
*/ ++ std::map >::iterator it_tmp; ++ std::vector ref_group_idx_tmp; ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end ();) ++ { ++ class loop *loop_old = it->first; ++ common_superloop = find_common_loop (loop_new, loop_old); ++ if (common_superloop == NULL || common_superloop->num == 0) ++ { ++ /* Stop merging two loops if there is no common superloop for ++ them except function body (loop 0). */ ++ if (common_superloop != NULL ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref_group %d's loop %d has no common " ++ "superloop with existing loop %d\n", ++ i, loop_new->num, loop_old->num); ++ } ++ ++it; ++ continue; ++ } ++ ++ if (common_superloop->num == loop_old->num) ++ { ++ /* If loop_old is the superloop of loop_new, add current ++ ref_group index to loop's worklist. */ ++ loop_worklist[common_superloop].push_back (i); ++ ++it; ++ } ++ else ++ { ++ /* If loop_old is not a superloop of loop_new, replace ++ loop_old with the common superloop. */ ++ it_tmp = it; ++ ++it_tmp; ++ ref_group_idx_tmp = it->second; ++ loop_worklist.erase (it); ++ it = it_tmp; ++ add_loop_worklist = true; ++ } ++ } ++ ++ if (loop_worklist.empty () || add_loop_worklist) ++ { ++ /* Update the new common superloop in loop_worklist. */ ++ std::vector &ref_groups_tmp = loop_worklist[common_superloop]; ++ ref_groups_tmp.push_back (i); ++ for (std::vector::iterator it = ref_group_idx_tmp.begin (); ++ it != ref_group_idx_tmp.end (); ++it) ++ ref_groups_tmp.push_back (*it); ++ std::sort (ref_groups_tmp.begin (), ref_groups_tmp.end ()); ++ } ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "runtime loop list:\n"); ++ std::map >::iterator it; ++ for (it = loop_worklist.begin (); it != loop_worklist.end (); ++it) ++ { ++ fprintf (dump_file, "loop %d:", it->first->num); ++ for (std::vector::iterator idx_it = it->second.begin (); ++ idx_it != it->second.end (); ++idx_it) ++ { ++ fprintf (dump_file, " %d", *idx_it); ++ } ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++/* Runtime form insertion and issue instruction. */ ++ ++void ++runtime_issue (std::vector &ref_groups, int num_issue_var, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "runtime issue\n"); ++ ++ /* It is possible that the loop father of some mem_ref's bb may contain the ++ loop fathers of the others. Therefore, we intend to only copy loops ++ without inclusion relationship. */ ++ std::map > loop_worklist; ++ get_loop_worklist (ref_groups, num_issue_var, loop_worklist); ++ bool get_first_ref_group = false; ++ std::vector ref_group_idx_prev; ++ ++ /* NEXT STEP: Multiple loop copies (possibly nested within one loop can cost ++ front-end bound due to branching within loop), we need to set up a ++ threshold such that we may compensate this time cost by space cost ++ in binary (copying outer loop). */ ++ tree cumul_size = build_real_from_int_cst (double_type_node, ++ integer_zero_node); ++ for (std::vector::iterator it = sorted_kernels.begin (); ++ it != sorted_kernels.end (); ++it) ++ { ++ /* Start runtime branching until finding the first ref_group's loop. ++ Skip any ref_groups if their `first_use` mem_refs are executed ++ before the mem_ref of the first ref_group. 
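Each loop that survives this ordering is then versioned under a size check; the condition the generated branch evaluates at run time can be sketched on its own (plain C++, hypothetical values, with footprints already reduced to bytes):

    #include <cstdint>
    #include <vector>

    /* True when the summed footprints of the groups handled so far still fit
       in half of the per-core LLC budget, mirroring the generated condition
       total_size / (1024 * 1024) <= capacity / 2.  */
    static bool
    toy_runtime_fits (const std::vector<uint64_t> &footprints_bytes,
                      unsigned llc_capacity_mb)
    {
      double total_mb = 0.;
      for (uint64_t f : footprints_bytes)
        total_mb += double (f) / (1024. * 1024.);
      return total_mb <= llc_capacity_mb / 2.;
    }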
*/ ++ class loop *loop = *it; ++ if (!loop_worklist.count (loop) ++ || (!get_first_ref_group && loop_worklist[loop][0] != 0)) ++ continue; ++ ++ std::vector ref_group_idx_curr = loop_worklist[loop]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "copy loop num: %d\n", loop->num); ++ } ++ /* If the exit edge points to bb with multiple inputs, split the exit ++ edge and create a new bb, make the exit edge point to bb with only ++ single input. */ ++ edge e = single_exit (loop); ++ if (e == NULL) ++ return; ++ if (!single_pred_p (e->dest)) ++ { ++ split_loop_exit_edge (e, true); ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "split exit edge\n"); ++ } ++ ++ /* After updating SSA, we are not sure whether the gimple_seq stmt list ++ is initialized and unchanged during iterations. Therefore, we need to ++ recreate this stmt list for every loop copy. */ ++ gimple_seq cond_expr_stmt_list = NULL; ++ tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list, ++ loop->header, ref_group_idx_curr, ++ ref_group_idx_prev, cumul_size); ++ if (cond_expr == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "incalculable variables for conditional\n"); ++ return; ++ } ++ ++ /* Use the previous cond and generate a new branch and copy loop. */ ++ basic_block condition_bb = NULL; ++ profile_probability prob = profile_probability::likely (); ++ initialize_original_copy_tables (); ++ class loop *nloop = loop_version (loop, cond_expr, &condition_bb, ++ prob, prob.invert (), prob, ++ prob.invert (), true); ++ free_original_copy_tables (); ++ ++ /* Insert the generated stmt list before cond_expr. */ ++ gimple_stmt_iterator cond_exp_gsi; ++ if (cond_expr_stmt_list) ++ { ++ /* Function `gsi_insert_seq_before` will insert `cond_expr` (1st ++ stmt) of `condition_bb` to the end of `cond_expr_stmt_list`. */ ++ cond_exp_gsi = gsi_last_bb (condition_bb); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ } ++ ++ update_ssa (TODO_update_ssa); ++ ++ /* Perform hint issue for branches that meet conditions. */ ++ static_issue (ref_groups, num_issue_var); ++} ++ ++/* Issue llc hints through prefetch instructions. */ ++ ++void ++issue_llc_hint (std::vector &ref_groups, ++ std::vector &sorted_kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "issue_llc_hint:\n"); ++ ++ /* 1) If the issue-topn and force-issue options are available, top N var is ++ forcibly allocated then no runtime branch is generated. ++ 2) If the issue-topn option is available and the size of top N var is ++ statically known, top N is statically allocated and no runtime branch ++ is generated. ++ 3) If the issue-topn option is available and the size of the top N var is ++ unknown, but them is dynamically known, the top N is dynamically ++ allocated and generate runtime branches. (also depends on the screening ++ of the innermost variable boundary type) ++ 4) If the dynamic runtime cannot know the size, such as indirect access, ++ optimization is skipped. 
++ */ ++ int num_issue_var = std::min (param_issue_topn, (int) ref_groups.size ()); ++ if (num_issue_var == 0) ++ return; ++ ++ if (num_issue_var < param_issue_topn ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Only %u (less than param_issue_topn = %d) " ++ "ref_group(s) is found for llc hint.\n", ++ num_issue_var, param_issue_topn); ++ } ++ if (param_force_issue) ++ { ++ static_issue (ref_groups, num_issue_var); ++ return; ++ } ++ calc_type topn_calc_type = STATIC_CALC; ++ for (int i = 0; i < num_issue_var; ++i) ++ topn_calc_type = std::min (topn_calc_type, ref_groups[i].calc_by); ++ ++ if (topn_calc_type == STATIC_CALC) ++ { ++ /* Before static issue, we still need to collect data size of all target ++ variables and compare the summation with LLC cache size. */ ++ double prefetch_data_size = 0.; ++ for (int i = 0; i < num_issue_var; ++i) ++ prefetch_data_size += ref_groups[i].var_size; ++ ++ if (prefetch_data_size <= (double) param_llc_capacity_per_core ++ * PREFETCH_CACHE_SIZE_RATIO) ++ static_issue (ref_groups, num_issue_var); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache " ++ "size: %lf > %lf.\n", ++ prefetch_data_size, ++ (double) param_llc_capacity_per_core ++ * PREFETCH_CACHE_SIZE_RATIO); ++ } ++ else if (topn_calc_type == RUNTIME_CALC) ++ runtime_issue (ref_groups, num_issue_var, sorted_kernels); ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled issue scene\n"); ++ } ++} ++ ++/* ==================== phase entry ==================== */ ++ ++/* The LLC intelligent allocation consists of 6 steps. */ ++ ++void ++llc_allocate (void) ++{ ++ std::map > kernels_refs; ++ std::vector kernels; ++ if (!get_dense_memory_kernels (kernels, kernels_refs)) ++ return; ++ ++ std::set traced_ref_stmt; ++ std::vector unresolved_refs; ++ trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt, ++ unresolved_refs); ++ ++ if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt, ++ unresolved_refs)) ++ return; ++ ++ retrace_loop_refs_info_unresolved (unresolved_refs, traced_ref_stmt); ++ ++ std::vector sorted_kernels; ++ std::vector ref_groups; ++ if (param_filter_mode) ++ { ++ /* AutoFDO mode: include ENTRY bb and EXIT bb indices. */ ++ std::set bb_pathset; ++ bb_pathset.insert (0); ++ bb_pathset.insert (1); ++ if (!filter_and_sort_kernels_feedback (sorted_kernels, bb_pathset)) ++ return; ++ ++ if (!record_and_sort_ref_groups (ref_groups, kernels, kernels_refs, ++ bb_pathset)) ++ return; ++ } ++ else ++ { ++ /* static mode. */ ++ std::set bb_pathset; ++ if (!filter_and_sort_kernels (sorted_kernels, kernels)) ++ return; ++ ++ if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs, ++ bb_pathset)) ++ return; ++ } ++ ++ issue_llc_hint (ref_groups, sorted_kernels); ++} ++ ++/* Check whether the function is an operator reloading function. */ ++ ++bool ++operator_func_p (function *fn) ++{ ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); ++ ++ if (fn_name && strncmp (fn_name, "operator", 8) == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "operator_func: %s ", fn_name); ++ ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the function file location is known. 
*/ ++ ++bool ++func_location_p (function *fn) ++{ ++ expanded_location fn_decl_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ expanded_location fn_xloc ++ = expand_location (fn->function_start_locus); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "fn->function_start_locus = %d \n", ++ fn->function_start_locus); ++ fprintf (dump_file, "fn_xloc.file = %s \n", ++ fn_xloc.file ? fn_xloc.file : "NULL"); ++ fprintf (dump_file, "fn_decl_xloc.file = %s \n", ++ fn_decl_xloc.file ? fn_decl_xloc.file : "NULL"); ++ fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n", ++ LOCATION_FILE (input_location) ? LOCATION_FILE (input_location) ++ : "NULL"); ++ } ++ if (fn_decl_xloc.file == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function location unknown, skip analysis \n"); ++ return false; ++ } ++ /* Newly generated functions are filtered out, such as function constant ++ propagation func.constprop (). */ ++ if (LOCATION_FILE (input_location) != fn_decl_xloc.file) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function location non-local, skip analysis \n"); ++ return false; ++ } ++ return true; ++} ++ ++/* Dump function information. */ ++ ++void ++dump_function_info (function *fn) ++{ ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nfn_name: %s\n", fn_name); ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ if (cfun_xloc.line) ++ { ++ if (cfun_xloc.file) ++ fprintf (dump_file, "[%s:%d:%d]\n", ++ cfun_xloc.file, cfun_xloc.line, cfun_xloc.column); ++ } ++ fprintf (dump_file, "\n"); ++ flow_loops_dump (dump_file, NULL, 1); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* dump param. */ ++ ++void ++dump_param (void) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "LLC allocate parameters:\n"); ++ fprintf (dump_file, " block size: %d\n", param_l1_cache_line_size); ++ fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", ++ param_l1_cache_size * 1024 / param_l1_cache_line_size, ++ param_l1_cache_size); ++ fprintf (dump_file, " L1 cache line size: %d\n", ++ param_l1_cache_line_size); ++ fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size); ++ fprintf (dump_file, " min mem_access_ratio: %d \n", ++ param_mem_access_ratio); ++ fprintf (dump_file, " min mem_access_num: %d \n", ++ param_mem_access_num); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Determine whether to analyze the function according to ++ the ordering of functions containing cycle counts. 
*/ ++ ++static bool ++should_analyze_func_p (void) ++{ ++ gcov_type decl_uid = DECL_UID (current_function_decl); ++ gcov_type func_count = event_get_func_count (decl_uid, PMU_EVENT); ++ if (func_count == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld cannot find profile data " ++ "and skip prefetch analysis\n", ++ decl_uid); ++ } ++ return false; ++ } ++ if (func_count < event_get_topn_function_total_count_thres ()) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld total counts is %lu: " ++ "counts %lu < perf's top %d threshold %lu, " ++ "skip prefetch analysis\n", ++ decl_uid, func_count, func_count, ++ PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ()); ++ } ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "function uid %ld total counts is %lu: " ++ "counts %lu >= perf's top %d threshold %lu, " ++ "continue prefetch analysis\n", ++ decl_uid, func_count, func_count, ++ PREFETCH_FUNC_TOPN, event_get_topn_function_total_count_thres ()); ++ } ++ return true; ++} ++ ++const pass_data pass_data_llc_allocate = ++{ ++ GIMPLE_PASS, /* type. */ ++ "llc_allocate", /* name. */ ++ OPTGROUP_LOOP, /* optinfo_flags. */ ++ TV_TREE_PREFETCH, /* tv_id. */ ++ (PROP_cfg | PROP_ssa), /* properties_required. */ ++ 0, /* properties_provided. */ ++ 0, /* properties_destroyed. */ ++ 0, /* todo_flags_start. */ ++ 0, /* todo_flags_finish. */ ++}; ++ ++class pass_llc_allocate : public gimple_opt_pass ++{ ++public: ++ pass_llc_allocate (gcc::context *ctxt) ++ : gimple_opt_pass (pass_data_llc_allocate, ctxt) ++ {} ++ ++ /* opt_pass methods. */ ++ virtual bool gate (function *) ++ { ++ return (optimize >= 2 && flag_llc_allocate > 0); ++ } ++ virtual unsigned int execute (function *); ++ ++}; // class pass_llc_allocate ++ ++unsigned int ++pass_llc_allocate::execute (function *fn) ++{ ++ unsigned int ret = 0; ++ ++ if (!targetm.have_prefetch () ++ || targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL ++ || targetm.vectorize.code_for_gather_prefetch == NULL) ++ return 0; ++ ++ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) ++ { ++ tree type = build_function_type_list (void_type_node, ++ const_ptr_type_node, NULL_TREE); ++ tree decl = add_builtin_function ("__builtin_prefetch", type, ++ BUILT_IN_PREFETCH, BUILT_IN_NORMAL, ++ NULL, NULL_TREE); ++ DECL_IS_NOVOPS (decl) = true; ++ set_builtin_decl (BUILT_IN_PREFETCH, decl, false); ++ } ++ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH_FULL)) ++ { ++ tree type = build_function_type_list (void_type_node, ++ const_ptr_type_node, NULL_TREE); ++ tree decl = add_builtin_function ("__builtin_prefetch_full", type, ++ BUILT_IN_PREFETCH_FULL, BUILT_IN_NORMAL, ++ NULL, NULL_TREE); ++ DECL_IS_NOVOPS (decl) = true; ++ set_builtin_decl (BUILT_IN_PREFETCH_FULL, decl, false); ++ } ++ ++ dump_param (); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "llc_allocate: %s\n", ++ IDENTIFIER_POINTER (DECL_NAME (fn->decl))); ++ ++ if (number_of_loops (fn) <= 1 || !func_location_p (fn) ++ || operator_func_p (fn)) ++ return ret; ++ ++ /* Filter only when combined with PMU event. When the should_analyze_func_p ++ analysis fails (for example, the function without PMU-event count), ++ in order to ensure the accuracy of the LLC allocation analysis, the ++ function does not perform native allocation processing. 
*/ ++ if (flag_additional_profile && (!profile_exist (PMU_EVENT) || !should_analyze_func_p ())) ++ { ++ return 0; ++ } ++ ++ dump_function_info (fn); ++ ++ llc_allocate (); ++ ++ return ret; ++} ++ ++} // anon namespace ++ ++gimple_opt_pass * ++make_pass_llc_allocate (gcc::context *ctxt) ++{ ++ return new pass_llc_allocate (ctxt); ++} +diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc +index 0353ffd30..0492dc6fd 100644 +--- a/gcc/tree-ssa-loop-niter.cc ++++ b/gcc/tree-ssa-loop-niter.cc +@@ -2489,6 +2489,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit) + return true; + } + ++/* Returns whether the number of vectorized iterations for the loop can be ++ estimated from the given IR and update the corresponding loop attribute, ++ e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... }); */ ++ ++bool ++number_of_iterations_vect (class loop *loop, tree lhs, tree rhs) ++{ ++ loop->vec_nb_iterations = chrec_dont_know; ++ ++ if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME) ++ || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME)) ++ return false; ++ ++ tree ssa = TREE_CODE (lhs) == SSA_NAME ? lhs : rhs; ++ gimple *def_stmt = SSA_NAME_DEF_STMT (ssa); ++ ++ if (gimple_code (def_stmt) != GIMPLE_CALL ++ || !gimple_call_internal_p (def_stmt)) ++ return false; ++ ++ internal_fn ifn = gimple_call_internal_fn (def_stmt); ++ if (ifn != IFN_WHILE_ULT) ++ return false; ++ ++ gcall *call = dyn_cast (def_stmt); ++ tree niters = gimple_call_arg (call, 1); ++ loop->vec_nb_iterations = niters; ++ ++ return true; ++} ++ + /* Stores description of number of iterations of LOOP derived from + EXIT (an exit edge of the LOOP) in NITER. Returns true if some useful + information could be derived (and fields of NITER have meaning described +@@ -2559,6 +2590,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit, + op1 = gimple_cond_rhs (stmt); + type = TREE_TYPE (op0); + ++ if (TREE_CODE (type) == VECTOR_TYPE) ++ number_of_iterations_vect (loop, op0, op1); ++ + if (TREE_CODE (type) != INTEGER_TYPE + && !POINTER_TYPE_P (type)) + return false; +@@ -2852,14 +2886,14 @@ bool + number_of_iterations_exit (class loop *loop, edge exit, + class tree_niter_desc *niter, + bool warn, bool every_iteration, +- basic_block *body) ++ basic_block *body, bool guarantee) + { + gcond *stmt; + if (!number_of_iterations_exit_assumptions (loop, exit, niter, + &stmt, every_iteration, body)) + return false; + +- if (integer_nonzerop (niter->assumptions)) ++ if (integer_nonzerop (niter->assumptions) || guarantee == false) + return true; + + if (warn && dump_enabled_p ()) +diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h +index ceaf65e07..8f03458f7 100644 +--- a/gcc/tree-ssa-loop-niter.h ++++ b/gcc/tree-ssa-loop-niter.h +@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body, + extern bool number_of_iterations_exit (class loop *, edge, + class tree_niter_desc *niter, bool, + bool every_iteration = true, +- basic_block * = NULL); ++ basic_block * = NULL, ++ bool guarantee = true); + extern bool number_of_iterations_exit_assumptions (class loop *, edge, + class tree_niter_desc *, + gcond **, bool = true, +diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc +index 9d21e6d03..6e61f7140 100644 +--- a/gcc/tree-vect-loop-manip.cc ++++ b/gcc/tree-vect-loop-manip.cc +@@ -3738,3 +3738,269 @@ vect_loop_versioning (loop_vec_info loop_vinfo, + + return nloop; + } ++ ++class loop * 
++vect_loop_versioning_2 (loop_vec_info loop_vinfo, ++ gimple *loop_vectorized_call) ++{ ++ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; ++ class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); ++ basic_block condition_bb; ++ gphi_iterator gsi; ++ gimple_stmt_iterator cond_exp_gsi; ++ basic_block merge_bb; ++ basic_block new_exit_bb; ++ edge new_exit_e, e; ++ gphi *orig_phi, *new_phi; ++ tree cond_expr = NULL_TREE; ++ gimple_seq cond_expr_stmt_list = NULL; ++ tree arg; ++ profile_probability prob = profile_probability::likely (); ++ gimple_seq gimplify_stmt_list = NULL; ++ tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo); ++ bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); ++ bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); ++ bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); ++ poly_uint64 versioning_threshold ++ = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); ++ tree version_simd_if_cond ++ = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); ++ unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); ++ ++ if (vect_apply_runtime_profitability_check_p (loop_vinfo) ++ && !ordered_p (th, versioning_threshold)) ++ cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, ++ build_int_cst (TREE_TYPE (scalar_loop_iters), ++ th - 1)); ++ if (maybe_ne (versioning_threshold, 0U)) ++ { ++ tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, ++ build_int_cst (TREE_TYPE (scalar_loop_iters), ++ versioning_threshold - 1)); ++ if (cond_expr) ++ cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node, ++ expr, cond_expr); ++ else ++ cond_expr = expr; ++ } ++ ++ if (version_niter) ++ vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr); ++ ++ if (cond_expr) ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, ++ is_gimple_condexpr, NULL_TREE); ++ ++ if (version_align) ++ vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, ++ &cond_expr_stmt_list); ++ ++ if (version_alias) ++ { ++ vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr); ++ vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr); ++ vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr); ++ } ++ ++ if (version_simd_if_cond) ++ { ++ gcc_assert (dom_info_available_p (CDI_DOMINATORS)); ++ if (flag_checking) ++ if (basic_block bb ++ = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond))) ++ gcc_assert (bb != loop->header ++ && dominated_by_p (CDI_DOMINATORS, loop->header, bb) ++ && (scalar_loop == NULL ++ || (bb != scalar_loop->header ++ && dominated_by_p (CDI_DOMINATORS, ++ scalar_loop->header, bb)))); ++ tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond)); ++ tree c = fold_build2 (NE_EXPR, boolean_type_node, ++ version_simd_if_cond, zero); ++ if (cond_expr) ++ cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, ++ c, cond_expr); ++ else ++ cond_expr = c; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "created versioning for simd if condition check.\n"); ++ } ++ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &gimplify_stmt_list, ++ is_gimple_condexpr, NULL_TREE); ++ gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); ++ ++ /* Compute the outermost loop cond_expr and cond_expr_stmt_list are ++ invariant in. 
*/ ++ class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr); ++ for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ update_stmt (stmt); ++ ssa_op_iter iter; ++ use_operand_p use_p; ++ basic_block def_bb; ++ FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE) ++ if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p)))) ++ && flow_bb_inside_loop_p (outermost, def_bb)) ++ outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1); ++ } ++ ++ /* Search for the outermost loop we can version. Avoid versioning of ++ non-perfect nests but allow if-conversion versioned loops inside. */ ++ class loop *loop_to_version = loop; ++ if (flow_loop_nested_p (outermost, loop)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "trying to apply versioning to outer loop %d\n", ++ outermost->num); ++ if (outermost->num == 0) ++ outermost = superloop_at_depth (loop, 1); ++ /* And avoid applying versioning on non-perfect nests. */ ++ while (loop_to_version != outermost ++ && single_exit (loop_outer (loop_to_version)) ++ && (!loop_outer (loop_to_version)->inner->next ++ || vect_loop_vectorized_call (loop_to_version)) ++ && (!loop_outer (loop_to_version)->inner->next ++ || !loop_outer (loop_to_version)->inner->next->next)) ++ loop_to_version = loop_outer (loop_to_version); ++ } ++ ++ /* Apply versioning. If there is already a scalar version created by ++ if-conversion re-use that. Note we cannot re-use the copy of ++ an if-converted outer-loop when vectorizing the inner loop only. */ ++ gcond *cond; ++ if ((!loop_to_version->inner || loop == loop_to_version) ++ && loop_vectorized_call) ++ { ++ gcc_assert (scalar_loop); ++ condition_bb = gimple_bb (loop_vectorized_call); ++ cond = as_a (last_stmt (condition_bb)); ++ gimple_cond_set_condition_from_tree (cond, cond_expr); ++ update_stmt (cond); ++ ++ if (cond_expr_stmt_list) ++ { ++ cond_exp_gsi = gsi_for_stmt (loop_vectorized_call); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ ++ /* if-conversion uses profile_probability::always () for both paths, ++ reset the paths probabilities appropriately. */ ++ edge te, fe; ++ extract_true_false_edges_from_block (condition_bb, &te, &fe); ++ te->probability = prob; ++ fe->probability = prob.invert (); ++ /* We can scale loops counts immediately but have to postpone ++ scaling the scalar loop because we re-use it during peeling. */ ++ scale_loop_frequencies (loop_to_version, te->probability); ++ LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability; ++ ++ nloop = scalar_loop; ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "reusing %sloop version created by if conversion\n", ++ loop_to_version != loop ? "outer " : ""); ++ } ++ else ++ { ++ if (loop_to_version != loop ++ && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "applying loop versioning to outer loop %d\n", ++ loop_to_version->num); ++ ++ initialize_original_copy_tables (); ++ nloop = loop_version (loop_to_version, cond_expr, &condition_bb, ++ prob, prob.invert (), prob, prob.invert (), true); ++ gcc_assert (nloop); ++ nloop = get_loop_copy (loop); ++ ++ /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will ++ reap those otherwise; they also refer to the original ++ loops. 
*/ ++ class loop *l = loop; ++ while (gimple *call = vect_loop_vectorized_call (l)) ++ { ++ call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call))); ++ fold_loop_internal_call (call, boolean_false_node); ++ l = loop_outer (l); ++ } ++ free_original_copy_tables (); ++ ++ if (cond_expr_stmt_list) ++ { ++ cond_exp_gsi = gsi_last_bb (condition_bb); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ ++ /* Loop versioning violates an assumption we try to maintain during ++ vectorization - that the loop exit block has a single predecessor. ++ After versioning, the exit block of both loop versions is the same ++ basic block (i.e. it has two predecessors). Just in order to simplify ++ following transformations in the vectorizer, we fix this situation ++ here by adding a new (empty) block on the exit-edge of the loop, ++ with the proper loop-exit phis to maintain loop-closed-form. ++ If loop versioning wasn't done from loop, but scalar_loop instead, ++ merge_bb will have already just a single successor. */ ++ ++ merge_bb = single_exit (loop_to_version)->dest; ++ if (EDGE_COUNT (merge_bb->preds) >= 2) ++ { ++ gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); ++ new_exit_bb = split_edge (single_exit (loop_to_version)); ++ new_exit_e = single_exit (loop_to_version); ++ e = EDGE_SUCC (new_exit_bb, 0); ++ ++ for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ tree new_res; ++ orig_phi = gsi.phi (); ++ new_res = copy_ssa_name (PHI_RESULT (orig_phi)); ++ new_phi = create_phi_node (new_res, new_exit_bb); ++ arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); ++ add_phi_arg (new_phi, arg, new_exit_e, ++ gimple_phi_arg_location_from_edge (orig_phi, e)); ++ adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); ++ } ++ } ++ ++ update_ssa (TODO_update_ssa); ++ } ++ ++ if (version_niter) ++ { ++ /* The versioned loop could be infinite, we need to clear existing ++ niter information which is copied from the original loop. */ ++ gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE)); ++ vect_free_loop_info_assumptions (nloop); ++ /* And set constraint LOOP_C_INFINITE for niter analyzer. 
*/ ++ loop_constraint_set (loop, LOOP_C_INFINITE); ++ } ++ ++ if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION ++ && dump_enabled_p ()) ++ { ++ if (version_alias) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, ++ vect_location, ++ "loop versioned for vectorization because of " ++ "possible aliasing\n"); ++ if (version_align) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING, ++ vect_location, ++ "loop versioned for vectorization to enhance " ++ "alignment\n"); ++ ++ } ++ ++ return nloop; ++} +diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc +index 7f7577951..023a83c38 100644 +--- a/gcc/tree-vect-loop.cc ++++ b/gcc/tree-vect-loop.cc +@@ -9735,8 +9735,11 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + + if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + { +- class loop *sloop +- = vect_loop_versioning (loop_vinfo, loop_vectorized_call); ++ class loop *sloop; ++ if (!(optimize >= 2 && flag_llc_allocate > 0)) ++ sloop = vect_loop_versioning (loop_vinfo, loop_vectorized_call); ++ else ++ sloop = vect_loop_versioning_2 (loop_vinfo, loop_vectorized_call); + sloop->force_vectorize = false; + check_profitability = false; + } +@@ -9989,7 +9992,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + niters_vector_mult_vf, !niters_no_overflow); + + unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); +- scale_profile_for_vect_loop (loop, assumed_vf); ++ if (!(optimize >= 2 && flag_llc_allocate > 0)) ++ scale_profile_for_vect_loop (loop, assumed_vf); + + /* True if the final iteration might not handle a full vector's + worth of scalar iterations. */ +diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +index e13bc6c99..85018f250 100644 +--- a/gcc/tree-vectorizer.h ++++ b/gcc/tree-vectorizer.h +@@ -2177,6 +2177,7 @@ extern bool slpeel_can_duplicate_loop_p (const class loop *, const_edge); + class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, + class loop *, edge); + class loop *vect_loop_versioning (loop_vec_info, gimple *); ++class loop *vect_loop_versioning_2 (loop_vec_info, gimple *); + extern class loop *vect_do_peeling (loop_vec_info, tree, tree, + tree *, tree *, tree *, int, bool, bool, + tree *); +-- +2.44.0.windows.1 + diff --git a/0366-fix-prefetch-case-failed.patch b/0366-fix-prefetch-case-failed.patch new file mode 100644 index 0000000000000000000000000000000000000000..9c21445f26bf3767034be574c6891d5546cb2bd2 --- /dev/null +++ b/0366-fix-prefetch-case-failed.patch @@ -0,0 +1,144 @@ +From c7bdc03e48a0b6e213c5a4b8c821665d7ca897bb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Thu, 6 Mar 2025 14:58:57 +0800 +Subject: [PATCH] fix prefetch case failed + +--- + gcc/params.opt | 2 +- + .../gcc.target/aarch64/sve/acle/general-c/prefetch_1.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_index_1.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_index_2.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_1.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_2.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_3.c | 6 +++--- + .../aarch64/sve/acle/general-c/prefetch_gather_offset_4.c | 6 +++--- + 8 files changed, 22 insertions(+), 22 deletions(-) + +diff --git a/gcc/params.opt b/gcc/params.opt +index e06e50611..a716f2cc4 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1305,7 +1305,7 @@ cannot recognize inner loop boundaries. 
+ -param=llc-level= + Common Joined UInteger Var(param_llc_level) Init(3) IntegerRange(3, 4) + Param Optimization +-Specifies the HBM cache level. ++Specifies the LLC cache level. + + -param=filter-mode= + Common Joined UInteger Var(param_filter_mode) Init(1) IntegerRange(0, 1) Param +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c +index 316f77fc7..c8094ba2b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, enum svprfop op) + svprfb (pg, s32_ptr, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ + svprfb (pg, s32_ptr, (enum svprfop) 0); + svprfb (pg, s32_ptr, (enum svprfop) 5); +- svprfb (pg, s32_ptr, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ +- svprfb (pg, s32_ptr, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 6); ++ svprfb (pg, s32_ptr, (enum svprfop) 7); + svprfb (pg, s32_ptr, (enum svprfop) 8); +- svprfb (pg, s32_ptr, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c +index c33c95440..862ec082b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c +@@ -46,8 +46,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 0); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ +- svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 7); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c +index 3d7797305..f4873c631 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op) + 
svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 0); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ +- svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 7); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c +index cc61901cb..3b82b4777 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c +@@ -46,8 +46,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 0); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 7); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c +index 88e0c35e7..2be620de5 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c +@@ -30,8 +30,8 @@ f1 (svbool_t pg, svint8_t s8, svuint8_t u8, + svprfb_gather (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ + svprfb_gather (pg, u32, (enum svprfop) 0); + svprfb_gather (pg, u32, (enum svprfop) 5); +- svprfb_gather (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather (pg, 
u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 6); ++ svprfb_gather (pg, u32, (enum svprfop) 7); + svprfb_gather (pg, u32, (enum svprfop) 8); +- svprfb_gather (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c +index 24b4aa190..9a1d931e9 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op) + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 0); + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 5); +- svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 6); ++ svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 7); + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 8); +- svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 14); + } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c +index 63ccdc5a4..f7ca09507 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c +@@ -10,8 +10,8 @@ f1 (svbool_t pg, svuint32_t u32, enum svprfop op) + svprfb_gather_u32base (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_u32base (pg, u32, (enum svprfop) 0); + svprfb_gather_u32base (pg, u32, (enum svprfop) 5); +- svprfb_gather_u32base (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ +- svprfb_gather_u32base (pg, u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_u32base (pg, u32, (enum svprfop) 6); ++ svprfb_gather_u32base (pg, u32, (enum svprfop) 7); + svprfb_gather_u32base (pg, u32, (enum svprfop) 8); +- svprfb_gather_u32base (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_u32base (pg, u32, (enum svprfop) 14); + } +-- 
+2.44.0.windows.1 + diff --git a/0367-llc-feature-bugfix.patch b/0367-llc-feature-bugfix.patch new file mode 100644 index 0000000000000000000000000000000000000000..57abf68d46b774159d0bfb3b3c8c4a699f8649f0 --- /dev/null +++ b/0367-llc-feature-bugfix.patch @@ -0,0 +1,79 @@ +From 9bb4c61897abb16d77a0614d4465bf2b0d67b265 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Mon, 10 Mar 2025 17:00:37 +0800 +Subject: [PATCH] llc feature bugfix + +--- + gcc/params.opt | 2 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 2 +- + gcc/tree-ssa-llc-allocate.cc | 2 +- + gcc/tree-vect-loop.cc | 10 +++------- + 4 files changed, 6 insertions(+), 10 deletions(-) + +diff --git a/gcc/params.opt b/gcc/params.opt +index a716f2cc4..ed7559783 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1285,7 +1285,7 @@ Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization + Issue topn LLC mem_ref hint. + + -param=force-issue= +-Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Common Joined UInteger Var(param_force_issue) Init(1) IntegerRange(0, 1) Param + Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. + + -param=llc-capacity-per-core= +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +index 091e654f9..0b81394ad 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -58,4 +58,4 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump "\\d\\tuPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "insert svprfd" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } */ +diff --git a/gcc/tree-ssa-llc-allocate.cc b/gcc/tree-ssa-llc-allocate.cc +index da6d72b94..d10d60459 100644 +--- a/gcc/tree-ssa-llc-allocate.cc ++++ b/gcc/tree-ssa-llc-allocate.cc +@@ -3822,7 +3822,7 @@ issue_llc_hint (std::vector &ref_groups, + "ref_group(s) is found for llc hint.\n", + num_issue_var, param_issue_topn); + } +- if (param_force_issue) ++ if (param_force_issue == 1 || param_force_issue == 0) + { + static_issue (ref_groups, num_issue_var); + return; +diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc +index 023a83c38..7f7577951 100644 +--- a/gcc/tree-vect-loop.cc ++++ b/gcc/tree-vect-loop.cc +@@ -9735,11 +9735,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + + if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + { +- class loop *sloop; +- if (!(optimize >= 2 && flag_llc_allocate > 0)) +- sloop = vect_loop_versioning (loop_vinfo, loop_vectorized_call); +- else +- sloop = vect_loop_versioning_2 (loop_vinfo, loop_vectorized_call); ++ class loop *sloop ++ = vect_loop_versioning (loop_vinfo, loop_vectorized_call); + sloop->force_vectorize = false; + check_profitability = false; + } +@@ -9992,8 +9989,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) + niters_vector_mult_vf, !niters_no_overflow); + + unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); +- if (!(optimize >= 2 && flag_llc_allocate > 0)) +- scale_profile_for_vect_loop (loop, assumed_vf); ++ scale_profile_for_vect_loop (loop, assumed_vf); + + /* True if the final iteration might not handle a full vector's + worth of scalar iterations. 
*/ +-- +2.44.0.windows.1 + diff --git a/0368-fix-llc-feature-case-failed.patch b/0368-fix-llc-feature-case-failed.patch new file mode 100644 index 0000000000000000000000000000000000000000..3bfcf72aa224f09c77652150932bc019c87d2078 --- /dev/null +++ b/0368-fix-llc-feature-case-failed.patch @@ -0,0 +1,78 @@ +From 889fed32e6e86a64974ec9edc69cd2c88c14e6f0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=99=88=E9=B8=BF?= +Date: Sat, 15 Mar 2025 14:58:11 +0800 +Subject: [PATCH] fix llc feature case failed + +--- + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 2 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c | 2 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c | 2 +- + gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 | 4 ++-- + .../gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 | 2 +- + 5 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +index 0b81394ad..55d1396d4 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -56,6 +56,6 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump "\\d\\tupperPtr\\t\\(2.933319, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump "\\d\\tlPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump "\\d\\tuPtr\\t\\(1.466660, 1, 1, 0\\)" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +index e18725f60..5e908b380 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c +@@ -46,5 +46,5 @@ convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) + return output_stack->reg[0]; + } + +-/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +index ba90e7ea4..9196d1d95 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -59,4 +59,4 @@ main (int argc, char *argv[]) + + /* { dg-final { scan-tree-dump-times "Tracing succeeded" 24 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 0 "llc_allocate" } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +index b0f68ebe3..da9669639 100644 +--- a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -205,7 +205,7 @@ END SUBROUTINE calc_p_rho + ! { dg-final { scan-tree-dump-times "\\d\\tt_1\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } + ! 
{ dg-final { scan-tree-dump-times "\\d\\tt_2\\t\\(0.000000, 3, 1, 0\\)" 1 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "\\d\\tc2a\\t\\(0.000000, 3, 1, 0\\)" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } +-! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "cumul_size.*150960\\)" 0 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +index 7345759db..eb2cc8690 100644 +--- a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -54,5 +54,5 @@ SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) + + END SUBROUTINE calc_p8w + +-! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 0 "llc_allocate" } } + ! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +\ No newline at end of file +-- +2.44.0.windows.1 + diff --git a/gcc.spec b/gcc.spec index 21df78d5169c173669b7342e88d07133e060155e..fe1b8f161985fb5004d06c05ae2f77f3046cd990 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 78 +%global gcc_release 79 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -474,6 +474,10 @@ Patch361: 0361-Enhancing-BOLT-Optimization-with-AI.patch Patch362: 0362-Modify-cache-size-for-hip10a-and-hip10c.patch Patch363: 0363-SVE-Add-std-find-with-sve.patch Patch364: 0364-CFGO-Enable-flag_profile_partial_training-for-CFGO-b.patch +Patch365: 0365-add-llc-allocate-feature.patch +Patch366: 0366-fix-prefetch-case-failed.patch +Patch367: 0367-llc-feature-bugfix.patch +Patch368: 0368-fix-llc-feature-case-failed.patch # Part 1001-1999 %ifarch sw_64 @@ -1624,6 +1628,10 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch -P362 -p1 %patch -P363 -p1 %patch -P364 -p1 +%patch -P365 -p1 +%patch -P366 -p1 +%patch -P367 -p1 +%patch -P368 -p1 %ifarch sw_64 %patch -P1001 -p1 @@ -4251,6 +4259,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Sat May 3 2025 huang-xiaoquan - 12.3.1-79 +- Type: Sync +- DESC: Sync patches from openeuler/gcc. + * Mon Apr 28 2025 liyancheng <412998149@qq.com> - 12.3.1-78 - Type: Sync - DESC: Sync patches from openeuler/gcc.