diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6494636477cc415612b6c339d218118d0fbe5901
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c
@@ -0,0 +1,67 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-do run { target { aarch64*-*-linux* } } } */
+/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details -save-temps" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static unsigned inline abs2 (unsigned a)
+{
+  unsigned s = ((a>>15)&0x10001)*0xffff;
+  return (a+s)^s;
+}
+
+int foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib)
+{
+  unsigned tmp[4][4];
+  unsigned a0, a1, a2, a3;
+  int sum = 0;
+  for (int i = 0; i < 4; i++, oxa += ia, oxb += ib)
+    {
+      a0 = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16);
+      a1 = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16);
+      a2 = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16);
+      a3 = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16);
+      int t0 = a0 + a1;
+      int t1 = a0 - a1;
+      int t2 = a2 + a3;
+      int t3 = a2 - a3;
+      tmp[i][0] = t0 + t2;
+      tmp[i][2] = t0 - t2;
+      tmp[i][1] = t1 + t3;
+      tmp[i][3] = t1 - t3;
+    }
+  for (int i = 0; i < 4; i++)
+    {
+      int t0 = tmp[0][i] + tmp[1][i];
+      int t1 = tmp[0][i] - tmp[1][i];
+      int t2 = tmp[2][i] + tmp[3][i];
+      int t3 = tmp[2][i] - tmp[3][i];
+      a0 = t0 + t2;
+      a2 = t0 - t2;
+      a1 = t1 + t3;
+      a3 = t1 - t3;
+      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+    }
+  return (((unsigned short) sum) + ((unsigned) sum >>16)) >> 1;
+}
+
+int main ()
+{
+  unsigned char oxa[128] = {0};
+  unsigned char oxb[128] = {0};
+  for (int i = 0; i < 128; i++)
+    {
+      oxa[i] += i * 3;
+      oxb[i] = i * 2;
+    }
+  int sum = foo (oxa, 16, oxb, 32);
+  if (sum != 736)
+    {
+      abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b50fd27d6a03a9a512e6acd65fcfe0a59760061
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */
+
+unsigned a0[4], a1[4], a2[4], a3[4];
+
+void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib)
+{
+  for (int i = 0; i < 4; i++, oxa += ia, oxb += ib)
+    {
+      a0[i] = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16);
+      a1[i] = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16);
+      a2[i] = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16);
+      a3[i] = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16);
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..94b992b050dcc005da2ef8c24d570b186e9c7b87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c
@@ -0,0 +1,19 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */
+
+unsigned a0[4], a1[4], a2[4], a3[4];
+
+void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib)
+{
+  for (int i = 0; i < 4; i++, oxa += ia, oxb += ib)
+    {
+      a0[i] = ((oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16)) + 1;
+      a1[i] = ((oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16)) - 2;
+      a2[i] = ((oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16)) * 3;
+      a3[i] = ((oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16)) / 4;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "Insertion removed" 1 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */
\ No newline at end of file
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 888af48946f58121c0dfae46b3ff2895bbf1930d..8c370c13048389e8bd5d54e401e531feb6d4a5a0 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3.  If not see
    |   D(I) = A(I-1)*E
    |ENDDO
 
+   If an unvectorizable loop has grouped loads, and calculations from grouped
+   loads are isomorphic, build temp arrays using stmts where isomorphic
+   calculations end.  Afer distribution, the partition built from temp
+   arrays can be vectorized in pass SLP after loop unrolling.  For example,
+
+   |DO I = 1, N
+   |    A = FOO (ARG_1);
+   |    B = FOO (ARG_2);
+   |    C = BAR_0 (A);
+   |    D = BAR_1 (B);
+   |ENDDO
+
+   is transformed to
+
+   |DO I = 1, N
+   |    J = FOO (ARG_1);
+   |    K = FOO (ARG_2);
+   |    X[I] = J;
+   |    Y[I] = K;
+   |    A = X[I];
+   |    B = Y[I];
+   |    C = BAR_0 (A);
+   |    D = BAR_1 (B);
+   |ENDDO
+
+   and is then distributed to
+
+   |DO I = 1, N
+   |    J = FOO (ARG_1);
+   |    K = FOO (ARG_2);
+   |    X[I] = J;
+   |    Y[I] = K;
+   |ENDDO
+
+   |DO I = 1, N
+   |    A = X[I];
+   |    B = Y[I];
+   |    C = BAR_0 (A);
+   |    D = BAR_1 (B);
+   |ENDDO
+
    Loop distribution is the dual of loop fusion.  It separates statements
    of a loop (or loop nest) into multiple loops (or loop nests) with the
    same loop header.  The major goal is to separate statements which may
@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3.  If not see
 
      1) Seed partitions with specific type statements.  For now we support
 	two types seed statements: statement defining variable used outside
-	of loop; statement storing to memory.
+	of loop; statement storing to memory.  Moreover, for unvectorizable
+	loops, we try to find isomorphic stmts from grouped load and build
+	temp arrays as new seed statements.
      2) Build reduced dependence graph (RDG) for loop to be distributed.
 	The vertices (RDG:V) model all statements in the loop and the edges
 	(RDG:E) model flow and control dependencies between statements.
@@ -103,6 +146,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfganal.h"
 #include "gimple-iterator.h"
 #include "gimplify-me.h"
+#include "gimplify.h"
 #include "stor-layout.h"
 #include "tree-cfg.h"
 #include "tree-ssa-loop-manip.h"
@@ -115,6 +159,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "tree-eh.h"
 #include "gimple-fold.h"
+#include "optabs-tree.h"
+#include <map>
+#include <algorithm>
 
 
 #define MAX_DATAREFS_NUM \
@@ -183,6 +230,52 @@ struct rdg_vertex
 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
 
+/* Results of isomorphic group analysis.  */
+#define UNINITIALIZED	(0)
+#define ISOMORPHIC	(1)
+#define HETEROGENEOUS	(1 << 1)
+#define UNCERTAIN	(1 << 2)
+
+/* Information of a stmt while analyzing isomorphic use in group.  */
+
+typedef struct _group_info
+{
+  gimple *stmt;
+
+  /* True if stmt can be a cut point.  */
+  bool cut_point;
+
+  /* For use_stmt with two rhses, one of which is the lhs of stmt.
+     If the other is unknown to be isomorphic, mark it uncertain.  */
+  bool uncertain;
+
+  /* Searching of isomorphic stmt reaches heterogeneous groups or reaches
+     MEM stmts.  */
+  bool done;
+
+  _group_info ()
+    {
+      stmt = NULL;
+      cut_point = false;
+      uncertain = false;
+      done = false;
+    }
+} *group_info;
+
+/* PAIR of cut points and corresponding profit.  */
+typedef std::pair<vec<gimple *> *, int> stmts_profit;
+
+/* MAP of vector factor VF and corresponding stmts_profit PAIR.  */
+typedef std::map<unsigned, stmts_profit> vf_stmts_profit_map;
+
+/* PAIR of group_num and iteration_num.  We consider rhses from the same
+   group and interation are isomorphic.  */
+typedef std::pair<unsigned, unsigned> group_iteration;
+
+/* An isomorphic stmt is detetmined by lhs of use_stmt, group_num and
+   the iteration_num when we insert this stmt to this map.  */
+typedef std::map<tree, group_iteration> isomer_stmt_lhs;
+
 /* Data dependence type.  */
 
 enum rdg_dep_type
@@ -594,7 +687,8 @@ class loop_distribution
   /* Returns true when PARTITION1 and PARTITION2 access the same memory
      object in RDG.  */
   bool share_memory_accesses (struct graph *rdg,
-			      partition *partition1, partition *partition2);
+			      partition *partition1, partition *partition2,
+			      hash_set<tree> *excluded_arrays);
 
   /* For each seed statement in STARTING_STMTS, this function builds
      partition for it by adding depended statements according to RDG.
@@ -637,8 +731,43 @@ class loop_distribution
 
   /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
      ALIAS_DDRS contains ddrs which need runtime alias check.  */
-  void finalize_partitions (class loop *loop, vec<struct partition *>
-			    *partitions, vec<ddr_p> *alias_ddrs);
+  void finalize_partitions (class loop *loop,
+			    vec<struct partition *> *partitions,
+			    vec<ddr_p> *alias_ddrs, bitmap producers);
+
+  /* Analyze loop form and if it's vectorizable to decide if we need to
+     insert temp arrays to distribute it.  */
+  bool may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
+			       control_dependences *cd);
+
+  /* Reset gimple_uid of GIMPLE_DEBUG and GIMPLE_LABEL to -1.  */
+  void reset_gimple_uid (loop_p loop);
+
+  bool check_loop_vectorizable (loop_p loop);
+
+  /* If loop is not distributed, remove inserted temp arrays.  */
+  void remove_insertion (loop_p loop, struct graph *flow_only_rdg,
+			 bitmap producers, struct partition *partition);
+
+  /* Insert temp arrays if isomorphic computation exists.  Temp arrays will be
+     regarded as SEED_STMTS for building partitions in succeeding processes.  */
+  bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
+			   hash_set<tree> *tmp_array_vars, bitmap producers);
+
+  inline void rebuild_rdg (loop_p loop, struct graph *&rdg,
+			   control_dependences *cd);
+
+  void build_producers (loop_p loop, bitmap producers,
+			vec<gimple *> &transformed);
+
+  void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv,
+		     bitmap cut_points, hash_set <tree> *tmp_array_vars,
+		     bitmap producers);
+
+  /* Fuse PARTITIONS built from inserted temp arrays into one partition,
+     fuse the rest into another.  */
+  void merge_remaining_partitions (vec<struct partition *> *partitions,
+				   bitmap producers);
 
   /* Distributes the code from LOOP in such a way that producer statements
      are placed before consumer statements.  Tries to separate only the
@@ -1852,7 +1981,8 @@ loop_distribution::classify_partition (loop_p loop,
 
 bool
 loop_distribution::share_memory_accesses (struct graph *rdg,
-		       partition *partition1, partition *partition2)
+		       partition *partition1, partition *partition2,
+		       hash_set <tree> *excluded_arrays)
 {
   unsigned i, j;
   bitmap_iterator bi, bj;
@@ -1886,8 +2016,13 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
-	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
-	    return true;
+	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)
+	      /* An exception, if PARTITION1 and PARTITION2 contain the
+		 temp array we inserted, do not merge them.  */
+	      && !excluded_arrays->contains (DR_REF (dr1)))
+	    {
+	      return true;
+	    }
 	}
     }
 
@@ -2848,13 +2983,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions)
     }
 }
 
+void
+loop_distribution::merge_remaining_partitions
+			(vec<struct partition *> *partitions,
+			 bitmap producers)
+{
+  struct partition *partition = NULL;
+  struct partition *p1 = NULL, *p2 = NULL;
+  for (unsigned i = 0; partitions->iterate (i, &partition); i++)
+    {
+      if (bitmap_intersect_p (producers, partition->stmts))
+	{
+	  if (p1 == NULL)
+	    {
+	      p1 = partition;
+	      continue;
+	    }
+	  partition_merge_into (NULL, p1, partition, FUSE_FINALIZE);
+	}
+      else
+	{
+	  if (p2 == NULL)
+	    {
+	      p2 = partition;
+	      continue;
+	    }
+	  partition_merge_into (NULL, p2, partition, FUSE_FINALIZE);
+	}
+      partitions->unordered_remove (i);
+      partition_free (partition);
+      i--;
+    }
+}
+
 void
 loop_distribution::finalize_partitions (class loop *loop,
 					vec<struct partition *> *partitions,
-					vec<ddr_p> *alias_ddrs)
+					vec<ddr_p> *alias_ddrs,
+					bitmap producers)
 {
   unsigned i;
-  struct partition *partition, *a;
+  struct partition *partition;
 
   if (partitions->length () == 1
       || alias_ddrs->length () > 0)
@@ -2886,13 +3055,7 @@ loop_distribution::finalize_partitions (class loop *loop,
       || (loop->inner == NULL
 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
     {
-      a = (*partitions)[0];
-      for (i = 1; partitions->iterate (i, &partition); ++i)
-	{
-	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
-	  partition_free (partition);
-	}
-      partitions->truncate (1);
+      merge_remaining_partitions (partitions, producers);
     }
 
   /* Fuse memset builtins if possible.  */
@@ -2900,157 +3063,1506 @@ loop_distribution::finalize_partitions (class loop *loop,
     fuse_memset_builtins (partitions);
 }
 
-/* Distributes the code from LOOP in such a way that producer statements
-   are placed before consumer statements.  Tries to separate only the
-   statements from STMTS into separate loops.  Returns the number of
-   distributed loops.  Set NB_CALLS to number of generated builtin calls.
-   Set *DESTROY_P to whether LOOP needs to be destroyed.  */
+/* Gimple uids of GIMPLE_DEBUG and GIMPLE_LABEL were changed during function
+   vect_analyze_loop, reset them to -1.  */
 
-int
-loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
-		 control_dependences *cd, int *nb_calls, bool *destroy_p,
-		 bool only_patterns_p)
+void
+loop_distribution::reset_gimple_uid (loop_p loop)
 {
-  ddrs_table = new hash_table<ddr_hasher> (389);
-  struct graph *rdg;
-  partition *partition;
-  int i, nbp;
-
-  *destroy_p = false;
-  *nb_calls = 0;
-  loop_nest.create (0);
-  if (!find_loop_nest (loop, &loop_nest))
+  basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
+  for (int i = 0; i < int (loop->num_nodes); i++)
     {
-      loop_nest.release ();
-      delete ddrs_table;
-      return 0;
+      basic_block bb = bbs[i];
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+	  if (is_gimple_debug (stmt) || gimple_code (stmt) == GIMPLE_LABEL)
+	    {
+	      gimple_set_uid (stmt, -1);
+	    }
+	}
     }
+  free (bbs);
+}
 
-  datarefs_vec.create (20);
-  has_nonaddressable_dataref_p = false;
-  rdg = build_rdg (loop, cd);
-  if (!rdg)
+bool
+loop_distribution::check_loop_vectorizable (loop_p loop)
+{
+  vec_info_shared shared;
+  vect_analyze_loop (loop, &shared, true);
+  loop_vec_info vinfo = loop_vec_info_for_loop (loop);
+  reset_gimple_uid (loop);
+  if (vinfo == NULL)
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file,
-		 "Loop %d not distributed: failed to build the RDG.\n",
-		 loop->num);
+	{
+	  fprintf (dump_file,
+		   "Loop %d no temp array insertion: bad data access pattern,"
+		   " unable to generate loop_vinfo.\n", loop->num);
+	}
+      return false;
+    }
+  if (vinfo->vectorizable)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Loop %d no temp array insertion: original loop"
+			      " can be vectorized without distribution.\n",
+			      loop->num);
+	}
+      delete vinfo;
+      loop->aux = NULL;
+      return false;
+    }
+  if (vinfo->grouped_loads.length () == 0)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Loop %d no temp array insertion: original loop"
+			      " has no grouped loads.\n" , loop->num);
+	}
+      delete vinfo;
+      loop->aux = NULL;
+      return false;
+    }
+  return true;
+}
 
-      loop_nest.release ();
-      free_data_refs (datarefs_vec);
-      delete ddrs_table;
-      return 0;
+inline void
+loop_distribution::rebuild_rdg (loop_p loop, struct graph *&rdg,
+				control_dependences *cd)
+{
+  free_rdg (rdg);
+  rdg = build_rdg (loop, cd);
+  gcc_checking_assert (rdg != NULL);
+}
+
+bool
+loop_distribution::may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
+					   control_dependences *cd)
+{
+  if (!(flag_tree_slp_transpose_vectorize && flag_tree_loop_vectorize))
+    {
+      return false;
     }
 
-  if (datarefs_vec.length () > MAX_DATAREFS_NUM)
+  /* Only loops with two basic blocks HEADER and LATCH are supported.  HEADER
+     is the main body of a LOOP and LATCH is the basic block that controls the
+     LOOP execution.  Size of temp array is determined by loop execution time,
+     so it must be a const.  */
+  tree loop_extent = number_of_latch_executions (loop);
+  if (loop->inner != NULL || loop->num_nodes > 2
+      || rdg->n_vertices > param_slp_max_insns_in_bb
+      || TREE_CODE (loop_extent) != INTEGER_CST)
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
-	fprintf (dump_file,
-		 "Loop %d not distributed: too many memory references.\n",
-		 loop->num);
+	{
+	  fprintf (dump_file, "Loop %d: no temp array insertion: bad loop"
+			      " form.\n", loop->num);
+	}
+      return false;
+    }
 
-      free_rdg (rdg);
-      loop_nest.release ();
-      free_data_refs (datarefs_vec);
-      delete ddrs_table;
-      return 0;
+  if (loop->dont_vectorize)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Loop %d: no temp array insertion: this loop"
+			      " should never be vectorized.\n",
+			      loop->num);
+	}
+      return false;
     }
 
-  data_reference_p dref;
-  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
-    dref->aux = (void *) (uintptr_t) i;
+  /* Do not distribute a LOOP that is able to be vectorized without
+     distribution.  */
+  if (!check_loop_vectorizable (loop))
+    {
+      rebuild_rdg (loop, rdg, cd);
+      return false;
+    }
 
-  if (dump_file && (dump_flags & TDF_DETAILS))
-    dump_rdg (dump_file, rdg);
+  rebuild_rdg (loop, rdg, cd);
+  return true;
+}
 
-  auto_vec<struct partition *, 3> partitions;
-  rdg_build_partitions (rdg, stmts, &partitions);
+/* Return max grouped loads' length if all groupes length satisfy len = 2 ^ n.
+   Otherwise, return 0.  */
 
-  auto_vec<ddr_p> alias_ddrs;
+static unsigned
+get_max_vf (loop_vec_info vinfo)
+{
+  unsigned size = 0;
+  unsigned max = 0;
+  stmt_vec_info stmt_info;
+  unsigned i = 0;
+  FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
+    {
+      size = stmt_info->size;
+      if (!pow2p_hwi (size))
+	{
+	  return 0;
+	}
+      max = size > max ? size : max;
+    }
+  return max;
+}
 
-  auto_bitmap stmt_in_all_partitions;
-  bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
-  for (i = 1; partitions.iterate (i, &partition); ++i)
-    bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
+/* Convert grouped_loads from linked list to vector with length vf.  Init
+   group_info of each stmt in the same group and put then into a vector.  And
+   these vectors consist WORKLISTS.  We will re-analyze a group if it is
+   uncertain, so we regard WORKLISTS as a circular queue.  */
 
-  bool any_builtin = false;
-  bool reduction_in_all = false;
-  FOR_EACH_VEC_ELT (partitions, i, partition)
+static unsigned
+build_queue (loop_vec_info vinfo, unsigned vf,
+	     vec<vec<group_info> *> &worklists)
+{
+  stmt_vec_info stmt_info;
+  unsigned i = 0;
+  group_info ginfo = NULL;
+  vec<group_info> *worklist = NULL;
+  FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
     {
-      reduction_in_all
-	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
-      any_builtin |= partition_builtin_p (partition);
+      unsigned group_size = stmt_info->size;
+      stmt_vec_info c_stmt_info = stmt_info;
+      bool succ = true;
+      while (group_size >= vf)
+	{
+	  vec_alloc (worklist, vf);
+	  for (unsigned j = 0; j < vf; ++j)
+	    {
+	      if (c_stmt_info == NULL)
+		{
+		  worklist->release ();
+		  succ = false;
+		  break;
+		}
+	      ginfo = new _group_info ();
+	      ginfo->stmt = c_stmt_info->stmt;
+	      worklist->safe_push (ginfo);
+	      c_stmt_info = c_stmt_info->next_element;
+	    }
+	  if (!succ)
+	    {
+	      break;
+	    }
+	  worklists.safe_push (worklist);
+	  group_size -= vf;
+	}
     }
+  return worklists.length ();
+}
 
-  /* If we are only distributing patterns but did not detect any,
-     simply bail out.  */
-  if (only_patterns_p
-      && !any_builtin)
+static bool
+check_same_oprand_type (tree op1, tree op2)
+{
+  tree type1 = TREE_TYPE (op1);
+  tree type2 = TREE_TYPE (op2);
+  if (TREE_CODE (type1) != INTEGER_TYPE && TREE_CODE (type1) != REAL_TYPE)
     {
-      nbp = 0;
-      goto ldist_done;
+      return false;
     }
+  return (TREE_CODE (type1) == TREE_CODE (type2)
+	  && TYPE_UNSIGNED (type1) == TYPE_UNSIGNED (type2)
+	  && TYPE_PRECISION (type1) == TYPE_PRECISION (type2));
+}
 
-  /* If we are only distributing patterns fuse all partitions that
-     were not classified as builtins.  This also avoids chopping
-     a loop into pieces, separated by builtin calls.  That is, we
-     only want no or a single loop body remaining.  */
-  struct partition *into;
-  if (only_patterns_p)
+static bool
+bit_field_p (gimple *stmt)
+{
+  unsigned i = 0;
+  auto_vec<data_reference_p, 2> datarefs_vec;
+  data_reference_p dr;
+  if (!find_data_references_in_stmt (NULL, stmt, &datarefs_vec))
     {
-      for (i = 0; partitions.iterate (i, &into); ++i)
-	if (!partition_builtin_p (into))
-	  break;
-      for (++i; partitions.iterate (i, &partition); ++i)
-	if (!partition_builtin_p (partition))
-	  {
-	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
-	    partitions.unordered_remove (i);
-	    partition_free (partition);
-	    i--;
-	  }
+      return true;
     }
 
-  /* Due to limitations in the transform phase we have to fuse all
-     reduction partitions into the last partition so the existing
-     loop will contain all loop-closed PHI nodes.  */
-  for (i = 0; partitions.iterate (i, &into); ++i)
-    if (partition_reduction_p (into))
-      break;
-  for (i = i + 1; partitions.iterate (i, &partition); ++i)
-    if (partition_reduction_p (partition))
-      {
-	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
-	partitions.unordered_remove (i);
-	partition_free (partition);
-	i--;
-      }
-
-  /* Apply our simple cost model - fuse partitions with similar
-     memory accesses.  */
-  for (i = 0; partitions.iterate (i, &into); ++i)
+  FOR_EACH_VEC_ELT (datarefs_vec, i, dr)
     {
-      bool changed = false;
-      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
-	continue;
-      for (int j = i + 1;
-	   partitions.iterate (j, &partition); ++j)
+      if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
+	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
 	{
-	  if (share_memory_accesses (rdg, into, partition))
-	    {
-	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
-	      partitions.unordered_remove (j);
-	      partition_free (partition);
-	      j--;
-	      changed = true;
-	    }
+	  return true;
 	}
-      /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
-         accesses when 1 and 2 have similar accesses but not 0 and 1
-	 then in the next iteration we will fail to consider merging
-	 1 into 0,2.  So try again if we did any merging into 0.  */
-      if (changed)
-	i--;
+    }
+  return false;
+}
+
+static inline bool
+shift_operation (enum tree_code op)
+{
+  return op == LSHIFT_EXPR || op == RSHIFT_EXPR || op == LROTATE_EXPR
+	 || op == RROTATE_EXPR;
+}
+
+/* Return relationship between USE_STMT and the first use_stmt of the group.
+   RHS1 is the lhs of stmt recorded in group_info.  If another rhs of use_stmt
+   is not a constant, return UNCERTAIN and re-check it later.  */
+
+static unsigned
+check_isomorphic (gimple *use_stmt, gimple *&first,
+		  tree rhs1, vec<tree> &hetero_lhs)
+{
+  /* Check same operation.  */
+  enum tree_code rhs_code_first = gimple_assign_rhs_code (first);
+  enum tree_code rhs_code_current = gimple_assign_rhs_code (use_stmt);
+  if (rhs_code_first != rhs_code_current)
+    {
+      return HETEROGENEOUS;
+    }
+  /* For shift operations, oprands should be equal.  */
+  if (shift_operation (rhs_code_current))
+    {
+      tree shift_op_first = gimple_assign_rhs2 (first);
+      tree shift_op_current = gimple_assign_rhs2 (use_stmt);
+      if (!operand_equal_p (shift_op_first, shift_op_current, 0)
+	  || !TREE_CONSTANT (shift_op_first))
+	{
+	  return HETEROGENEOUS;
+	}
+      return ISOMORPHIC;
+    }
+  /* Type convertion expr or assignment.  */
+  if (gimple_num_ops (first) == 2)
+    {
+      return (rhs_code_first == NOP_EXPR || rhs_code_first == CONVERT_EXPR
+	      || rhs_code_first == SSA_NAME) ? ISOMORPHIC : HETEROGENEOUS;
+    }
+  /* We find USE_STMT of LHS of current stmt, denote it as RHS1 of USE_STMT and
+     the other one as RHS2.  Check if calculation of RHS2 is isomorphic with
+     RHS2 of the first USE_STMT.  */
+  tree rhs2_first = gimple_assign_rhs1 (use_stmt) == rhs1
+		    ? gimple_assign_rhs2 (first) : gimple_assign_rhs1 (first);
+  tree rhs2 = gimple_assign_rhs1 (use_stmt) == rhs1
+	      ? gimple_assign_rhs2 (use_stmt) : gimple_assign_rhs1 (use_stmt);
+
+  if (check_same_oprand_type (rhs2_first, rhs2))
+    {
+      if (TREE_CONSTANT (rhs2))
+	{
+	  return ISOMORPHIC;
+	}
+      else if (hetero_lhs.contains (rhs2))
+	{
+	  return HETEROGENEOUS;
+	}
+
+      /* Provisionally set the stmt as uncertain and analyze the whole group
+	 in function CHECK_UNCERTAIN later if all use_stmts are uncertain.  */
+      return UNCERTAIN;
+    }
+  return HETEROGENEOUS;
+}
+
+static bool
+unsupported_operations (gimple *stmt)
+{
+  enum tree_code code = gimple_assign_rhs_code (stmt);
+  if (TREE_CODE_CLASS (code) == tcc_comparison)
+    return false;
+
+  return code == COND_EXPR;
+}
+
+/* Check if the single use_stmt of STMT is isomorphic with the first one's
+   use_stmt in current group.  */
+
+static unsigned
+check_use_stmt (group_info elmt, gimple *&first,
+		vec<gimple *> &tmp_stmts, vec<tree> &hetero_lhs)
+{
+  if (gimple_code (elmt->stmt) != GIMPLE_ASSIGN)
+    {
+      return HETEROGENEOUS;
+    }
+  use_operand_p dummy;
+  tree lhs = gimple_assign_lhs (elmt->stmt);
+  gimple *use_stmt = NULL;
+  single_imm_use (lhs, &dummy, &use_stmt);
+  /* STMTs with three rhs are not supported, e.g., GIMPLE_COND.  */
+  if (use_stmt == NULL || gimple_code (use_stmt) != GIMPLE_ASSIGN
+      || unsupported_operations (use_stmt) || bit_field_p (use_stmt))
+    {
+      return HETEROGENEOUS;
+    }
+  tmp_stmts.safe_push (use_stmt);
+  if (first == NULL)
+    {
+      first = use_stmt;
+      return UNINITIALIZED;
+    }
+  /* Check if current use_stmt and the first menber's use_stmt in the group
+     are of the same type.  */
+  tree lhs_first = gimple_assign_lhs (first);
+  tree use_lhs = gimple_assign_lhs (use_stmt);
+  if (!check_same_oprand_type (lhs_first, use_lhs))
+    {
+      return HETEROGENEOUS;
+    }
+  return check_isomorphic (use_stmt, first, lhs, hetero_lhs);
+}
+
+/* Replace stmt field in group with stmts in TMP_STMTS, and insert their
+   lhs_info to ISOMER_LHS.  */
+
+static void
+update_isomer_lhs (vec<group_info> *group, unsigned group_num,
+		   unsigned iteration, isomer_stmt_lhs &isomer_lhs,
+		   vec<gimple *> tmp_stmts, int &profit,
+		   vec<unsigned> &merged_groups)
+{
+  group_info elmt = NULL;
+  /* Do not insert temp array if isomorphic stmts from grouped load have
+     only casting operations.  Once isomorphic calculation has 3 oprands,
+     such as plus operation, this group can be regarded as cut point.  */
+  bool operated = (gimple_num_ops (tmp_stmts[0]) == 3);
+  /* Do not insert temp arrays if search of iosomophic stmts reaches
+     MEM stmts.  */
+  bool has_vdef = gimple_vdef (tmp_stmts[0]) != NULL;
+  bool merge = false;
+  for (unsigned i = 0; i < group->length (); i++)
+    {
+      elmt = (*group)[i];
+      elmt->stmt = has_vdef ? NULL : tmp_stmts[i];
+      elmt->cut_point = has_vdef ? false : (elmt->cut_point || operated);
+      elmt->uncertain = false;
+      elmt->done = has_vdef;
+      tree lhs = gimple_assign_lhs (tmp_stmts[i]);
+      if (isomer_lhs.find (lhs) != isomer_lhs.end ())
+	{
+	  merge = true;
+	  continue;
+	}
+      isomer_lhs[lhs] = std::make_pair (group_num, iteration);
+    }
+  if (merge)
+    {
+      merged_groups.safe_push (group_num);
+      profit = 0;
+      return;
+    }
+  enum vect_cost_for_stmt kind = scalar_stmt;
+  int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
+  profit = (tmp_stmts.length () - 1) * scalar_cost;
+}
+
+/* Try to find rhs2 in ISOMER_LHS, if all rhs2 were found and their group_num
+   and iteration are same, GROUP is isomorphic.  */
+
+static unsigned
+check_isomorphic_rhs (vec<group_info> *group, vec<gimple *> &tmp_stmts,
+		      isomer_stmt_lhs &isomer_lhs)
+{
+  group_info elmt = NULL;
+  gimple *stmt = NULL;
+  unsigned j = 0;
+  unsigned group_num_tmp = -1u;
+  unsigned iteration_tmp = -1u;
+  tree rhs1 = NULL;
+  tree rhs2 = NULL;
+  unsigned status = UNINITIALIZED;
+  FOR_EACH_VEC_ELT (*group, j, elmt)
+    {
+      rhs1 = gimple_assign_lhs (elmt->stmt);
+      stmt = tmp_stmts[j];
+      rhs2 = (rhs1 == gimple_assign_rhs1 (stmt))
+	     ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
+      isomer_stmt_lhs::iterator iter = isomer_lhs.find (rhs2);
+      if (iter != isomer_lhs.end ())
+	{
+	  if (group_num_tmp == -1u)
+	    {
+	      group_num_tmp = iter->second.first;
+	      iteration_tmp = iter->second.second;
+	      status |= ISOMORPHIC;
+	      continue;
+	    }
+	  unsigned group_num_rhs2 = iter->second.first;
+	  unsigned iteration_rhs2 = iter->second.second;
+	  if (group_num_rhs2 == group_num_tmp
+	      && iteration_rhs2 == iteration_tmp)
+	    {
+	      status |= ISOMORPHIC;
+	      continue;
+	    }
+	  return HETEROGENEOUS;
+	}
+      else
+	{
+	  status |= UNCERTAIN;
+	}
+    }
+  return status;
+}
+
+/* Update group_info for uncertain groups.  */
+
+static void
+update_uncertain_stmts (vec<group_info> *group, unsigned group_num,
+			 unsigned iteration, vec<gimple *> &tmp_stmts)
+{
+  unsigned j = 0;
+  group_info elmt = NULL;
+  FOR_EACH_VEC_ELT (*group, j, elmt)
+    {
+      elmt->uncertain = true;
+      elmt->done = false;
+    }
+}
+
+/* Push stmts in TMP_STMTS into HETERO_LHS.  */
+
+static void
+set_hetero (vec<group_info> *group, vec<tree> &hetero_lhs,
+	    vec<gimple *> &tmp_stmts)
+{
+  group_info elmt = NULL;
+  unsigned i = 0;
+  for (i = 0; i < group->length (); i++)
+    {
+      elmt = (*group)[i];
+      elmt->uncertain = false;
+      elmt->done = true;
+    }
+  gimple *stmt = NULL;
+  FOR_EACH_VEC_ELT (tmp_stmts, i, stmt)
+    {
+      if (stmt != NULL)
+	{
+	  hetero_lhs.safe_push (gimple_assign_lhs (stmt));
+	}
+    }
+}
+
+/* Given an uncertain group, TMP_STMTS are use_stmts of stmts in GROUP.
+   Rhs1 is the lhs of stmt in GROUP, rhs2 is the other rhs of USE_STMT.
+
+   Try to find rhs2 in ISOMER_LHS, if all found rhs2 have same group_num
+   and iteration, this uncertain group is isomorphic.
+
+   If no rhs matched, this GROUP remains uncertain and update group_info.
+
+   Otherwise, this GROUP is heterogeneous.  */
+
+static bool
+check_uncertain (vec<group_info> *group, unsigned group_num,
+		 unsigned iteration, bool &fatal, int &profit,
+		 vec<gimple *> &tmp_stmts, isomer_stmt_lhs &isomer_lhs,
+		 vec<tree> &hetero_lhs, vec<unsigned> &merged_groups)
+{
+  unsigned status = check_isomorphic_rhs (group, tmp_stmts, isomer_lhs);
+
+  switch (status)
+    {
+      case UNCERTAIN:
+	update_uncertain_stmts (group, group_num, iteration, tmp_stmts);
+	return false;
+      case ISOMORPHIC:
+	update_isomer_lhs (group, group_num, iteration, isomer_lhs,
+			   tmp_stmts, profit, merged_groups);
+	return false;
+      default:
+	set_hetero (group, hetero_lhs, tmp_stmts);
+	return true;
+    }
+}
+
+/* Return false if analysis of this group is not finished, e.g., isomorphic or
+   uncertain.  Calculate the profit if vectorized.  */
+
+static bool
+check_group (vec<group_info> *group, unsigned group_num, unsigned iteration,
+	     bool &fatal, int &profit, vec<unsigned> &merged_groups,
+	     isomer_stmt_lhs &isomer_lhs, vec<tree> &hetero_lhs)
+{
+  unsigned j = 0;
+  group_info elmt = NULL;
+  gimple *first = NULL;
+  unsigned res = 0;
+  /* Record single use stmts in TMP_STMTS and decide whether replace stmts in
+     ginfo in succeeding processes.  */
+  auto_vec<gimple *> tmp_stmts;
+  FOR_EACH_VEC_ELT (*group, j, elmt)
+    {
+      if (merged_groups.contains (group_num))
+	{
+	  return true;
+	}
+      res |= check_use_stmt (elmt, first, tmp_stmts, hetero_lhs);
+    }
+
+  /* Update each group member according to RES.  */
+  switch (res)
+    {
+      case ISOMORPHIC:
+	update_isomer_lhs (group, group_num, iteration, isomer_lhs,
+			   tmp_stmts, profit, merged_groups);
+	return false;
+      case UNCERTAIN:
+	return check_uncertain (group, group_num, iteration, fatal, profit,
+				tmp_stmts, isomer_lhs, hetero_lhs,
+				merged_groups);
+      default:
+	set_hetero (group, hetero_lhs, tmp_stmts);
+	return true;
+    }
+}
+
+/* Return true if all analysises are done except uncertain groups.  */
+
+static bool
+end_of_search (vec<vec<group_info> *> &circular_queue,
+	       vec<unsigned> &merged_groups)
+{
+  unsigned i = 0;
+  vec<group_info> *group = NULL;
+  group_info elmt = NULL;
+  FOR_EACH_VEC_ELT (circular_queue, i, group)
+    {
+      if (merged_groups.contains (i))
+	continue;
+      elmt = (*group)[0];
+      if (!elmt->done && !elmt->uncertain)
+	return false;
+    }
+  return true;
+}
+
+/* Push valid stmts to STMTS as cutpoints.  */
+
+static bool
+check_any_cutpoints (vec<vec<group_info> *> &circular_queue,
+		     vec<gimple *> *&stmts, vec<unsigned> &merged_groups)
+{
+  unsigned front = 0;
+  vec<group_info> *group = NULL;
+  group_info elmt = NULL;
+  unsigned max = circular_queue.length () * circular_queue[0]->length ();
+  vec_alloc (stmts, max);
+  while (front < circular_queue.length ())
+    {
+      unsigned i = 0;
+      if (merged_groups.contains (front))
+	{
+	  front++;
+	  continue;
+	}
+      group = circular_queue[front++];
+      FOR_EACH_VEC_ELT (*group, i, elmt)
+	{
+	  if (elmt->stmt != NULL && elmt->done && elmt->cut_point)
+	    {
+	      stmts->safe_push (elmt->stmt);
+	    }
+	}
+    }
+  return stmts->length () != 0;
+}
+
+/* Grouped loads are isomorphic.  Make pair for group number and iteration,
+   map load stmt to this pair.  We set iteration 0 here.  */
+
+static void
+init_isomer_lhs (vec<vec<group_info> *> &groups, isomer_stmt_lhs &isomer_lhs)
+{
+  vec<group_info> *group = NULL;
+  group_info elmt = NULL;
+  unsigned i = 0;
+  FOR_EACH_VEC_ELT (groups, i, group)
+    {
+      unsigned j = 0;
+      FOR_EACH_VEC_ELT (*group, j, elmt)
+	{
+	  isomer_lhs[gimple_assign_lhs (elmt->stmt)] = std::make_pair (i, 0);
+	}
+    }
+}
+
+static int
+load_store_profit (unsigned times, unsigned vf, unsigned tmp)
+{
+  int profit = 0;
+  enum vect_cost_for_stmt kind = scalar_load;
+  int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
+  profit += (times - (times / vf)) * scalar_cost;
+  profit -= tmp / vf * scalar_cost;
+  kind = scalar_store;
+  scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
+  profit -= tmp / vf * scalar_cost;
+  return profit;
+}
+
+/* Breadth first search the graph consisting of define-use chain starting from
+   the circular queue initialized by function BUILD_QUEUE.  Find single use of
+   each stmt in group and check if they are isomorphic.  Isomorphic is defined
+   as same rhs type, same operator, and isomorphic calculation of each rhs
+   starting from load.  If another rhs is uncertain to be isomorphic, put it
+   at the end of circular queue and re-analyze it during the next iteration.
+   If a group shares the same use_stmt with another group, skip one of them in
+   succeedor prcoesses as merged.  Iterate the circular queue until all
+   remianing groups heterogeneous or reaches MEN stmts.  If all other groups
+   have finishes the analysis, and the remaining groups are uncertain,
+   return false to avoid endless loop.  */
+
+bool
+bfs_find_isomer_stmts (vec<vec<group_info> *> &circular_queue,
+		       stmts_profit &profit_pair, unsigned vf,
+		       bool &reach_vdef)
+{
+  isomer_stmt_lhs isomer_lhs;
+  auto_vec<tree> hetero_lhs;
+  auto_vec<unsigned> merged_groups;
+  vec<group_info> *group = NULL;
+  /* True if analysis finishes.  */
+  bool done = false;
+  int profit_sum = 0;
+  vec<gimple *> *stmts = NULL;
+  init_isomer_lhs (circular_queue, isomer_lhs);
+  for (unsigned i = 1; !done; ++i)
+    {
+      unsigned front = 0;
+      bool fatal = false;
+      /* Re-initialize DONE to TRUE while a new iteration begins.  */
+      done = true;
+      while (front < circular_queue.length ())
+	{
+	  int profit = 0;
+	  group = circular_queue[front];
+	  done &= check_group (group, front, i, fatal, profit, merged_groups,
+			       isomer_lhs, hetero_lhs);
+	  if (fatal)
+	    {
+	      return false;
+	    }
+	  profit_sum += profit;
+	  if (profit != 0 && (*group)[0]->stmt == NULL)
+	    {
+	      reach_vdef = true;
+	      return false;
+	    }
+	  ++front;
+	}
+      /* Uncertain result, return.  */
+      if (!done && end_of_search (circular_queue, merged_groups))
+	{
+	  return false;
+	}
+    }
+  if (check_any_cutpoints (circular_queue, stmts, merged_groups))
+    {
+      profit_pair.first = stmts;
+      unsigned loads = circular_queue.length () * circular_queue[0]->length ();
+      profit_pair.second = profit_sum + load_store_profit (loads, vf,
+							   stmts->length ());
+      if (profit_pair.second > 0)
+	{
+	  return true;
+	}
+    }
+  return false;
+}
+
+/* Free memory allocated by ginfo.  */
+
+static void
+free_ginfos (vec<vec<group_info> *> &worklists)
+{
+  vec<group_info> *worklist;
+  unsigned i = 0;
+  while (i < worklists.length ())
+    {
+      worklist = worklists[i++];
+      group_info ginfo;
+      unsigned j = 0;
+      FOR_EACH_VEC_ELT (*worklist, j, ginfo)
+	{
+	  delete ginfo;
+	}
+    }
+}
+
+static void
+release_tmp_stmts (vf_stmts_profit_map &candi_stmts)
+{
+  vf_stmts_profit_map::iterator iter;
+  for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
+    {
+      iter->second.first->release ();
+    }
+}
+
+/* Choose the group of stmt with maximun profit.  */
+
+static bool
+decide_stmts_by_profit (vf_stmts_profit_map &candi_stmts, vec<gimple *> &stmts)
+{
+  vf_stmts_profit_map::iterator iter;
+  int profit = 0;
+  int max = 0;
+  vec<gimple *> *tmp = NULL;
+  for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
+    {
+      profit = iter->second.second;
+      if (profit > max)
+	{
+	  tmp = iter->second.first;
+	  max = profit;
+	}
+      else
+	{
+	  iter->second.first->release ();
+	}
+    }
+  if (max == 0)
+    {
+      release_tmp_stmts (candi_stmts);
+      return false;
+    }
+  unsigned i = 0;
+  gimple *stmt = NULL;
+  FOR_EACH_VEC_ELT (*tmp, i, stmt)
+    {
+      stmts.safe_push (stmt);
+    }
+  release_tmp_stmts (candi_stmts);
+  return stmts.length () != 0;
+}
+
+/* Find isomorphic stmts from grouped loads with vector factor VF.
+
+   Given source code as follows and ignore casting.
+
+   a0 = (a[0] + b[0]) + ((a[4] - b[4]) << 16);
+   a1 = (a[1] + b[1]) + ((a[5] - b[5]) << 16);
+   a2 = (a[2] + b[2]) + ((a[6] - b[6]) << 16);
+   a3 = (a[3] + b[3]) + ((a[7] - b[7]) << 16);
+
+   We get grouped loads in VINFO as
+
+   GROUP_1		GROUP_2
+   _1 = *a		_11 = *b
+   _2 = *(a + 1)	_12 = *(b + 1)
+   _3 = *(a + 2)	_13 = *(b + 2)
+   _4 = *(a + 3)	_14 = *(b + 3)
+   _5 = *(a + 4)	_15 = *(b + 4)
+   _6 = *(a + 5)	_16 = *(b + 5)
+   _7 = *(a + 6)	_17 = *(b + 6)
+   _8 = *(a + 7)	_18 = *(b + 7)
+
+   First we try VF = 8, we get two worklists
+
+   WORKLIST_1		WORKLIST_2
+   _1 = *a		_11 = *b
+   _2 = *(a + 1)	_12 = *(b + 1)
+   _3 = *(a + 2)	_13 = *(b + 2)
+   _4 = *(a + 3)	_14 = *(b + 3)
+   _5 = *(a + 4)	_15 = *(b + 4)
+   _6 = *(a + 5)	_16 = *(b + 5)
+   _7 = *(a + 6)	_17 = *(b + 6)
+   _8 = *(a + 7)	_18 = *(b + 7)
+
+   We find _111 = _1 + _11 and _115 = _5 - _15 are not isomorphic,
+   so we try VF = VF / 2.
+
+   GROUP_1		GROUP_2
+   _1 = *a		_5 = *(a + 4)
+   _2 = *(a + 1)	_6 = *(a + 5)
+   _3 = *(a + 2)	_7 = *(a + 6)
+   _4 = *(a + 3)	_8 = *(a + 7)
+
+   GROUP_3		GROUP_4
+   _11 = *b		_15 = *(b + 4)
+   _12 = *(b + 1)	_16 = *(b + 5)
+   _13 = *(b + 2)	_17 = *(b + 6)
+   _14 = *(b + 3)	_18 = *(b + 7)
+
+   We first analyze group_1, and find all operations are isomorphic, then
+   replace stmts in group_1 with their use_stmts.  Group_2 as well.
+
+   GROUP_1		GROUP_2
+   _111 = _1 + _11	_115 = _5 - _15
+   _112 = _2 + _12	_116 = _6 - _16
+   _113 = _3 + _13	_117 = _7 - _17
+   _114 = _4 + _14	_118 = _8 - _18
+
+   When analyzing group_3 and group_4, we find their use_stmts are the same
+   as group_1 and group_2.  So group_3 is regarded as being merged to group_1
+   and group_4 being merged to group_2.  In future procedures, we will skip
+   group_3 and group_4.
+
+   We repeat such processing until opreations are not isomorphic or searching
+   reaches MEM stmts.  In our given case, searching end up at a0, a1, a2 and
+   a3.  */
+
+static bool
+find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts)
+{
+  unsigned vf = get_max_vf (vinfo);
+  if (vf == 0)
+    {
+      return false;
+    }
+  auto_vec<vec<group_info> *> circular_queue;
+  /* Map of vector factor and corresponding vectorizing profit.  */
+  stmts_profit profit_map;
+  /* Map of cut_points and vector factor.  */
+  vf_stmts_profit_map candi_stmts;
+  bool reach_vdef = false;
+  while (vf > 2)
+    {
+      if (build_queue (vinfo, vf, circular_queue) == 0)
+	{
+	  return false;
+	}
+      if (!bfs_find_isomer_stmts (circular_queue, profit_map, vf, reach_vdef))
+	{
+	  if (reach_vdef)
+	    {
+	      release_tmp_stmts (candi_stmts);
+	      circular_queue.release ();
+	      return false;
+	    }
+	  vf /= 2;
+	  circular_queue.release ();
+	  continue;
+	}
+      candi_stmts[vf] = profit_map;
+      free_ginfos (circular_queue);
+      vf /= 2;
+      circular_queue.release ();
+    }
+  return decide_stmts_by_profit (candi_stmts, stmts);
+}
+
+/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index
+   and all indices are the same.  */
+
+static tree
+find_index (vec<gimple *> seed_stmts)
+{
+  if (seed_stmts.length () == 0)
+    {
+      return NULL;
+    }
+  bool found_index = false;
+  tree index = NULL;
+  unsigned ui = 0;
+  for (ui = 0; ui < seed_stmts.length (); ui++)
+    {
+      if (!gimple_vdef (seed_stmts[ui]))
+	{
+	  return NULL;
+	}
+      tree lhs = gimple_assign_lhs (seed_stmts[ui]);
+      unsigned num_index = 0;
+      while (TREE_CODE (lhs) == ARRAY_REF)
+	{
+	  if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME)
+	    {
+	      num_index++;
+	      if (num_index > 1)
+		return NULL;
+	      if (index == NULL)
+		{
+		  index = TREE_OPERAND (lhs, 1);
+		  found_index = true;
+		}
+	      else if (index != TREE_OPERAND (lhs, 1))
+		return NULL;
+	    }
+	  lhs = TREE_OPERAND (lhs, 0);
+  	}
+      if (!found_index)
+	return NULL;
+    }
+  return index;
+}
+
+/* Check if expression of phi is an increament of a const.  */
+
+static void
+check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc)
+{
+  struct graph_edge *e_phi;
+  for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next)
+    {
+      struct vertex *v_inc = &(rdg->vertices[e_phi->dest]);
+      if (!is_gimple_assign (RDGV_STMT (v_inc))
+	  || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR)
+	{
+	  continue;
+	}
+      tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc));
+      tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc));
+      if (!(integer_onep (rhs1) || integer_onep (rhs2)))
+	{
+	  continue;
+	}
+      struct graph_edge *e_inc;
+      /* find cycle with only two vertices inc and phi: inc <--> phi.  */
+      bool found_cycle = false;
+      for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next)
+	{
+	  if (e_inc->dest == e_phi->src)
+	    {
+	      found_cycle = true;
+	      break;
+	    }
+	}
+      if (!found_cycle)
+	{
+	  continue;
+	}
+      found_inc = true;
+    }
+}
+
+/* Check if phi satisfies form like PHI <0, i>.  */
+
+static inline bool
+iv_check_phi_stmt (gimple *phi_stmt)
+{
+  return gimple_phi_num_args (phi_stmt) == 2
+	 && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0))
+	     || integer_zerop (gimple_phi_arg_def (phi_stmt, 1)));
+}
+
+/* Make sure the iteration varible is a phi.  */
+
+static tree
+get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts)
+{
+  tree index = find_index (seed_stmts);
+  if (index == NULL)
+    {
+      return NULL;
+    }
+  for (int i = 0; i < flow_only_rdg->n_vertices; i++)
+    {
+      struct vertex *v = &(flow_only_rdg->vertices[i]);
+      if (RDGV_STMT (v) != seed_stmts[0])
+	{
+	  continue;
+	}
+      struct graph_edge *e;
+      bool found_phi = false;
+      for (e = v->pred; e; e = e->pred_next)
+	{
+	  struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]);
+	  gimple *phi_stmt = RDGV_STMT (v_phi);
+	  if (gimple_code (phi_stmt) != GIMPLE_PHI
+	      || gimple_phi_result (phi_stmt) != index)
+	    {
+	      continue;
+	    }
+	  if (!iv_check_phi_stmt (phi_stmt))
+	    {
+	      return NULL;
+	    }
+	  /* find inc expr in succ of phi.  */
+	  bool found_inc = false;
+	  check_phi_inc (v_phi, flow_only_rdg, found_inc);
+	  if (!found_inc)
+	    {
+	      return NULL;
+	    }
+	  found_phi = true;
+	  break;
+  	}
+      if (!found_phi)
+	{
+	  return NULL;
+	}
+      break;
+    }
+  return index;
+}
+
+/* Do not distribute loop if vertexes in ROOT_MAP have antidependence with in
+   FLOW_ONLY_RDG.  */
+
+static bool
+check_no_dependency (struct graph *flow_only_rdg, bitmap root_map)
+{
+  bitmap_iterator bi;
+  unsigned ui;
+  auto_vec<unsigned, 16> visited_nodes;
+  auto_bitmap visited_map;
+  EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi)
+    {
+      visited_nodes.safe_push (ui);
+    }
+  for (ui = 0; ui < visited_nodes.length (); ui++)
+    {
+      struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]);
+      struct graph_edge *e;
+      for (e = v->succ; e; e = e->succ_next)
+	{
+	  if (bitmap_bit_p (root_map, e->dest))
+	    {
+	      return false;
+	    }
+	  if (bitmap_bit_p (visited_map, e->dest))
+	    {
+	      continue;
+	    }
+	  visited_nodes.safe_push (e->dest);
+	  bitmap_set_bit (visited_map, e->dest);
+	}
+    }
+  return true;
+}
+
+/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure
+   there is no dependency among those STMT we found.  */
+
+static unsigned
+get_cut_points (struct graph *flow_only_rdg, bitmap cut_points,
+		loop_vec_info vinfo)
+{
+  unsigned n_stmts = 0;
+
+  /* STMTS that may be CUT_POINTS.  */
+  auto_vec<gimple *> stmts;
+  if (!find_isomorphic_stmts (vinfo, stmts))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "No temp array insertion: no isomorphic stmts"
+			      " were found.\n");
+	}
+      return 0;
+    }
+
+  for (int i = 0; i < flow_only_rdg->n_vertices; i++)
+    {
+      if (stmts.contains (RDG_STMT (flow_only_rdg, i)))
+	{
+	  bitmap_set_bit (cut_points, i);
+	}
+    }
+  n_stmts = bitmap_count_bits (cut_points);
+
+  bool succ = check_no_dependency (flow_only_rdg, cut_points);
+  if (!succ)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "No temp array inserted: data dependency"
+			      " among isomorphic stmts.\n");
+	}
+      return 0;
+    }
+  return n_stmts;
+}
+
+static void
+build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi,
+		  poly_uint64 array_extent, tree iv,
+		  hash_set<tree> *tmp_array_vars, vec<gimple *> *transformed)
+{
+  gimple *stmt = RDGV_STMT (v);
+  tree lhs = gimple_assign_lhs (stmt);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "original stmt:\t");
+      print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS);
+    }
+  tree var_ssa = duplicate_ssa_name (lhs, stmt);
+  gimple_assign_set_lhs (stmt, var_ssa);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "changed to:\t");
+      print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS);
+    }
+  gimple_set_uid (gsi_stmt (gsi), -1);
+  tree vect_elt_type = TREE_TYPE (lhs);
+  tree array_type = build_array_type_nelts (vect_elt_type, array_extent);
+  tree array = create_tmp_var (array_type);
+  tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL);
+  tmp_array_vars->add (array_ssa);
+  gimple *store = gimple_build_assign (array_ssa, var_ssa);
+  tree new_vdef = make_ssa_name (gimple_vop (cfun), store);
+  gsi_insert_after (&gsi, store, GSI_NEW_STMT);
+  gimple_set_vdef (store, new_vdef);
+  transformed->safe_push (store);
+  gimple_set_uid (gsi_stmt (gsi), -1);
+  tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL);
+  tmp_array_vars->add (array_ssa2);
+  gimple *load = gimple_build_assign (lhs, array_ssa2);
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "insert stmt:\t");
+      print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS);
+      fprintf (dump_file, " and stmt:\t");
+      print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS);
+    }
+  gimple_set_vuse (load, new_vdef);
+  gsi_insert_after (&gsi, load, GSI_NEW_STMT);
+  gimple_set_uid (gsi_stmt (gsi), -1);
+}
+
+/* Set bitmap PRODUCERS based on vec TRANSFORMED.  */
+
+void
+loop_distribution::build_producers (loop_p loop, bitmap producers,
+				    vec<gimple *> &transformed)
+{
+  auto_vec<gimple *, 10> stmts;
+  stmts_from_loop (loop, &stmts);
+  int i = 0;
+  gimple *stmt = NULL;
+
+  FOR_EACH_VEC_ELT (stmts, i, stmt)
+    {
+      gimple_set_uid (stmt, i);
+    }
+  i = 0;
+  FOR_EACH_VEC_ELT (transformed, i, stmt)
+    {
+      bitmap_set_bit (producers, stmt->uid);
+    }
+}
+
+/* Transform stmt
+
+   A = FOO (ARG_1);
+
+   to
+
+   STMT_1: A1 = FOO (ARG_1);
+   STMT_2: X[I] = A1;
+   STMT_3: A = X[I];
+
+   Producer is STMT_2 who defines the temp array and consumer is
+   STMT_3 who uses the temp array.  */
+
+void
+loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg,
+				 tree iv, bitmap cut_points,
+				 hash_set<tree> *tmp_array_vars,
+				 bitmap producers)
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "=== do insertion ===\n");
+    }
+
+  auto_vec<gimple *> transformed;
+
+  /* Execution times of loop.  */
+  poly_uint64 array_extent
+    = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1;
+
+  basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
+
+  for (int i = 0; i < int (loop->num_nodes); i++)
+    {
+      basic_block bb = bbs[i];
+
+      /* Find all cut points in bb and transform them.  */
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  unsigned j = gimple_uid (gsi_stmt (gsi));
+	  if (bitmap_bit_p (cut_points, j))
+	    {
+	      struct vertex *v = &(flow_only_rdg->vertices[j]);
+	      build_temp_array (v, gsi, array_extent, iv, tmp_array_vars,
+				&transformed);
+	    }
+	}
+    }
+  build_producers (loop, producers, transformed);
+  update_ssa (TODO_update_ssa);
+  free (bbs);
+}
+
+/* After temp array insertion, given stmts
+   STMT_1: M = FOO (ARG_1);
+   STMT_2: X[I] = M;
+   STMT_3: A = X[I];
+   STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next.
+   Replace M with A, and remove STMT_2 and STMT_3.  */
+
+static void
+reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition,
+		     gimple_stmt_iterator &gsi, int j)
+{
+  struct vertex *v = &(flow_only_rdg->vertices[j]);
+  gimple *stmt = RDGV_STMT (v);
+  gimple *prev = stmt->prev;
+  gimple *next = stmt->next;
+  tree n_lhs = gimple_assign_lhs (next);
+  gimple_assign_set_lhs (prev, n_lhs);
+  unlink_stmt_vdef (stmt);
+  if (partition)
+    {
+      bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
+    }
+  gsi_remove (&gsi, true);
+  release_defs (stmt);
+  if (partition)
+    {
+      bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
+    }
+  gsi_remove (&gsi, true);
+}
+
+void
+loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg,
+		  bitmap producers, struct partition *partition)
+{
+  basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
+  for (int i = 0; i < int (loop->num_nodes); i++)
+    {
+      basic_block bb = bbs[i];
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  unsigned j = gimple_uid (gsi_stmt (gsi));
+	  if (bitmap_bit_p (producers, j))
+	    {
+	      reset_gimple_assign (flow_only_rdg, partition, gsi, j);
+	    }
+	}
+    }
+  update_ssa (TODO_update_ssa);
+  free (bbs);
+}
+
+/* Insert temp arrays if isomorphic computation exists.  Temp arrays will be
+   regarded as SEED_STMTS for building partitions in succeeding processes.  */
+
+bool
+loop_distribution::insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
+			hash_set<tree> *tmp_array_vars, bitmap producers)
+{
+  struct graph *flow_only_rdg = build_rdg (loop, NULL);
+  gcc_checking_assert (flow_only_rdg != NULL);
+  tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts);
+  if (iv == NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Loop %d no temp array insertion: failed to get"
+			      " iteration variable.\n", loop->num);
+	}
+      free_rdg (flow_only_rdg);
+      return false;
+  }
+  auto_bitmap cut_points;
+  loop_vec_info vinfo = loop_vec_info_for_loop (loop);
+  unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo);
+  delete vinfo;
+  loop->aux = NULL;
+  if (n_cut_points == 0)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Loop %d no temp array insertion: no cut points"
+			      " found.\n", loop->num);
+	}
+      free_rdg (flow_only_rdg);
+      return false;
+    }
+  do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers);
+  if (dump_enabled_p ())
+    {
+      dump_user_location_t loc = find_loop_location (loop);
+      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:"
+		       " %d temp arrays inserted in Loop %d.\n",
+		       n_cut_points, loop->num);
+    }
+  free_rdg (flow_only_rdg);
+  return true;
+}
+
+static bool find_seed_stmts_for_distribution (class loop *, vec<gimple *> *);
+
+/* Distributes the code from LOOP in such a way that producer statements
+   are placed before consumer statements.  Tries to separate only the
+   statements from STMTS into separate loops.  Returns the number of
+   distributed loops.  Set NB_CALLS to number of generated builtin calls.
+   Set *DESTROY_P to whether LOOP needs to be destroyed.  */
+
+int
+loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
+		 control_dependences *cd, int *nb_calls, bool *destroy_p,
+		 bool only_patterns_p)
+{
+  ddrs_table = new hash_table<ddr_hasher> (389);
+  struct graph *rdg;
+  partition *partition;
+  int i, nbp;
+
+  *destroy_p = false;
+  *nb_calls = 0;
+  loop_nest.create (0);
+  if (!find_loop_nest (loop, &loop_nest))
+    {
+      loop_nest.release ();
+      delete ddrs_table;
+      return 0;
+    }
+
+  datarefs_vec.create (20);
+  has_nonaddressable_dataref_p = false;
+  rdg = build_rdg (loop, cd);
+  if (!rdg)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "Loop %d not distributed: failed to build the RDG.\n",
+		 loop->num);
+
+      loop_nest.release ();
+      free_data_refs (datarefs_vec);
+      delete ddrs_table;
+      return 0;
+    }
+
+  if (datarefs_vec.length () > MAX_DATAREFS_NUM)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file,
+		 "Loop %d not distributed: too many memory references.\n",
+		 loop->num);
+
+      free_rdg (rdg);
+      loop_nest.release ();
+      free_data_refs (datarefs_vec);
+      delete ddrs_table;
+      return 0;
+    }
+
+  /* Try to distribute LOOP if LOOP is simple enough and unable to vectorize.
+     If LOOP has grouped loads, recursively find isomorphic stmts and insert
+     temp arrays, rebuild RDG and call find_seed_stmts_for_distribution
+     to replace STMTS.  */
+
+  hash_set<tree> tmp_array_vars;
+
+  /* STMTs that define those inserted TMP_ARRAYs.  */
+  auto_bitmap producers;
+
+  /* New SEED_STMTS after insertion.  */
+  auto_vec<gimple *> work_list;
+  bool insert_success = false;
+  if (may_insert_temp_arrays (loop, rdg, cd))
+    {
+      if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers))
+	{
+	  if (find_seed_stmts_for_distribution (loop, &work_list))
+	    {
+	      insert_success = true;
+	      stmts = work_list;
+	    }
+	  else
+	    {
+	      remove_insertion (loop, rdg, producers, NULL);
+	    }
+	  rebuild_rdg (loop, rdg, cd);
+	}
+    }
+
+  data_reference_p dref;
+  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
+    dref->aux = (void *) (uintptr_t) i;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    dump_rdg (dump_file, rdg);
+
+  auto_vec<struct partition *, 3> partitions;
+  rdg_build_partitions (rdg, stmts, &partitions);
+
+  auto_vec<ddr_p> alias_ddrs;
+
+  auto_bitmap stmt_in_all_partitions;
+  bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
+  for (i = 1; partitions.iterate (i, &partition); ++i)
+    bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
+
+  bool any_builtin = false;
+  bool reduction_in_all = false;
+  FOR_EACH_VEC_ELT (partitions, i, partition)
+    {
+      reduction_in_all
+	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
+      any_builtin |= partition_builtin_p (partition);
+    }
+
+  /* If we are only distributing patterns but did not detect any,
+     simply bail out.  */
+  if (only_patterns_p
+      && !any_builtin)
+    {
+      nbp = 0;
+      goto ldist_done;
+    }
+
+  /* If we are only distributing patterns fuse all partitions that
+     were not classified as builtins.  This also avoids chopping
+     a loop into pieces, separated by builtin calls.  That is, we
+     only want no or a single loop body remaining.  */
+  struct partition *into;
+  if (only_patterns_p)
+    {
+      for (i = 0; partitions.iterate (i, &into); ++i)
+	if (!partition_builtin_p (into))
+	  break;
+      for (++i; partitions.iterate (i, &partition); ++i)
+	if (!partition_builtin_p (partition))
+	  {
+	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
+	    partitions.unordered_remove (i);
+	    partition_free (partition);
+	    i--;
+	  }
+    }
+
+  /* Due to limitations in the transform phase we have to fuse all
+     reduction partitions into the last partition so the existing
+     loop will contain all loop-closed PHI nodes.  */
+  for (i = 0; partitions.iterate (i, &into); ++i)
+    if (partition_reduction_p (into))
+      break;
+  for (i = i + 1; partitions.iterate (i, &partition); ++i)
+    if (partition_reduction_p (partition))
+      {
+	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
+	partitions.unordered_remove (i);
+	partition_free (partition);
+	i--;
+      }
+
+  /* Apply our simple cost model - fuse partitions with similar
+     memory accesses.  */
+  for (i = 0; partitions.iterate (i, &into); ++i)
+    {
+      bool changed = false;
+      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
+	continue;
+      for (int j = i + 1;
+	   partitions.iterate (j, &partition); ++j)
+	{
+	  if (share_memory_accesses (rdg, into, partition, &tmp_array_vars))
+	    {
+	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
+	      partitions.unordered_remove (j);
+	      partition_free (partition);
+	      j--;
+	      changed = true;
+	    }
+	}
+      /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
+         accesses when 1 and 2 have similar accesses but not 0 and 1
+	 then in the next iteration we will fail to consider merging
+	 1 into 0,2.  So try again if we did any merging into 0.  */
+      if (changed)
+	i--;
     }
 
   /* Put a non-builtin partition last if we need to preserve a reduction.
@@ -3086,7 +4598,7 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
 	}
     }
 
-  finalize_partitions (loop, &partitions, &alias_ddrs);
+  finalize_partitions (loop, &partitions, &alias_ddrs, producers);
 
   /* If there is a reduction in all partitions make sure the last one
      is not classified for builtin code generation.  */
@@ -3104,6 +4616,24 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
     }
 
   nbp = partitions.length ();
+
+  /* If we have inserted TMP_ARRAYs but there is only one partition left in
+     the succeeding processes, remove those inserted TMP_ARRAYs back to the
+     original version.  */
+
+  if (nbp == 1 && insert_success)
+    {
+      struct partition *partition = NULL;
+      partitions.iterate (0, &partition);
+      remove_insertion (loop, rdg, producers, partition);
+      if (dump_enabled_p ())
+	{
+	  dump_user_location_t loc = find_loop_location (loop);
+	  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:"
+			   " unable to distribute loop %d.\n", loop->num);
+	}
+    }
+
   if (nbp == 0
       || (nbp == 1 && !partition_builtin_p (partitions[0]))
       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 899b5608745186c684d8b450bbdb6349c16a6667..9d7f65d1abf88cd8ef80b8ccf2a78b7da9cf3c63 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2516,9 +2516,11 @@ vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
 
    Apply a set of analyses on LOOP, and create a loop_vec_info struct
    for it.  The different analyses will record information in the
-   loop_vec_info struct.  */
+   loop_vec_info struct.  When RESULT_ONLY_P is true, quit analysis
+   if loop is vectorizable, otherwise, do not delete vinfo.*/
 opt_loop_vec_info
-vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+vect_analyze_loop (class loop *loop, vec_info_shared *shared,
+		   bool result_only_p)
 {
   auto_vector_modes vector_modes;
 
@@ -2545,6 +2547,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   unsigned n_stmts = 0;
   machine_mode autodetected_vector_mode = VOIDmode;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
+  /* Loop_vinfo for loop-distribution pass.  */
+  opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL);
   machine_mode next_vector_mode = VOIDmode;
   poly_uint64 lowest_th = 0;
   unsigned vectorized_loops = 0;
@@ -2633,6 +2637,13 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       if (res)
 	{
 	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
+	  /* In loop-distribution pass, we only need to get loop_vinfo, do not
+	     conduct further operations.  */
+	  if (result_only_p)
+	    {
+	      loop->aux = (loop_vec_info) loop_vinfo;
+	      return loop_vinfo;
+	    }
 	  vectorized_loops++;
 
 	  /* Once we hit the desired simdlen for the first time,
@@ -2724,7 +2735,19 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 	}
       else
 	{
-	  delete loop_vinfo;
+	  /* If current analysis shows LOOP is unable to vectorize, loop_vinfo
+	     will be deleted.  If LOOP is under ldist analysis, backup it before
+	     it is deleted and return it if all modes are analyzed and still
+	     fail to vectorize.  */
+	  if (result_only_p && (mode_i == vector_modes.length ()
+	      || autodetected_vector_mode == VOIDmode))
+	    {
+	      fail_loop_vinfo = loop_vinfo;
+	    }
+	  else
+	    {
+	      delete loop_vinfo;
+	    }
 	  if (fatal)
 	    {
 	      gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2773,6 +2796,14 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       return first_loop_vinfo;
     }
 
+  /* Return loop_vinfo for ldist if loop is unvectorizable.  */
+  if (result_only_p && (mode_i == vector_modes.length ()
+      || autodetected_vector_mode == VOIDmode))
+    {
+      loop->aux = (loop_vec_info) fail_loop_vinfo;
+      return fail_loop_vinfo;
+    }
+
   return opt_loop_vec_info::propagate_failure (res);
 }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1c4a6c42124074cb5c6bfd2b44bd46ad211dd1ca..dc8175f007b99c198ebf1556bbb4ae1c28a2da4f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1896,7 +1896,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
 				  enum tree_code);
 extern bool needs_fold_left_reduction_p (tree, tree_code);
 /* Drive for loop analysis stage.  */
-extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *);
+extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *,
+					    bool result_only_p = false);
 extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
 extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 					 tree *, bool);