diff --git a/0029-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch b/0029-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c40886ab41742bb3eb4d4ff9065405ba6fff5065
--- /dev/null
+++ b/0029-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
@@ -0,0 +1,194 @@
+From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
+Date: Thu, 22 Feb 2024 11:27:43 +0300
+Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation
+
+---
+ gcc/config/aarch64/aarch64-simd.md  | 48 +++++++++++++++++++++++++++++
+ gcc/config/aarch64/aarch64.cc       | 15 +++++++++
+ gcc/config/aarch64/aarch64.opt      |  4 +++
+ gcc/config/aarch64/iterators.md     |  3 +-
+ gcc/config/aarch64/predicates.md    | 25 +++++++++++++++
+ gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
+ 6 files changed, 114 insertions(+), 1 deletion(-)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index ee7f0b89c..82f73805f 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -6454,6 +6454,54 @@
+   [(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
+ )
+ 
++;; Use cmlt to replace vector arithmetic operations like this (SImode example):
++;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
++;; TODO: maybe extend to scalar operations or other cm** instructions.
++
++(define_insn "*aarch64_cmlt_as_arith<mode>"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
++	(minus:<V_INT_EQUIV>
++	  (ashift:<V_INT_EQUIV>
++	    (and:<V_INT_EQUIV>
++	      (lshiftrt:<V_INT_EQUIV>
++		(match_operand:VDQHSD 1 "register_operand" "w")
++		(match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++	      (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
++	    (match_operand:VDQHSD 4 "half_size_operand"))
++	  (and:<V_INT_EQUIV>
++	    (lshiftrt:<V_INT_EQUIV>
++	      (match_dup 1)
++	      (match_dup 2))
++	    (match_dup 3))))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
++  [(set_attr "type" "neon_compare_zero")]
++)
++
++;; The helper definition that allows combiner to use the previous pattern.
++
++(define_insn_and_split "*arch64_cmlt_tmp<mode>"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
++	(and:<V_INT_EQUIV>
++	  (lshiftrt:<V_INT_EQUIV>
++	    (match_operand:VDQHSD 1 "register_operand" "w")
++	    (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "#"
++  "&& reload_completed"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
++	(lshiftrt:<V_INT_EQUIV>
++	  (match_operand:VDQHSD 1 "register_operand")
++	  (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
++   (set (match_dup 0)
++	(and:<V_INT_EQUIV>
++	  (match_dup 0)
++	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  ""
++  [(set_attr "type" "neon_compare_zero")]
++)
++
+ (define_insn_and_split "aarch64_cm<optab>di"
+   [(set (match_operand:DI 0 "register_operand" "=w,w,r")
+ 	(neg:DI
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index a3da4ca30..04072ca25 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -14064,6 +14064,21 @@ cost_minus:
+ 	    return true;
+ 	  }
+ 
+	/* Detect aarch64_cmlt_as_arith instruction.  Currently only this pattern
++	   matches the condition. The costs of cmlt and sub instructions
++	   are comparable, so we are not increasing the cost here.  */
++	if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
++	    && GET_CODE (op1) == AND)
++	  {
++	    rtx op0_subop0 = XEXP (op0, 0);
++	    if (rtx_equal_p (op0_subop0, op1))
++	      {
++		rtx lshrt_op = XEXP (op0_subop0, 0);
++		if (GET_CODE (lshrt_op) == LSHIFTRT)
++		  return true;
++	      }
++	  }
++
+ 	/* Look for SUB (extended register).  */
+ 	if (is_a <scalar_int_mode> (mode)
+ 	    && aarch64_rtx_arith_op_extract_p (op1))
+diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
+index a64b927e9..101664c7c 100644
+--- a/gcc/config/aarch64/aarch64.opt
++++ b/gcc/config/aarch64/aarch64.opt
+@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
+ This option is for use with fstack-protector-strong and not for use in
+ user-land code.
+ 
++mcmlt-arith
++Target Var(flag_cmlt_arith) Optimization Init(0)
++Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
++
+ TargetVariable
+ long aarch64_stack_protector_guard_offset = 0
+ 
+diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
+index 26a840d7f..967e6b0b1 100644
+--- a/gcc/config/aarch64/iterators.md
++++ b/gcc/config/aarch64/iterators.md
+@@ -1485,7 +1485,8 @@
+ 			  (V2DI "2s")])
+ 
+ ;; Register suffix narrowed modes for VQN.
+-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
++(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
++			   (V8HI "16b") (V4SI "8h")
+ 			   (V2DI "4s")])
+ 
+ ;; Widened modes of vector modes.
+diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
+index c308015ac..07c14aacb 100644
+--- a/gcc/config/aarch64/predicates.md
++++ b/gcc/config/aarch64/predicates.md
+@@ -49,6 +49,31 @@
+   return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
+ })
+ 
++(define_predicate "half_size_minus_one_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
++})
++
++(define_predicate "half_size_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  return CONST_INT_P (op) && (UINTVAL (op) == size);
++})
++
++(define_predicate "cmlt_arith_mask_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  unsigned long long mask = ((unsigned long long) 1 << size) | 1;
++  return CONST_INT_P (op) && (UINTVAL (op) == mask);
++})
++
+ (define_predicate "subreg_lowpart_operator"
+   (ior (match_code "truncate")
+        (and (match_code "subreg")
+diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
+new file mode 100755
+index 000000000..b4c9a37ff
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
+@@ -0,0 +1,20 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -mcmlt-arith" } */
++
++/* The test checks usage of cmlt insns for arithmetic/logic calculations
++ * in foo ().  It's inspired by sources of x264 codec.  */
++
++typedef unsigned short int uint16_t;
++typedef unsigned int uint32_t;
++
++void foo( uint32_t *a, uint32_t *b)
++{
++  for (unsigned i = 0; i < 4; i++)
++    {
++      uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
++		    &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
++      b[i] = (a[i]+s)^s;
++    }
++}
++
++/* { dg-final { scan-assembler-times {cmlt\t} 1 } }  */
+-- 
+2.33.0
+
diff --git a/0030-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch b/0030-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
new file mode 100644
index 0000000000000000000000000000000000000000..813eba9323f9b19aff134995289462e26eb04dfa
--- /dev/null
+++ b/0030-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
@@ -0,0 +1,560 @@
+From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001
+From: vchernon <chernonog.vyacheslav@huawei.com>
+Date: Wed, 28 Feb 2024 23:05:12 +0800
+Subject: [PATCH 02/18] [rtl-ifcvt] introduce rtl ifcvt enhancements     new
+ option:       -fifcvt-allow-complicated-cmps:         allows ifcvt to deal
+ with complicated cmps like
+
+        cmp reg1 (reg2 + reg3)
+
+        can increase compilation time
+    new param:
+      -param=ifcvt-allow-register-renaming=[0,1,2]
+        1 : allows ifcvt to rename registers in then and else bb
+        2 : allows to rename registers in condition and else/then bb
+        can increase compilation time and register pressure
+---
+ gcc/common.opt                                |   4 +
+ gcc/ifcvt.cc                                  | 291 +++++++++++++++---
+ gcc/params.opt                                |   4 +
+ .../gcc.c-torture/execute/ifcvt-renaming-1.c  |  35 +++
+ gcc/testsuite/gcc.dg/ifcvt-6.c                |  27 ++
+ 5 files changed, 311 insertions(+), 50 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index c7c6bc256..aa00fb7b0 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3691,4 +3691,8 @@ fipa-ra
+ Common Var(flag_ipa_ra) Optimization
+ Use caller save register across calls if possible.
+ 
++fifcvt-allow-complicated-cmps
++Common Var(flag_ifcvt_allow_complicated_cmps) Optimization
++Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
+index 2c1eba312..584db7b55 100644
+--- a/gcc/ifcvt.cc
++++ b/gcc/ifcvt.cc
+@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
+     }
+ 
+   /* Don't even try if the comparison operands or the mode of X are weird.  */
+-  if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
++  if (!flag_ifcvt_allow_complicated_cmps
++      && (cond_complex
++	  || !SCALAR_INT_MODE_P (GET_MODE (x))))
+     return NULL_RTX;
+ 
+   return emit_store_flag (x, code, XEXP (cond, 0),
+@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
+   /* Currently support only simple single sets in test_bb.  */
+   if (!sset
+       || !noce_operand_ok (SET_DEST (sset))
+-      || contains_ccmode_rtx_p (SET_DEST (sset))
++      || (!flag_ifcvt_allow_complicated_cmps
++	  && contains_ccmode_rtx_p (SET_DEST (sset)))
+       || !noce_operand_ok (SET_SRC (sset)))
+     return false;
+ 
+@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
+    in this function.  */
+ 
+ static bool
+-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
++bbs_ok_for_cmove_arith (basic_block bb_a,
++			basic_block bb_b,
++			rtx to_rename,
++			bitmap conflict_regs)
+ {
+   rtx_insn *a_insn;
+   bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
+-
++  bitmap intersections = BITMAP_ALLOC (&reg_obstack);
+   df_ref def;
+   df_ref use;
++  rtx_insn *last_a = last_active_insn (bb_a, FALSE);
+ 
+   FOR_BB_INSNS (bb_a, a_insn)
+     {
+@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+       rtx sset_a = single_set (a_insn);
+ 
+       if (!sset_a)
+-	{
+-	  BITMAP_FREE (bba_sets);
+-	  return false;
+-	}
++	goto end_cmove_arith_check_and_fail;
+       /* Record all registers that BB_A sets.  */
+       FOR_EACH_INSN_DEF (def, a_insn)
+-	if (!(to_rename && DF_REF_REG (def) == to_rename))
++	if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a))
+ 	  bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
+     }
+ 
++  bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
+   rtx_insn *b_insn;
+-
+   FOR_BB_INSNS (bb_b, b_insn)
+     {
+       if (!active_insn_p (b_insn))
+@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+       rtx sset_b = single_set (b_insn);
+ 
+       if (!sset_b)
+-	{
+-	  BITMAP_FREE (bba_sets);
+-	  return false;
+-	}
++	goto end_cmove_arith_check_and_fail;
+ 
+       /* Make sure this is a REG and not some instance
+ 	 of ZERO_EXTRACT or SUBREG or other dangerous stuff.
+@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+       if (MEM_P (SET_DEST (sset_b)))
+ 	gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
+       else if (!REG_P (SET_DEST (sset_b)))
+-	{
+-	  BITMAP_FREE (bba_sets);
+-	  return false;
+-	}
++	goto end_cmove_arith_check_and_fail;
+ 
+-      /* If the insn uses a reg set in BB_A return false.  */
++      /* If the insn uses a reg set in BB_A return false
++	 or try to collect register list for renaming.  */
+       FOR_EACH_INSN_USE (use, b_insn)
+ 	{
+-	  if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
++	  if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
+ 	    {
+-	      BITMAP_FREE (bba_sets);
+-	      return false;
++	      if (param_ifcvt_allow_register_renaming < 1)
++		  goto end_cmove_arith_check_and_fail;
++
++	      /* Those regs should be renamed.  We can't rename CC reg, but
++		 possibly we can provide combined comparison in the future.  */
++	      if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
++		goto end_cmove_arith_check_and_fail;
++	      bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
+ 	    }
+ 	}
+-
+     }
+ 
+   BITMAP_FREE (bba_sets);
++  BITMAP_FREE (intersections);
+   return true;
++
++end_cmove_arith_check_and_fail:
++  BITMAP_FREE (bba_sets);
++  BITMAP_FREE (intersections);
++  return false;
+ }
+ 
+ /* Emit copies of all the active instructions in BB except the last.
+@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
+   return true;
+ }
+ 
++/* This function tries to rename regs that intersect with considered bb
++   inside condition expression.  Condition expression will be moved down
++   if the optimization will be applied, so it is essential to be sure that
++   all intersected registers will be renamed otherwise transformation
++   can't be applied.  Function returns true if renaming was successful
+   and optimization can proceed further.  */
++
++static bool
++noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
++{
++  bool success = true;
++  if (bitmap_empty_p (cond_rename_regs))
++    return true;
++  if (param_ifcvt_allow_register_renaming < 2)
++    return false;
++  df_ref use;
++  rtx_insn *cmp_insn = if_info->cond_earliest;
+  /* A jump instruction as the condition is currently unsupported.  */
++  if (JUMP_P (cmp_insn))
++    return false;
++  rtx_insn *before_cmp = PREV_INSN (cmp_insn);
++  start_sequence ();
++  rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
++  basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
++  FOR_EACH_INSN_USE (use, cmp_insn)
++    {
++      if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
++	{
++	  rtx use_reg = DF_REF_REG (use);
++	  rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
++	  if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
++	    {
++	      end_sequence ();
++	      return false;
++	    }
++	  noce_emit_move_insn (tmp, use_reg);
++	}
++    }
++
++  emit_insn (PATTERN (copy_of_cmp));
++  rtx_insn *seq = get_insns ();
++  unshare_all_rtl_in_chain (seq);
++  end_sequence ();
++
++  emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
++  delete_insn_and_edges (cmp_insn);
++  rtx_insn *insn;
++  FOR_BB_INSNS (cmp_block, insn)
++    df_insn_rescan (insn);
++
++  if_info->cond = noce_get_condition (if_info->jump,
++				      &copy_of_cmp,
++				      if_info->then_else_reversed);
++  if_info->cond_earliest = copy_of_cmp;
++  if_info->rev_cond = NULL_RTX;
++
++  return success;
++}
++
++/* This function tries to rename regs that intersect with considered bb.
+   Return true if the renaming was successful and optimization can
+   proceed further, false otherwise.  */
++static bool
++noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
++{
++  if (bitmap_empty_p (rename_regs))
++    return true;
++  rtx_insn *insn;
++  rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
++  bool res = true;
++  start_sequence ();
++  FOR_BB_INSNS (test_bb, insn)
++    {
++      if (!active_insn_p (insn))
++	continue;
++      /* Only ssets are supported for now.  */
++      rtx sset = single_set (insn);
++      gcc_assert (sset);
++      rtx x = SET_DEST (sset);
++      if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x)))
++	continue;
+      /* No need to rename the dest of the last instruction;
+	 it will be renamed anyway.  */
++      if (insn == last_insn)
++	continue;
++      machine_mode mode = GET_MODE (x);
++      rtx tmp = gen_reg_rtx (mode);
++      if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
++	{
++	  gcc_assert (insn != last_insn);
++	  /* We can generate additional move for such case,
+	     but it will increase register pressure.
++	     For now just stop transformation.  */
++	  rtx result_rtx = SET_DEST (single_set (last_insn));
++	  if (REG_P (result_rtx) && (x != result_rtx))
++	    {
++	      res = false;
++	      break;
++	    }
++	  if (!validate_replace_rtx (x, tmp, insn))
++	    gcc_unreachable ();
++	  noce_emit_move_insn (tmp,x);
++	}
++      set_used_flags (insn);
++      rtx_insn *rename_candidate;
++      for (rename_candidate = NEXT_INSN (insn);
++	   rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
++	   rename_candidate = NEXT_INSN (rename_candidate))
++	{
++	  if (!reg_overlap_mentioned_p (x, rename_candidate))
++	    continue;
++
++	  int replace_res = TRUE;
++	  if (rename_candidate == last_insn)
++	    {
++	      validate_replace_src_group (x, tmp, rename_candidate);
++	      replace_res = apply_change_group ();
++	    }
++	  else
++	    replace_res = validate_replace_rtx (x, tmp, rename_candidate);
++	  gcc_assert (replace_res);
++	  set_used_flags (rename_candidate);
++	}
++      set_used_flags (x);
++      set_used_flags (tmp);
++    }
++    rtx_insn *seq = get_insns ();
++    unshare_all_rtl_in_chain (seq);
++    end_sequence ();
++    emit_insn_before_setloc (seq, first_active_insn (test_bb),
++			     INSN_LOCATION (first_active_insn (test_bb)));
++    FOR_BB_INSNS (test_bb, insn)
++      df_insn_rescan (insn);
++  return res;
++}
++
+ /* Try more complex cases involving conditional_move.  */
+ 
+ static int
+@@ -2185,11 +2331,30 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
+ 	  std::swap (then_bb, else_bb);
+ 	}
+     }
+-
++  bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
++  bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+   if (then_bb && else_bb
+-      && (!bbs_ok_for_cmove_arith (then_bb, else_bb,  if_info->orig_x)
+-	  || !bbs_ok_for_cmove_arith (else_bb, then_bb,  if_info->orig_x)))
+-    return FALSE;
++      && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
++				   if_info->orig_x,
++				   then_bb_rename_regs)
++	  || !bbs_ok_for_cmove_arith (else_bb, then_bb,
++				      if_info->orig_x,
++				      else_bb_rename_regs)))
++    {
++      BITMAP_FREE (then_bb_rename_regs);
++      BITMAP_FREE (else_bb_rename_regs);
++      return FALSE;
++    }
++  bool prepass_renaming = noce_rename_regs_in_bb (then_bb,
++						  then_bb_rename_regs)
++			  && noce_rename_regs_in_bb (else_bb,
++						     else_bb_rename_regs);
++
++  BITMAP_FREE (then_bb_rename_regs);
++  BITMAP_FREE (else_bb_rename_regs);
++
++  if (!prepass_renaming)
++   return FALSE;
+ 
+   start_sequence ();
+ 
+@@ -3072,7 +3237,8 @@ noce_operand_ok (const_rtx op)
+ 
+ static bool
+ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
+-			      unsigned int *cost, bool *simple_p)
++			     unsigned int *cost, bool *simple_p,
++			     bitmap cond_rename_regs)
+ {
+   if (!test_bb)
+     return false;
+@@ -3112,8 +3278,9 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
+   rtx_insn *prev_last_insn = PREV_INSN (last_insn);
+   gcc_assert (prev_last_insn);
+ 
+-  /* For now, disallow setting x multiple times in test_bb.  */
+-  if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
++  if (REG_P (x)
++      && reg_set_between_p (x, first_insn, prev_last_insn)
++      && param_ifcvt_allow_register_renaming < 1)
+     return false;
+ 
+   bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
+@@ -3125,25 +3292,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
+   rtx_insn *insn;
+   FOR_BB_INSNS (test_bb, insn)
+     {
+-      if (insn != last_insn)
+-	{
+-	  if (!active_insn_p (insn))
+-	    continue;
++      if (insn == last_insn)
++	continue;
++      if (!active_insn_p (insn))
++	continue;
+ 
+-	  if (!insn_valid_noce_process_p (insn, cc))
+-	    goto free_bitmap_and_fail;
++      if (!insn_valid_noce_process_p (insn, cc))
++	goto free_bitmap_and_fail;
+ 
+-	  rtx sset = single_set (insn);
+-	  gcc_assert (sset);
++      rtx sset = single_set (insn);
++      gcc_assert (sset);
+ 
+-	  if (contains_mem_rtx_p (SET_SRC (sset))
+-	      || !REG_P (SET_DEST (sset))
+-	      || reg_overlap_mentioned_p (SET_DEST (sset), cond))
+-	    goto free_bitmap_and_fail;
++      if (contains_mem_rtx_p (SET_SRC (sset))
++	  || !REG_P (SET_DEST (sset)))
++	goto free_bitmap_and_fail;
+ 
+-	  potential_cost += pattern_cost (sset, speed_p);
+-	  bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
++      if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
++	{
++	  if (param_ifcvt_allow_register_renaming < 1)
++	    goto free_bitmap_and_fail;
++	  rtx sset_dest = SET_DEST (sset);
++	  if (REG_P (sset_dest)
++	      && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
++	    bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
++	  else
++	    goto free_bitmap_and_fail;
+ 	}
++	potential_cost += pattern_cost (sset, speed_p);
++	if (SET_DEST (sset) != SET_DEST (last_set))
++	  bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+     }
+ 
+   /* If any of the intermediate results in test_bb are live after test_bb
+@@ -3777,15 +3954,29 @@ noce_process_if_block (struct noce_if_info *if_info)
+ 
+   bool speed_p = optimize_bb_for_speed_p (test_bb);
+   unsigned int then_cost = 0, else_cost = 0;
++  bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
+   if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
+-				    &if_info->then_simple))
+-    return false;
++				    &if_info->then_simple, cond_rename_regs))
++    {
++      BITMAP_FREE (cond_rename_regs);
++      return false;
++    }
+ 
+   if (else_bb
+       && !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
+-				       &if_info->else_simple))
+-    return false;
++				       &if_info->else_simple, cond_rename_regs))
++    {
++      BITMAP_FREE (cond_rename_regs);
++      return false;
++    }
+ 
++  if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
++    {
++      BITMAP_FREE (cond_rename_regs);
++      return false;
++    }
++  BITMAP_FREE (cond_rename_regs);
++  cond = if_info->cond;
+   if (speed_p)
+     if_info->original_cost += average_cost (then_cost, else_cost,
+ 					    find_edge (test_bb, then_bb));
+@@ -5823,12 +6014,13 @@ if_convert (bool after_combine)
+ {
+   basic_block bb;
+   int pass;
+-
+   if (optimize == 1)
+     {
+       df_live_add_problem ();
+       df_live_set_all_dirty ();
+     }
++  free_dominance_info (CDI_DOMINATORS);
++  cleanup_cfg (CLEANUP_EXPENSIVE);
+ 
+   /* Record whether we are after combine pass.  */
+   ifcvt_after_combine = after_combine;
+@@ -5933,7 +6125,6 @@ rest_of_handle_if_conversion (void)
+ 	  dump_reg_info (dump_file);
+ 	  dump_flow_info (dump_file, dump_flags);
+ 	}
+-      cleanup_cfg (CLEANUP_EXPENSIVE);
+       if_convert (false);
+       if (num_updated_if_blocks)
+ 	/* Get rid of any dead CC-related instructions.  */
+diff --git a/gcc/params.opt b/gcc/params.opt
+index d2196dc68..ba87f820b 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -669,6 +669,10 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
+ Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
+ Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+ 
++-param=ifcvt-allow-register-renaming=
++Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
++Allow RTL if-conversion pass to aggressively rename registers in basic blocks.  Sometimes additional moves will be created.
++
+ -param=max-sched-extend-regions-iters=
+ Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
+ The maximum number of iterations through CFG to extend regions.
+diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
+new file mode 100644
+index 000000000..65c4d4140
+--- /dev/null
++++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
+@@ -0,0 +1,35 @@
++
++extern void abort(void);
++
++__attribute__ ((noinline))
++int foo (int x, int y, int z, int a, int b)
++{
++  if (a < 2) {
++      if (a == 0) {
++	  if (x - y < 0)
++	    x = x - y + z;
++	  else
++	    x = x - y;
++	}
++      else {
++	  if (x + y >= z)
++	    x = x + y - z;
++	  else
++	    x = x + y;
++	}
++    }
++  return x;
++}
++
++int main(void) {
++  if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2
++    abort ();
++  if (foo (50,10,7,0,1) != 40) // x - y = 40
++    abort ();
++  if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8
++    abort ();
++  if (foo (5,10,70,1,1) != 15) // x + y = 15
++    abort ();
++  return 0;
++}
++
+diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c
+new file mode 100644
+index 000000000..be9a67b3f
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/ifcvt-6.c
+@@ -0,0 +1,27 @@
++/* { dg-do compile { target { aarch64*-*-* } } } */
++/* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 -fifcvt-allow-complicated-cmps" } */
++
++typedef unsigned int uint16_t;
++
++uint16_t
++foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a,
++     uint16_t b, uint16_t c, uint16_t d) {
++  int i = 1;
++  int j = 1;
++  if (a > b) {
++      j = x;
++      if (b > c)
++	i = y;
++      else
++	i = z;
++    }
++  else {
++      j = y;
++      if (c > d)
++	i = z;
++    }
++  return i * j;
++}
++
++/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */
++
+-- 
+2.33.0
+
diff --git a/0031-Perform-early-if-conversion-of-simple-arithmetic.patch b/0031-Perform-early-if-conversion-of-simple-arithmetic.patch
new file mode 100644
index 0000000000000000000000000000000000000000..14de678e3eb6cf0242eb59aaeecc2dd340c34c39
--- /dev/null
+++ b/0031-Perform-early-if-conversion-of-simple-arithmetic.patch
@@ -0,0 +1,109 @@
+From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
+Date: Thu, 12 Jan 2023 14:52:49 +0300
+Subject: [PATCH 03/18] Perform early if-conversion of simple arithmetic
+
+---
+ gcc/common.opt                      |  4 ++++
+ gcc/match.pd                        | 25 +++++++++++++++++++
+ gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
+ 3 files changed, 66 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index aa00fb7b0..dac477c04 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1821,6 +1821,10 @@ fif-conversion2
+ Common Var(flag_if_conversion2) Optimization
+ Perform conversion of conditional jumps to conditional execution.
+ 
++fif-conversion-gimple
++Common Var(flag_if_conversion_gimple) Optimization
++Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
++
+ fstack-reuse=
+ Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
+ -fstack-reuse=[all|named_vars|none]	Set stack reuse level for local variables.
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 6f24d5079..3cbaf2a5b 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
+   )
+  )
+ )
++
++(if (flag_if_conversion_gimple)
++ (for simple_op (plus minus bit_and bit_ior bit_xor)
++  (simplify
++   (cond @0 (simple_op @1 INTEGER_CST@2) @1)
++   (switch
++    /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
++    (if (integer_onep (@2))
++     (simple_op @1 (convert (convert:boolean_type_node @0))))
++    /* a = cond ? a + powerof2cst : a ->
++       a = a + ((int) cond) << log2 (powerof2cst) */
++    (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
++     (with
++      {
++	tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
++      }
++      (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
++			    { shift; })
++      )
++     )
++    )
++   )
++  )
++ )
++)
+ #endif
+ 
+ #if GIMPLE
+diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
+new file mode 100644
+index 000000000..0f7c87e5c
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
+@@ -0,0 +1,37 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
++
++int test_int (int optimizable_int) {
++    if (optimizable_int > 5)
++	++optimizable_int;
++    return optimizable_int;
++}
++
++int test_int_pow2 (int optimizable_int_pow2) {
++    if (optimizable_int_pow2 <= 4)
++	optimizable_int_pow2 += 1024;
++    return optimizable_int_pow2;
++}
++
++int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
++    if (not_optimizable_int_non_pow2 == 1)
++	not_optimizable_int_non_pow2 += 513;
++    return not_optimizable_int_non_pow2;
++}
++
++float test_float (float not_optimizable_float) {
++    if (not_optimizable_float > 5)
++	not_optimizable_float += 1;
++    return not_optimizable_float;
++}
++
++/* Expecting if-else block in test_float and test_int_non_pow2 only. */
++/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
++/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
++/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
++/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
++/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
++
++/* Expecting shifted result only for optimizable_int_pow2. */
++/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
++/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
+-- 
+2.33.0
+
diff --git a/0032-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch b/0032-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9b2be003020a7f8af73007a10dbdccc38d7935a9
--- /dev/null
+++ b/0032-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
@@ -0,0 +1,252 @@
+From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
+Date: Tue, 24 Jan 2023 16:43:40 +0300
+Subject: [PATCH 04/18] Add option to allow matching uaddsub overflow for widen
+ ops too.
+
+---
+ gcc/common.opt                 |   5 ++
+ gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
+ gcc/tree-ssa-math-opts.cc      |  43 ++++++++--
+ 3 files changed, 184 insertions(+), 7 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index dac477c04..39c90604e 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3106,6 +3106,11 @@ freciprocal-math
+ Common Var(flag_reciprocal_math) SetByCombined Optimization
+ Same as -fassociative-math for expressions which include division.
+ 
++fuaddsub-overflow-match-all
++Common Var(flag_uaddsub_overflow_match_all)
++Match unsigned add/sub overflow even if the target does not support
++the corresponding instruction.
++
+ ; Nonzero means that unsafe floating-point math optimizations are allowed
+ ; for the sake of speed.  IEEE compliance is not guaranteed, and operations
+ ; are allowed to assume that their arguments and results are "normal"
+diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
+new file mode 100644
+index 000000000..96c26d308
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/uaddsub.c
+@@ -0,0 +1,143 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
++#include <stdint.h>
++
++typedef unsigned __int128 uint128_t;
++typedef struct uint256_t
++{
++  uint128_t lo;
++  uint128_t hi;
++} uint256_t;
++
++uint16_t add16 (uint8_t a, uint8_t b)
++{
++  uint8_t tmp = a + b;
++  uint8_t overflow = 0;
++  if (tmp < a)
++    overflow = 1;
++
++  uint16_t res = overflow;
++  res <<= 8;
++  res += tmp;
++  return res;
++}
++
++uint32_t add32 (uint16_t a, uint16_t b)
++{
++  uint16_t tmp = a + b;
++  uint16_t overflow = 0;
++  if (tmp < a)
++    overflow = 1;
++
++  uint32_t res = overflow;
++  res <<= 16;
++  res += tmp;
++  return res;
++}
++
++uint64_t add64 (uint32_t a, uint32_t b)
++{
++  uint32_t tmp = a + b;
++  uint32_t overflow = 0;
++  if (tmp < a)
++    overflow = 1;
++
++  uint64_t res = overflow;
++  res <<= 32;
++  res += tmp;
++  return res;
++}
++
++uint128_t add128 (uint64_t a, uint64_t b)
++{
++  uint64_t tmp = a + b;
++  uint64_t overflow = 0;
++  if (tmp < a)
++    overflow = 1;
++
++  uint128_t res = overflow;
++  res <<= 64;
++  res += tmp;
++  return res;
++}
++
++uint256_t add256 (uint128_t a, uint128_t b)
++{
++  uint128_t tmp = a + b;
++  uint128_t overflow = 0;
++  if (tmp < a)
++    overflow = 1;
++
++  uint256_t res;
++  res.hi = overflow;
++  res.lo = tmp;
++  return res;
++}
++
++uint16_t sub16 (uint8_t a, uint8_t b)
++{
++  uint8_t tmp = a - b;
++  uint8_t overflow = 0;
++  if (tmp > a)
++    overflow = -1;
++
++  uint16_t res = overflow;
++  res <<= 8;
++  res += tmp;
++  return res;
++}
++
++uint32_t sub32 (uint16_t a, uint16_t b)
++{
++  uint16_t tmp = a - b;
++  uint16_t overflow = 0;
++  if (tmp > a)
++    overflow = -1;
++
++  uint32_t res = overflow;
++  res <<= 16;
++  res += tmp;
++  return res;
++}
++
++uint64_t sub64 (uint32_t a, uint32_t b)
++{
++  uint32_t tmp = a - b;
++  uint32_t overflow = 0;
++  if (tmp > a)
++    overflow = -1;
++
++  uint64_t res = overflow;
++  res <<= 32;
++  res += tmp;
++  return res;
++}
++
++uint128_t sub128 (uint64_t a, uint64_t b)
++{
++  uint64_t tmp = a - b;
++  uint64_t overflow = 0;
++  if (tmp > a)
++    overflow = -1;
++
++  uint128_t res = overflow;
++  res <<= 64;
++  res += tmp;
++  return res;
++}
++
++uint256_t sub256 (uint128_t a, uint128_t b)
++{
++  uint128_t tmp = a - b;
++  uint128_t overflow = 0;
++  if (tmp > a)
++    overflow = -1;
++
++  uint256_t res;
++  res.hi = overflow;
++  res.lo = tmp;
++  return res;
++}
++
++/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
++/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
+index 232e903b0..55d6ee8ae 100644
+--- a/gcc/tree-ssa-math-opts.cc
++++ b/gcc/tree-ssa-math-opts.cc
+@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
+     }
+ }
+ 
++/* Check whether OP is supported in a mode wider than MODE on the target.  */
++
++static bool
++wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
++{
++  machine_mode wider_mode;
++  FOR_EACH_WIDER_MODE (wider_mode, mode)
++    {
++      machine_mode next_mode;
++      if (optab_handler (op, wider_mode) != CODE_FOR_nothing
++	  || (op == smul_optab
++	      && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
++	      && (find_widening_optab_handler ((unsignedp
++						? umul_widen_optab
++						: smul_widen_optab),
++						next_mode, mode))))
++	return true;
++    }
++
++  return false;
++}
+ 
+ /* Helper function of match_arith_overflow.  For MUL_OVERFLOW, if we have
+    a check for non-zero like:
+@@ -3903,15 +3924,22 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
+ 		       || code == MINUS_EXPR
+ 		       || code == MULT_EXPR
+ 		       || code == BIT_NOT_EXPR);
++  int unsignedp = TYPE_UNSIGNED (type);
+   if (!INTEGRAL_TYPE_P (type)
+-      || !TYPE_UNSIGNED (type)
+-      || has_zero_uses (lhs)
+-      || (code != PLUS_EXPR
+-	  && code != MULT_EXPR
+-	  && optab_handler (code == MINUS_EXPR ? usubv4_optab : uaddv4_optab,
+-			    TYPE_MODE (type)) == CODE_FOR_nothing))
++      || !unsignedp
++      || has_zero_uses (lhs))
+     return false;
+ 
++  if (code == PLUS_EXPR || code == MINUS_EXPR)
++    {
++      machine_mode mode = TYPE_MODE (type);
++      optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
++      if (optab_handler (op, mode) == CODE_FOR_nothing
++	  && (!flag_uaddsub_overflow_match_all
++	      || !wider_optab_check_p (op, mode, unsignedp)))
++	return false;
++    }
++
+   tree rhs1 = gimple_assign_rhs1 (stmt);
+   tree rhs2 = gimple_assign_rhs2 (stmt);
+   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
+@@ -3986,7 +4014,8 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
+       || (code != MULT_EXPR && (code == BIT_NOT_EXPR ? use_seen : !use_seen))
+       || (code == PLUS_EXPR
+ 	  && optab_handler (uaddv4_optab,
+-			    TYPE_MODE (type)) == CODE_FOR_nothing)
++			    TYPE_MODE (type)) == CODE_FOR_nothing
++	  && !flag_uaddsub_overflow_match_all)
+       || (code == MULT_EXPR
+ 	  && optab_handler (cast_stmt ? mulv4_optab : umulv4_optab,
+ 			    TYPE_MODE (type)) == CODE_FOR_nothing))
+-- 
+2.33.0
+
diff --git a/0033-Match-double-sized-mul-pattern.patch b/0033-Match-double-sized-mul-pattern.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9d4e56f24732f19ce3d77e9c6bea04549b45c099
--- /dev/null
+++ b/0033-Match-double-sized-mul-pattern.patch
@@ -0,0 +1,488 @@
+From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
+Date: Wed, 25 Jan 2023 15:04:07 +0300
+Subject: [PATCH 05/18] Match double sized mul pattern
+
+---
+ gcc/match.pd                              | 136 +++++++++++++++++++++
+ gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
+ gcc/testsuite/gcc.dg/double_sized_mul-2.c |  62 ++++++++++
+ gcc/tree-ssa-math-opts.cc                 |  80 ++++++++++++
+ 4 files changed, 419 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c
+
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 3cbaf2a5b..61866cb90 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -7895,3 +7895,139 @@ and,
+ 	       == TYPE_UNSIGNED (TREE_TYPE (@3))))
+        && single_use (@4)
+        && single_use (@5))))
++
++/* Match multiplication with double sized result.
++
++   Consider the following calculations:
++   arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
++	       * (2^(bit_size/2) * arg1_hi + arg1_lo)
++   arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
++	       + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
++	       + arg0_lo * arg1_lo
++
++   The hi*hi and lo*lo products each fit in bit_size bits, thus they are
++   placed in the high and low parts of the result respectively.
++
++   The sum of the mixed products may overflow, so we need a detection for that.
++   Also it has a bit_size/2 offset, thus it intersects with both high and low
++   parts of result.  Overflow detection constant is bit_size/2 due to this.
++
++   With this info:
++   arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
++	       + 2^(bit_size/2) * middle
++	       + 2^bit_size * possible_middle_overflow
++	       + arg0_lo * arg1_lo
++   arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
++	       + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
++	       + arg0_lo * arg1_lo
++   arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
++	       +	       possible_middle_overflow)
++	       + 2^(bit_size/2) * middle_lo
++	       + arg0_lo * arg1_lo
++
++   The last sum can produce overflow for the high result part.  With this:
++   arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
++	       +	       possible_res_lo_overflow + middle_hi)
++	       + res_lo
++	       = res_hi + res_lo
++
++   This formula is quite big to fit into one match pattern with all of the
++   combinations of terms inside it.  There are many helpers for better code
++   readability.
++
++   The simplification is anchored on res_hi, assuming that computing res_lo
++   alone is not a practical use case for such calculations.
++
++   Overflow handling is done via matching complex calculations:
++   the realpart and imagpart are quite handy here.  */
++/* Match low and high parts of the argument.  */
++(match (double_size_mul_arg_lo @0 @1)
++ (bit_and @0 INTEGER_CST@1)
++  (if (wi::to_wide (@1)
++       == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
++(match (double_size_mul_arg_hi @0 @1)
++ (rshift @0 INTEGER_CST@1)
++  (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
++
++/* Match various argument parts products.  */
++(match (double_size_mul_lolo @0 @1)
++ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
++  (if (single_use (@4))))
++(match (double_size_mul_hihi @0 @1)
++ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
++  (if (single_use (@4))))
++(match (double_size_mul_lohi @0 @1)
++ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
++  (if (single_use (@4))))
++
++/* Match complex middle sum.  */
++(match (double_size_mul_middle_complex @0 @1)
++ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
++  (if (num_imm_uses (@2) == 2)))
++
++/* Match real middle results.  */
++(match (double_size_mul_middle @0 @1)
++ (realpart@2 (double_size_mul_middle_complex @0 @1))
++  (if (num_imm_uses (@2) == 2)))
++(match (double_size_mul_middleres_lo @0 @1)
++ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
++  (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
++       && single_use (@3))))
++(match (double_size_mul_middleres_hi @0 @1)
++ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
++  (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
++       && single_use (@3))))
++
++/* Match low result part.  */
++/* Number of uses may be < 2 in case when we are interested in
++   high part only.  */
++(match (double_size_mul_res_lo_complex @0 @1)
++ (IFN_ADD_OVERFLOW:c@2
++  (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
++  (if (num_imm_uses (@2) <= 2)))
++(match (double_size_mul_res_lo @0 @1)
++ (realpart (double_size_mul_res_lo_complex @0 @1)))
++
++/* Match overflow terms.  */
++(match (double_size_mul_overflow_check_lo @0 @1 @5)
++ (convert@4 (ne@3
++  (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
++  (if (single_use (@2) && single_use (@3) && single_use (@4))))
++(match (double_size_mul_overflow_check_hi @0 @1)
++ (lshift@6 (convert@5 (ne@4
++  (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
++	   INTEGER_CST@2)
++  (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
++       && single_use (@3) && single_use (@4) && single_use (@5)
++       && single_use (@6))))
++
++/* Match all possible permutations for high result part calculations.  */
++(for op1 (double_size_mul_hihi
++	  double_size_mul_overflow_check_hi
++	  double_size_mul_middleres_hi)
++     op2 (double_size_mul_overflow_check_hi
++	  double_size_mul_middleres_hi
++	  double_size_mul_hihi)
++     op3 (double_size_mul_middleres_hi
++	  double_size_mul_hihi
++	  double_size_mul_overflow_check_hi)
++ (match (double_size_mul_candidate @0 @1 @2 @3)
++  (plus:c@2
++   (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
++   (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
++    (if (single_use (@4) && single_use (@5))))
++ (match (double_size_mul_candidate @0 @1 @2 @3)
++  (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
++   (plus:c@4 (op1:c @0 @1)
++    (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
++     (if (single_use (@4) && single_use (@5))))
++ (match (double_size_mul_candidate @0 @1 @2 @3)
++  (plus:c@2 (op1:c @0 @1)
++   (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
++    (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
++     (if (single_use (@4) && single_use (@5))))
++ (match (double_size_mul_candidate @0 @1 @2 @3)
++  (plus:c@2 (op1:c @0 @1)
++   (plus:c@4 (op2:c @0 @1)
++    (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
++     (if (single_use (@4) && single_use (@5)))))
+diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
+new file mode 100644
+index 000000000..4d475cc8a
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
+@@ -0,0 +1,141 @@
++/* { dg-do compile } */
++/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
++   proper overflow detection in some cases.  */
++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
++#include <stdint.h>
++
++typedef unsigned __int128 uint128_t;
++
++uint16_t mul16 (uint8_t a, uint8_t b)
++{
++  uint8_t a_lo = a & 0xF;
++  uint8_t b_lo = b & 0xF;
++  uint8_t a_hi = a >> 4;
++  uint8_t b_hi = b >> 4;
++  uint8_t lolo = a_lo * b_lo;
++  uint8_t lohi = a_lo * b_hi;
++  uint8_t hilo = a_hi * b_lo;
++  uint8_t hihi = a_hi * b_hi;
++  uint8_t middle = hilo + lohi;
++  uint8_t middle_hi = middle >> 4;
++  uint8_t middle_lo = middle << 4;
++  uint8_t res_lo = lolo + middle_lo;
++  uint8_t res_hi = hihi + middle_hi;
++  res_hi += (res_lo < middle_lo ? 1 : 0);
++  res_hi += (middle < hilo ? 0x10 : 0);
++  uint16_t res = ((uint16_t) res_hi) << 8;
++  res += res_lo;
++  return res;
++}
++
++uint32_t mul32 (uint16_t a, uint16_t b)
++{
++  uint16_t a_lo = a & 0xFF;
++  uint16_t b_lo = b & 0xFF;
++  uint16_t a_hi = a >> 8;
++  uint16_t b_hi = b >> 8;
++  uint16_t lolo = a_lo * b_lo;
++  uint16_t lohi = a_lo * b_hi;
++  uint16_t hilo = a_hi * b_lo;
++  uint16_t hihi = a_hi * b_hi;
++  uint16_t middle = hilo + lohi;
++  uint16_t middle_hi = middle >> 8;
++  uint16_t middle_lo = middle << 8;
++  uint16_t res_lo = lolo + middle_lo;
++  uint16_t res_hi = hihi + middle_hi;
++  res_hi += (res_lo < middle_lo ? 1 : 0);
++  res_hi += (middle < hilo ? 0x100 : 0);
++  uint32_t res = ((uint32_t) res_hi) << 16;
++  res += res_lo;
++  return res;
++}
++
++uint64_t mul64 (uint32_t a, uint32_t b)
++{
++  uint32_t a_lo = a & 0xFFFF;
++  uint32_t b_lo = b & 0xFFFF;
++  uint32_t a_hi = a >> 16;
++  uint32_t b_hi = b >> 16;
++  uint32_t lolo = a_lo * b_lo;
++  uint32_t lohi = a_lo * b_hi;
++  uint32_t hilo = a_hi * b_lo;
++  uint32_t hihi = a_hi * b_hi;
++  uint32_t middle = hilo + lohi;
++  uint32_t middle_hi = middle >> 16;
++  uint32_t middle_lo = middle << 16;
++  uint32_t res_lo = lolo + middle_lo;
++  uint32_t res_hi = hihi + middle_hi;
++  res_hi += (res_lo < middle_lo ? 1 : 0);
++  res_hi += (middle < hilo ? 0x10000 : 0);
++  uint64_t res = ((uint64_t) res_hi) << 32;
++  res += res_lo;
++  return res;
++}
++
++uint128_t mul128 (uint64_t a, uint64_t b)
++{
++  uint64_t a_lo = a & 0xFFFFFFFF;
++  uint64_t b_lo = b & 0xFFFFFFFF;
++  uint64_t a_hi = a >> 32;
++  uint64_t b_hi = b >> 32;
++  uint64_t lolo = a_lo * b_lo;
++  uint64_t lohi = a_lo * b_hi;
++  uint64_t hilo = a_hi * b_lo;
++  uint64_t hihi = a_hi * b_hi;
++  uint64_t middle = hilo + lohi;
++  uint64_t middle_hi = middle >> 32;
++  uint64_t middle_lo = middle << 32;
++  uint64_t res_lo = lolo + middle_lo;
++  uint64_t res_hi = hihi + middle_hi;
++  res_hi += (res_lo < middle_lo ? 1 : 0);
++  res_hi += (middle < hilo ? 0x100000000 : 0);
++  uint128_t res = ((uint128_t) res_hi) << 64;
++  res += res_lo;
++  return res;
++}
++
++uint64_t mul64_perm (uint32_t a, uint32_t b)
++{
++  uint32_t a_lo = a & 0xFFFF;
++  uint32_t b_lo = b & 0xFFFF;
++  uint32_t a_hi = a >> 16;
++  uint32_t b_hi = b >> 16;
++  uint32_t lolo = a_lo * b_lo;
++  uint32_t lohi = a_lo * b_hi;
++  uint32_t hilo = a_hi * b_lo;
++  uint32_t hihi = a_hi * b_hi;
++  uint32_t middle = hilo + lohi;
++  uint32_t middle_hi = middle >> 16;
++  uint32_t middle_lo = middle << 16;
++  uint32_t res_lo = lolo + middle_lo;
++  uint32_t res_hi = hihi + middle_hi;
++  res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
++  res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
++  uint64_t res = ((uint64_t) res_hi) << 32;
++  res += res_lo;
++  return res;
++}
++
++uint128_t mul128_perm (uint64_t a, uint64_t b)
++{
++  uint64_t a_lo = a & 0xFFFFFFFF;
++  uint64_t b_lo = b & 0xFFFFFFFF;
++  uint64_t a_hi = a >> 32;
++  uint64_t b_hi = b >> 32;
++  uint64_t lolo = a_lo * b_lo;
++  uint64_t lohi = a_lo * b_hi;
++  uint64_t hilo = a_hi * b_lo;
++  uint64_t hihi = a_hi * b_hi;
++  uint64_t middle = hilo + lohi;
++  uint64_t middle_hi = middle >> 32;
++  uint64_t middle_lo = middle << 32;
++  uint64_t res_lo = lolo + middle_lo;
++  uint64_t res_hi = hihi + middle_hi;
++  res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
++  res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
++  uint128_t res = ((uint128_t) res_hi) << 64;
++  res += res_lo;
++  return res;
++}
++
++/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
+diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
+new file mode 100644
+index 000000000..cc6e5af25
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
+@@ -0,0 +1,62 @@
++/* { dg-do compile } */
++/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
++   proper overflow detection in some cases.  */
++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
++#include <stdint.h>
++
++typedef unsigned __int128 uint128_t;
++typedef struct uint256_t
++{
++    uint128_t lo;
++    uint128_t hi;
++} uint256_t;
++
++uint64_t mul64_double_use (uint32_t a, uint32_t b)
++{
++  uint32_t a_lo = a & 0xFFFF;
++  uint32_t b_lo = b & 0xFFFF;
++  uint32_t a_hi = a >> 16;
++  uint32_t b_hi = b >> 16;
++  uint32_t lolo = a_lo * b_lo;
++  uint32_t lohi = a_lo * b_hi;
++  uint32_t hilo = a_hi * b_lo;
++  uint32_t hihi = a_hi * b_hi;
++  uint32_t middle = hilo + lohi;
++  uint32_t middle_hi = middle >> 16;
++  uint32_t middle_lo = middle << 16;
++  uint32_t res_lo = lolo + middle_lo;
++  uint32_t res_hi = hihi + middle_hi;
++  res_hi += (res_lo < middle_lo ? 1 : 0);
++  res_hi += (middle < hilo ? 0x10000 : 0);
++  uint64_t res = ((uint64_t) res_hi) << 32;
++  res += res_lo;
++  return res + lolo;
++}
++
++uint256_t mul256 (uint128_t a, uint128_t b)
++{
++  uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
++  uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
++  uint128_t a_hi = a >> 64;
++  uint128_t b_hi = b >> 64;
++  uint128_t lolo = a_lo * b_lo;
++  uint128_t lohi = a_lo * b_hi;
++  uint128_t hilo = a_hi * b_lo;
++  uint128_t hihi = a_hi * b_hi;
++  uint128_t middle = hilo + lohi;
++  uint128_t middle_hi = middle >> 64;
++  uint128_t middle_lo = middle << 64;
++  uint128_t res_lo = lolo + middle_lo;
++  uint128_t res_hi = hihi + middle_hi;
++  res_hi += (res_lo < middle_lo ? 1 : 0);
++  /* Workaround for the "constant is too big" warning.  */
++  uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
++  overflow_tmp <<= 64;
++  res_hi += overflow_tmp;
++  uint256_t res;
++  res.lo = res_lo;
++  res.hi = res_hi;
++  return res;
++}
++
++/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
+diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
+index 55d6ee8ae..2c06b8a60 100644
+--- a/gcc/tree-ssa-math-opts.cc
++++ b/gcc/tree-ssa-math-opts.cc
+@@ -210,6 +210,9 @@ static struct
+ 
+   /* Number of highpart multiplication ops inserted.  */
+   int highpart_mults_inserted;
++
++  /* Number of optimized double sized multiplications.  */
++  int double_sized_mul_optimized;
+ } widen_mul_stats;
+ 
+ /* The instance of "struct occurrence" representing the highest
+@@ -4893,6 +4896,78 @@ optimize_spaceship (gimple *stmt)
+ }
+ 
+ 
++/* Pattern matcher for double sized multiplication defined in match.pd.  */
++extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
++
++static bool
++convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
++{
++  gimple *use_stmt, *complex_res_lo;
++  gimple_stmt_iterator insert_before;
++  imm_use_iterator use_iter;
++  tree match[4]; // arg0, arg1, res_hi, complex_res_lo
++  tree arg0, arg1, widen_mult, new_type, tmp;
++  tree lhs = gimple_assign_lhs (stmt);
++  location_t loc = UNKNOWN_LOCATION;
++  machine_mode mode;
++
++  if (!gimple_double_size_mul_candidate (lhs, match, NULL))
++    return false;
++
++  new_type = build_nonstandard_integer_type (
++	  TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
++  mode = TYPE_MODE (new_type);
++
++  /* Bail out if the target cannot do the wide multiplication.  */
++  if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
++      && !wider_optab_check_p (smul_optab, mode, 1))
++    return false;
++
++  /* Determine the point where the wide multiplication
++     should be inserted.  Complex low res is OK since it is required
++     by both high and low part getters, thus it dominates both of them.  */
++  complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
++  insert_before = gsi_for_stmt (complex_res_lo);
++  gsi_next (&insert_before);
++
++  /* Create the widen multiplication.  */
++  arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
++  arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
++  widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
++				       MULT_EXPR, arg0, arg1);
++
++  /* Find the mult low part getter.  */
++  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
++    if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
++      break;
++
++  /* Create high and low (if needed) parts extractors.  */
++  /* Low part.  */
++  if (use_stmt)
++    {
++      loc = gimple_location (use_stmt);
++      tmp = build_and_insert_cast (&insert_before, loc,
++	  	      		   TREE_TYPE (gimple_get_lhs (use_stmt)),
++	  			   widen_mult);
++      gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
++	    				       NOP_EXPR, tmp);
++      gsi_replace (&insert_before, new_stmt, true);
++    }
++
++  /* High part.  */
++  loc = gimple_location (stmt);
++  tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
++				RSHIFT_EXPR, widen_mult,
++				build_int_cst (new_type,
++					       TYPE_PRECISION (new_type) / 2));
++  tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
++  gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
++  gsi_replace (gsi, new_stmt, true);
++
++  widen_mul_stats.double_sized_mul_optimized++;
++  return true;
++}
++
+ /* Find integer multiplications where the operands are extended from
+    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
+    or MULT_HIGHPART_EXPR where appropriate.  */
+@@ -4987,6 +5062,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
+ 	      break;
+ 
+ 	    case PLUS_EXPR:
++	      if (convert_double_size_mul (&gsi, stmt))
++		break;
++	      __attribute__ ((fallthrough));
+ 	    case MINUS_EXPR:
+ 	      if (!convert_plusminus_to_widen (&gsi, stmt, code))
+ 		match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
+@@ -5091,6 +5169,8 @@ pass_optimize_widening_mul::execute (function *fun)
+ 			    widen_mul_stats.divmod_calls_inserted);
+   statistics_counter_event (fun, "highpart multiplications inserted",
+ 			    widen_mul_stats.highpart_mults_inserted);
++  statistics_counter_event (fun, "double sized mul optimized",
++			    widen_mul_stats.double_sized_mul_optimized);
+ 
+   return cfg_changed ? TODO_cleanup_cfg : 0;
+ }
+-- 
+2.33.0
+
diff --git a/0034-Port-icp-patch-to-GCC-12.patch b/0034-Port-icp-patch-to-GCC-12.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d0b34126ce497f8f912ef10b2815735d5b7650a7
--- /dev/null
+++ b/0034-Port-icp-patch-to-GCC-12.patch
@@ -0,0 +1,2387 @@
+From b73462757734c62f64e7a4379340679ec6f19669 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Tue, 27 Feb 2024 07:28:12 +0800
+Subject: [PATCH 06/18] Port icp patch to GCC 12
+
+---
+ gcc/common.opt              |    8 +
+ gcc/dbgcnt.def              |    1 +
+ gcc/ipa-devirt.cc           | 1855 +++++++++++++++++++++++++++++++++++
+ gcc/passes.def              |    1 +
+ gcc/testsuite/gcc.dg/icp1.c |   40 +
+ gcc/testsuite/gcc.dg/icp2.c |   38 +
+ gcc/testsuite/gcc.dg/icp3.c |   52 +
+ gcc/testsuite/gcc.dg/icp4.c |   55 ++
+ gcc/testsuite/gcc.dg/icp5.c |   66 ++
+ gcc/testsuite/gcc.dg/icp6.c |   66 ++
+ gcc/testsuite/gcc.dg/icp7.c |   48 +
+ gcc/timevar.def             |    1 +
+ gcc/tree-pass.h             |    1 +
+ 13 files changed, 2232 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.dg/icp1.c
+ create mode 100644 gcc/testsuite/gcc.dg/icp2.c
+ create mode 100644 gcc/testsuite/gcc.dg/icp3.c
+ create mode 100644 gcc/testsuite/gcc.dg/icp4.c
+ create mode 100644 gcc/testsuite/gcc.dg/icp5.c
+ create mode 100644 gcc/testsuite/gcc.dg/icp6.c
+ create mode 100644 gcc/testsuite/gcc.dg/icp7.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 39c90604e..16aadccf6 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1316,6 +1316,14 @@ fdevirtualize
+ Common Var(flag_devirtualize) Optimization
+ Try to convert virtual calls to direct ones.
+ 
++ficp
++Common Var(flag_icp) Optimization Init(0)
++Try to promote indirect calls to direct ones.
++
++ficp-speculatively
++Common Var(flag_icp_speculatively) Optimization
++Promote indirect calls speculatively.
++
+ fdiagnostics-show-location=
+ Common Joined RejectNegative Enum(diagnostic_prefixing_rule)
+ -fdiagnostics-show-location=[once|every-line]	How often to emit source location at the beginning of line-wrapped diagnostics.
+diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
+index 3aa18cd0c..a00bbc31b 100644
+--- a/gcc/dbgcnt.def
++++ b/gcc/dbgcnt.def
+@@ -170,6 +170,7 @@ DEBUG_COUNTER (graphite_scop)
+ DEBUG_COUNTER (hoist)
+ DEBUG_COUNTER (hoist_insn)
+ DEBUG_COUNTER (ia64_sched2)
++DEBUG_COUNTER (icp)
+ DEBUG_COUNTER (if_after_combine)
+ DEBUG_COUNTER (if_after_reload)
+ DEBUG_COUNTER (if_conversion)
+diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
+index 74fe65608..383839189 100644
+--- a/gcc/ipa-devirt.cc
++++ b/gcc/ipa-devirt.cc
+@@ -103,9 +103,14 @@ along with GCC; see the file COPYING3.  If not see
+   indirect polymorphic edge all possible polymorphic call targets of the call.
+ 
+   pass_ipa_devirt performs simple speculative devirtualization.
++  pass_ipa_icp performs simple indirect call promotion.
+ */
+ 
+ #include "config.h"
++#define INCLUDE_ALGORITHM
++#define INCLUDE_SET
++#define INCLUDE_MAP
++#define INCLUDE_LIST
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -127,6 +132,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "ipa-fnsummary.h"
+ #include "demangle.h"
+ #include "dbgcnt.h"
++#include "gimple-iterator.h"
+ #include "gimple-pretty-print.h"
+ #include "intl.h"
+ #include "stringpool.h"
+@@ -4401,5 +4407,1854 @@ make_pass_ipa_odr (gcc::context *ctxt)
+   return new pass_ipa_odr (ctxt);
+ }
+ 
++/* Function signature map used to look up function decl which corresponds to
++   the given function type.  */
++typedef std::set<unsigned> type_set;
++typedef std::set<tree> decl_set;
++typedef std::map<unsigned, type_set*> type_alias_map;
++typedef std::map<unsigned, decl_set*> type_decl_map;
++typedef std::map<unsigned, tree> uid_to_type_map;
++typedef std::map<tree, tree> type_map;
++
++static bool has_address_taken_functions_with_varargs = false;
++static type_set *unsafe_types = NULL;
++static type_alias_map *fta_map = NULL;
++static type_alias_map *ta_map = NULL;
++static type_map *ctype_map = NULL;
++static type_alias_map *cbase_to_ptype = NULL;
++static type_decl_map *fs_map = NULL;
++static uid_to_type_map *type_uid_map = NULL;
++
++static void
++print_type_set (unsigned ftype_uid, type_alias_map *map)
++{
++  type_alias_map::const_iterator entry = map->find (ftype_uid);
++  if (entry == map->end ())
++    return;  /* No set recorded for this UID: print nothing.  */
++  for (unsigned uid : *entry->second)  /* Comma-separated list of UIDs.  */
++    fprintf (dump_file, uid == *entry->second->begin () ? "%d" : ", %d", uid);
++}
++
++static void
++dump_type_with_uid (const char *msg, tree type, dump_flags_t flags = TDF_NONE)
++{
++  fprintf (dump_file, "%s", msg);  /* MSG is plain text, never a format.  */
++  print_generic_expr (dump_file, type, flags);
++  fprintf (dump_file, " (%d)\n", TYPE_UID (type));  /* Then TYPE's UID.  */
++}
++
++/* Walk aggregate type and collect types of scalar elements.  */
++
++static void
++collect_scalar_types (tree tp, std::list<tree> &types)
++{
++  /* TODO: take into account different field offsets.
++     Also support array casts.  */
++  if (tp && dump_file && (dump_flags & TDF_DETAILS))
++    dump_type_with_uid ("Walk var's type: ", tp, TDF_UID);
++  if (RECORD_OR_UNION_TYPE_P (tp))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Record's fields {\n");
++      for (tree field = TYPE_FIELDS (tp); field;
++	   field = DECL_CHAIN (field))
++	{
++	  if (TREE_CODE (field) != FIELD_DECL)
++	    continue;
++	  collect_scalar_types (TREE_TYPE (field), types);
++	}
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "}\n");
++      return;
++    }
++  if (TREE_CODE (tp) == ARRAY_TYPE)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Array's innermost type:\n");
++      /* Take the innermost component type.  */
++      tree elt;
++      for (elt = TREE_TYPE (tp); TREE_CODE (elt) == ARRAY_TYPE;
++	   elt = TREE_TYPE (elt))
++	if (dump_file && (dump_flags & TDF_DETAILS))
++	  print_generic_expr (dump_file, elt);
++      collect_scalar_types (elt, types);
++      return;
++    }
++  types.push_back (tp);
++}
++
++static void maybe_register_aliases (tree type1, tree type2);
++
++/* Walk TLIST1 and TLIST2 in lockstep and maybe register each pair of
++   corresponding types as aliases.  The lists are only read here, so they
++   are taken by const reference instead of being copied on every call.  */
++
++static void
++compare_type_lists (const std::list<tree> &tlist1,
++		    const std::list<tree> &tlist2)
++{
++  std::list<tree>::const_iterator ti1 = tlist1.begin ();
++  std::list<tree>::const_iterator ti2 = tlist2.begin ();
++  /* TODO: correct the analysis results if lists have different length.  */
++  for (; ti1 != tlist1.end () && ti2 != tlist2.end (); ++ti1, ++ti2)
++    maybe_register_aliases (*ti1, *ti2);
++  /* Only a longer TLIST1 is reported; surplus elements of TLIST2 are
++     silently ignored.  */
++  if (ti1 != tlist1.end () && dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Type lists with different length!\n");
++}
++
++/* For two given types collect scalar element types and
++   compare the result lists to find type aliases.  */
++
++static void
++collect_scalar_types_and_find_aliases (tree t1, tree t2)
++{
++  std::list<tree> tlist1;
++  std::list<tree> tlist2;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "First type list: ");
++  collect_scalar_types (t1, tlist1);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Second type list: ");
++  collect_scalar_types (t2, tlist2);
++  compare_type_lists (tlist1, tlist2);
++}
++
++/* Dump type with the corresponding set from the map.  */
++
++static void
++dump_type_uid_with_set (const char *msg, tree type, type_alias_map *map,
++			bool dump_type = true, bool with_newline = true)
++{
++  fprintf (dump_file, msg, TYPE_UID (type));
++  if (dump_type)
++    print_generic_expr (dump_file, type);
++  fprintf (dump_file, " (");
++  print_type_set (TYPE_UID (type), map);
++  fprintf (dump_file, ")");
++  fprintf (dump_file, with_newline ? "\n" : " ");
++}
++
++static void
++dump_two_types_uids_with_set (const char *msg, unsigned t1_uid,
++			      unsigned t2_uid, type_alias_map *map)
++{
++  fprintf (dump_file, msg, t1_uid, t2_uid);
++  fprintf (dump_file, " (");
++  print_type_set (t1_uid, map);
++  fprintf (dump_file, ")\n");
++}
++
++/* Put TYPE and ALIAS_TYPE into one alias set of MAP, merging two existing
++   sets if needed.  Return false for a missing type or an existing alias.  */
++
++static bool
++register_ailas_type (tree type, tree alias_type, type_alias_map *map,
++		     bool only_merge = false)
++{
++  /* TODO: maybe support the case with one missed type.  */
++  if (!type || !alias_type)
++    return false;
++  const unsigned type_uid = TYPE_UID (type);
++  const unsigned alias_type_uid = TYPE_UID (alias_type);
++  if (type_uid_map->count (type_uid) == 0)
++    (*type_uid_map)[type_uid] = type;
++  if (type_uid_map->count (alias_type_uid) == 0)
++    (*type_uid_map)[alias_type_uid] = alias_type;
++
++  const bool type_known = map->count (type_uid) != 0;
++  const bool alias_known = map->count (alias_type_uid) != 0;
++  if (!type_known && !alias_known)
++    {
++      (*map)[type_uid] = new type_set ();
++      (*map)[alias_type_uid] = (*map)[type_uid];
++    }
++  else if (!type_known)
++    (*map)[type_uid] = (*map)[alias_type_uid];
++  else if (!alias_known)
++    (*map)[alias_type_uid] = (*map)[type_uid];
++  else
++    {
++      type_set *merged = (*map)[type_uid];
++      type_set *absorbed = (*map)[alias_type_uid];
++      if (merged == absorbed)
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    dump_two_types_uids_with_set ("Types (%d) and (%d) are already in",
++					  type_uid, alias_type_uid, map);
++	  return false;
++	}
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  dump_type_uid_with_set ("T1 (%d) in set", type, map, false);
++	  dump_type_uid_with_set ("T2 (%d) in set", alias_type, map, false);
++	}
++      merged->insert (absorbed->begin (), absorbed->end ());
++      for (unsigned uid : *absorbed)
++	(*map)[uid] = merged;
++      delete absorbed;  /* All of its members now point at MERGED.  */
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "MERGE: ");
++    }
++  if (!only_merge)
++    {
++      (*map)[type_uid]->insert (alias_type_uid);
++      (*map)[type_uid]->insert (type_uid);
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_two_types_uids_with_set ("Insert types (%d) and (%d) into set",
++				  type_uid, alias_type_uid, map);
++  return true;
++}
++
++static void
++dump_two_types_with_uids (const char *msg, tree t1, tree t2)
++{
++  fprintf (dump_file, "%s", msg);  /* MSG is plain text, never a format.  */
++  print_generic_expr (dump_file, t1, TDF_UID);
++  fprintf (dump_file, " (%d), ", TYPE_UID (t1));  /* UID of the first type.  */
++  print_generic_expr (dump_file, t2, TDF_UID);
++  fprintf (dump_file, " (%d)\n", TYPE_UID (t2));  /* UID of the second.  */
++}
++
++static void
++analyze_pointees (tree type1, tree type2)
++{
++  gcc_assert (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2));
++  tree base1 = TREE_TYPE (type1);
++  tree base2 = TREE_TYPE (type2);
++  /* TODO: maybe analyze void pointers.  */
++  if (VOID_TYPE_P(base1) || VOID_TYPE_P(base2))
++    return;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_two_types_with_uids ("Walk pointee types: ", base1, base2);
++  collect_scalar_types_and_find_aliases (base1, base2);
++}
++
++static void
++map_canonical_base_to_pointer (tree type, tree to_insert)
++{
++  type = TYPE_MAIN_VARIANT (type);
++  tree base_type = TREE_TYPE (type);
++  tree cbase_type = TYPE_CANONICAL (base_type);
++  if (!cbase_type)
++    return;
++  unsigned cbase_type_uid = TYPE_UID (cbase_type);
++  if (type_uid_map->count (cbase_type_uid) == 0)
++    (*type_uid_map)[cbase_type_uid] = cbase_type;
++
++  if (cbase_to_ptype->count (cbase_type_uid) == 0)
++    {
++      (*cbase_to_ptype)[cbase_type_uid] = new type_set ();
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "New map cb-to-p=(%d): ", cbase_type_uid);
++    }
++  else if (!(*cbase_to_ptype)[cbase_type_uid]->count (TYPE_UID (to_insert)))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Found map cb-to-p=(%d): ", cbase_type_uid);
++    }
++  else
++    return;
++  /* Add all variants of 'to_insert' type.  */
++  for (tree t = to_insert; t; t = TYPE_NEXT_VARIANT (t))
++    {
++      unsigned t_uid = TYPE_UID (t);
++      if (!(*cbase_to_ptype)[cbase_type_uid]->count (t_uid))
++	{
++	  (*cbase_to_ptype)[cbase_type_uid]->insert (t_uid);
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	     fprintf (dump_file, "(%d) ", t_uid);
++	}
++      if (type_uid_map->count (t_uid) == 0)
++	(*type_uid_map)[t_uid] = t;
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\n");
++}
++
++/* Analyse two types and maybe register them as aliases. Also collect
++   unsafe function types and map canonical base types to corresponding
++   pointer types.  */
++
++static void
++maybe_register_aliases (tree type1, tree type2)
++{
++  if (type1 && POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type1))
++    map_canonical_base_to_pointer (type1, type1);
++  if (type2 && POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type2))
++    map_canonical_base_to_pointer (type2, type2);
++
++  if (type1 == type2 || !type1 || !type2)
++    return;
++
++  if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_two_types_with_uids ("Pointer types: ", type1, type2);
++      if (register_ailas_type (type1, type2, ta_map))
++	analyze_pointees (type1, type2);
++    }
++  /* If function and non-function type pointers alias,
++     the function type is unsafe.  */
++  if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2))
++    unsafe_types->insert (TYPE_UID (type1));
++  if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1))
++    unsafe_types->insert (TYPE_UID (type2));
++
++  /* Try to deal with pointers to incomplete types.  */
++  if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
++    {
++      type1 = TYPE_MAIN_VARIANT (type1);
++      type2 = TYPE_MAIN_VARIANT (type2);
++      tree base1 = TREE_TYPE (type1);
++      tree base2 = TREE_TYPE (type2);
++      if (RECORD_OR_UNION_TYPE_P (base1) && RECORD_OR_UNION_TYPE_P (base2))
++	{
++	  tree cb1 = TYPE_CANONICAL (base1);
++	  tree cb2 = TYPE_CANONICAL (base2);
++	  if (cb1 && !cb2)
++	    map_canonical_base_to_pointer (type1, type2);
++	  if (cb2 && !cb1)
++	    map_canonical_base_to_pointer (type2, type1);
++	}
++    }
++}
++
++/* Maybe register non-void/equal type aliases.  */
++
++static void
++maybe_register_non_void_aliases (tree t1, tree t2)
++{
++  gcc_assert (t1 && t2);
++  if (type_uid_map->count (TYPE_UID (t1)) == 0)
++    (*type_uid_map)[TYPE_UID (t1)] = t1;
++  if (type_uid_map->count (TYPE_UID (t2)) == 0)
++    (*type_uid_map)[TYPE_UID (t2)] = t2;
++
++  /* Skip equal and void types.  */
++  if (t1 == t2 || VOID_TYPE_P (t1) || VOID_TYPE_P (t2))
++    return;
++  maybe_register_aliases (t1, t2);
++}
++
++/* Return the function (or method) type that call STMT goes through.  */
++
++static tree
++get_call_fntype (gcall *stmt)
++{
++  tree fntype = NULL;
++  tree fndecl = gimple_call_fndecl (stmt);
++  if (fndecl && TREE_TYPE (fndecl))
++    fntype = TREE_TYPE (fndecl);
++  else
++    {
++      tree ptype = TREE_TYPE (gimple_call_fn (stmt));
++      gcc_assert (ptype && TREE_TYPE (ptype));
++      fntype = TREE_TYPE (ptype);
++    }
++  gcc_assert (fntype && fntype != void_type_node
++	      && (TREE_CODE (fntype) == FUNCTION_TYPE
++		  || TREE_CODE (fntype) == METHOD_TYPE));
++  return fntype;
++}
++
++static void
++dump_global_var (tree decl)
++{
++  fprintf (dump_file, "Analyze global var: ");
++  print_generic_decl (dump_file, decl, TDF_NONE);
++  fprintf (dump_file, "\n");
++}
++
++static void
++collect_block_elt_types (tree tp, std::list<tree> &types, tree block)
++{
++  tree vt = TREE_TYPE (tp);
++  gcc_assert (vt);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      /* BLOCK only selects the dump label; the label is never a format.  */
++      const bool in_block = TREE_CODE (block) == BLOCK;
++      fprintf (dump_file, "%s", in_block ? "VAR's block: " : "VAR's ctor: ");
++      print_generic_expr (dump_file, tp);
++      dump_type_with_uid (" with type ", vt);
++    }
++  collect_scalar_types (vt, types);
++}
++
++/* Compare types of initialization block's or constructor's elements and
++   fields of the initializer type to find type aliases.  */
++
++static void
++compare_block_and_init_type (tree block, tree t1)
++{
++  std::list<tree> tlist1;
++  std::list<tree> tlist2;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Init's type list: ");
++  collect_scalar_types (t1, tlist1);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Block's type list: ");
++  if (TREE_CODE (block) == CONSTRUCTOR)
++    {
++      unsigned HOST_WIDE_INT idx;
++      tree value;
++      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (block), idx, value)
++	{
++	  gcc_assert (value);
++	  collect_block_elt_types (value, tlist2, block);
++	}
++    }
++  else if (TREE_CODE (block) == BLOCK)
++    for (tree var = BLOCK_VARS (block); var; var = DECL_CHAIN (var))
++      {
++	if (TREE_CODE (var) != VAR_DECL)
++	  continue;
++	collect_block_elt_types (var, tlist2, block);
++      }
++  else
++    gcc_unreachable ();
++  compare_type_lists (tlist1, tlist2);
++}
++
++/* Analyze global var to find type aliases comparing types of var and
++   initializer elements.  */
++
++static void
++analyze_global_var (varpool_node *var)
++{
++  var->get_constructor();
++  tree decl = var->decl;
++  if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl)
++      || integer_zerop (DECL_INITIAL (decl)))
++    return;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_global_var (decl);
++  tree var_type = TREE_TYPE (decl);
++  tree init_type = TREE_TYPE (DECL_INITIAL (decl));
++  gcc_assert (var_type && init_type);
++  if (RECORD_OR_UNION_TYPE_P (init_type)
++      && !initializer_zerop (DECL_INITIAL (decl)))
++    compare_block_and_init_type (DECL_INITIAL (decl), init_type);
++  else if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Is not a record with nonzero init\n");
++
++  if (var_type == init_type)
++    return;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_two_types_with_uids ("Mismatch of var and init types: ",
++			      var_type, init_type);
++  collect_scalar_types_and_find_aliases (var_type, init_type);
++}
++
++static void
++dump_function_node_info (struct cgraph_node *n)
++{
++  fprintf (dump_file, "\nAnalyse function node: ");
++  print_generic_expr (dump_file, n->decl);
++  fprintf (dump_file, "\n");
++  tree fndecl_type = TREE_TYPE (n->decl);
++  dump_type_with_uid ("Function decl type: ", fndecl_type, TDF_UID);
++  if (TREE_TYPE (fndecl_type))
++    dump_type_with_uid ("Return type: ", TREE_TYPE (fndecl_type));
++  tree argt = TYPE_ARG_TYPES (fndecl_type);
++  for (unsigned i = 1; argt && argt != void_type_node
++       && !VOID_TYPE_P (TREE_VALUE (argt)); ++i, argt = TREE_CHAIN (argt))
++    {
++      tree atype = TREE_VALUE (argt);
++      fprintf (dump_file, "%d-arg type: ", i);
++      dump_type_with_uid ("", atype);
++    }
++  fprintf (dump_file, "\n");
++}
++
++static void
++dump_call_stmt_info (gcall *stmt, tree fntype)
++{
++  fprintf (dump_file, "\nAnalyse call stmt: ");
++  if (stmt)
++    print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS);
++  else
++    fprintf (dump_file, "(no stmt)\n");
++  dump_type_with_uid ("fntype=", fntype, TDF_UID);
++  if (stmt && gimple_call_fntype (stmt))  /* STMT may be NULL, see above.  */
++    dump_type_with_uid ("fntype1=", gimple_call_fntype (stmt), TDF_UID);
++  tree fndecl = stmt ? gimple_call_fndecl (stmt) : NULL_TREE;
++  if (fndecl && TREE_TYPE (fndecl))
++    dump_type_with_uid ("fntype2=", TREE_TYPE (fndecl), TDF_UID);
++}
++
++/* Dump actual and formal arg types.  */
++
++static void
++dump_arg_types_with_uids (int i, tree t1, tree t2)
++{
++  if (i >= 0)
++    fprintf (dump_file, "Call's %d-arg types: ", i);
++  else
++    fprintf (dump_file, "Call's return types: ");
++  fprintf (dump_file, "(%d) and (%d) ", TYPE_UID (t1), TYPE_UID (t2));
++  print_generic_expr (dump_file, t1, TDF_UID);
++  fprintf (dump_file, " ");
++  print_generic_expr (dump_file, t2, TDF_UID);
++  fprintf (dump_file, "\n");
++}
++
++/* Analyze call graph edge with connected call stmt to find type aliases in
++   arguments and return value casts.  */
++
++static void
++analyze_cgraph_edge (cgraph_edge *e)
++{
++  gcall *stmt = e->call_stmt;
++  gcc_assert (stmt != NULL);
++  tree fntype = get_call_fntype (stmt);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_call_stmt_info (stmt, fntype);
++  if (gimple_has_lhs (stmt))
++    {
++      tree t1 = TREE_TYPE (gimple_call_lhs (stmt));
++      tree t2 = TREE_TYPE (fntype);
++      const int is_return_arg = -1;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_arg_types_with_uids (is_return_arg, t1, t2);
++      maybe_register_non_void_aliases (t1, t2);
++    }
++
++  tree argt = TYPE_ARG_TYPES (fntype);
++  if (!argt)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Finish call stmt analysis\n");
++      return;
++    }
++  gcc_assert (argt);
++  unsigned num_args = gimple_call_num_args (stmt);
++  for (unsigned i = 0; i < num_args && argt; ++i, argt = TREE_CHAIN (argt))
++    {
++      tree arg = gimple_call_arg (stmt, i);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_arg_types_with_uids (i, TREE_VALUE (argt), TREE_TYPE (arg));
++      if (TREE_VALUE (argt) == TREE_TYPE (arg)
++	  || !POINTER_TYPE_P (TREE_VALUE (argt))
++	  || !POINTER_TYPE_P (TREE_TYPE (arg)))
++	continue;
++      maybe_register_non_void_aliases (TREE_VALUE (argt), TREE_TYPE (arg));
++      tree t1 = TREE_TYPE (TREE_VALUE (argt));
++      tree t2 = TREE_TYPE (TREE_TYPE (arg));
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Call's %d-arg base types: (%d) and (%d)\n",
++		 i, (t1 ? TYPE_UID (t1) : 0), (t2 ? TYPE_UID (t2) : 0));
++      maybe_register_non_void_aliases (t1, t2);
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "End list of args\n");
++  tree fndecl_type = NULL;
++  if (e->callee && e->callee->decl)
++    fndecl_type = TREE_TYPE (e->callee->decl);
++  if (fndecl_type && fndecl_type != fntype)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Function decl and edge types mismatch:\n");
++      register_ailas_type (fntype, fndecl_type, fta_map);
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "End call stmt analysis\n");
++}
++
++static void
++dump_assign_info (gimple *stmt, tree rhs, tree lhs_type, tree rhs_type)
++{
++  fprintf (dump_file, "\nAnalyse assign cast/copy stmt, rhs=%s: ",
++	   get_tree_code_name (TREE_CODE (rhs)));
++  print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS);
++  fprintf (dump_file, "Types: ");
++  print_generic_expr (dump_file, lhs_type);
++  fprintf (dump_file, ", ");
++  print_generic_expr (dump_file, rhs_type);
++  fprintf (dump_file, "\n");
++}
++
++/* Analyze cast/copy assign stmt to find type aliases.  */
++
++static void
++analyze_assign_stmt (gimple *stmt)
++{
++  gcc_assert (is_gimple_assign (stmt));
++  tree rhs_type = NULL_TREE;
++  tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt));
++  tree rhs = gimple_assign_rhs1 (stmt);
++  if (TREE_CODE (rhs) == MEM_REF)
++    {
++      rhs = TREE_OPERAND (rhs, 0);
++      tree ptr_type = TREE_TYPE (rhs);
++      gcc_assert (POINTER_TYPE_P (ptr_type));
++      rhs_type = TREE_TYPE (ptr_type);
++    }
++  else if (TREE_CODE (rhs) == ADDR_EXPR)
++    {
++      rhs = TREE_OPERAND (rhs, 0);
++      if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST
++	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL)
++	rhs_type = build_pointer_type (TREE_TYPE (rhs));
++      else if (TREE_CODE (rhs) == COMPONENT_REF)
++	{
++	  rhs = TREE_OPERAND (rhs, 1);
++	  rhs_type = build_pointer_type (TREE_TYPE (rhs));
++	}
++      else if (TREE_CODE (rhs) == MEM_REF)
++	{
++	  rhs = TREE_OPERAND (rhs, 0);
++	  rhs_type = TREE_TYPE (rhs);
++	  gcc_assert (POINTER_TYPE_P (rhs_type));
++	}
++      else
++	gcc_unreachable();
++    }
++  else
++    rhs_type = TREE_TYPE (rhs);
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_assign_info (stmt, rhs, lhs_type, rhs_type);
++  if (CONSTANT_CLASS_P (rhs) && !zerop (rhs)
++      && FUNCTION_POINTER_TYPE_P (TREE_TYPE (rhs)))
++    {
++      tree ftype = TREE_TYPE (rhs_type);
++      unsafe_types->insert (TYPE_UID (ftype));
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Function type (%d) is unsafe due to assign "
++		 "non-zero cst to function pointer\n", TYPE_UID (ftype));
++    }
++  maybe_register_non_void_aliases (lhs_type, rhs_type);
++}
++
++/* Walk all statements of FN and analyze its cast and copy assignments.  */
++
++static void
++analyze_assigns (function* fn)
++{
++  push_cfun (fn);
++  basic_block bb;
++  FOR_EACH_BB_FN (bb, fn)
++    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
++	 gsi_next (&gsi))
++      {
++	gimple *assign = gsi_stmt (gsi);
++	/* Statements of any other kind are not examined by this walk.  */
++	if (gimple_assign_cast_p (assign) || gimple_assign_copy_p (assign))
++	  analyze_assign_stmt (assign);
++      }
++  pop_cfun ();
++}
++
++/* Walk all global variables and functions to collect sets of type aliases.  */
++
++static void
++collect_type_alias_sets ()
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\n\nCollect type alias sets walking global vars.\n");
++
++  varpool_node *var;
++  FOR_EACH_VARIABLE (var)
++    if (var->real_symbol_p ())
++      analyze_global_var (var);
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nCollect type alias sets walking functions.\n");
++
++  struct cgraph_node *n;
++  FOR_EACH_FUNCTION (n)
++    {
++      if (!n->has_gimple_body_p ())
++	continue;
++      n->get_body ();
++      function *fn = DECL_STRUCT_FUNCTION (n->decl);
++      if (!fn)
++	continue;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_function_node_info (n);
++      /* Analyze direct/indirect function calls.  */
++      for (cgraph_edge *e = n->callees; e; e = e->next_callee)
++	analyze_cgraph_edge (e);
++      for (cgraph_edge *e = n->indirect_calls; e; e = e->next_callee)
++	analyze_cgraph_edge (e);
++      /* Analyze assign (with casts) statements.  */
++      analyze_assigns (fn);
++    }
++}
++
++static void
++process_cbase_to_ptype_map ()
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nProcess types in cbase-to-ptypes map:\n");
++
++  for (type_alias_map::iterator it1 = cbase_to_ptype->begin ();
++       it1 != cbase_to_ptype->end (); ++it1)
++    {
++      type_set *set = it1->second;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_type_uid_with_set ("cb=(%d): ", (*type_uid_map)[it1->first],
++				cbase_to_ptype);
++      tree ctype = NULL;
++      for (type_set::const_iterator it2 = set->begin ();
++	   it2 != set->end (); it2++)
++	{
++	  tree t2 = (*type_uid_map)[*it2];
++	  if (t2 == TYPE_MAIN_VARIANT (t2))
++	    {
++	      ctype = t2;
++	      break;
++	    }
++	}
++      if (!ctype)
++	continue;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_type_with_uid ("Select canonical type: ", ctype);
++      for (type_set::const_iterator it2 = set->begin ();
++	   it2 != set->end (); it2++)
++	{
++	  tree t = (*type_uid_map)[*it2];
++	  if (!ctype_map->count (t))
++	    {
++	      (*ctype_map)[t] = ctype;
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		fprintf (dump_file, "Set canonical type for (%d)->c(%d)\n",
++			 *it2, TYPE_UID (ctype));
++	    }
++	  else if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "Canonical type is already set (%d)->c(%d)\n",
++		     *it2, TYPE_UID ((*ctype_map)[t]));
++	}
++    }
++}
++
++static void
++set_canonical_type_for_type_set (type_set *set)
++{
++  tree one_canonical = NULL;
++  for (type_set::const_iterator it = set->begin (); it != set->end (); it++)
++    {
++      tree t = (*type_uid_map)[*it];
++      gcc_assert (t);
++      if ((TYPE_CANONICAL (t) || ctype_map->count (t)))
++	{
++	  one_canonical = TYPE_CANONICAL (t) ? TYPE_CANONICAL (t)
++					     : (*ctype_map)[t];
++	  gcc_assert (COMPLETE_TYPE_P (t));
++	  break;
++	}
++    }
++  for (type_set::const_iterator it = set->begin (); it != set->end (); it++)
++    {
++      tree t = (*type_uid_map)[*it];
++      if (!ctype_map->count (t))
++	{
++	  (*ctype_map)[t] = one_canonical;
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    {
++	      if (one_canonical)
++		fprintf (dump_file, "Set canonical type for (%d)->c(%d)\n",
++			 TYPE_UID (t), TYPE_UID (one_canonical));
++	      else
++		fprintf (dump_file, "Set NULL canonical for (%d)\n", *it);
++	    }
++	}
++      else if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  tree ct = (*ctype_map)[t];
++	  fprintf (dump_file, "Canonical type is already set (%d)->c(%d)\n",
++		   TYPE_UID (t), ct ? TYPE_UID (ct) : -1);
++	}
++    }
++}
++
++static void
++dump_is_type_set_incomplete (type_set * set)
++{
++  /* Report SET in the dump only when none of its types is complete.
++     Nothing is printed as soon as one complete type is found; dump_file
++     is written to unconditionally, so the caller must have checked it.  */
++  for (type_set::const_iterator uid = set->begin (); uid != set->end ();
++       ++uid)
++    if (COMPLETE_TYPE_P ((*type_uid_map)[*uid]))
++      return;
++
++  fprintf (dump_file, "Set of incomplete types\n");
++}
++
++static void
++process_alias_type_sets ()
++{
++  if (dump_file)
++    fprintf (dump_file, "\nProcess alias sets of types:\n");
++  /* Keep processed types to process each type set (in ta_map) only once.  */
++  type_set processed_types;
++  for (type_alias_map::iterator it1 = ta_map->begin ();
++       it1 != ta_map->end (); ++it1)
++    {
++      tree type = (*type_uid_map)[it1->first];
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_type_uid_with_set ("(%d) ", type, ta_map);
++      if (processed_types.count (TYPE_UID (type)) != 0
++	  || unsafe_types->count (TYPE_UID (type)) != 0)
++	continue;
++      type_set *set = it1->second;
++      for (type_set::const_iterator it2 = set->begin ();
++	   it2 != set->end (); it2++)
++	processed_types.insert (*it2);
++      /* Check if this type set contains function pointers and
++	 non-function pointers.  */
++      bool has_no_fp = false, has_fp = false;
++      for (type_set::const_iterator it2 = set->begin ();
++	   it2 != set->end (); it2++)
++	{
++	  tree t2 = (*type_uid_map)[*it2];
++	  if (FUNCTION_POINTER_TYPE_P (t2))
++	    has_fp = true;
++	  else
++	    has_no_fp = true;
++	  if (has_fp && has_no_fp)
++	    break;
++	}
++      if (has_fp)
++	{
++	  for (type_set::const_iterator it2 = set->begin ();
++	       it2 != set->end (); it2++)
++	    {
++	      tree t2 = (*type_uid_map)[*it2];
++	      /* If it's a type set with mixed function and not-function types,
++		 mark all function pointer types in the set as unsafe.  */
++	      if (has_no_fp && FUNCTION_POINTER_TYPE_P (t2))
++		{
++		  tree ftype = TREE_TYPE (t2);
++		  unsafe_types->insert (TYPE_UID (ftype));
++		  if (dump_file && (dump_flags & TDF_DETAILS))
++		    fprintf (dump_file, "Insert function type (%d) to unsafe "
++			     "due to escape its pointer type (%d) to mixed "
++			     "alias set (printed before)\n",
++			     TYPE_UID (ftype), TYPE_UID (t2));
++		}
++	      /* If it's a type set with only function pointer types,
++		 mark all base function types in the set as aliases.  */
++	      if (!has_no_fp)
++		{
++		  gcc_assert (FUNCTION_POINTER_TYPE_P (type)
++			      && FUNCTION_POINTER_TYPE_P (t2));
++		  if (dump_file && (dump_flags & TDF_DETAILS))
++		    fprintf (dump_file, "Insert function type aliases by "
++			     "function pointer aliases:\n");
++		  register_ailas_type (TREE_TYPE (type), TREE_TYPE (t2),
++				       fta_map);
++		}
++	    }
++	}
++      set_canonical_type_for_type_set (set);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_is_type_set_incomplete (set);
++    }
++}
++
++static void
++dump_unsafe_and_canonical_types ()
++{
++  fprintf (dump_file, "\nList of unsafe types:\n");
++  for (type_set::iterator uid_it = unsafe_types->begin ();
++       uid_it != unsafe_types->end (); ++uid_it)
++    {
++      print_generic_expr (dump_file, (*type_uid_map)[*uid_it]);
++      fprintf (dump_file, " (%d)\n", *uid_it);
++    }
++  fprintf (dump_file, "\nList of alias canonical types:\n");
++  for (type_alias_map::iterator alias_it = ta_map->begin ();
++       alias_it != ta_map->end (); ++alias_it)
++    {
++      tree type = (*type_uid_map)[alias_it->first];
++      if (ctype_map->count (type) == 0)
++	continue;  /* Types without a ctype_map entry are not listed.  */
++      print_generic_expr (dump_file, type);
++      fprintf (dump_file, " -> ");
++      tree canon = (*ctype_map)[type];
++      if (canon == NULL)
++	fprintf (dump_file, " null\n");
++      else
++	{
++	  print_generic_expr (dump_file, canon);
++	  fprintf (dump_file, " (%d)->(%d)\n",
++		   TYPE_UID (type), TYPE_UID (canon));
++	}
++    }
++}
++
++static void
++init_function_type_alias_for_edge (cgraph_edge *e)
++{
++  /* Register the call's function type as an alias of itself, once.  */
++  gcc_assert (e->call_stmt != NULL);
++  tree call_type = get_call_fntype (e->call_stmt);
++  if (!fta_map->count (TYPE_UID (call_type)))
++    register_ailas_type (call_type, call_type, fta_map);
++}
++
++/* Walk all functions and make the type of every function, and of every call
++   it contains, an alias of at least itself.  */
++
++static void
++init_function_type_aliases ()
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nInit aliases for all function types.\n");
++
++  struct cgraph_node *node;
++  FOR_EACH_FUNCTION (node)
++    {
++      tree decl_type = TREE_TYPE (node->decl);
++      if (!fta_map->count (TYPE_UID (decl_type)))
++	register_ailas_type (decl_type, decl_type, fta_map);
++
++      /* Call edges are only visited for functions with a body.  */
++      if (!node->has_gimple_body_p ())
++	continue;
++      node->get_body ();
++      if (!DECL_STRUCT_FUNCTION (node->decl))
++	continue;
++
++      /* Init for function types of direct/indirect callees.  */
++      for (cgraph_edge *e = node->callees; e; e = e->next_callee)
++	init_function_type_alias_for_edge (e);
++      for (cgraph_edge *e = node->indirect_calls; e; e = e->next_callee)
++	init_function_type_alias_for_edge (e);
++    }
++}
++
++/* In lto-common.c there is the global canonical type table and the
++   corresponding machinery which detects the same types from different
++   modules and joins them by assigning a single canonical type.  However
++   LTO does not aim at a complete and precise matching, so sometimes
++   a few types have no TYPE_CANONICAL set.  Since ICP relies on
++   precise type matching, we create a similar table and register all
++   the required types in it.  */
++
++static std::map<const_tree, hashval_t> *canonical_type_hash_cache = NULL;
++static std::map<hashval_t, tree> *icp_canonical_types = NULL;
++
++static hashval_t hash_canonical_type (tree type);
++
++/* Register T in icp_canonical_types and ctype_map, computing its hash with
++   hash_canonical_type unless it is already cached.  Return the hash.  */
++
++static hashval_t
++icp_register_canonical_type (tree t)
++{
++  hashval_t hash;
++  if (canonical_type_hash_cache->count ((const_tree) t) == 0)
++    {
++      tree t1 = TYPE_MAIN_VARIANT (t);
++      if (!COMPLETE_TYPE_P (t1) && TYPE_CANONICAL (t1)
++	  && COMPLETE_TYPE_P (TYPE_CANONICAL (t1)))
++	{
++	  t1 = TYPE_CANONICAL (t1);
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "Use complete canonical (%d) for (%d)\n",
++		     TYPE_UID (t1), TYPE_UID (t));
++	}
++      hash = hash_canonical_type (t1);
++      /* Cache the hash under T itself, although it was computed from T1.  */
++      (*canonical_type_hash_cache)[(const_tree) t] = hash;
++    }
++  else
++    hash = (*canonical_type_hash_cache)[(const_tree) t];
++
++  tree new_type = t;	/* T is its own canonical type unless one exists.  */
++  if (icp_canonical_types->count (hash))
++    {
++      new_type = (*icp_canonical_types)[hash];
++      gcc_checking_assert (new_type != t);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Found canonical (%d) for (%d), h=%u\n",
++		 TYPE_UID (new_type), TYPE_UID (t), (unsigned int) hash);
++    }
++  else
++    {
++      (*icp_canonical_types)[hash] = t;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Register canonical %d, h=%u\n", TYPE_UID (t),
++		 (unsigned int) hash);
++    }
++  if (ctype_map->count (t) == 0)	/* Keep an existing mapping.  */
++    (*ctype_map)[t] = new_type;
++  return hash;
++}
++
++/* Mix the canonical hash of TYPE's main variant into HSTATE, registering the
++   type through icp_register_canonical_type when no hash is cached yet.  */
++
++static void
++iterative_hash_canonical_type (tree type, inchash::hash &hstate)
++{
++  /* Variants share TYPE_CANONICAL, so only the main variant is hashed.  */
++  tree main_type = TYPE_MAIN_VARIANT (type);
++  hashval_t type_hash;
++  if (!canonical_type_hash_cache->count ((const_tree) main_type))
++    type_hash = icp_register_canonical_type (main_type);
++  else
++    type_hash = (*canonical_type_hash_cache)[(const_tree) main_type];
++  hstate.merge_hash (type_hash);
++}
++
++/* Compute and return the canonical-type hash of TYPE.  For pointer types the
++   pointee is not hashed; only its address space is taken into account.  */
++
++static hashval_t
++hash_canonical_type (tree type)
++{
++  inchash::hash hstate;
++  enum tree_code code;
++  /* Combine a few common features of types so that types are grouped into
++     smaller sets; when searching for existing matching types to merge,
++     only existing types having the same features as the new type will be
++     checked.  */
++  code = tree_code_for_canonical_type_merging (TREE_CODE (type));
++  hstate.add_int (code);
++  if (!RECORD_OR_UNION_TYPE_P (type))
++    hstate.add_int (TYPE_MODE (type));
++  /* Incorporate common features of numerical types.  */
++  if (INTEGRAL_TYPE_P (type)
++      || SCALAR_FLOAT_TYPE_P (type)
++      || FIXED_POINT_TYPE_P (type)
++      || TREE_CODE (type) == OFFSET_TYPE
++      || POINTER_TYPE_P (type))
++    {
++      hstate.add_int (TYPE_PRECISION (type));
++      if (!type_with_interoperable_signedness (type))
++	hstate.add_int (TYPE_UNSIGNED (type));
++    }
++  if (VECTOR_TYPE_P (type))
++    {
++      hstate.add_poly_int (TYPE_VECTOR_SUBPARTS (type));
++      hstate.add_int (TYPE_UNSIGNED (type));
++    }
++  if (TREE_CODE (type) == COMPLEX_TYPE)
++    hstate.add_int (TYPE_UNSIGNED (type));
++  if (POINTER_TYPE_P (type))
++    hstate.add_int (TYPE_ADDR_SPACE (TREE_TYPE (type)));
++  /* For array types hash the domain bounds and the string flag.  */
++  if (TREE_CODE (type) == ARRAY_TYPE && TYPE_DOMAIN (type))
++    {
++      hstate.add_int (TYPE_STRING_FLAG (type));
++      /* OMP lowering can introduce error_mark_node in place of
++	 random local decls in types.  */
++      if (TYPE_MIN_VALUE (TYPE_DOMAIN (type)) != error_mark_node)
++	inchash::add_expr (TYPE_MIN_VALUE (TYPE_DOMAIN (type)), hstate);
++      if (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) != error_mark_node)
++	inchash::add_expr (TYPE_MAX_VALUE (TYPE_DOMAIN (type)), hstate);
++    }
++  /* Recurse for aggregates with a single element type.  */
++  if (TREE_CODE (type) == ARRAY_TYPE
++      || TREE_CODE (type) == COMPLEX_TYPE
++      || TREE_CODE (type) == VECTOR_TYPE)
++    iterative_hash_canonical_type (TREE_TYPE (type), hstate);
++  /* Incorporate function return and argument types, then their count.  */
++  if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE)
++    {
++      unsigned nargs = 0;
++      iterative_hash_canonical_type (TREE_TYPE (type), hstate);
++      for (tree p = TYPE_ARG_TYPES (type); p; p = TREE_CHAIN (p))
++	{
++	  iterative_hash_canonical_type (TREE_VALUE (p), hstate);
++	  nargs++;
++	}
++      hstate.add_int (nargs);
++    }
++  if (RECORD_OR_UNION_TYPE_P (type))
++    {
++      unsigned nfields = 0;	/* Only FIELD_DECLs are hashed and counted.  */
++      for (tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
++	if (TREE_CODE (f) == FIELD_DECL)
++	  {
++	    iterative_hash_canonical_type (TREE_TYPE (f), hstate);
++	    nfields++;
++	  }
++      hstate.add_int (nfields);
++    }
++  return hstate.end ();
++}
++
++/* Look TYPE up in ctype_map, then by cached hash in icp_canonical_types.  */
++
++static tree
++find_canonical_type (tree type)
++{
++  if (ctype_map->count (type) != 0)
++    return (*ctype_map)[type];
++  if (canonical_type_hash_cache->count ((const_tree) type) == 0)
++    return NULL;
++  hashval_t hash = (*canonical_type_hash_cache)[(const_tree) type];
++  return (icp_canonical_types->count (hash) != 0
++	  ? (*icp_canonical_types)[hash] : NULL_TREE);
++}
++
++/* Add TYPE to HSTATE, following pointer types down to their pointee.  Return
++   true if the walk ends at a function type without a known canonical type
++   (an "incomplete" hash).  It's used only for function type hashing.  */
++
++static bool
++initial_hash_canonical_type (tree type, inchash::hash &hstate)
++{
++  /* All type variants have same TYPE_CANONICAL.  */
++  type = TYPE_MAIN_VARIANT (type);
++  if (VOID_TYPE_P (type))
++    {
++      hstate.add_int (POINTER_TYPE);	/* void hashes as POINTER_TYPE.  */
++      return false;
++    }
++  hstate.add_int (TREE_CODE (type));
++  hstate.add_int (TYPE_MODE (type));
++  if (POINTER_TYPE_P (type))
++    {
++      tree base_type = TREE_TYPE (type);
++      hstate.add_int (TYPE_ADDR_SPACE (base_type));
++      return initial_hash_canonical_type (base_type, hstate);
++    }
++  tree ctype = find_canonical_type (type);
++  if (!ctype)
++    {
++      if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE)
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "Due to ftype (%d)\n", TYPE_UID (type));
++	  return true;
++	}
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	dump_type_with_uid ("Has NO canonical type: ", type, TDF_UID);
++      icp_register_canonical_type (type);  /* Fills ctype_map for TYPE.  */
++      if (ctype_map->count(type))
++	ctype = (*ctype_map)[type];
++      if (ctype && dump_file && (dump_flags & TDF_DETAILS))
++	dump_type_with_uid ("Found canonical type: ", ctype, TDF_UID);
++    }
++  else if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_type_with_uid ("Canonical type: ", ctype, TDF_UID);
++  hstate.add_int (TYPE_UID (ctype));
++  return false;
++}
++
++/* Return the hash of function type TYPE.  TYPE's UID is put in (or removed
++   from) INCOMPLETE_HASH_FTYPE according to whether the hash is incomplete.  */
++
++static hashval_t
++get_hash_for_ftype (tree type, type_set *incomplete_hash_ftype)
++{
++  bool incomplete = false;
++  inchash::hash hstate;
++  /* Function type is expected.  */
++  gcc_assert (TREE_CODE (type) == FUNCTION_TYPE
++	      || TREE_CODE (type) == METHOD_TYPE);
++  /* Hash return type, preferring its canonical type when one is known.  */
++  tree rt = TREE_TYPE (type);
++  tree ct = rt ? find_canonical_type (rt) : void_type_node;
++  incomplete |= initial_hash_canonical_type (ct ? ct : rt, hstate);
++  /* Hash arg types; an empty argument list is hashed as void.  */
++  tree argt = TYPE_ARG_TYPES (type);
++  if (!argt)
++    incomplete |= initial_hash_canonical_type (void_type_node, hstate);
++  for (; argt; argt = TREE_CHAIN (argt))
++    {
++      tree arg = TREE_VALUE (argt);
++      tree arg_ct = find_canonical_type (arg);
++      incomplete |= initial_hash_canonical_type (arg_ct ? arg_ct : arg,
++						 hstate);
++    }
++  if (incomplete)
++    incomplete_hash_ftype->insert (TYPE_UID (type));
++  else
++    incomplete_hash_ftype->erase (TYPE_UID (type));
++  return hstate.end ();
++}
++
++/* Find type aliases evaluating type hashes and connecting types with
++   the same hash values.  */
++
++static void
++find_type_aliases_by_compatibility ()
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nFind type aliases checking their compatibility.\n");
++
++  std::map<hashval_t, tree> hash_to_ftype;
++  type_set incomplete_hash_ftype;
++  canonical_type_hash_cache = new std::map<const_tree, hashval_t>;
++  icp_canonical_types = new std::map<hashval_t, tree>;
++
++  /* Repeat until a round assigns no new TYPE_CANONICAL.  */
++  bool changed = true;
++  for (int i = 0; changed; i++)
++    {
++      changed = false;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "Iteration %d\n", i);
++      for (type_alias_map::iterator it = fta_map->begin ();
++	   it != fta_map->end (); ++it)
++	{
++	  tree type = (*type_uid_map)[it->first];
++	  if (TYPE_CANONICAL (type))
++	    continue;
++	  hashval_t hash = get_hash_for_ftype (type, &incomplete_hash_ftype);
++	  if (incomplete_hash_ftype.count (TYPE_UID (type)) != 0)
++	    {
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		fprintf (dump_file, "Incomplete (%d), h=%u\n", TYPE_UID (type),
++			 (unsigned int) hash);
++	      continue;
++	    }
++	  if (hash_to_ftype.count (hash) == 0)
++	    hash_to_ftype[hash] = type;
++	  TYPE_CANONICAL (type) = hash_to_ftype[hash];
++	  changed = true;
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    fprintf (dump_file, "(%d)->(%d), h=%u\n", TYPE_UID (type),
++		     TYPE_UID (TYPE_CANONICAL (type)), (unsigned int) hash);
++	}
++    }
++
++  /* Do not leave the file-scope pointers dangling after the walk.  */
++  delete icp_canonical_types;
++  icp_canonical_types = NULL;
++  delete canonical_type_hash_cache;
++  canonical_type_hash_cache = NULL;
++}
++
++static void
++dump_function_type_aliases_list ()
++{
++  fprintf (dump_file, "\nList of function type aliases:\n");
++  type_alias_map::iterator entry = fta_map->begin ();
++  for (; entry != fta_map->end (); ++entry)
++    dump_type_uid_with_set ("(%d) ", (*type_uid_map)[entry->first], fta_map);
++}
++
++/* Collect the type alias sets and fill in missing canonical types.  */
++
++static void
++collect_function_type_aliases ()
++{
++  const bool dump_details = dump_file && (dump_flags & TDF_DETAILS);
++  collect_type_alias_sets ();
++  process_cbase_to_ptype_map ();
++  process_alias_type_sets ();
++  if (dump_details)
++    dump_unsafe_and_canonical_types ();
++
++  /* TODO: maybe remove this pass.  */
++  init_function_type_aliases ();
++  for (type_alias_map::iterator entry = fta_map->begin ();
++       entry != fta_map->end (); ++entry)
++    set_canonical_type_for_type_set (entry->second);
++  find_type_aliases_by_compatibility ();
++
++  if (dump_details)
++    dump_function_type_aliases_list ();
++}
++
++static void
++dump_function_signature_info (struct cgraph_node *n, tree ftype, bool varargs)
++{
++  fputs ("Function decl: ", dump_file);
++  print_generic_expr (dump_file, n->decl);
++  dump_type_uid_with_set (" with type (%d) ", ftype, fta_map, true, false);
++  if (varargs)
++    fputs ("has varargs, ", dump_file);
++  if (TREE_CODE (ftype) == METHOD_TYPE)
++    fputs ("is method, ", dump_file);
++  if (!n->address_taken)
++    fputs ("is not address taken, ", dump_file);
++  if (unsafe_types->count (TYPE_UID (ftype)) != 0)
++    fputs ("is unsafe, ", dump_file);
++  fputs ("\n", dump_file);
++}
++
++/* Return true if DECL has variadic arguments: a non-empty argument list
++   without the void terminator.  Corrected count_num_arguments ().  */
++
++static bool
++has_varargs (tree decl)
++{
++  bool seen_arg = false;
++  tree arg = TYPE_ARG_TYPES (TREE_TYPE (decl));
++  while (arg && TREE_VALUE (arg) != void_type_node)
++    {
++      seen_arg = true;
++      arg = TREE_CHAIN (arg);
++    }
++  return seen_arg && !arg;
++}
++
++/* Make all aliases of a function type share one decl set in fs_map.  */
++
++static void
++merge_fs_map_for_ftype_aliases ()
++{
++  if (dump_file)
++    fprintf (dump_file, "\n\nMerge decl sets for function type aliases:\n");
++  type_set processed_types;
++  for (type_decl_map::iterator it1 = fs_map->begin ();
++       it1 != fs_map->end (); ++it1)
++    {
++      if (processed_types.count (it1->first) != 0)
++	continue;
++      /* No alias set: nothing to merge (and operator[] would add NULL).  */
++      if (fta_map->count (it1->first) == 0)
++	continue;
++      decl_set *d_set = it1->second;
++      tree type = (*type_uid_map)[it1->first];
++      type_set *set = (*fta_map)[it1->first];
++      for (type_set::const_iterator it2 = set->begin ();
++	   it2 != set->end (); it2++)
++	{
++	  tree t2 = (*type_uid_map)[*it2];
++	  processed_types.insert (*it2);
++	  if (type == t2)
++	    continue;
++	  gcc_assert ((TREE_CODE (type) == FUNCTION_TYPE
++		       || TREE_CODE (type) == METHOD_TYPE)
++		      && (TREE_CODE (t2) == FUNCTION_TYPE
++			  || TREE_CODE (t2) == METHOD_TYPE));
++	  decl_set *old = fs_map->count (*it2) ? (*fs_map)[*it2] : NULL;
++	  (*fs_map)[*it2] = d_set;
++	  if (old == NULL)
++	    continue;
++	  gcc_assert (old->size () > 0);
++	  d_set->insert (old->begin (), old->end ());
++	  delete old;
++	}
++    }
++}
++
++/* Dump each distinct decl set of fs_map once, with its function types.  */
++
++static void
++dump_function_signature_sets ()
++{
++  fprintf (dump_file, "\n\nUnique sets of function signatures:\n");
++  std::set<decl_set *> printed;
++  for (type_decl_map::iterator entry = fs_map->begin ();
++       entry != fs_map->end (); ++entry)
++    {
++      decl_set *decls = entry->second;
++      if (!printed.insert (decls).second)
++	continue;
++      fprintf (dump_file, "{ ");
++      print_type_set (entry->first, fta_map);
++      fprintf (dump_file, " : ");
++      for (decl_set::const_iterator d = decls->begin ();
++	   d != decls->end (); ++d)
++	{
++	  if (d != decls->begin ())
++	    fprintf (dump_file, ", ");
++	  print_generic_expr (dump_file, *d);
++	  fprintf (dump_file, "(%d)", DECL_UID (*d));
++	}
++      fprintf (dump_file, "}\n");
++    }
++}
++
++/* Fill fs_map: canonical function type UID -> set of address-taken decls.  */
++
++static void
++collect_function_signatures ()
++{
++  if (dump_file)
++    fprintf (dump_file, "\n\nCollect function signatures:\n");
++  struct cgraph_node *n;
++  FOR_EACH_FUNCTION (n)
++    {
++      gcc_assert (n->decl && TREE_TYPE (n->decl));
++      tree ftype = TREE_TYPE (n->decl);
++      bool varargs = has_varargs (n->decl);
++      if (varargs && n->address_taken)
++	has_address_taken_functions_with_varargs = true;
++      if (dump_file)
++	dump_function_signature_info (n, ftype, varargs);
++      if (!n->address_taken)
++	continue;
++      /* TODO: make a separate pass at the end to remove canonicals.  */
++      tree ctype = TYPE_CANONICAL (ftype);
++      unsigned alias_type_fs = ctype ? TYPE_UID (ctype) : 0;
++      if (dump_file)  /* count () returns a size_t; cast it to match %ld.  */
++	fprintf (dump_file, "canonical type: %d %ld\n",
++		 alias_type_fs, (long) fs_map->count (alias_type_fs));
++      if (alias_type_fs)
++	{
++	  if (fs_map->count (TYPE_UID (ctype)) == 0)
++	    (*fs_map)[TYPE_UID (ctype)] = new decl_set ();
++	  if (dump_file)
++	    fprintf (dump_file, "insert decl (%d) to set of map [%d]\n",
++		     DECL_UID (n->decl), TYPE_UID (ctype));
++	  (*fs_map)[TYPE_UID (ctype)]->insert (n->decl);
++	}
++    }
++  merge_fs_map_for_ftype_aliases ();
++  if (dump_file)
++    dump_function_signature_sets ();
++}
++
++#define MAX_TARG_STAT 4	/* Target counts above this share one bucket.  */
++struct icp_stats
++{
++  int npolymorphic;	/* Polymorphic indirect calls, skipped.  */
++  int nspeculated;	/* Calls that were already speculative.  */
++  int nsubst;		/* Calls turned into direct calls.  */
++  int ncold;		/* Printed as "cold" by dump_stats.  */
++  int nmultiple;	/* Printed as "have multiple targets".  */
++  int noverwritable;	/* Targets rejected as overwritable.  */
++  int nnotdefined;	/* Targets without a definition.  */
++  int nexternal;	/* Targets that are DECL_EXTERNAL.  */
++  int nartificial;	/* Targets that are DECL_ARTIFICIAL.  */
++  int nremove;		/* Functions that may be removed.  */
++  int nicp;		/* Candidate (non-polymorphic) indirect calls.  */
++  int nspec;		/* Calls promoted speculatively.  */
++  int nf;		/* Functions seen (TDF_STATS only).  */
++  int ncalls;		/* Total calls (TDF_STATS only).  */
++  int nindir;		/* Indirect calls (TDF_STATS only).  */
++  int nind_only;	/* Functions without callers (TDF_STATS only).  */
++  int ntargs[MAX_TARG_STAT + 1];  /* Call sites per number of targets.  */
++};
++
++static void
++dump_processing_function (struct cgraph_node *n, struct icp_stats &stats)
++{
++  fprintf (dump_file, "\n\nProcesing function %s\n", n->dump_name ());
++  print_generic_expr (dump_file, n->decl);
++  fprintf (dump_file, "\n");
++  dump_type_with_uid ("Func's type: ", TREE_TYPE (n->decl));
++  if (dump_file && (dump_flags & TDF_STATS))
++    {
++      int nindir_before = stats.nindir;	/* nindir is a running total.  */
++      stats.nf++;
++      for (cgraph_edge *e = n->indirect_calls; e; e = e->next_callee)
++	stats.nindir++;
++      for (cgraph_edge *e = n->callees; e; e = e->next_callee)
++	stats.ncalls++;
++      stats.ncalls += stats.nindir - nindir_before;  /* Only N's calls.  */
++      if (n->callers == NULL)
++	{
++	  fprintf (dump_file, "Function has NO callers\n");
++	  stats.nind_only++;
++	}
++    }
++}
++
++static void
++dump_indirect_call_site (tree call_fn, tree call_fn_ty)
++{
++  fputs ("Indirect call site: ", dump_file);
++  print_generic_expr (dump_file, call_fn);
++  dump_type_with_uid ("\nFunction pointer type: ", call_fn_ty);
++}
++
++static void
++erase_from_unreachable (unsigned type_uid, type_set &unreachable)
++{
++  unreachable.erase (type_uid);
++  if (fta_map->count (type_uid) == 0)
++    return;
++  const type_set *ids = (*fta_map)[type_uid];	/* Aliases of TYPE_UID.  */
++  for (type_set::const_iterator id = ids->begin (); id != ids->end (); ++id)
++    unreachable.erase (*id);
++}
++
++static void
++dump_found_fdecls (decl_set *decls, unsigned ctype_uid)
++{
++  fprintf (dump_file, "Signature analysis FOUND decls (%d):", ctype_uid);
++  decl_set::const_iterator fdecl = decls->begin ();
++  for (; fdecl != decls->end (); ++fdecl)
++    {
++      print_generic_expr (dump_file, *fdecl);
++      fprintf (dump_file, "(%d), ", DECL_UID (*fdecl));
++    }
++  fputs (unsafe_types->count (ctype_uid) ? "type is UNSAFE\n" : "\n",
++	 dump_file);
++}
++
++static void
++count_found_targets (struct icp_stats &stats, unsigned size)
++{
++  gcc_assert (size > 0);
++  stats.ntargs[size <= MAX_TARG_STAT ? size - 1 : MAX_TARG_STAT] += 1;
++}
++
++/* Promote the indirect call E in N to a call of LIKELY_TARGET: speculatively
++   if flag_icp_speculatively is set, as a plain direct call otherwise.  */
++
++static void
++promote_call (struct cgraph_edge *e, struct cgraph_node *n,
++	      struct cgraph_node *likely_target, struct icp_stats *stats)
++{
++  if (dump_enabled_p ())
++    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt,
++		     "promoting indirect call in %s to %s\n",
++		     n->dump_name (), likely_target->dump_name ());
++  /* Prefer a non-interposable alias of a target that cannot be discarded.  */
++  if (!likely_target->can_be_discarded_p ())
++    {
++      symtab_node *sym = likely_target->noninterposable_alias ();
++      if (cgraph_node *alias = dyn_cast<cgraph_node *> (sym))
++	likely_target = alias;
++    }
++  gimple *new_call;
++  if (!flag_icp_speculatively)
++    {
++      cgraph_edge *direct = cgraph_edge::make_direct (e, likely_target);
++      new_call = cgraph_edge::redirect_call_stmt_to_callee (direct);
++      stats->nsubst++;
++    }
++  else
++    {
++      /* The speculative edge gets 5/10 of the count of E.  */
++      e->make_speculative (likely_target, e->count.apply_scale (5, 10));
++      new_call = e->call_stmt;
++      stats->nspec++;
++    }
++  if (dump_file)
++    {
++      fprintf (dump_file, "The call is substituted by: ");
++      print_gimple_stmt (dump_file, new_call, 0);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Count the defined functions without callers whose canonical type is still
++   in UNREACHABLE; such functions could be removed.  For now it is used only
++   to print stats.  */
++
++static int
++find_functions_can_be_removed (type_set &unreachable)
++{
++  int removable = 0;
++  if (dump_file)
++    fprintf (dump_file, "\nRemove unused functions:\n");
++  struct cgraph_node *n;
++  FOR_EACH_FUNCTION (n)
++    {
++      gcc_assert (n->decl && TREE_TYPE (n->decl));
++      tree ftype = TREE_TYPE (n->decl);
++      tree ctype = TYPE_CANONICAL (ftype);
++      if (n->callers != NULL || !ctype
++	  || !unreachable.count (TYPE_UID (ctype))
++	  || unsafe_types->count (TYPE_UID (ftype))
++	  || TREE_CODE (ftype) == METHOD_TYPE
++	  || !n->definition || n->alias || n->thunk || n->clones)
++	continue;
++      if (dump_file)
++	fprintf (dump_file, "%s is not used\n", n->dump_name ());
++      removable++;
++    }
++  return removable;
++}
++
++static void
++dump_stats (struct icp_stats &st)
++{
++  fprintf (dump_file, "\nSTATS: %i candidates for indirect call promotion,"
++	   " %i substituted, %i speculatively promoted, %i cold\n"
++	   "%i have multiple targets, %i already speculated, %i external,"
++	   " %i not defined, %i artificial, %i polymorphic calls,"
++	   " %i overwritable\n", st.nicp, st.nsubst, st.nspec, st.ncold,
++	   st.nmultiple, st.nspeculated, st.nexternal, st.nnotdefined,
++	   st.nartificial, st.npolymorphic, st.noverwritable);
++  if (!(dump_flags & TDF_STATS))  /* The rest is printed for TDF_STATS.  */
++    return;
++  fprintf (dump_file, "EXTRA STATS: %i functions, %i indirect calls,"
++	   " %i total calls, %i called only indirectly, %i may be removed\n"
++	   "Indirect call sites with found targets ", st.nf, st.nindir,
++	   st.ncalls, st.nind_only, st.nremove);
++  for (unsigned i = 0; i < MAX_TARG_STAT; i++)  /* Buckets 1..MAX.  */
++    fprintf (dump_file, "%u:%i, ", i + 1, st.ntargs[i]);
++  fprintf (dump_file, "more:%i\n", st.ntargs[MAX_TARG_STAT]);
++}
++
++/* Optimize indirect calls.  When an indirect call has only one target,
++   promote it into a direct call.  Return true if any call was promoted.  */
++
++static bool
++optimize_indirect_calls ()
++{
++  /* TODO: maybe move to the top of ipa_icp.  */
++  if (has_address_taken_functions_with_varargs)
++    {
++      if (dump_file)
++	fprintf (dump_file, "\n\nAddress taken function with varargs is found."
++		 " Skip the optimization.\n");
++      return false;
++    }
++  struct icp_stats stats = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++			    0, 0, 0, 0, 0, {0, 0, 0, 0, 0}};
++  /* At first assume all function types are unreachable.  */
++  type_set unreachable_ftypes;
++  if (dump_file && (dump_flags & TDF_STATS))
++    for (type_decl_map::iterator it = fs_map->begin ();
++	 it != fs_map->end (); ++it)
++      unreachable_ftypes.insert (it->first);
++
++  struct cgraph_node *n;
++  FOR_EACH_DEFINED_FUNCTION (n)
++    {
++      if (dump_file)
++	dump_processing_function (n, stats);
++      struct cgraph_edge *e;
++      bool update = false;
++      if (!opt_for_fn (n->decl, flag_icp) || !n->has_gimple_body_p ()
++	  || n->inlined_to || !n->indirect_calls)
++	{
++	  if (dump_file)
++	    fprintf (dump_file, "Skip the function\n");
++	  continue;
++	}
++      /* If the function has indirect calls which are not polymorphic,
++	 process its body, otherwise continue.  */
++      bool non_polymorphic_calls = false;
++      for (e = n->indirect_calls; e; e = e->next_callee)
++	if (!e->indirect_info->polymorphic)
++	  {
++	    non_polymorphic_calls = true;
++	    break;
++	  }
++      if (!non_polymorphic_calls)
++	{
++	  if (dump_file)
++	    fprintf (dump_file, "All indirect calls are polymorphic,"
++		     "skip...\n");
++	  continue;
++	}
++      /* Get the function body to operate with call statements.  */
++      n->get_body ();
++      /* Walk indirect call sites and apply the optimization.  */
++      cgraph_edge *next;
++      for (e = n->indirect_calls; e; e = next)
++	{
++	  next = e->next_callee;
++	  if (e->indirect_info->polymorphic)
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Target is polymorphic, skip...\n\n");
++	      stats.npolymorphic++;
++	      continue;
++	    }
++	  stats.nicp++;
++	  struct cgraph_node *likely_target = NULL;
++	  gcall *stmt = e->call_stmt;
++	  gcc_assert (stmt != NULL);
++	  tree call_fn = gimple_call_fn (stmt);
++	  tree call_fn_ty = TREE_TYPE (call_fn);
++	  if (dump_file)
++	    dump_indirect_call_site (call_fn, call_fn_ty);
++	  tree decl = NULL_TREE;
++	  if (POINTER_TYPE_P (call_fn_ty))
++	    {
++	      if (dump_file)
++		dump_type_with_uid ("Pointee type: ", TREE_TYPE (call_fn_ty));
++	      if (dump_file && (dump_flags & TDF_STATS))
++		erase_from_unreachable (TYPE_UID (TREE_TYPE (call_fn_ty)),
++					unreachable_ftypes);
++	      /* Try to use the signature analysis results.  */
++	      tree ctype = TYPE_CANONICAL (TREE_TYPE (call_fn_ty));
++	      unsigned ctype_uid = ctype ? TYPE_UID (ctype) : 0;
++	      if (ctype_uid && fs_map->count (ctype_uid))
++		{
++		  if (dump_file && (dump_flags & TDF_STATS))
++		    erase_from_unreachable (ctype_uid, unreachable_ftypes);
++		  decl_set *decls = (*fs_map)[ctype_uid];
++		  if (dump_file)
++		    dump_found_fdecls (decls, ctype_uid);
++		  /* TODO: optimize for multiple targets.  */
++		  if (!unsafe_types->count (ctype_uid) && decls->size () == 1)
++		    {
++		      decl = *(decls->begin ());
++		      likely_target = cgraph_node::get (decl);
++		    }
++		  if (dump_file && (dump_flags & TDF_STATS)
++		      && !unsafe_types->count (ctype_uid))
++		    count_found_targets (stats, decls->size ());
++		}
++	    }
++	  if (!decl || !likely_target)
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Callee is unknown\n\n");
++	      continue;
++	    }
++	  if (TREE_CODE (TREE_TYPE (decl)) == METHOD_TYPE)
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Callee is method\n\n");
++	      continue;
++	    }
++	  if (e->speculative)
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Call is already speculated\n\n");
++	      stats.nspeculated++;
++	      continue;
++	    }
++	  if (!likely_target->definition)
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Target is not a definition\n\n");
++	      stats.nnotdefined++;
++	      continue;
++	    }
++	  /* Do not introduce new references to external symbols.  While we
++	     can handle these just well, it is common for programs to
++	     incorrectly with headers defining methods they are linked
++	     with.  */
++	  if (DECL_EXTERNAL (likely_target->decl))
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Target is external\n\n");
++	      stats.nexternal++;
++	      continue;
++	    }
++	  /* Don't use an implicitly-declared destructor (c++/58678).  */
++	  struct cgraph_node *non_thunk_target
++	    = likely_target->function_symbol ();
++	  if (DECL_ARTIFICIAL (non_thunk_target->decl))
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Target is artificial\n\n");
++	      stats.nartificial++;
++	      continue;
++	    }
++	  if (likely_target->get_availability () <= AVAIL_INTERPOSABLE
++	      && likely_target->can_be_discarded_p ())
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Target is overwritable\n\n");
++	      stats.noverwritable++;
++	      continue;
++	    }
++	  else if (dbg_cnt (icp))
++	    {
++	      promote_call (e, n, likely_target, &stats);
++	      update = true;
++	    }
++	}
++      if (update)
++	ipa_update_overall_fn_summary (n);
++    }
++
++  if (dump_file && (dump_flags & TDF_STATS))
++    stats.nremove = find_functions_can_be_removed (unreachable_ftypes);
++
++  if (dump_file)
++    dump_stats (stats);
++  return stats.nsubst || stats.nspec;
++}
++
++/* Delete MAP and the sets it owns.  A set may be shared by several
++   types/decls, so every distinct set is deleted exactly once.  */
++
++template <typename MAP>
++static void
++remove_type_alias_map (MAP *map)
++{
++  /* Sets that have been deleted already.  */
++  std::set<typename MAP::mapped_type> freed;
++  for (typename MAP::iterator entry = map->begin ();
++       entry != map->end (); ++entry)
++    {
++      typename MAP::mapped_type owned = entry->second;
++      if (freed.insert (owned).second)
++	delete owned;
++    }
++  delete map;
++}
++
++/* Driver of the indirect call promotion pass: allocate the analysis tables,
++   run the type alias and signature analyses, promote the indirect calls that
++   have a single target and free the tables again.  */
++
++static unsigned int
++ipa_icp (void)
++{
++  type_uid_map = new uid_to_type_map;
++  unsafe_types = new type_set;
++  ctype_map = new type_map;
++  fs_map = new type_decl_map;
++  cbase_to_ptype = new type_alias_map;
++  fta_map = new type_alias_map;
++  ta_map = new type_alias_map;
++
++  /* Find type aliases, fill the function signature map and
++     optimize indirect calls.  */
++  collect_function_type_aliases ();
++  collect_function_signatures ();
++  const bool promoted = optimize_indirect_calls ();
++
++  remove_type_alias_map (ta_map);
++  remove_type_alias_map (fta_map);
++  remove_type_alias_map (cbase_to_ptype);
++  remove_type_alias_map (fs_map);
++  delete ctype_map;
++  delete unsafe_types;
++  delete type_uid_map;
++
++  return promoted ? TODO_remove_functions : 0;
++}
++
++namespace {
++
++const pass_data pass_data_ipa_icp =
++{
++  IPA_PASS, /* type */
++  "icp", /* name */
++  OPTGROUP_NONE, /* optinfo_flags */
++  TV_IPA_ICP, /* tv_id */
++  0, /* properties_required */
++  0, /* properties_provided */
++  0, /* properties_destroyed */
++  0, /* todo_flags_start */
++  0, /* todo_flags_finish */
++};
++
++class pass_ipa_icp : public ipa_opt_pass_d
++{
++public:
++  pass_ipa_icp (gcc::context *ctxt)
++    : ipa_opt_pass_d (pass_data_ipa_icp, ctxt,
++		      NULL, /* generate_summary */
++		      NULL, /* write_summary */
++		      NULL, /* read_summary */
++		      NULL, /* write_optimization_summary */
++		      NULL, /* read_optimization_summary */
++		      NULL, /* stmt_fixup */
++		      0, /* function_transform_todo_flags_start */
++		      NULL, /* function_transform */
++		      NULL) /* variable_transform */
++  {}
++
++  /* opt_pass methods: gated on optimize, flag_icp and LTO/whole-program.  */
++  virtual bool gate (function *)
++    {
++      return (optimize && flag_icp && !seen_error ()
++	      && (in_lto_p || flag_whole_program));
++    }
++
++  virtual unsigned int execute (function *) { return ipa_icp (); }
++
++}; // class pass_ipa_icp
++
++} // anon namespace
++
++ipa_opt_pass_d *
++make_pass_ipa_icp (gcc::context *ctxt)
++{
++  return new pass_ipa_icp (ctxt);	/* Factory for the "icp" IPA pass.  */
++}
+ 
+ #include "gt-ipa-devirt.h"
+diff --git a/gcc/passes.def b/gcc/passes.def
+index 9692066e4..d6db9be6e 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -156,6 +156,7 @@ along with GCC; see the file COPYING3.  If not see
+   NEXT_PASS (pass_ipa_profile);
+   NEXT_PASS (pass_ipa_icf);
+   NEXT_PASS (pass_ipa_devirt);
++  NEXT_PASS (pass_ipa_icp);
+   NEXT_PASS (pass_ipa_cp);
+   NEXT_PASS (pass_ipa_sra);
+   NEXT_PASS (pass_ipa_cdtor_merge);
+diff --git a/gcc/testsuite/gcc.dg/icp1.c b/gcc/testsuite/gcc.dg/icp1.c
+new file mode 100644
+index 000000000..c2117f738
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp1.c
+@@ -0,0 +1,40 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp1.c.077i.icp" } */
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++
++ftype1 func1;
++
++struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++} my_str;
++
++int foo(int a) {
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  my_str.myf2 = &bar;
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(4\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(2\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "The call is substituted by: bar \\(3\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 3 candidates for indirect call promotion, 3 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp2.c b/gcc/testsuite/gcc.dg/icp2.c
+new file mode 100644
+index 000000000..03d31d407
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp2.c
+@@ -0,0 +1,38 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp2.c.077i.icp" } */
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++
++ftype1 func1;
++
++struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++} my_str;
++
++int foo(int a) {
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  my_str.myf2 = dummy ? (ftype2) &foo : &bar;
++  func1 = (ftype1) &bar;
++  return foo(a);
++}
++
++int main() {
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump-not "The call is substituted by.*" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 3 candidates for indirect call promotion, 0 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp3.c b/gcc/testsuite/gcc.dg/icp3.c
+new file mode 100644
+index 000000000..2a7d1e6f5
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp3.c
+@@ -0,0 +1,52 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp3.c.077i.icp" } */
++
++#include <stdio.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef ftype1 (*ftype3) (ftype2);
++
++ftype1 func1;
++
++struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++ ftype3 myf3;
++} my_str;
++
++ftype1 boo(ftype2 a) {
++  printf ("Call boo\n");
++  return (ftype1) a;
++}
++
++int foo(int a) {
++  printf ("Call foo\n");
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  printf("Call bar\n");
++  my_str.myf2 = (ftype2) my_str.myf3((ftype2) foo);
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  my_str.myf3 = &boo;
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(4\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(2\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "The call is substituted by: foo \\(3\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 4 candidates for indirect call promotion, 3 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp4.c b/gcc/testsuite/gcc.dg/icp4.c
+new file mode 100644
+index 000000000..e3e1d5116
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp4.c
+@@ -0,0 +1,55 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp4.c.077i.icp" } */
++
++#include <stdio.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef ftype1 (*ftype3) (ftype2);
++
++ftype1 func1;
++ftype1 boo(ftype2 a);
++int foo(int a);
++float bar(int a);
++
++typedef struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++ ftype3 myf3;
++} T;
++
++T my_str = {0, (int*) &dummy, (ftype1) &boo, (ftype2) &foo, (ftype3) &bar};
++
++ftype1 boo(ftype2 a) {
++  printf ("Call boo\n");
++  return (ftype1) a;
++}
++
++int foo(int a) {
++  printf ("Call foo\n");
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  printf("Call bar\n");
++  my_str.myf2 = (ftype2) my_str.myf3((ftype2) foo);
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  my_str.myf3 = &boo;
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump-not "The call is substituted by.*" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 4 candidates for indirect call promotion, 0 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp5.c b/gcc/testsuite/gcc.dg/icp5.c
+new file mode 100644
+index 000000000..c7709243c
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp5.c
+@@ -0,0 +1,66 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp5.c.077i.icp" } */
++
++#include <stdio.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef ftype1 (*ftype3) (ftype2);
++
++ftype1 func1;
++ftype1 boo(ftype2 a);
++int foo(int a);
++float bar(int a);
++
++typedef struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++ ftype3 myf3;
++} T;
++
++T my_str;
++
++typedef struct {
++ int a;
++ int* b;
++ ftype3 myf1;
++ ftype2 myf2;
++ ftype1 myf3;
++} T1;
++
++T1 my1 = {0, &dummy, boo, &bar, &foo};
++
++ftype1 boo(ftype2 a) {
++  printf("Call boo\n");
++  return (ftype1) a;
++}
++
++int foo(int a) {
++  printf("Call foo\n");
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  printf("Call bar\n");
++  my_str.myf2 = (ftype2) my_str.myf3((ftype2) foo);
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  my_str = *(T*)&my1;
++  my_str.myf3 = &boo;
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump-not "The call is substituted by.*" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 4 candidates for indirect call promotion, 0 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp6.c b/gcc/testsuite/gcc.dg/icp6.c
+new file mode 100644
+index 000000000..5a9f15045
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp6.c
+@@ -0,0 +1,66 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp6.c.077i.icp -Wno-int-conversion -Wno-incompatible-pointer-types" } */
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef int (*ftype3)();
++typedef int (*ftype4)(int a, int b);
++
++ftype1 func1;
++ftype4 func2;
++
++struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++ ftype3 myf3;
++} my_str;
++
++int foo3(float a) {
++  return dummy;
++}
++
++int foo4(int a, int b) {
++  return a*b;
++}
++
++int foo(int a) {
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++int foo2(float a) {
++ func1 = (ftype1) &foo;
++ func2 = &foo4;
++ return dummy + foo3 (a);
++}
++
++float bar2(int a) {
++  my_str.myf2 = (ftype2)(0x864213);
++  func2 = 0x65378;
++  return foo(a);
++}
++
++float bar(int a) {
++  my_str.myf3 = &foo2;
++  my_str.myf2 = &bar;
++  func1 = (ftype1) &dummy;
++  func2 = (ftype4) &bar2;
++  return foo(a);
++}
++
++int main() {
++  bar(1);
++  bar2(1);
++  bar(0);
++  my_str.myf2(3);
++  ((ftype1) my_str.myf3)(0.0);
++  int sum = func1(4);
++  return (sum + my_str.myf1(2) + func2(5, 6)) != 38;
++}
++/* { dg-final { scan-ipa-dump "The call is substituted by.*foo2 \\(0\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 5 candidates for indirect call promotion, 1 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp7.c b/gcc/testsuite/gcc.dg/icp7.c
+new file mode 100644
+index 000000000..fa52197f4
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp7.c
+@@ -0,0 +1,48 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp7.c.077i.icp" } */
++
++#include <stdarg.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++
++ftype1 func1;
++
++struct {
++ int a;
++ int* b;
++ ftype1 myf1;
++ ftype2 myf2;
++} my_str;
++
++int boo(int a, ...) {
++  va_list ap;
++  va_start(ap, a);
++  if (a == 0)
++    dummy += va_arg(ap, int);
++  va_end(ap);
++  return dummy;
++}
++
++int foo(int a) {
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  my_str.myf2 = &bar;
++  func1 = (ftype1) &boo;
++  return foo(a);
++}
++
++int main() {
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4));
++}
++
++/* { dg-final { scan-ipa-dump "Address taken function with varargs is found. Skip the optimization." "icp" } } */
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index 98a5a490f..ca4156066 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -71,6 +71,7 @@ DEFTIMEVAR (TV_CGRAPHOPT             , "callgraph optimization")
+ DEFTIMEVAR (TV_CGRAPH_FUNC_EXPANSION , "callgraph functions expansion")
+ DEFTIMEVAR (TV_CGRAPH_IPA_PASSES     , "callgraph ipa passes")
+ DEFTIMEVAR (TV_IPA_ODR		     , "ipa ODR types")
++DEFTIMEVAR (TV_IPA_ICP               , "ipa indirect call promotion")
+ DEFTIMEVAR (TV_IPA_FNSUMMARY         , "ipa function summary")
+ DEFTIMEVAR (TV_IPA_UNREACHABLE       , "ipa dead code removal")
+ DEFTIMEVAR (TV_IPA_INHERITANCE       , "ipa inheritance graph")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 56898e019..5f09e4f8b 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -524,6 +524,7 @@ extern ipa_opt_pass_d *make_pass_ipa_cp (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_sra (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_icf (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_devirt (gcc::context *ctxt);
++extern ipa_opt_pass_d *make_pass_ipa_icp (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt);
+ extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt);
+-- 
+2.33.0
+
diff --git a/0035-Port-fixes-in-icp-to-GCC-12.patch b/0035-Port-fixes-in-icp-to-GCC-12.patch
new file mode 100644
index 0000000000000000000000000000000000000000..723f8b074caf1b33cdbca7e49ece489fcb4a7ba7
--- /dev/null
+++ b/0035-Port-fixes-in-icp-to-GCC-12.patch
@@ -0,0 +1,100 @@
+From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Tue, 27 Feb 2024 07:43:57 +0800
+Subject: [PATCH 07/18] Port fixes in icp to GCC 12
+
+---
+ gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 30 insertions(+), 7 deletions(-)
+
+diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
+index 383839189..318535d06 100644
+--- a/gcc/ipa-devirt.cc
++++ b/gcc/ipa-devirt.cc
+@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map)
+   if (!map->count (ftype_uid))
+     return;
+   type_set* s = (*map)[ftype_uid];
++  if (!s)
++    {
++      fprintf (dump_file, "%d (no set)", ftype_uid);
++      return;
++    }
+   for (type_set::const_iterator it = s->begin (); it != s->end (); it++)
+     fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it);
+ }
+@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2)
+       if (register_ailas_type (type1, type2, ta_map))
+ 	analyze_pointees (type1, type2);
+     }
++  unsigned type1_uid = TYPE_UID (type1);
++  unsigned type2_uid = TYPE_UID (type2);
++  if (type_uid_map->count (type1_uid) == 0)
++    (*type_uid_map)[type1_uid] = type1;
++  if (type_uid_map->count (type2_uid) == 0)
++    (*type_uid_map)[type2_uid] = type2;
++
+   /* If function and non-function type pointers alias,
+      the function type is unsafe.  */
+   if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2))
+-    unsafe_types->insert (TYPE_UID (type1));
++    unsafe_types->insert (type1_uid);
+   if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1))
+-    unsafe_types->insert (TYPE_UID (type2));
++    unsafe_types->insert (type2_uid);
+ 
+   /* Try to figure out with pointers to incomplete types.  */
+   if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
+@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1)
+ static void
+ analyze_global_var (varpool_node *var)
+ {
+-  var->get_constructor();
+   tree decl = var->decl;
+-  if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl)
+-      || integer_zerop (DECL_INITIAL (decl)))
++  if (!decl || !DECL_INITIAL (decl))
++    return;
++  var->get_constructor ();
++  if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl))
++      || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK)
+     return;
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt)
+     {
+       rhs = TREE_OPERAND (rhs, 0);
+       if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST
+-	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL)
++	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL
++	  || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL
++	  || TREE_CODE (rhs) == RESULT_DECL)
+ 	rhs_type = build_pointer_type (TREE_TYPE (rhs));
+       else if (TREE_CODE (rhs) == COMPONENT_REF)
+ 	{
+@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt)
+ 	  gcc_assert (POINTER_TYPE_P (rhs_type));
+ 	}
+       else
+-	gcc_unreachable();
++	{
++	  fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
++		   get_tree_code_name (TREE_CODE (rhs)));
++	  print_gimple_stmt (dump_file, stmt, 0);
++	  gcc_unreachable ();
++	}
+     }
+   else
+     rhs_type = TREE_TYPE (rhs);
+@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases ()
+       decl_set *d_set = it1->second;
+       tree type = (*type_uid_map)[it1->first];
+       type_set *set = (*fta_map)[it1->first];
++      if (!set)
++	continue;
+       for (type_set::const_iterator it2 = set->begin ();
+ 	   it2 != set->end (); it2++)
+ 	{
+-- 
+2.33.0
+
diff --git a/0036-Add-split-complex-instructions-pass.patch b/0036-Add-split-complex-instructions-pass.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b73affdc48bd22b9c62c6c491c28a45b27c33a9f
--- /dev/null
+++ b/0036-Add-split-complex-instructions-pass.patch
@@ -0,0 +1,1245 @@
+From 9a8e5716543972dec36bae1f9d380d27bfbcdae1 Mon Sep 17 00:00:00 2001
+From: Agrachev Andrey WX1228450 <agrachev.andrey@huawei-partners.com>
+Date: Mon, 21 Aug 2023 12:35:19 +0300
+Subject: [PATCH 09/18] Add split-complex-instructions pass
+
+ - Add option -fsplit-ldp-stp
+ - Add functionality to detect and split LDP instructions that depend on a preceding store.
+ - Add -param=param-ldp-dependency-search-range= to configure ldp dependency search range
+ - Add RTL tests
+
+Co-authored-by: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com>
+Co-authored-by: Zinin Ivan WX1305386 <zinin.ivan@huawei-partners.com>
+Co-authored-by: Gadzhiev Emin WX1195297 <gadzhiev.emin@huawei-partners.com>
+---
+ gcc/common.opt                                |   5 +
+ gcc/config/aarch64/aarch64.cc                 |  42 ++
+ gcc/doc/tm.texi                               |   8 +
+ gcc/doc/tm.texi.in                            |   4 +
+ gcc/params.opt                                |   3 +
+ gcc/passes.def                                |   1 +
+ gcc/sched-rgn.cc                              | 704 +++++++++++++++++-
+ gcc/target.def                                |  10 +
+ .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c  |  74 ++
+ .../rtl/aarch64/test-ldp-split-rearrange.c    |  40 +
+ .../gcc.dg/rtl/aarch64/test-ldp-split.c       | 174 +++++
+ gcc/timevar.def                               |   1 +
+ gcc/tree-pass.h                               |   1 +
+ 13 files changed, 1066 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c
+ create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c
+ create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index a42bee250..c0e3f5687 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1797,6 +1797,11 @@ floop-nest-optimize
+ Common Var(flag_loop_nest_optimize) Optimization
+ Enable the loop nest optimizer.
+ 
++fsplit-ldp-stp
++Common Var(flag_split_ldp_stp) Optimization
++Split load/store pair instructions into separate load/store operations
++for better performance.
++
+ fstrict-volatile-bitfields
+ Common Var(flag_strict_volatile_bitfields) Init(-1) Optimization
+ Force bitfield accesses to match their type width.
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 04072ca25..48e2eded0 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -27507,6 +27507,48 @@ aarch64_run_selftests (void)
+ 
+ #endif /* #if CHECKING_P */
+ 
++/* TODO: use a full list of instruction codes instead of icode ranges.  */
++
++bool
++is_aarch64_ldp_insn (int icode)
++{
++  if ((icode >= CODE_FOR_load_pair_sw_sisi
++	  && icode <= CODE_FOR_load_pair_dw_tftf)
++      || (icode >= CODE_FOR_loadwb_pairsi_si
++	     && icode <= CODE_FOR_loadwb_pairtf_di)
++      || (icode >= CODE_FOR_load_pairv8qiv8qi
++	     && icode <= CODE_FOR_load_pairdfdf)
++      || (icode >= CODE_FOR_load_pairv16qiv16qi
++	     && icode <= CODE_FOR_load_pairv8bfv2df)
++      || (icode >= CODE_FOR_load_pair_lanesv8qi
++	     && icode <= CODE_FOR_load_pair_lanesdf))
++    return true;
++  return false;
++}
++
++bool
++is_aarch64_stp_insn (int icode)
++{
++  if ((icode >= CODE_FOR_store_pair_sw_sisi
++	  && icode <= CODE_FOR_store_pair_dw_tftf)
++      || (icode >= CODE_FOR_storewb_pairsi_si
++	     && icode <= CODE_FOR_storewb_pairtf_di)
++      || (icode >= CODE_FOR_vec_store_pairv8qiv8qi
++	     && icode <= CODE_FOR_vec_store_pairdfdf)
++      || (icode >= CODE_FOR_vec_store_pairv16qiv16qi
++	     && icode <= CODE_FOR_vec_store_pairv8bfv2df)
++      || (icode >= CODE_FOR_store_pair_lanesv8qi
++	     && icode <= CODE_FOR_store_pair_lanesdf))
++    return true;
++  return false;
++}
++
++#undef TARGET_IS_LDP_INSN
++#define TARGET_IS_LDP_INSN is_aarch64_ldp_insn
++
++#undef TARGET_IS_STP_INSN
++#define TARGET_IS_STP_INSN is_aarch64_stp_insn
++
+ #undef TARGET_STACK_PROTECT_GUARD
+ #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
+ 
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index c5006afc0..0c6415a9c 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -12113,6 +12113,14 @@ object files that are not referenced from @code{main} and uses export
+ lists.
+ @end defmac
+ 
++@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode})
++Return true if icode is corresponding to any of the LDP instruction types.
++@end deftypefn
++
++@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode})
++Return true if icode is corresponding to any of the STP instruction types.
++@end deftypefn
++
+ @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void)
+ This target hook returns @code{true} past the point in which new jump
+ instructions could be created.  On machines that require a register for
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index f869ddd5e..6ff60e562 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -7977,6 +7977,10 @@ object files that are not referenced from @code{main} and uses export
+ lists.
+ @end defmac
+ 
++@hook TARGET_IS_LDP_INSN
++
++@hook TARGET_IS_STP_INSN
++
+ @hook TARGET_CANNOT_MODIFY_JUMPS_P
+ 
+ @hook TARGET_HAVE_CONDITIONAL_EXECUTION
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 7fcc2398d..6176d4790 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -1217,4 +1217,7 @@ Enum(vrp_mode) String(ranger) Value(VRP_MODE_RANGER)
+ Common Joined UInteger Var(param_pointer_compression_size) Init(32) IntegerRange(8, 32) Param Optimization
+ Target size of compressed pointer, which should be 8, 16 or 32.
+ 
++-param=param-ldp-dependency-search-range=
++Common Joined UInteger Var(param_ldp_dependency_search_range) Init(16) IntegerRange(1, 32) Param Optimization
++Range for depended ldp search in split-ldp-stp path.
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/passes.def b/gcc/passes.def
+index 941bbadf0..a30e05688 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -514,6 +514,7 @@ along with GCC; see the file COPYING3.  If not see
+ 	  NEXT_PASS (pass_reorder_blocks);
+ 	  NEXT_PASS (pass_leaf_regs);
+ 	  NEXT_PASS (pass_split_before_sched2);
++	  NEXT_PASS (pass_split_complex_instructions);
+ 	  NEXT_PASS (pass_sched2);
+ 	  NEXT_PASS (pass_stack_regs);
+ 	  PUSH_INSERT_PASSES_WITHIN (pass_stack_regs)
+diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc
+index a0dfdb788..b4df8bdc5 100644
+--- a/gcc/sched-rgn.cc
++++ b/gcc/sched-rgn.cc
+@@ -44,6 +44,8 @@ along with GCC; see the file COPYING3.  If not see
+    are actually scheduled.  */
+ 
+ #include "config.h"
++#define INCLUDE_SET
++#define INCLUDE_VECTOR
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -65,6 +67,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "dbgcnt.h"
+ #include "pretty-print.h"
+ #include "print-rtl.h"
++#include "cfgrtl.h"
+ 
+ /* Disable warnings about quoting issues in the pp_xxx calls below
+    that (intentionally) don't follow GCC diagnostic conventions.  */
+@@ -3951,6 +3954,705 @@ make_pass_sched_fusion (gcc::context *ctxt)
+   return new pass_sched_fusion (ctxt);
+ }
+ 
++namespace {
++
++/* Def-use analysis special functions implementation.  */
++
++static struct df_link *
++get_defs (rtx_insn *insn, rtx reg)
++{
++  df_ref use;
++  struct df_link *ref_chain, *ref_link;
++
++  FOR_EACH_INSN_USE (use, insn)
++    {
++      if (GET_CODE (DF_REF_REG (use)) == SUBREG)
++	return NULL;
++      if (REGNO (DF_REF_REG (use)) == REGNO (reg))
++	break;
++    }
++
++  gcc_assert (use != NULL);
++
++  ref_chain = DF_REF_CHAIN (use);
++
++  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
++    {
++      /* Problem getting some definition for this instruction.  */
++      if (ref_link->ref == NULL)
++	return NULL;
++      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
++	return NULL;
++      /* As global regs are assumed to be defined at each function call
++	  dataflow can report a call_insn as being a definition of REG.
++	  But we can't do anything with that in this pass so proceed only
++	  if the instruction really sets REG in a way that can be deduced
++	  from the RTL structure.  */
++      if (global_regs[REGNO (reg)]
++	  && !set_of (reg, DF_REF_INSN (ref_link->ref)))
++	return NULL;
++    }
++
++  return ref_chain;
++}
++
++static struct df_link *
++get_uses (rtx_insn *insn, rtx reg)
++{
++  df_ref def;
++  struct df_link *ref_chain, *ref_link;
++
++  FOR_EACH_INSN_DEF (def, insn)
++    if (REGNO (DF_REF_REG (def)) == REGNO (reg))
++      break;
++
++  gcc_assert (def != NULL && "Broken def-use analisys chain.");
++
++  ref_chain = DF_REF_CHAIN (def);
++
++  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
++    {
++      /* Problem getting some use for this instruction.  */
++      if (ref_link->ref == NULL)
++	return NULL;
++    }
++
++  return ref_chain;
++}
++
++const pass_data pass_data_split_complex_instructions = {
++  RTL_PASS,			     /* Type.  */
++  "split_complex_instructions",	     /* Name.  */
++  OPTGROUP_NONE,		     /* Optinfo_flags.  */
++  TV_SPLIT_CMP_INS,		     /* Tv_id.  */
++  0,				     /* Properties_required.  */
++  0,				     /* Properties_provided.  */
++  0,				     /* Properties_destroyed.  */
++  0,				     /* Todo_flags_start.  */
++  (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish.  */
++};
++
++class pass_split_complex_instructions : public rtl_opt_pass
++{
++private:
++  enum complex_instructions_t
++  {
++    UNDEFINED,
++    LDP,
++    LDP_TI,
++    STP,
++    STR
++  };
++
++  void split_complex_insn (rtx_insn *insn);
++  void split_ldp_ti (rtx_insn *insn);
++  void split_ldp_with_offset (rtx_insn *ldp_insn);
++  void split_simple_ldp (rtx_insn *ldp_insn);
++  void split_ldp_stp (rtx_insn *insn);
++  complex_instructions_t get_insn_type (rtx_insn *insn);
++
++  basic_block bb;
++  rtx_insn *insn;
++  std::set<rtx_insn *> dependent_stores_candidates;
++  std::set<rtx_insn *> ldp_to_split_list;
++
++  complex_instructions_t complex_insn_type = UNDEFINED;
++  bool is_store_insn (rtx_insn *insn);
++  bool is_ldp_dependent_on_store (rtx_insn *ldp_insn, basic_block bb);
++  bool bfs_for_reg_dependent_store (rtx_insn *ldp_insn, basic_block search_bb,
++				    rtx_insn *search_insn,
++				    int search_range
++				    = param_ldp_dependency_search_range);
++  bool is_store_reg_dependent (rtx_insn *ldp_insn, rtx_insn *str_insn);
++  void init_df ();
++  void find_dependent_stores_candidates (rtx_insn *ldp_insn);
++  int get_insn_offset (rtx_insn *insn, complex_instructions_t insn_type,
++		       int *arith_operation_ptr = NULL);
++
++public:
++  pass_split_complex_instructions (gcc::context *ctxt)
++      : rtl_opt_pass (pass_data_split_complex_instructions, ctxt)
++  {
++  }
++  /* opt_pass methods: */
++  virtual bool gate (function *);
++
++  virtual unsigned int
++  execute (function *)
++  {
++    enum rtx_code ldp_memref_code;
++    init_df ();
++    ldp_to_split_list.clear ();
++    FOR_EACH_BB_FN (bb, cfun)
++      {
++	FOR_BB_INSNS (bb, insn)
++	  {
++	    complex_instructions_t insn_type = get_insn_type (insn);
++	    /* TODO: Add splitting of STP instructions.  */
++	    if (insn_type != LDP && insn_type != LDP_TI)
++	      continue;
++	    /* TODO: Currently support only ldp_ti and ldp with REG or
++	       PLUS/MINUS offset expression.  */
++	    if (insn_type == LDP_TI)
++	      {
++		ldp_memref_code = GET_CODE (XEXP (XEXP (PATTERN (insn), 1),
++						  0));
++		if (ldp_memref_code != REG && ldp_memref_code != PLUS
++		    && ldp_memref_code != MINUS)
++		  continue;
++	      }
++	    if (is_ldp_dependent_on_store (insn, bb))
++	      {
++		ldp_to_split_list.insert (insn);
++	      }
++	  }
++      }
++
++    for (std::set<rtx_insn *>::iterator i = ldp_to_split_list.begin ();
++	 i != ldp_to_split_list.end (); ++i)
++      split_complex_insn (*i);
++
++    return 0;
++  }
++}; // class pass_split_complex_instructions
++
++bool
++pass_split_complex_instructions::is_ldp_dependent_on_store (rtx_insn *ldp_insn,
++							    basic_block bb)
++{
++  find_dependent_stores_candidates (ldp_insn);
++  return bfs_for_reg_dependent_store (ldp_insn, bb, ldp_insn);
++}
++
++bool
++pass_split_complex_instructions::bfs_for_reg_dependent_store (
++    rtx_insn *ldp_insn, basic_block search_bb, rtx_insn *search_insn,
++    int search_range)
++{
++  rtx_insn *current_search_insn = search_insn;
++
++  for (int i = search_range; i > 0; --i)
++    {
++      if (!current_search_insn)
++	return false;
++      bool checking_result
++	  = is_store_reg_dependent (ldp_insn, current_search_insn);
++      if (checking_result)
++	{
++	  if (dump_file)
++	    {
++	      fprintf (dump_file, "LDP to split:\n");
++	      print_rtl_single (dump_file, ldp_insn);
++	      fprintf (dump_file, "Found STR:\n");
++	      print_rtl_single (dump_file, current_search_insn);
++	    }
++	  return true;
++	}
++      if (current_search_insn == BB_HEAD (search_bb))
++	{
++	  /* Search in all parent BBs for the reg_dependent store.  */
++	  edge_iterator ei;
++	  edge e;
++
++	  FOR_EACH_EDGE (e, ei, search_bb->preds)
++	    if (e->src->index != 0
++		&& bfs_for_reg_dependent_store (ldp_insn, e->src,
++						BB_END (e->src), i - 1))
++	      return true;
++	  return false;
++	}
++      else
++	{
++	  if (!active_insn_p (current_search_insn))
++	    i++;
++	  current_search_insn = PREV_INSN (current_search_insn);
++	}
++    }
++  return false;
++}
++
++void
++pass_split_complex_instructions::init_df ()
++{
++  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
++  df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN);
++  df_mir_add_problem ();
++  df_live_add_problem ();
++  df_live_set_all_dirty ();
++  df_analyze ();
++  df_set_flags (DF_DEFER_INSN_RESCAN);
++}
++
++void
++pass_split_complex_instructions::find_dependent_stores_candidates (
++    rtx_insn *ldp_insn)
++{
++  dependent_stores_candidates.clear ();
++  df_ref use;
++
++  FOR_EACH_INSN_USE (use, ldp_insn)
++    {
++      df_link *defs = get_defs (ldp_insn, DF_REF_REG (use));
++      if (!defs)
++	return;
++
++      for (df_link *def = defs; def; def = def->next)
++	{
++	  df_link *uses
++	      = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref));
++	  if (!uses)
++	    continue;
++
++	  for (df_link *use = uses; use; use = use->next)
++	    {
++	      if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR
++		  && is_store_insn (DF_REF_INSN (use->ref)))
++		dependent_stores_candidates.insert (DF_REF_INSN (use->ref));
++	    }
++	}
++    }
++}
++
++bool
++pass_split_complex_instructions::is_store_reg_dependent (rtx_insn *ldp_insn,
++							 rtx_insn *str_insn)
++{
++  if (!is_store_insn (str_insn)
++      || dependent_stores_candidates.find (str_insn)
++	     == dependent_stores_candidates.end ())
++    return false;
++
++  int ldp_offset_sign = UNDEFINED;
++  int ldp_offset
++      = get_insn_offset (ldp_insn, get_insn_type (ldp_insn), &ldp_offset_sign);
++  if (ldp_offset_sign == MINUS)
++    ldp_offset = -ldp_offset;
++
++  int str_offset_sign = UNDEFINED;
++  int str_offset = get_insn_offset (str_insn, STR, &str_offset_sign);
++  if (str_offset_sign == MINUS)
++    str_offset = -str_offset;
++
++  if (str_offset == ldp_offset || str_offset == ldp_offset + 8)
++    return true;
++
++  return false;
++}
++
++bool
++pass_split_complex_instructions::is_store_insn (rtx_insn *insn)
++{
++  if (!insn)
++    return false;
++  rtx sset_b = single_set (insn);
++  /* TODO: The condition below accepts only store instructions in which the
++     memory location's operand is either a register (base) or a plus/minus
++     operation (base + #imm).  It might make sense to add support for other
++     cases (e.g. multiply and shift).  */
++  if (sset_b && MEM_P (SET_DEST (sset_b))
++      && GET_MODE (XEXP (sset_b, 0)) != BLKmode
++      && (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == REG
++	  || (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == PLUS
++	      || GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == MINUS)
++	  && (GET_CODE (XEXP (XEXP (XEXP (sset_b, 0), 0), 1)) == CONST_INT)))
++    return true;
++
++  return false;
++}
++
++int
++pass_split_complex_instructions::get_insn_offset (
++    rtx_insn *insn, complex_instructions_t insn_type, int *arith_operation_ptr)
++{
++  rtx insn_pat = PATTERN (insn);
++  int returned_offset = 0;
++
++  rtx offset_expr = NULL;
++  rtx offset_value_expr = NULL;
++
++  switch (insn_type)
++    {
++    case LDP:
++      {
++	int number_of_sub_insns = XVECLEN (insn_pat, 0);
++
++	/* Calculate it's own ofsset of first load insn.  */
++	rtx_insn *first_load_insn = NULL;
++	if (number_of_sub_insns == 2)
++	  {
++	    first_load_insn
++		= make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0)));
++	    arith_operation_ptr = NULL;
++
++	    offset_expr = XEXP (XEXP (PATTERN (first_load_insn), 1), 0);
++	    if (GET_CODE (offset_expr) == PLUS
++		|| GET_CODE (offset_expr) == MINUS)
++	      offset_value_expr
++		  = XEXP (XEXP (XEXP (PATTERN (first_load_insn), 1), 0), 1);
++	    else
++	      offset_expr = NULL;
++	  }
++	else if (number_of_sub_insns == 3)
++	  {
++	    rtx_insn *offset_sub_insn
++		= make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0)));
++
++	    offset_expr = XEXP (PATTERN (offset_sub_insn), 1);
++	    offset_value_expr = XEXP (XEXP (PATTERN (offset_sub_insn), 1), 1);
++	  }
++	else
++	  {
++	    gcc_assert (false
++			&& "Wrong number of elements in the ldp_insn vector");
++	  }
++	break;
++      }
++    case LDP_TI:
++      {
++	offset_expr = XEXP (XEXP (insn_pat, 1), 0);
++	if (GET_CODE (offset_expr) != PLUS && GET_CODE (offset_expr) != MINUS)
++	  return 0;
++	offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 1), 0), 1);
++	break;
++      }
++    case STR:
++      {
++	offset_expr = XEXP (XEXP (insn_pat, 0), 0);
++	/* If memory location is specified by single base register then the
++	   offset is zero.  */
++	if (GET_CODE (offset_expr) == REG)
++	  return 0;
++	offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 0), 0), 1);
++	break;
++      }
++    default:
++      {
++	if (dumps_are_enabled && dump_file)
++	  {
++	    fprintf (dump_file, "Instruction that was tried to split:\n");
++	    print_rtl_single (dump_file, insn);
++	  }
++	gcc_assert (false && "Unsupported instruction type");
++	break;
++      }
++    }
++
++  if (offset_expr != NULL && offset_value_expr
++      && GET_CODE (offset_value_expr) == CONST_INT)
++    returned_offset = XINT (offset_value_expr, 0);
++
++  if (arith_operation_ptr != NULL)
++    {
++      *arith_operation_ptr = GET_CODE (offset_expr);
++      gcc_assert ((*arith_operation_ptr == MINUS
++		   || *arith_operation_ptr == PLUS)
++		  && "Unexpected arithmetic operation in the offset expr");
++    }
++
++  return returned_offset;
++}
++
++void
++pass_split_complex_instructions::split_simple_ldp (rtx_insn *ldp_insn)
++{
++  rtx pat = PATTERN (ldp_insn);
++
++  rtx_insn *mem_insn_1 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 0)));
++  rtx_insn *mem_insn_2 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 1)));
++
++  int dest_regno = REGNO (SET_DEST (PATTERN (mem_insn_1)));
++  int src_regno;
++
++  rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (mem_insn_1)), 0);
++
++  if (GET_CODE (srs_reg_insn) == REG)
++    src_regno = REGNO (srs_reg_insn);
++  else
++    src_regno = REGNO (XEXP (srs_reg_insn, 0));
++
++  rtx_insn *emited_insn_1, *emited_insn_2;
++
++  /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first.  */
++  if (src_regno == dest_regno)
++    std::swap (mem_insn_1, mem_insn_2);
++
++  emited_insn_1 = emit_insn (PATTERN (mem_insn_1));
++  emited_insn_2 = emit_insn (PATTERN (mem_insn_2));
++
++  int sub_insn_1_code = recog (PATTERN (mem_insn_1), mem_insn_1, 0);
++  int sub_insn_2_code = recog (PATTERN (mem_insn_2), mem_insn_2, 0);
++
++  INSN_CODE (emited_insn_1) = sub_insn_1_code;
++  INSN_CODE (emited_insn_2) = sub_insn_2_code;
++}
++
++void
++pass_split_complex_instructions::split_ldp_with_offset (rtx_insn *ldp_insn)
++{
++  rtx pat = PATTERN (ldp_insn);
++  bool post_index = true;
++
++  rtx_insn offset_insn;
++  rtx_insn mem_insn_1;
++  rtx_insn mem_insn_2;
++
++  int offset_insn_code;
++  int mem_insn_1_code = -1;
++  int mem_insn_2_code = -1;
++
++  int offset = 0;
++  int arith_operation = UNDEFINED;
++
++  for (int i = 0; i < 3; i++)
++    {
++      rtx sub_insn = XVECEXP (pat, 0, i);
++      rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn));
++      int sub_insn_code
++	  = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0);
++
++      /* If sub_insn is offset related.  */
++      if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY)
++	{
++	  offset_insn = *copy_of_sub_insn;
++	  offset_insn_code = sub_insn_code;
++	  gcc_assert (i == 0
++		      && "Offset related insn must be the first "
++			 "element of a parallel insn vector");
++
++	  offset = get_insn_offset (ldp_insn, LDP, &arith_operation);
++	}
++      else
++	{
++	  if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG)
++	    {
++	      rtx &offset_expr
++		  = XEXP (XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1);
++	      if (GET_CODE (offset_expr) == CONST_INT)
++		{
++		  int local_offset = XINT (offset_expr, 0);
++		  offset = (arith_operation == PLUS ? offset : -offset);
++
++		  offset_expr = GEN_INT (local_offset + offset);
++
++		  gcc_assert (
++		      (arith_operation == MINUS || arith_operation == PLUS)
++		      && "Unexpected arithmetic operation in offset related "
++			 "sub_insn");
++
++		  if (i == 1)
++		    post_index = false;
++		}
++	      else
++		{
++		  post_index = true;
++		}
++	    }
++	}
++      if (i == 1)
++	{
++	  mem_insn_1 = *copy_of_sub_insn;
++	  mem_insn_1_code = sub_insn_code;
++	}
++      if (i == 2)
++	{
++	  mem_insn_2 = *copy_of_sub_insn;
++	  mem_insn_2_code = sub_insn_code;
++	}
++    }
++  gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1
++	      && "Uninitialized memory insns");
++
++  int dest_regno = REGNO (SET_DEST (PATTERN (&mem_insn_1)));
++  int src_regno;
++
++  rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (&mem_insn_1)), 0);
++
++  if (GET_CODE (srs_reg_insn) == REG)
++    src_regno = REGNO (srs_reg_insn);
++  else
++    src_regno = REGNO (XEXP (srs_reg_insn, 0));
++
++  /* Don't split such weird LDP.  */
++  if (src_regno == dest_regno)
++    return;
++
++  rtx_insn *emited_offset_insn;
++  if (!post_index)
++    {
++      emited_offset_insn = emit_insn (PATTERN (&offset_insn));
++      INSN_CODE (emited_offset_insn) = offset_insn_code;
++    }
++
++  rtx_insn *emited_insn_1 = emit_insn (PATTERN (&mem_insn_1));
++  rtx_insn *emited_insn_2 = emit_insn (PATTERN (&mem_insn_2));
++
++
++  INSN_CODE (emited_insn_1) = mem_insn_1_code;
++  INSN_CODE (emited_insn_2) = mem_insn_2_code;
++
++  if (post_index)
++    {
++      emited_offset_insn = emit_insn (PATTERN (&offset_insn));
++      INSN_CODE (emited_offset_insn) = offset_insn_code;
++    }
++}
++
++void
++pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn)
++{
++  rtx_insn *prev_insn = PREV_INSN (insn);
++  int number_of_sub_insns = XVECLEN (PATTERN (insn), 0);
++
++  start_sequence ();
++
++  if (number_of_sub_insns == 2)
++    split_simple_ldp (insn);
++  else if (number_of_sub_insns == 3)
++    split_ldp_with_offset (insn);
++  else
++    gcc_assert (false && "Broken complex insn vector");
++
++  rtx_insn *seq = get_insns ();
++  unshare_all_rtl_in_chain (seq);
++  end_sequence ();
++
++  emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn));
++  delete_insn_and_edges (insn);
++}
++
++void
++pass_split_complex_instructions::split_ldp_ti (rtx_insn *insn)
++{
++  rtx_insn *prev_insn = PREV_INSN (insn);
++  rtx_insn *load_insn_1 = make_insn_raw (copy_rtx (PATTERN (insn)));
++  rtx_insn *load_insn_2 = make_insn_raw (copy_rtx (PATTERN (insn)));
++
++  rtx reg_insn_1 = XEXP (PATTERN (load_insn_1), 0);
++  rtx mem_insn_1 = XEXP (PATTERN (load_insn_1), 1);
++  rtx mem_insn_2 = XEXP (PATTERN (load_insn_2), 1);
++
++  PUT_MODE (mem_insn_1, DImode);
++  PUT_MODE (mem_insn_2, DImode);
++
++  int reg_no_1 = REGNO (reg_insn_1);
++
++  XEXP (PATTERN (load_insn_1), 0) = gen_rtx_REG (DImode, reg_no_1);
++  XEXP (PATTERN (load_insn_2), 0) = gen_rtx_REG (DImode, reg_no_1 + 1);
++
++  rtx load_insn_2_plus_expr = XEXP (XEXP (PATTERN (load_insn_2), 1), 0);
++  if (GET_CODE (load_insn_2_plus_expr) == REG)
++    {
++	XEXP (XEXP (PATTERN (load_insn_2), 1), 0)
++	  = gen_rtx_PLUS (DImode,
++			  gen_rtx_REG (DImode, REGNO (load_insn_2_plus_expr)),
++			  GEN_INT (GET_MODE_SIZE (DImode)));
++    }
++  else
++    {
++      rtx load_insn_2_offset_expr
++      = XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1);
++
++      if (load_insn_2_offset_expr == NULL)
++	return;
++
++      if (GET_CODE (load_insn_2_offset_expr) == CONST_INT)
++	{
++	  int load_insn_2_offset = XINT (load_insn_2_offset_expr, 0);
++	  XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1)
++	    = GEN_INT (load_insn_2_offset + GET_MODE_SIZE (DImode));
++	}
++    }
++
++  start_sequence ();
++
++  int src_regno;
++  rtx srs_reg_insn = XEXP (XEXP (PATTERN (load_insn_1), 1), 0);
++
++  if (GET_CODE (srs_reg_insn) == REG)
++    src_regno = REGNO (srs_reg_insn);
++  else
++    src_regno = REGNO (XEXP (srs_reg_insn, 0));
++
++  /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first.  */
++  if (src_regno == reg_no_1)
++    std::swap (load_insn_1, load_insn_2);
++
++  rtx_insn *emited_load_insn_1 = emit_insn (PATTERN (load_insn_1));
++  rtx_insn *emited_load_insn_2 = emit_insn (PATTERN (load_insn_2));
++
++  INSN_CODE (emited_load_insn_1)
++      = recog (PATTERN (emited_load_insn_1), emited_load_insn_1, 0);
++  INSN_CODE (emited_load_insn_2)
++      = recog (PATTERN (emited_load_insn_2), emited_load_insn_2, 0);
++
++  rtx_insn *seq = get_insns ();
++  unshare_all_rtl_in_chain (seq);
++  end_sequence ();
++
++  emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn));
++  delete_insn_and_edges (insn);
++}
++
++void
++pass_split_complex_instructions::split_complex_insn (rtx_insn *insn)
++{
++  complex_instructions_t insn_type = get_insn_type (insn);
++  /* TODO: Add splitting of STP instructions.  */
++  if (insn_type == LDP || insn_type == STP)
++    split_ldp_stp (insn);
++  else if (insn_type == LDP_TI)
++    split_ldp_ti (insn);
++  else
++    gcc_assert (false && "Unsupported type of insn to split");
++}
++
++pass_split_complex_instructions::complex_instructions_t
++pass_split_complex_instructions::get_insn_type (rtx_insn *insn)
++{
++  if (!INSN_P (insn))
++    return UNDEFINED;
++
++  rtx pat = PATTERN (insn);
++  int icode = recog (PATTERN (insn), insn, NULL);
++
++  if (GET_CODE (pat) == PARALLEL)
++    {
++      if (targetm.is_ldp_insn (icode))
++	{
++	  return LDP;
++	}
++      if (targetm.is_stp_insn (icode))
++	{
++	  return STP;
++	}
++      else
++	{
++	  return UNDEFINED;
++	}
++    }
++  rtx set_insn = single_set (insn);
++  if (set_insn && GET_CODE (XEXP (set_insn, 1)) == MEM
++      && GET_MODE (XEXP (set_insn, 1)) == E_TImode)
++    return LDP_TI;
++
++  return UNDEFINED;
++}
++
++bool
++pass_split_complex_instructions::gate (function *)
++{
++  return targetm.is_ldp_insn && targetm.is_stp_insn && optimize > 0
++	 && flag_split_ldp_stp > 0;
++}
++
++} // anon namespace
++
++rtl_opt_pass *
++make_pass_split_complex_instructions (gcc::context *ctxt)
++{
++  return new pass_split_complex_instructions (ctxt);
++}
++
+ #if __GNUC__ >= 10
+ #  pragma GCC diagnostic pop
+-#endif
++#endif
+\ No newline at end of file
+diff --git a/gcc/target.def b/gcc/target.def
+index d85adf36a..a3a50b474 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2677,6 +2677,16 @@ modes and they have different conditional execution capability, such as ARM.",
+  bool, (void),
+  default_have_conditional_execution)
+ 
++DEFHOOK
++(is_ldp_insn,
++  "Return true if icode is corresponding to any of the LDP instruction types.",
++  bool, (int icode), NULL)
++
++DEFHOOK
++(is_stp_insn,
++  "Return true if icode is corresponding to any of the STP instruction types.",
++  bool, (int icode), NULL)
++
+ DEFHOOK
+ (gen_ccmp_first,
+  "This function prepares to emit a comparison insn for the first compare in a\n\
+diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c
+new file mode 100644
+index 000000000..3918d43f6
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c
+@@ -0,0 +1,74 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-additional-options "-fsplit-ldp-stp" } */
++/*
++ *    Tests are:
++ *          Patterns where LDP insns should NOT be split
++ *                       */
++
++int __RTL (startwith ("split_complex_instructions"))
++simple_ldp_after_store ()
++{
++(function "simple_ldp_after_store"
++  (insn-chain
++    (block 2
++      (edge-from entry (flags "FALLTHRU"))
++      (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 228 (set (reg/i:DI sp) 
++                   (reg/i:DI x0)))
++      (cinsn 101 (set (mem/c:DI
++                        (plus:DI (reg/f:DI sp)
++                          (const_int 32))[1 S4 A32])(reg:DI x0)))
++      (cinsn 10 (parallel [
++        (set (reg:DI x29)
++          (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32]))
++        (set (reg:DI x30)
++          (mem:DI (plus:DI (reg/f:DI sp)
++            (const_int 16)) [1 S4 A32]))]))
++      (cinsn 11 (use (reg/i:DI sp)))
++      (cinsn 12 (use (reg/i:DI cc)))
++      (cinsn 13 (use (reg/i:DI x29)))
++      (cinsn 14 (use (reg/i:DI x30)))
++      (cinsn 15 (use (reg/i:DI x0)))
++      (edge-to exit (flags "FALLTHRU"))
++    ) ;; block 2
++  ) ;; insn-chain
++) ;; function "simple_ldp_after_store"
++}
++
++int __RTL (startwith ("split_complex_instructions"))
++ldp_after_store_in_different_bb ()
++{
++(function "ldp_after_store_in_different_bb"
++  (insn-chain
++    (block 2
++      (edge-from entry (flags "FALLTHRU"))
++      (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 228 (set (reg/i:DI sp) 
++                   (reg/i:DI x0)))
++      (cinsn 101 (set (mem/c:DI
++                        (plus:DI (reg/f:DI sp)
++                          (const_int 32))[1 S4 A32])(reg:DI x0)))
++      (edge-to 3 (flags "FALLTHRU"))
++    ) ;; block 2
++    (block 3
++      (edge-from 2 (flags "FALLTHRU"))
++      (cnote 4 [bb 3] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 10 (parallel [
++        (set (reg:DI x29)
++          (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32]))
++        (set (reg:DI x30)
++          (mem:DI (plus:DI (reg/f:DI sp)
++            (const_int 16)) [1 S4 A32]))]))
++      (cinsn 11 (use (reg/i:DI sp)))
++      (cinsn 12 (use (reg/i:DI cc)))
++      (cinsn 13 (use (reg/i:DI x29)))
++      (cinsn 14 (use (reg/i:DI x30)))
++      (cinsn 15 (use (reg/i:DI x0)))
++      (edge-to exit (flags "FALLTHRU"))
++    ) ;; block 3
++  ) ;; insn-chain
++) ;; function "ldp_after_store_in_different_bb"
++}
++
++/* Verify that the output code contains exactly 2 ldp.  */
++/* { dg-final { scan-assembler-times {ldp\t} 2 } }  */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c
+new file mode 100644
+index 000000000..653c30f83
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c
+@@ -0,0 +1,40 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-additional-options "-fsplit-ldp-stp" } */
++/*
++ *    Test is:
++ *        Pattern where LDP insns should be split with rearrangement in order
++ *        to deal with data dependecy betwen subinstruction.  
++ *                                                                          */
++
++int __RTL (startwith ("split_complex_instructions"))
++simple_ldp_after_store ()
++{
++(function "ldp_equal_registers"
++  (insn-chain
++    (block 2
++      (edge-from entry (flags "FALLTHRU"))
++      (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 228 (set (reg/i:DI x1) 
++                   (reg/i:DI x0)))
++      (cinsn 101 (set (mem/c:DI
++                        (plus:DI (reg/f:DI x1)
++                          (const_int 8))[1 S4 A32])(reg:DI x0)))
++      (cinsn 10 (parallel [
++        (set (reg:DI x1)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int 8)) [1 S4 A32]))
++        (set (reg:DI x2)
++          (mem:DI (plus:DI (reg/f:DI x1)
++            (const_int 16)) [1 S4 A32]))]))
++      (cinsn 11 (use (reg/i:DI sp)))
++      (cinsn 12 (use (reg/i:DI cc)))
++      (cinsn 13 (use (reg/i:DI x0)))
++      (cinsn 14 (use (reg/i:DI x1)))
++      (cinsn 15 (use (reg/i:DI x2)))
++      (edge-to exit (flags "FALLTHRU"))
++    ) ;; block 2
++  ) ;; insn-chain
++) ;; function "ldp_equal_registers"
++}
++
++/* Verify that the output code rearrange ldrs.  */
++/* { dg-final { scan-assembler-times ".*ldr.*x2.*x1,.*16.*ldr.*x1.*x1.*8" 1 } }  */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c
+new file mode 100644
+index 000000000..dc9f26efb
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c
+@@ -0,0 +1,174 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-additional-options "-O1 -fsplit-ldp-stp" } */
++/*
++ *    Tests are:
++ *          Patterns where LDP insns should be split
++ *                       */
++
++int __RTL (startwith ("split_complex_instructions"))
++simple_ldp_after_store ()
++{
++(function "simple_ldp_after_store"
++  (insn-chain
++    (block 2
++      (edge-from entry (flags "FALLTHRU"))
++      (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 228 (set (reg/i:DI sp)
++                   (reg/i:DI x0)))
++      (cinsn 238 (set (reg/i:DI x1)
++                   (reg/i:DI x0)))
++
++      (cinsn 101 (set (mem/c:DI
++                        (plus:DI (reg/f:DI sp)
++                          (const_int 8))[1 S4 A32])(reg:DI x0)))
++      (cinsn 10 (parallel [
++        (set (reg:DI x29)
++          (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32]))
++        (set (reg:DI x30)
++          (mem:DI (plus:DI (reg/f:DI sp)
++            (const_int 16)) [1 S4 A32]))]))
++
++      (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x1)
++                                          (const_int -16)) [1 S4 A32])
++                      (reg:DI x0)))
++      (cinsn 11 (parallel [
++        (set (reg:DI x3)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int -16)) [1 S4 A32]))
++        (set (reg:DI x4)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int -8)) [1 S4 A32]))
++      ]))
++
++      (cinsn 103 (set (mem/c:DI (reg/f:DI x1) [1 S4 A32])
++                      (reg:DI x0)))
++      (cinsn 12 (parallel [
++        (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32]))
++        (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 8)) [1 S4 A32]))
++      ]))
++
++      (cinsn 13 (use (reg/i:DI sp)))
++      (cinsn 14 (use (reg/i:DI cc)))
++      (cinsn 15 (use (reg/i:DI x29)))
++      (cinsn 16 (use (reg/i:DI x30)))
++      (cinsn 17 (use (reg/i:DI x0)))
++      (cinsn 18 (use (reg/i:DI x3)))
++      (cinsn 19 (use (reg/i:DI x4)))
++      (cinsn 20 (use (reg/i:DI x5)))
++      (cinsn 21 (use (reg/i:DI x6)))
++      (edge-to exit (flags "FALLTHRU"))
++    ) ;; block 2
++  ) ;; insn-chain
++) ;; function "simple_ldp_after_store"
++}
++
++int __RTL (startwith ("split_complex_instructions"))
++ldp_ti_after_store ()
++{
++  (function "ldp_ti_after_store"
++    (insn-chain
++      (block 2
++      (edge-from entry (flags "FALLTHRU"))
++      (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 228 (set (reg/i:DI sp)
++                   (reg/i:DI x0)))
++      (cinsn 238 (set (reg/i:DI x2)
++                   (reg/i:DI x0)))
++
++      (cinsn 101 (set (mem/c:DI
++                        (plus:DI (reg/f:DI sp)
++                          (const_int 136))[1 S4 A32])(reg:DI x0)))
++      (insn 81 (set (reg:TI x0 [1 S4 A32])
++              (mem/c:TI (plus:DI (reg/f:DI sp)
++                      (const_int 136 )) [1 S4 A32]))
++           (expr_list:REG_EQUIV (mem/c:TI (plus:DI (reg/f:DI sfp)
++                      (const_int -24 )) [1 S4 A32])
++              (nil)))
++
++      (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x2)
++                                          (const_int -16)) [1 S4 A32])
++                      (reg:DI x0)))
++      (insn 82 (set (reg:TI x3 [1 S4 A32])
++                    (mem/c:TI (plus:DI (reg/f:DI x2)
++                                        (const_int -16)) [1 S4 A32])))
++
++      (cinsn 103 (set (mem/c:DI (reg/f:DI x2) [1 S4 A32])
++                      (reg:DI x0)))
++      (insn 83 (set (reg:TI x5 [1 S4 A32])
++                    (mem/c:TI (reg/f:DI x2) [1 S4 A32])))
++
++      (cinsn 11 (use (reg/i:DI sp)))
++      (cinsn 12 (use (reg/i:DI cc)))
++      (cinsn 13 (use (reg/i:DI x29)))
++      (cinsn 14 (use (reg/i:DI x30)))
++      (cinsn 15 (use (reg/i:DI x0)))
++      (cinsn 16 (use (reg/i:DI x3)))
++      (cinsn 17 (use (reg/i:DI x5)))
++      (cinsn 18 (use (reg/i:DI x1)))
++      (cinsn 19 (use (reg/i:DI x4)))
++      (cinsn 20 (use (reg/i:DI x6)))
++      (edge-to exit (flags "FALLTHRU"))
++    ) ;; block 2
++  ) ;; insn-chain
++) ;; function "ldp_ti_after_store"
++}
++
++int __RTL (startwith ("split_complex_instructions"))
++ldp_after_store_in_different_bb ()
++{
++(function "ldp_after_store_in_different_bb"
++  (insn-chain
++    (block 2
++      (edge-from entry (flags "FALLTHRU"))
++      (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 228 (set (reg/i:DI sp)
++                   (reg/i:DI x0)))
++      (cinsn 238 (set (reg/i:DI x1)
++                   (reg/i:DI x0)))
++
++      (cinsn 101 (set (mem/c:DI
++                        (plus:DI (reg/f:DI sp)
++                          (const_int 8))[1 S4 A32])(reg:DI x0)))
++      (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x1)
++                                          (const_int -16)) [1 S4 A32])
++                      (reg:DI x0)))
++      (cinsn 103 (set (mem/c:DI (reg/f:DI x1) [1 S4 A32])
++                      (reg:DI x0)))
++      (edge-to 3 (flags "FALLTHRU"))
++    ) ;; block 2
++    (block 3
++      (edge-from 2 (flags "FALLTHRU"))
++      (cnote 4 [bb 3] NOTE_INSN_BASIC_BLOCK)
++      (cinsn 10 (parallel [
++        (set (reg:DI x29)
++          (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32]))
++        (set (reg:DI x30)
++          (mem:DI (plus:DI (reg/f:DI sp)
++            (const_int 16)) [1 S4 A32]))]))
++      (cinsn 11 (parallel [
++        (set (reg:DI x3)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int -16)) [1 S4 A32]))
++        (set (reg:DI x4)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int -8)) [1 S4 A32]))
++      ]))
++      (cinsn 12 (parallel [
++        (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32]))
++        (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 8)) [1 S4 A32]))
++      ]))
++      (cinsn 13 (use (reg/i:DI sp)))
++      (cinsn 14 (use (reg/i:DI cc)))
++      (cinsn 15 (use (reg/i:DI x29)))
++      (cinsn 16 (use (reg/i:DI x30)))
++      (cinsn 17 (use (reg/i:DI x0)))
++      (cinsn 18 (use (reg/i:DI x3)))
++      (cinsn 19 (use (reg/i:DI x4)))
++      (cinsn 20 (use (reg/i:DI x5)))
++      (cinsn 21 (use (reg/i:DI x6)))
++      (edge-to exit (flags "FALLTHRU"))
++    ) ;; block 3
++  ) ;; insn-chain
++) ;; function "ldp_after_store_in_different_bb"
++}
++
++/* Verify that the output code doesn't contain ldp.  */
++/* { dg-final { scan-assembler-not {ldp\t} } }  */
+\ No newline at end of file
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index 1e7d4e74b..2ccecffb5 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -280,6 +280,7 @@ DEFTIMEVAR (TV_RELOAD_CSE_REGS       , "reload CSE regs")
+ DEFTIMEVAR (TV_GCSE_AFTER_RELOAD     , "load CSE after reload")
+ DEFTIMEVAR (TV_REE		     , "ree")
+ DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue")
++DEFTIMEVAR (TV_SPLIT_CMP_INS         , "split complex instructions")
+ DEFTIMEVAR (TV_IFCVT2		     , "if-conversion 2")
+ DEFTIMEVAR (TV_SPLIT_PATHS	     , "split paths")
+ DEFTIMEVAR (TV_COMBINE_STACK_ADJUST  , "combine stack adjustments")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 86f38e2f2..6daac7fc1 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -612,6 +612,7 @@ extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
+ 							     *ctxt);
+ extern rtl_opt_pass *make_pass_zero_call_used_regs (gcc::context *ctxt);
++extern rtl_opt_pass *make_pass_split_complex_instructions (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt);
+-- 
+2.33.0
+
diff --git a/0037-Extending-and-refactoring-of-pass_split_complex_inst.patch b/0037-Extending-and-refactoring-of-pass_split_complex_inst.patch
new file mode 100644
index 0000000000000000000000000000000000000000..509a534f042554b56e658fda16b1ee63a04649c8
--- /dev/null
+++ b/0037-Extending-and-refactoring-of-pass_split_complex_inst.patch
@@ -0,0 +1,1426 @@
+From a49db831320ac70ca8f46b94ee60d7c6951f65c3 Mon Sep 17 00:00:00 2001
+From: Gadzhiev Emin WX1195297 <gadzhiev.emin@huawei-partners.com>
+Date: Wed, 20 Dec 2023 21:36:07 +0300
+Subject: [PATCH 10/18] Extending and refactoring of
+ pass_split_complex_instructions
+
+- Add flag parameter in is_ldp_insn and is_stp_insn to know
+  if instruction has writeback operation
+- Add support of PRE_*, POST_* operands as a memory address
+  expression
+- Split only LDPs that intersect with a dependent store
+  instruction
+- Make the selection of dependent store instructions stricter
+  so it will be enough to check by BFS that dependent store
+  instruction appears in search range.
+- Add helper methods to retrieve fields of rtx
+- Remove redundant iterations in find_dependent_stores_candidates
+- Refactor generation of instructions
+- Add more test cases
+---
+ gcc/config/aarch64/aarch64.cc                 |  62 +-
+ gcc/doc/tm.texi                               |  12 +-
+ gcc/sched-rgn.cc                              | 771 +++++++++---------
+ gcc/target.def                                |  14 +-
+ .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c  |  35 +-
+ .../rtl/aarch64/test-ldp-split-rearrange.c    |   2 +-
+ .../gcc.dg/rtl/aarch64/test-ldp-split.c       | 181 +++-
+ 7 files changed, 603 insertions(+), 474 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 48e2eded0..fa566dd80 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -27507,39 +27507,59 @@ aarch64_run_selftests (void)
+ 
+ #endif /* #if CHECKING_P */
+ 
+-/* TODO: refuse to use ranges intead of full list of an instruction codes.  */
++/* TODO: use a full list of instruction codes instead of icode ranges.  */
+ 
+ bool
+-is_aarch64_ldp_insn (int icode)
++is_aarch64_ldp_insn (int icode, bool *has_wb)
+ {
+   if ((icode >= CODE_FOR_load_pair_sw_sisi
+-	  && icode <= CODE_FOR_load_pair_dw_tftf)
++	  && icode <= CODE_FOR_load_pair_sw_sfsf)
++      || (icode >= CODE_FOR_load_pair_dw_didi
++	  && icode <= CODE_FOR_load_pair_dw_dfdf)
++      || (icode == CODE_FOR_load_pair_dw_tftf)
+       || (icode >= CODE_FOR_loadwb_pairsi_si
+-	     && icode <= CODE_FOR_loadwb_pairtf_di)
+-      || (icode >= CODE_FOR_load_pairv8qiv8qi
+-	     && icode <= CODE_FOR_load_pairdfdf)
+-      || (icode >= CODE_FOR_load_pairv16qiv16qi
+-	     && icode <= CODE_FOR_load_pairv8bfv2df)
+-      || (icode >= CODE_FOR_load_pair_lanesv8qi
+-	     && icode <= CODE_FOR_load_pair_lanesdf))
+-    return true;
++	  && icode <= CODE_FOR_loadwb_pairdi_di)
++      || (icode >= CODE_FOR_loadwb_pairsf_si
++	  && icode <= CODE_FOR_loadwb_pairdf_di)
++      || (icode >= CODE_FOR_loadwb_pairti_si
++	  && icode <= CODE_FOR_loadwb_pairtf_di))
++    {
++      if (has_wb)
++	*has_wb = ((icode >= CODE_FOR_loadwb_pairsi_si
++		     && icode <= CODE_FOR_loadwb_pairdi_di)
++		   || (icode >= CODE_FOR_loadwb_pairsf_si
++		     && icode <= CODE_FOR_loadwb_pairdf_di)
++		   || (icode >= CODE_FOR_loadwb_pairti_si
++		      && icode <= CODE_FOR_loadwb_pairtf_di));
++      return true;
++    }
+   return false;
+ }
+ 
+ bool
+-is_aarch64_stp_insn (int icode)
++is_aarch64_stp_insn (int icode, bool *has_wb)
+ {
+   if ((icode >= CODE_FOR_store_pair_sw_sisi
+-	  && icode <= CODE_FOR_store_pair_dw_tftf)
++	  && icode <= CODE_FOR_store_pair_sw_sfsf)
++      || (icode >= CODE_FOR_store_pair_dw_didi
++	  && icode <= CODE_FOR_store_pair_dw_dfdf)
++      || (icode == CODE_FOR_store_pair_dw_tftf)
+       || (icode >= CODE_FOR_storewb_pairsi_si
+-	     && icode <= CODE_FOR_storewb_pairtf_di)
+-      || (icode >= CODE_FOR_vec_store_pairv8qiv8qi
+-	     && icode <= CODE_FOR_vec_store_pairdfdf)
+-      || (icode >= CODE_FOR_vec_store_pairv16qiv16qi
+-	     && icode <= CODE_FOR_vec_store_pairv8bfv2df)
+-      || (icode >= CODE_FOR_store_pair_lanesv8qi
+-	     && icode <= CODE_FOR_store_pair_lanesdf))
+-    return true;
++	  && icode <= CODE_FOR_storewb_pairdi_di)
++      || (icode >= CODE_FOR_storewb_pairsf_si
++	  && icode <= CODE_FOR_storewb_pairdf_di)
++      || (icode >= CODE_FOR_storewb_pairti_si
++	  && icode <= CODE_FOR_storewb_pairtf_di))
++    {
++      if (has_wb)
++	*has_wb = ((icode >= CODE_FOR_storewb_pairsi_si
++		     && icode <= CODE_FOR_storewb_pairdi_di)
++		   || (icode >= CODE_FOR_storewb_pairsf_si
++		     && icode <= CODE_FOR_storewb_pairdf_di)
++		   || (icode >= CODE_FOR_storewb_pairti_si
++		     && icode <= CODE_FOR_storewb_pairtf_di));
++      return true;
++    }
+   return false;
+ }
+ 
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index 0c6415a9c..3b6e90bf2 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -12113,12 +12113,16 @@ object files that are not referenced from @code{main} and uses export
+ lists.
+ @end defmac
+ 
+-@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode})
+-Return true if icode is corresponding to any of the LDP instruction types.
++@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}, bool *@var{has_wb})
++Return true if @var{icode} is corresponding to any of the LDP instruction
++types.  If @var{has_wb} is not NULL then its value is set to true if LDP
++contains post-index or pre-index operation.
+ @end deftypefn
+ 
+-@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode})
+-Return true if icode is corresponding to any of the STP instruction types.
++@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}, bool *@var{has_wb})
++Return true if @var{icode} is corresponding to any of the STP instruction
++types.  If @var{has_wb} is not NULL then its value is set to true if STP
++contains post-index or pre-index operation.
+ @end deftypefn
+ 
+ @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void)
+diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc
+index b4df8bdc5..5f61de1c8 100644
+--- a/gcc/sched-rgn.cc
++++ b/gcc/sched-rgn.cc
+@@ -3956,7 +3956,7 @@ make_pass_sched_fusion (gcc::context *ctxt)
+ 
+ namespace {
+ 
+-/* Def-use analisys special functions implementation.  */
++/* Def-use analysis special functions implementation.  */
+ 
+ static struct df_link *
+ get_defs (rtx_insn *insn, rtx reg)
+@@ -4032,42 +4032,66 @@ const pass_data pass_data_split_complex_instructions = {
+   (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish.  */
+ };
+ 
++/* Pass split_complex_instructions finds LOAD PAIR instructions (LDP) that can
++   be split into two LDR instructions.  It splits only those LDP for which one
++   half of the requested memory is contained in the preceding STORE (STR/STP)
++   instruction whose base register has the same definition.  This lets the
++   hardware store-to-load forwarding mechanism supply that half of the
++   requested memory from the CPU's store queue.
++
++   TODO: Add split of STP.
++   TODO: Add split of vector STP and LDP.  */
+ class pass_split_complex_instructions : public rtl_opt_pass
+ {
+ private:
+-  enum complex_instructions_t
++  enum mem_access_insn_t
+   {
+     UNDEFINED,
+     LDP,
++    /* LDP with post-index (see loadwb_pair in config/aarch64.md).  */
++    LDP_WB,
++    /* LDP that contains one destination register in RTL IR
++       (see movti_aarch64 in config/aarch64.md).  */
+     LDP_TI,
+     STP,
++    /* STP with pre-index (see storewb_pair in config/aarch64.md).  */
++    STP_WB,
++    /* STP that contains one source register in RTL IR
++       (see movti_aarch64 in config/aarch64.md).  */
++    STP_TI,
+     STR
+   };
+ 
+-  void split_complex_insn (rtx_insn *insn);
+-  void split_ldp_ti (rtx_insn *insn);
+-  void split_ldp_with_offset (rtx_insn *ldp_insn);
+-  void split_simple_ldp (rtx_insn *ldp_insn);
+-  void split_ldp_stp (rtx_insn *insn);
+-  complex_instructions_t get_insn_type (rtx_insn *insn);
+-
+-  basic_block bb;
+-  rtx_insn *insn;
+   std::set<rtx_insn *> dependent_stores_candidates;
+   std::set<rtx_insn *> ldp_to_split_list;
+ 
+-  complex_instructions_t complex_insn_type = UNDEFINED;
+-  bool is_store_insn (rtx_insn *insn);
+-  bool is_ldp_dependent_on_store (rtx_insn *ldp_insn, basic_block bb);
++  void split_ldp_ti (rtx_insn *insn);
++  void split_ldp (rtx_insn *ldp_insn);
++  /* Emit a NEW_INSNS chain, recognize instruction code of each new instruction
++     and replace OLD_INSN with the emitted sequence.  */
++  void replace_insn (rtx_insn *old_insn, rtx_insn *new_insns);
++
++  mem_access_insn_t get_insn_type (rtx_insn *insn);
++  bool is_typeof_ldp (mem_access_insn_t insn_type);
++  bool is_typeof_stp (mem_access_insn_t insn_type);
++
+   bool bfs_for_reg_dependent_store (rtx_insn *ldp_insn, basic_block search_bb,
+ 				    rtx_insn *search_insn,
+ 				    int search_range
+ 				    = param_ldp_dependency_search_range);
+   bool is_store_reg_dependent (rtx_insn *ldp_insn, rtx_insn *str_insn);
+   void init_df ();
+-  void find_dependent_stores_candidates (rtx_insn *ldp_insn);
+-  int get_insn_offset (rtx_insn *insn, complex_instructions_t insn_type,
+-		       int *arith_operation_ptr = NULL);
++  void find_dependent_stores_candidates (rtx_insn *ldp_insn,
++					 mem_access_insn_t insn_type);
++
++  rtx get_memref (rtx_insn *insn, mem_access_insn_t insn_type);
++  rtx get_base_reg (rtx memref);
++  /* Set OFFSET to the offset value.  Returns TRUE if MEMREF's address
++     expression is supported, FALSE otherwise.  */
++  bool get_offset (rtx memref, int &offset);
++  /* Return size of memory referenced by MEMREF.  Returns -1 if INSN_TYPE
++     wasn't recognized.  */
++  int get_unit_size (rtx memref, mem_access_insn_t insn_type);
+ 
+ public:
+   pass_split_complex_instructions (gcc::context *ctxt)
+@@ -4080,28 +4104,22 @@ public:
+   virtual unsigned int
+   execute (function *)
+   {
+-    enum rtx_code ldp_memref_code;
++    basic_block bb;
++    rtx_insn *insn;
++
+     init_df ();
+     ldp_to_split_list.clear ();
+     FOR_EACH_BB_FN (bb, cfun)
+       {
+ 	FOR_BB_INSNS (bb, insn)
+ 	  {
+-	    complex_instructions_t insn_type = get_insn_type (insn);
+-	    /* TODO: Add splitting of STP instructions.  */
+-	    if (insn_type != LDP && insn_type != LDP_TI)
++	    mem_access_insn_t insn_type = get_insn_type (insn);
++	    if (!is_typeof_ldp (insn_type))
+ 	      continue;
+-	    /* TODO: Currently support only ldp_ti and ldp with REG or
+-	       PLUS/MINUS offset expression.  */
+-	    if (insn_type == LDP_TI)
+-	      {
+-		ldp_memref_code = GET_CODE (XEXP (XEXP (PATTERN (insn), 1),
+-						  0));
+-		if (ldp_memref_code != REG && ldp_memref_code != PLUS
+-		    && ldp_memref_code != MINUS)
+-		  continue;
+-	      }
+-	    if (is_ldp_dependent_on_store (insn, bb))
++
++	    find_dependent_stores_candidates (insn, insn_type);
++	    if (!dependent_stores_candidates.empty ()
++	       && bfs_for_reg_dependent_store (insn, bb, insn))
+ 	      {
+ 		ldp_to_split_list.insert (insn);
+ 	      }
+@@ -4110,18 +4128,107 @@ public:
+ 
+     for (std::set<rtx_insn *>::iterator i = ldp_to_split_list.begin ();
+ 	 i != ldp_to_split_list.end (); ++i)
+-      split_complex_insn (*i);
++      split_ldp (*i);
+ 
+     return 0;
+   }
+ }; // class pass_split_complex_instructions
+ 
+ bool
+-pass_split_complex_instructions::is_ldp_dependent_on_store (rtx_insn *ldp_insn,
+-							    basic_block bb)
++pass_split_complex_instructions::is_typeof_ldp (
++    mem_access_insn_t insn_type)
+ {
+-  find_dependent_stores_candidates (ldp_insn);
+-  return bfs_for_reg_dependent_store (ldp_insn, bb, ldp_insn);
++  return (insn_type == LDP || insn_type == LDP_WB || insn_type == LDP_TI);
++}
++
++bool
++pass_split_complex_instructions::is_typeof_stp (
++    mem_access_insn_t insn_type)
++{
++  return (insn_type == STP || insn_type == STP_WB || insn_type == STP_TI);
++}
++
++rtx
++pass_split_complex_instructions::get_memref (
++    rtx_insn *insn, mem_access_insn_t insn_type)
++{
++  rtx insn_pat = PATTERN (insn);
++  rtx memref = NULL;
++
++  switch (insn_type)
++    {
++      case LDP:
++	memref = SET_SRC (XVECEXP (insn_pat, 0, 0));
++	break;
++      case LDP_WB:
++	memref = SET_SRC (XVECEXP (insn_pat, 0, 1));
++	break;
++      case LDP_TI:
++	memref = SET_SRC (insn_pat);
++	break;
++      case STP:
++	memref = SET_DEST (XVECEXP (insn_pat, 0, 0));
++	break;
++      case STP_WB:
++	memref = SET_DEST (XVECEXP (insn_pat, 0, 1));
++	break;
++      case STP_TI:
++      case STR:
++	memref = SET_DEST (insn_pat);
++	break;
++      default:
++	break;
++    }
++
++  if (memref && !MEM_P (memref))
++    return NULL;
++  return memref;
++}
++
++rtx
++pass_split_complex_instructions::get_base_reg (rtx memref)
++{
++  if (!memref || !MEM_P (memref))
++    return NULL;
++  rtx addr_exp = XEXP (memref, 0);
++
++  switch (GET_CODE (addr_exp))
++    {
++      case REG:
++	return addr_exp;
++      case PLUS:
++      case PRE_DEC:
++      case PRE_INC:
++      case POST_DEC:
++      case POST_INC:
++	if (REG_P (XEXP (addr_exp, 0)))
++	  return XEXP (addr_exp, 0);
++      default:
++	return NULL;
++    }
++}
++
++int
++pass_split_complex_instructions::get_unit_size (
++    rtx memref, mem_access_insn_t insn_type)
++{
++  if (!memref)
++    return -1;
++
++  switch (insn_type)
++    {
++      case LDP:
++      case STP:
++      case LDP_WB:
++      case STP_WB:
++      case STR:
++	return GET_MODE_SIZE (GET_MODE (memref)).to_constant ();
++      case LDP_TI:
++      case STP_TI:
++	return GET_MODE_SIZE (E_DImode).to_constant ();
++      default:
++	return -1;
++    }
+ }
+ 
+ bool
+@@ -4135,9 +4242,9 @@ pass_split_complex_instructions::bfs_for_reg_dependent_store (
+     {
+       if (!current_search_insn)
+ 	return false;
+-      bool checking_result
+-	  = is_store_reg_dependent (ldp_insn, current_search_insn);
+-      if (checking_result)
++
++      if (dependent_stores_candidates.find (current_search_insn)
++	  != dependent_stores_candidates.end ())
+ 	{
+ 	  if (dump_file)
+ 	    {
+@@ -4185,30 +4292,29 @@ pass_split_complex_instructions::init_df ()
+ 
+ void
+ pass_split_complex_instructions::find_dependent_stores_candidates (
+-    rtx_insn *ldp_insn)
++    rtx_insn *ldp_insn, mem_access_insn_t insn_type)
+ {
+   dependent_stores_candidates.clear ();
+-  df_ref use;
+ 
+-  FOR_EACH_INSN_USE (use, ldp_insn)
+-    {
+-      df_link *defs = get_defs (ldp_insn, DF_REF_REG (use));
+-      if (!defs)
+-	return;
++  rtx base_reg = get_base_reg (get_memref (ldp_insn, insn_type));
++  if (!base_reg)
++    return;
+ 
+-      for (df_link *def = defs; def; def = def->next)
+-	{
+-	  df_link *uses
+-	      = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref));
+-	  if (!uses)
+-	    continue;
++  df_link *defs = get_defs (ldp_insn, base_reg);
++  if (!defs)
++    return;
+ 
+-	  for (df_link *use = uses; use; use = use->next)
+-	    {
+-	      if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR
+-		  && is_store_insn (DF_REF_INSN (use->ref)))
+-		dependent_stores_candidates.insert (DF_REF_INSN (use->ref));
+-	    }
++  for (df_link *def = defs; def; def = def->next)
++    {
++      df_link *uses = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref));
++      if (!uses)
++	continue;
++      for (df_link *use = uses; use; use = use->next)
++	{
++	  if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR
++	      && DF_REF_INSN (use->ref) != ldp_insn
++	      && is_store_reg_dependent (ldp_insn, DF_REF_INSN (use->ref)))
++	    dependent_stores_candidates.insert (DF_REF_INSN (use->ref));
+ 	}
+     }
+ }
+@@ -4217,423 +4323,274 @@ bool
+ pass_split_complex_instructions::is_store_reg_dependent (rtx_insn *ldp_insn,
+ 							 rtx_insn *str_insn)
+ {
+-  if (!is_store_insn (str_insn)
+-      || dependent_stores_candidates.find (str_insn)
+-	     == dependent_stores_candidates.end ())
++  if (!str_insn)
+     return false;
+ 
+-  int ldp_offset_sign = UNDEFINED;
+-  int ldp_offset
+-      = get_insn_offset (ldp_insn, get_insn_type (ldp_insn), &ldp_offset_sign);
+-  if (ldp_offset_sign == MINUS)
+-    ldp_offset = -ldp_offset;
++  mem_access_insn_t st_type = get_insn_type (str_insn);
++  if (!is_typeof_stp (st_type) && st_type != STR)
++    return false;
+ 
+-  int str_offset_sign = UNDEFINED;
+-  int str_offset = get_insn_offset (str_insn, STR, &str_offset_sign);
+-  if (str_offset_sign == MINUS)
+-    str_offset = -str_offset;
++  mem_access_insn_t ld_type = get_insn_type (ldp_insn);
++  rtx ld_memref = get_memref (ldp_insn, ld_type);
++  rtx st_memref = get_memref (str_insn, st_type);
++  rtx ld_base_reg = get_base_reg (ld_memref);
++  rtx st_base_reg =  get_base_reg (st_memref);
+ 
+-  if (str_offset == ldp_offset || str_offset == ldp_offset + 8)
+-    return true;
++  if (!ld_base_reg || !st_base_reg
++      || REGNO (ld_base_reg) != REGNO (st_base_reg))
++    return false;
+ 
+-  return false;
+-}
++  int ld_offset = 0;
++  int st_offset = 0;
++  if (get_offset (ld_memref, ld_offset)
++      && get_offset (st_memref, st_offset))
++    {
++      int ld_unit_size = get_unit_size (ld_memref, ld_type);
++      int st_size = get_unit_size (st_memref, st_type);
++      if (st_type != STR)
++	st_size *= 2;
+ 
+-bool
+-pass_split_complex_instructions::is_store_insn (rtx_insn *insn)
+-{
+-  if (!insn)
+-    return false;
+-  rtx sset_b = single_set (insn);
+-  /* TODO: The condition below allow to take only store instructions in which
+-     the memory location's operand is either a register (base) or an plus/minus
+-     operation (base + #imm). So it might make sense to add support for other
+-     cases (e.g. multiply and shift).  */
+-  if (sset_b && MEM_P (SET_DEST (sset_b))
+-      && GET_MODE (XEXP (sset_b, 0)) != BLKmode
+-      && (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == REG
+-	  || (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == PLUS
+-	      || GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == MINUS)
+-	  && (GET_CODE (XEXP (XEXP (XEXP (sset_b, 0), 0), 1)) == CONST_INT)))
+-    return true;
++      if (ld_unit_size < 0 || st_size < 0)
++	return false;
++
++      bool st_has_low_ld_part = (ld_offset >= st_offset
++	&& (ld_offset + ld_unit_size <= st_offset + st_size));
++      bool st_has_high_ld_part = ((ld_offset + ld_unit_size >= st_offset)
++	&& (ld_offset + 2 * ld_unit_size <= st_offset + st_size));
++      bool st_has_not_full_ld = (ld_offset < st_offset
++	|| (ld_offset + 2 * ld_unit_size > st_offset + st_size));
++
++      if ((st_has_low_ld_part || st_has_high_ld_part) && st_has_not_full_ld)
++	return true;
++    }
+ 
+   return false;
+ }
+ 
+-int
+-pass_split_complex_instructions::get_insn_offset (
+-    rtx_insn *insn, complex_instructions_t insn_type, int *arith_operation_ptr)
++bool
++pass_split_complex_instructions::get_offset (rtx memref, int &offset)
+ {
+-  rtx insn_pat = PATTERN (insn);
+-  int returned_offset = 0;
++  rtx addr_exp = XEXP (memref, 0);
+ 
+-  rtx offset_expr = NULL;
+-  rtx offset_value_expr = NULL;
+-
+-  switch (insn_type)
++  switch (GET_CODE (addr_exp))
+     {
+-    case LDP:
+-      {
+-	int number_of_sub_insns = XVECLEN (insn_pat, 0);
+-
+-	/* Calculate it's own ofsset of first load insn.  */
+-	rtx_insn *first_load_insn = NULL;
+-	if (number_of_sub_insns == 2)
++      case REG:
++      case POST_DEC:
++      case POST_INC:
++	offset = 0;
++	return true;
++      case PRE_DEC:
++	offset = -(GET_MODE_SIZE (GET_MODE (memref)).to_constant ());
++	return true;
++      case PRE_INC:
++	offset = GET_MODE_SIZE (GET_MODE (memref)).to_constant ();
++	return true;
++      case PLUS:
++	if (CONST_INT_P (XEXP (addr_exp, 1)))
+ 	  {
+-	    first_load_insn
+-		= make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0)));
+-	    arith_operation_ptr = NULL;
+-
+-	    offset_expr = XEXP (XEXP (PATTERN (first_load_insn), 1), 0);
+-	    if (GET_CODE (offset_expr) == PLUS
+-		|| GET_CODE (offset_expr) == MINUS)
+-	      offset_value_expr
+-		  = XEXP (XEXP (XEXP (PATTERN (first_load_insn), 1), 0), 1);
+-	    else
+-	      offset_expr = NULL;
++	    offset = INTVAL (XEXP (addr_exp, 1));
++	    return true;
+ 	  }
+-	else if (number_of_sub_insns == 3)
+-	  {
+-	    rtx_insn *offset_sub_insn
+-		= make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0)));
+-
+-	    offset_expr = XEXP (PATTERN (offset_sub_insn), 1);
+-	    offset_value_expr = XEXP (XEXP (PATTERN (offset_sub_insn), 1), 1);
+-	  }
+-	else
+-	  {
+-	    gcc_assert (false
+-			&& "Wrong number of elements in the ldp_insn vector");
+-	  }
+-	break;
+-      }
+-    case LDP_TI:
+-      {
+-	offset_expr = XEXP (XEXP (insn_pat, 1), 0);
+-	if (GET_CODE (offset_expr) != PLUS && GET_CODE (offset_expr) != MINUS)
+-	  return 0;
+-	offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 1), 0), 1);
+-	break;
+-      }
+-    case STR:
+-      {
+-	offset_expr = XEXP (XEXP (insn_pat, 0), 0);
+-	/* If memory location is specified by single base register then the
+-	   offset is zero.  */
+-	if (GET_CODE (offset_expr) == REG)
+-	  return 0;
+-	offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 0), 0), 1);
+-	break;
+-      }
+-    default:
+-      {
+-	if (dumps_are_enabled && dump_file)
+-	  {
+-	    fprintf (dump_file, "Instruction that was tried to split:\n");
+-	    print_rtl_single (dump_file, insn);
+-	  }
+-	gcc_assert (false && "Unsupported instruction type");
+-	break;
+-      }
+-    }
+-
+-  if (offset_expr != NULL && offset_value_expr
+-      && GET_CODE (offset_value_expr) == CONST_INT)
+-    returned_offset = XINT (offset_value_expr, 0);
+-
+-  if (arith_operation_ptr != NULL)
+-    {
+-      *arith_operation_ptr = GET_CODE (offset_expr);
+-      gcc_assert ((*arith_operation_ptr == MINUS
+-		   || *arith_operation_ptr == PLUS)
+-		  && "Unexpected arithmetic operation in the offset expr");
++      default:
++	return false;
+     }
+-
+-  return returned_offset;
+ }
+ 
+ void
+-pass_split_complex_instructions::split_simple_ldp (rtx_insn *ldp_insn)
++pass_split_complex_instructions::replace_insn (rtx_insn *old_insn,
++					       rtx_insn *new_insns)
+ {
+-  rtx pat = PATTERN (ldp_insn);
+-
+-  rtx_insn *mem_insn_1 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 0)));
+-  rtx_insn *mem_insn_2 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 1)));
+-
+-  int dest_regno = REGNO (SET_DEST (PATTERN (mem_insn_1)));
+-  int src_regno;
+-
+-  rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (mem_insn_1)), 0);
+-
+-  if (GET_CODE (srs_reg_insn) == REG)
+-    src_regno = REGNO (srs_reg_insn);
+-  else
+-    src_regno = REGNO (XEXP (srs_reg_insn, 0));
+-
+-  rtx_insn *emited_insn_1, *emited_insn_2;
++  rtx_insn *prev_insn = PREV_INSN (old_insn);
++  start_sequence ();
+ 
+-  /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first.  */
+-  if (src_regno == dest_regno)
+-    std::swap (mem_insn_1, mem_insn_2);
++  emit_insn (new_insns);
++  if (dump_file)
++    {
++      fprintf (dump_file, "Split LDP:\n");
++      print_rtl_single (dump_file, old_insn);
++      fprintf (dump_file, "Split into:\n");
++    }
+ 
+-  emited_insn_1 = emit_insn (PATTERN (mem_insn_1));
+-  emited_insn_2 = emit_insn (PATTERN (mem_insn_2));
++  for (rtx_insn *insn = new_insns; insn; insn = NEXT_INSN (insn))
++    {
++	INSN_CODE (insn) = recog (PATTERN (insn), insn, NULL);
++	if (dump_file)
++	  {
++	    print_rtl_single (dump_file, insn);
++	  }
++    }
+ 
+-  int sub_insn_1_code = recog (PATTERN (mem_insn_1), mem_insn_1, 0);
+-  int sub_insn_2_code = recog (PATTERN (mem_insn_2), mem_insn_2, 0);
++  rtx_insn *seq = get_insns ();
++  unshare_all_rtl_in_chain (seq);
++  end_sequence ();
+ 
+-  INSN_CODE (emited_insn_1) = sub_insn_1_code;
+-  INSN_CODE (emited_insn_2) = sub_insn_2_code;
++  emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (old_insn));
++  delete_insn_and_edges (old_insn);
+ }
+ 
+ void
+-pass_split_complex_instructions::split_ldp_with_offset (rtx_insn *ldp_insn)
++pass_split_complex_instructions::split_ldp (rtx_insn *ldp_insn)
+ {
+   rtx pat = PATTERN (ldp_insn);
+-  bool post_index = true;
+-
+-  rtx_insn offset_insn;
+-  rtx_insn mem_insn_1;
+-  rtx_insn mem_insn_2;
++  mem_access_insn_t insn_type = get_insn_type (ldp_insn);
++  gcc_assert (is_typeof_ldp (insn_type));
+ 
+-  int offset_insn_code;
+-  int mem_insn_1_code = -1;
+-  int mem_insn_2_code = -1;
++  rtx load_rtx_1 = NULL;
++  rtx load_rtx_2 = NULL;
++  rtx post_index_rtx = NULL;
+ 
+-  int offset = 0;
+-  int arith_operation = UNDEFINED;
+-
+-  for (int i = 0; i < 3; i++)
++  switch (insn_type)
+     {
+-      rtx sub_insn = XVECEXP (pat, 0, i);
+-      rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn));
+-      int sub_insn_code
+-	  = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0);
+-
+-      /* If sub_insn is offset related.  */
+-      if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY)
+-	{
+-	  offset_insn = *copy_of_sub_insn;
+-	  offset_insn_code = sub_insn_code;
+-	  gcc_assert (i == 0
+-		      && "Offset related insn must be the first "
+-			 "element of a parallel insn vector");
+-
+-	  offset = get_insn_offset (ldp_insn, LDP, &arith_operation);
+-	}
+-      else
+-	{
+-	  if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG)
+-	    {
+-	      rtx &offset_expr
+-		  = XEXP (XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1);
+-	      if (GET_CODE (offset_expr) == CONST_INT)
+-		{
+-		  int local_offset = XINT (offset_expr, 0);
+-		  offset = (arith_operation == PLUS ? offset : -offset);
+-
+-		  offset_expr = GEN_INT (local_offset + offset);
+-
+-		  gcc_assert (
+-		      (arith_operation == MINUS || arith_operation == PLUS)
+-		      && "Unexpected arithmetic operation in offset related "
+-			 "sub_insn");
+-
+-		  if (i == 1)
+-		    post_index = false;
+-		}
+-	      else
+-		{
+-		  post_index = true;
+-		}
+-	    }
+-	}
+-      if (i == 1)
+-	{
+-	  mem_insn_1 = *copy_of_sub_insn;
+-	  mem_insn_1_code = sub_insn_code;
+-	}
+-      if (i == 2)
+-	{
+-	  mem_insn_2 = *copy_of_sub_insn;
+-	  mem_insn_2_code = sub_insn_code;
+-	}
++      case LDP:
++	load_rtx_1 = copy_rtx (XVECEXP (pat, 0, 0));
++	load_rtx_2 = copy_rtx (XVECEXP (pat, 0, 1));
++	break;
++      case LDP_WB:
++	post_index_rtx = copy_rtx (XVECEXP (pat, 0, 0));
++	load_rtx_1 = copy_rtx (XVECEXP (pat, 0, 1));
++	load_rtx_2 = copy_rtx (XVECEXP (pat, 0, 2));
++	break;
++      case LDP_TI:
++	split_ldp_ti (ldp_insn);
++	return;
++      default:
++	return;
+     }
+-  gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1
+-	      && "Uninitialized memory insns");
+ 
+-  int dest_regno = REGNO (SET_DEST (PATTERN (&mem_insn_1)));
+-  int src_regno;
+-
+-  rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (&mem_insn_1)), 0);
+-
+-  if (GET_CODE (srs_reg_insn) == REG)
+-    src_regno = REGNO (srs_reg_insn);
+-  else
+-    src_regno = REGNO (XEXP (srs_reg_insn, 0));
++  int dest_regno = REGNO (SET_DEST (load_rtx_1));
++  int base_regno = REGNO (get_base_reg (get_memref (ldp_insn, insn_type)));
+ 
+-  /* Don't split such weird LDP.  */
+-  if (src_regno == dest_regno)
+-    return;
+-
+-  rtx_insn *emited_offset_insn;
+-  if (!post_index)
++  /* In cases like ldp r1,r2,[r1[, #imm]] emit ldr r2,[r1[, #imm]] first.
++     For LDP with post-index don't split such instruction.  */
++  if (base_regno == dest_regno)
+     {
+-      emited_offset_insn = emit_insn (PATTERN (&offset_insn));
+-      INSN_CODE (emited_offset_insn) = offset_insn_code;
++      if (insn_type == LDP)
++	std::swap (load_rtx_1, load_rtx_2);
++      else
++	return;
+     }
+ 
+-  rtx_insn *emited_insn_1 = emit_insn (PATTERN (&mem_insn_1));
+-  rtx_insn *emited_insn_2 = emit_insn (PATTERN (&mem_insn_2));
+-
+-
+-  INSN_CODE (emited_insn_1) = mem_insn_1_code;
+-  INSN_CODE (emited_insn_2) = mem_insn_2_code;
+-
+-  if (post_index)
++  /* Construct the instruction chain for subsequent emitting.  */
++  rtx_insn *insn_seq = make_insn_raw (load_rtx_1);
++  rtx_insn *load_insn_2 = make_insn_raw (load_rtx_2);
++  SET_NEXT_INSN (insn_seq) = load_insn_2;
++  SET_NEXT_INSN (load_insn_2) = NULL;
++  if (post_index_rtx)
+     {
+-      emited_offset_insn = emit_insn (PATTERN (&offset_insn));
+-      INSN_CODE (emited_offset_insn) = offset_insn_code;
++      rtx_insn *post_index_insn = make_insn_raw (post_index_rtx);
++      SET_NEXT_INSN (load_insn_2) = post_index_insn;
++      SET_NEXT_INSN (post_index_insn) = NULL;
+     }
+-}
+-
+-void
+-pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn)
+-{
+-  rtx_insn *prev_insn = PREV_INSN (insn);
+-  int number_of_sub_insns = XVECLEN (PATTERN (insn), 0);
+-
+-  start_sequence ();
+ 
+-  if (number_of_sub_insns == 2)
+-    split_simple_ldp (insn);
+-  else if (number_of_sub_insns == 3)
+-    split_ldp_with_offset (insn);
+-  else
+-    gcc_assert (false && "Broken complex insn vector");
+-
+-  rtx_insn *seq = get_insns ();
+-  unshare_all_rtl_in_chain (seq);
+-  end_sequence ();
+-
+-  emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn));
+-  delete_insn_and_edges (insn);
++  replace_insn (ldp_insn, insn_seq);
+ }
+ 
+ void
+ pass_split_complex_instructions::split_ldp_ti (rtx_insn *insn)
+ {
+-  rtx_insn *prev_insn = PREV_INSN (insn);
+-  rtx_insn *load_insn_1 = make_insn_raw (copy_rtx (PATTERN (insn)));
+-  rtx_insn *load_insn_2 = make_insn_raw (copy_rtx (PATTERN (insn)));
+-
+-  rtx reg_insn_1 = XEXP (PATTERN (load_insn_1), 0);
+-  rtx mem_insn_1 = XEXP (PATTERN (load_insn_1), 1);
+-  rtx mem_insn_2 = XEXP (PATTERN (load_insn_2), 1);
+-
+-  PUT_MODE (mem_insn_1, DImode);
+-  PUT_MODE (mem_insn_2, DImode);
+-
+-  int reg_no_1 = REGNO (reg_insn_1);
++  rtx pat = PATTERN (insn);
++  rtx memref = get_memref (insn, LDP_TI);
++  int unit_size = get_unit_size (memref, LDP_TI);
++  rtx base_reg = get_base_reg (memref);
++  rtx dest_reg = SET_DEST (pat);
++
++  rtx reg_index_rtx = NULL;
++  rtx load_rtx_1 = NULL;
++  rtx load_rtx_2 = NULL;
++  bool post_index = false;
++  int offset = 0;
+ 
+-  XEXP (PATTERN (load_insn_1), 0) = gen_rtx_REG (DImode, reg_no_1);
+-  XEXP (PATTERN (load_insn_2), 0) = gen_rtx_REG (DImode, reg_no_1 + 1);
++  rtx load_1_memref = gen_rtx_MEM (DImode, base_reg);
+ 
+-  rtx load_insn_2_plus_expr = XEXP (XEXP (PATTERN (load_insn_2), 1), 0);
+-  if (GET_CODE (load_insn_2_plus_expr) == REG)
++  rtx addr_expr = XEXP (memref, 0);
++  if (GET_CODE (addr_expr) == PLUS)
+     {
+-	XEXP (XEXP (PATTERN (load_insn_2), 1), 0)
+-	  = gen_rtx_PLUS (DImode,
+-			  gen_rtx_REG (DImode, REGNO (load_insn_2_plus_expr)),
+-			  GEN_INT (GET_MODE_SIZE (DImode)));
++      offset = INTVAL (XEXP (addr_expr, 1));
++      XEXP (load_1_memref, 0) = gen_rtx_PLUS (DImode, base_reg,
++					      GEN_INT (offset));
+     }
+-  else
+-    {
+-      rtx load_insn_2_offset_expr
+-      = XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1);
+ 
+-      if (load_insn_2_offset_expr == NULL)
+-	return;
+-
+-      if (GET_CODE (load_insn_2_offset_expr) == CONST_INT)
+-	{
+-	  int load_insn_2_offset = XINT (load_insn_2_offset_expr, 0);
+-	  XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1)
+-	    = GEN_INT (load_insn_2_offset + GET_MODE_SIZE (DImode));
+-	}
+-    }
+-
+-  start_sequence ();
++  rtx load_2_memref = gen_rtx_MEM (DImode,
++    gen_rtx_PLUS (DImode, base_reg, GEN_INT (offset + unit_size)));
+ 
+-  int src_regno;
+-  rtx srs_reg_insn = XEXP (XEXP (PATTERN (load_insn_1), 1), 0);
++  load_rtx_1 = gen_rtx_SET (gen_rtx_REG (DImode, REGNO (dest_reg)),
++			    load_1_memref);
++  load_rtx_2 = gen_rtx_SET (gen_rtx_REG (DImode, REGNO (dest_reg) + 1),
++			    load_2_memref);
+ 
+-  if (GET_CODE (srs_reg_insn) == REG)
+-    src_regno = REGNO (srs_reg_insn);
+-  else
+-    src_regno = REGNO (XEXP (srs_reg_insn, 0));
++  if (GET_CODE (addr_expr) == PRE_INC || GET_CODE (addr_expr) == PRE_DEC
++      || GET_CODE (addr_expr) == POST_INC || GET_CODE (addr_expr) == POST_DEC)
++    {
++      /* The amount of increment or decrement is equal to size of
++	 machine-mode of the containing MEMREF (see rtl.def).  */
++      int index_offset = GET_MODE_SIZE (GET_MODE (memref)).to_constant ();
+ 
+-  /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first.  */
+-  if (src_regno == reg_no_1)
+-    std::swap (load_insn_1, load_insn_2);
++      if (GET_CODE (addr_expr) == PRE_DEC || GET_CODE (addr_expr) == POST_DEC)
++	index_offset = -index_offset;
+ 
+-  rtx_insn *emited_load_insn_1 = emit_insn (PATTERN (load_insn_1));
+-  rtx_insn *emited_load_insn_2 = emit_insn (PATTERN (load_insn_2));
++      if (GET_CODE (addr_expr) == POST_INC || GET_CODE (addr_expr) == POST_DEC)
++	post_index = true;
+ 
+-  INSN_CODE (emited_load_insn_1)
+-      = recog (PATTERN (emited_load_insn_1), emited_load_insn_1, 0);
+-  INSN_CODE (emited_load_insn_2)
+-      = recog (PATTERN (emited_load_insn_2), emited_load_insn_2, 0);
++      reg_index_rtx = gen_rtx_SET (base_reg,
++				   gen_rtx_PLUS (DImode, base_reg,
++						 GEN_INT (index_offset)));
++    }
+ 
+-  rtx_insn *seq = get_insns ();
+-  unshare_all_rtl_in_chain (seq);
+-  end_sequence ();
++  /* In cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first.  */
++  if (REGNO (base_reg) == REGNO (dest_reg))
++    std::swap (load_rtx_1, load_rtx_2);
+ 
+-  emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn));
+-  delete_insn_and_edges (insn);
+-}
++  /* Construct the instruction chain for subsequent emitting.  */
++  rtx_insn *insn_seq = make_insn_raw (load_rtx_1);
++  rtx_insn *load_insn_2 = make_insn_raw (load_rtx_2);
++  SET_NEXT_INSN (insn_seq) = load_insn_2;
++  SET_NEXT_INSN (load_insn_2) = NULL;
++  if (post_index && reg_index_rtx)
++    {
++      rtx_insn *post_index_insn = make_insn_raw (reg_index_rtx);
++      SET_NEXT_INSN (load_insn_2) = post_index_insn;
++      SET_NEXT_INSN (post_index_insn) = NULL;
++    }
++  else if (!post_index && reg_index_rtx)
++    {
++      rtx_insn *pre_index = make_insn_raw (reg_index_rtx);
++      SET_NEXT_INSN (pre_index) = insn_seq;
++      insn_seq = pre_index;
++    }
+ 
+-void
+-pass_split_complex_instructions::split_complex_insn (rtx_insn *insn)
+-{
+-  complex_instructions_t insn_type = get_insn_type (insn);
+-  /* TODO: Add splitting of STP instructions.  */
+-  if (insn_type == LDP || insn_type == STP)
+-    split_ldp_stp (insn);
+-  else if (insn_type == LDP_TI)
+-    split_ldp_ti (insn);
+-  else
+-    gcc_assert (false && "Unsupported type of insn to split");
++  replace_insn (insn, insn_seq);
+ }
+ 
+-pass_split_complex_instructions::complex_instructions_t
++pass_split_complex_instructions::mem_access_insn_t
+ pass_split_complex_instructions::get_insn_type (rtx_insn *insn)
+ {
+   if (!INSN_P (insn))
+     return UNDEFINED;
+ 
+-  rtx pat = PATTERN (insn);
+-  int icode = recog (PATTERN (insn), insn, NULL);
++  int icode = INSN_CODE (insn);
++  if (icode == -1)
++    icode = recog (PATTERN (insn), insn, 0);
++  bool has_wb = false;
++
++  if (targetm.is_ldp_insn (icode, &has_wb))
++    return (has_wb ? LDP_WB : LDP);
+ 
+-  if (GET_CODE (pat) == PARALLEL)
++  if (targetm.is_stp_insn (icode, &has_wb))
++    return (has_wb ? STP_WB : STP);
++
++  rtx set_insn = single_set (insn);
++  if (set_insn && (GET_MODE (SET_SRC (set_insn)) == E_TImode
++      || GET_MODE (SET_DEST (set_insn)) == E_TImode))
+     {
+-      if (targetm.is_ldp_insn (icode))
+-	{
+-	  return LDP;
+-	}
+-      if (targetm.is_stp_insn (icode))
+-	{
+-	  return STP;
+-	}
+-      else
+-	{
+-	  return UNDEFINED;
+-	}
++      if (MEM_P (SET_SRC (set_insn)) && REG_P (SET_DEST (set_insn)))
++	return LDP_TI;
++      if (MEM_P (SET_DEST (set_insn)) && REG_P (SET_SRC (set_insn)))
++	return STP_TI;
+     }
+-  rtx set_insn = single_set (insn);
+-  if (set_insn && GET_CODE (XEXP (set_insn, 1)) == MEM
+-      && GET_MODE (XEXP (set_insn, 1)) == E_TImode)
+-    return LDP_TI;
++
++  if (set_insn && MEM_P (SET_DEST (set_insn)) && REG_P (SET_SRC (set_insn))
++      && GET_MODE (SET_DEST (set_insn)) != BLKmode)
++    return STR;
+ 
+   return UNDEFINED;
+ }
+diff --git a/gcc/target.def b/gcc/target.def
+index a3a50b474..8797a21d5 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2679,13 +2679,19 @@ modes and they have different conditional execution capability, such as ARM.",
+ 
+ DEFHOOK
+ (is_ldp_insn,
+-  "Return true if icode is corresponding to any of the LDP instruction types.",
+-  bool, (int icode), NULL)
++ "Return true if @var{icode} is corresponding to any of the LDP instruction\n\
++types.  If @var{has_wb} is not NULL then its value is set to true if LDP\n\
++contains post-index or pre-index operation.",
++  bool, (int icode, bool *has_wb),
++  NULL)
+ 
+ DEFHOOK
+ (is_stp_insn,
+-  "Return true if icode is corresponding to any of the STP instruction types.",
+-  bool, (int icode), NULL)
++ "Return true if @var{icode} is corresponding to any of the STP instruction\n\
++types.  If @var{has_wb} is not NULL then its value is set to true if STP\n\
++contains post-index or pre-index operation.",
++  bool, (int icode, bool *has_wb),
++  NULL)
+ 
+ DEFHOOK
+ (gen_ccmp_first,
+diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c
+index 3918d43f6..2d42231dc 100644
+--- a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c
++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile { target aarch64-*-* } } */
+-/* { dg-additional-options "-fsplit-ldp-stp" } */
++/* { dg-additional-options "-O1 -fsplit-ldp-stp" } */
+ /*
+  *    Tests are:
+  *          Patterns where LDP insns should NOT be split
+@@ -15,6 +15,9 @@ simple_ldp_after_store ()
+       (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
+       (cinsn 228 (set (reg/i:DI sp) 
+                    (reg/i:DI x0)))
++      (cinsn 238 (set (reg/i:DI x1)
++                   (reg/i:DI x0)))
++
+       (cinsn 101 (set (mem/c:DI
+                         (plus:DI (reg/f:DI sp)
+                           (const_int 32))[1 S4 A32])(reg:DI x0)))
+@@ -24,11 +27,27 @@ simple_ldp_after_store ()
+         (set (reg:DI x30)
+           (mem:DI (plus:DI (reg/f:DI sp)
+             (const_int 16)) [1 S4 A32]))]))
+-      (cinsn 11 (use (reg/i:DI sp)))
+-      (cinsn 12 (use (reg/i:DI cc)))
+-      (cinsn 13 (use (reg/i:DI x29)))
+-      (cinsn 14 (use (reg/i:DI x30)))
+-      (cinsn 15 (use (reg/i:DI x0)))
++      (cinsn 11 (use (reg/i:DI x29)))
++      (cinsn 12 (use (reg/i:DI x30)))
++
++      /* stp x0, x2, [x1].  */
++      (cinsn 102 (parallel [
++        (set (mem:DI (reg/f:DI x1) [1 S4 A32])
++             (reg:DI x0))
++        (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 8)) [1 S4 A32])
++             (reg:DI x2))]))
++      /* ldp x5, x6, [x1].  */
++      (cinsn 13 (parallel [
++        (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32]))
++        (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 8)) [1 S4 A32]))
++      ]))
++      (cinsn 14 (use (reg/i:DI x5)))
++      (cinsn 15 (use (reg/i:DI x6)))
++
++      (cinsn 100 (use (reg/i:DI sp)))
++      (cinsn 200 (use (reg/i:DI cc)))
++      (cinsn 300 (use (reg/i:DI x0)))
+       (edge-to exit (flags "FALLTHRU"))
+     ) ;; block 2
+   ) ;; insn-chain
+@@ -70,5 +89,5 @@ ldp_after_store_in_different_bb ()
+ ) ;; function "ldp_after_store_in_different_bb"
+ }
+ 
+-/* Verify that the output code contains exactly 2 ldp.  */
+-/* { dg-final { scan-assembler-times {ldp\t} 2 } }  */
+\ No newline at end of file
++/* Verify that the output code contains exactly 3 ldp.  */
++/* { dg-final { scan-assembler-times {ldp\t} 3 } }  */
+\ No newline at end of file
+diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c
+index 653c30f83..59ff82df9 100644
+--- a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c
++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile { target aarch64-*-* } } */
+-/* { dg-additional-options "-fsplit-ldp-stp" } */
++/* { dg-additional-options "-O1 -fsplit-ldp-stp" } */
+ /*
+  *    Test is:
+  *        Pattern where LDP insns should be split with rearrangement in order
+diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c
+index dc9f26efb..e25762160 100644
+--- a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c
++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c
+@@ -13,48 +13,131 @@ simple_ldp_after_store ()
+     (block 2
+       (edge-from entry (flags "FALLTHRU"))
+       (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      /* mov sp, x0.  */
+       (cinsn 228 (set (reg/i:DI sp)
+-                   (reg/i:DI x0)))
++                      (reg/i:DI x0)))
++      /* mov x1, x0.  */
+       (cinsn 238 (set (reg/i:DI x1)
+-                   (reg/i:DI x0)))
++                      (reg/i:DI x0)))
+ 
++      /* str x0, [sp, 8].  */
+       (cinsn 101 (set (mem/c:DI
+                         (plus:DI (reg/f:DI sp)
+                           (const_int 8))[1 S4 A32])(reg:DI x0)))
++      /* ldp x29, x30, [sp, 8].  */
+       (cinsn 10 (parallel [
+         (set (reg:DI x29)
+           (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32]))
+         (set (reg:DI x30)
+           (mem:DI (plus:DI (reg/f:DI sp)
+             (const_int 16)) [1 S4 A32]))]))
++      (cinsn 11 (use (reg/i:DI x29)))
++      (cinsn 12 (use (reg/i:DI x30)))
+ 
++      /* str x0, [x1, -16].  */
+       (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x1)
+                                           (const_int -16)) [1 S4 A32])
+                       (reg:DI x0)))
+-      (cinsn 11 (parallel [
++      /* ldp x3, x4, [x1, -16].  */
++      (cinsn 13 (parallel [
+         (set (reg:DI x3)
+           (mem:DI (plus:DI (reg/f:DI x1) (const_int -16)) [1 S4 A32]))
+         (set (reg:DI x4)
+           (mem:DI (plus:DI (reg/f:DI x1) (const_int -8)) [1 S4 A32]))
+       ]))
++      (cinsn 14 (use (reg/i:DI x3)))
++      (cinsn 15 (use (reg/i:DI x4)))
+ 
++      /* str x0, [x1].  */
+       (cinsn 103 (set (mem/c:DI (reg/f:DI x1) [1 S4 A32])
+                       (reg:DI x0)))
+-      (cinsn 12 (parallel [
++      /* ldp x5, x6, [x1].  */
++      (cinsn 16 (parallel [
+         (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32]))
+         (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1)
+                                           (const_int 8)) [1 S4 A32]))
+       ]))
++      (cinsn 17 (use (reg/i:DI x5)))
++      (cinsn 18 (use (reg/i:DI x6)))
+ 
+-      (cinsn 13 (use (reg/i:DI sp)))
+-      (cinsn 14 (use (reg/i:DI cc)))
+-      (cinsn 15 (use (reg/i:DI x29)))
+-      (cinsn 16 (use (reg/i:DI x30)))
+-      (cinsn 17 (use (reg/i:DI x0)))
+-      (cinsn 18 (use (reg/i:DI x3)))
+-      (cinsn 19 (use (reg/i:DI x4)))
+-      (cinsn 20 (use (reg/i:DI x5)))
+-      (cinsn 21 (use (reg/i:DI x6)))
++      /* ldp x29, x30, [sp], 96.  */
++      (cinsn 19 (parallel [
++        (set (reg/f:DI sp)
++          (plus:DI (reg/f:DI sp) (const_int 96)))
++        (set (reg:DI x29)
++          (mem:DI (reg/f:DI sp) [1 S4 A32]))
++        (set (reg:DI x30)
++          (mem:DI (plus:DI (reg/f:DI sp)
++            (const_int 8)) [1 S4 A32]))]))
++      (cinsn 20 (use (reg/i:DI x29)))
++      (cinsn 21 (use (reg/i:DI x30)))
++
++      /* stp x0, x2, [x1, 128].  */
++      (cinsn 104 (parallel [
++        (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 128)) [1 S4 A32])
++             (reg:DI x0))
++        (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 136)) [1 S4 A32])
++             (reg:DI x2))]))
++      /* ldp x29, x30, [x1, 120].  */
++      (cinsn 22 (parallel [
++        (set (reg:DI x29)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int 120)) [1 S4 A32]))
++        (set (reg:DI x30)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int 128)) [1 S4 A32]))]))
++      (cinsn 23 (use (reg/i:DI x29)))
++      (cinsn 24 (use (reg/i:DI x30)))
++
++      /* stp x0, x2, [x1, 128].  */
++      (cinsn 105 (parallel [
++        (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 128)) [1 S4 A32])
++             (reg:DI x0))
++        (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 136)) [1 S4 A32])
++             (reg:DI x2))]))
++      /* ldp x3, x4, [x1, 136].  */
++      (cinsn 25 (parallel [
++        (set (reg:DI x3)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int 136)) [1 S4 A32]))
++        (set (reg:DI x4)
++          (mem:DI (plus:DI (reg/f:DI x1) (const_int 144)) [1 S4 A32]))
++      ]))
++      (cinsn 26 (use (reg/i:DI x3)))
++      (cinsn 27 (use (reg/i:DI x4)))
++
++      /* stp w0, w2, [x1, 32].  */
++      (cinsn 106 (parallel [
++        (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 32)) [1 S4 A32])
++             (reg:SI x0))
++        (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 36)) [1 S4 A32])
++             (reg:SI x2))]))
++      /* ldp x5, x6, [x1, 32].  */
++      (cinsn 28 (parallel [
++        (set (reg:DI x5) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 32)) [1 S4 A32]))
++        (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 40)) [1 S4 A32]))
++      ]))
++      (cinsn 29 (use (reg/i:DI x5)))
++      (cinsn 30 (use (reg/i:DI x6)))
++
++      /* stp w0, w2, [x1, 40].  */
++      (cinsn 107 (parallel [
++        (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 40)) [1 S4 A32])
++             (reg:SI x0))
++        (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 44)) [1 S4 A32])
++             (reg:SI x2))]))
++      /* ldp x5, x6, [x1, 32].  */
++      (cinsn 31 (parallel [
++        (set (reg:DI x5) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 32)) [1 S4 A32]))
++        (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1)
++                                          (const_int 40)) [1 S4 A32]))
++      ]))
++      (cinsn 32 (use (reg/i:DI x5)))
++      (cinsn 33 (use (reg/i:DI x6)))
++
++      (cinsn 100 (use (reg/i:DI sp)))
++      (cinsn 200 (use (reg/i:DI cc)))
++      (cinsn 400 (use (reg/i:DI x0)))
+       (edge-to exit (flags "FALLTHRU"))
+     ) ;; block 2
+   ) ;; insn-chain
+@@ -69,43 +152,83 @@ ldp_ti_after_store ()
+       (block 2
+       (edge-from entry (flags "FALLTHRU"))
+       (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK)
++      /* mov sp, x0.  */
+       (cinsn 228 (set (reg/i:DI sp)
+-                   (reg/i:DI x0)))
++                      (reg/i:DI x0)))
++      /* mov x2, x0.  */
+       (cinsn 238 (set (reg/i:DI x2)
+-                   (reg/i:DI x0)))
+-
++                      (reg/i:DI x0)))
++      /* str x0, [sp, 136].  */
+       (cinsn 101 (set (mem/c:DI
+                         (plus:DI (reg/f:DI sp)
+                           (const_int 136))[1 S4 A32])(reg:DI x0)))
+-      (insn 81 (set (reg:TI x0 [1 S4 A32])
++      /* ldp x0, x1, [sp, 136].  */
++      (cinsn 81 (set (reg:TI x0 [1 S4 A32])
+               (mem/c:TI (plus:DI (reg/f:DI sp)
+-                      (const_int 136 )) [1 S4 A32]))
+-           (expr_list:REG_EQUIV (mem/c:TI (plus:DI (reg/f:DI sfp)
+-                      (const_int -24 )) [1 S4 A32])
+-              (nil)))
+-
++                      (const_int 136)) [1 S4 A32])))
++      /* str x0, [x2, -16].  */
+       (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x2)
+-                                          (const_int -16)) [1 S4 A32])
++                                         (const_int -16)) [1 S4 A32])
+                       (reg:DI x0)))
+-      (insn 82 (set (reg:TI x3 [1 S4 A32])
++      /* ldp x3, x4, [x2, -16].  */
++      (cinsn 82 (set (reg:TI x3 [1 S4 A32])
+                     (mem/c:TI (plus:DI (reg/f:DI x2)
+-                                        (const_int -16)) [1 S4 A32])))
+-
++                                       (const_int -16)) [1 S4 A32])))
++      /* str x0, [x2].  */
+       (cinsn 103 (set (mem/c:DI (reg/f:DI x2) [1 S4 A32])
+                       (reg:DI x0)))
+-      (insn 83 (set (reg:TI x5 [1 S4 A32])
++      /* ldp x5, x6, [x2].  */
++      (cinsn 83 (set (reg:TI x5 [1 S4 A32])
+                     (mem/c:TI (reg/f:DI x2) [1 S4 A32])))
+ 
++      /* stp x0, x1, [sp, -8].  */
++      (cinsn 104 (set (mem:TI (plus:DI (reg/v/f:DI sp)
++                                       (const_int -8)) [1 S4 A32])
++                      (reg:TI x0)))
++      /* ldp x5, x6, [sp], -16.  */
++      (cinsn 84 (set (reg/v:TI x5 [1 S4 A32])
++                    (mem:TI (post_dec:DI (reg/v/f:DI sp)) [1 S4 A32])))
++      (cinsn 85 (use (reg/i:DI x5)))
++      (cinsn 86 (use (reg/i:DI x6)))
++
++      /* stp x0, x1, [sp, 8].  */
++      (cinsn 105 (set (mem:TI (plus:DI (reg/v/f:DI sp)
++                                       (const_int 8)) [1 S4 A32])
++                      (reg:TI x0)))
++      /* ldp x5, x6, [sp], -16.  */
++      (cinsn 87 (set (reg/v:TI x5 [1 S4 A32])
++                    (mem:TI (post_dec:DI (reg/v/f:DI sp)) [1 S4 A32])))
++      (cinsn 88 (use (reg/i:DI x5)))
++      (cinsn 89 (use (reg/i:DI x6)))
++
++      /* Intersects with insn 102.  */
++      /* ldp x2, x3, [x2, -16]!.  */
++      (cinsn 90 (set (reg/v:TI x2 [1 S4 A32])
++                    (mem:TI (pre_dec:DI (reg/v/f:DI x2)) [1 S4 A32])))
++      (cinsn 91 (use (reg/i:DI x2)))
++      (cinsn 92 (use (reg/i:DI x3)))
++
++      /* mov x2, x0.  */
++      (cinsn 248 (set (reg/i:DI x2)
++                      (reg/i:DI x0)))
++      /* str x0, [x2, 16].  */
++      (cinsn 106 (set (mem:DI (plus:DI (reg/v/f:DI x2)
++                                       (const_int 16)) [1 S4 A32])
++                      (reg:DI x0)))
++      /* ldp x3, x4, [x2, 16]!.  */
++      (cinsn 93 (set (reg/v:TI x3 [1 S4 A32])
++                    (mem:TI (pre_inc:DI (reg/v/f:DI x2)) [1 S4 A32])))
++      (cinsn 94 (use (reg/i:DI x3)))
++      (cinsn 95 (use (reg/i:DI x4)))
++
+       (cinsn 11 (use (reg/i:DI sp)))
+       (cinsn 12 (use (reg/i:DI cc)))
+       (cinsn 13 (use (reg/i:DI x29)))
+       (cinsn 14 (use (reg/i:DI x30)))
+       (cinsn 15 (use (reg/i:DI x0)))
+       (cinsn 16 (use (reg/i:DI x3)))
+-      (cinsn 17 (use (reg/i:DI x5)))
+       (cinsn 18 (use (reg/i:DI x1)))
+       (cinsn 19 (use (reg/i:DI x4)))
+-      (cinsn 20 (use (reg/i:DI x6)))
+       (edge-to exit (flags "FALLTHRU"))
+     ) ;; block 2
+   ) ;; insn-chain
+-- 
+2.33.0
+
diff --git a/0038-Port-maxmin-patch-to-GCC-12.patch b/0038-Port-maxmin-patch-to-GCC-12.patch
new file mode 100644
index 0000000000000000000000000000000000000000..2423c12ca70fdc1daa4d4d6cc9000e336238c541
--- /dev/null
+++ b/0038-Port-maxmin-patch-to-GCC-12.patch
@@ -0,0 +1,378 @@
+From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Thu, 22 Feb 2024 17:07:24 +0800
+Subject: [PATCH 12/18] Port maxmin patch to GCC 12
+
+---
+ gcc/config/aarch64/aarch64-simd.md    | 256 ++++++++++++++++++++++++++
+ gcc/config/aarch64/predicates.md      |  19 ++
+ gcc/testsuite/gcc.dg/combine-maxmin.c |  46 +++++
+ 3 files changed, 321 insertions(+)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index 82f73805f..de92802f5 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -1138,6 +1138,82 @@
+   [(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
+ )
+ 
++;; Simplify an extension followed by a truncation for the shift+neg operation.
++
++(define_insn_and_split "*aarch64_sshr_neg_v8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (match_operand:V8HI 1 "register_operand")
++		    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
++	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (match_dup 1)
++		    (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
++	      (match_dup 2)))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(ashiftrt:V8HI
++	  (neg:V8HI
++	    (match_operand:V8HI 1 "register_operand" "w"))
++	  (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
++  {
++    /* Reduce the shift amount to smaller mode.  */
++    int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
++	      - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
++    operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
++  }
++  [(set_attr "type" "multiple")]
++)
++
++;; A helper definition that allows the combiner to use the previous pattern.
++
++(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(match_operand:V4SI 1 "register_operand" "w"))
++	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(match_operand:V4SI 3 "register_operand" "w"))
++	      (match_dup 2)))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V4SI 1 "register_operand" "=w")
++	(ashiftrt:V4SI
++	  (neg:V4SI
++	    (match_dup 1))
++	  (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++   (set (match_operand:V4SI 3 "register_operand" "=w")
++	(ashiftrt:V4SI
++	  (neg:V4SI
++	    (match_dup 3))
++	  (match_dup 2)))
++   (set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (match_dup 1))
++	  (truncate:V4HI
++	    (match_dup 3))))]
++  ""
++  [(set_attr "type" "multiple")]
++)
++
+ (define_insn "*aarch64_simd_sra<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+ 	(plus:VDQ_I
+@@ -1714,6 +1790,26 @@
+  }
+ )
+ 
++(define_insn "vec_pack_trunc_shifted_<mode>"
++ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
++       (vec_concat:<VNARROWQ2>
++	 (truncate:<VNARROWQ>
++	   (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
++	      (match_operand:VQN 2 "half_size_operand" "w")))
++	 (truncate:<VNARROWQ>
++	   (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
++	      (match_operand:VQN 4 "half_size_operand" "w")))))]
++ "TARGET_SIMD"
++ {
++   if (BYTES_BIG_ENDIAN)
++     return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
++   else
++     return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
++ }
++  [(set_attr "type" "neon_permute<q>")
++   (set_attr "length" "4")]
++)
++
+ (define_insn "aarch64_shrn<mode>_insn_le"
+   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+ 	(vec_concat:<VNARROWQ2>
+@@ -6652,6 +6748,166 @@
+   [(set_attr "type" "neon_tst<q>")]
+ )
+ 
++;; Simplify an extension followed by a truncation for cmtst-like operations.
++
++(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (plus:V4HI
++	    (truncate:V4HI
++	      (eq:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (and:V8HI
++		      (match_operand:V8HI 1 "register_operand")
++		      (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++		    (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
++		(match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
++	    (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
++	  (plus:V4HI
++	    (truncate:V4HI
++	      (eq:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (and:V8HI
++		      (match_dup 1)
++		      (match_dup 2))
++		    (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
++		(match_dup 4)))
++	    (match_dup 5))))]
++  "TARGET_SIMD && !reload_completed"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 6 "register_operand" "=w")
++	(match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++   (set (match_operand:V8HI 0 "register_operand" "=w")
++	(plus:V8HI
++	  (eq:V8HI
++	    (and:V8HI
++	      (match_operand:V8HI 1 "register_operand" "w")
++	      (match_dup 6))
++	    (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
++	  (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
++  {
++    if (can_create_pseudo_p ())
++      {
++	int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
++	operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
++	int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
++	operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);
++
++	operands[6] = gen_reg_rtx (V8HImode);
++      }
++    else
++      FAIL;
++  }
++  [(set_attr "type" "neon_tst_q")]
++)
++
++;; Three helper definitions that allow the combiner to use the previous pattern.
++
++(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
++  [(set (match_operand:V4SI 0 "register_operand" "=w")
++	(neg:V4SI
++	  (eq:V4SI
++	    (sign_extend:V4SI
++	      (vec_select:V4HI
++		(and:V8HI
++		  (match_operand:V8HI 1 "register_operand")
++		  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++		(match_operand:V8HI 3 "vect_par_cnst_lo_half")))
++	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  "TARGET_SIMD && !reload_completed"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 5 "register_operand" "=w")
++	(and:V8HI
++	  (match_operand:V8HI 1 "register_operand")
++	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
++   (set (match_operand:V4SI 0 "register_operand" "=w")
++	(sign_extend:V4SI
++	  (vec_select:V4HI
++	    (match_dup 5)
++	    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
++   (set (match_dup 0)
++	(neg:V4SI
++	  (eq:V4SI
++	    (match_dup 0)
++	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  {
++    if (can_create_pseudo_p ())
++      operands[5] = gen_reg_rtx (V8HImode);
++    else
++      FAIL;
++  }
++  [(set_attr "type" "multiple")]
++)
++
++(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
++  [(set (match_operand:V4SI 0 "register_operand" "=w")
++	  (neg:V4SI
++	    (eq:V4SI
++	      (sign_extend:V4SI
++		(vec_select:V4HI
++		  (and:V8HI
++		    (match_operand:V8HI 1 "register_operand")
++		    (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++		  (match_operand:V8HI 3 "vect_par_cnst_hi_half")))
++	      (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  "TARGET_SIMD && !reload_completed"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 5 "register_operand" "=w")
++	(and:V8HI
++	  (match_operand:V8HI 1 "register_operand")
++	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
++   (set (match_operand:V4SI 0 "register_operand" "=w")
++	(sign_extend:V4SI
++	  (vec_select:V4HI
++	    (match_dup 5)
++	    (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
++   (set (match_dup 0)
++	  (neg:V4SI
++	    (eq:V4SI
++	      (match_dup 0)
++	      (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  {
++    if (can_create_pseudo_p ())
++      operands[5] = gen_reg_rtx (V8HImode);
++    else
++      FAIL;
++  }
++  [(set_attr "type" "multiple")]
++)
++
++(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (not:V4SI
++	      (match_operand:V4SI 1 "register_operand" "w")))
++	  (truncate:V4HI
++	    (not:V4SI
++	      (match_operand:V4SI 2 "register_operand" "w")))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V4SI 1 "register_operand" "=w")
++	(not:V4SI
++	  (match_dup 1)))
++   (set (match_operand:V4SI 2 "register_operand" "=w")
++	(not:V4SI
++	  (match_dup 2)))
++   (set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (match_dup 1))
++	  (truncate:V4HI
++	    (match_dup 2))))]
++  ""
++  [(set_attr "type" "multiple")]
++)
++
+ (define_insn_and_split "aarch64_cmtstdi"
+   [(set (match_operand:DI 0 "register_operand" "=w,r")
+ 	(neg:DI
+diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
+index 07c14aacb..1b8496c07 100644
+--- a/gcc/config/aarch64/predicates.md
++++ b/gcc/config/aarch64/predicates.md
+@@ -118,6 +118,25 @@
+ 	     (match_test "aarch64_simd_valid_immediate (op, NULL,
+ 							AARCH64_CHECK_ORR)"))))
+ 
++(define_predicate "aarch64_bic_imm_for_maxmin"
++   (match_code "const_vector")
++{
++  if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
++    return false;
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
++  return CONST_INT_P (op)
++	 && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
++})
++
++(define_predicate "maxmin_arith_shift_operand"
++   (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
++  return CONST_INT_P (op) && (UINTVAL (op) == size);
++})
++
+ (define_predicate "aarch64_reg_or_bic_imm"
+    (ior (match_operand 0 "register_operand")
+ 	(and (match_code "const_vector")
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
+new file mode 100755
+index 000000000..06bce7029
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
+@@ -0,0 +1,46 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -fdump-rtl-combine-all" } */
++
++/* The test checks usage of smax/smin insns for clip evaluation and
++ * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
++ * sources of x264 codec.  */
++
++typedef unsigned char uint8_t;
++typedef long int intptr_t;
++typedef signed short int int16_t;
++
++static __attribute__((always_inline)) inline uint8_t clip (int x )
++{
++    return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
++}
++
++void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
++	 intptr_t stride, int width, int height, int16_t *buf)
++{
++    const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
++    for( int y = 0; y < height; y++ ) {
++        for( int x = -2; x < width+3; x++ ) {
++            int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
++		     + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
++            dstv[x] = clip ( (v + 16) >> 5 );
++            buf[x+2] = v + pad;
++        }
++        for( int x = 0; x < width; x++ )
++            dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
++			      + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
++			     - 32*pad + 512) >> 10);
++        for( int x = 0; x < width; x++ )
++            dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
++			      + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
++			     + 16) >> 5);
++        dsth += stride;
++        dstv += stride;
++        dstc += stride;
++        src += stride;
++    }
++}
++
++/* { dg-final { scan-assembler-times {smax\t} 4 } }  */
++/* { dg-final { scan-assembler-times {smin\t} 4 } }  */
++/* { dg-final { scan-assembler-times {cmtst\t} 2 } }  */
++/* { dg-final { scan-assembler-times {uzp1\t} 6 } }  */
+-- 
+2.33.0
+
diff --git a/0039-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch b/0039-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
new file mode 100644
index 0000000000000000000000000000000000000000..a5a786f6fb6cc05c72ed413ea09775d4279f20a7
--- /dev/null
+++ b/0039-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
@@ -0,0 +1,239 @@
+From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
+Date: Thu, 22 Feb 2024 17:13:27 +0800
+Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12
+
+---
+ gcc/common.opt                          |   4 +
+ gcc/match.pd                            | 104 ++++++++++++++++++++++++
+ gcc/testsuite/gcc.dg/combine-maxmin-1.c |  15 ++++
+ gcc/testsuite/gcc.dg/combine-maxmin-2.c |  14 ++++
+ gcc/testsuite/gcc.dg/combine-maxmin.c   |  19 +++--
+ 5 files changed, 151 insertions(+), 5 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 6c6fabb31..3a5004271 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1846,6 +1846,10 @@ fif-conversion-gimple
+ Common Var(flag_if_conversion_gimple) Optimization
+ Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+ 
++fconvert-minmax
++Common Var(flag_convert_minmax) Optimization
++Convert saturating clipping to min max.
++
+ fstack-reuse=
+ Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
+ -fstack-reuse=[all|named_vars|none]	Set stack reuse level for local variables.
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 61866cb90..3a19e93b3 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -8031,3 +8031,107 @@ and,
+    (plus:c@4 (op2:c @0 @1)
+     (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
+      (if (single_use (@4) && single_use (@5)))))
++
++/* MinMax pattern matching helpers.  More info on the transformation below.  */
++
++/* Match (a & 0b11..100..0) pattern.  */
++(match (minmax_cmp_arg @0 @1)
++ (bit_and @0 INTEGER_CST@1)
++ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
++
++/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
++   This statement is blocking for the transformation of unsigned integers.
++   Do type check here to avoid unnecessary duplications.  */
++(match (minmax_sat_arg @0)
++ (rshift (negate @0) INTEGER_CST@1)
++ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
++      && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
++
++/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
++   The matched pattern can be described as saturated clipping.
++
++   The pattern supports truncation via both casts and bit_and.
++   Also there are patterns for possible inverted conditions.  */
++(if (flag_convert_minmax)
++/* Truncation via casts.  Unfortunately convert? cannot be applied here
++   because convert and cond take different number of arguments.  */
++ (simplify
++  (convert
++   (cond
++    (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++    (convert? (minmax_sat_arg @0))
++    (convert? @0)))
++  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++ (simplify
++  (cond
++   (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++   (convert? (minmax_sat_arg @0))
++   (convert? @0))
++  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++
++ (simplify
++  (convert
++   (cond
++    (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++    (convert? @0)
++    (convert? (minmax_sat_arg @0))))
++  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++ (simplify
++  (cond
++   (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++   (convert? @0)
++   (convert? (minmax_sat_arg @0)))
++  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++
++ /* Truncation via bit_and with mask.  Same concerns on convert? here.  */
++ (simplify
++  (convert
++   (cond
++    (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++    (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
++    (convert? @0)))
++  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++ (simplify
++  (cond
++   (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++   (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
++   (convert? @0))
++  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++
++ (simplify
++  (convert
++   (cond
++    (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++    (convert? @0)
++    (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
++  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; })))))
++ (simplify
++  (cond
++   (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++   (convert? @0)
++   (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
++  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++    (convert (min (max @0 { integer_zero_node; })
++		  { mask; }))))))
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
+new file mode 100644
+index 000000000..859ff7df8
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
+@@ -0,0 +1,15 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -fconvert-minmax" } */
++
++#include <inttypes.h>
++
++__attribute__((noinline))
++void test (int32_t *restrict a, int32_t *restrict x)
++{
++  for (int i = 0; i < 4; i++)
++    a[i] = ((((-x[i]) >> 31) ^ x[i])
++            & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
++}
++
++/* { dg-final { scan-assembler-not {smax\t} } }  */
++/* { dg-final { scan-assembler-not {smin\t} } }  */
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
+new file mode 100644
+index 000000000..63d4d85b3
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -fconvert-minmax" } */
++
++#include <inttypes.h>
++
++__attribute__((noinline))
++void test (int8_t *restrict a, int32_t *restrict x)
++{
++  for (int i = 0; i < 8; i++)
++    a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
++}
++
++/* { dg-final { scan-assembler-times {smax\t} 4 } }  */
++/* { dg-final { scan-assembler-times {smin\t} 4 } }  */
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
+index 06bce7029..a984fa560 100755
+--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile { target aarch64-*-* } } */
+-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
++/* { dg-options "-O3 -fconvert-minmax" } */
+ 
+ /* The test checks usage of smax/smin insns for clip evaluation and
+  * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
+@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ {
+     const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+     for( int y = 0; y < height; y++ ) {
++        /* This loop is not being vectorized now.  */
+         for( int x = -2; x < width+3; x++ ) {
+             int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ 		     + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+             dstv[x] = clip ( (v + 16) >> 5 );
+             buf[x+2] = v + pad;
+         }
++
++        /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN.  */
+         for( int x = 0; x < width; x++ )
+             dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ 			      + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+ 			     - 32*pad + 512) >> 10);
++
++        /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN.  */
+         for( int x = 0; x < width; x++ )
+             dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ 			      + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ 			     + 16) >> 5);
++
+         dsth += stride;
+         dstv += stride;
+         dstc += stride;
+@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+     }
+ }
+ 
+-/* { dg-final { scan-assembler-times {smax\t} 4 } }  */
+-/* { dg-final { scan-assembler-times {smin\t} 4 } }  */
+-/* { dg-final { scan-assembler-times {cmtst\t} 2 } }  */
+-/* { dg-final { scan-assembler-times {uzp1\t} 6 } }  */
++/* Max is performed on 0 from signed values, match smax exactly.  */
++/* { dg-final { scan-assembler-times {smax\t} 6 } }  */
++/* Min is performed on signed val>0 and a mask, min sign doesn't matter.  */
++/* { dg-final { scan-assembler-times {[us]min\t} 6 } }  */
++/* All of the vectorized patterns are expected to be matched.  */
++/* { dg-final { scan-assembler-not {cmtst\t} } }  */
++/* { dg-final { scan-assembler-times {uzp1\t} 5 } }  */
+-- 
+2.33.0
+
diff --git a/0040-Add-new-pattern-to-pass-the-maxmin-tests.patch b/0040-Add-new-pattern-to-pass-the-maxmin-tests.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9ceba88090b58a20e5d3c4d2d6c70327cfbd9f47
--- /dev/null
+++ b/0040-Add-new-pattern-to-pass-the-maxmin-tests.patch
@@ -0,0 +1,65 @@
+From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Thu, 22 Feb 2024 17:20:17 +0800
+Subject: [PATCH 14/18] Add new pattern to pass the maxmin tests
+
+---
+ gcc/match.pd                          | 24 ++++++++++++++++++++++++
+ gcc/testsuite/gcc.dg/combine-maxmin.c |  2 +-
+ 2 files changed, 25 insertions(+), 1 deletion(-)
+
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 3a19e93b3..aee58e47b 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -8038,6 +8038,10 @@ and,
+ (match (minmax_cmp_arg @0 @1)
+  (bit_and @0 INTEGER_CST@1)
+  (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
++/* Match ((unsigned) a > 0b0..01..1) pattern.  */
++(match (minmax_cmp_arg1 @0 @1)
++ (gt @0 INTEGER_CST@1)
++ (if (wi::popcount (wi::to_widest (@1) + 1) == 1)))
+ 
+ /* Match (inversed_sign_bit >> sign_bit_pos) pattern.
+    This statement is blocking for the transformation of unsigned integers.
+@@ -8095,6 +8099,26 @@ and,
+     (convert (min (max @0 { integer_zero_node; })
+ 		  { mask; })))))
+ 
++ (simplify
++  (convert
++   (cond
++    (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
++    (convert? (minmax_sat_arg @0))
++    (convert? @0)))
++  (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++   (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
++    (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
++		  { mask; })))))
++ (simplify
++  (cond
++   (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
++   (convert? (minmax_sat_arg @0))
++   (convert? @0))
++  (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++   (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
++    (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
++		  { mask; })))))
++
+  /* Truncation via bit_and with mask.  Same concerns on convert? here.  */
+  (simplify
+   (convert
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
+index a984fa560..5c0c9cc49 100755
+--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
+@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ /* { dg-final { scan-assembler-times {[us]min\t} 6 } }  */
+ /* All of the vectorized patterns are expected to be matched.  */
+ /* { dg-final { scan-assembler-not {cmtst\t} } }  */
+-/* { dg-final { scan-assembler-times {uzp1\t} 5 } }  */
++/* { dg-final { scan-assembler-times {uzp1\t} 2 } }  */
+-- 
+2.33.0
+
diff --git a/0041-AES-Implement-AES-pattern-matching.patch b/0041-AES-Implement-AES-pattern-matching.patch
new file mode 100644
index 0000000000000000000000000000000000000000..cd983bf2c5ff86e3037b9f0963e6bdc3c9b77fc0
--- /dev/null
+++ b/0041-AES-Implement-AES-pattern-matching.patch
@@ -0,0 +1,3968 @@
+From 53d321d2fe08f69a29527be157d4bcaaefea04ab Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
+Date: Wed, 6 Dec 2023 10:46:28 +0300
+Subject: [PATCH 15/18] [AES] Implement AES pattern matching
+
+---
+ gcc/Makefile.in                               |    1 +
+ gcc/common.opt                                |    4 +
+ gcc/config/aarch64/aarch64.cc                 |   24 +
+ gcc/crypto-accel.cc                           | 2415 +++++++++++++++++
+ gcc/doc/tm.texi                               |   29 +
+ gcc/doc/tm.texi.in                            |   12 +
+ gcc/passes.def                                |    1 +
+ gcc/rtl-matcher.h                             |  367 +++
+ gcc/target.def                                |   41 +
+ .../gcc.target/aarch64/aes-decrypt.c          |  478 ++++
+ .../gcc.target/aarch64/aes-encrypt.c          |  443 +++
+ gcc/timevar.def                               |    1 +
+ gcc/tree-pass.h                               |    1 +
+ 13 files changed, 3817 insertions(+)
+ create mode 100644 gcc/crypto-accel.cc
+ create mode 100644 gcc/rtl-matcher.h
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-decrypt.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-encrypt.c
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 45705c1f3..876000bda 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1332,6 +1332,7 @@ OBJS = \
+ 	cgraphunit.o \
+ 	cgraphclones.o \
+ 	combine.o \
++	crypto-accel.o \
+ 	combine-stack-adj.o \
+ 	compare-elim.o \
+ 	context.o \
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 3a5004271..1eb62ada5 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1129,6 +1129,10 @@ Common Var(flag_array_widen_compare) Optimization
+ Extends types for pointers to arrays to improve array comparsion performance.
+ In some extreme situations this may result in unsafe behavior.
+ 
++fcrypto-accel-aes
++Common Var(flag_crypto_accel_aes) Init(0) Optimization
++Perform crypto acceleration AES pattern matching.
++
+ fauto-inc-dec
+ Common Var(flag_auto_inc_dec) Init(1) Optimization
+ Generate auto-inc/dec instructions.
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index fa566dd80..9171d9d56 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -27569,6 +27569,30 @@ is_aarch64_stp_insn (int icode, bool *has_wb)
+ #undef TARGET_IS_STP_INSN
+ #define TARGET_IS_STP_INSN is_aarch64_stp_insn
+ 
++machine_mode
++aarch64_get_v16qi_mode ()
++{
++  return V16QImode;
++}
++
++#undef TARGET_GET_V16QI_MODE
++#define TARGET_GET_V16QI_MODE aarch64_get_v16qi_mode
++
++#undef TARGET_GEN_REV32V16QI
++#define TARGET_GEN_REV32V16QI gen_aarch64_rev32v16qi
++
++#undef TARGET_GEN_AESEV16QI
++#define TARGET_GEN_AESEV16QI gen_aarch64_crypto_aesev16qi
++
++#undef TARGET_GEN_AESDV16QI
++#define TARGET_GEN_AESDV16QI gen_aarch64_crypto_aesdv16qi
++
++#undef TARGET_GEN_AESMCV16QI
++#define TARGET_GEN_AESMCV16QI gen_aarch64_crypto_aesmcv16qi
++
++#undef TARGET_GEN_AESIMCV16QI
++#define TARGET_GEN_AESIMCV16QI gen_aarch64_crypto_aesimcv16qi
++
+ #undef TARGET_STACK_PROTECT_GUARD
+ #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
+ 
+diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc
+new file mode 100644
+index 000000000..f4e810a6b
+--- /dev/null
++++ b/gcc/crypto-accel.cc
+@@ -0,0 +1,2415 @@
++/* Crypto-pattern optimizer.
++   Copyright (C) 2003-2023 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free
++Software Foundation; either version 3, or (at your option) any later
++version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT ANY
++WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3.  If not see
++<http://www.gnu.org/licenses/>.  */
++
++#include "config.h"
++#define INCLUDE_VECTOR
++#define INCLUDE_MAP
++#define INCLUDE_SET
++#define INCLUDE_ALGORITHM
++#include "system.h"
++#include "coretypes.h"
++#include "backend.h"
++#include "target.h"
++#include "rtl.h"
++#include "tree.h"
++#include "df.h"
++#include "memmodel.h"
++#include "optabs.h"
++#include "regs.h"
++#include "emit-rtl.h"
++#include "recog.h"
++#include "cfgrtl.h"
++#include "cfgcleanup.h"
++#include "expr.h"
++#include "tree-pass.h"
++#include "rtl-matcher.h"
++
++/* Basic AES table description.  */
++struct aes_table
++{
++  /* Number of elements per table.  */
++  static const unsigned int table_nelts = 256;
++  /* Number of tables.  */
++  static const unsigned int basic_tables_num = 4;
++  /* Number of rounds.  */
++  static const unsigned int rounds_num = 4;
++  /* Common ID for wrong table.  */
++  static const unsigned int BAD_TABLE = -1;
++
++  typedef const unsigned int table_type[table_nelts];
++  typedef table_type *table_map[basic_tables_num];
++
++  /* True if CTOR is a CONSTRUCTOR whose table_nelts values match ETHALON.  */
++  template<typename T>
++  static bool is_basic_table (tree ctor, const T ethalon[table_nelts])
++    {
++      if (TREE_CODE (ctor) != CONSTRUCTOR
++	  || CONSTRUCTOR_NELTS (ctor) != table_nelts)
++	return false;
++      unsigned ix;
++      tree val;
++      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), ix, val)
++	if (TREE_INT_CST_LOW (val) != ethalon[ix])
++	  return false;
++      return true;
++    }
++
++  /* Return the index in TABLES of the table matching CTOR, or BAD_TABLE.  */
++  static unsigned check_table (tree ctor, table_map tables)
++    {
++      for (unsigned i = 0; i < basic_tables_num; ++i)
++	if (is_basic_table (ctor, *tables[i]))
++	  return i;
++      return BAD_TABLE;
++    }
++};
++
++/* AES encryption info.  */
++struct aes_encrypt_table : aes_table
++{
++  typedef enum
++  {
++    TE0,
++    TE1,
++    TE2,
++    TE3,
++    BAD_TABLE = aes_table::BAD_TABLE
++  } table_entry;
++
++  static table_type Te0;
++  static table_type Te1;
++  static table_type Te2;
++  static table_type Te3;
++
++  static table_map tables;
++  static table_entry rounds[rounds_num];
++  static table_entry final_rounds[rounds_num];
++
++  static table_entry get_table_id (tree ctor)
++    {
++      return static_cast<table_entry> (check_table (ctor, tables));
++    }
++};
++
++/* AES decryption info.  */
++struct aes_decrypt_table : aes_table
++{
++  typedef enum
++  {
++    TD0,
++    TD1,
++    TD2,
++    TD3,
++    TD4,
++    BAD_TABLE = aes_table::BAD_TABLE
++  } table_entry;
++
++  static table_type Td0;
++  static table_type Td1;
++  static table_type Td2;
++  static table_type Td3;
++
++  static table_map tables;
++  static table_entry rounds[rounds_num];
++  static table_entry final_rounds[rounds_num];
++
++  static const unsigned char Td4[table_nelts];
++
++  /* TD4 requires special handler due to type shrinking optimizations.  */
++  static bool is_td4 (tree ctor)
++    {
++      if (is_basic_table (ctor, Td4))
++	return true;
++
++      if (TREE_CODE (ctor) != STRING_CST
++	  || TREE_STRING_LENGTH (ctor) != table_nelts)
++	return false;
++
++      const unsigned char *p
++	= (const unsigned char *) TREE_STRING_POINTER (ctor);
++      for (int i = 0; i < TREE_STRING_LENGTH (ctor); ++i)
++	if (p[i] != Td4[i])
++	  return false;
++
++      return true;
++    }
++
++  static table_entry get_table_id (tree ctor)
++    {
++      unsigned int res = check_table (ctor, tables);
++      if (res == aes_table::BAD_TABLE
++	  && is_td4 (ctor))
++	return TD4;
++      return static_cast<table_entry> (res);
++    }
++};
++
++/* Basic tables info.  */
++aes_encrypt_table::table_map aes_encrypt_table::tables
++  = { &Te0, &Te1, &Te2, &Te3 };
++aes_decrypt_table::table_map aes_decrypt_table::tables
++  = { &Td0, &Td1, &Td2, &Td3 };
++
++/* Round tables permutations info.  */
++aes_encrypt_table::table_entry aes_encrypt_table::rounds[]
++  = {TE0, TE1, TE2, TE3};
++aes_decrypt_table::table_entry aes_decrypt_table::rounds[]
++  = {TD0, TD1, TD2, TD3};
++aes_encrypt_table::table_entry aes_encrypt_table::final_rounds[]
++  = {TE2, TE3, TE0, TE1};
++aes_decrypt_table::table_entry aes_decrypt_table::final_rounds[]
++  = {TD4, TD4, TD4, TD4};
++
++aes_encrypt_table::table_type aes_encrypt_table::Te0 = {
++    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
++    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
++    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
++    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
++    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
++    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
++    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
++    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
++    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
++    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
++    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
++    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
++    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
++    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
++    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
++    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
++    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
++    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
++    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
++    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
++    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
++    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
++    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
++    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
++    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
++    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
++    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
++    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
++    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
++    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
++    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
++    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
++    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
++    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
++    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
++    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
++    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
++    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
++    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
++    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
++    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
++    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
++    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
++    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
++    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
++    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
++    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
++    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
++    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
++    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
++    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
++    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
++    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
++    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
++    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
++    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
++    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
++    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
++    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
++    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
++    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
++    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
++    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
++    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
++};
++
++aes_encrypt_table::table_type aes_encrypt_table::Te1 = {
++    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
++    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
++    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
++    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
++    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
++    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
++    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
++    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
++    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
++    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
++    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
++    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
++    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
++    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
++    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
++    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
++    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
++    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
++    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
++    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
++    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
++    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
++    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
++    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
++    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
++    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
++    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
++    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
++    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
++    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
++    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
++    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
++    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
++    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
++    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
++    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
++    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
++    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
++    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
++    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
++    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
++    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
++    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
++    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
++    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
++    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
++    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
++    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
++    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
++    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
++    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
++    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
++    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
++    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
++    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
++    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
++    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
++    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
++    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
++    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
++    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
++    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
++    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
++    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
++};
++
++aes_encrypt_table::table_type aes_encrypt_table::Te2 = {
++    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
++    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
++    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
++    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
++    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
++    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
++    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
++    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
++    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
++    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
++    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
++    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
++    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
++    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
++    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
++    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
++    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
++    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
++    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
++    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
++    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
++    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
++    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
++    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
++    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
++    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
++    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
++    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
++    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
++    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
++    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
++    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
++    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
++    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
++    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
++    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
++    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
++    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
++    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
++    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
++    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
++    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
++    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
++    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
++    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
++    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
++    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
++    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
++    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
++    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
++    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
++    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
++    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
++    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
++    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
++    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
++    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
++    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
++    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
++    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
++    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
++    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
++    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
++    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
++};
++
++aes_encrypt_table::table_type aes_encrypt_table::Te3 = {
++    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
++    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
++    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
++    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
++    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
++    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
++    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
++    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
++    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
++    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
++    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
++    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
++    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
++    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
++    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
++    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
++    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
++    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
++    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
++    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
++    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
++    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
++    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
++    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
++    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
++    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
++    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
++    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
++    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
++    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
++    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
++    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
++    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
++    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
++    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
++    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
++    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
++    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
++    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
++    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
++    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
++    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
++    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
++    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
++    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
++    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
++    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
++    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
++    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
++    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
++    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
++    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
++    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
++    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
++    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
++    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
++    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
++    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
++    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
++    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
++    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
++    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
++    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
++    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
++};
++
++aes_decrypt_table::table_type aes_decrypt_table::Td0 = {
++    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
++    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
++    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
++    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
++    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
++    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
++    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
++    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
++    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
++    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
++    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
++    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
++    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
++    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
++    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
++    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
++    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
++    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
++    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
++    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
++    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
++    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
++    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
++    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
++    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
++    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
++    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
++    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
++    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
++    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
++    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
++    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
++    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
++    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
++    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
++    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
++    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
++    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
++    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
++    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
++    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
++    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
++    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
++    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
++    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
++    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
++    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
++    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
++    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
++    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
++    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
++    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
++    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
++    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
++    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
++    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
++    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
++    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
++    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
++    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
++    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
++    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
++    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
++    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
++};
++
++aes_decrypt_table::table_type aes_decrypt_table::Td1 = {
++    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
++    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
++    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
++    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
++    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
++    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
++    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
++    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
++    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
++    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
++    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
++    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
++    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
++    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
++    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
++    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
++    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
++    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
++    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
++    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
++    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
++    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
++    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
++    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
++    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
++    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
++    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
++    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
++    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
++    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
++    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
++    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
++    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
++    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
++    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
++    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
++    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
++    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
++    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
++    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
++    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
++    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
++    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
++    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
++    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
++    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
++    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
++    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
++    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
++    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
++    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
++    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
++    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
++    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
++    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
++    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
++    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
++    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
++    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
++    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
++    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
++    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
++    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
++    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
++};
++
++aes_decrypt_table::table_type aes_decrypt_table::Td2 = {
++    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
++    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
++    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
++    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
++    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
++    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
++    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
++    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
++    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
++    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
++    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
++    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
++    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
++    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
++    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
++    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
++    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
++    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
++    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
++    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
++    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
++    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
++    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
++    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
++    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
++    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
++    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
++    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
++    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
++    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
++    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
++    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
++    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
++    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
++    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
++    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
++    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
++    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
++    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
++    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
++    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
++    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
++    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
++    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
++    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
++    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
++    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
++    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
++    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
++    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
++    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
++    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
++    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
++    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
++    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
++    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
++    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
++    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
++    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
++    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
++    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
++    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
++    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
++    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
++};
++
++aes_decrypt_table::table_type aes_decrypt_table::Td3 = {
++    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
++    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
++    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
++    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
++    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
++    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
++    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
++    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
++    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
++    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
++    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
++    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
++    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
++    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
++    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
++    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
++    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
++    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
++    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
++    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
++    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
++    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
++    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
++    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
++    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
++    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
++    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
++    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
++    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
++    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
++    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
++    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
++    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
++    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
++    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
++    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
++    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
++    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
++    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
++    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
++    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
++    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
++    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
++    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
++    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
++    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
++    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
++    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
++    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
++    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
++    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
++    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
++    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
++    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
++    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
++    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
++    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
++    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
++    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
++    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
++    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
++    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
++    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
++    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
++};
++
++const unsigned char aes_decrypt_table::Td4[table_nelts] = {
++    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
++    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
++    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
++    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
++    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
++    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
++    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
++    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
++    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
++    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
++    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
++    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
++    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
++    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
++    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
++    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
++    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
++    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
++    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
++    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
++    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
++    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
++    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
++    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
++    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
++    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
++    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
++    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
++    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
++    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
++    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
++    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
++};
++
++/* In-round shifts info: entry I is the shift expected at position I.  */
++static const unsigned HOST_WIDE_INT shift_csts[4] = {24, 16, 8, 0};
++
++/* Tell whether OP is (plus X (const_int N)).  Helper for memref analysis.  */
++static bool
++plus_const_int_p (rtx op)
++{
++  return GET_CODE (op) != PLUS ? false : CONST_INT_P (XEXP (op, 1));
++}
++
++/* Obtain BASE and constant OFFSET of the MEM address; false on failure.  */
++static bool
++decompose_mem (rtx mem, rtx &base, unsigned HOST_WIDE_INT &offset)
++{
++  address_info ainfo;
++  decompose_mem_address (&ainfo, mem);
++  if (ainfo.base == NULL)
++    return false;
++
++  base = *ainfo.base;
++  rtx addr = XEXP (mem, 0);
++
++  const bool displaced_p = plus_const_int_p (addr);
++  /* TODO: WRONG IN GENERAL CASE: we cannot guarantee that the offsets were not
++     changed.  */
++  const bool zero_p
++    = ((GET_CODE (addr) == PRE_MODIFY && plus_const_int_p (XEXP (addr, 1)))
++       || REG_P (addr));
++  if (!displaced_p && !zero_p)
++    return false;
++
++  offset = displaced_p ? UINTVAL (XEXP (addr, 1)) : 0;
++  return true;
++}
++
++/* Check if operands 0 and 1 of STMT are exactly LHS and RHS respectively.  */
++static bool
++cmp_regs_in_stmt (rtx stmt, rtx lhs, rtx rhs)
++{
++  return lhs == XEXP (stmt, 0) && rhs == XEXP (stmt, 1);
++}
++
++/* AES key info.  Inherited from mem_term_info to be used inside
++   matchers without any unnecessary casts.  */
++struct aes_key : mem_term_info
++{
++  aes_key ()
++    {}
++  aes_key (void *)
++    : mem_term_info (NULL, NULL_RTX)
++    {}
++  aes_key (const mem_term_info &m)
++    : mem_term_info (m)
++    {}
++
++  /* Check if the key has the same base pointer origin as another one.
++     This check is required due to some possible CSE optimizations applied on
++     pointers before this pass.  */
++  bool has_same_origin (const aes_key &other, rtx_insn *use_point) const
++    {
++      /* Simple case: the pointer is same.  */
++      if (src == other.src)
++	return true;
++
++      if (!use_point)
++	return false;
++
++      basic_block curr_bb = BLOCK_FOR_INSN (use_point);
++      if (!single_pred_p (curr_bb)
++	  || modified_between_p (src, BB_HEAD (curr_bb), use_point)
++	  || modified_between_p (other.src, BB_HEAD (curr_bb), use_point))
++	return false;
++
++      edge e = single_pred_edge (curr_bb);
++      rtx_insn *jump = BB_END (e->src);
++      if (!any_condjump_p (jump))
++	return false;
++
++      basic_block from_bb = BLOCK_FOR_INSN (jump);
++      if (EDGE_COUNT (from_bb->succs) != 2)
++	return false;
++
++      /* Need proof that the sources are equal: try to get it from the
++	 terminating condition.  Successor order is not fixed, so pick edges
++	 by flags; also require the jump to branch when COND is true.  */
++      rtx ite = SET_SRC (pc_set (jump));
++      rtx cond = XEXP (ite, 0);
++      rtx_code code = GET_CODE (cond);
++      if (GET_CODE (XEXP (ite, 2)) != PC
++	  || !((code == EQ && BRANCH_EDGE (from_bb) == e)
++	       || (code == NE && FALLTHRU_EDGE (from_bb) == e)))
++	return false;
++
++      rtx arg1 = XEXP (cond, 0);
++      if (XEXP (cond, 1) != CONST0_RTX (GET_MODE (arg1))
++	  || COMPARISON_P (arg1))
++	return false;
++
++      rtx_insn *cmp_insn = get_single_def_insn (jump, arg1);
++      rtx cmp;
++      if (!cmp_insn || !(cmp = get_single_set_op<COMPARE> (cmp_insn)))
++	return false;
++
++      return (cmp_regs_in_stmt (cmp, src, other.src)
++	      || cmp_regs_in_stmt (cmp, other.src, src));
++    }
++};
++
++/* AES basic state input info.  Inherited from mem_term_info
++   to use it in matchers without any unnecessary casts.  */
++struct state_input_info : mem_term_info
++{
++  state_input_info ()
++    : is_key (false)
++    {}
++  state_input_info (const aes_key &k)
++    : mem_term_info (k), is_key (true)
++    {}
++  state_input_info (const mem_term_info &m)
++    : mem_term_info (m), is_key (false)
++    {}
++
++  /* True if the input was constructed from an aes_key.  */
++  bool is_key;
++
++  /* Check that the input shares its basic block with PREV, if any.  */
++  bool verify (const state_input_info *prev) const
++    {
++      return !prev || BLOCK_FOR_INSN (loc) == BLOCK_FOR_INSN (prev->loc);
++    }
++};
++
++/* Matcher term accepting only MEMs that decompose_mem can analyze.  */
++struct mem_matcher : matcher_term<mem_term_info>
++{
++  static bool match (rtx_insn *insn, holder_type &m)
++    {
++      rtx mem = get_single_set_op<MEM> (insn);
++      return mem ? match (mem, insn, m) : false;
++    }
++
++  static bool match (rtx src, rtx_insn *insn, holder_type &m)
++    {
++      if (!MEM_P (src))
++	return false;
++      mem_term_info term (NULL, NULL_RTX);
++      if (decompose_mem (src, term.src, term.offset))
++	{
++	  term.loc = insn;
++	  m[0] = term;
++	  return true;
++	}
++      return false;
++    }
++};
++
++/* AES entry input info.  Derived from state input since the two are
++   conceptually similar.  */
++struct input_info : state_input_info
++{
++  input_info ()
++    : shift_cst (0)
++    {}
++  input_info (const mem_term_info &m, unsigned HOST_WIDE_INT shift_cst)
++    : state_input_info (m), shift_cst (shift_cst)
++    {}
++  input_info (const aes_key &k)
++    : state_input_info (k), shift_cst (0)
++    {}
++
++  unsigned HOST_WIDE_INT shift_cst;
++
++  /* Input info is sorted by references offsets.  */
++  bool operator < (const input_info &rhs) const
++    {
++      return offset < rhs.offset;
++    }
++
++  std::pair<rtx, unsigned HOST_WIDE_INT> input () const
++    {
++      return std::make_pair (src, offset);
++    }
++
++  bool verify (const input_info *prev, unsigned i) const
++    {
++      if (!state_input_info::verify (prev))
++	return false;
++
++      /* PREV should reference the previous element of the same buffer.  */
++      if (prev && (src != prev->src || offset != prev->offset + 1))
++	return false;
++
++      /* State should use the corresponding shift constant.  */
++      return shift_csts[i] == shift_cst;
++    }
++
++  static bool finalize (rtx_insn *insn, input_info *m)
++    {
++      typedef unop_matcher<ZERO_EXTEND, mem_matcher> zext_matcher;
++
++      zext_matcher::holder_type zext;
++      if (zext_matcher::match (insn, zext))
++	{
++	  *m = input_info (zext[0], 0);
++	  return true;
++	}
++
++      typedef binop_matcher<ASHIFT,
++			    zext_matcher, int_cst_matcher<mem_term_info> >
++	shifted_variant;
++      shifted_variant::holder_type lsh;
++      if (!shifted_variant::match (insn, lsh))
++	return false;
++
++      gcc_assert (CONST_INT_P (lsh[1].src));
++      *m = input_info (lsh[0], UINTVAL (lsh[1].src));
++      return true;
++    }
++};
++
++/* Check if the corresponding constants combinations may be used for
++   AES table access: the mask AND_CST must be 0xFF, and an explicit mask
++   (AND_PRESENT) is mandatory unless SHIFT_CST is 24.  Any shift other than
++   0, 8, 16 or 24 is rejected.  */
++static bool
++verify_table_access (unsigned HOST_WIDE_INT shift_cst,
++		     unsigned HOST_WIDE_INT and_cst = 0xFF,
++		     bool and_present = true)
++{
++  if (and_cst != 0xFF)
++    return false;
++
++  /* A shift by 24 is accepted with or without the mask.  */
++  if (shift_cst == 24)
++    return true;
++
++  /* The other byte positions are valid only under an explicit mask.  */
++  if (shift_cst == 0 || shift_cst == 8 || shift_cst == 16)
++    return and_present;
++
++  return false;
++}
++
++/* Description of a single AES table reference.  */
++template<typename TABLE_T>
++struct aes_table_ref
++{
++  rtx_insn *insn;
++  rtx_insn *output_insn;
++  unsigned HOST_WIDE_INT lsr_cst;
++  rtx reg;
++  rtx output;
++  typename TABLE_T::table_entry itable;
++  bool is_final;
++
++  bool verify (unsigned i) const
++    {
++      if (lsr_cst != shift_csts[i])
++	return false;
++      return itable == (is_final ? TABLE_T::final_rounds : TABLE_T::rounds)[i];
++    }
++};
++
++/* Check the minimal requirements of the pattern to be a table reference
++   and wrap the table id getter function.  */
++template<typename T>
++static typename T::table_entry
++check_table (rtx mem)
++{
++  tree ref = MEM_EXPR (mem);
++  if (ref && TREE_CODE (ref) == ARRAY_REF)
++    {
++      tree array = TREE_OPERAND (ref, 0);
++      if (array && DECL_P (array) && TREE_READONLY (array))
++	{
++	  tree init = DECL_INITIAL (array);
++	  if (init)
++	    return T::get_table_id (init);
++	}
++    }
++
++  return T::BAD_TABLE;
++}
++
++/* Simplified memory info.  Used for simpler table ref analysis.  */
++struct simplified_mem_info
++{
++  rtx base_reg;
++  rtx index;
++};
++
++/* Try to split the address of MEM into a base register and an index.  */
++static bool
++decompose_tref_mem_address (simplified_mem_info &info, rtx mem)
++{
++  address_info ainfo;
++  decompose_mem_address (&ainfo, mem);
++  if (ainfo.base == NULL || ainfo.index == NULL)
++    return false;
++
++  info.base_reg = *ainfo.base;
++  info.index = *ainfo.index;
++  if (!REG_P (info.base_reg))
++    return false;
++
++  /* Apart from SImode, only QImode accesses are accepted, index as is.  */
++  if (ainfo.mode != SImode)
++    return ainfo.mode == QImode;
++
++  /* SImode accesses must have the index multiplied by 4.  */
++  rtx scaled = info.index;
++  if (GET_CODE (scaled) != MULT)
++    return false;
++
++  rtx scale = XEXP (scaled, 1);
++  if (!CONST_INT_P (scale) || UINTVAL (scale) != 4)
++    return false;
++
++  info.index = XEXP (scaled, 0);
++  return true;
++}
++
++/* Find the possible final output instruction.  Specialized per table type.  */
++template<typename TABLE_T>
++static rtx_insn *
++get_possible_final_output (rtx_insn *insn, rtx reg,
++			   unsigned HOST_WIDE_INT shift_cst,
++			   typename TABLE_T::table_entry itable);
++
++/* Specialize the function for AES encryption.  The output is AND instruction
++   with proper constant, i.e. the byte mask corresponding to SHIFT_CST.
++   Return the insn found by get_single_use_insn for REG if it is such an
++   AND, otherwise NULL.  The table entry argument is not used.  */
++template<>
++rtx_insn *
++get_possible_final_output<aes_encrypt_table> (rtx_insn *insn, rtx reg,
++					      unsigned HOST_WIDE_INT shift_cst,
++					      aes_encrypt_table::table_entry)
++{
++  rtx_insn *use = get_single_use_insn (insn, reg);
++  if (!use)
++    return NULL;
++
++  rtx mask = get_op_const_cst<AND> (use);
++  if (!mask)
++    return NULL;
++
++  /* Mask that the AND must apply for the given shift.
++     NOTE(review): the value for 24 has all the bits above bit 31 set as
++     well, presumably because UINTVAL returns the sign-extended SImode
++     constant -- confirm.  */
++  unsigned HOST_WIDE_INT expected;
++  if (shift_cst == 24)
++    expected = 0xffffffffff000000;
++  else if (shift_cst == 16)
++    expected = 0xff0000;
++  else if (shift_cst == 8)
++    expected = 0xff00;
++  else if (shift_cst == 0)
++    expected = 0xff;
++  else
++    gcc_unreachable ();
++
++  /* Anything but the exact mask disqualifies the candidate.  */
++  return UINTVAL (mask) != expected ? NULL : use;
++}
++
++/* Specialize the function for AES decryption.  The output is ASHIFT instruction
++   with proper constant or direct reference to TD4 table.
++
++   TODO: TD4 check might be done here for all the cases.  However, now it is not
++   done here to make decryption and encryption matching
++   more general in common.  */
++template<>
++rtx_insn *
++get_possible_final_output<aes_decrypt_table> (rtx_insn *insn, rtx reg,
++					      unsigned HOST_WIDE_INT shift_cst,
++					      aes_decrypt_table::table_entry it)
++{
++  rtx_insn *use = get_single_use_insn (insn, reg);
++  if (!use)
++    return NULL;
++
++  rtx amount = get_op_const_cst<ASHIFT> (use);
++  if (amount)
++    return UINTVAL (amount) != shift_cst ? NULL : use;
++
++  /* No shift case: only a TD4 reference is accepted, INSN being the output.  */
++  return it != aes_decrypt_table::TD4 ? NULL : insn;
++}
++
++typedef arg_op_matcher<REG> reg_matcher;
++
++/* Helper that matches memory loads referencing AES tables of kind TABLE_T.  */
++template<typename TABLE_T>
++class tref_matcher
++{
++  /* (reg >> cst) matcher.  Helper.  */
++  typedef binop_matcher<LSHIFTRT,
++			reg_matcher,
++			int_cst_matcher<minimal_term_info> > table_access;
++  /* zext (reg >> cst) matcher.  Used for TABLE[(val >> 24)] variant.  */
++  typedef unop_matcher<ZERO_EXTEND, table_access> direct;
++  /* zext ((reg >> cst1) & cst2) matcher.  Used for
++     TABLE[(val >> (16|8)) & 0xff] variant.  */
++  typedef unop_matcher<ZERO_EXTEND,
++		       binop_matcher<AND,
++			 table_access,
++			 int_cst_matcher<minimal_term_info> > > shifted;
++  /* zext (reg & cst) matcher.  Used for TABLE[val & 0xff] variant.  */
++  typedef unop_matcher<ZERO_EXTEND,
++		       binop_matcher<AND,
++			 reg_matcher,
++			 int_cst_matcher<minimal_term_info> > > noshift;
++
++  std::map<rtx, typename TABLE_T::table_entry> table_alias; /* Base reg -> table.  */
++
++  bool finalize (aes_table_ref<TABLE_T> &tref,
++		 minimal_term_info &input_info,
++		 minimal_term_info *shift_info = NULL,
++		 minimal_term_info *mask_info = NULL)
++    {
++      gcc_assert (REG_P (input_info.src));
++      gcc_assert (!shift_info || CONST_INT_P (shift_info->src));
++      gcc_assert (!mask_info || CONST_INT_P (mask_info->src));
++
++      unsigned HOST_WIDE_INT shift
++	= shift_info ? UINTVAL (shift_info->src) : 0; /* No shift means 0.  */
++      unsigned HOST_WIDE_INT mask
++	= mask_info ? UINTVAL (mask_info->src) : 0xFF; /* No mask means 0xFF.  */
++      if (!verify_table_access (shift, mask, mask_info))
++	return false;
++
++      tref.insn = input_info.loc;
++      tref.reg = input_info.src;
++      tref.lsr_cst = shift;
++      return true;
++    }
++
++  bool match (rtx_insn *insn, rtx index, aes_table_ref<TABLE_T> &tref)
++    {
++      direct::holder_type direct_res;
++      if (direct::match (index, insn, direct_res))
++	return finalize (tref, direct_res[0], &direct_res[1]);
++
++      shifted::holder_type shifted_res;
++      if (shifted::match (index, insn, shifted_res))
++	return finalize (tref, shifted_res[0],
++			 &shifted_res[1], &shifted_res[2]);
++
++      noshift::holder_type noshift_res;
++      return noshift::match (index, insn, noshift_res)
++	&& finalize (tref, noshift_res[0], NULL, &noshift_res[1]);
++    }
++
++public:
++  bool match (rtx_insn *insn, aes_table_ref<TABLE_T> &tref)
++    {
++      rtx mem = get_single_set_op<MEM> (insn);
++      if (!mem && (mem = get_single_set_op<ZERO_EXTEND> (insn)))
++	mem = XEXP (mem, 0); /* Look through the zero extension.  */
++
++      rtx dst = get_single_set_dst<REG> (insn);
++      if (!mem || !MEM_P (mem) || !dst || GET_MODE (dst) != SImode)
++	return false;
++
++      simplified_mem_info info;
++      if (!decompose_tref_mem_address (info, mem)
++	  || !match (insn, info.index, tref))
++	return false;
++
++      typename TABLE_T::table_entry itable;
++      if (!table_alias.count (info.base_reg)) /* First use of this base.  */
++	{
++	  itable = check_table<TABLE_T> (mem);
++	  if (itable == TABLE_T::BAD_TABLE)
++	    return false;
++	  table_alias[info.base_reg] = itable;
++	}
++      else
++	itable = table_alias.at (info.base_reg);
++
++      if (rtx_insn *out = get_possible_final_output<TABLE_T> (insn, dst,
++							      tref.lsr_cst,
++							      itable))
++	{
++	  tref.is_final = true;
++	  tref.output_insn = out;
++	  tref.output = NULL_RTX;
++	}
++      else
++	{
++	  tref.is_final = false;
++	  tref.output_insn = insn;
++	  tref.output = dst;
++	}
++
++      tref.itable = itable;
++      return true;
++    }
++};
++
++/* AES stage description.  Required for some specializations
++   for certain rounds.  */
++typedef enum { INPUT, MIDDLE, FINAL } aes_stage;
++
++/* AES entity description.  It can be either a round or a state inside a round.
++   It provides an interface for unified analysis between blocks of 4 parts:
++   round -> 4 states -> 4 * 4 arguments.  */
++template<typename ENTRY_T, aes_stage STAGE, typename K>
++struct aes_entity
++{
++  aes_key key;
++  std::set<ENTRY_T> entries;
++  rtx_insn *loc;
++
++  aes_entity ()
++    : key (NULL), loc (NULL)
++    {}
++
++  /* Push new entry to the entity.  Fails once four entries are stored.  */
++  bool push_entry (const ENTRY_T &v)
++    {
++      if (entries.size () == 4)
++	return false;
++
++      entries.insert (v);
++      return true;
++    }
++
++  /* The entities are sorted by key offset.  */
++  bool operator < (const aes_entity &rhs) const
++    {
++      return key.offset < rhs.key.offset;
++    }
++
++  /* Verify that all of the entries are correct within their positions inside
++     the entity.  Exactly four entries are required.  */
++  bool finalize ()
++    {
++      if (entries.size () != 4)
++	return false;
++
++      unsigned i = 0;
++      const ENTRY_T *prev = NULL;
++      for (typename std::set<ENTRY_T>::iterator it = entries.begin ();
++	   it != entries.end (); prev = &*it++, ++i)
++	if (!it->verify (prev, i))
++	  return false;
++
++      loc = entries.begin ()->loc; /* Location of the first entry in set order.  */
++      return true;
++    }
++};
++
++/* Check the correctness of input regs permutations.  */
++template<typename K>
++static bool
++check_input_regs (const std::vector<rtx> &curr,
++		  const std::vector<rtx> &prev);
++
++/* AES encryption: CURR must be PREV rotated by one position.  */
++template<>
++bool
++check_input_regs<aes_encrypt_table> (const std::vector<rtx> &curr,
++				 const std::vector<rtx> &prev)
++{
++  gcc_assert (curr.size () == 4 && prev.size () == 4);
++  /* curr[i] is expected to be prev[(i + 1) % 4], i.e. indices 1, 2, 3, 0.  */
++  for (unsigned pos = 0; pos < 4; ++pos)
++    if (curr[pos] != prev[(pos + 1) % 4])
++      return false;
++  return true;
++}
++
++/* AES decryption: CURR must be PREV rotated by three positions.  */
++template<>
++bool
++check_input_regs<aes_decrypt_table> (const std::vector<rtx> &curr,
++				 const std::vector<rtx> &prev)
++{
++  gcc_assert (curr.size () == 4 && prev.size () == 4);
++  /* curr[i] is expected to be prev[(i + 3) % 4], i.e. indices 3, 0, 1, 2.  */
++  for (unsigned pos = 0; pos < 4; ++pos)
++    if (curr[pos] != prev[(pos + 3) % 4])
++      return false;
++  return true;
++}
++
++/* Basic description of state input: a list of rtx values.  */
++template<aes_stage STAGE>
++struct state_input
++{
++  typedef std::vector<rtx> type;
++
++  static void finalize (type &inputs, rtx value)
++    {
++      inputs.push_back (value);
++    }
++
++  template<typename K>
++  static bool verify (const type &curr, const type &prev)
++    {
++      return check_input_regs<K> (curr, prev);
++    }
++};
++
++/* Input round state uses special input: an (rtx, HOST_WIDE_INT) pair.  */
++template<>
++struct state_input<INPUT>
++{
++  typedef std::pair<rtx, unsigned HOST_WIDE_INT> type;
++
++  static void finalize (type &dst, const type &src)
++    {
++      /* Order is inverted, hence the adjustment by 3.  */
++      dst.first = src.first;
++      dst.second = src.second - 3;
++    }
++
++  template<typename>
++  static bool verify (const type &curr, const type &prev)
++    {
++      /* Same first element; second elements exactly 4 apart.  */
++      return curr.first == prev.first && curr.second == prev.second + 4;
++    }
++};
++
++/* Basic description of state output.  Any pair of outputs is accepted.  */
++template<aes_stage STAGE>
++struct state_output
++{
++  typedef rtx type;
++
++  static bool verify (const type &, const type &)
++    {
++      return true;
++    }
++};
++
++/* Final round state generates special output.  */
++template<>
++struct state_output<FINAL>
++{
++  typedef std::pair<rtx, unsigned HOST_WIDE_INT> type;
++
++  static bool verify (const type &curr, const type &prev)
++    {
++      /* Same first element; second elements exactly 4 apart.  */
++      return curr.first == prev.first && curr.second == prev.second + 4;
++    }
++};
++
++/* Basic description of round input.  */
++template<aes_stage STAGE>
++struct round_input
++{
++  typedef std::vector<rtx> type;
++};
++
++/* Input round uses the same special input type as its state.  */
++template<>
++struct round_input<INPUT>
++{
++  typedef std::pair<rtx, unsigned HOST_WIDE_INT> type;
++};
++
++/* Basic description of round output: the outputs of its four entries.  */
++template<aes_stage STAGE>
++struct round_output
++{
++  typedef std::vector<rtx> type;
++
++  template<typename T>
++  static void finalize (type &outputs, const T &states)
++    {
++      gcc_assert (states.size () == 4);
++      for (typename T::const_iterator st = states.begin (); st != states.end (); ++st)
++	outputs.push_back (st->output);
++    }
++
++  template<typename K>
++  static void reorder (type &)
++    {}
++};
++
++/* For AES decryption the round output order differs from AES encryption:
++   elements 1 and 3 are exchanged.  */
++template<>
++template<>
++void round_output<INPUT>::reorder<aes_decrypt_table> (type &regs)
++{
++  gcc_assert (regs.size () == 4);
++  std::swap (regs[3], regs[1]);
++}
++
++/* Middle rounds reorder exactly like the input round.  */
++template<> template<>
++void round_output<MIDDLE>::reorder<aes_decrypt_table> (type &regs)
++{
++  round_output<INPUT>::reorder<aes_decrypt_table> (regs);
++}
++
++/* Final round generates special output: the output of its first state.  */
++template<>
++struct round_output<FINAL> : state_output<FINAL>
++{
++  template<typename T>
++  static void finalize (type &result, const T &states)
++    {
++      gcc_assert (states.size () == 4);
++      result = states.begin ()->output;
++    }
++
++  template<typename K>
++  static void reorder (type &)
++    {}
++};
++
++/* AES state description.  */
++template<typename ENTRY_T, aes_stage STAGE, typename K>
++struct aes_state : aes_entity<ENTRY_T, STAGE, K>
++{
++  typedef aes_entity<ENTRY_T, STAGE, K> base_entity;
++
++  typename state_input<STAGE>::type input;
++  typename state_output<STAGE>::type output;
++
++  aes_state ()
++    : base_entity ()
++    {}
++
++  void set_output (const typename state_output<STAGE>::type &o)
++    {
++      output = o;
++    }
++
++  bool push_entry (const ENTRY_T &v)
++    {
++      if (!v.is_key)
++	return base_entity::push_entry (v);
++
++      if (this->key.src)
++	return false; /* A key was already recorded for this state.  */
++
++      this->key = v;
++      return true;
++    }
++
++  /* Verify if the state is correct within its position in round.  */
++  bool verify (const aes_state *prev, unsigned) const
++    {
++      if (!prev)
++	return true;
++
++      if (!this->key.has_same_origin (prev->key, this->loc)
++	  || this->key.offset != prev->key.offset + 4
++	  || BLOCK_FOR_INSN (this->loc) != BLOCK_FOR_INSN (prev->loc))
++	return false;
++
++      return state_input<STAGE>::template verify<K> (input, prev->input)
++	&& state_output<STAGE>::verify (output, prev->output);
++    }
++
++  /* Check if the entries of the state are correct and finalize stored info.  */
++  bool finalize ()
++    {
++      if (!base_entity::finalize ())
++	return false;
++
++      for (typename std::set<ENTRY_T>::iterator it = this->entries.begin ();
++	   it != this->entries.end (); ++it)
++	state_input<STAGE>::finalize (input, it->input ());
++
++      return true;
++    }
++};
++
++/* AES round description.  */
++template<typename ENTRY_T, aes_stage STAGE, typename K>
++struct aes_round : aes_entity<aes_state<ENTRY_T, STAGE, K>, STAGE, K>
++{
++  typedef aes_entity<aes_state<ENTRY_T, STAGE, K>, STAGE, K> base_entity;
++
++  typename round_input<STAGE>::type input;
++  typename round_output<STAGE>::type output;
++
++  /* Check if the states are correct and finalize stored info.  */
++  bool finalize ()
++    {
++      if (!base_entity::finalize ())
++	return false;
++
++      input = this->entries.begin ()->input; /* Taken from the first state.  */
++      this->key = this->entries.begin ()->key;
++
++      round_output<STAGE>::finalize (output, this->entries);
++      round_output<STAGE>::template reorder<K> (output);
++
++      return true;
++    }
++};
++
++template<typename T>
++class aes_optimizer;
++
++/* AES round input info.  Used to find and store info about
++   table references.  Default- and key-constructed objects have a NULL TREF.
++
++   Must be inited and finalized before and after usage.  */
++template<typename T>
++struct round_input_info : state_input_info
++{
++  typedef typename aes_optimizer<T>::table_ref_map tref_map;
++
++  round_input_info ()
++    : tref (NULL) {}
++  round_input_info (rtx_insn *insn, const aes_table_ref<T> *tref)
++    : state_input_info (mem_term_info (insn, NULL_RTX)), tref (tref)
++    {}
++  round_input_info (const aes_key &k)
++    : state_input_info (k), tref (NULL)
++    {}
++
++  rtx input () const
++    {
++      return tref->reg;
++    }
++
++  rtx output () const
++    {
++      return tref->output;
++    }
++
++  /* Table references are sorted by shift constants, largest first.
++     TODO: probably sort by key offset?  */
++  bool operator < (const round_input_info &rhs) const
++    {
++      return tref->lsr_cst > rhs.tref->lsr_cst;
++    }
++
++  bool verify (const round_input_info *prev, unsigned i) const
++    {
++      return state_input_info::verify (prev) && tref->verify (i);
++    }
++
++  static bool finalize (rtx_insn *insn, round_input_info *m)
++    {
++      if (checked_p->count (insn))
++	return false;
++
++      typename tref_map::const_iterator it = table_refs_p->find (insn);
++      if (it == table_refs_p->end ())
++	return false;
++
++      m[0] = round_input_info (insn, &it->second);
++      return true;
++    }
++
++  const aes_table_ref<T> *tref;
++
++  static const tref_map *table_refs_p;
++  static const std::set<rtx_insn *> *checked_p;
++
++  /* Store lookup table references.  */
++  static void init (const tref_map &t, const std::set<rtx_insn *> &c)
++    {
++      gcc_assert (!table_refs_p && !checked_p);
++      table_refs_p = &t;
++      checked_p = &c;
++    }
++
++  /* Remove lookup table references.  */
++  static void fin ()
++    {
++      gcc_assert (table_refs_p && checked_p);
++      table_refs_p = NULL;
++      checked_p = NULL;
++    }
++};
++
++template<typename T>
++const typename aes_optimizer<T>::table_ref_map *
++round_input_info<T>::table_refs_p = NULL;
++
++template<typename T>
++const std::set<rtx_insn *> *
++round_input_info<T>::checked_p = NULL;
++
++/* AES encryption/decryption optimizer.  */
++template<typename T>
++class aes_optimizer
++{
++public:
++  typedef std::map<rtx_insn *, aes_table_ref<T> > table_ref_map;
++
++  /* AES states typedefs.  */
++  typedef aes_state<input_info, INPUT, T> aes_input_state;
++  typedef aes_state<round_input_info<T>, MIDDLE, T> aes_body_state;
++  typedef aes_state<round_input_info<T>, FINAL, T> aes_final_state;
++
++  /* AES rounds typedefs.  */
++  typedef aes_round<input_info, INPUT, T> aes_input_round;
++  typedef aes_round<round_input_info<T>, MIDDLE, T> aes_body_round;
++  typedef aes_round<round_input_info<T>, FINAL, T> aes_final_round;
++
++  bool run ();
++
++private:
++  bool collect_aes_lookup_tables ();
++  bool form_rounds ();
++  bool find_aes_init_round ();
++  bool collect_state (rtx_insn * insn, aes_body_state &state,
++		      std::set<rtx_insn *> &checked);
++  bool find_aes_rounds ();
++  bool collect_final_round (rtx_insn *insn, aes_final_state &state,
++			    std::set<rtx_insn *> &checked);
++  bool find_aes_final_round ();
++  bool check_aes_pattern ();
++  void erase_unused_rounds (std::set<const std::vector<rtx> *> &used);
++
++  bool gen_aes_code ();
++  bool gen_init_round ();
++  bool gen_round (const aes_body_round &round);
++  bool gen_final_round ();
++
++  rtx gen_or_get_vreg (const std::vector<rtx> &vec);
++  rtx get_vreg (const std::vector<rtx> &vec);
++  rtx gen_vreg (const std::vector<rtx> &vec);
++
++  table_ref_map table_refs; /* Non-final table references, keyed by insn.  */
++  table_ref_map final_table_refs; /* Final ones, keyed by output insn.  */
++
++  aes_input_round input_round;
++  std::map<std::vector<rtx>, aes_body_round> rounds; /* Keyed by round input.  */
++  aes_final_round final_round;
++
++  std::map<std::vector<rtx>, rtx> vec_regs; /* Register list -> vector reg.  */
++  std::vector<rtx_insn*> to_delete; /* Deleted once code generation succeeds.  */
++};
++
++/* Scan every insn of the function and record each AES table reference:
++   final ones by their output insn, the others by the insn itself.  */
++template<typename T>
++bool
++aes_optimizer<T>::collect_aes_lookup_tables ()
++{
++  tref_matcher<T> matcher;
++  basic_block block;
++  rtx_insn *cur;
++  FOR_EACH_BB_FN (block, cfun)
++    FOR_BB_INSNS (block, cur)
++      {
++	aes_table_ref<T> ref;
++	if (!matcher.match (cur, ref))
++	  continue;
++
++	if (ref.is_final)
++	  final_table_refs[ref.output_insn] = ref;
++	else
++	  table_refs[cur] = ref;
++      }
++
++  return !(table_refs.empty () || final_table_refs.empty ());
++}
++
++/* Helper to match all the supported groupings of five-argument
++   calculations.  */
++template<rtx_code OP1, typename TERM, rtx_code OP2 = OP1,
++	 bool store_ops = false>
++struct five_args_calc_matcher
++{
++  /* Helper for matching (op1 * op2).  */
++  typedef binop_matcher<OP1, TERM, TERM, false, store_ops, OP2> two_args_block;
++  /* Helper for matching (op1 * (op2 * op3)).  */
++  typedef binop_matcher<OP1, two_args_block, TERM,
++			true, store_ops, OP2> three_args_block;
++  /* Helper for matching ((op1 * op2) * (op3 * op4)).  */
++  typedef binop_matcher<OP1, two_args_block, two_args_block,
++			false, store_ops, OP2> opt_four_args_block;
++  /* Helper for matching (op1 * (op2 * (op3 * op4))).  */
++  typedef binop_matcher<OP1, three_args_block, TERM,
++			       true, store_ops, OP2> linear_four_args_block;
++
++  /* Match the (op1 * ((op2 * op3) * (op4 * op5))) variant.  */
++  typedef binop_matcher<OP1, opt_four_args_block, TERM,
++			true, store_ops, OP2> opt_op_term;
++  /* Match the ((op1 * op2) * (op3 * (op4 * op5))) variant.  */
++  typedef binop_matcher<OP1, three_args_block, two_args_block,
++			true, store_ops, OP2> three_op_two;
++  /* Match the (op1 * (op2 * (op3 * (op4 * op5)))) variant.  */
++  typedef binop_matcher<OP1, linear_four_args_block, TERM,
++			true, store_ops, OP2> fully_linear;
++
++  static const int holder_size = fully_linear::holder_size;
++  static const int op_num = fully_linear::op_num;
++  typedef typename fully_linear::term_type term_type;
++  typedef typename fully_linear::holder_type holder_type;
++
++  static rtx_insn* match (rtx_insn *insn, holder_type &m, unsigned depth = 1)
++    {
++      for (rtx dst = get_single_set_dst (insn); depth && insn && dst;
++	   insn = get_single_use_insn (insn, dst), /* Step to the user insn.  */
++	   dst = insn ? get_single_set_dst (insn) : NULL_RTX,
++	   --depth)
++	if (opt_op_term::match (insn, m) || three_op_two::match (insn, m)
++	    || fully_linear::match (insn, m))
++	  return insn; /* The insn at which a variant matched.  */
++      return NULL;
++    }
++};
++
++/* Match the AES key: accept whatever mem_matcher accepts.  */
++struct key_matcher : matcher_term<aes_key>
++{
++  static bool match (rtx_insn *insn, holder_type &m)
++    {
++      mem_matcher::holder_type mem_info;
++      bool matched = mem_matcher::match (insn, mem_info);
++      if (matched)
++	m[0] = mem_info[0];
++
++      return matched;
++    }
++};
++
++/* Matcher term for state input.  */
++template<typename T>
++struct state_input_term : matcher_term<T>
++{
++  typedef typename matcher_term<T>::holder_type holder_type;
++
++  /* The (rtx, insn) form never matches.  */
++  static bool match (rtx, rtx_insn *, holder_type &)
++    {
++      return false;
++    }
++
++  /* Accept INSN either as a key or through the term type's finalize.  */
++  static bool match (rtx_insn *insn, holder_type &m)
++    {
++      key_matcher::holder_type key_res;
++      if (!key_matcher::match (insn, key_res))
++	return matcher_term<T>::term_type::finalize (insn, m);
++
++      m[0] = key_res[0];
++      return true;
++    }
++};
++
++/* Push all five ARGS into STATE, then finalize it; stop at the first failure.  */
++template <typename STATE, typename T>
++static bool
++finalize_input (const T (&args)[5], STATE &state)
++{
++  bool pushed = true;
++  for (const T *arg = args; pushed && arg != args + 5; ++arg)
++    pushed = state.push_entry (*arg);
++
++  return pushed && state.finalize ();
++}
++
++/* Construct input state from a five-argument XOR calculation at INSN.  */
++template<typename T>
++static bool
++form_input (rtx_insn *insn, T &state)
++{
++  typedef five_args_calc_matcher<XOR, state_input_term<input_info> >
++    xor_matcher;
++
++  xor_matcher::holder_type terms;
++  if (!(xor_matcher::match (insn, terms) && finalize_input (terms, state)))
++    return false;
++
++  /* TODO: probably should not be set here.  */
++  state.set_output (SET_DEST (single_set (insn)));
++  return true;
++}
++
++/* Get definitions chain for the reg being used in the insn.  Return NULL
++   if the chain is empty or some definition fails check_def_chain_ref.  */
++static df_link *
++get_defs (rtx_insn *insn, rtx reg)
++{
++  df_link *ref_chain = get_def_chain (insn, reg);
++
++  for (df_link *ref_link = ref_chain; ref_link; ref_link = ref_link->next)
++    if (!check_def_chain_ref (ref_link->ref, reg))
++      return NULL;
++
++  return ref_chain;
++}
++
++/* Find AES init round.  To do this, find the table references that depend on
++   two definitions.  One of them is our input.  */
++template<typename T>
++bool
++aes_optimizer<T>::find_aes_init_round ()
++{
++  std::set<rtx_insn *> checked;
++
++  for (typename table_ref_map::iterator it = table_refs.begin (),
++       end = table_refs.end (); it != end; ++it)
++    for (df_link *def = get_defs (it->second.insn, it->second.reg);
++	 def; def = def->next)
++      {
++	rtx_insn *def_insn = DF_REF_INSN (def->ref);
++	if (checked.count (def_insn))
++	  continue; /* Each defining insn is examined once.  */
++
++	aes_input_state input_state;
++	if (form_input (def_insn, input_state)
++	    && !input_round.push_entry (input_state))
++	  return false;
++
++	checked.insert (def_insn);
++      }
++
++  return input_round.finalize ();
++}
++
++/* Collect AES inner state.  */
++template<typename T>
++bool
++aes_optimizer<T>::collect_state (rtx_insn *insn, aes_body_state &state,
++				 std::set<rtx_insn*> &checked)
++{
++  typedef round_input_info<T> term_info;
++  typedef five_args_calc_matcher<XOR, state_input_term<term_info> > matcher;
++
++  typename matcher::holder_type m;
++  term_info::init (table_refs, checked); /* Set the static lookup context.  */
++  rtx_insn *match_entry = matcher::match (insn, m, 3);
++  term_info::fin (); /* Clear it again right after matching.  */
++
++  if (!match_entry || !finalize_input (m, state))
++    return false;
++
++  /* TODO: probably should not be set here.  */
++  state.set_output (SET_DEST (single_set (match_entry)));
++  for (unsigned i = 0; i < 5; ++i)
++    if (!m[i].is_key)
++      checked.insert (m[i].tref->output_insn);
++
++  return true;
++}
++
++/* Simple sorter to link rounds by their registers: orders by REGNO.  */
++struct reg_comp
++{
++  bool operator () (rtx lhs, rtx rhs) const
++    {
++      return REGNO (rhs) > REGNO (lhs);
++    }
++};
++
++/* Find AES inner rounds.  */
++template<typename T>
++bool
++aes_optimizer<T>::find_aes_rounds ()
++{
++  typedef std::set<rtx, reg_comp> input_key;
++
++  std::set<rtx_insn*> checked;
++  std::map<input_key, aes_body_round> candidate_rounds;
++  for (typename table_ref_map::iterator it = table_refs.begin (),
++       end = table_refs.end (); it != end; ++it)
++    {
++      rtx_insn *insn = it->first;
++      if (checked.count (insn))
++	continue;
++
++      rtx_insn *use = get_single_use_insn (insn, SET_DEST (single_set (insn)));
++      if (!use)
++	continue;
++
++      aes_body_state state;
++      if (!collect_state (use, state, checked))
++	continue;
++
++      /* Sort the input so we can find the corresponding state.  */
++      input_key input (state.input.begin (), state.input.end ());
++      candidate_rounds[input].push_entry (state);
++    }
++
++  for (typename std::map<input_key, aes_body_round>::iterator
++       it = candidate_rounds.begin ();
++       it != candidate_rounds.end (); ++it)
++    if (it->second.finalize ()) /* Keep only rounds that finalize.  */
++      rounds[it->second.input] = it->second;
++
++  return !rounds.empty ();
++}
++
++template<typename T>
++struct final_state_matcher;
++
++/* AES encrypt matcher requires an additional check on key calculations
++   due to possible optimizations.  */
++template<>
++struct final_state_matcher<aes_encrypt_table>
++{
++  typedef round_input_info<aes_encrypt_table> term_info;
++  typedef five_args_calc_matcher<XOR, state_input_term<term_info>, IOR, true>
++    matcher;
++  typedef typename matcher::term_type
++    holder_type[matcher::holder_size - matcher::op_num];
++
++  static rtx_insn *match (rtx_insn *insn, holder_type &m, unsigned depth)
++    {
++      matcher::holder_type inner_m;
++      rtx_insn *res = matcher::match (insn, inner_m, depth);
++      if (!res)
++	return NULL;
++
++      /* Run pre-order traversal of the operands to check the correctness
++	 of key usage.  */
++      gcc_assert (inner_m[0].is_op);
++      unsigned pos = 0;
++      if (!check_key_calculations (inner_m, pos))
++	return NULL;
++      gcc_assert (pos == (matcher::holder_size - 1));
++
++      unsigned idx = 0;
++      for (unsigned i = 0; i < matcher::holder_size; ++i)
++	if (!inner_m[i].is_op) /* Copy only the non-operator terms.  */
++	  m[idx++] = inner_m[i];
++
++      gcc_assert (idx == 5);
++      return res;
++    }
++
++  static bool check_key_calculations (const matcher::holder_type &m,
++				      unsigned &idx,
++				      bool failure_on_key = false)
++    {
++      gcc_assert (idx < matcher::holder_size);
++      if (!m[idx].is_op)
++	return !(failure_on_key && m[idx].is_key); /* Reject a key under an IOR.  */
++
++      failure_on_key |= (GET_CODE (m[idx].src) == IOR);
++      return check_key_calculations (m, ++idx, failure_on_key)
++	&& check_key_calculations (m, ++idx, failure_on_key);
++    }
++};
++
++
++/* The final state is a simple wrapper since no additional checks are required
++   here.  */
++template<>
++struct final_state_matcher<aes_decrypt_table>
++{
++  typedef round_input_info<aes_decrypt_table> term_info;
++  typedef five_args_calc_matcher<XOR, state_input_term<term_info> > matcher;
++  typedef typename matcher::holder_type holder_type;
++
++  static rtx_insn *match (rtx_insn *insn, holder_type &m, unsigned depth)
++    {
++      return matcher::match (insn, m, depth);
++    }
++};
++
++/* Match the AES final state: the calculation, then a BSWAP, then a store.  */
++template<typename T>
++bool
++aes_optimizer<T>::collect_final_round (rtx_insn *insn, aes_final_state &state,
++				       std::set<rtx_insn*> &checked)
++{
++  typedef final_state_matcher<T> matcher_wrapper;
++
++  typename matcher_wrapper::holder_type m;
++  matcher_wrapper::term_info::init (final_table_refs, checked);
++  rtx_insn *match_entry = matcher_wrapper::match (insn, m, 3);
++  matcher_wrapper::term_info::fin ();
++
++  rtx dst;
++  if (!match_entry || !(dst = get_single_set_dst (match_entry))
++      || !finalize_input (m, state))
++    return false;
++
++  rtx src;
++  if (!(match_entry = get_single_use_insn (match_entry, dst))
++      || !(check_simple_op<BSWAP> (match_entry, src, dst))
++      || !dst)
++    return false;
++
++  std::pair<rtx, unsigned HOST_WIDE_INT> output;
++  if (!(match_entry = get_single_use_insn (match_entry, dst))
++      || !(dst = get_single_set_dst<MEM> (match_entry))
++      || !decompose_mem (dst, output.first, output.second))
++    return false;
++
++  to_delete.push_back (match_entry); /* The insn whose destination is the MEM.  */
++  state.set_output (output);
++  for (unsigned i = 0; i < 5; ++i)
++    if (!m[i].is_key)
++      checked.insert (m[i].tref->output_insn);
++
++  return true;
++}
++
++/* Find the final round: collect a final state from the single user of every
++   final table reference that has not been consumed yet.  */
++template<typename T>
++bool
++aes_optimizer<T>::find_aes_final_round ()
++{
++  std::set<rtx_insn*> visited;
++  typename table_ref_map::iterator ref = final_table_refs.begin ();
++  for (; ref != final_table_refs.end (); ++ref)
++    {
++      rtx_insn *ref_insn = ref->first;
++      if (visited.count (ref_insn))
++	continue;
++
++      rtx ref_dst = SET_DEST (single_set (ref_insn));
++      rtx_insn *user = get_single_use_insn (ref_insn, ref_dst);
++
++      /* A reference without a single user is skipped.  */
++      aes_final_state state;
++      if (user && collect_final_round (user, state, visited))
++	final_round.push_entry (state);
++    }
++
++  return final_round.finalize ();
++}
++
++/* Find the final, the init and then the inner rounds; all must succeed.  */
++template<typename T>
++bool
++aes_optimizer<T>::form_rounds ()
++{
++  bool ok = find_aes_final_round () && find_aes_init_round ();
++  return ok && find_aes_rounds ();
++}
++
++/* Erase every round whose key address is not present in USED.  */
++template<typename T>
++void
++aes_optimizer<T>::erase_unused_rounds (std::set<const std::vector<rtx> *> &used)
++{
++  if (used.size () == rounds.size ())
++    return;
++
++  typedef typename std::map<std::vector<rtx>, aes_body_round>::iterator iter;
++  for (iter cur = rounds.begin (); cur != rounds.end (); )
++    {
++      iter victim = cur++;
++      if (!used.count (&victim->first))
++	rounds.erase (victim);
++    }
++}
++
++/* Find round starts and link them together by following output -> input.  */
++template<typename T>
++bool
++aes_optimizer<T>::check_aes_pattern ()
++{
++  std::set<const std::vector<rtx> *> checked;
++
++  typename std::map<std::vector<rtx>, aes_body_round>::iterator fit
++    = rounds.find (input_round.output);
++
++  bool to_final = false;
++  while (fit != rounds.end () && !checked.count (&fit->first))
++    {
++      checked.insert (&fit->first);
++
++      if (fit->second.output == final_round.input)
++	to_final = true; /* The chain reaches the final round.  */
++
++      fit = rounds.find (fit->second.output);
++    }
++
++  if (!to_final)
++    return false;
++
++  erase_unused_rounds (checked); /* Drop rounds not visited above.  */
++
++  return true;
++}
++
++/* Emit the four PATTERNS as one sequence after LOC.  Nothing is emitted
++   after LOC unless recog_memoized accepts every pattern.  */
++static bool
++gen_insns (const rtx patterns[4], rtx_insn *loc)
++{
++  bool recognized = true;
++
++  start_sequence ();
++  for (unsigned i = 0; recognized && i < 4; ++i)
++    {
++      rtx_insn *insn = emit_insn (patterns[i]);
++      recognized = recog_memoized (insn) >= 0;
++    }
++  rtx_insn *seq = get_insns ();
++  end_sequence ();
++
++  if (recognized)
++    emit_insn_after (seq, loc);
++  return recognized;
++}
++
++/* Return BASE itself for a zero OFFSET, else a PLUS of BASE and OFFSET.  */
++static rtx
++gen_offset_access (rtx base, unsigned HOST_WIDE_INT offset)
++{
++  if (offset == 0)
++    return base;
++  machine_mode base_mode = GET_MODE (base);
++  return gen_rtx_PLUS (base_mode, base, gen_rtx_CONST_INT (base_mode, offset));
++}
++
++/* Return the register recorded for VEC, or a null rtx if there is none.  */
++template<typename T>
++rtx
++aes_optimizer<T>::get_vreg (const std::vector<rtx> &vec)
++{
++  std::map<std::vector<rtx>, rtx>::iterator pos = vec_regs.find (vec);
++  if (pos == vec_regs.end ())
++    return NULL_RTX;
++  return pos->second;
++}
++
++/* Create a register in the targetm.get_v16qi_mode mode and record it for VEC.  */
++template<typename T>
++rtx
++aes_optimizer<T>::gen_vreg (const std::vector<rtx> &vec)
++{
++  rtx fresh = gen_reg_rtx (targetm.get_v16qi_mode ());
++  vec_regs.insert (std::make_pair (vec, fresh));
++
++  return fresh;
++}
++
++/* Return the register already recorded for VEC, creating one if needed.  */
++template<typename T>
++rtx
++aes_optimizer<T>::gen_or_get_vreg (const std::vector<rtx> &vec)
++{
++  if (rtx known = get_vreg (vec))
++    return known;
++
++  return gen_vreg (vec);
++}
++
++template<typename T>
++static rtx
++gen_aes_single_round (rtx vout, rtx vreg, rtx vkey);
++template<typename T>
++static rtx
++gen_aes_mix_columns (rtx vreg, rtx vin);
++
++template<> /* Encryption round: forwards to targetm.gen_aesev16qi.  */
++rtx
++gen_aes_single_round<aes_encrypt_table> (rtx vout, rtx vreg, rtx vkey)
++{
++  return targetm.gen_aesev16qi (vout, vreg, vkey);
++}
++
++template<> /* Encryption mix columns: forwards to targetm.gen_aesmcv16qi.  */
++rtx
++gen_aes_mix_columns<aes_encrypt_table> (rtx vreg, rtx vin)
++{
++  return targetm.gen_aesmcv16qi (vreg, vin);
++}
++
++template<> /* Decryption round: forwards to targetm.gen_aesdv16qi.  */
++rtx
++gen_aes_single_round<aes_decrypt_table> (rtx vout, rtx vreg, rtx vkey)
++{
++  return targetm.gen_aesdv16qi (vout, vreg, vkey);
++}
++
++template<> /* Decryption mix columns: forwards to targetm.gen_aesimcv16qi.  */
++rtx
++gen_aes_mix_columns<aes_decrypt_table> (rtx vreg, rtx vin)
++{
++  return targetm.gen_aesimcv16qi (vreg, vin);
++}
++
++template<typename T>
++bool
++aes_optimizer<T>::gen_init_round ()
++{
++  rtx_insn *loc = input_round.loc; /* Passed to gen_insns as location.  */
++
++  machine_mode vmode = targetm.get_v16qi_mode ();
++
++  rtx vreg = gen_reg_rtx (vmode);
++  rtx vkey = gen_reg_rtx (vmode);
++  rtx vout = gen_vreg (input_round.output);
++
++  rtx buf = input_round.input.first;
++  rtx key = gen_offset_access (input_round.key.src, input_round.key.offset);
++
++  rtx vload_pat = gen_rtx_SET (vreg,
++			       gen_rtx_MEM (vmode, buf));
++  rtx vkey_load_pat = gen_rtx_SET (vkey,
++				   gen_rtx_MEM (vmode, key));
++  rtx vrev_pat = targetm.gen_rev32v16qi (vkey, vkey);
++  rtx vaes_pat = gen_aes_single_round<T> (vout, vreg, vkey);
++
++  /* Order: load input, load key, rev32 the key, AES round.  */
++  const rtx patterns[4] = {vload_pat, vkey_load_pat, vrev_pat, vaes_pat};
++  return gen_insns (patterns, loc);
++}
++
++template<typename T>
++bool
++aes_optimizer<T>::gen_round (const aes_body_round &round)
++{
++  rtx_insn *loc = round.loc; /* Passed to gen_insns as location.  */
++
++  machine_mode vmode = targetm.get_v16qi_mode ();
++
++  rtx vreg = gen_reg_rtx (vmode);
++  rtx vkey = gen_reg_rtx (vmode);
++  rtx vin  = gen_or_get_vreg (round.input);
++  rtx vout = gen_or_get_vreg (round.output);
++
++  rtx key = gen_offset_access (round.key.src, round.key.offset);
++
++  rtx vkey_load_pat = gen_rtx_SET (vkey,
++				   gen_rtx_MEM (vmode, key));
++  rtx vrev_pat = targetm.gen_rev32v16qi (vkey, vkey);
++  rtx vmix_pat = gen_aes_mix_columns<T> (vreg, vin);
++  rtx vaes_pat = gen_aes_single_round<T> (vout, vreg, vkey);
++
++  /* Order: load key, rev32 the key, mix columns, AES round.  */
++  const rtx patterns[4] = {vkey_load_pat, vrev_pat, vmix_pat, vaes_pat};
++  return gen_insns (patterns, loc);
++}
++
++template<typename T>
++bool
++aes_optimizer<T>::gen_final_round ()
++{
++  rtx_insn *loc = final_round.loc; /* Passed to gen_insns as location.  */
++
++  machine_mode vmode = targetm.get_v16qi_mode ();
++
++  rtx vreg = gen_reg_rtx (vmode);
++  rtx vkey = gen_reg_rtx (vmode);
++  rtx vin = get_vreg (final_round.input);
++
++  gcc_assert (vin); /* A register for the final input must already exist.  */
++
++  rtx buf = final_round.output.first;
++  rtx key = gen_offset_access (final_round.key.src, final_round.key.offset);
++
++  rtx vkey_load_pat = gen_rtx_SET (vkey,
++				   gen_rtx_MEM (vmode, key));
++  rtx vrev_pat = targetm.gen_rev32v16qi (vkey, vkey);
++  rtx vxor_pat = gen_rtx_SET (vreg, gen_rtx_XOR (vmode, vin, vkey));
++  rtx vstore_pat = gen_rtx_SET (gen_rtx_MEM (vmode, buf), vreg);
++
++  /* Order: load key, rev32 the key, XOR with the input, store the result.  */
++  const rtx patterns[4] = {vkey_load_pat, vrev_pat, vxor_pat, vstore_pat};
++  return gen_insns (patterns, loc);
++}
++
++/* Emit the replacement code for the whole matched sequence: the input
++   round, every body round in ROUNDS and the final round, then mark the
++   insns collected in TO_DELETE as deleted.  Stops and returns false as
++   soon as one of the generators fails; nothing is deleted in that case.  */
++template<typename T> bool
++aes_optimizer<T>::gen_aes_code ()
++{
++  typedef typename std::map<std::vector<rtx>, aes_body_round>::iterator
++    round_iter;
++  typedef std::vector<rtx_insn*>::iterator insn_iter;
++
++  bool ok = gen_init_round ();
++  for (round_iter r = rounds.begin (); ok && r != rounds.end (); ++r)
++    ok = gen_round (r->second);
++  ok = ok && gen_final_round ();
++  if (!ok)
++    return false;
++
++  for (insn_iter i = to_delete.begin (); i != to_delete.end (); ++i)
++    SET_INSN_DELETED (*i);
++
++  return true;
++}
++
++/* Run the analysis stages in order and generate the new code only if all of
++   them succeed.  Returns false as soon as one stage reports failure.  */
++template<typename T> bool
++aes_optimizer<T>::run ()
++{
++  if (!collect_aes_lookup_tables () || !form_rounds ())
++    return false;
++  return check_aes_pattern () && gen_aes_code ();
++}
++
++/* Run aes_optimizer for aes_encrypt_table, then for aes_decrypt_table.  */
++static unsigned int
++crypto_acceleration ()
++{
++  aes_optimizer<aes_encrypt_table> encrypt_opt;
++  aes_optimizer<aes_decrypt_table> decrypt_opt;
++  encrypt_opt.run ();
++  decrypt_opt.run ();
++  return 0;
++}
++
++static void
++init_df ()  /* Dataflow setup for the pass; called from execute ().  */
++{
++  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
++  df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN);  /* Both chain kinds.  */
++  df_mir_add_problem ();
++  df_live_add_problem ();
++  df_live_set_all_dirty ();
++  df_analyze ();  /* Solve the problems registered above.  */
++  df_set_flags (DF_DEFER_INSN_RESCAN);  /* Set only after the analysis.  */
++}
++
++namespace {
++
++const pass_data pass_data_crypto_accel =
++{
++  RTL_PASS,	   // type
++  "crypto_accel",  // name
++  OPTGROUP_NONE,   // optinfo_flags
++  TV_CRYPTO_ACCEL, // tv_id
++  PROP_cfglayout,  // properties_required
++  0,		   // properties_provided
++  0,		   // properties_destroyed
++  0,		   // todo_flags_start
++  TODO_df_finish,  // todo_flags_finish
++};
++
++class pass_crypto_accel : public rtl_opt_pass
++{
++public:
++  pass_crypto_accel (gcc::context *ctxt)
++    : rtl_opt_pass (pass_data_crypto_accel, ctxt)
++  {}
++
++  /* Needs -fcrypto-accel-aes and every target hook the pass may call.  */
++  virtual bool gate (function *)
++    {
++      if (flag_crypto_accel_aes <= 0)
++	return false;
++      return targetm.get_v16qi_mode && targetm.gen_rev32v16qi
++	&& targetm.gen_aesev16qi && targetm.gen_aesmcv16qi
++	&& targetm.gen_aesdv16qi && targetm.gen_aesimcv16qi;
++    }
++
++  /* Set up the dataflow information, then run the AES optimizers.  */
++  virtual unsigned int execute (function *)
++    {
++      init_df ();
++      return crypto_acceleration ();
++    }
++}; // class pass_crypto_accel
++
++} // anon namespace
++
++rtl_opt_pass *
++make_pass_crypto_accel (gcc::context *ctxt)
++{
++  return new pass_crypto_accel (ctxt);  /* New pass object built on CTXT.  */
++}
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index 3b6e90bf2..2aba523bb 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -12125,6 +12125,35 @@ types.  If @var{has_wb} is not NULL then its value is set to true if STP
+ contains post-index or pre-index operation.
+ @end deftypefn
+ 
++@deftypefn {Target Hook} machine_mode TARGET_GET_V16QI_MODE ()
++This function get the 16 byte elements vector mode if target supports this.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_REV32V16QI (rtx @var{dest}, rtx @var{src})
++This function generate the byte reverse instruction
++ of 16 byte elements vector if target supports this.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESEV16QI (rtx @var{dest}, rtx @var{src1}, rtx @var{src2})
++This function generate the AES encryption instruction
++ of 16 byte elements vector if target supports this.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESDV16QI (rtx @var{dest}, rtx @var{src1}, rtx @var{src2})
++This function generate the AES decryption instruction
++ of 16 byte elements vector if target supports this.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESMCV16QI (rtx @var{dest}, rtx @var{src})
++This function generate the AES mix columns instruction
++ of 16 byte elements vector if target supports this.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESIMCV16QI (rtx @var{dest}, rtx @var{src})
++This function generate the AES inversed mix columns instruction
++ of 16 byte elements vector if target supports this.
++@end deftypefn
++
+ @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void)
+ This target hook returns @code{true} past the point in which new jump
+ instructions could be created.  On machines that require a register for
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index 6ff60e562..817d586ff 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -7981,6 +7981,18 @@ lists.
+ 
+ @hook TARGET_IS_STP_INSN
+ 
++@hook TARGET_GET_V16QI_MODE
++
++@hook TARGET_GEN_REV32V16QI
++
++@hook TARGET_GEN_AESEV16QI
++
++@hook TARGET_GEN_AESDV16QI
++
++@hook TARGET_GEN_AESMCV16QI
++
++@hook TARGET_GEN_AESIMCV16QI
++
+ @hook TARGET_CANNOT_MODIFY_JUMPS_P
+ 
+ @hook TARGET_HAVE_CONDITIONAL_EXECUTION
+diff --git a/gcc/passes.def b/gcc/passes.def
+index a30e05688..b7d4f7b4e 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -475,6 +475,7 @@ along with GCC; see the file COPYING3.  If not see
+       NEXT_PASS (pass_rtl_fwprop_addr);
+       NEXT_PASS (pass_inc_dec);
+       NEXT_PASS (pass_initialize_regs);
++      NEXT_PASS (pass_crypto_accel);
+       NEXT_PASS (pass_ud_rtl_dce);
+       NEXT_PASS (pass_combine);
+       NEXT_PASS (pass_if_after_combine);
+diff --git a/gcc/rtl-matcher.h b/gcc/rtl-matcher.h
+new file mode 100644
+index 000000000..6aed8d98d
+--- /dev/null
++++ b/gcc/rtl-matcher.h
+@@ -0,0 +1,367 @@
++/* Helpers for RTL pattern matchers.
++   Copyright (C) 2003-2023 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free
++Software Foundation; either version 3, or (at your option) any later
++version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT ANY
++WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3.  If not see
++<http://www.gnu.org/licenses/>.  */
++
++#ifndef GCC_RTL_MATCHER_H
++#define GCC_RTL_MATCHER_H
++
++#include "config.h"
++#include "system.h"
++#include "rtl.h"
++#include "df.h"
++
++/* Return the use-def chain of the first use in INSN that refers to REG.
++   Return NULL if INSN does not use REG, or if that first use is a SUBREG
++   of REG.  */
++static df_link *
++get_def_chain (rtx_insn *insn, rtx reg)
++{
++  df_ref use;
++
++  FOR_EACH_INSN_USE (use, insn)
++    {
++      rtx used = DF_REF_REG (use);
++      const bool via_subreg = GET_CODE (used) == SUBREG;
++
++      if (via_subreg)
++	used = SUBREG_REG (used);
++      else
++	gcc_assert (REG_P (used));
++
++      if (REGNO (used) == REGNO (reg))
++	return via_subreg ? NULL : DF_REF_CHAIN (use);
++    }
++  return NULL;
++}
++
++/* Check that REF belongs to an insn and, if REG is a global hard register,
++   that the insn sets REG.  global_regs[] only covers hard registers.  */
++static bool
++check_def_chain_ref (df_ref ref, rtx reg)
++{
++  if (!ref || !DF_REF_INSN_INFO (ref))
++    return false;
++  return !(HARD_REGISTER_P (reg) && global_regs[REGNO (reg)])
++    || set_of (reg, DF_REF_INSN (ref));
++}
++
++/* Return the only insn defining REG as used in INSN.  Return NULL if REG is
++   not a register, no def chain is available for it, several definitions
++   reach the use, or the definition fails check_def_chain_ref.  */
++static rtx_insn *
++get_single_def_insn (rtx_insn *insn, rtx reg)
++{
++  if (!REG_P (reg))
++    return NULL;
++
++  /* get_def_chain returns NULL on purpose (e.g. for SUBREG uses).  */
++  df_link *ref_chain = get_def_chain (insn, reg);
++  if (!ref_chain || ref_chain->next
++      || !check_def_chain_ref (ref_chain->ref, reg))
++    return NULL;
++  return DF_REF_INSN (ref_chain->ref);
++}
++
++/* Return the only insn that uses the value INSN stores in REG, or NULL when
++   REG is not a register or that definition does not have exactly one use
++   located in an insn.  INSN is required to define REG.  */
++static rtx_insn *
++get_single_use_insn (rtx_insn *insn, rtx reg)
++{
++  if (!REG_P (reg))
++    return NULL;
++
++  /* DF_REF_REG may be a SUBREG, on which REGNO is not valid, so compare
++     through DF_REF_REGNO.  */
++  df_ref def;
++  FOR_EACH_INSN_DEF (def, insn)
++    if (DF_REF_REGNO (def) == REGNO (reg))
++      break;
++  gcc_assert (def && "Broken def-use analysis chain.");
++
++  df_link *ref_chain = DF_REF_CHAIN (def);
++  if (!ref_chain || ref_chain->next || !ref_chain->ref
++      || !DF_REF_INSN_INFO (ref_chain->ref))
++    return NULL;
++  return DF_REF_INSN (ref_chain->ref);
++}
++
++/* If INSN is a single set whose source has code OP1 or OP2, return that
++   source; otherwise return NULL_RTX.  */
++template <rtx_code OP1, rtx_code OP2>
++static rtx
++get_single_set_op (rtx_insn *insn)
++{
++  rtx set = single_set (insn);
++  if (set)
++    {
++      rtx value = SET_SRC (set);
++      if (GET_CODE (value) == OP1 || GET_CODE (value) == OP2)
++	return value;
++    }
++  return NULL_RTX;
++}
++
++/* Single-opcode form: same as get_single_set_op<OP, OP> (INSN).  */
++template <rtx_code OP>
++static rtx
++get_single_set_op (rtx_insn *insn)
++{
++  return get_single_set_op<OP, OP> (insn);
++}
++
++/* If INSN is a single set whose source has code OP and whose operand 1 is a
++   CONST_INT, return that constant; otherwise return NULL_RTX.  */
++template<rtx_code OP>
++static rtx
++get_op_const_cst (rtx_insn *insn)
++{
++  rtx expr = get_single_set_op<OP> (insn);
++  if (expr && CONST_INT_P (XEXP (expr, 1)))
++    return XEXP (expr, 1);
++
++  return NULL_RTX;
++}
++
++/* If INSN is a single set whose destination has code OP, return that
++   destination.  Return NULL_RTX when INSN is not a single set or the
++   destination has a different code.  */
++template <rtx_code OP>
++static rtx
++get_single_set_dst (rtx_insn *insn)
++{
++  rtx set = single_set (insn);
++  rtx target = set ? SET_DEST (set) : NULL_RTX;
++
++  if (target && GET_CODE (target) != OP)
++    target = NULL_RTX;
++
++  return target;
++}
++
++/* Return the destination of INSN when it is a single set, otherwise
++   NULL_RTX.  */
++static rtx
++get_single_set_dst (rtx_insn *insn)
++{
++  rtx set = single_set (insn);
++  if (set)
++    return SET_DEST (set);
++  return NULL_RTX;
++}
++
++/* Return true if INSN is a single set whose source has code OP.  SRC and
++   DST receive the source and destination of the set whenever INSN is a
++   single set, even if the code check then fails; they are left untouched
++   when INSN is not a single set.  */
++template <rtx_code OP>
++static bool
++check_simple_op (rtx_insn *insn, rtx &src, rtx &dst)
++{
++  rtx set = single_set (insn);
++  if (set == NULL_RTX)
++    return false;
++
++  /* Fill the outputs before testing the opcode.  */
++  src = SET_SRC (set);
++  dst = SET_DEST (set);
++
++  return GET_CODE (src) == OP;
++}
++
++/* Minimal term info of the RTL matcher.  All of the custom matchers should
++   inherit from it.
++
++   It records the matched pattern (SRC), the instruction it was found in
++   (LOC) and whether the term is an operator inside the matched tree (IS_OP).
++   The default constructor yields an empty term (NULL, NULL_RTX, false).  */
++struct minimal_term_info
++{
++  minimal_term_info ()
++    : loc (NULL), src (NULL_RTX), is_op (false) {}
++  minimal_term_info (rtx_insn *loc, rtx src, bool is_op = false)
++    : loc (loc), src (src), is_op (is_op)
++    {}
++
++  rtx_insn *loc;
++  rtx src;
++  bool is_op;
++};
++
++/* Term info for memory matcher; OFFSET is 0 unless given explicitly.  */
++struct mem_term_info : minimal_term_info
++{
++  mem_term_info ()
++    : offset (0) {}
++  mem_term_info (rtx_insn *loc, rtx src, unsigned HOST_WIDE_INT offset = 0)
++    : minimal_term_info (loc, src), offset (offset)
++    {}
++
++  unsigned HOST_WIDE_INT offset;
++};
++
++/* Wraps term type T as a leaf matcher: one holder slot, no operator slots.  */
++template<typename T = minimal_term_info>
++struct matcher_term
++{
++  /* Storage requirements that the operator matchers add up.  */
++  static const int holder_size = 1;
++  static const int op_num = 0;
++  typedef T term_type;
++  typedef term_type holder_type[holder_size];
++};
++
++/* Leaf matcher: accepts an rtx whose code is ARGOP and records it, together
++   with the insn it was found in, in the single holder slot.  */
++template<rtx_code ARGOP, typename TERM = minimal_term_info>
++struct arg_op_matcher : matcher_term<TERM>
++{
++  typedef typename matcher_term<TERM>::holder_type holder_type;
++
++  /* A leaf never matches a whole instruction.  */
++  static bool match (rtx_insn *, holder_type &)
++    { return false; }
++
++  static bool match (rtx src, rtx_insn *insn, holder_type &m)
++    {
++      const bool found = GET_CODE (src) == ARGOP;
++      if (found)
++	static_cast<minimal_term_info &> (m[0])
++	  = minimal_term_info (insn, src);
++      return found;
++    }
++};
++
++/* Leaf matcher for CONST_INT rtxes; T is the term type used for storage.  */
++template<typename T>
++struct int_cst_matcher : arg_op_matcher <CONST_INT, T>
++{};
++
++/* Unary operator matcher: code OP1 or OP2 with operand 0 matched by ARG.  */
++template<rtx_code OP1, typename ARG, bool store_op = false, rtx_code OP2 = OP1>
++struct unop_matcher
++{
++  /* Holder layout: the operator term first if store_op, then ARG's terms.  */
++  static const int holder_size = ARG::holder_size + store_op;
++  static const int op_num = ARG::op_num + store_op;
++  typedef typename ARG::term_type term_type;
++  typedef term_type holder_type[holder_size];
++
++  static bool match (rtx_insn *insn, holder_type &m)
++    {
++      rtx src = get_single_set_op<OP1, OP2> (insn);
++      return src && match (src, insn, m);
++    }
++
++  static bool match (rtx src, rtx_insn *insn, holder_type &m)
++    {
++      if (REG_P (src))
++	{
++	  insn = get_single_def_insn (insn, src);
++	  if (insn && (src = single_set (insn)))
++	    src = SET_SRC (src);
++	}
++      /* For a register SRC, INSN/SRC now denote its single definition.  */
++      if (!src || !insn || (GET_CODE (src) != OP1 && GET_CODE (src) != OP2))
++	return false;
++
++      /* Record the operator itself in slot 0 when store_op is set.  */
++      if (store_op)
++	static_cast<minimal_term_info &> (m[0]) = minimal_term_info (insn, src,
++								     true);
++      /* Try the insn defining operand 0 first, then the operand in place.  */
++      rtx op = XEXP (src, 0);
++      rtx_insn *def = get_single_def_insn (insn, op);
++      typename ARG::holder_type &m_arg
++	= (typename ARG::holder_type &) *(m + store_op);
++      return (def && ARG::match (def, m_arg)) || ARG::match (op, insn, m_arg);
++    }
++};
++
++/* Binary operator matcher: code OP1 or OP2, operands matched by LHS and RHS.  */
++template<rtx_code OP1, typename LHS, typename RHS, bool COMMUTATIVE = false,
++	 bool store_op = false, rtx_code OP2 = OP1>
++struct binop_matcher
++{
++  /* Holder size and operator count are the sums over both operands.  */
++  static const int holder_size = LHS::holder_size + RHS::holder_size + store_op;
++  static const int op_num = LHS::op_num + RHS::op_num + store_op;
++  typedef typename LHS::term_type term_type;
++  typedef term_type holder_type[holder_size];
++
++  static bool match (rtx_insn *insn, holder_type &m)
++    {
++      rtx src = get_single_set_op<OP1, OP2> (insn);
++      return src && match (src, insn, m);
++    }
++
++  static bool match (rtx src, rtx_insn *insn, holder_type &m)
++    {
++      if (GET_CODE (src) != OP1 && GET_CODE (src) != OP2)
++	return false;
++
++      /* Record the operator itself in slot 0 when store_op is set.  */
++      if (store_op)
++	static_cast<minimal_term_info &> (m[0]) = minimal_term_info (insn, src,
++								     true);
++      /* Operands, and their single defining insns when there are any.  */
++      rtx lhs_op = XEXP (src, 0);
++      rtx rhs_op = XEXP (src, 1);
++      rtx_insn *lhs_def = get_single_def_insn (insn, lhs_op);
++      rtx_insn *rhs_def = get_single_def_insn (insn, rhs_op);
++      /* Try the operands as written, then swapped when COMMUTATIVE.  */
++      return match (lhs_def, rhs_def, lhs_op, rhs_op, insn, m)
++	|| (COMMUTATIVE && match (rhs_def, lhs_def, rhs_op, lhs_op, insn, m));
++    }
++
++private:
++  static bool match (rtx_insn *lhs_def, rtx_insn *rhs_def,
++		     rtx lhs_op, rtx rhs_op, rtx_insn *insn,
++		     holder_type &m)
++    {
++      /* Force template instantiation error on non-matching term types.  */
++      gcc_assert ((typename LHS::term_type *) NULL
++		  == (typename RHS::term_type *) NULL);
++
++      /* Slots: [operator if store_op] [LHS terms] [RHS terms].  */
++      typename LHS::holder_type &m_lhs
++	= (typename LHS::holder_type &) *(m + store_op);
++      typename RHS::holder_type &m_rhs
++	= (typename RHS::holder_type &) *(m + store_op
++					  + LHS::holder_size);
++
++      /* Try to match both defining instructions.  */
++      if (lhs_def && rhs_def && LHS::match (lhs_def, m_lhs)
++	  && RHS::match (rhs_def, m_rhs))
++	return true;
++      /* Try to match LHS's defining instruction and RHS's pattern.  */
++      else if (lhs_def && LHS::match (lhs_def, m_lhs)
++	       && RHS::match (rhs_op, insn, m_rhs))
++	return true;
++      /* Try to match LHS's pattern and RHS's defining instruction.  */
++      else if (rhs_def && LHS::match (lhs_op, insn, m_lhs)
++	       && RHS::match (rhs_def, m_rhs))
++	return true;
++      /* Try to match both patterns in place.  */
++      else
++	return LHS::match (lhs_op, insn, m_lhs)
++	  && RHS::match (rhs_op, insn, m_rhs);
++    }
++};
++
++#endif // GCC_RTL_MATCHER_H
+diff --git a/gcc/target.def b/gcc/target.def
+index 8797a21d5..c9bb2b4c2 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2693,6 +2693,47 @@ contains post-index or pre-index operation.",
+   bool, (int icode, bool *has_wb),
+   NULL)
+ 
++DEFHOOK
++(get_v16qi_mode,
++ "This function get the 16 byte elements vector mode if target supports this.",
++ machine_mode, (),
++ NULL)
++
++DEFHOOK
++(gen_rev32v16qi,
++ "This function generate the byte reverse instruction\n\
++ of 16 byte elements vector if target supports this.",
++ rtx, (rtx dest, rtx src),
++ NULL)
++
++DEFHOOK
++(gen_aesev16qi,
++ "This function generate the AES encryption instruction\n\
++ of 16 byte elements vector if target supports this.",
++ rtx, (rtx dest, rtx src1, rtx src2),
++ NULL)
++
++DEFHOOK
++(gen_aesdv16qi,
++ "This function generate the AES decryption instruction\n\
++ of 16 byte elements vector if target supports this.",
++ rtx, (rtx dest, rtx src1, rtx src2),
++ NULL)
++
++DEFHOOK
++(gen_aesmcv16qi,
++ "This function generate the AES mix columns instruction\n\
++ of 16 byte elements vector if target supports this.",
++ rtx, (rtx dest, rtx src),
++ NULL)
++
++DEFHOOK
++(gen_aesimcv16qi,
++ "This function generate the AES inversed mix columns instruction\n\
++ of 16 byte elements vector if target supports this.",
++ rtx, (rtx dest, rtx src),
++ NULL)
++
+ DEFHOOK
+ (gen_ccmp_first,
+  "This function prepares to emit a comparison insn for the first compare in a\n\
+diff --git a/gcc/testsuite/gcc.target/aarch64/aes-decrypt.c b/gcc/testsuite/gcc.target/aarch64/aes-decrypt.c
+new file mode 100644
+index 000000000..966ec5532
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/aes-decrypt.c
+@@ -0,0 +1,478 @@
++/* { dg-do run } */
++/* { dg-options "-O3 -fno-inline --save-temps -fcrypto-accel-aes -march=armv8.2-a+lse+crypto" } */
++
++#include <stdint.h>
++#include <arm_neon.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++typedef uint8_t u8;
++typedef uint32_t u32;
++
++static const u32 Td0[256] = {
++    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
++    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
++    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
++    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
++    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
++    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
++    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
++    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
++    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
++    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
++    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
++    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
++    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
++    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
++    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
++    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
++    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
++    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
++    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
++    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
++    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
++    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
++    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
++    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
++    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
++    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
++    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
++    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
++    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
++    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
++    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
++    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
++    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
++    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
++    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
++    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
++    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
++    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
++    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
++    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
++    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
++    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
++    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
++    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
++    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
++    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
++    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
++    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
++    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
++    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
++    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
++    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
++    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
++    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
++    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
++    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
++    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
++    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
++    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
++    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
++    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
++    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
++    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
++    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
++};
++
++static const u32 Td1[256] = {
++    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
++    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
++    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
++    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
++    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
++    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
++    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
++    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
++    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
++    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
++    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
++    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
++    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
++    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
++    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
++    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
++    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
++    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
++    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
++    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
++    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
++    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
++    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
++    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
++    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
++    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
++    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
++    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
++    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
++    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
++    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
++    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
++    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
++    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
++    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
++    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
++    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
++    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
++    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
++    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
++    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
++    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
++    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
++    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
++    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
++    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
++    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
++    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
++    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
++    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
++    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
++    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
++    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
++    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
++    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
++    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
++    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
++    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
++    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
++    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
++    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
++    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
++    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
++    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
++};
++
++static const u32 Td2[256] = {
++    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
++    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
++    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
++    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
++    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
++    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
++    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
++    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
++    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
++    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
++    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
++    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
++    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
++    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
++    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
++    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
++    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
++    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
++    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
++    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
++    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
++    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
++    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
++    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
++    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
++    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
++    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
++    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
++    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
++    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
++    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
++    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
++    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
++    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
++    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
++    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
++    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
++    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
++    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
++    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
++    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
++    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
++    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
++    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
++    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
++    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
++    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
++    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
++    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
++    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
++    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
++    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
++    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
++    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
++    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
++    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
++    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
++    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
++    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
++    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
++    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
++    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
++    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
++    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
++};
++
++static const u32 Td3[256] = {
++    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
++    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
++    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
++    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
++    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
++    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
++    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
++    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
++    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
++    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
++    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
++    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
++    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
++    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
++    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
++    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
++    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
++    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
++    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
++    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
++    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
++    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
++    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
++    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
++    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
++    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
++    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
++    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
++    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
++    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
++    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
++    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
++    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
++    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
++    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
++    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
++    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
++    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
++    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
++    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
++    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
++    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
++    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
++    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
++    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
++    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
++    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
++    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
++    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
++    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
++    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
++    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
++    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
++    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
++    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
++    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
++    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
++    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
++    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
++    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
++    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
++    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
++    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
++    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
++};
++
++static const u8 Td4[256] = {
++    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
++    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
++    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
++    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
++    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
++    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
++    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
++    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
++    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
++    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
++    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
++    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
++    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
++    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
++    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
++    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
++    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
++    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
++    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
++    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
++    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
++    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
++    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
++    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
++    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
++    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
++    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
++    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
++    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
++    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
++    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
++    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
++};
++/* Assemble a u32 from the four bytes at PT, most significant byte first.  */
++#define GETU32(pt)	   \
++  (			   \
++    ((u32)(pt)[0] << 24)   \
++    ^ ((u32)(pt)[1] << 16) \
++    ^ ((u32)(pt)[2] <<  8) \
++    ^ ((u32)(pt)[3])       \
++  )
++/* Store ST into the four bytes at CT, MSB first; safe as one statement.  */
++#define PUTU32(ct, st)		\
++  do {				\
++    (ct)[0] = (u8)((st) >> 24); \
++    (ct)[1] = (u8)((st) >> 16); \
++    (ct)[2] = (u8)((st) >>  8); \
++    (ct)[3] = (u8)(st);		\
++  } while (0)
++/* Table-driven AES decryption of 16 bytes from IN to OUT, keyed by RK.  */
++void
++aes_decrypt (const unsigned char *in, unsigned char *out,
++	     const u32 *rk, int nr)
++{
++  u32 s0, s1, s2, s3, t0, t1, t2, t3;
++  /* NR / 2 loop passes (8 words of RK each); assumes NR >= 2.  */
++  int r = nr >> 1;
++  /* Load the block as four big-endian words, XORed with rk[0..3].  */
++  s0 = GETU32 (in     ) ^ rk[0];
++  s1 = GETU32 (in +  4) ^ rk[1];
++  s2 = GETU32 (in +  8) ^ rk[2];
++  s3 = GETU32 (in + 12) ^ rk[3];
++  /* Each pass: t0..t3 from s0..s3, then (unless last) s0..s3 from t0..t3.  */
++  for (;;) {
++      t0 =
++	Td0[(s0 >> 24)       ] ^
++	Td1[(s3 >> 16) & 0xff] ^
++	Td2[(s2 >>  8) & 0xff] ^
++	Td3[(s1      ) & 0xff] ^
++	rk[4];
++      t1 =
++	Td0[(s1 >> 24)       ] ^
++	Td1[(s0 >> 16) & 0xff] ^
++	Td2[(s3 >>  8) & 0xff] ^
++	Td3[(s2      ) & 0xff] ^
++	rk[5];
++      t2 =
++	Td0[(s2 >> 24)       ] ^
++	Td1[(s1 >> 16) & 0xff] ^
++	Td2[(s0 >>  8) & 0xff] ^
++	Td3[(s3      ) & 0xff] ^
++	rk[6];
++      t3 =
++	Td0[(s3 >> 24)       ] ^
++	Td1[(s2 >> 16) & 0xff] ^
++	Td2[(s1 >>  8) & 0xff] ^
++	Td3[(s0      ) & 0xff] ^
++	rk[7];
++      /* Step to the next 8 key words; the last pass leaves via break.  */
++      rk += 8;
++      if (--r == 0) {
++	  break;
++      }
++      /* Rebuild s0..s3 from t0..t3 with the new rk[0..3].  */
++      s0 =
++	Td0[(t0 >> 24)       ] ^
++	Td1[(t3 >> 16) & 0xff] ^
++	Td2[(t2 >>  8) & 0xff] ^
++	Td3[(t1      ) & 0xff] ^
++	rk[0];
++      s1 =
++	Td0[(t1 >> 24)       ] ^
++	Td1[(t0 >> 16) & 0xff] ^
++	Td2[(t3 >>  8) & 0xff] ^
++	Td3[(t2      ) & 0xff] ^
++	rk[1];
++      s2 =
++	Td0[(t2 >> 24)       ] ^
++	Td1[(t1 >> 16) & 0xff] ^
++	Td2[(t0 >>  8) & 0xff] ^
++	Td3[(t3      ) & 0xff] ^
++	rk[2];
++      s3 =
++	Td0[(t3 >> 24)       ] ^
++	Td1[(t2 >> 16) & 0xff] ^
++	Td2[(t1 >>  8) & 0xff] ^
++	Td3[(t0      ) & 0xff] ^
++	rk[3];
++    }
++    /* Final step: Td4 byte lookups, XOR rk[0..3], store big-endian.  */
++    s0 =
++	((u32)Td4[(t0 >> 24)       ] << 24) ^
++	((u32)Td4[(t3 >> 16) & 0xff] << 16) ^
++	((u32)Td4[(t2 >>  8) & 0xff] <<  8) ^
++	((u32)Td4[(t1      ) & 0xff])       ^
++	rk[0];
++    PUTU32 (out     , s0);
++
++    s1 =
++	((u32)Td4[(t1 >> 24)       ] << 24) ^
++	((u32)Td4[(t0 >> 16) & 0xff] << 16) ^
++	((u32)Td4[(t3 >>  8) & 0xff] <<  8) ^
++	((u32)Td4[(t2      ) & 0xff])       ^
++	rk[1];
++    PUTU32 (out +  4, s1);
++
++    s2 =
++	((u32)Td4[(t2 >> 24)       ] << 24) ^
++	((u32)Td4[(t1 >> 16) & 0xff] << 16) ^
++	((u32)Td4[(t0 >>  8) & 0xff] <<  8) ^
++	((u32)Td4[(t3      ) & 0xff])       ^
++	rk[2];
++    PUTU32 (out +  8, s2);
++
++    s3 =
++	((u32)Td4[(t3 >> 24)       ] << 24) ^
++	((u32)Td4[(t2 >> 16) & 0xff] << 16) ^
++	((u32)Td4[(t1 >>  8) & 0xff] <<  8) ^
++	((u32)Td4[(t0      ) & 0xff])       ^
++	rk[3];
++    PUTU32 (out + 12, s3);
++}
++
++int main ()
++{
++  /* Known-answer check: decrypting INPUT under KEY must give EXPECTED.  */
++  const u8 input[16] = { 0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb,
++			 0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32 };
++  const u8 expected[16] = { 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d,
++			    0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34 };
++  /* 176 bytes = 44 round-key words, laid out in the target's byte order.  */
++  const u8 key[] = { 0xa8, 0xf9, 0x14, 0xd0, 0x89, 0x25, 0xee, 0xc9,
++		     0xc8, 0x0c, 0x3f, 0xe1, 0xa6, 0x0c, 0x63, 0xb6,
++		     0x63, 0x5a, 0x7b, 0x0c, 0xfe, 0xea, 0x19, 0x13,
++		     0x90, 0x88, 0x39, 0xb0, 0xb4, 0xfb, 0x4c, 0x66,
++		     0x5a, 0x92, 0x7d, 0xdf, 0x9d, 0xb0, 0x62, 0x1f,
++		     0x6e, 0x62, 0x20, 0xa3, 0x24, 0x73, 0x75, 0xd6,
++		     0x47, 0x76, 0xc0, 0x12, 0xc7, 0x22, 0x1f, 0xc0,
++		     0xf3, 0xd2, 0x42, 0xbc, 0x4a, 0x11, 0x55, 0x75,
++		     0x76, 0xd8, 0xfc, 0x6e, 0x80, 0x54, 0xdf, 0xd2,
++		     0x34, 0xf0, 0x5d, 0x7c, 0xb9, 0xc3, 0x17, 0xc9,
++		     0xfc, 0x0a, 0xa3, 0x6e, 0xf6, 0x8c, 0x23, 0xbc,
++		     0xb4, 0xa4, 0x82, 0xae, 0x8d, 0x33, 0x4a, 0xb5,
++		     0x13, 0x44, 0x88, 0x90, 0x0a, 0x86, 0x80, 0xd2,
++		     0x42, 0x28, 0xa1, 0x12, 0x39, 0x97, 0xc8, 0x1b,
++		     0xf7, 0x13, 0x1f, 0x7c, 0x19, 0xc2, 0x08, 0x42,
++		     0x48, 0xae, 0x21, 0xc0, 0x7b, 0xbf, 0x69, 0x09,
++		     0xeb, 0x05, 0x75, 0xcc, 0xee, 0xd1, 0x17, 0x3e,
++		     0x51, 0x6c, 0x29, 0x82, 0x33, 0x11, 0x48, 0xc9,
++		     0xa7, 0x08, 0x37, 0x2b, 0x05, 0xd4, 0x62, 0xf2,
++		     0xbf, 0xbd, 0x3e, 0xbc, 0x62, 0x7d, 0x61, 0x4b,
++		     0x16, 0x15, 0x7e, 0x2b, 0xa6, 0xd2, 0xae, 0x28,
++		     0x88, 0x15, 0xf7, 0xab, 0x3c, 0x4f, 0xcf, 0x09 };
++
++  u8 output[16] = { 0 };
++  u32 rk[sizeof (key) / sizeof (u32)];
++  /* Use real u32 storage: (u32 *) key on a u8 array is alignment/aliasing UB.  */
++  memcpy (rk, key, sizeof (key));
++  aes_decrypt (input, output, rk, 10);
++  if (memcmp (output, expected, 16) != 0)
++    abort ();
++  return 0;
++}
++
++/* { dg-final { scan-assembler "rev32" } } */
++/* { dg-final { scan-assembler "aesimc" } } */
++/* { dg-final { scan-assembler "aesd" } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/aes-encrypt.c b/gcc/testsuite/gcc.target/aarch64/aes-encrypt.c
+new file mode 100644
+index 000000000..e3f3c446f
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/aes-encrypt.c
+@@ -0,0 +1,443 @@
++/* { dg-do run } */
++/* { dg-options "-O3 -fno-inline --save-temps -fcrypto-accel-aes -march=armv8.2-a+lse+crypto" } */
++
++#include <stdint.h>
++#include <arm_neon.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++typedef uint8_t u8;
++typedef uint32_t u32;
++
++static const u32 Te0[256] = {
++    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
++    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
++    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
++    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
++    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
++    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
++    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
++    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
++    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
++    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
++    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
++    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
++    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
++    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
++    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
++    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
++    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
++    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
++    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
++    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
++    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
++    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
++    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
++    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
++    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
++    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
++    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
++    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
++    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
++    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
++    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
++    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
++    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
++    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
++    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
++    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
++    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
++    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
++    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
++    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
++    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
++    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
++    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
++    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
++    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
++    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
++    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
++    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
++    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
++    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
++    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
++    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
++    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
++    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
++    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
++    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
++    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
++    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
++    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
++    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
++    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
++    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
++    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
++    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
++};
++
++static const u32 Te1[256] = {
++    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
++    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
++    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
++    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
++    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
++    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
++    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
++    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
++    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
++    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
++    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
++    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
++    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
++    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
++    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
++    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
++    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
++    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
++    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
++    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
++    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
++    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
++    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
++    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
++    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
++    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
++    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
++    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
++    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
++    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
++    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
++    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
++    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
++    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
++    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
++    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
++    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
++    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
++    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
++    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
++    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
++    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
++    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
++    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
++    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
++    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
++    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
++    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
++    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
++    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
++    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
++    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
++    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
++    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
++    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
++    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
++    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
++    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
++    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
++    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
++    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
++    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
++    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
++    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
++};
++
++static const u32 Te2[256] = {
++    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
++    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
++    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
++    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
++    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
++    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
++    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
++    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
++    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
++    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
++    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
++    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
++    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
++    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
++    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
++    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
++    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
++    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
++    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
++    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
++    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
++    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
++    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
++    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
++    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
++    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
++    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
++    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
++    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
++    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
++    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
++    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
++    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
++    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
++    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
++    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
++    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
++    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
++    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
++    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
++    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
++    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
++    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
++    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
++    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
++    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
++    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
++    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
++    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
++    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
++    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
++    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
++    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
++    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
++    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
++    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
++    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
++    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
++    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
++    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
++    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
++    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
++    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
++    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
++};
++
++static const u32 Te3[256] = {
++    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
++    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
++    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
++    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
++    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
++    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
++    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
++    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
++    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
++    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
++    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
++    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
++    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
++    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
++    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
++    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
++    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
++    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
++    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
++    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
++    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
++    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
++    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
++    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
++    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
++    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
++    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
++    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
++    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
++    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
++    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
++    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
++    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
++    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
++    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
++    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
++    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
++    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
++    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
++    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
++    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
++    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
++    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
++    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
++    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
++    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
++    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
++    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
++    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
++    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
++    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
++    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
++    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
++    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
++    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
++    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
++    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
++    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
++    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
++    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
++    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
++    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
++    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
++    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
++};
++/* Assemble a u32 from the four bytes at PT, most significant byte first.  */
++#define GETU32(pt)	   \
++  (			   \
++    ((u32)(pt)[0] << 24)   \
++    ^ ((u32)(pt)[1] << 16) \
++    ^ ((u32)(pt)[2] <<  8) \
++    ^ ((u32)(pt)[3])       \
++  )
++/* Store ST into the four bytes at CT, MSB first; safe as one statement.  */
++#define PUTU32(ct, st)		\
++  do {				\
++    (ct)[0] = (u8)((st) >> 24); \
++    (ct)[1] = (u8)((st) >> 16); \
++    (ct)[2] = (u8)((st) >>  8); \
++    (ct)[3] = (u8)(st);		\
++  } while (0)
++/* Table-driven AES encryption of 16 bytes from IN to OUT, keyed by RK.  */
++void
++aes_encrypt (const unsigned char *in, unsigned char *out,
++	     const u32 *rk, int nr)
++{
++  u32 s0, s1, s2, s3, t0, t1, t2, t3;
++  /* NR / 2 loop passes (8 words of RK each); assumes NR >= 2.  */
++  int r = nr >> 1;
++  /* Load the block as four big-endian words, XORed with rk[0..3].  */
++  s0 = GETU32 (in     ) ^ rk[0];
++  s1 = GETU32 (in +  4) ^ rk[1];
++  s2 = GETU32 (in +  8) ^ rk[2];
++  s3 = GETU32 (in + 12) ^ rk[3];
++  /* Each pass: t0..t3 from s0..s3, then (unless last) s0..s3 from t0..t3.  */
++  for (;;) {
++    t0 =
++	Te0[(s0 >> 24)       ] ^
++	Te1[(s1 >> 16) & 0xff] ^
++	Te2[(s2 >>  8) & 0xff] ^
++	Te3[(s3      ) & 0xff] ^
++	rk[4];
++    t1 =
++	Te0[(s1 >> 24)       ] ^
++	Te1[(s2 >> 16) & 0xff] ^
++	Te2[(s3 >>  8) & 0xff] ^
++	Te3[(s0      ) & 0xff] ^
++	rk[5];
++    t2 =
++	Te0[(s2 >> 24)       ] ^
++	Te1[(s3 >> 16) & 0xff] ^
++	Te2[(s0 >>  8) & 0xff] ^
++	Te3[(s1      ) & 0xff] ^
++	rk[6];
++    t3 =
++	Te0[(s3 >> 24)       ] ^
++	Te1[(s0 >> 16) & 0xff] ^
++	Te2[(s1 >>  8) & 0xff] ^
++	Te3[(s2      ) & 0xff] ^
++	rk[7];
++    /* Step to the next 8 key words; the last pass leaves via break.  */
++    rk += 8;
++    if (--r == 0)
++	break;
++    /* Rebuild s0..s3 from t0..t3 with the new rk[0..3].  */
++    s0 =
++	Te0[(t0 >> 24)       ] ^
++	Te1[(t1 >> 16) & 0xff] ^
++	Te2[(t2 >>  8) & 0xff] ^
++	Te3[(t3      ) & 0xff] ^
++	rk[0];
++    s1 =
++	Te0[(t1 >> 24)       ] ^
++	Te1[(t2 >> 16) & 0xff] ^
++	Te2[(t3 >>  8) & 0xff] ^
++	Te3[(t0      ) & 0xff] ^
++	rk[1];
++    s2 =
++	Te0[(t2 >> 24)       ] ^
++	Te1[(t3 >> 16) & 0xff] ^
++	Te2[(t0 >>  8) & 0xff] ^
++	Te3[(t1      ) & 0xff] ^
++	rk[2];
++    s3 =
++	Te0[(t3 >> 24)       ] ^
++	Te1[(t0 >> 16) & 0xff] ^
++	Te2[(t1 >>  8) & 0xff] ^
++	Te3[(t2      ) & 0xff] ^
++	rk[3];
++  }
++  /* Final step: one masked byte from each Te table, XOR rk[0..3], store.  */
++  s0 =
++      (Te2[(t0 >> 24)       ] & 0xff000000) ^
++      (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
++      (Te0[(t2 >>  8) & 0xff] & 0x0000ff00) ^
++      (Te1[(t3      ) & 0xff] & 0x000000ff) ^
++      rk[0];
++  PUTU32 (out     , s0);
++
++  s1 =
++      (Te2[(t1 >> 24)       ] & 0xff000000) ^
++      (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
++      (Te0[(t3 >>  8) & 0xff] & 0x0000ff00) ^
++      (Te1[(t0      ) & 0xff] & 0x000000ff) ^
++      rk[1];
++  PUTU32 (out +  4, s1);
++
++  s2 =
++      (Te2[(t2 >> 24)       ] & 0xff000000) ^
++      (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
++      (Te0[(t0 >>  8) & 0xff] & 0x0000ff00) ^
++      (Te1[(t1      ) & 0xff] & 0x000000ff) ^
++      rk[2];
++  PUTU32 (out +  8, s2);
++
++  s3 =
++      (Te2[(t3 >> 24)       ] & 0xff000000) ^
++      (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
++      (Te0[(t1 >>  8) & 0xff] & 0x0000ff00) ^
++      (Te1[(t2      ) & 0xff] & 0x000000ff) ^
++      rk[3];
++  PUTU32 (out + 12, s3);
++}
++
++
++int main ()
++{
++  /* Known-answer check: encrypting INPUT under KEY must give EXPECTED.  */
++  const u8 input[16] = { 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d,
++			 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34 };
++  const u8 expected[16] = { 0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb,
++			    0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32 };
++  /* 176 bytes = 44 round-key words, laid out in the target's byte order.  */
++  const u8 key[] = { 0x16, 0x15, 0x7e, 0x2b, 0xa6, 0xd2, 0xae, 0x28,
++		     0x88, 0x15, 0xf7, 0xab, 0x3c, 0x4f, 0xcf, 0x09,
++		     0x17, 0xfe, 0xfa, 0xa0, 0xb1, 0x2c, 0x54, 0x88,
++		     0x39, 0x39, 0xa3, 0x23, 0x05, 0x76, 0x6c, 0x2a,
++		     0xf2, 0x95, 0xc2, 0xf2, 0x43, 0xb9, 0x96, 0x7a,
++		     0x7a, 0x80, 0x35, 0x59, 0x7f, 0xf6, 0x59, 0x73,
++		     0x7d, 0x47, 0x80, 0x3d, 0x3e, 0xfe, 0x16, 0x47,
++		     0x44, 0x7e, 0x23, 0x1e, 0x3b, 0x88, 0x7a, 0x6d,
++		     0x41, 0xa5, 0x44, 0xef, 0x7f, 0x5b, 0x52, 0xa8,
++		     0x3b, 0x25, 0x71, 0xb6, 0x00, 0xad, 0x0b, 0xdb,
++		     0xf8, 0xc6, 0xd1, 0xd4, 0x87, 0x9d, 0x83, 0x7c,
++		     0xbc, 0xb8, 0xf2, 0xca, 0xbc, 0x15, 0xf9, 0x11,
++		     0x7a, 0xa3, 0x88, 0x6d, 0xfd, 0x3e, 0x0b, 0x11,
++		     0x41, 0x86, 0xf9, 0xdb, 0xfd, 0x93, 0x00, 0xca,
++		     0x0e, 0xf7, 0x54, 0x4e, 0xf3, 0xc9, 0x5f, 0x5f,
++		     0xb2, 0x4f, 0xa6, 0x84, 0x4f, 0xdc, 0xa6, 0x4e,
++		     0x21, 0x73, 0xd2, 0xea, 0xd2, 0xba, 0x8d, 0xb5,
++		     0x60, 0xf5, 0x2b, 0x31, 0x2f, 0x29, 0x8d, 0x7f,
++		     0xf3, 0x66, 0x77, 0xac, 0x21, 0xdc, 0xfa, 0x19,
++		     0x41, 0x29, 0xd1, 0x28, 0x6e, 0x00, 0x5c, 0x57,
++		     0xa8, 0xf9, 0x14, 0xd0, 0x89, 0x25, 0xee, 0xc9,
++		     0xc8, 0x0c, 0x3f, 0xe1, 0xa6, 0x0c, 0x63, 0xb6 };
++
++  u8 output[16] = { 0 };
++  u32 rk[sizeof (key) / sizeof (u32)];
++  /* Use real u32 storage: (u32 *) key on a u8 array is alignment/aliasing UB.  */
++  memcpy (rk, key, sizeof (key));
++  aes_encrypt (input, output, rk, 10);
++  if (memcmp (output, expected, 16) != 0)
++    abort ();
++  return 0;
++}
++
++/* { dg-final { scan-assembler "rev32" } } */
++/* { dg-final { scan-assembler "aesmc" } } */
++/* { dg-final { scan-assembler "aese" } } */
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index 2ccecffb5..18a9f62cc 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -261,6 +261,7 @@ DEFTIMEVAR (TV_AUTO_INC_DEC          , "auto inc dec")
+ DEFTIMEVAR (TV_CSE2                  , "CSE 2")
+ DEFTIMEVAR (TV_BRANCH_PROB           , "branch prediction")
+ DEFTIMEVAR (TV_COMBINE               , "combiner")
++DEFTIMEVAR (TV_CRYPTO_ACCEL	     , "crypto accel")
+ DEFTIMEVAR (TV_IFCVT		     , "if-conversion")
+ DEFTIMEVAR (TV_MODE_SWITCH           , "mode switching")
+ DEFTIMEVAR (TV_SMS		     , "sms modulo scheduling")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 6daac7fc1..1733931c3 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -583,6 +583,7 @@ extern rtl_opt_pass *make_pass_cse2 (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_df_initialize_opt (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_df_initialize_no_opt (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_reginfo_init (gcc::context *ctxt);
++extern rtl_opt_pass *make_pass_crypto_accel (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_inc_dec (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_stack_ptr_mod (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_initialize_regs (gcc::context *ctxt);
+-- 
+2.33.0
+
diff --git a/0042-crypto-accel-add-optimization-level-requirement-to-t.patch b/0042-crypto-accel-add-optimization-level-requirement-to-t.patch
new file mode 100644
index 0000000000000000000000000000000000000000..49dfc1d3b8009ddce6d5752959f9825a0ee025fa
--- /dev/null
+++ b/0042-crypto-accel-add-optimization-level-requirement-to-t.patch
@@ -0,0 +1,27 @@
+From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001
+From: vchernon <chernonog.vyacheslav@huawei.com>
+Date: Wed, 27 Dec 2023 23:31:26 +0800
+Subject: [PATCH 16/18] [crypto-accel] add optimization level requirement to
+ the gate
+
+fix issue (src-openEuler/gcc: I8RRDW)
+---
+ gcc/crypto-accel.cc | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc
+index f4e810a6b..e7766a585 100644
+--- a/gcc/crypto-accel.cc
++++ b/gcc/crypto-accel.cc
+@@ -2391,7 +2391,7 @@ public:
+   /* opt_pass methods: */
+   virtual bool gate (function *)
+     {
+-      if (flag_crypto_accel_aes <= 0)
++      if (flag_crypto_accel_aes <= 0 || optimize < 1)
+ 	return false;
+       return targetm.get_v16qi_mode
+ 	&& targetm.gen_rev32v16qi
+-- 
+2.33.0
+
diff --git a/0043-Add-more-flexible-check-for-pointer-aliasing-during-.patch b/0043-Add-more-flexible-check-for-pointer-aliasing-during-.patch
new file mode 100644
index 0000000000000000000000000000000000000000..df88789c75c5829bff8dee4c5a4e7d817af2227d
--- /dev/null
+++ b/0043-Add-more-flexible-check-for-pointer-aliasing-during-.patch
@@ -0,0 +1,239 @@
+From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
+From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
+Date: Tue, 12 Mar 2024 23:30:56 +0800
+Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
+ vectorization It takes minimum between number of iteration and segment length
+ it helps to speed up loops with small number of iterations when only tail can
+ be vectorized
+
+---
+ gcc/params.opt                                |  5 ++
+ .../sve/var_stride_flexible_segment_len_1.c   | 23 +++++++
+ gcc/tree-data-ref.cc                          | 67 +++++++++++++------
+ gcc/tree-data-ref.h                           | 11 ++-
+ gcc/tree-vect-data-refs.cc                    | 14 +++-
+ 5 files changed, 95 insertions(+), 25 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 6176d4790..7e5c119cf 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
+ Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+ 
++-param=vect-alias-flexible-segment-len=
++Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
++Use a minimum length of different segments.  Currently the minimum between
++iteration number and vectorization length is chosen by this param.
++
+ -param=vect-max-version-for-alignment-checks=
+ Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+new file mode 100644
+index 000000000..894f075f3
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+@@ -0,0 +1,23 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
++
++#define TYPE int
++#define SIZE 257
++
++void __attribute__ ((weak))
++f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
++{
++  for (int i = 0; i < SIZE; ++i)
++    x[i * n] += y[i * n];
++}
++
++/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
++/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
++/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
++/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
++/* Should use a WAR check that multiplies by (VF-2)*4 rather than
++   an overlap check that multiplies by (257-1)*4.  */
++/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
++/* One range check and a check for n being zero.  */
++/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
+diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
+index 397792c35..e6ae9e847 100644
+--- a/gcc/tree-data-ref.cc
++++ b/gcc/tree-data-ref.cc
+@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
+    same arguments.  Try to optimize cases in which the second access
+    is a write and in which some overlap is valid.  */
+ 
+-static bool
+-create_waw_or_war_checks (tree *cond_expr,
++static void
++create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
+ 			  const dr_with_seg_len_pair_t &alias_pair)
+ {
+   const dr_with_seg_len& dr_a = alias_pair.first;
+   const dr_with_seg_len& dr_b = alias_pair.second;
+ 
+-  /* Check for cases in which:
+-
+-     (a) DR_B is always a write;
+-     (b) the accesses are well-ordered in both the original and new code
+-	 (see the comment above the DR_ALIAS_* flags for details); and
+-     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
+-  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+-    return false;
+-
+-  /* Check for equal (but possibly variable) steps.  */
+   tree step = DR_STEP (dr_a.dr);
+-  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+-    return false;
+-
+-  /* Make sure that we can operate on sizetype without loss of precision.  */
+   tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+-  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+-    return false;
+ 
+   /* All addresses involved are known to have a common alignment ALIGN.
+      We can therefore subtract ALIGN from an exclusive endpoint to get
+@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
+ 			       fold_convert (ssizetype, indicator),
+ 			       ssize_int (0));
+ 
+-  /* Get lengths in sizetype.  */
+-  tree seg_len_a
+-    = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
+   step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
+ 
+   /* Each access has the following pattern:
+@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
+   *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
+   if (dump_enabled_p ())
+     dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
++}
++
++/* Wrapper for create_waw_or_war_checks2; may OR in a SEG_LEN2-based check.  */
++static bool
++create_waw_or_war_checks (tree *cond_expr,
++			  const dr_with_seg_len_pair_t &alias_pair)
++{
++  const dr_with_seg_len& dr_a = alias_pair.first;
++  const dr_with_seg_len& dr_b = alias_pair.second;
++
++  /* Check for cases in which:
++
++     (a) DR_B is always a write;
++     (b) the accesses are well-ordered in both the original and new code
++	 (see the comment above the DR_ALIAS_* flags for details); and
++     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
++  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
++    return false;
++
++  /* Check for equal (but possibly variable) steps.  */
++  tree step = DR_STEP (dr_a.dr);
++  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
++    return false;
++
++  /* Make sure that we can operate on sizetype without loss of precision.  */
++  tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
++  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
++    return false;
++
++  /* Get the primary segment length in sizetype and build the main check.  */
++  tree seg_len_a
++    = fold_convert (sizetype,
++		    rewrite_to_non_trapping_overflow (dr_a.seg_len));
++  create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
++  if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
++    {
++      tree seg_len2_a
++	= fold_convert (sizetype,
++			rewrite_to_non_trapping_overflow (dr_a.seg_len2));
++      tree cond_expr2;
++      create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
++      *cond_expr =  fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
++				 *cond_expr, cond_expr2);
++   }
++  return true;
++}
+ 
+diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
+index f643a95b2..9bc5f16ee 100644
+--- a/gcc/tree-data-ref.h
++++ b/gcc/tree-data-ref.h
+@@ -213,12 +213,19 @@ class dr_with_seg_len
+ public:
+   dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
+ 		   unsigned int a)
+-    : dr (d), seg_len (len), access_size (size), align (a) {}
+-
++    : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
++    {}
++  dr_with_seg_len (data_reference_p d, tree len, tree len2,
++		   unsigned HOST_WIDE_INT size, unsigned int a)
++    : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
++    {}
+   data_reference_p dr;
+   /* The offset of the last access that needs to be checked minus
+      the offset of the first.  */
+   tree seg_len;
++  /* The second version of segment length.  Currently this is used to
++     soften checks for a small number of iterations.  */
++  tree seg_len2;
+   /* A value that, when added to abs (SEG_LEN), gives the total number of
+      bytes in the segment.  */
+   poly_uint64 access_size;
+diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
+index 4e615b80b..04e68f621 100644
+--- a/gcc/tree-vect-data-refs.cc
++++ b/gcc/tree-vect-data-refs.cc
+@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+     {
+       poly_uint64 lower_bound;
+       tree segment_length_a, segment_length_b;
++      tree segment_length2_a, segment_length2_b;
+       unsigned HOST_WIDE_INT access_size_a, access_size_b;
+       unsigned int align_a, align_b;
+ 
+@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ 	{
+ 	  segment_length_a = size_zero_node;
+ 	  segment_length_b = size_zero_node;
++	  segment_length2_a = size_zero_node;
++	  segment_length2_b = size_zero_node;
+ 	}
+       else
+ 	{
+@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ 	    length_factor = scalar_loop_iters;
+ 	  else
+ 	    length_factor = size_int (vect_factor);
++	  /* Always remember the scalar_loop_iters-based segment lengths too:
++	     they allow a more flexible aliasing check for loops with a small
++	     number of iterations.  */
+ 	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
+ 	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
++	  segment_length2_a
++	    = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
++	  segment_length2_b
++	    = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
+ 	}
+       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
+       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
+@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ 	}
+ 
+       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
+-			    access_size_a, align_a);
++			    segment_length2_a, access_size_a, align_a);
+       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
+-			    access_size_b, align_b);
++			    segment_length2_b, access_size_b, align_b);
+       /* Canonicalize the order to be the one that's needed for accurate
+ 	 RAW, WAR and WAW flags, in cases where the data references are
+ 	 well-ordered.  The order doesn't really matter otherwise,
+-- 
+2.33.0
+
diff --git a/0045-Port-fixes-for-IPA-prefetch-to-GCC-12.patch b/0045-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
new file mode 100644
index 0000000000000000000000000000000000000000..dae19fa2574f55f6d82b4d3088d46691eff58bfb
--- /dev/null
+++ b/0045-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
@@ -0,0 +1,2216 @@
+From 4c262af8e178ac7c81b32be5b159b4d09a5841c9 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Fri, 8 Mar 2024 07:07:50 +0800
+Subject: [PATCH 1/2] Port fixes for IPA prefetch to GCC 12
+
+---
+ gcc/ipa-devirt.cc                          |    9 +-
+ gcc/ipa-prefetch.cc                        |  174 +-
+ gcc/ipa-sra.cc                             |    7 +
+ gcc/params.opt                             |    4 +-
+ gcc/testsuite/gcc.dg/completion-1.c        |    1 +
+ gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c | 1843 ++++++++++++++++++++
+ 6 files changed, 1974 insertions(+), 64 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c
+
+diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
+index dd3562d56..dd000b401 100644
+--- a/gcc/ipa-devirt.cc
++++ b/gcc/ipa-devirt.cc
+@@ -5029,9 +5029,12 @@ analyze_assign_stmt (gimple *stmt)
+ 	}
+       else
+ 	{
+-	  fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
+-		   get_tree_code_name (TREE_CODE (rhs)));
+-	  print_gimple_stmt (dump_file, stmt, 0);
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    {
++	      fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
++		       get_tree_code_name (TREE_CODE (rhs)));
++	      print_gimple_stmt (dump_file, stmt, 0);
++	    }
+ 	  gcc_unreachable ();
+ 	}
+     }
+diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
+index aeea51105..9537e4835 100644
+--- a/gcc/ipa-prefetch.cc
++++ b/gcc/ipa-prefetch.cc
+@@ -167,6 +167,7 @@ analyse_cgraph ()
+ 	}
+ 
+       /* TODO: maybe remove loop info here.  */
++      n->get_body ();
+       push_cfun (DECL_STRUCT_FUNCTION (n->decl));
+       calculate_dominance_info (CDI_DOMINATORS);
+       loop_optimizer_init (LOOPS_NORMAL);
+@@ -942,6 +943,9 @@ compare_memrefs (memref_t* mr, memref_t* mr2)
+       (*mr_candidate_map)[mr] = mr2;
+       return;
+     }
++  /* Probably we shouldn't leave nulls in the map.  */
++  if ((*mr_candidate_map)[mr] == NULL)
++    return;
+   /* TODO: support analysis with incrementation of different fields.  */
+   if ((*mr_candidate_map)[mr]->offset != mr2->offset)
+     {
+@@ -1090,6 +1094,15 @@ analyse_loops ()
+ 	  memref_t *mr = it->first, *mr2 = it->second;
+ 	  if (mr2 == NULL || !(*fmrs_map)[fn]->count (mr))
+ 	    continue;
++	  /* For now optimize only MRs whose mem is a MEM_REF.
++	     TODO: support other MR types.  */
++	  if (TREE_CODE (mr->mem) != MEM_REF)
++	    {
++	      if (dump_file)
++		fprintf (dump_file, "Skip MR %d: unsupported tree code = %s\n",
++			 mr->mr_id, get_tree_code_name (TREE_CODE (mr->mem)));
++	      continue;
++	    }
+ 	  if (!optimize_mrs_map->count (fn))
+ 	    (*optimize_mrs_map)[fn] = new memref_set;
+ 	  (*optimize_mrs_map)[fn]->insert (mr);
+@@ -1102,7 +1115,7 @@ analyse_loops ()
+ 	       it != (*optimize_mrs_map)[fn]->end (); it++)
+ 	    {
+ 	      memref_t *mr = *it, *mr2 = (*mr_candidate_map)[mr];
+-	      fprintf (dump_file, "MRs %d,%d with incremental offset ",
++	      fprintf (dump_file, "MRs %d, %d with incremental offset ",
+ 		       mr->mr_id, mr2->mr_id);
+ 	      print_generic_expr (dump_file, mr2->offset);
+ 	      fprintf (dump_file, "\n");
+@@ -1435,6 +1448,52 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data)
+   return NULL_TREE;
+ }
+ 
++/* Return a copy of STMT with its operands remapped by remap_gimple_op_r.  */
++
++static gimple *
++gimple_copy_and_remap (gimple *stmt)
++{
++  gimple *copy = gimple_copy (stmt);
++  gcc_checking_assert (!is_gimple_debug (copy));
++
++  /* Remap all the operands in COPY; the walker gets COPY via WI.info.  */
++  struct walk_stmt_info wi;
++  memset (&wi, 0, sizeof (wi));
++  wi.info = copy;
++  walk_gimple_op (copy, remap_gimple_op_r, &wi);
++  if (dump_file)
++    {
++      fprintf (dump_file, "Stmt copy after remap:\n");
++      print_gimple_stmt (dump_file, copy, 0);
++    }
++  return copy;
++}
++
++/* Copy and remap stmts of MR from the last down to index LAST_IDX, skipping
++   ones in PROCESSED.  Append copies to STMTS; return the last one or NULL.  */
++
++static gimple *
++gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts,
++				    int last_idx, stmt_set &processed)
++{
++  gimple *last_stmt = NULL;
++  for (int i = mr->stmts.length () - 1; i >= last_idx ; i--)
++    {
++      if (processed.count (mr->stmts[i]))
++	continue;
++      processed.insert (mr->stmts[i]);
++      if (dump_file)
++	{
++	  fprintf (dump_file, "Copy stmt %d from used MR (%d):\n",
++		   i, mr->mr_id);
++	  print_gimple_stmt (dump_file, mr->stmts[i], 0);
++	}
++      last_stmt = gimple_copy_and_remap (mr->stmts[i]);
++      gimple_seq_add_stmt (&stmts, last_stmt);
++  }
++  return last_stmt;
++}
++
+ static void
+ create_cgraph_edge (cgraph_node *n, gimple *stmt)
+ {
+@@ -1490,6 +1549,13 @@ optimize_function (cgraph_node *n, function *fn)
+ 		 "Skip the case.\n");
+       return 0;
+     }
++  if (!tree_fits_shwi_p (inc_mr->step))
++    {
++      if (dump_file)
++	fprintf (dump_file, "Cannot represent incremental MR's step as "
++		 "integer.  Skip the case.\n");
++      return 0;
++    }
+   if (dump_file && !used_mrs.empty ())
+     print_mrs_ids (used_mrs, "Common list of used mrs:\n");
+ 
+@@ -1539,16 +1605,44 @@ optimize_function (cgraph_node *n, function *fn)
+       return 0;
+     }
+   else if (dump_file)
+-    fprintf (dump_file, "Dominator bb %d for MRs\n", dom_bb->index);
++    {
++      fprintf (dump_file, "Dominator bb %d for MRs:\n", dom_bb->index);
++      gimple_dump_bb (dump_file, dom_bb, 0, dump_flags);
++      fprintf (dump_file, "\n");
++    }
+ 
+-  split_block (dom_bb, (gimple *) NULL);
++  /* Try to find comp_mr's stmt in the dominator bb.  */
++  gimple *last_used = NULL;
++  for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si);
++       gsi_prev (&si))
++    if (comp_mr->stmts[0] == gsi_stmt (si))
++      {
++	last_used = gsi_stmt (si);
++	if (dump_file)
++	  {
++	    fprintf (dump_file, "Last used stmt in dominator bb:\n");
++	    print_gimple_stmt (dump_file, last_used, 0);
++	  }
++	break;
++      }
++
++  split_block (dom_bb, last_used);
+   gimple_stmt_iterator gsi = gsi_last_bb (dom_bb);
+ 
+   /* Create new inc var.  Insert new_var = old_var + step * factor.  */
+   decl_map = new tree_map;
+   gcc_assert (comp_mr->stmts[0] && gimple_assign_single_p (comp_mr->stmts[0]));
+   tree inc_var = gimple_assign_lhs (comp_mr->stmts[0]);
++  /* If old_var definition dominates the current use, just use it, otherwise
++     evaluate it just before new inc var evaluation.  */
+   gimple_seq stmts = NULL;
++  stmt_set processed_stmts;
++  if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts[0])))
++    {
++      gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0,
++							processed_stmts);
++      inc_var = gimple_assign_lhs (tmp);
++    }
+   tree var_type = TREE_TYPE (inc_var);
+   enum tree_code inc_code;
+   if (TREE_CODE (var_type) == POINTER_TYPE)
+@@ -1556,52 +1650,28 @@ optimize_function (cgraph_node *n, function *fn)
+   else
+     inc_code = PLUS_EXPR;
+   tree step = inc_mr->step;
+-  unsigned dist_val = tree_to_uhwi (step) * param_ipa_prefetch_distance_factor;
++  HOST_WIDE_INT dist_val = tree_to_shwi (step)
++			   * param_ipa_prefetch_distance_factor;
+   tree dist = build_int_cst (TREE_TYPE (step), dist_val);
+   tree new_inc_var = gimple_build (&stmts, inc_code, var_type, inc_var, dist);
+   (*decl_map)[inc_var] = new_inc_var;
++  if (dump_file)
++    {
++      fprintf (dump_file, "New distance value: %ld, new inc var: ", dist_val);
++      print_generic_expr (dump_file, new_inc_var);
++      fprintf (dump_file, "\n");
++    }
+ 
+   /* Create other new vars.  Insert new stmts.  */
+-  struct walk_stmt_info wi;
+-  stmt_set processed_stmts;
+-  memref_tree_map mr_new_trees;
+   for (memref_set::const_iterator it = used_mrs.begin ();
+        it != used_mrs.end (); it++)
+     {
+       memref_t *mr = *it;
+-      gimple *last_stmt = NULL;
+       if (mr == comp_mr)
+ 	continue;
+-      for (int i = mr->stmts.length () - 1; i >= 0 ; i--)
+-	{
+-	  if (processed_stmts.count (mr->stmts[i]))
+-	    continue;
+-	  processed_stmts.insert (mr->stmts[i]);
+-	  if (dump_file)
+-	    {
+-	      fprintf (dump_file, "Copy stmt %d from used MR (%d):\n",
+-		       i, mr->mr_id);
+-	      print_gimple_stmt (dump_file, mr->stmts[i], 0);
+-	    }
+-	  /* Create a new copy of STMT and duplicate STMT's virtual
+-	     operands.  */
+-	  gimple *copy = gimple_copy (mr->stmts[i]);
+-	  gcc_checking_assert (!is_gimple_debug (copy));
+-
+-	  /* Remap all the operands in COPY.  */
+-	  memset (&wi, 0, sizeof (wi));
+-	  last_stmt = copy;
+-	  wi.info = copy;
+-	  walk_gimple_op (copy, remap_gimple_op_r, &wi);
+-	  if (dump_file)
+-	    {
+-	      fprintf (dump_file, "Stmt %d after remap:\n",i);
+-	      print_gimple_stmt (dump_file, copy, 0);
+-	    }
+-	  gimple_seq_add_stmt (&stmts, copy);
+-	}
++      gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0,
++							      processed_stmts);
+       gcc_assert (last_stmt);
+-      mr_new_trees[mr] = gimple_assign_lhs (last_stmt);
+       if (dump_file)
+ 	{
+ 	  fprintf (dump_file, "MR (%d) new mem: ", mr->mr_id);
+@@ -1637,29 +1707,9 @@ optimize_function (cgraph_node *n, function *fn)
+       memref_t *mr = vmrs[j];
+       /* Don't need to copy the last stmt, since we insert prefetch insn
+ 	 instead of it.  */
+-      for (int i = mr->stmts.length () - 1; i >= 1 ; i--)
+-	{
+-	  if (processed_stmts.count (mr->stmts[i]))
+-	    continue;
+-	  processed_stmts.insert (mr->stmts[i]);
+-
+-	  gimple *copy = gimple_copy (mr->stmts[i]);
+-	  gcc_checking_assert (!is_gimple_debug (copy));
+-
+-	  /* Remap all the operands in COPY.  */
+-	  memset (&wi, 0, sizeof (wi));
+-	  wi.info = copy;
+-	  walk_gimple_op (copy, remap_gimple_op_r, &wi);
+-	  if (dump_file)
+-	    {
+-	      fprintf (dump_file, "Stmt %d after remap:\n",i);
+-	      print_gimple_stmt (dump_file, copy, 0);
+-	    }
+-	  gimple_seq_add_stmt (&stmts, copy);
+-	}
++      gimple_copy_and_remap_memref_stmts (mr, stmts, 1, processed_stmts);
+       gimple *last_stmt = mr->stmts[0];
+       gcc_assert (last_stmt);
+-      mr_new_trees[mr] = gimple_assign_lhs (last_stmt);
+       tree write_p = mr->is_store ? integer_one_node : integer_zero_node;
+       tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
+       if (decl_map->count (addr))
+@@ -1668,6 +1718,11 @@ optimize_function (cgraph_node *n, function *fn)
+ 				     3, addr, write_p, local);
+       pcalls.safe_push (last_stmt);
+       gimple_seq_add_stmt (&stmts, last_stmt);
++      if (dump_file)
++	{
++	  fprintf (dump_file, "Insert %d prefetch stmt:\n", j);
++	  print_gimple_stmt (dump_file, last_stmt, 0);
++	}
+     }
+ 
+   gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
+@@ -1677,6 +1732,7 @@ optimize_function (cgraph_node *n, function *fn)
+   for (unsigned i = 0; i < pcalls.length (); i++)
+     create_cgraph_edge (n, pcalls[i]);
+   ipa_update_overall_fn_summary (n);
++  renumber_gimple_stmt_uids (DECL_STRUCT_FUNCTION (n->decl));
+ 
+   return 1;
+ }
+@@ -1806,7 +1862,7 @@ pass_ipa_prefetch::gate (function *)
+ 	  /* Don't bother doing anything if the program has errors.  */
+ 	  && !seen_error ()
+ 	  && flag_lto_partition == LTO_PARTITION_ONE
+-	  /* Only enable struct optimizations in lto or whole_program.  */
++	  /* Only enable prefetch optimizations in lto or whole_program.  */
+ 	  && (in_lto_p || flag_whole_program));
+ }
+ 
+diff --git a/gcc/ipa-sra.cc b/gcc/ipa-sra.cc
+index 5355cf2f4..471b3927c 100644
+--- a/gcc/ipa-sra.cc
++++ b/gcc/ipa-sra.cc
+@@ -3393,6 +3393,13 @@ param_splitting_across_edge (cgraph_edge *cs)
+   gcc_checking_assert (from_ifs && from_ifs->m_parameters);
+ 
+   isra_call_summary *csum = call_sums->get (cs);
++  /* TODO: implement better support for call edges inserted after summary
++     collection but before sra wpa invocation.  */
++  if (!csum)
++    {
++      csum = call_sums->get_create (cs);
++      csum->m_return_ignored = true;
++    }
+   gcc_checking_assert (csum);
+   unsigned args_count = csum->m_arg_flow.length ();
+   isra_func_summary *to_ifs = func_sums->get (callee);
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 5c07e3986..50385dfd7 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -314,8 +314,8 @@ Common Joined UInteger Var(param_ipa_prefetch_distance_factor) Init(4) Param Opt
+ The factor represents the number of inductive variable incrementations to evaluate an indirect memory address for IPA prefetch.
+ 
+ -param=ipa-prefetch-locality=
+-Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) Param Optimization
+-The flag represents temporal locality values in the following way: 0:pstl1strm, 1:pstl3keep, 2:pstl2keep, 3:pstl1keep.
++Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) IntegerRange(0, 3) Param Optimization
++The flag represents temporal locality value between 0 and 3, the higher value means the higher temporal locality in the data.
+ 
+ -param=ira-loop-reserved-regs=
+ Common Joined UInteger Var(param_ira_loop_reserved_regs) Init(2) Param Optimization
+diff --git a/gcc/testsuite/gcc.dg/completion-1.c b/gcc/testsuite/gcc.dg/completion-1.c
+index 64da64f1c..df2319c76 100644
+--- a/gcc/testsuite/gcc.dg/completion-1.c
++++ b/gcc/testsuite/gcc.dg/completion-1.c
+@@ -2,6 +2,7 @@
+ /* { dg-options "--completion=-fipa-ic" } */
+ 
+ /* { dg-begin-multiline-output "" }
++-fipa-ic
+ -fipa-icf
+ -fipa-icf-functions
+ -fipa-icf-variables
+diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c
+new file mode 100644
+index 000000000..bd4fb2bdc
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c
+@@ -0,0 +1,1843 @@
++/* { dg-do link } */
++/* { dg-options "-O3 -fipa-ic -fipa-prefetch -flto -flto-partition=one -fdump-ipa-ipa_prefetch -fdump-ipa-icp" } */
++/* { dg-require-effective-target lto } */
++
++/* Based on open-source xz code.  */
++
++#include <stdlib.h>
++#include <string.h>
++
++typedef long int ptrdiff_t;
++typedef long unsigned int size_t;
++typedef unsigned int wchar_t;
++
++typedef unsigned char __u_char;
++typedef unsigned short int __u_short;
++typedef unsigned int __u_int;
++typedef unsigned long int __u_long;
++
++typedef signed char __int8_t;
++typedef unsigned char __uint8_t;
++typedef signed short int __int16_t;
++typedef unsigned short int __uint16_t;
++typedef signed int __int32_t;
++typedef unsigned int __uint32_t;
++
++typedef signed long int __int64_t;
++typedef unsigned long int __uint64_t;
++
++typedef __int8_t __int_least8_t;
++typedef __uint8_t __uint_least8_t;
++typedef __int16_t __int_least16_t;
++typedef __uint16_t __uint_least16_t;
++typedef __int32_t __int_least32_t;
++typedef __uint32_t __uint_least32_t;
++typedef __int64_t __int_least64_t;
++typedef __uint64_t __uint_least64_t;
++
++typedef __int8_t int8_t;
++typedef __int16_t int16_t;
++typedef __int32_t int32_t;
++typedef __int64_t int64_t;
++
++typedef __uint8_t uint8_t;
++typedef __uint16_t uint16_t;
++typedef __uint32_t uint32_t;
++typedef __uint64_t uint64_t;
++
++typedef long int intptr_t;
++typedef unsigned long int uintptr_t;
++
++static inline uint16_t
++read16ne(const uint8_t *buf)
++{
++ uint16_t num;
++ memcpy(&num, buf, sizeof(num));
++ return num;
++}
++
++static inline uint32_t
++read32ne(const uint8_t *buf)
++{
++ uint32_t num;
++ memcpy(&num, buf, sizeof(num));
++ return num;
++}
++
++static inline uint16_t
++aligned_read16ne(const uint8_t *buf)
++{
++ uint16_t num;
++ memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num));
++ return num;
++}
++
++
++static inline uint32_t
++aligned_read32ne(const uint8_t *buf)
++{
++ uint32_t num;
++ memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num));
++ return num;
++}
++
++static inline uint64_t
++aligned_read64ne(const uint8_t *buf)
++{
++ uint64_t num;
++ memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num));
++ return num;
++}
++
++typedef unsigned char lzma_bool;
++
++typedef enum {
++ LZMA_RESERVED_ENUM = 0
++} lzma_reserved_enum;
++
++typedef enum {
++ LZMA_OK = 0,
++ LZMA_STREAM_END = 1,
++ LZMA_NO_CHECK = 2,
++ LZMA_UNSUPPORTED_CHECK = 3,
++ LZMA_GET_CHECK = 4,
++ LZMA_MEM_ERROR = 5,
++ LZMA_MEMLIMIT_ERROR = 6,
++ LZMA_FORMAT_ERROR = 7,
++ LZMA_OPTIONS_ERROR = 8,
++ LZMA_DATA_ERROR = 9,
++ LZMA_BUF_ERROR = 10,
++ LZMA_PROG_ERROR = 11,
++} lzma_ret;
++
++typedef enum {
++ LZMA_RUN = 0,
++ LZMA_SYNC_FLUSH = 1,
++ LZMA_FULL_FLUSH = 2,
++ LZMA_FULL_BARRIER = 4,
++ LZMA_FINISH = 3
++} lzma_action;
++
++typedef struct {
++ void *( *alloc)(void *opaque, size_t nmemb, size_t size);
++
++ void ( *free)(void *opaque, void *ptr);
++
++ void *opaque;
++} lzma_allocator;
++
++typedef uint64_t lzma_vli;
++
++typedef enum {
++ LZMA_CHECK_NONE = 0,
++ LZMA_CHECK_CRC32 = 1,
++ LZMA_CHECK_CRC64 = 4,
++ LZMA_CHECK_SHA256 = 10
++} lzma_check;
++
++typedef struct {
++ lzma_vli id;
++ void *options;
++} lzma_filter;
++
++typedef enum {
++ LZMA_MF_HC3 = 0x03,
++ LZMA_MF_HC4 = 0x04,
++ LZMA_MF_BT2 = 0x12,
++ LZMA_MF_BT3 = 0x13,
++ LZMA_MF_BT4 = 0x14
++} lzma_match_finder;
++
++typedef struct lzma_next_coder_s lzma_next_coder;
++
++typedef struct lzma_filter_info_s lzma_filter_info;
++
++typedef lzma_ret (*lzma_init_function)(
++  lzma_next_coder *next, const lzma_allocator *allocator,
++  const lzma_filter_info *filters);
++
++typedef lzma_ret (*lzma_code_function)(
++  void *coder, const lzma_allocator *allocator,
++  const uint8_t *restrict in, size_t *restrict in_pos,
++  size_t in_size, uint8_t *restrict out,
++  size_t *restrict out_pos, size_t out_size,
++  lzma_action action);
++
++typedef void (*lzma_end_function)(
++  void *coder, const lzma_allocator *allocator);
++
++struct lzma_filter_info_s {
++ lzma_vli id;
++ lzma_init_function init;
++ void *options;
++};
++
++struct lzma_next_coder_s {
++ void *coder;
++ lzma_vli id;
++ uintptr_t init;
++
++ lzma_code_function code;
++ lzma_end_function end;
++ void (*get_progress)(void *coder,
++   uint64_t *progress_in, uint64_t *progress_out);
++
++ lzma_check (*get_check)(const void *coder);
++ lzma_ret (*memconfig)(void *coder, uint64_t *memusage,
++   uint64_t *old_memlimit, uint64_t new_memlimit);
++ lzma_ret (*update)(void *coder, const lzma_allocator *allocator,
++   const lzma_filter *filters, const lzma_filter *reversed_filters);
++};
++
++typedef struct {
++ uint32_t len;
++ uint32_t dist;
++} lzma_match;
++
++typedef struct lzma_mf_s lzma_mf;
++struct lzma_mf_s {
++ uint8_t *buffer;
++ uint32_t size;
++ uint32_t keep_size_before;
++ uint32_t keep_size_after;
++ uint32_t offset;
++ uint32_t read_pos;
++ uint32_t read_ahead;
++ uint32_t read_limit;
++ uint32_t write_pos;
++ uint32_t pending;
++ uint32_t (*find)(lzma_mf *mf, lzma_match *matches);
++ void (*skip)(lzma_mf *mf, uint32_t num);
++ uint32_t *hash;
++ uint32_t *son;
++ uint32_t cyclic_pos;
++ uint32_t cyclic_size;
++ uint32_t hash_mask;
++ uint32_t depth;
++ uint32_t nice_len;
++ uint32_t match_len_max;
++ lzma_action action;
++ uint32_t hash_count;
++ uint32_t sons_count;
++};
++
++typedef struct {
++ size_t before_size;
++ size_t dict_size;
++ size_t after_size;
++ size_t match_len_max;
++ size_t nice_len;
++ lzma_match_finder match_finder;
++ uint32_t depth;
++ const uint8_t *preset_dict;
++ uint32_t preset_dict_size;
++} lzma_lz_options;
++
++typedef struct {
++ void *coder;
++ lzma_ret (*code)(void *coder,
++   lzma_mf *restrict mf, uint8_t *restrict out,
++   size_t *restrict out_pos, size_t out_size);
++ void (*end)(void *coder, const lzma_allocator *allocator);
++ lzma_ret (*options_update)(void *coder, const lzma_filter *filter);
++} lzma_lz_encoder;
++
++static inline const uint8_t *
++mf_ptr(const lzma_mf *mf)
++{
++ return mf->buffer + mf->read_pos;
++}
++
++static inline uint32_t
++mf_avail(const lzma_mf *mf)
++{
++ return mf->write_pos - mf->read_pos;
++}
++
++typedef struct {
++ uint32_t state[8];
++ uint64_t size;
++} lzma_sha256_state;
++
++typedef struct {
++ union {
++  uint8_t u8[64];
++  uint32_t u32[16];
++  uint64_t u64[8];
++ } buffer;
++ union {
++  uint32_t crc32;
++  uint64_t crc64;
++  lzma_sha256_state sha256;
++ } state;
++} lzma_check_state;
++
++// The table has a constant initializer in the original code.
++// The initializer is omitted in this test.
++const uint32_t lzma_crc32_table[8][256];
++
++static inline uint32_t __attribute__((__always_inline__))
++lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
++  uint32_t len, uint32_t limit)
++{
++ while (len < limit) {
++  uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len);
++  if (x != 0) {
++   if ((x & 0xFFFF) == 0) {
++    len += 2;
++    x >>= 16;
++   }
++
++   if ((x & 0xFF) == 0)
++    ++len;
++
++   return ((len) < (limit) ? (len) : (limit));
++  }
++
++  len += 4;
++ }
++
++ return limit;
++}
++
++extern uint32_t // Runs mf->find, stores the match count in *count_ptr and returns the longest match length (0 if none).
++lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches)
++{
++ const uint32_t count = mf->find(mf, matches);
++ uint32_t len_best = 0;
++
++ if (count > 0) {
++  len_best = matches[count - 1].len; // the last entry is taken as the longest match
++  if (len_best == mf->nice_len) { // the finder stopped at nice_len; try to extend the match
++   uint32_t limit = mf_avail(mf) + 1; // +1: the *_find functions in this file have already incremented read_pos
++   if (limit > mf->match_len_max)
++    limit = mf->match_len_max;
++   const uint8_t *p1 = mf_ptr(mf) - 1; // byte the finder was called on (one before the advanced read_pos)
++   const uint8_t *p2 = p1 - matches[count - 1].dist - 1; // stored dist is the real distance minus one
++   len_best = lzma_memcmplen(p1, p2, len_best, limit);
++  }
++ }
++
++ *count_ptr = count;
++ ++mf->read_ahead;
++
++ return len_best;
++}
++
++static void // Rebases every stored position in hash[] and son[] and mf->offset by the same amount.
++normalize(lzma_mf *mf)
++{
++ const uint32_t subvalue = ((4294967295U) - mf->cyclic_size); // UINT32_MAX - cyclic_size
++
++ for (uint32_t i = 0; i < mf->hash_count; ++i) {
++  if (mf->hash[i] <= subvalue) // entries at or below subvalue are clamped to 0 instead of wrapping
++   mf->hash[i] = 0;
++  else
++   mf->hash[i] -= subvalue;
++ }
++
++ for (uint32_t i = 0; i < mf->sons_count; ++i) { // same treatment for the son[] links
++  if (mf->son[i] <= subvalue)
++   mf->son[i] = 0;
++  else
++   mf->son[i] -= subvalue;
++ }
++
++ mf->offset -= subvalue;
++ return;
++}
++
++static void // Advances the match finder by one position, wrapping cyclic_pos and normalizing when needed.
++move_pos(lzma_mf *mf)
++{
++ if (++mf->cyclic_pos == mf->cyclic_size) // cyclic_pos stays in [0, cyclic_size)
++  mf->cyclic_pos = 0;
++ ++mf->read_pos;
++ if (__builtin_expect(mf->read_pos + mf->offset == (4294967295U), 0 )) // position reached UINT32_MAX; hinted as unlikely
++  normalize(mf);
++}
++
++static void // Advances read_pos without touching the hash tables and counts the position in mf->pending.
++move_pending(lzma_mf *mf)
++{
++ ++mf->read_pos;
++ ++mf->pending;
++}
++
++static lzma_match * // Follows the son[] chain from cur_match, appending each longer match; returns one past the last match written.
++hc_find_func(
++  const uint32_t len_limit,
++  const uint32_t pos,
++  const uint8_t *const cur,
++  uint32_t cur_match,
++  uint32_t depth,
++  uint32_t *const son,
++  const uint32_t cyclic_pos,
++  const uint32_t cyclic_size,
++  lzma_match *matches,
++  uint32_t len_best)
++{
++ son[cyclic_pos] = cur_match; // link the current position to the previous head of its chain
++
++ while (1) {
++  const uint32_t delta = pos - cur_match; // distance back to the candidate
++  if (depth-- == 0 || delta >= cyclic_size) // step budget exhausted or candidate outside the cyclic window
++   return matches;
++
++  const uint8_t *const pb = cur - delta; // candidate bytes
++  cur_match = son[cyclic_pos - delta // fetch the next link, wrapping the index by cyclic_size
++    + (delta > cyclic_pos ? cyclic_size : 0)];
++
++  if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) { // cheap filter: a longer match must agree at index len_best
++   uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit);
++
++   if (len_best < len) { // only strictly longer matches are recorded
++    len_best = len;
++    matches->len = len;
++    matches->dist = delta - 1; // distances are stored minus one
++    ++matches;
++
++    if (len == len_limit) // cannot improve any further
++     return matches;
++   }
++  }
++ }
++}
++
++extern uint32_t // Match search using a 2-byte and a 3-byte hash plus the son[] chain; returns the number of matches stored.
++lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches)
++{
++ uint32_t len_limit = mf_avail(mf);
++ if (mf->nice_len <= len_limit) {
++  len_limit = mf->nice_len; // cap the search length at nice_len
++ } else if (len_limit < (3)) { // fewer than 3 bytes available: just step over the position
++  move_pending(mf);
++  return 0;
++ }
++ const uint8_t *cur = mf_ptr(mf);
++ const uint32_t pos = mf->read_pos + mf->offset;
++ uint32_t matches_count = 0;
++
++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1];
++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); // 10-bit hash of the first two bytes
++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; // hash of the first three bytes
++
++ const uint32_t delta2 = pos - mf->hash[hash_2_value]; // distance to the previous position with this 2-byte hash
++ const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; // 3-byte heads live after the 1<<10 two-byte slots
++
++ mf->hash[hash_2_value] = pos; // both hash heads now point at the current position
++ mf->hash[((1U << 10)) + hash_value] = pos;
++
++ uint32_t len_best = 2;
++
++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { // 2-byte candidate is inside the window and its first byte matches
++  len_best = lzma_memcmplen(cur - delta2, cur, len_best, len_limit);
++
++  matches[0].len = len_best;
++  matches[0].dist = delta2 - 1;
++  matches_count = 1;
++
++  if (len_best == len_limit) { // already maximal: link the position and skip the chain walk
++   mf->son[mf->cyclic_pos] = cur_match;
++   move_pos(mf);
++   return 1;
++  }
++ }
++
++ matches_count = hc_find_func(len_limit, pos, cur, cur_match, mf->depth, // returned end pointer minus base gives the count
++			      mf->son, mf->cyclic_pos, mf->cyclic_size,
++			      matches + matches_count, len_best) - matches;
++ move_pos(mf);
++ return matches_count;
++}
++
++extern void // Steps over `amount` positions, updating the hash heads and son[] chain without collecting matches.
++lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount)
++{
++ do { // do-while: the body runs before amount is tested, so amount == 0 would wrap around
++  if (mf_avail(mf) < 3) {
++   move_pending(mf);
++   continue; // in a do-while this jumps to the --amount test
++  }
++
++  const uint8_t *cur = mf_ptr(mf);
++  const uint32_t pos = mf->read_pos + mf->offset;
++
++  const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; // same hash computation as lzma_mf_hc3_find
++  const uint32_t hash_2_value = temp & ((1U << 10) - 1);
++  const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask;
++
++  const uint32_t cur_match
++    = mf->hash[((1U << 10)) + hash_value];
++
++  mf->hash[hash_2_value] = pos;
++  mf->hash[((1U << 10)) + hash_value] = pos;
++
++  do { mf->son[mf->cyclic_pos] = cur_match; move_pos(mf); } while (0); // expanded macro: link the position, then advance
++
++ } while (--amount != 0);
++}
++
++extern uint32_t // Match search using 2-, 3- and 4-byte hashes plus the son[] chain; returns the number of matches stored.
++lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches)
++{
++ uint32_t len_limit = mf_avail(mf);
++ if (mf->nice_len <= len_limit) {
++  len_limit = mf->nice_len; // cap the search length at nice_len
++ } else if (len_limit < (4)) { // fewer than 4 bytes available: just step over the position
++  move_pending(mf);
++  return 0;
++ }
++ const uint8_t *cur = mf_ptr(mf);
++ const uint32_t pos = mf->read_pos + mf->offset;
++ uint32_t matches_count = 0;
++
++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1];
++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); // 10-bit hash of two bytes
++ const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) // 16-bit hash of three bytes
++				& ((1U << 16) - 1);
++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) // hash of four bytes, masked by hash_mask
++				      ^ (lzma_crc32_table[0][cur[3]] << 5))
++			      & mf->hash_mask;
++ uint32_t delta2 = pos - mf->hash[hash_2_value]; // distance to the previous position with this 2-byte hash
++ const uint32_t delta3 // distance to the previous position with this 3-byte hash
++   = pos - mf->hash[((1U << 10)) + hash_3_value];
++ const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; // 4-byte heads follow the 2- and 3-byte areas
++
++ mf->hash[hash_2_value ] = pos; // all three hash heads now point at the current position
++ mf->hash[((1U << 10)) + hash_3_value] = pos;
++ mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos;
++
++ uint32_t len_best = 1;
++
++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { // 2-byte candidate accepted
++  len_best = 2;
++  matches[0].len = 2;
++  matches[0].dist = delta2 - 1;
++  matches_count = 1;
++ }
++
++ if (delta2 != delta3 && delta3 < mf->cyclic_size // a distinct 3-byte candidate inside the window
++   && *(cur - delta3) == *cur) {
++  len_best = 3;
++  matches[matches_count++].dist = delta3 - 1; // its .len is filled in below after extension
++  delta2 = delta3; // extend the 3-byte candidate rather than the 2-byte one
++ }
++
++ if (matches_count != 0) {
++  len_best = lzma_memcmplen(cur - delta2, cur, // extend the most recently added candidate
++    len_best, len_limit);
++
++  matches[matches_count - 1].len = len_best;
++
++  if (len_best == len_limit) { // already maximal: link the position and skip the chain walk
++   mf->son[mf->cyclic_pos] = cur_match; move_pos(mf);
++   return matches_count;
++  }
++ }
++
++ if (len_best < 3) // the chain walk only reports matches longer than 3
++  len_best = 3;
++
++ matches_count = hc_find_func(len_limit, pos, cur, cur_match, mf->depth, // returned end pointer minus base gives the count
++			      mf->son, mf->cyclic_pos, mf->cyclic_size,
++			      matches + matches_count, len_best) - matches;
++ move_pos(mf);
++ return matches_count;
++}
++
++extern void // Steps over `amount` positions, updating the three hash heads and son[] without collecting matches.
++lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount)
++{
++ do { // do-while: the body runs before amount is tested, so amount == 0 would wrap around
++  if (mf_avail(mf) < 4) {
++   move_pending(mf);
++   continue; // in a do-while this jumps to the --amount test
++  }
++
++  const uint8_t *cur = mf_ptr(mf);
++  const uint32_t pos = mf->read_pos + mf->offset;
++
++  const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; // same hash computation as lzma_mf_hc4_find
++  const uint32_t hash_2_value = temp & ((1U << 10) - 1);
++  const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & ((1U << 16) - 1);
++  const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)
++				       ^ (lzma_crc32_table[0][cur[3]] << 5))
++			       & mf->hash_mask;
++
++  const uint32_t cur_match
++    = mf->hash[((1U << 10) + (1U << 16)) + hash_value];
++
++  mf->hash[hash_2_value] = pos;
++  mf->hash[((1U << 10)) + hash_3_value] = pos;
++  mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos;
++
++  mf->son[mf->cyclic_pos] = cur_match; // link the position into its chain, then advance
++  move_pos(mf);
++ } while (--amount != 0);
++}
++
++static lzma_match * // Walks the two-links-per-position structure in son[], recording longer matches and re-linking as it goes; returns one past the last match written.
++bt_find_func(
++  const uint32_t len_limit,
++  const uint32_t pos,
++  const uint8_t *const cur,
++  uint32_t cur_match,
++  uint32_t depth,
++  uint32_t *const son,
++  const uint32_t cyclic_pos,
++  const uint32_t cyclic_size,
++  lzma_match *matches,
++  uint32_t len_best)
++{
++ uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; // second slot of the current position's pair
++ uint32_t *ptr1 = son + (cyclic_pos << 1); // first slot of the current position's pair
++
++ uint32_t len0 = 0; // match length recorded on the last step through the else branch
++ uint32_t len1 = 0; // match length recorded on the last step through the "<" branch
++
++ while (1) {
++  const uint32_t delta = pos - cur_match; // distance back to the candidate
++  if (depth-- == 0 || delta >= cyclic_size) { // step budget exhausted or candidate outside the window
++   *ptr0 = 0; // terminate both pending links
++   *ptr1 = 0;
++   return matches;
++  }
++
++  uint32_t *const pair = son + ((cyclic_pos - delta // the candidate's pair, index wrapped by cyclic_size
++    + (delta > cyclic_pos ? cyclic_size : 0))
++    << 1);
++
++  const uint8_t *const pb = cur - delta; // candidate bytes
++  uint32_t len = ((len0) < (len1) ? (len0) : (len1)); // start comparing at min(len0, len1)
++
++  if (pb[len] == cur[len]) {
++   len = lzma_memcmplen(pb, cur, len + 1, len_limit);
++
++   if (len_best < len) { // only strictly longer matches are recorded
++    len_best = len;
++    matches->len = len;
++    matches->dist = delta - 1; // distances are stored minus one
++    ++matches;
++
++    if (len == len_limit) { // maximal match: take over the candidate's two links and stop
++     *ptr1 = pair[0];
++     *ptr0 = pair[1];
++     return matches;
++    }
++   }
++  }
++
++  if (pb[len] < cur[len]) { // candidate sorts before the current string: store it via ptr1, continue through pair[1]
++   *ptr1 = cur_match;
++   ptr1 = pair + 1;
++   cur_match = *ptr1;
++   len1 = len;
++  } else { // otherwise store it via ptr0 and continue through pair[0]
++   *ptr0 = cur_match;
++   ptr0 = pair;
++   cur_match = *ptr0;
++   len0 = len;
++  }
++ }
++}
++
++
++static void // Same son[] walk and re-linking as bt_find_func, but without recording any matches.
++bt_skip_func(
++  const uint32_t len_limit,
++  const uint32_t pos,
++  const uint8_t *const cur,
++  uint32_t cur_match,
++  uint32_t depth,
++  uint32_t *const son,
++  const uint32_t cyclic_pos,
++  const uint32_t cyclic_size)
++{
++ uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; // second slot of the current position's pair
++ uint32_t *ptr1 = son + (cyclic_pos << 1); // first slot of the current position's pair
++
++ uint32_t len0 = 0;
++ uint32_t len1 = 0;
++
++ while (1) {
++  const uint32_t delta = pos - cur_match; // distance back to the candidate
++  if (depth-- == 0 || delta >= cyclic_size) { // step budget exhausted or candidate outside the window
++   *ptr0 = 0; // terminate both pending links
++   *ptr1 = 0;
++   return;
++  }
++
++  uint32_t *pair = son + ((cyclic_pos - delta // the candidate's pair, index wrapped by cyclic_size
++    + (delta > cyclic_pos ? cyclic_size : 0))
++    << 1);
++  const uint8_t *pb = cur - delta;
++  uint32_t len = ((len0) < (len1) ? (len0) : (len1)); // start comparing at min(len0, len1)
++
++  if (pb[len] == cur[len]) {
++   len = lzma_memcmplen(pb, cur, len + 1, len_limit);
++
++   if (len == len_limit) { // maximal match: take over the candidate's two links and stop
++    *ptr1 = pair[0];
++    *ptr0 = pair[1];
++    return;
++   }
++  }
++
++  if (pb[len] < cur[len]) { // candidate sorts before the current string
++   *ptr1 = cur_match;
++   ptr1 = pair + 1;
++   cur_match = *ptr1;
++   len1 = len;
++  } else {
++   *ptr0 = cur_match;
++   ptr0 = pair;
++   cur_match = *ptr0;
++   len0 = len;
++  }
++ }
++}
++
++extern uint32_t // Match search keyed on read16ne(cur) followed by the bt_find_func walk; returns the number of matches stored.
++lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches)
++{
++ uint32_t len_limit = mf_avail(mf);
++ if (mf->nice_len <= len_limit) {
++  len_limit = mf->nice_len; // cap the search length at nice_len
++ } else if (len_limit < (2) || (mf->action == LZMA_SYNC_FLUSH)) { // too little data, or a sync flush with less than nice_len available
++  move_pending(mf);
++  return 0;
++ }
++ const uint8_t *cur = mf_ptr(mf);
++ const uint32_t pos = mf->read_pos + mf->offset;
++ uint32_t matches_count = 0;
++ const uint32_t hash_value = read16ne(cur); // the value read at cur is used directly as the hash index
++ const uint32_t cur_match = mf->hash[hash_value];
++ mf->hash[hash_value] = pos;
++
++ matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, // returned end pointer minus base gives the count
++                              mf->son, mf->cyclic_pos, mf->cyclic_size,
++                              matches + matches_count, 1) - matches;
++ move_pos(mf);
++ return matches_count;
++}
++
++extern void // Steps over `amount` positions, updating the hash head and son[] links without collecting matches.
++lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount)
++{
++ do { // do-while: the body runs before amount is tested, so amount == 0 would wrap around
++  uint32_t len_limit = mf_avail(mf);
++  if (mf->nice_len <= len_limit) {
++   len_limit = mf->nice_len;
++  } else if (len_limit < (2) || (mf->action == LZMA_SYNC_FLUSH)) { // same bail-out condition as lzma_mf_bt2_find
++   move_pending(mf);
++   continue; // in a do-while this jumps to the --amount test
++  }
++  const uint8_t *cur = mf_ptr(mf);
++  const uint32_t pos = mf->read_pos + mf->offset;
++
++  const uint32_t hash_value = read16ne(cur);
++  const uint32_t cur_match = mf->hash[hash_value];
++  mf->hash[hash_value] = pos;
++
++  bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son,
++	       mf->cyclic_pos, mf->cyclic_size);
++  move_pos(mf);
++ } while (--amount != 0);
++}
++
++extern uint32_t // Match search using a 2-byte and a 3-byte hash followed by the bt_find_func walk; returns the number of matches stored.
++lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches)
++{
++ uint32_t len_limit = mf_avail(mf);
++ if (mf->nice_len <= len_limit) {
++  len_limit = mf->nice_len; // cap the search length at nice_len
++ } else if (len_limit < (3) || (1 && mf->action == LZMA_SYNC_FLUSH)) { // "1 &&" is a leftover of macro expansion and has no effect
++  move_pending(mf);
++  return 0;
++ }
++ const uint8_t *cur = mf_ptr(mf);
++ const uint32_t pos = mf->read_pos + mf->offset;
++ uint32_t matches_count = 0;
++
++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1];
++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); // 10-bit hash of two bytes
++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; // hash of three bytes
++
++ const uint32_t delta2 = pos - mf->hash[hash_2_value]; // distance to the previous position with this 2-byte hash
++ const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; // 3-byte heads live after the 1<<10 two-byte slots
++
++ mf->hash[hash_2_value] = pos;
++ mf->hash[((1U << 10)) + hash_value] = pos;
++
++ uint32_t len_best = 2;
++
++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { // 2-byte candidate is inside the window and its first byte matches
++  len_best = lzma_memcmplen(
++    cur, cur - delta2, len_best, len_limit);
++
++  matches[0].len = len_best;
++  matches[0].dist = delta2 - 1;
++  matches_count = 1;
++
++  if (len_best == len_limit) { // already maximal: only update the links, no further search
++   bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son,
++		mf->cyclic_pos, mf->cyclic_size);
++   move_pos(mf);
++   return 1;
++  }
++ }
++
++ matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, // returned end pointer minus base gives the count
++			      mf->son, mf->cyclic_pos, mf->cyclic_size,
++			      matches + matches_count, len_best) - matches;
++ move_pos(mf);
++ return matches_count;
++}
++
++
++extern void // Steps over `amount` positions, updating both hash heads and the son[] links without collecting matches.
++lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount)
++{
++ do { // do-while: the body runs before amount is tested, so amount == 0 would wrap around
++  uint32_t len_limit = mf_avail(mf);
++  if (mf->nice_len <= len_limit) {
++    len_limit = mf->nice_len; }
++  else if (len_limit < (3) || (1 && mf->action == LZMA_SYNC_FLUSH)) { // same bail-out condition as lzma_mf_bt3_find
++    move_pending(mf);
++    continue; // in a do-while this jumps to the --amount test
++  }
++  const uint8_t *cur = mf_ptr(mf);
++  const uint32_t pos = mf->read_pos + mf->offset;
++
++  const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1];
++  const uint32_t hash_2_value = temp & ((1U << 10) - 1);
++  const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask;
++
++  const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value];
++
++  mf->hash[hash_2_value] = pos;
++  mf->hash[((1U << 10)) + hash_value] = pos;
++
++  bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son,
++	       mf->cyclic_pos, mf->cyclic_size); 
++  move_pos(mf);
++ } while (--amount != 0);
++}
++
++extern uint32_t // Match search using 2-, 3- and 4-byte hashes followed by the bt_find_func walk; returns the number of matches stored.
++lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches)
++{
++ uint32_t len_limit = mf->write_pos - mf->read_pos; // same expression as mf_avail(), written out inline here
++ if (mf->nice_len <= len_limit) {
++  len_limit = mf->nice_len; // cap the search length at nice_len
++ } else if (len_limit < (4) || (mf->action == LZMA_SYNC_FLUSH)) {
++  ++mf->read_pos; // same two increments as move_pending(), written out inline here
++  ++mf->pending;
++  return 0;
++ }
++
++ const uint8_t *cur = mf->buffer + mf->read_pos; // same expression as mf_ptr()
++ const uint32_t pos = mf->read_pos + mf->offset;
++ uint32_t matches_count = 0;
++
++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1];
++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); // 10-bit hash of two bytes
++ const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & ((1U << 16) - 1); // 16-bit hash of three bytes
++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) // hash of four bytes, masked by hash_mask
++				      ^ (lzma_crc32_table[0][cur[3]] << 5))
++			      & mf->hash_mask;
++
++ uint32_t delta2 = pos - mf->hash[hash_2_value]; // distance to the previous position with this 2-byte hash
++ const uint32_t delta3 = pos - mf->hash[((1U << 10)) + hash_3_value]; // likewise for the 3-byte hash
++ const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; // 4-byte heads follow the 2- and 3-byte areas
++
++ mf->hash[hash_2_value] = pos; // all three hash heads now point at the current position
++ mf->hash[((1U << 10)) + hash_3_value] = pos;
++ mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos;
++
++ uint32_t len_best = 1;
++
++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { // 2-byte candidate accepted
++  len_best = 2;
++  matches[0].len = 2;
++  matches[0].dist = delta2 - 1;
++  matches_count = 1;
++ }
++
++ if (delta2 != delta3 && delta3 < mf->cyclic_size && *(cur - delta3) == *cur) { // a distinct 3-byte candidate inside the window
++  len_best = 3;
++  matches[matches_count++].dist = delta3 - 1; // its .len is filled in below after extension
++  delta2 = delta3; // extend the 3-byte candidate rather than the 2-byte one
++ }
++
++ if (matches_count != 0) {
++  len_best = lzma_memcmplen(cur, cur - delta2, len_best, len_limit); // extend the most recently added candidate
++
++  matches[matches_count - 1].len = len_best;
++
++  if (len_best == len_limit) { // already maximal: only update the links, no further search
++    bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son,
++		 mf->cyclic_pos, mf->cyclic_size);
++    move_pos(mf);
++    return matches_count;
++  }
++ }
++
++ if (len_best < 3) // the walk below only reports matches longer than 3
++  len_best = 3;
++
++ matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, // returned end pointer minus base gives the count
++                              mf->cyclic_pos, mf->cyclic_size,
++                              matches + matches_count, len_best) - matches;
++ move_pos(mf);
++ return matches_count;
++}
++
++extern void // Steps over `amount` positions, updating the three hash heads and the son[] links without collecting matches.
++lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount)
++{
++ do { // do-while: the body runs before amount is tested, so amount == 0 would wrap around
++  uint32_t len_limit = mf_avail(mf);
++  if (mf->nice_len <= len_limit) {
++   len_limit = mf->nice_len;
++  } else if (len_limit < (4) || (mf->action == LZMA_SYNC_FLUSH)) { // same bail-out condition as lzma_mf_bt4_find
++   move_pending(mf);
++   continue; // in a do-while this jumps to the --amount test
++  }
++
++  const uint8_t *cur = mf->buffer + mf->read_pos; // same expression as mf_ptr()
++  const uint32_t pos = mf->read_pos + mf->offset;
++
++  const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; // same hash computation as lzma_mf_bt4_find
++  const uint32_t hash_2_value = temp & ((1U << 10) - 1);
++  const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8))
++				& ((1U << 16) - 1);
++  const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)
++				       ^ (lzma_crc32_table[0][cur[3]] << 5))
++			       & mf->hash_mask;
++
++  const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value];
++
++  mf->hash[hash_2_value] = pos;
++  mf->hash[((1U << 10)) + hash_3_value] = pos;
++  mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos;
++
++  bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son,
++	       mf->cyclic_pos, mf->cyclic_size);
++  move_pos(mf);
++ } while (--amount != 0);
++}
++
++static inline void // Skips `amount` positions through mf->skip and adds them to read_ahead; does nothing for amount == 0.
++mf_skip(lzma_mf *mf, uint32_t amount)
++{
++ if (amount != 0) { // guard: the *_skip functions in this file are do-while loops and must not be called with 0
++  mf->skip(mf, amount);
++  mf->read_ahead += amount;
++ }
++}
++
++typedef struct lzma_lzma1_encoder_s lzma_lzma1_encoder; // forward typedef; the struct body is defined further down in this file
++typedef uint16_t probability; // 16-bit probability cell; the range coder code in this file works on a 1U << 11 scale
++
++typedef struct { // Probability models and cached prices for encoding a match length.
++ probability choice; // first selector bit: 0 selects the low[] tree (see length())
++ probability choice2; // second selector bit: 0 selects mid[], 1 selects high[]
++ probability low[(1 << 4)][(1 << 3)]; // 3-bit tree per pos_state (16 pos_states)
++ probability mid[(1 << 4)][(1 << 3)]; // 3-bit tree per pos_state
++ probability high[(1 << 8)]; // 8-bit tree shared by all pos_states
++ uint32_t prices[(1 << 4)][((1 << 3) + (1 << 3) + (1 << 8))]; // per pos_state price of each of the 272 length symbols
++ uint32_t table_size; // number of prices[] entries filled by length_update_prices()
++ uint32_t counters[(1 << 4)]; // per pos_state countdown until the prices are recomputed (see length())
++} lzma_length_encoder;
++
++typedef struct { // Range encoder state plus a queue of symbols waiting to be encoded by rc_encode().
++ uint64_t low; // rc_shift_low() treats bit 32 as a carry into the cached byte
++ uint64_t cache_size; // number of bytes rc_shift_low() will emit on its next flush
++ uint32_t range;
++ uint8_t cache; // most recent byte that has not been written out yet
++ size_t count; // number of queued entries in symbols[] / probs[]
++ size_t pos; // index of the next queued entry rc_encode() will process
++
++ enum { // queued symbol kinds
++  RC_BIT_0, // model-coded bit 0; uses the matching probs[] entry
++  RC_BIT_1, // model-coded bit 1; uses the matching probs[] entry
++  RC_DIRECT_0, // direct (halved-range) bit 0; no probability involved
++  RC_DIRECT_1, // direct (halved-range) bit 1
++  RC_FLUSH, // flush request handled in rc_encode()
++ } symbols[58];
++
++ probability *probs[58]; // parallel to symbols[]; written by rc_bit() only
++} lzma_range_encoder;
++
++
++typedef enum { // Encoder state; code in this file relies on the numeric order (e.g. `state < 7`, `state - 3`, `state - 6`).
++ STATE_LIT_LIT, // 0
++ STATE_MATCH_LIT_LIT,
++ STATE_REP_LIT_LIT,
++ STATE_SHORTREP_LIT_LIT, // 3: literal() maps states up to here to STATE_LIT_LIT
++ STATE_MATCH_LIT,
++ STATE_REP_LIT,
++ STATE_SHORTREP_LIT, // 6: last value satisfying `state < 7`
++ STATE_LIT_MATCH,
++ STATE_LIT_LONGREP,
++ STATE_LIT_SHORTREP, // 9: literal() maps states 4..9 to state - 3
++ STATE_NONLIT_MATCH,
++ STATE_NONLIT_REP, // 11: literal() maps states 10..11 to state - 6
++} lzma_lzma_state;
++
++typedef struct { // Element type of lzma_lzma1_encoder_s.opts[]; none of these fields are accessed by the code in this part of the file.
++ lzma_lzma_state state;
++ _Bool prev_1_is_literal;
++ _Bool prev_2;
++
++ uint32_t pos_prev_2;
++ uint32_t back_prev_2;
++
++ uint32_t price;
++ uint32_t pos_prev;
++ uint32_t back_prev;
++
++ uint32_t backs[4]; // four entries, the same count as reps[] in the encoder struct
++} lzma_optimal;
++
++struct lzma_lzma1_encoder_s { // Complete LZMA1 encoder state: range coder, history, probability models and price tables.
++ lzma_range_encoder rc;
++ lzma_lzma_state state;
++ uint32_t reps[4]; // four most recently used match distances, reps[0] newest (see match()/rep_match())
++ lzma_match matches[(2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1) + 1]; // 274 entries: maximum match length 273, plus one
++ uint32_t matches_count; // match count cached by lzma_lzma_optimum_fast() for its next call
++ uint32_t longest_match_length; // longest length cached together with matches_count
++ _Bool fast_mode; // when set, length() skips the price-table refresh
++ _Bool is_initialized; // set by encode_init()
++ _Bool is_flushed;
++ uint32_t pos_mask;
++ uint32_t literal_context_bits; // used by literal() to select the literal subcoder
++ uint32_t literal_pos_mask; // likewise
++
++ probability literal[(1 << 4)][0x300]; // literal subcoders, 0x300 probabilities each
++ probability is_match[12][(1 << 4)]; // first index has 12 entries, one per lzma_lzma_state value
++ probability is_rep[12];
++ probability is_rep0[12];
++ probability is_rep1[12];
++ probability is_rep2[12];
++ probability is_rep0_long[12][(1 << 4)];
++ probability dist_slot[4][(1 << 6)]; // 6-bit tree per dist_state (see match())
++ probability dist_special[(1 << (14 / 2)) - 14]; // 114 entries, used for dist_slot 4..13
++ probability dist_align[(1 << 4)]; // 4-bit reverse tree for the low distance bits
++
++ lzma_length_encoder match_len_encoder;
++ lzma_length_encoder rep_len_encoder;
++
++ uint32_t dist_slot_prices[4][(1 << 6)];
++ uint32_t dist_prices[4][(1 << (14 / 2))];
++ uint32_t dist_table_size;
++ uint32_t match_price_count; // incremented by match()
++
++ uint32_t align_prices[(1 << 4)];
++ uint32_t align_price_count; // incremented by match() when the aligned bits are coded
++ uint32_t opts_end_index;
++ uint32_t opts_current_index;
++ lzma_optimal opts[(1 << 12)];
++};
++
++extern void // Greedy choice for the next symbol: writes a back value and a length to *back_res / *len_res.
++lzma_lzma_optimum_fast(lzma_lzma1_encoder *restrict coder,
++  lzma_mf *restrict mf,
++  uint32_t *restrict back_res, uint32_t *restrict len_res) // *back_res: 0..3 = rep index, distance + 4 = new match, UINT32_MAX with len 1 = presumably "emit a literal" (confirm in caller)
++{
++ const uint32_t nice_len = mf->nice_len;
++
++ uint32_t len_main;
++ uint32_t matches_count;
++ if (mf->read_ahead == 0) { // nothing looked up ahead yet: run the match finder now
++  len_main = lzma_mf_find(mf, &matches_count, coder->matches);
++ } else { // reuse the look-ahead results this function cached in coder on an earlier call
++  len_main = coder->longest_match_length;
++  matches_count = coder->matches_count;
++ }
++
++ const uint8_t *buf = mf_ptr(mf) - 1; // one byte before the read position, which the finder has already advanced
++ const uint32_t buf_avail // min(mf_avail + 1, 273)
++   = ((mf_avail(mf) + 1) < ((2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1))
++      ? (mf_avail(mf) + 1) : ((2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1)));
++
++ if (buf_avail < 2) { // not enough data for any match
++  *back_res = (4294967295U);
++  *len_res = 1;
++  return;
++ }
++
++ uint32_t rep_len = 0; // longest repeat match found so far
++ uint32_t rep_index = 0;
++
++ for (uint32_t i = 0; i < 4; ++i) { // try each of the four remembered distances
++  const uint8_t *const buf_back = buf - coder->reps[i] - 1;
++  if ((read16ne(buf) != read16ne(buf_back))) // first two bytes differ: no usable repeat match
++   continue;
++  const uint32_t len = lzma_memcmplen(buf, buf_back, 2, buf_avail);
++  if (len >= nice_len) { // long enough: take it immediately
++   *back_res = i;
++   *len_res = len;
++   mf_skip(mf, len - 1);
++   return;
++  }
++  if (len > rep_len) {
++   rep_index = i;
++   rep_len = len;
++  }
++ }
++ if (len_main >= nice_len) { // the longest new match is long enough: take it immediately
++  *back_res = coder->matches[matches_count - 1].dist + 4; // +4 moves new distances past the four rep indexes
++  *len_res = len_main;
++  mf_skip(mf, len_main - 1);
++  return;
++ }
++
++ uint32_t back_main = 0;
++ if (len_main >= 2) {
++  back_main = coder->matches[matches_count - 1].dist;
++  while (matches_count > 1 && len_main == // next shorter match is exactly one byte shorter...
++    coder->matches[matches_count - 2].len + 1) {
++   if (!(((back_main) >> 7) > (coder->matches[ matches_count - 2].dist))) // ...but its distance is not below back_main / 128: keep the longer one
++    break;
++   --matches_count;
++   len_main = coder->matches[matches_count - 1].len;
++   back_main = coder->matches[matches_count - 1].dist;
++  }
++  if (len_main == 2 && back_main >= 0x80) // a 2-byte match at distance >= 0x80 is discarded
++   len_main = 1;
++ }
++
++ if (rep_len >= 2) { // prefer the repeat match unless the new match is sufficiently longer for its distance
++  if (rep_len + 1 >= len_main
++    || (rep_len + 2 >= len_main
++     && back_main > (1U << 9))
++    || (rep_len + 3 >= len_main
++     && back_main > (1U << 15))) {
++   *back_res = rep_index;
++   *len_res = rep_len;
++   mf_skip(mf, rep_len - 1);
++   return;
++  }
++ }
++
++ if (len_main < 2 || buf_avail <= 2) { // no usable new match either
++  *back_res = (4294967295U);
++  *len_res = 1;
++  return;
++ }
++
++ coder->longest_match_length = lzma_mf_find(mf, // look one position ahead; results are cached in coder for the next call
++   &coder->matches_count, coder->matches);
++
++ if (coder->longest_match_length >= 2) {
++  const uint32_t new_dist = coder->matches[
++    coder->matches_count - 1].dist;
++
++  if ((coder->longest_match_length >= len_main // the next position offers a better match: give up the current one
++     && new_dist < back_main)
++    || (coder->longest_match_length == len_main + 1
++     && !(((new_dist) >> 7) > (back_main)))
++    || (coder->longest_match_length > len_main + 1)
++    || (coder->longest_match_length + 1 >= len_main
++     && len_main >= 3
++     && (((back_main) >> 7) > (new_dist)))) {
++   *back_res = (4294967295U);
++   *len_res = 1;
++   return;
++  }
++ }
++ ++buf; // move on to the next position
++ const uint32_t limit = ((2) > (len_main - 1) ? (2) : (len_main - 1)); // max(2, len_main - 1)
++ for (uint32_t i = 0; i < 4; ++i) {
++  if (memcmp(buf, buf - coder->reps[i] - 1, limit) == 0) { // a repeat match of `limit` bytes starts at the next position
++   *back_res = (4294967295U);
++   *len_res = 1;
++   return;
++  }
++ }
++
++ *back_res = back_main + 4; // take the new match at the current position
++ *len_res = len_main;
++ mf_skip(mf, len_main - 2); // one less than on the other paths: the look-ahead lzma_mf_find call above already consumed a position
++ return;
++}
++
++static inline void // Queues one model-coded bit together with the probability cell it will use and update.
++rc_bit(lzma_range_encoder *rc, probability *prob, uint32_t bit)
++{
++ rc->symbols[rc->count] = bit; // 0 / 1 are the numeric values of RC_BIT_0 / RC_BIT_1
++ rc->probs[rc->count] = prob;
++ ++rc->count; // no bounds check against the 58-entry queue
++}
++
++static inline void // Queues the low bit_count bits of symbol, most significant first, walking a bit tree in probs[].
++rc_bittree(lzma_range_encoder *rc, probability *probs,
++  uint32_t bit_count, uint32_t symbol)
++{
++ uint32_t model_index = 1; // tree root is at index 1
++
++ do { // bit_count must be nonzero on entry, otherwise --bit_count wraps
++  const uint32_t bit = (symbol >> --bit_count) & 1;
++  rc_bit(rc, &probs[model_index], bit);
++  model_index = (model_index << 1) + bit; // descend to the child selected by the bit
++ } while (bit_count != 0);
++}
++
++static _Bool // Handles the very first byte of the stream; returns 1 once the coder is marked initialized, 0 if it must wait.
++encode_init(lzma_lzma1_encoder *coder, lzma_mf *mf)
++{
++ if (mf->read_pos == mf->read_limit) { // no byte available to read
++  if (mf->action == LZMA_RUN) // still running: report "not initialized yet"
++   return 0;
++ } else {
++  mf_skip(mf, 1); // consume the first byte through the match finder
++  mf->read_ahead = 0; // undo the read_ahead increment made by mf_skip
++  rc_bit(&coder->rc, &coder->is_match[0][0], 0); // queue an is_match bit of 0 for state 0, pos_state 0
++  rc_bittree(&coder->rc, coder->literal[0], 8, mf->buffer[0]); // queue the 8 bits of the first byte with literal subcoder 0
++ }
++
++ coder->is_initialized = 1;
++
++ return 1;
++}
++
++static inline uint32_t // Read position with the look-ahead amount subtracted.
++mf_position(const lzma_mf *mf)
++{
++ return mf->read_pos - mf->read_ahead;
++}
++
++static inline _Bool // Shifts one byte out of rc->low, writing pending bytes to out; returns 1 if the output buffer is full.
++rc_shift_low(lzma_range_encoder *rc,
++  uint8_t *out, size_t *out_pos, size_t out_size)
++{
++ if ((uint32_t)(rc->low) < (uint32_t)(0xFF000000) // low 32 bits below 0xFF000000...
++   || (uint32_t)(rc->low >> 32) != 0) { // ...or a carry is set in bit 32: the pending bytes can be written
++  do {
++   if (*out_pos == out_size) // output full; state is left as-is so the call can be repeated
++    return 1;
++
++   out[*out_pos] = rc->cache + (uint8_t)(rc->low >> 32); // add the carry into the byte being written
++   ++*out_pos;
++   rc->cache = 0xFF; // any further pending bytes are written as 0xFF plus carry
++  } while (--rc->cache_size != 0);
++  rc->cache = (rc->low >> 24) & 0xFF; // bits 24..31 of low become the new cached byte
++ }
++
++ ++rc->cache_size;
++ rc->low = (rc->low & 0x00FFFFFF) << 8; // drop the byte just handled (and the carry) and shift up
++ return 0;
++}
++
++static inline void // Puts the range encoder into its initial state with an empty symbol queue.
++rc_reset(lzma_range_encoder *rc)
++{
++ rc->low = 0;
++ rc->cache_size = 1;
++ rc->range = (4294967295U); // UINT32_MAX
++ rc->cache = 0;
++ rc->count = 0;
++ rc->pos = 0;
++}
++
++static inline _Bool // Encodes the queued symbols into out; returns 1 if the output filled up first (resume by calling again), else 0.
++rc_encode(lzma_range_encoder *rc,
++  uint8_t *out, size_t *out_pos, size_t out_size)
++{
++ while (rc->pos < rc->count) {
++  if (rc->range < (1U << 24)) { // range dropped below 2^24: shift out a byte and rescale
++   if (rc_shift_low(rc, out, out_pos, out_size))
++    return 1; // rc->pos is unchanged, so this symbol is retried on the next call
++   rc->range <<= 8;
++  }
++
++  switch (rc->symbols[rc->pos]) {
++  case RC_BIT_0: {
++   probability prob = *rc->probs[rc->pos];
++   rc->range = (rc->range >> 11) // keep the lower part of the range, sized by prob on the 2^11 scale
++     * prob;
++   prob += ((1U << 11) - prob) >> 5; // move prob towards 1 << 11
++   *rc->probs[rc->pos] = prob;
++   break;
++  }
++
++  case RC_BIT_1: {
++   probability prob = *rc->probs[rc->pos];
++   const uint32_t bound = prob * (rc->range
++     >> 11);
++   rc->low += bound; // keep the upper part of the range
++   rc->range -= bound;
++   prob -= prob >> 5; // move prob towards 0
++   *rc->probs[rc->pos] = prob;
++   break;
++  }
++
++  case RC_DIRECT_0: // direct bit: halve the range, keep the lower half
++   rc->range >>= 1;
++   break;
++
++  case RC_DIRECT_1: // direct bit: halve the range, keep the upper half
++   rc->range >>= 1;
++   rc->low += rc->range;
++   break;
++
++  case RC_FLUSH:
++   rc->range = (4294967295U);
++   do { // one rc_shift_low call per remaining queue entry
++    if (rc_shift_low(rc, out, out_pos, out_size))
++     return 1;
++   } while (++rc->pos < rc->count);
++
++   rc_reset(rc); // start over with a clean encoder state
++   return 0;
++
++  default:
++   break;
++  }
++  ++rc->pos;
++ }
++
++ rc->count = 0; // queue fully consumed
++ rc->pos = 0;
++ return 0;
++}
++
++static inline uint64_t // Returns cache_size + 4; presumably the number of bytes a flush would still output -- confirm against callers.
++rc_pending(const lzma_range_encoder *rc)
++{
++ return rc->cache_size + 5 - 1;
++}
++
++static inline void // Queues the 8 bits of a literal, choosing each probability from the corresponding bit of match_byte while the bits agree.
++literal_matched(lzma_range_encoder *rc, probability *subcoder,
++  uint32_t match_byte, uint32_t symbol)
++{
++ uint32_t offset = 0x100; // stays 0x100 while all bits so far matched match_byte, then becomes 0
++ symbol += 1U << 8; // marker bit at position 8; the loop ends when it reaches bit 16
++
++ do {
++  match_byte <<= 1;
++  const uint32_t match_bit = match_byte & offset; // current bit of match_byte, or 0 once offset is cleared
++  const uint32_t subcoder_index
++    = offset + match_bit + (symbol >> 8);
++  const uint32_t bit = (symbol >> 7) & 1; // next bit of the literal, most significant first
++  rc_bit(rc, &subcoder[subcoder_index], bit);
++
++  symbol <<= 1;
++  offset &= ~(match_byte ^ symbol); // clear offset on the first mismatch between match_byte and the literal
++
++ } while (symbol < (1U << 16)); // eight iterations
++}
++
++static inline void // Queues the byte at (read_pos - read_ahead) as a literal and updates coder->state.
++literal(lzma_lzma1_encoder *coder, lzma_mf *mf, uint32_t position)
++{
++ const uint8_t cur_byte = mf->buffer[mf->read_pos - mf->read_ahead];
++ probability *subcoder  = ((coder->literal)[ // subcoder chosen from masked position bits and the top bits of the previous byte
++   (((position) & (coder->literal_pos_mask))
++    << (coder->literal_context_bits))
++   + ((uint32_t)(mf->buffer[mf->read_pos - mf->read_ahead - 1])
++   >> (8U - (coder->literal_context_bits)))]);
++
++ if (((coder->state) < 7)) { // states 0..6 (the *_LIT states): plain 8-bit tree
++  rc_bittree(&coder->rc, subcoder, 8, cur_byte);
++ } else { // otherwise code against the byte at distance reps[0] + 1
++  const uint8_t match_byte
++    = mf->buffer[mf->read_pos - coder->reps[0] - 1 - mf->read_ahead];
++  literal_matched(&coder->rc, subcoder, match_byte, cur_byte);
++ }
++ coder->state // state 0..3 -> STATE_LIT_LIT, 4..9 -> state - 3, 10..11 -> state - 6
++   = ((coder->state) <= STATE_SHORTREP_LIT_LIT
++      ? STATE_LIT_LIT : ((coder->state) <= STATE_LIT_SHORTREP
++			 ? (coder->state) - 3 : (coder->state) - 6));
++}
++
++const uint8_t lzma_rc_prices[] = { // 128 non-increasing entries, indexed with an 11-bit probability shifted right by 4 (see the rc_bit_*price helpers)
++         128, 103,  91,  84,  78,  73,  69,  66,
++          63,  61,  58,  56,  54,  52,  51,  49,
++          48,  46,  45,  44,  43,  42,  41,  40,
++          39,  38,  37,  36,  35,  34,  34,  33,
++          32,  31,  31,  30,  29,  29,  28,  28,
++          27,  26,  26,  25,  25,  24,  24,  23,
++          23,  22,  22,  22,  21,  21,  20,  20,
++          19,  19,  19,  18,  18,  17,  17,  17,
++          16,  16,  16,  15,  15,  15,  14,  14,
++          14,  13,  13,  13,  12,  12,  12,  11,
++          11,  11,  11,  10,  10,  10,  10,   9,
++           9,   9,   9,   8,   8,   8,   8,   7,
++           7,   7,   7,   6,   6,   6,   6,   5,
++           5,   5,   5,   5,   4,   4,   4,   4,
++           3,   3,   3,   3,   3,   2,   2,   2,
++           2,   2,   2,   1,   1,   1,   1,   1
++};
++
++static inline uint32_t // Table price of coding `bit` (0 or 1) with probability cell `prob`.
++rc_bit_price(const probability prob, const uint32_t bit)
++{
++ return lzma_rc_prices[(prob ^ ((0U - bit) // 0U - bit is all-ones for bit 1, so prob is complemented within 11 bits; unchanged for bit 0
++   & ((1U << 11) - 1))) >> 4];
++}
++
++static inline uint32_t // Table price of coding a 0 bit with `prob`; same result as rc_bit_price(prob, 0).
++rc_bit_0_price(const probability prob)
++{
++ return lzma_rc_prices[prob >> 4];
++}
++
++static inline uint32_t // Table price of coding a 1 bit with `prob`; same result as rc_bit_price(prob, 1).
++rc_bit_1_price(const probability prob)
++{
++ return lzma_rc_prices[(prob ^ ((1U << 11) - 1)) // complement prob within 11 bits
++   >> 4];
++}
++
++static inline uint32_t // Sum of the bit prices for coding `symbol` through a bit tree of depth bit_levels.
++rc_bittree_price(const probability *const probs,
++  const uint32_t bit_levels, uint32_t symbol)
++{
++ uint32_t price = 0;
++ symbol += 1U << bit_levels; // add a marker bit so the walk ends at the root (index 1)
++
++ do { // walks from the leaf up to the root, one bit per level
++  const uint32_t bit = symbol & 1;
++  symbol >>= 1; // after the shift, symbol is the index of the node that coded `bit`
++  price += rc_bit_price(probs[symbol], bit);
++ } while (symbol != 1);
++
++ return price;
++}
++
++static void // Recomputes lc->prices[pos_state][0 .. table_size) and restarts the countdown for that pos_state.
++length_update_prices(lzma_length_encoder *lc, const uint32_t pos_state)
++{
++ const uint32_t table_size = lc->table_size;
++ lc->counters[pos_state] = table_size; // countdown decremented by length()
++
++ const uint32_t a0 = rc_bit_0_price(lc->choice); // prefix price for the low[] range: choice = 0
++ const uint32_t a1 = rc_bit_1_price(lc->choice);
++ const uint32_t b0 = a1 + rc_bit_0_price(lc->choice2); // prefix price for the mid[] range: choice = 1, choice2 = 0
++ const uint32_t b1 = a1 + rc_bit_1_price(lc->choice2); // prefix price for the high[] range: choice = 1, choice2 = 1
++ uint32_t *const prices = lc->prices[pos_state];
++
++ uint32_t i; // shared by the three loops so each continues where the previous one stopped
++ for (i = 0; i < table_size && i < (1 << 3); ++i) // symbols 0..7
++  prices[i] = a0 + rc_bittree_price(lc->low[pos_state],
++    3, i);
++
++ for (; i < table_size && i < (1 << 3) + (1 << 3); ++i) // symbols 8..15
++  prices[i] = b0 + rc_bittree_price(lc->mid[pos_state],
++    3, i - (1 << 3));
++
++ for (; i < table_size; ++i) // symbols 16 and up
++  prices[i] = b1 + rc_bittree_price(lc->high, 8,
++    i - (1 << 3) - (1 << 3));
++
++ return;
++}
++
++static inline void // Queues the encoding of a match length; outside fast mode it also drives the price-refresh countdown.
++length(lzma_range_encoder *rc, lzma_length_encoder *lc,
++  const uint32_t pos_state, uint32_t len, const _Bool fast_mode)
++{
++ len -= 2; // code len - 2; callers in this file pass lengths of 2 or more
++
++ if (len < (1 << 3)) { // 0..7: choice = 0, then the 3-bit low tree
++  rc_bit(rc, &lc->choice, 0);
++  rc_bittree(rc, lc->low[pos_state], 3, len);
++ } else {
++  rc_bit(rc, &lc->choice, 1);
++  len -= (1 << 3);
++
++  if (len < (1 << 3)) { // 8..15: choice2 = 0, then the 3-bit mid tree
++   rc_bit(rc, &lc->choice2, 0);
++   rc_bittree(rc, lc->mid[pos_state], 3, len);
++  } else { // 16 and up: choice2 = 1, then the 8-bit high tree
++   rc_bit(rc, &lc->choice2, 1);
++   len -= (1 << 3);
++   rc_bittree(rc, lc->high, 8, len);
++  }
++ }
++
++ if (!fast_mode) // the price table is only maintained outside fast mode
++  if (--lc->counters[pos_state] == 0) // countdown expired: recompute the prices for this pos_state
++   length_update_prices(lc, pos_state);
++}
++
++static inline void // Queues a repeat match using reps[rep], moves that distance to reps[0] and updates coder->state.
++rep_match(lzma_lzma1_encoder *coder, const uint32_t pos_state,
++  const uint32_t rep, const uint32_t len)
++{
++ if (rep == 0) {
++  rc_bit(&coder->rc, &coder->is_rep0[coder->state], 0);
++  rc_bit(&coder->rc, // second bit tells a one-byte repeat (0) from a longer one (1)
++    &coder->is_rep0_long[coder->state][pos_state],
++    len != 1);
++ } else {
++  const uint32_t distance = coder->reps[rep]; // saved before the entries are shifted below
++  rc_bit(&coder->rc, &coder->is_rep0[coder->state], 1);
++
++  if (rep == 1) {
++   rc_bit(&coder->rc, &coder->is_rep1[coder->state], 0);
++  } else {
++   rc_bit(&coder->rc, &coder->is_rep1[coder->state], 1);
++   rc_bit(&coder->rc, &coder->is_rep2[coder->state], // rep - 2 is 0 for rep 2 and 1 for rep 3
++     rep - 2);
++
++   if (rep == 3)
++    coder->reps[3] = coder->reps[2];
++
++   coder->reps[2] = coder->reps[1];
++  }
++
++  coder->reps[1] = coder->reps[0]; // the entries before `rep` have moved down one slot; the used distance goes to the front
++  coder->reps[0] = distance;
++ }
++
++ if (len == 1) { // one-byte repeat: no length is coded
++  coder->state = ((coder->state) < 7 ? STATE_LIT_SHORTREP : STATE_NONLIT_REP);
++ } else {
++  length(&coder->rc, &coder->rep_len_encoder, pos_state, len,
++    coder->fast_mode);
++  coder->state = ((coder->state) < 7 ? STATE_LIT_LONGREP : STATE_NONLIT_REP);
++ }
++}
++
++// In the original code this array has a constant initializer. It is quite
++// large, so the initializer is omitted here and the array is zero-filled.
++const uint8_t lzma_fastpos[1 << 13];
++
++static inline uint32_t // Maps a distance to its slot number using the 2^13-entry lzma_fastpos table.
++get_dist_slot(uint32_t dist)
++{
++ if (dist < (1U << (13 + ((0) + (0) * (13 - 1))))) // dist < 2^13: direct table lookup
++  return lzma_fastpos[dist];
++
++ if (dist < (1U << (13 + ((0) + (1) * (13 - 1))))) // dist < 2^25: look up dist >> 12 and add 2 * 12
++  return (uint32_t)(lzma_fastpos[(dist) >> ((0) + (1) * (13 - 1))]) + 2 * ((0) + (1) * (13 - 1));
++
++ return (uint32_t)(lzma_fastpos[(dist) >> ((0) + (2) * (13 - 1))]) + 2 * ((0) + (2) * (13 - 1)); // otherwise look up dist >> 24 and add 2 * 24
++}
++// Pass the low bit_count bits of symbol to rc_bit, least significant bit first, walking probs[] as a binary tree.
++static inline void
++rc_bittree_reverse(lzma_range_encoder *rc, probability *probs,
++  uint32_t bit_count, uint32_t symbol)
++{
++ uint32_t model_index = 1; // index of the current tree node in probs[]; the walk starts at 1
++ do {
++  const uint32_t bit = symbol & 1;
++  symbol >>= 1;
++  rc_bit(rc, &probs[model_index], bit);
++  model_index = (model_index << 1) + bit; // descend to child 2*i or 2*i + 1 depending on the bit
++ } while (--bit_count != 0); // do-while: bit_count == 0 on entry would wrap around instead of stopping
++}
++// Append bit_count bits of value, most significant first, to rc->symbols as RC_DIRECT_0 + bit.
++static inline void
++rc_direct(lzma_range_encoder *rc, uint32_t value, uint32_t bit_count)
++{
++ do {
++  rc->symbols[rc->count++]
++    = RC_DIRECT_0 + ((value >> --bit_count) & 1); // pre-decrement: bits are taken from position bit_count - 1 down to 0
++ } while (bit_count != 0); // bit_count must be nonzero on entry, else the shift count wraps
++}
++// Encode a normal match: state update, length, distance slot, extra distance bits; then push distance onto coder->reps.
++static inline void
++match(lzma_lzma1_encoder *coder, const uint32_t pos_state,
++      const uint32_t distance, const uint32_t len)
++{
++ coder->state = ((coder->state) < 7 ? STATE_LIT_MATCH : STATE_NONLIT_MATCH);
++
++ length(&coder->rc, &coder->match_len_encoder, pos_state, len,
++	coder->fast_mode);
++
++ const uint32_t dist_slot = get_dist_slot(distance);
++ const uint32_t dist_state = ((len) < 4 + 2 ? (len) - 2 : 4 - 1); // len < 6 uses len - 2, otherwise 3 (assumes len >= 2)
++ rc_bittree(&coder->rc, coder->dist_slot[dist_state], 6, dist_slot);
++
++ if (dist_slot >= 4) { // slots 0-3 are followed by no extra bits
++  const uint32_t footer_bits = (dist_slot >> 1) - 1; // number of extra bits coded after the slot
++  const uint32_t base = (2 | (dist_slot & 1)) << footer_bits; // value subtracted from distance to get the extra bits
++  const uint32_t dist_reduced = distance - base;
++
++  if (dist_slot < 14) { // slots 4-13: all extra bits go through one reverse bit tree
++   rc_bittree_reverse(&coder->rc, coder->dist_special + base - dist_slot - 1,
++		     footer_bits, dist_reduced);
++  } else { // slots >= 14: upper bits as direct bits, low 4 bits via dist_align
++   rc_direct(&coder->rc, dist_reduced >> 4,
++     footer_bits - 4);
++   rc_bittree_reverse(
++     &coder->rc, coder->dist_align,
++     4, dist_reduced & ((1 << 4) - 1));
++   ++coder->align_price_count;
++  }
++ }
++
++ coder->reps[3] = coder->reps[2]; // shift the history by one; the oldest distance is dropped
++ coder->reps[2] = coder->reps[1];
++ coder->reps[1] = coder->reps[0];
++ coder->reps[0] = distance;
++ ++coder->match_price_count;
++}
++// Encode one symbol: a literal if back == UINT32_MAX, a repeat match if back < 4, else a normal match at distance back - 4.
++static void
++encode_symbol(lzma_lzma1_encoder *coder, lzma_mf *mf,
++  uint32_t back, uint32_t len, uint32_t position)
++{
++ const uint32_t pos_state = position & coder->pos_mask;
++
++ if (back == (4294967295U)) { // UINT32_MAX marks a literal
++  rc_bit(&coder->rc,
++    &coder->is_match[coder->state][pos_state], 0);
++  literal(coder, mf, position);
++ } else {
++  rc_bit(&coder->rc,
++   &coder->is_match[coder->state][pos_state], 1);
++
++  if (back < 4) { // back is an index into coder->reps
++   rc_bit(&coder->rc, &coder->is_rep[coder->state], 1);
++   rep_match(coder, pos_state, back, len);
++  } else { // back is the distance plus 4
++   rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
++   match(coder, pos_state, back - 4, len);
++  }
++ }
++ mf->read_ahead -= len; // presumably accounts for the len bytes just encoded - confirm against lzma_mf
++}
++// Emit the end marker: is_match = 1, is_rep = 0, then a normal match with distance UINT32_MAX and length 2.
++static void
++encode_eopm(lzma_lzma1_encoder *coder, uint32_t position)
++{
++ const uint32_t pos_state = position & coder->pos_mask;
++ rc_bit(&coder->rc, &coder->is_match[coder->state][pos_state], 1);
++ rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
++ match(coder, pos_state, (4294967295U), 2); // called directly, so mf->read_ahead is not touched here
++}
++// Append five RC_FLUSH entries to rc->symbols.
++static inline void
++rc_flush(lzma_range_encoder *rc)
++{
++ for (size_t i = 0; i < 5; ++i)
++  rc->symbols[rc->count++] = RC_FLUSH;
++}
++
++extern void exit (int __status)
++ __attribute__ ((__nothrow__ , __leaf__ , __noreturn__));
++// Encode loop: drain rc with rc_encode, choose (back, len), encode it; returns LZMA_STREAM_END after the final flush, LZMA_OK otherwise.
++extern lzma_ret
++lzma_lzma_encode(lzma_lzma1_encoder *restrict coder, lzma_mf *restrict mf,
++  uint8_t *restrict out, size_t *restrict out_pos,
++  size_t out_size, uint32_t limit)
++{
++
++ if (!coder->is_initialized && !encode_init(coder, mf))
++  return LZMA_OK; // encode_init() returned false: nothing is encoded in this call
++
++ uint32_t position = mf_position(mf);
++
++ while (1) {
++  if (rc_encode(&coder->rc, out, out_pos, out_size)) { // nonzero: presumably out is full - confirm against rc_encode
++   return LZMA_OK;
++  }
++
++  if (limit != (4294967295U) // limit == UINT32_MAX disables both checks below
++      && (mf->read_pos - mf->read_ahead >= limit
++	 || *out_pos + rc_pending(&coder->rc)
++	    >= (1U << 16) - ((1 << 12) + 1))) // threshold is 65536 - 4097 = 61439
++   break;
++
++  if (mf->read_pos >= mf->read_limit) {
++   if (mf->action == LZMA_RUN)
++    return LZMA_OK; // presumably more input is expected in a later call - confirm
++
++
++   if (mf->read_ahead == 0)
++    break; // no read-ahead pending: leave the loop and finish below
++  }
++  uint32_t len;
++  uint32_t back;
++
++  if (coder->fast_mode)
++   lzma_lzma_optimum_fast(coder, mf, &back, &len);
++  else
++   // The original code contains the call to
++   // lzma_lzma_optimum_normal(coder, mf, &back, &len, position);
++   exit (-1); // in this reduced test the non-fast path just terminates
++
++  encode_symbol(coder, mf, back, len, position);
++
++  position += len;
++ }
++
++ if (!coder->is_flushed) { // flag keeps the end marker and flush from being emitted twice on a later call
++  coder->is_flushed = 1;
++  if (limit == (4294967295U)) // end marker only when no limit was given
++   encode_eopm(coder, position);
++
++  rc_flush(&coder->rc);
++
++  if (rc_encode(&coder->rc, out, out_pos, out_size)) {
++   return LZMA_OK;
++  }
++ }
++
++ coder->is_flushed = 0; // reset for the next use
++ return LZMA_STREAM_END;
++}
++// Release ptr through allocator->free when an allocator with a free callback is supplied, otherwise through free().
++extern void
++lzma_free(void *ptr, const lzma_allocator *allocator)
++{
++ if (allocator != ((void *)0) && allocator->free != ((void *)0))
++  allocator->free(allocator->opaque, ptr); // indirect call through the allocator's callback
++ else
++  free(ptr);
++ return;
++}
++// Validate lz_options and fill in mf sizes, find/skip callbacks, hash sizing and depth; returns 1 on error, 0 on success.
++static _Bool
++lz_encoder_prepare(lzma_mf *mf, const lzma_allocator *allocator,
++  const lzma_lz_options *lz_options)
++{
++ if (lz_options->dict_size < 4096U // dict_size must lie in [4096, 2^30 + 2^29]
++   || lz_options->dict_size
++    > (1U << 30) + (1U << 29)
++   || lz_options->nice_len > lz_options->match_len_max)
++  return 1;
++
++ mf->keep_size_before = lz_options->before_size + lz_options->dict_size;
++ mf->keep_size_after = lz_options->after_size
++   + lz_options->match_len_max;
++ uint32_t reserve = lz_options->dict_size / 2; // extra space added to the buffer size below
++ if (reserve > (1U << 30))
++  reserve /= 2;
++
++ reserve += (lz_options->before_size + lz_options->match_len_max
++   + lz_options->after_size) / 2 + (1U << 19);
++
++ const uint32_t old_size = mf->size;
++ mf->size = mf->keep_size_before + reserve + mf->keep_size_after;
++
++ if ((mf->buffer != ((void *)0)) && old_size != mf->size) { // size changed: drop the old buffer (not reallocated here)
++  lzma_free(mf->buffer, allocator);
++  mf->buffer = ((void *)0);
++ }
++
++ mf->match_len_max = lz_options->match_len_max;
++ mf->nice_len = lz_options->nice_len;
++ mf->cyclic_size = lz_options->dict_size + 1;
++
++ switch (lz_options->match_finder) { // install the find/skip function pointers for the chosen match finder
++ case LZMA_MF_HC3:
++  mf->find = &lzma_mf_hc3_find;
++  mf->skip = &lzma_mf_hc3_skip;
++  break;
++
++ case LZMA_MF_HC4:
++  mf->find = &lzma_mf_hc4_find;
++  mf->skip = &lzma_mf_hc4_skip;
++  break;
++
++ case LZMA_MF_BT2:
++  mf->find = &lzma_mf_bt2_find;
++  mf->skip = &lzma_mf_bt2_skip;
++  break;
++
++ case LZMA_MF_BT3:
++  mf->find = &lzma_mf_bt3_find;
++  mf->skip = &lzma_mf_bt3_skip;
++  break;
++
++ case LZMA_MF_BT4:
++  mf->find = &lzma_mf_bt4_find;
++  mf->skip = &lzma_mf_bt4_skip;
++  break;
++
++ default:
++  return 1; // unknown match finder
++ }
++
++ const uint32_t hash_bytes = lz_options->match_finder & 0x0F; // low nibble of the match-finder id
++ if (hash_bytes > mf->nice_len)
++  return 1;
++
++ const _Bool is_bt = (lz_options->match_finder & 0x10) != 0; // bit 0x10 of the match-finder id
++ uint32_t hs;
++
++ if (hash_bytes == 2) {
++  hs = 0xFFFF;
++ } else {
++  hs = lz_options->dict_size - 1; // smear the set bits downward, halve, then force the low 16 bits on
++  hs |= hs >> 1;
++  hs |= hs >> 2;
++  hs |= hs >> 4;
++  hs |= hs >> 8;
++  hs >>= 1;
++  hs |= 0xFFFF;
++
++  if (hs > (1U << 24)) { // cap large masks: 2^24 - 1 for 3 hash bytes, otherwise halve
++   if (hash_bytes == 3)
++    hs = (1U << 24) - 1;
++   else
++    hs >>= 1;
++  }
++ }
++
++ mf->hash_mask = hs;
++
++ ++hs; // from here on hs is a count (mask + 1), not a mask
++ if (hash_bytes > 2)
++  hs += (1U << 10);
++ if (hash_bytes > 3)
++  hs += (1U << 16);
++
++ const uint32_t old_hash_count = mf->hash_count;
++ const uint32_t old_sons_count = mf->sons_count;
++ mf->hash_count = hs;
++ mf->sons_count = mf->cyclic_size;
++ if (is_bt)
++  mf->sons_count *= 2; // twice as many son entries when is_bt is set
++
++ if (old_hash_count != mf->hash_count // either count changed: free both tables
++   || old_sons_count != mf->sons_count) {
++  lzma_free(mf->hash, allocator);
++  mf->hash = ((void *)0);
++
++  lzma_free(mf->son, allocator);
++  mf->son = ((void *)0);
++ }
++
++ mf->depth = lz_options->depth;
++ if (mf->depth == 0) { // depth 0: derive a default from nice_len
++  if (is_bt)
++   mf->depth = 16 + mf->nice_len / 2;
++  else
++   mf->depth = 4 + mf->nice_len / 4;
++ }
++
++ return 0;
++}
++// Test driver: calls lz_encoder_prepare and lzma_lzma_encode so both stay reachable; every local is left uninitialized.
++int
++main ()
++{
++  lzma_mf mf;
++  lzma_allocator allocator;
++  lzma_lz_options lz_options;
++
++  void *coder; // NOTE(review): never assigned; presumably this test is only compiled, not run - confirm
++  uint8_t *restrict out;
++  size_t *restrict out_pos;
++  size_t out_size;
++
++  lz_encoder_prepare(&mf, &allocator, &lz_options); // return value is ignored
++  return (int) lzma_lzma_encode(coder, &mf, out, out_pos, out_size, (4294967295U)); // limit == UINT32_MAX
++}
++
++
++/* { dg-final { scan-wpa-ipa-dump "Save results of indirect call analysis." "icp"} } */
++/* { dg-final { scan-wpa-ipa-dump-times "For call" 2 "icp"} } */
++/* { dg-final { scan-wpa-ipa-dump-times "Insert 0 prefetch stmt:" 5 "ipa_prefetch"} } */
++/* { dg-final { scan-wpa-ipa-dump-times "Insert 1 prefetch stmt:" 4 "ipa_prefetch"} } */
++/* { dg-final { scan-wpa-ipa-dump-times "Insert 2 prefetch stmt:" 2 "ipa_prefetch"} } */
+-- 
+2.33.0
+
diff --git a/0046-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch b/0046-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
new file mode 100644
index 0000000000000000000000000000000000000000..e50c9b2b682f7498c6b807b5db12cf7fc7c115d5
--- /dev/null
+++ b/0046-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
@@ -0,0 +1,94 @@
+From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Fri, 29 Mar 2024 17:15:41 +0800
+Subject: [PATCH 2/2] Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7)
+
+---
+ gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++--
+ 1 file changed, 26 insertions(+), 2 deletions(-)
+
+diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
+index 9537e4835..1ceb5137f 100644
+--- a/gcc/ipa-prefetch.cc
++++ b/gcc/ipa-prefetch.cc
+@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map;
+ typedef std::map<memref_t *, tree> memref_tree_map;
+ 
+ typedef std::set<gimple *> stmt_set;
++typedef std::set<tree> tree_set;
+ typedef std::map<tree, tree> tree_map;
+ 
+ tree_memref_map *tm_map;
+@@ -1124,8 +1125,21 @@ analyse_loops ()
+     }
+ }
+ 
++/* Compare memrefs by unsigned mr_id for qsort; equal IDs compare equal.  */
++
++static int
++memref_id_cmp (const void *p1, const void *p2)
++{
++  const memref_t *mr1 = *(const memref_t **) p1;
++  const memref_t *mr2 = *(const memref_t **) p2;
++
++  if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id)
++    return 1;
++  return (unsigned) mr1->mr_id < (unsigned) mr2->mr_id ? -1 : 0;
++}
++
+ /* Reduce the set filtering out memrefs with the same memory references,
+-   return the result vector of memrefs.  */
++   sort and return the result vector of memrefs.  */
+ 
+ static void
+ reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
+@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
+ 	    vec.safe_push (mr1);
+ 	}
+     }
++  vec.qsort (memref_id_cmp);
+   if (dump_file)
+     {
+       fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ());
+@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn)
+     }
+ 
+   /* Create other new vars.  Insert new stmts.  */
++  vec<memref_t *> used_mr_vec = vNULL;
+   for (memref_set::const_iterator it = used_mrs.begin ();
+        it != used_mrs.end (); it++)
++    used_mr_vec.safe_push (*it);
++  used_mr_vec.qsort (memref_id_cmp);
++
++  for (unsigned int j = 0; j < used_mr_vec.length (); j++)
+     {
+-      memref_t *mr = *it;
++      memref_t *mr = used_mr_vec[j];
+       if (mr == comp_mr)
+ 	continue;
+       gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0,
+@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn)
+       local = integer_three_node;
+       break;
+     }
++  tree_set prefetched_addrs;
+   for (unsigned int j = 0; j < vmrs.length (); j++)
+     {
+       memref_t *mr = vmrs[j];
+@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn)
+       tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
+       if (decl_map->count (addr))
+ 	addr = (*decl_map)[addr];
++      if (prefetched_addrs.count (addr))
++	continue;
+       last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
+ 				     3, addr, write_p, local);
+       pcalls.safe_push (last_stmt);
+       gimple_seq_add_stmt (&stmts, last_stmt);
++      prefetched_addrs.insert (addr);
+       if (dump_file)
+ 	{
+ 	  fprintf (dump_file, "Insert %d prefetch stmt:\n", j);
+-- 
+2.33.0
+
diff --git a/gcc.spec b/gcc.spec
index 7030b1209fa2312fa0df39de943c076f46af4d21..5edd361b9a4e0b9c2cb9288273860288f2eda123 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 19
+%global gcc_release 20
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -166,6 +166,24 @@ Patch25: 0025-AArch64-Rewrite-the-tsv110-option.patch
 Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch
 Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch
 Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch
+Patch29: 0029-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
+Patch30: 0030-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
+Patch31: 0031-Perform-early-if-conversion-of-simple-arithmetic.patch
+Patch32: 0032-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
+Patch33: 0033-Match-double-sized-mul-pattern.patch
+Patch34: 0034-Port-icp-patch-to-GCC-12.patch
+Patch35: 0035-Port-fixes-in-icp-to-GCC-12.patch
+Patch36: 0036-Add-split-complex-instructions-pass.patch
+Patch37: 0037-Extending-and-refactoring-of-pass_split_complex_inst.patch
+Patch38: 0038-Port-maxmin-patch-to-GCC-12.patch
+Patch39: 0039-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
+Patch40: 0040-Add-new-pattern-to-pass-the-maxmin-tests.patch
+Patch41: 0041-AES-Implement-AES-pattern-matching.patch
+Patch42: 0042-crypto-accel-add-optimization-level-requirement-to-t.patch
+Patch43: 0043-Add-more-flexible-check-for-pointer-aliasing-during-.patch
+Patch44: 0044-Port-IPA-prefetch-to-GCC-12.patch
+Patch45: 0045-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
+Patch46: 0046-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
 
 # Part 3000 ~ 4999
 %ifarch loongarch64
@@ -789,6 +807,24 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch26 -p1
 %patch27 -p1
 %patch28 -p1
+%patch29 -p1
+%patch30 -p1
+%patch31 -p1
+%patch32 -p1
+%patch33 -p1
+%patch34 -p1
+%patch35 -p1
+%patch36 -p1
+%patch37 -p1
+%patch38 -p1
+%patch39 -p1
+%patch40 -p1
+%patch41 -p1
+%patch42 -p1
+%patch43 -p1
+%patch44 -p1
+%patch45 -p1
+%patch46 -p1
 
 %ifarch loongarch64
 %patch3001 -p1
@@ -3174,6 +3210,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Thu Apr 11 2024 zhengchenhui <zhengchenhui1@huawei.com> 12.3.1-20
+- Type: Sync
+- DESC: Sync patch from openeuler/gcc
+
 * Mon Apr 1 2024 Peng Fan <fanpeng@loongson.cn> 12.3.1-19
 - Type: SPEC
 - DESC: fix libcc1 file path for LoongArch.